Diffstat (limited to 'arch/powerpc/mm')
99 files changed, 21442 insertions, 13202 deletions
diff --git a/arch/powerpc/mm/40x_mmu.c b/arch/powerpc/mm/40x_mmu.c
deleted file mode 100644
index 5810967511d4..000000000000
--- a/arch/powerpc/mm/40x_mmu.c
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- * This file contains the routines for initializing the MMU
- * on the 4xx series of chips.
- *  -- paulus
- *
- * Derived from arch/ppc/mm/init.c:
- *   Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
- *
- * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
- * and Cort Dougan (PReP) (cort@cs.nmt.edu)
- *   Copyright (C) 1996 Paul Mackerras
- *
- * Derived from "arch/i386/mm/init.c"
- *   Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- */
-
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/ptrace.h>
-#include <linux/mman.h>
-#include <linux/mm.h>
-#include <linux/swap.h>
-#include <linux/stddef.h>
-#include <linux/vmalloc.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/highmem.h>
-#include <linux/memblock.h>
-
-#include <asm/pgalloc.h>
-#include <asm/prom.h>
-#include <asm/io.h>
-#include <asm/mmu_context.h>
-#include <asm/pgtable.h>
-#include <asm/mmu.h>
-#include <asm/uaccess.h>
-#include <asm/smp.h>
-#include <asm/bootx.h>
-#include <asm/machdep.h>
-#include <asm/setup.h>
-
-#include "mmu_decl.h"
-
-extern int __map_without_ltlbs;
-/*
- * MMU_init_hw does the chip-specific initialization of the MMU hardware.
- */
-void __init MMU_init_hw(void)
-{
-	/*
-	 * The Zone Protection Register (ZPR) defines how protection will
-	 * be applied to every page which is a member of a given zone. At
-	 * present, we utilize only two of the 4xx's zones.
-	 * The zone index bits (of ZSEL) in the PTE are used for software
-	 * indicators, except the LSB.  For user access, zone 1 is used,
-	 * for kernel access, zone 0 is used.  We set all but zone 1
-	 * to zero, allowing only kernel access as indicated in the PTE.
-	 * For zone 1, we set a 01 binary (a value of 10 will not work)
-	 * to allow user access as indicated in the PTE.  This also allows
-	 * kernel access as indicated in the PTE.
-	 */
-
-	mtspr(SPRN_ZPR, 0x10000000);
-
-	flush_instruction_cache();
-
-	/*
-	 * Set up the real-mode cache parameters for the exception vector
-	 * handlers (which are run in real-mode).
-	 */
-
-	mtspr(SPRN_DCWR, 0x00000000);	/* All caching is write-back */
-
-	/*
-	 * Cache instruction and data space where the exception
-	 * vectors and the kernel live in real-mode.
-	 */
-
-	mtspr(SPRN_DCCR, 0xFFFF0000);	/* 2GByte of data space at 0x0. */
-	mtspr(SPRN_ICCR, 0xFFFF0000);	/* 2GByte of instr. space at 0x0. */
-}
-
-#define LARGE_PAGE_SIZE_16M	(1<<24)
-#define LARGE_PAGE_SIZE_4M	(1<<22)
-
-unsigned long __init mmu_mapin_ram(unsigned long top)
-{
-	unsigned long v, s, mapped;
-	phys_addr_t p;
-
-	v = KERNELBASE;
-	p = 0;
-	s = total_lowmem;
-
-	if (__map_without_ltlbs)
-		return 0;
-
-	while (s >= LARGE_PAGE_SIZE_16M) {
-		pmd_t *pmdp;
-		unsigned long val = p | _PMD_SIZE_16M | _PAGE_EXEC | _PAGE_HWWRITE;
-
-		pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v);
-		pmd_val(*pmdp++) = val;
-		pmd_val(*pmdp++) = val;
-		pmd_val(*pmdp++) = val;
-		pmd_val(*pmdp++) = val;
-
-		v += LARGE_PAGE_SIZE_16M;
-		p += LARGE_PAGE_SIZE_16M;
-		s -= LARGE_PAGE_SIZE_16M;
-	}
-
-	while (s >= LARGE_PAGE_SIZE_4M) {
-		pmd_t *pmdp;
-		unsigned long val = p | _PMD_SIZE_4M | _PAGE_EXEC | _PAGE_HWWRITE;
-
-		pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v);
-		pmd_val(*pmdp) = val;
-
-		v += LARGE_PAGE_SIZE_4M;
-		p += LARGE_PAGE_SIZE_4M;
-		s -= LARGE_PAGE_SIZE_4M;
-	}
-
-	mapped = total_lowmem - s;
-
-	/* If the size of RAM is not an exact power of two, we may not
-	 * have covered RAM in its entirety with 16 and 4 MiB
-	 * pages. Consequently, restrict the top end of RAM currently
-	 * allocable so that calls to the MEMBLOCK to allocate PTEs for "tail"
-	 * coverage with normal-sized pages (or other reasons) do not
-	 * attempt to allocate outside the allowed range.
-	 */
-	memblock_set_current_limit(mapped);
-
-	return mapped;
-}
-
-void setup_initial_memory_limit(phys_addr_t first_memblock_base,
-				phys_addr_t first_memblock_size)
-{
-	/* We don't currently support the first MEMBLOCK not mapping 0
-	 * physical on those processors
-	 */
-	BUG_ON(first_memblock_base != 0);
-
-	/* 40x can only access 16MB at the moment (see head_40x.S) */
-	memblock_set_current_limit(min_t(u64, first_memblock_size, 0x00800000));
-}
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 51230ee6a407..8c1582b2987d 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -1,38 +1,19 @@
+# SPDX-License-Identifier: GPL-2.0
 #
 # Makefile for the linux ppc-specific parts of the memory manager.
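For illustration, a minimal userspace sketch of the greedy large-page policy in the deleted 40x mmu_mapin_ram() above: cover as much of lowmem as possible with 16 MiB entries, then 4 MiB entries, and leave the remainder for normal pages. The function name and standalone form are hypothetical; only the size constants and the loop structure come from the patch.

#include <stdio.h>

#define SZ_16M (1UL << 24)
#define SZ_4M  (1UL << 22)

static unsigned long mapped_by_large_pages(unsigned long lowmem)
{
	unsigned long s = lowmem;

	while (s >= SZ_16M)	/* prefer 16 MiB entries */
		s -= SZ_16M;
	while (s >= SZ_4M)	/* then 4 MiB entries */
		s -= SZ_4M;
	return lowmem - s;	/* the tail is left for 4 kB pages */
}

int main(void)
{
	/* 101 MiB: 6 x 16M + 1 x 4M = 100 MiB mapped, 1 MiB left over */
	printf("%lu MiB\n", mapped_by_large_pages(101UL << 20) >> 20);
	return 0;
}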
 #
-subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
-
-ccflags-$(CONFIG_PPC64)	:= $(NO_MINIMAL_TOC)
-
-obj-y				:= fault.o mem.o pgtable.o gup.o mmap.o \
-				   init_$(CONFIG_WORD_SIZE).o \
-				   pgtable_$(CONFIG_WORD_SIZE).o
-obj-$(CONFIG_PPC_MMU_NOHASH)	+= mmu_context_nohash.o tlb_nohash.o \
-				   tlb_nohash_low.o
-obj-$(CONFIG_PPC_BOOK3E)	+= tlb_low_$(CONFIG_WORD_SIZE)e.o
-hash64-$(CONFIG_PPC_NATIVE)	:= hash_native_64.o
-obj-$(CONFIG_PPC_STD_MMU_64)	+= hash_utils_64.o \
-				   slb_low.o slb.o stab.o \
-				   $(hash64-y)
-obj-$(CONFIG_PPC_STD_MMU_32)	+= ppc_mmu_32.o
-obj-$(CONFIG_PPC_STD_MMU)	+= hash_low_$(CONFIG_WORD_SIZE).o \
-				   tlb_hash$(CONFIG_WORD_SIZE).o \
-				   mmu_context_hash$(CONFIG_WORD_SIZE).o
-obj-$(CONFIG_PPC_ICSWX)		+= icswx.o
-obj-$(CONFIG_PPC_ICSWX_PID)	+= icswx_pid.o
-obj-$(CONFIG_40x)		+= 40x_mmu.o
-obj-$(CONFIG_44x)		+= 44x_mmu.o
-obj-$(CONFIG_PPC_FSL_BOOK3E)	+= fsl_booke_mmu.o
-obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
-obj-$(CONFIG_PPC_MM_SLICES)	+= slice.o
-obj-y				+= hugetlbpage.o
-ifeq ($(CONFIG_HUGETLB_PAGE),y)
-obj-$(CONFIG_PPC_STD_MMU_64)	+= hugetlbpage-hash64.o
-obj-$(CONFIG_PPC_BOOK3E_MMU)	+= hugetlbpage-book3e.o
-endif
-obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hugepage-hash64.o
-obj-$(CONFIG_PPC_SUBPAGE_PROT)	+= subpage-prot.o
+obj-y				:= fault.o mem.o pgtable.o maccess.o pageattr.o \
+				   init_$(BITS).o pgtable_$(BITS).o \
+				   pgtable-frag.o ioremap.o ioremap_$(BITS).o \
+				   init-common.o mmu_context.o drmem.o \
+				   cacheflush.o
+obj-$(CONFIG_PPC_MMU_NOHASH)	+= nohash/
+obj-$(CONFIG_PPC_BOOK3S_32)	+= book3s32/
+obj-$(CONFIG_PPC_BOOK3S_64)	+= book3s64/
+obj-$(CONFIG_NUMA)		+= numa.o
+obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
 obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
-obj-$(CONFIG_HIGHMEM)		+= highmem.o
+obj-$(CONFIG_PPC_COPRO_BASE)	+= copro_fault.o
+obj-$(CONFIG_PTDUMP)		+= ptdump/
+obj-$(CONFIG_KASAN)		+= kasan/
diff --git a/arch/powerpc/mm/book3s32/Makefile b/arch/powerpc/mm/book3s32/Makefile
new file mode 100644
index 000000000000..50dd8f6bdf46
--- /dev/null
+++ b/arch/powerpc/mm/book3s32/Makefile
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0
+
+KASAN_SANITIZE_mmu.o := n
+
+ifdef CONFIG_KASAN
+CFLAGS_mmu.o += -DDISABLE_BRANCH_PROFILING
+endif
+
+obj-y += mmu.o mmu_context.o
+obj-$(CONFIG_PPC_BOOK3S_603) += nohash_low.o
+obj-$(CONFIG_PPC_BOOK3S_604) += hash_low.o tlb.o
+obj-$(CONFIG_PPC_KUAP) += kuap.o
diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/book3s32/hash_low.S
index 115347f74ce5..4ed0efd03db5 100644
--- a/arch/powerpc/mm/hash_low_32.S
+++ b/arch/powerpc/mm/book3s32/hash_low.S
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  * PowerPC version
  * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
@@ -11,48 +12,46 @@
  * This file contains low-level assembler routines for managing
  * the PowerPC MMU hash table.  (PPC 8xx processors don't use a
  * hash table, so this file is not used on them.)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- * */ +#include <linux/export.h> +#include <linux/pgtable.h> +#include <linux/init.h> #include <asm/reg.h> #include <asm/page.h> -#include <asm/pgtable.h> #include <asm/cputable.h> #include <asm/ppc_asm.h> #include <asm/thread_info.h> #include <asm/asm-offsets.h> +#include <asm/feature-fixups.h> +#include <asm/code-patching-asm.h> -#ifdef CONFIG_SMP - .section .bss - .align 2 - .globl mmu_hash_lock -mmu_hash_lock: - .space 4 -#endif /* CONFIG_SMP */ +#ifdef CONFIG_PTE_64BIT +#define PTE_T_SIZE 8 +#define PTE_FLAGS_OFFSET 4 /* offset of PTE flags, in bytes */ +#else +#define PTE_T_SIZE 4 +#define PTE_FLAGS_OFFSET 0 +#endif /* * Load a PTE into the hash table, if possible. - * The address is in r4, and r3 contains an access flag: - * _PAGE_RW (0x400) if a write. + * The address is in r4, and r3 contains required access flags: + * - For ISI: _PAGE_PRESENT | _PAGE_EXEC + * - For DSI: _PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE if a write. * r9 contains the SRR1 value, from which we use the MSR_PR bit. * SPRG_THREAD contains the physical address of the current task's thread. * * Returns to the caller if the access is illegal or there is no * mapping for the address. Otherwise it places an appropriate PTE * in the hash table and returns from the exception. - * Uses r0, r3 - r8, r10, ctr, lr. + * Uses r0, r3 - r6, r8, r10, ctr, lr. */ .text _GLOBAL(hash_page) - tophys(r7,0) /* gets -KERNELBASE into r7 */ #ifdef CONFIG_SMP - addis r8,r7,mmu_hash_lock@h - ori r8,r8,mmu_hash_lock@l + lis r8, (mmu_hash_lock - PAGE_OFFSET)@h + ori r8, r8, (mmu_hash_lock - PAGE_OFFSET)@l lis r0,0x0fff b 10f 11: lwz r6,0(r8) @@ -66,16 +65,20 @@ _GLOBAL(hash_page) isync #endif /* Get PTE (linux-style) and check access */ - lis r0,KERNELBASE@h /* check if kernel address */ + lis r0, TASK_SIZE@h /* check if kernel address */ cmplw 0,r4,r0 mfspr r8,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ - ori r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */ lwz r5,PGDIR(r8) /* virt page-table root */ blt+ 112f /* assume user more likely */ lis r5,swapper_pg_dir@ha /* if kernel address, use */ + andi. r0,r9,MSR_PR /* Check usermode */ addi r5,r5,swapper_pg_dir@l /* kernel page table */ - rlwimi r3,r9,32-12,29,29 /* MSR_PR -> _PAGE_USER */ -112: add r5,r5,r7 /* convert to phys addr */ +#ifdef CONFIG_SMP + bne- .Lhash_page_out /* return if usermode */ +#else + bnelr- +#endif +112: tophys(r5, r5) #ifndef CONFIG_PTE_64BIT rlwimi r5,r4,12,20,29 /* insert top 10 bits of address */ lwz r8,0(r5) /* get pmd entry */ @@ -86,7 +89,7 @@ _GLOBAL(hash_page) rlwinm. r8,r8,0,0,20 /* extract pt base address */ #endif #ifdef CONFIG_SMP - beq- hash_page_out /* return if no mapping */ + beq- .Lhash_page_out /* return if no mapping */ #else /* XXX it seems like the 601 will give a machine fault on the rfi if its alignment is wrong (bottom 4 bits of address are @@ -98,27 +101,35 @@ _GLOBAL(hash_page) rlwimi r8,r4,22,20,29 /* insert next 10 bits of address */ #else rlwimi r8,r4,23,20,28 /* compute pte address */ + /* + * If PTE_64BIT is set, the low word is the flags word; use that + * word for locking since it contains all the interesting bits. + */ + addi r8,r8,PTE_FLAGS_OFFSET #endif - rlwinm r0,r3,32-3,24,24 /* _PAGE_RW access -> _PAGE_DIRTY */ - ori r0,r0,_PAGE_ACCESSED|_PAGE_HASHPTE /* * Update the linux PTE atomically. We do the lwarx up-front * because almost always, there won't be a permission violation * and there won't already be an HPTE, and thus we will have * to update the PTE to set _PAGE_HASHPTE. -- paulus. 
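The comment above describes the lwarx/stwcx. pattern used to update the Linux PTE atomically. As an illustration only, here is the same retry loop restated as a C11 compare-and-swap; the helper name and the flag values are placeholders, not the real 32-bit PTE layout.

#include <stdatomic.h>
#include <stdbool.h>

#define PAGE_ACCESSED	0x100	/* placeholder bit values */
#define PAGE_HASHPTE	0x002
#define PAGE_DIRTY	0x080

static bool pte_mark_for_hash(_Atomic unsigned long *ptep,
			      unsigned long access, bool write)
{
	unsigned long old = atomic_load(ptep), new;

	do {
		if (access & ~old)	/* a required permission is missing */
			return false;	/* caller takes the page fault path */
		new = old | PAGE_ACCESSED | PAGE_HASHPTE;
		if (write)
			new |= PAGE_DIRTY;
		/* on failure another CPU changed the PTE; recheck and retry */
	} while (!atomic_compare_exchange_weak(ptep, &old, new));

	return true;
}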
- * - * If PTE_64BIT is set, the low word is the flags word; use that - * word for locking since it contains all the interesting bits. */ -#if (PTE_FLAGS_OFFSET != 0) - addi r8,r8,PTE_FLAGS_OFFSET -#endif -retry: +.Lretry: lwarx r6,0,r8 /* get linux-style pte, flag word */ +#ifdef CONFIG_PPC_KUAP + mfsrin r5,r4 + rlwinm r0,r9,28,_PAGE_WRITE /* MSR[PR] => _PAGE_WRITE */ + rlwinm r5,r5,12,_PAGE_WRITE /* Ks => _PAGE_WRITE */ + andc r5,r5,r0 /* Ks & ~MSR[PR] */ + andc r5,r6,r5 /* Clear _PAGE_WRITE when Ks = 1 && MSR[PR] = 0 */ + andc. r5,r3,r5 /* check access & ~permission */ +#else andc. r5,r3,r6 /* check access & ~permission */ +#endif + rlwinm r0,r3,32-3,24,24 /* _PAGE_WRITE access -> _PAGE_DIRTY */ + ori r0,r0,_PAGE_ACCESSED|_PAGE_HASHPTE #ifdef CONFIG_SMP - bne- hash_page_out /* return if access not permitted */ + bne- .Lhash_page_out /* return if access not permitted */ #else bnelr- #endif @@ -133,36 +144,28 @@ retry: #endif /* CONFIG_SMP */ #endif /* CONFIG_PTE_64BIT */ stwcx. r5,0,r8 /* attempt to update PTE */ - bne- retry /* retry if someone got there first */ + bne- .Lretry /* retry if someone got there first */ mfsrin r3,r4 /* get segment reg for segment */ - mfctr r0 - stw r0,_CTR(r11) bl create_hpte /* add the hash table entry */ #ifdef CONFIG_SMP eieio - addis r8,r7,mmu_hash_lock@ha + lis r8, (mmu_hash_lock - PAGE_OFFSET)@ha li r0,0 - stw r0,mmu_hash_lock@l(r8) + stw r0, (mmu_hash_lock - PAGE_OFFSET)@l(r8) #endif - - /* Return from the exception */ - lwz r5,_CTR(r11) - mtctr r5 - lwz r0,GPR0(r11) - lwz r7,GPR7(r11) - lwz r8,GPR8(r11) - b fast_exception_return + b fast_hash_page_return #ifdef CONFIG_SMP -hash_page_out: +.Lhash_page_out: eieio - addis r8,r7,mmu_hash_lock@ha + lis r8, (mmu_hash_lock - PAGE_OFFSET)@ha li r0,0 - stw r0,mmu_hash_lock@l(r8) + stw r0, (mmu_hash_lock - PAGE_OFFSET)@l(r8) blr #endif /* CONFIG_SMP */ +_ASM_NOKPROBE_SYMBOL(hash_page) /* * Add an entry for a particular page to the hash table. @@ -177,15 +180,8 @@ _GLOBAL(add_hash_page) mflr r0 stw r0,4(r1) - /* Convert context and va to VSID */ - mulli r3,r3,897*16 /* multiply context by context skew */ - rlwinm r0,r4,4,28,31 /* get ESID (top 4 bits of va) */ - mulli r0,r0,0x111 /* multiply by ESID skew */ - add r3,r3,r0 /* note create_hpte trims to 24 bits */ - #ifdef CONFIG_SMP - CURRENT_THREAD_INFO(r8, r1) /* use cpu number to make tag */ - lwz r8,TI_CPU(r8) /* to go in mmu_hash_lock */ + lwz r8,TASK_CPU(r2) /* to go in mmu_hash_lock */ oris r8,r8,12 #endif /* CONFIG_SMP */ @@ -199,25 +195,21 @@ _GLOBAL(add_hash_page) * covered by a BAT). -- paulus */ mfmsr r9 - SYNC rlwinm r0,r9,0,17,15 /* clear bit 16 (MSR_EE) */ rlwinm r0,r0,0,28,26 /* clear MSR_DR */ mtmsr r0 - SYNC_601 isync - tophys(r7,0) - #ifdef CONFIG_SMP - addis r6,r7,mmu_hash_lock@ha - addi r6,r6,mmu_hash_lock@l + lis r6, (mmu_hash_lock - PAGE_OFFSET)@ha + addi r6, r6, (mmu_hash_lock - PAGE_OFFSET)@l 10: lwarx r0,0,r6 /* take the mmu_hash_lock */ - cmpi 0,r0,0 + cmpwi 0,r0,0 bne- 11f stwcx. r8,0,r6 beq+ 12f 11: lwz r0,0(r6) - cmpi 0,r0,0 + cmpwi 0,r0,0 beq 10b b 11b 12: isync @@ -251,12 +243,18 @@ _GLOBAL(add_hash_page) stwcx. 
r5,0,r8 bne- 1b + /* Convert context and va to VSID */ + mulli r3,r3,897*16 /* multiply context by context skew */ + rlwinm r0,r4,4,28,31 /* get ESID (top 4 bits of va) */ + mulli r0,r0,0x111 /* multiply by ESID skew */ + add r3,r3,r0 /* note create_hpte trims to 24 bits */ + bl create_hpte 9: #ifdef CONFIG_SMP - addis r6,r7,mmu_hash_lock@ha - addi r6,r6,mmu_hash_lock@l + lis r6, (mmu_hash_lock - PAGE_OFFSET)@ha + addi r6, r6, (mmu_hash_lock - PAGE_OFFSET)@l eieio li r0,0 stw r0,0(r6) /* clear mmu_hash_lock */ @@ -264,22 +262,20 @@ _GLOBAL(add_hash_page) /* reenable interrupts and DR */ mtmsr r9 - SYNC_601 isync lwz r0,4(r1) mtlr r0 blr +_ASM_NOKPROBE_SYMBOL(add_hash_page) /* * This routine adds a hardware PTE to the hash table. * It is designed to be called with the MMU either on or off. * r3 contains the VSID, r4 contains the virtual address, * r5 contains the linux PTE, r6 contains the old value of the - * linux PTE (before setting _PAGE_HASHPTE) and r7 contains the - * offset to be added to addresses (0 if the MMU is on, - * -KERNELBASE if it is off). r10 contains the upper half of - * the PTE if CONFIG_PTE_64BIT. + * linux PTE (before setting _PAGE_HASHPTE). r10 contains the + * upper half of the PTE if CONFIG_PTE_64BIT. * On SMP, the caller should have the mmu_hash_lock held. * We assume that the caller has (or will) set the _PAGE_HASHPTE * bit in the linux PTE in memory. The value passed in r6 should @@ -290,9 +286,9 @@ _GLOBAL(add_hash_page) * * For speed, 4 of the instructions get patched once the size and * physical address of the hash table are known. These definitions - * of Hash_base and Hash_bits below are just an example. + * of Hash_base and Hash_bits below are for the early hash table. */ -Hash_base = 0xc0180000 +Hash_base = early_hash Hash_bits = 12 /* e.g. 256kB hash table */ Hash_msk = (((1 << Hash_bits) - 1) * 64) @@ -313,15 +309,19 @@ Hash_msk = (((1 << Hash_bits) - 1) * 64) #define HASH_LEFT 31-(LG_PTEG_SIZE+Hash_bits-1) #define HASH_RIGHT 31-LG_PTEG_SIZE +__REF _GLOBAL(create_hpte) /* Convert linux-style PTE (r5) to low word of PPC-style PTE (r8) */ - rlwinm r8,r5,32-10,31,31 /* _PAGE_RW -> PP lsb */ - rlwinm r0,r5,32-7,31,31 /* _PAGE_DIRTY -> PP lsb */ + lis r0, TASK_SIZE@h + rlwinm r5,r5,0,~3 /* Clear PP bits */ + cmplw r4,r0 + rlwinm r8,r5,32-9,30,30 /* _PAGE_WRITE -> PP msb */ + rlwinm r0,r5,32-6,30,30 /* _PAGE_DIRTY -> PP msb */ and r8,r8,r0 /* writable if _RW & _DIRTY */ - rlwimi r5,r5,32-1,30,30 /* _PAGE_USER -> PP msb */ - rlwimi r5,r5,32-2,31,31 /* _PAGE_USER -> PP lsb */ - ori r8,r8,0xe04 /* clear out reserved bits */ - andc r8,r5,r8 /* PP = user? (rw&dirty? 2: 3): 0 */ + bge- 1f /* Kernelspace ? Skip */ + ori r5,r5,3 /* Userspace ? PP = 3 */ +1: ori r8,r8,0xe04 /* clear out reserved bits */ + andc r8,r5,r8 /* PP = user? (rw&dirty? 
1: 3): 0 */ BEGIN_FTR_SECTION rlwinm r8,r8,0,~_PAGE_COHERENT /* clear M (coherence not required) */ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) @@ -336,11 +336,13 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) rlwimi r5,r4,10,26,31 /* put in API (abbrev page index) */ SET_V(r5) /* set V (valid) bit */ + patch_site 0f, patch__hash_page_A0 + patch_site 1f, patch__hash_page_A1 + patch_site 2f, patch__hash_page_A2 /* Get the address of the primary PTE group in the hash table (r3) */ -_GLOBAL(hash_page_patch_A) - addis r0,r7,Hash_base@h /* base address of hash table */ - rlwimi r0,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */ - rlwinm r3,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */ +0: lis r0, (Hash_base - PAGE_OFFSET)@h /* base address of hash table */ +1: rlwimi r0,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */ +2: rlwinm r3,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */ xor r3,r3,r0 /* make primary hash */ li r0,8 /* PTEs/group */ @@ -352,30 +354,25 @@ _GLOBAL(hash_page_patch_A) beq+ 10f /* no PTE: go look for an empty slot */ tlbie r4 - addis r4,r7,htab_hash_searches@ha - lwz r6,htab_hash_searches@l(r4) - addi r6,r6,1 /* count how many searches we do */ - stw r6,htab_hash_searches@l(r4) - /* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */ mtctr r0 addi r4,r3,-HPTE_SIZE 1: LDPTEu r6,HPTE_SIZE(r4) /* get next PTE */ CMPPTE 0,r6,r5 bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */ - beq+ found_slot + beq+ .Lfound_slot + patch_site 0f, patch__hash_page_B /* Search the secondary PTEG for a matching PTE */ ori r5,r5,PTE_H /* set H (secondary hash) bit */ -_GLOBAL(hash_page_patch_B) - xoris r4,r3,Hash_msk>>16 /* compute secondary hash */ +0: xoris r4,r3,Hash_msk>>16 /* compute secondary hash */ xori r4,r4,(-PTEG_SIZE & 0xffff) addi r4,r4,-HPTE_SIZE mtctr r0 2: LDPTEu r6,HPTE_SIZE(r4) CMPPTE 0,r6,r5 bdnzf 2,2b - beq+ found_slot + beq+ .Lfound_slot xori r5,r5,PTE_H /* clear H bit again */ /* Search the primary PTEG for an empty slot */ @@ -384,25 +381,19 @@ _GLOBAL(hash_page_patch_B) 1: LDPTEu r6,HPTE_SIZE(r4) /* get next PTE */ TST_V(r6) /* test valid bit */ bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */ - beq+ found_empty - - /* update counter of times that the primary PTEG is full */ - addis r4,r7,primary_pteg_full@ha - lwz r6,primary_pteg_full@l(r4) - addi r6,r6,1 - stw r6,primary_pteg_full@l(r4) + beq+ .Lfound_empty + patch_site 0f, patch__hash_page_C /* Search the secondary PTEG for an empty slot */ ori r5,r5,PTE_H /* set H (secondary hash) bit */ -_GLOBAL(hash_page_patch_C) - xoris r4,r3,Hash_msk>>16 /* compute secondary hash */ +0: xoris r4,r3,Hash_msk>>16 /* compute secondary hash */ xori r4,r4,(-PTEG_SIZE & 0xffff) addi r4,r4,-HPTE_SIZE mtctr r0 2: LDPTEu r6,HPTE_SIZE(r4) TST_V(r6) bdnzf 2,2b - beq+ found_empty + beq+ .Lfound_empty xori r5,r5,PTE_H /* clear H bit again */ /* @@ -413,36 +404,20 @@ _GLOBAL(hash_page_patch_C) * and we know there is a definite (although small) speed * advantage to putting the PTE in the primary PTEG, we always * put the PTE in the primary PTEG. - * - * In addition, we skip any slot that is mapping kernel text in - * order to avoid a deadlock when not using BAT mappings if - * trying to hash in the kernel hash code itself after it has - * already taken the hash table lock. This works in conjunction - * with pre-faulting of the kernel text. 
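A sketch, for illustration only, of the primary/secondary PTE-group selection that create_hpte performs above with run-time-patched instructions. The geometry uses this file's own example values (Hash_bits = 12, 64-byte PTEGs); the helper name is hypothetical.

#include <stdint.h>
#include <stdio.h>

#define LG_PTEG_SIZE	6			/* 8 HPTEs of 8 bytes each */
#define HASH_BITS	12			/* example: 256 kB hash table */
#define HASH_MASK	((1u << HASH_BITS) - 1)

static uint32_t pteg_offset(uint32_t vsid, uint32_t ea, int secondary)
{
	uint32_t hash = (vsid ^ (ea >> 12)) & HASH_MASK;	/* VSID ^ page index */

	if (secondary)
		hash = ~hash & HASH_MASK;	/* H=1: complemented hash */
	return hash << LG_PTEG_SIZE;		/* byte offset into the table */
}

int main(void)
{
	printf("primary 0x%x, secondary 0x%x\n",
	       pteg_offset(0x44dc, 0xc0001000, 0),
	       pteg_offset(0x44dc, 0xc0001000, 1));
	return 0;
}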
- * - * If the hash table bucket is full of kernel text entries, we'll - * lockup here but that shouldn't happen */ -1: addis r4,r7,next_slot@ha /* get next evict slot */ - lwz r6,next_slot@l(r4) + lis r4, (next_slot - PAGE_OFFSET)@ha /* get next evict slot */ + lwz r6, (next_slot - PAGE_OFFSET)@l(r4) addi r6,r6,HPTE_SIZE /* search for candidate */ andi. r6,r6,7*HPTE_SIZE stw r6,next_slot@l(r4) add r4,r3,r6 - LDPTE r0,HPTE_SIZE/2(r4) /* get PTE second word */ - clrrwi r0,r0,12 - lis r6,etext@h - ori r6,r6,etext@l /* get etext */ - tophys(r6,r6) - cmpl cr0,r0,r6 /* compare and try again */ - blt 1b #ifndef CONFIG_SMP /* Store PTE in PTEG */ -found_empty: +.Lfound_empty: STPTE r5,0(r4) -found_slot: +.Lfound_slot: STPTE r8,HPTE_SIZE/2(r4) #else /* CONFIG_SMP */ @@ -463,8 +438,8 @@ found_slot: * We do however have to make sure that the PTE is never in an invalid * state with the V bit set. */ -found_empty: -found_slot: +.Lfound_empty: +.Lfound_slot: CLR_V(r5,r0) /* clear V (valid) bit in PTE */ STPTE r5,0(r4) sync @@ -477,15 +452,13 @@ found_slot: sync /* make sure pte updates get to memory */ blr + .previous +_ASM_NOKPROBE_SYMBOL(create_hpte) .section .bss .align 2 next_slot: .space 4 -primary_pteg_full: - .space 4 -htab_hash_searches: - .space 4 .previous /* @@ -496,9 +469,8 @@ htab_hash_searches: * * We assume that there is a hash table in use (Hash != 0). */ +__REF _GLOBAL(flush_hash_pages) - tophys(r7,0) - /* * We disable interrupts here, even on UP, because we want * the _PAGE_HASHPTE bit to be a reliable indication of @@ -508,11 +480,9 @@ _GLOBAL(flush_hash_pages) * covered by a BAT). -- paulus */ mfmsr r10 - SYNC rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */ rlwinm r0,r0,0,28,26 /* clear MSR_DR */ mtmsr r0 - SYNC_601 isync /* First find a PTE in the range that has _PAGE_HASHPTE set */ @@ -520,14 +490,15 @@ _GLOBAL(flush_hash_pages) rlwimi r5,r4,22,20,29 #else rlwimi r5,r4,23,20,28 + addi r5,r5,PTE_FLAGS_OFFSET #endif -1: lwz r0,PTE_FLAGS_OFFSET(r5) +1: lwz r0,0(r5) cmpwi cr1,r6,1 andi. r0,r0,_PAGE_HASHPTE bne 2f ble cr1,19f addi r4,r4,0x1000 - addi r5,r5,PTE_SIZE + addi r5,r5,PTE_T_SIZE addi r6,r6,-1 b 1b @@ -543,19 +514,18 @@ _GLOBAL(flush_hash_pages) SET_V(r11) /* set V (valid) bit */ #ifdef CONFIG_SMP - addis r9,r7,mmu_hash_lock@ha - addi r9,r9,mmu_hash_lock@l - CURRENT_THREAD_INFO(r8, r1) - add r8,r8,r7 - lwz r8,TI_CPU(r8) + lis r9, (mmu_hash_lock - PAGE_OFFSET)@ha + addi r9, r9, (mmu_hash_lock - PAGE_OFFSET)@l + tophys (r8, r2) + lwz r8, TASK_CPU(r8) oris r8,r8,9 10: lwarx r0,0,r9 - cmpi 0,r0,0 + cmpwi 0,r0,0 bne- 11f stwcx. r8,0,r9 beq+ 12f 11: lwz r0,0(r9) - cmpi 0,r0,0 + cmpwi 0,r0,0 beq 10b b 11b 12: isync @@ -566,9 +536,6 @@ _GLOBAL(flush_hash_pages) * already clear, we're done (for this pte). If not, * clear it (atomically) and proceed. -- paulus. */ -#if (PTE_FLAGS_OFFSET != 0) - addi r5,r5,PTE_FLAGS_OFFSET -#endif 33: lwarx r8,0,r5 /* fetch the pte flags word */ andi. r0,r8,_PAGE_HASHPTE beq 8f /* done if HASHPTE is already clear */ @@ -576,11 +543,13 @@ _GLOBAL(flush_hash_pages) stwcx. 
r8,0,r5 /* update the pte */ bne- 33b + patch_site 0f, patch__flush_hash_A0 + patch_site 1f, patch__flush_hash_A1 + patch_site 2f, patch__flush_hash_A2 /* Get the address of the primary PTE group in the hash table (r3) */ -_GLOBAL(flush_hash_patch_A) - addis r8,r7,Hash_base@h /* base address of hash table */ - rlwimi r8,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */ - rlwinm r0,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */ +0: lis r8, (Hash_base - PAGE_OFFSET)@h /* base address of hash table */ +1: rlwimi r8,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */ +2: rlwinm r0,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */ xor r8,r0,r8 /* make primary hash */ /* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */ @@ -592,11 +561,11 @@ _GLOBAL(flush_hash_patch_A) bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */ beq+ 3f + patch_site 0f, patch__flush_hash_B /* Search the secondary PTEG for a matching PTE */ ori r11,r11,PTE_H /* set H (secondary hash) bit */ li r0,8 /* PTEs/group */ -_GLOBAL(flush_hash_patch_B) - xoris r12,r8,Hash_msk>>16 /* compute secondary hash */ +0: xoris r12,r8,Hash_msk>>16 /* compute secondary hash */ xori r12,r12,(-PTEG_SIZE & 0xffff) addi r12,r12,-HPTE_SIZE mtctr r0 @@ -614,7 +583,7 @@ _GLOBAL(flush_hash_patch_B) 8: ble cr1,9f /* if all ptes checked */ 81: addi r6,r6,-1 - addi r5,r5,PTE_SIZE + addi r5,r5,PTE_T_SIZE addi r4,r4,0x1000 lwz r0,0(r5) /* check next pte */ cmpwi cr1,r6,1 @@ -630,83 +599,8 @@ _GLOBAL(flush_hash_patch_B) #endif 19: mtmsr r10 - SYNC_601 - isync - blr - -/* - * Flush an entry from the TLB - */ -_GLOBAL(_tlbie) -#ifdef CONFIG_SMP - CURRENT_THREAD_INFO(r8, r1) - lwz r8,TI_CPU(r8) - oris r8,r8,11 - mfmsr r10 - SYNC - rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */ - rlwinm r0,r0,0,28,26 /* clear DR */ - mtmsr r0 - SYNC_601 - isync - lis r9,mmu_hash_lock@h - ori r9,r9,mmu_hash_lock@l - tophys(r9,r9) -10: lwarx r7,0,r9 - cmpwi 0,r7,0 - bne- 10b - stwcx. r8,0,r9 - bne- 10b - eieio - tlbie r3 - sync - TLBSYNC - li r0,0 - stw r0,0(r9) /* clear mmu_hash_lock */ - mtmsr r10 - SYNC_601 - isync -#else /* CONFIG_SMP */ - tlbie r3 - sync -#endif /* CONFIG_SMP */ - blr - -/* - * Flush the entire TLB. 603/603e only - */ -_GLOBAL(_tlbia) -#if defined(CONFIG_SMP) - CURRENT_THREAD_INFO(r8, r1) - lwz r8,TI_CPU(r8) - oris r8,r8,10 - mfmsr r10 - SYNC - rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */ - rlwinm r0,r0,0,28,26 /* clear DR */ - mtmsr r0 - SYNC_601 isync - lis r9,mmu_hash_lock@h - ori r9,r9,mmu_hash_lock@l - tophys(r9,r9) -10: lwarx r7,0,r9 - cmpwi 0,r7,0 - bne- 10b - stwcx. 
r8,0,r9 - bne- 10b - sync - tlbia - sync - TLBSYNC - li r0,0 - stw r0,0(r9) /* clear mmu_hash_lock */ - mtmsr r10 - SYNC_601 - isync -#else /* CONFIG_SMP */ - sync - tlbia - sync -#endif /* CONFIG_SMP */ blr + .previous +EXPORT_SYMBOL(flush_hash_pages) +_ASM_NOKPROBE_SYMBOL(flush_hash_pages) diff --git a/arch/powerpc/mm/book3s32/kuap.c b/arch/powerpc/mm/book3s32/kuap.c new file mode 100644 index 000000000000..3a8815555a48 --- /dev/null +++ b/arch/powerpc/mm/book3s32/kuap.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <asm/kup.h> +#include <asm/smp.h> + +void setup_kuap(bool disabled) +{ + if (!disabled) { + update_user_segments(mfsr(0) | SR_KS); + isync(); /* Context sync required after mtsr() */ + init_mm.context.sr0 |= SR_KS; + current->thread.sr0 |= SR_KS; + } + + if (smp_processor_id() != boot_cpuid) + return; + + if (disabled) + cur_cpu_spec->mmu_features &= ~MMU_FTR_KUAP; + else + pr_info("Activating Kernel Userspace Access Protection\n"); +} diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c new file mode 100644 index 000000000000..c42ecdf94e48 --- /dev/null +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -0,0 +1,446 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * This file contains the routines for handling the MMU on those + * PowerPC implementations where the MMU substantially follows the + * architecture specification. This includes the 6xx, 7xx, 7xxx, + * and 8260 implementations but excludes the 8xx and 4xx. + * -- paulus + * + * Derived from arch/ppc/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + */ + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/highmem.h> +#include <linux/memblock.h> + +#include <asm/mmu.h> +#include <asm/machdep.h> +#include <asm/text-patching.h> +#include <asm/sections.h> + +#include <mm/mmu_decl.h> + +u8 __initdata early_hash[SZ_256K] __aligned(SZ_256K) = {0}; + +static struct hash_pte __initdata *Hash = (struct hash_pte *)early_hash; +static unsigned long __initdata Hash_size, Hash_mask; +static unsigned int __initdata hash_mb, hash_mb2; +unsigned long __initdata _SDR1; + +struct ppc_bat BATS[8][2]; /* 8 pairs of IBAT, DBAT */ + +static struct batrange { /* stores address ranges mapped by BATs */ + unsigned long start; + unsigned long limit; + phys_addr_t phys; +} bat_addrs[8]; + +#ifdef CONFIG_SMP +unsigned long mmu_hash_lock; +#endif + +/* + * Return PA for this VA if it is mapped by a BAT, or 0 + */ +phys_addr_t v_block_mapped(unsigned long va) +{ + int b; + for (b = 0; b < ARRAY_SIZE(bat_addrs); ++b) + if (va >= bat_addrs[b].start && va < bat_addrs[b].limit) + return bat_addrs[b].phys + (va - bat_addrs[b].start); + return 0; +} + +/* + * Return VA for a given PA or 0 if not mapped + */ +unsigned long p_block_mapped(phys_addr_t pa) +{ + int b; + for (b = 0; b < ARRAY_SIZE(bat_addrs); ++b) + if (pa >= bat_addrs[b].phys + && pa < (bat_addrs[b].limit-bat_addrs[b].start) + +bat_addrs[b].phys) + return bat_addrs[b].start+(pa-bat_addrs[b].phys); + return 0; +} + +int __init find_free_bat(void) +{ + int b; + int n = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 
8 : 4; + + for (b = 0; b < n; b++) { + struct ppc_bat *bat = BATS[b]; + + if (!(bat[1].batu & 3)) + return b; + } + return -1; +} + +/* + * This function calculates the size of the larger block usable to map the + * beginning of an area based on the start address and size of that area: + * - max block size is 256 on 6xx. + * - base address must be aligned to the block size. So the maximum block size + * is identified by the lowest bit set to 1 in the base address (for instance + * if base is 0x16000000, max size is 0x02000000). + * - block size has to be a power of two. This is calculated by finding the + * highest bit set to 1. + */ +unsigned int bat_block_size(unsigned long base, unsigned long top) +{ + unsigned int max_size = SZ_256M; + unsigned int base_shift = (ffs(base) - 1) & 31; + unsigned int block_shift = (fls(top - base) - 1) & 31; + + return min3(max_size, 1U << base_shift, 1U << block_shift); +} + +/* + * Set up one of the IBAT (block address translation) register pairs. + * The parameters are not checked; in particular size must be a power + * of 2 between 128k and 256M. + */ +static void setibat(int index, unsigned long virt, phys_addr_t phys, + unsigned int size, pgprot_t prot) +{ + unsigned int bl = (size >> 17) - 1; + int wimgxpp; + struct ppc_bat *bat = BATS[index]; + unsigned long flags = pgprot_val(prot); + + if (!cpu_has_feature(CPU_FTR_NEED_COHERENT)) + flags &= ~_PAGE_COHERENT; + + wimgxpp = (flags & _PAGE_COHERENT) | (_PAGE_EXEC ? BPP_RX : BPP_XX); + bat[0].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */ + bat[0].batl = BAT_PHYS_ADDR(phys) | wimgxpp; + if (!is_kernel_addr(virt)) + bat[0].batu |= 1; /* Vp = 1 */ +} + +static void clearibat(int index) +{ + struct ppc_bat *bat = BATS[index]; + + bat[0].batu = 0; + bat[0].batl = 0; +} + +static unsigned long __init __mmu_mapin_ram(unsigned long base, unsigned long top) +{ + int idx; + + while ((idx = find_free_bat()) != -1 && base != top) { + unsigned int size = bat_block_size(base, top); + + if (size < 128 << 10) + break; + setbat(idx, PAGE_OFFSET + base, base, size, PAGE_KERNEL_X); + base += size; + } + + return base; +} + +unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) +{ + unsigned long done; + unsigned long border = (unsigned long)__srwx_boundary - PAGE_OFFSET; + unsigned long size; + + size = roundup_pow_of_two((unsigned long)_einittext - PAGE_OFFSET); + setibat(0, PAGE_OFFSET, 0, size, PAGE_KERNEL_X); + + if (debug_pagealloc_enabled_or_kfence()) { + pr_debug_once("Read-Write memory mapped without BATs\n"); + if (base >= border) + return base; + if (top >= border) + top = border; + } + + if (!strict_kernel_rwx_enabled() || base >= border || top <= border) + return __mmu_mapin_ram(base, top); + + done = __mmu_mapin_ram(base, border); + if (done != border) + return done; + + return __mmu_mapin_ram(border, top); +} + +static bool is_module_segment(unsigned long addr) +{ + if (!IS_ENABLED(CONFIG_EXECMEM)) + return false; + if (addr < ALIGN_DOWN(MODULES_VADDR, SZ_256M)) + return false; + if (addr > ALIGN(MODULES_END, SZ_256M) - 1) + return false; + return true; +} + +int mmu_mark_initmem_nx(void) +{ + int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 
8 : 4; + int i; + unsigned long base = (unsigned long)_stext - PAGE_OFFSET; + unsigned long top = ALIGN((unsigned long)_etext - PAGE_OFFSET, SZ_128K); + unsigned long border = (unsigned long)__init_begin - PAGE_OFFSET; + unsigned long size; + + for (i = 0; i < nb - 1 && base < top;) { + size = bat_block_size(base, top); + setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_X); + base += size; + } + if (base < top) { + size = bat_block_size(base, top); + if ((top - base) > size) { + size <<= 1; + if (strict_kernel_rwx_enabled() && base + size > border) + pr_warn("Some RW data is getting mapped X. " + "Adjust CONFIG_DATA_SHIFT to avoid that.\n"); + } + setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_X); + base += size; + } + for (; i < nb; i++) + clearibat(i); + + update_bats(); + + BUILD_BUG_ON(ALIGN_DOWN(MODULES_VADDR, SZ_256M) < TASK_SIZE); + + for (i = TASK_SIZE >> 28; i < 16; i++) { + /* Do not set NX on VM space for modules */ + if (is_module_segment(i << 28)) + continue; + + mtsr(mfsr(i << 28) | 0x10000000, i << 28); + } + return 0; +} + +int mmu_mark_rodata_ro(void) +{ + int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; + int i; + + for (i = 0; i < nb; i++) { + struct ppc_bat *bat = BATS[i]; + + if (bat_addrs[i].start < (unsigned long)__end_rodata) + bat[1].batl = (bat[1].batl & ~BPP_RW) | BPP_RX; + } + + update_bats(); + + return 0; +} + +/* + * Set up one of the D BAT (block address translation) register pairs. + * The parameters are not checked; in particular size must be a power + * of 2 between 128k and 256M. + */ +void __init setbat(int index, unsigned long virt, phys_addr_t phys, + unsigned int size, pgprot_t prot) +{ + unsigned int bl; + int wimgxpp; + struct ppc_bat *bat; + unsigned long flags = pgprot_val(prot); + + if (index == -1) + index = find_free_bat(); + if (index == -1) { + pr_err("%s: no BAT available for mapping 0x%llx\n", __func__, + (unsigned long long)phys); + return; + } + bat = BATS[index]; + + if ((flags & _PAGE_NO_CACHE) || + (cpu_has_feature(CPU_FTR_NEED_COHERENT) == 0)) + flags &= ~_PAGE_COHERENT; + + bl = (size >> 17) - 1; + /* Do DBAT first */ + wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE + | _PAGE_COHERENT | _PAGE_GUARDED); + wimgxpp |= (flags & _PAGE_WRITE) ? BPP_RW : BPP_RX; + bat[1].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */ + bat[1].batl = BAT_PHYS_ADDR(phys) | wimgxpp; + if (!is_kernel_addr(virt)) + bat[1].batu |= 1; /* Vp = 1 */ + if (flags & _PAGE_GUARDED) { + /* G bit must be zero in IBATs */ + flags &= ~_PAGE_EXEC; + } + + bat_addrs[index].start = virt; + bat_addrs[index].limit = virt + ((bl + 1) << 17) - 1; + bat_addrs[index].phys = phys; +} + +/* + * Preload a translation in the hash table + */ +static void hash_preload(struct mm_struct *mm, unsigned long ea) +{ + pmd_t *pmd; + + if (!mmu_has_feature(MMU_FTR_HPTE_TABLE)) + return; + pmd = pmd_off(mm, ea); + if (!pmd_none(*pmd)) + add_hash_page(mm->context.id, ea, pmd_val(*pmd)); +} + +/* + * This is called at the end of handling a user page fault, when the + * fault has been handled by updating a PTE in the linux page tables. + * We use it to preload an HPTE into the hash table corresponding to + * the updated linux PTE. + * + * This must always be called with the pte lock held. 
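For the bat_block_size() logic shown earlier in book3s32/mmu.c, a standalone sketch with the worked example from its comment (a base of 0x16000000 caps the block at 0x02000000). Compiler builtins stand in for the kernel's ffs()/fls(); the test harness is hypothetical.

#include <stdio.h>

#define SZ_256M 0x10000000u

static unsigned int bat_block_size(unsigned long base, unsigned long top)
{
	/* base alignment caps the size: lowest set bit of base */
	unsigned int base_shift = (__builtin_ffs((unsigned)base) - 1) & 31;
	/* area length caps it too: highest set bit of (top - base) */
	unsigned int block_shift = (31 - __builtin_clz((unsigned)(top - base))) & 31;
	unsigned int size = SZ_256M;

	if ((1u << base_shift) < size)
		size = 1u << base_shift;
	if ((1u << block_shift) < size)
		size = 1u << block_shift;
	return size;
}

int main(void)
{
	printf("0x%08x\n", bat_block_size(0x16000000, 0x18000000));
	return 0;
}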
+ */ +void __update_mmu_cache(struct vm_area_struct *vma, unsigned long address, + pte_t *ptep) +{ + /* + * We don't need to worry about _PAGE_PRESENT here because we are + * called with either mm->page_table_lock held or ptl lock held + */ + + /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ + if (!pte_young(*ptep) || address >= TASK_SIZE) + return; + + /* We have to test for regs NULL since init will get here first thing at boot */ + if (!current->thread.regs) + return; + + /* We also avoid filling the hash if not coming from a fault */ + if (TRAP(current->thread.regs) != 0x300 && TRAP(current->thread.regs) != 0x400) + return; + + hash_preload(vma->vm_mm, address); +} + +/* + * Initialize the hash table and patch the instructions in hashtable.S. + */ +void __init MMU_init_hw(void) +{ + unsigned int n_hpteg, lg_n_hpteg; + + if (!mmu_has_feature(MMU_FTR_HPTE_TABLE)) + return; + + if ( ppc_md.progress ) ppc_md.progress("hash:enter", 0x105); + +#define LG_HPTEG_SIZE 6 /* 64 bytes per HPTEG */ +#define SDR1_LOW_BITS ((n_hpteg - 1) >> 10) +#define MIN_N_HPTEG 1024 /* min 64kB hash table */ + + /* + * Allow 1 HPTE (1/8 HPTEG) for each page of memory. + * This is less than the recommended amount, but then + * Linux ain't AIX. + */ + n_hpteg = total_memory / (PAGE_SIZE * 8); + if (n_hpteg < MIN_N_HPTEG) + n_hpteg = MIN_N_HPTEG; + lg_n_hpteg = __ilog2(n_hpteg); + if (n_hpteg & (n_hpteg - 1)) { + ++lg_n_hpteg; /* round up if not power of 2 */ + n_hpteg = 1 << lg_n_hpteg; + } + Hash_size = n_hpteg << LG_HPTEG_SIZE; + + /* + * Find some memory for the hash table. + */ + if ( ppc_md.progress ) ppc_md.progress("hash:find piece", 0x322); + Hash = memblock_alloc_or_panic(Hash_size, Hash_size); + _SDR1 = __pa(Hash) | SDR1_LOW_BITS; + + pr_info("Total memory = %lldMB; using %ldkB for hash table\n", + (unsigned long long)(total_memory >> 20), Hash_size >> 10); + + + Hash_mask = n_hpteg - 1; + hash_mb2 = hash_mb = 32 - LG_HPTEG_SIZE - lg_n_hpteg; + if (lg_n_hpteg > 16) + hash_mb2 = 16 - LG_HPTEG_SIZE; +} + +void __init MMU_init_hw_patch(void) +{ + unsigned int hmask = Hash_mask >> (16 - LG_HPTEG_SIZE); + unsigned int hash = (unsigned int)Hash - PAGE_OFFSET; + + if (!mmu_has_feature(MMU_FTR_HPTE_TABLE)) + return; + + if (ppc_md.progress) + ppc_md.progress("hash:patch", 0x345); + if (ppc_md.progress) + ppc_md.progress("hash:done", 0x205); + + /* WARNING: Make sure nothing can trigger a KASAN check past this point */ + + /* + * Patch up the instructions in hashtable.S:create_hpte + */ + modify_instruction_site(&patch__hash_page_A0, 0xffff, hash >> 16); + modify_instruction_site(&patch__hash_page_A1, 0x7c0, hash_mb << 6); + modify_instruction_site(&patch__hash_page_A2, 0x7c0, hash_mb2 << 6); + modify_instruction_site(&patch__hash_page_B, 0xffff, hmask); + modify_instruction_site(&patch__hash_page_C, 0xffff, hmask); + + /* + * Patch up the instructions in hashtable.S:flush_hash_page + */ + modify_instruction_site(&patch__flush_hash_A0, 0xffff, hash >> 16); + modify_instruction_site(&patch__flush_hash_A1, 0x7c0, hash_mb << 6); + modify_instruction_site(&patch__flush_hash_A2, 0x7c0, hash_mb2 << 6); + modify_instruction_site(&patch__flush_hash_B, 0xffff, hmask); +} + +void setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + /* We don't currently support the first MEMBLOCK not mapping 0 + * physical on those processors + */ + BUG_ON(first_memblock_base != 0); + + memblock_set_current_limit(min_t(u64, first_memblock_size, SZ_256M)); +} + +void __init 
print_system_hash_info(void) +{ + pr_info("Hash_size = 0x%lx\n", Hash_size); + if (Hash_mask) + pr_info("Hash_mask = 0x%lx\n", Hash_mask); +} + +void __init early_init_mmu(void) +{ +} diff --git a/arch/powerpc/mm/mmu_context_hash32.c b/arch/powerpc/mm/book3s32/mmu_context.c index 78fef6726e10..1922f9a6b058 100644 --- a/arch/powerpc/mm/mmu_context_hash32.c +++ b/arch/powerpc/mm/book3s32/mmu_context.c @@ -1,8 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * This file contains the routines for handling the MMU on those * PowerPC implementations where the MMU substantially follows the * architecture specification. This includes the 6xx, 7xx, 7xxx, - * 8260, and POWER3 implementations but excludes the 8xx and 4xx. + * and 8260 implementations but excludes the 8xx and 4xx. * -- paulus * * Derived from arch/ppc/mm/init.c: @@ -14,12 +15,6 @@ * * Derived from "arch/i386/mm/init.c" * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * */ #include <linux/mm.h> @@ -27,7 +22,12 @@ #include <linux/export.h> #include <asm/mmu_context.h> -#include <asm/tlbflush.h> + +/* + * Room for two PTE pointers, usually the kernel and current user pointers + * to their respective root page table. + */ +void *abatron_pteptrs[2]; /* * On 32-bit PowerPC 6xx/7xx/7xxx CPUs, we use a set of 16 VSIDs @@ -45,19 +45,6 @@ #define LAST_CONTEXT 32767 #define FIRST_CONTEXT 1 -/* - * This function defines the mapping from contexts to VSIDs (virtual - * segment IDs). We use a skew on both the context and the high 4 bits - * of the 32-bit virtual address (the "effective segment ID") in order - * to spread out the entries in the MMU hash table. Note, if this - * function is changed then arch/ppc/mm/hashtable.S will have to be - * changed to correspond. 
- * - * - * CTX_TO_VSID(ctx, va) (((ctx) * (897 * 16) + ((va) >> 28) * 0x111) \ - * & 0xffffff) - */ - static unsigned long next_mmu_context; static unsigned long context_map[LAST_CONTEXT / BITS_PER_LONG + 1]; @@ -82,6 +69,12 @@ EXPORT_SYMBOL_GPL(__init_new_context); int init_new_context(struct task_struct *t, struct mm_struct *mm) { mm->context.id = __init_new_context(); + mm->context.sr0 = CTX_TO_VSID(mm->context.id, 0); + + if (IS_ENABLED(CONFIG_PPC_KUEP)) + mm->context.sr0 |= SR_NX; + if (!kuap_is_disabled()) + mm->context.sr0 |= SR_KS; return 0; } @@ -117,3 +110,25 @@ void __init mmu_context_init(void) context_map[0] = (1 << FIRST_CONTEXT) - 1; next_mmu_context = FIRST_CONTEXT; } + +void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) +{ + long id = next->context.id; + + if (id < 0) + panic("mm_struct %p has no context ID", next); + + isync(); + + update_user_segments(next->context.sr0); + + if (IS_ENABLED(CONFIG_BDI_SWITCH)) + abatron_pteptrs[1] = next->pgd; + + if (!mmu_has_feature(MMU_FTR_HPTE_TABLE)) + mtspr(SPRN_SDR1, rol32(__pa(next->pgd), 4) & 0xffff01ff); + + mb(); /* sync */ + isync(); +} +EXPORT_SYMBOL(switch_mmu_context); diff --git a/arch/powerpc/mm/book3s32/nohash_low.S b/arch/powerpc/mm/book3s32/nohash_low.S new file mode 100644 index 000000000000..19f418b0ed2d --- /dev/null +++ b/arch/powerpc/mm/book3s32/nohash_low.S @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * This file contains low-level assembler routines for managing + * the PowerPC 603 tlb invalidation. + */ + +#include <asm/page.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> + +/* + * Flush an entry from the TLB + */ +#ifdef CONFIG_SMP +_GLOBAL(_tlbie) + lwz r8,TASK_CPU(r2) + oris r8,r8,11 + mfmsr r10 + rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */ + rlwinm r0,r0,0,28,26 /* clear DR */ + mtmsr r0 + isync + lis r9,mmu_hash_lock@h + ori r9,r9,mmu_hash_lock@l + tophys(r9,r9) +10: lwarx r7,0,r9 + cmpwi 0,r7,0 + bne- 10b + stwcx. r8,0,r9 + bne- 10b + eieio + tlbie r3 + sync + TLBSYNC + li r0,0 + stw r0,0(r9) /* clear mmu_hash_lock */ + mtmsr r10 + isync + blr +_ASM_NOKPROBE_SYMBOL(_tlbie) +#endif /* CONFIG_SMP */ + +/* + * Flush the entire TLB. 603/603e only + */ +_GLOBAL(_tlbia) +#if defined(CONFIG_SMP) + lwz r8,TASK_CPU(r2) + oris r8,r8,10 + mfmsr r10 + rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */ + rlwinm r0,r0,0,28,26 /* clear DR */ + mtmsr r0 + isync + lis r9,mmu_hash_lock@h + ori r9,r9,mmu_hash_lock@l + tophys(r9,r9) +10: lwarx r7,0,r9 + cmpwi 0,r7,0 + bne- 10b + stwcx. r8,0,r9 + bne- 10b +#endif /* CONFIG_SMP */ + li r5, 32 + lis r4, KERNELBASE@h + mtctr r5 + sync +0: tlbie r4 + addi r4, r4, 0x1000 + bdnz 0b + sync +#ifdef CONFIG_SMP + TLBSYNC + li r0,0 + stw r0,0(r9) /* clear mmu_hash_lock */ + mtmsr r10 + isync +#endif /* CONFIG_SMP */ + blr +_ASM_NOKPROBE_SYMBOL(_tlbia) diff --git a/arch/powerpc/mm/book3s32/tlb.c b/arch/powerpc/mm/book3s32/tlb.c new file mode 100644 index 000000000000..e54a7b011232 --- /dev/null +++ b/arch/powerpc/mm/book3s32/tlb.c @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * This file contains the routines for TLB flushing. + * On machines where the MMU uses a hash table to store virtual to + * physical translations, these routines flush entries from the + * hash table also. 
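The CTX_TO_VSID() skew deleted from the comment above lives on in add_hash_page's mulli sequence (897*16 for the context, 0x111 for the ESID). As an illustration, the same formula as a standalone function; the names are hypothetical.

#include <stdio.h>

static unsigned long ctx_to_vsid(unsigned long ctx, unsigned long va)
{
	/* skew both the context and the ESID (top 4 bits of the address)
	 * so that consecutive contexts spread across the hash table */
	return (ctx * (897 * 16) + (va >> 28) * 0x111) & 0xffffff;
}

int main(void)
{
	printf("vsid = 0x%06lx\n", ctx_to_vsid(1, 0xc0000000));
	return 0;
}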
+ * -- paulus + * + * Derived from arch/ppc/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + */ + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/export.h> + +#include <asm/tlbflush.h> +#include <asm/tlb.h> + +#include <mm/mmu_decl.h> + +/* + * TLB flushing: + * + * - flush_tlb_mm(mm) flushes the specified mm context TLB's + * - flush_tlb_page(vma, vmaddr) flushes one page + * - flush_tlb_range(vma, start, end) flushes a range of pages + * - flush_tlb_kernel_range(start, end) flushes kernel pages + * + * since the hardware hash table functions as an extension of the + * tlb as far as the linux tables are concerned, flush it too. + * -- Cort + */ + +/* + * For each address in the range, find the pte for the address + * and check _PAGE_HASHPTE bit; if it is set, find and destroy + * the corresponding HPTE. + */ +void hash__flush_range(struct mm_struct *mm, unsigned long start, unsigned long end) +{ + pmd_t *pmd; + unsigned long pmd_end; + int count; + unsigned int ctx = mm->context.id; + + start &= PAGE_MASK; + if (start >= end) + return; + end = (end - 1) | ~PAGE_MASK; + pmd = pmd_off(mm, start); + for (;;) { + pmd_end = ((start + PGDIR_SIZE) & PGDIR_MASK) - 1; + if (pmd_end > end) + pmd_end = end; + if (!pmd_none(*pmd)) { + count = ((pmd_end - start) >> PAGE_SHIFT) + 1; + flush_hash_pages(ctx, start, pmd_val(*pmd), count); + } + if (pmd_end == end) + break; + start = pmd_end + 1; + ++pmd; + } +} +EXPORT_SYMBOL(hash__flush_range); + +/* + * Flush all the (user) entries for the address space described by mm. + */ +void hash__flush_tlb_mm(struct mm_struct *mm) +{ + struct vm_area_struct *mp; + VMA_ITERATOR(vmi, mm, 0); + + /* + * It is safe to iterate the vmas when called from dup_mmap, + * holding mmap_lock. It would also be safe from unmap_region + * or exit_mmap, but not from vmtruncate on SMP - but it seems + * dup_mmap is the only SMP case which gets here. + */ + for_each_vma(vmi, mp) + hash__flush_range(mp->vm_mm, mp->vm_start, mp->vm_end); +} +EXPORT_SYMBOL(hash__flush_tlb_mm); + +void hash__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ + struct mm_struct *mm; + pmd_t *pmd; + + mm = (vmaddr < TASK_SIZE)? 
vma->vm_mm: &init_mm; + pmd = pmd_off(mm, vmaddr); + if (!pmd_none(*pmd)) + flush_hash_pages(mm->context.id, vmaddr, pmd_val(*pmd), 1); +} +EXPORT_SYMBOL(hash__flush_tlb_page); + +void hash__flush_gather(struct mmu_gather *tlb) +{ + if (tlb->fullmm || tlb->need_flush_all) + hash__flush_tlb_mm(tlb->mm); + else + hash__flush_range(tlb->mm, tlb->start, tlb->end); +} +EXPORT_SYMBOL(hash__flush_gather); diff --git a/arch/powerpc/mm/book3s64/Makefile b/arch/powerpc/mm/book3s64/Makefile new file mode 100644 index 000000000000..33af5795856a --- /dev/null +++ b/arch/powerpc/mm/book3s64/Makefile @@ -0,0 +1,33 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-y += mmu_context.o pgtable.o trace.o +ifdef CONFIG_PPC_64S_HASH_MMU +CFLAGS_REMOVE_slb.o = $(CC_FLAGS_FTRACE) +obj-y += hash_pgtable.o hash_utils.o hash_tlb.o slb.o slice.o +obj-$(CONFIG_PPC_HASH_MMU_NATIVE) += hash_native.o +obj-$(CONFIG_PPC_4K_PAGES) += hash_4k.o +obj-$(CONFIG_PPC_64K_PAGES) += hash_64k.o +obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hash_hugepage.o +obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage_prot.o +endif + +obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o + +obj-$(CONFIG_PPC_RADIX_MMU) += radix_pgtable.o radix_tlb.o +ifdef CONFIG_HUGETLB_PAGE +obj-$(CONFIG_PPC_RADIX_MMU) += radix_hugetlbpage.o +endif +obj-$(CONFIG_SPAPR_TCE_IOMMU) += iommu_api.o +obj-$(CONFIG_PPC_PKEY) += pkeys.o + +# Instrumenting the SLB fault path can lead to duplicate SLB entries +KCOV_INSTRUMENT_slb.o := n + +# Parts of these can run in real mode and therefore are +# not safe with the current outline KASAN implementation +KASAN_SANITIZE_mmu_context.o := n +KASAN_SANITIZE_pgtable.o := n +KASAN_SANITIZE_radix_pgtable.o := n +KASAN_SANITIZE_radix_tlb.o := n +KASAN_SANITIZE_slb.o := n +KASAN_SANITIZE_pkeys.o := n diff --git a/arch/powerpc/mm/book3s64/hash_4k.c b/arch/powerpc/mm/book3s64/hash_4k.c new file mode 100644 index 000000000000..02acbfd05b46 --- /dev/null +++ b/arch/powerpc/mm/book3s64/hash_4k.c @@ -0,0 +1,129 @@ +/* + * Copyright IBM Corporation, 2015 + * Author Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ + +#include <linux/mm.h> +#include <asm/machdep.h> +#include <asm/mmu.h> + +#include "internal.h" + +int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, + pte_t *ptep, unsigned long trap, unsigned long flags, + int ssize, int subpg_prot) +{ + real_pte_t rpte; + unsigned long hpte_group; + unsigned long rflags, pa; + unsigned long old_pte, new_pte; + unsigned long vpn, hash, slot; + unsigned long shift = mmu_psize_defs[MMU_PAGE_4K].shift; + + /* + * atomically mark the linux large page PTE busy and dirty + */ + do { + pte_t pte = READ_ONCE(*ptep); + + old_pte = pte_val(pte); + /* If PTE busy, retry the access */ + if (unlikely(old_pte & H_PAGE_BUSY)) + return 0; + /* If PTE permissions don't match, take page fault */ + if (unlikely(!check_pte_access(access, old_pte))) + return 1; + /* + * Try to lock the PTE, add ACCESSED and DIRTY if it was + * a write access. 
Since this is 4K insert of 64K page size + * also add H_PAGE_COMBO + */ + new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED; + if (access & _PAGE_WRITE) + new_pte |= _PAGE_DIRTY; + } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); + + /* + * PP bits. _PAGE_USER is already PP bit 0x2, so we only + * need to add in 0x1 if it's a read-only user page + */ + rflags = htab_convert_pte_flags(new_pte, flags); + rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE); + + if (cpu_has_feature(CPU_FTR_NOEXECUTE) && + !cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) + rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); + + vpn = hpt_vpn(ea, vsid, ssize); + if (unlikely(old_pte & H_PAGE_HASHPTE)) { + /* + * There MIGHT be an HPTE for this pte + */ + unsigned long gslot = pte_get_hash_gslot(vpn, shift, ssize, + rpte, 0); + + if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, MMU_PAGE_4K, + MMU_PAGE_4K, ssize, flags) == -1) + old_pte &= ~_PAGE_HPTEFLAGS; + } + + if (likely(!(old_pte & H_PAGE_HASHPTE))) { + + pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; + hash = hpt_hash(vpn, shift, ssize); + +repeat: + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; + + /* Insert into the hash table, primary slot */ + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0, + MMU_PAGE_4K, MMU_PAGE_4K, ssize); + /* + * Primary is full, try the secondary + */ + if (unlikely(slot == -1)) { + hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, + rflags, + HPTE_V_SECONDARY, + MMU_PAGE_4K, + MMU_PAGE_4K, ssize); + if (slot == -1) { + if (mftb() & 0x1) + hpte_group = (hash & htab_hash_mask) * + HPTES_PER_GROUP; + mmu_hash_ops.hpte_remove(hpte_group); + /* + * FIXME!! Should be try the group from which we removed ? + */ + goto repeat; + } + } + /* + * Hypervisor failure. Restore old pte and return -1 + * similar to __hash_page_* + */ + if (unlikely(slot == -2)) { + *ptep = __pte(old_pte); + hash_failure_debug(ea, access, vsid, trap, ssize, + MMU_PAGE_4K, MMU_PAGE_4K, old_pte); + return -1; + } + new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; + new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE); + + if (stress_hpt()) + hpt_do_stress(ea, hpte_group); + } + *ptep = __pte(new_pte & ~H_PAGE_BUSY); + return 0; +} diff --git a/arch/powerpc/mm/book3s64/hash_64k.c b/arch/powerpc/mm/book3s64/hash_64k.c new file mode 100644 index 000000000000..954af420f358 --- /dev/null +++ b/arch/powerpc/mm/book3s64/hash_64k.c @@ -0,0 +1,343 @@ +/* + * Copyright IBM Corporation, 2015 + * Author Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ + +#include <linux/mm.h> +#include <asm/machdep.h> +#include <asm/mmu.h> + +#include "internal.h" + +/* + * Return true, if the entry has a slot value which + * the software considers as invalid. 
+ */ +static inline bool hpte_soft_invalid(unsigned long hidx) +{ + return ((hidx & 0xfUL) == 0xfUL); +} + +/* + * index from 0 - 15 + */ +bool __rpte_sub_valid(real_pte_t rpte, unsigned long index) +{ + return !(hpte_soft_invalid(__rpte_to_hidx(rpte, index))); +} + +int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, + pte_t *ptep, unsigned long trap, unsigned long flags, + int ssize, int subpg_prot) +{ + real_pte_t rpte; + unsigned long hpte_group; + unsigned int subpg_index; + unsigned long rflags, pa; + unsigned long old_pte, new_pte, subpg_pte; + unsigned long vpn, hash, slot, gslot; + unsigned long shift = mmu_psize_defs[MMU_PAGE_4K].shift; + + /* + * atomically mark the linux large page PTE busy and dirty + */ + do { + pte_t pte = READ_ONCE(*ptep); + + old_pte = pte_val(pte); + /* If PTE busy, retry the access */ + if (unlikely(old_pte & H_PAGE_BUSY)) + return 0; + /* If PTE permissions don't match, take page fault */ + if (unlikely(!check_pte_access(access, old_pte))) + return 1; + /* + * Try to lock the PTE, add ACCESSED and DIRTY if it was + * a write access. Since this is 4K insert of 64K page size + * also add H_PAGE_COMBO + */ + new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED | H_PAGE_COMBO; + if (access & _PAGE_WRITE) + new_pte |= _PAGE_DIRTY; + } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); + + /* + * Handle the subpage protection bits + */ + subpg_pte = new_pte & ~subpg_prot; + rflags = htab_convert_pte_flags(subpg_pte, flags); + + if (cpu_has_feature(CPU_FTR_NOEXECUTE) && + !cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) { + + /* + * No CPU has hugepages but lacks no execute, so we + * don't need to worry about that case + */ + rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); + } + + subpg_index = (ea & (PAGE_SIZE - 1)) >> shift; + vpn = hpt_vpn(ea, vsid, ssize); + rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE); + /* + *None of the sub 4k page is hashed + */ + if (!(old_pte & H_PAGE_HASHPTE)) + goto htab_insert_hpte; + /* + * Check if the pte was already inserted into the hash table + * as a 64k HW page, and invalidate the 64k HPTE if so. + */ + if (!(old_pte & H_PAGE_COMBO)) { + flush_hash_page(vpn, rpte, MMU_PAGE_64K, ssize, flags); + /* + * clear the old slot details from the old and new pte. + * On hash insert failure we use old pte value and we don't + * want slot information there if we have a insert failure. + */ + old_pte &= ~H_PAGE_HASHPTE; + new_pte &= ~H_PAGE_HASHPTE; + goto htab_insert_hpte; + } + /* + * Check for sub page valid and update + */ + if (__rpte_sub_valid(rpte, subpg_index)) { + int ret; + + gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, + subpg_index); + ret = mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, + MMU_PAGE_4K, MMU_PAGE_4K, + ssize, flags); + + /* + * If we failed because typically the HPTE wasn't really here + * we try an insertion. + */ + if (ret == -1) + goto htab_insert_hpte; + + *ptep = __pte(new_pte & ~H_PAGE_BUSY); + return 0; + } + +htab_insert_hpte: + + /* + * Initialize all hidx entries to invalid value, the first time + * the PTE is about to allocate a 4K HPTE. + */ + if (!(old_pte & H_PAGE_COMBO)) + rpte.hidx = INVALID_RPTE_HIDX; + + /* + * handle H_PAGE_4K_PFN case + */ + if (old_pte & H_PAGE_4K_PFN) { + /* + * All the sub 4k page have the same + * physical address. 
+ */ + pa = pte_pfn(__pte(old_pte)) << HW_PAGE_SHIFT; + } else { + pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; + pa += (subpg_index << shift); + } + hash = hpt_hash(vpn, shift, ssize); +repeat: + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; + + /* Insert into the hash table, primary slot */ + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0, + MMU_PAGE_4K, MMU_PAGE_4K, ssize); + /* + * Primary is full, try the secondary + */ + if (unlikely(slot == -1)) { + bool soft_invalid; + + hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, + rflags, HPTE_V_SECONDARY, + MMU_PAGE_4K, MMU_PAGE_4K, + ssize); + + soft_invalid = hpte_soft_invalid(slot); + if (unlikely(soft_invalid)) { + /* + * We got a valid slot from a hardware point of view. + * but we cannot use it, because we use this special + * value; as defined by hpte_soft_invalid(), to track + * invalid slots. We cannot use it. So invalidate it. + */ + gslot = slot & _PTEIDX_GROUP_IX; + mmu_hash_ops.hpte_invalidate(hpte_group + gslot, vpn, + MMU_PAGE_4K, MMU_PAGE_4K, + ssize, 0); + } + + if (unlikely(slot == -1 || soft_invalid)) { + /* + * For soft invalid slot, let's ensure that we release a + * slot from the primary, with the hope that we will + * acquire that slot next time we try. This will ensure + * that we do not get the same soft-invalid slot. + */ + if (soft_invalid || (mftb() & 0x1)) + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; + + mmu_hash_ops.hpte_remove(hpte_group); + /* + * FIXME!! Should be try the group from which we removed ? + */ + goto repeat; + } + } + /* + * Hypervisor failure. Restore old pte and return -1 + * similar to __hash_page_* + */ + if (unlikely(slot == -2)) { + *ptep = __pte(old_pte); + hash_failure_debug(ea, access, vsid, trap, ssize, + MMU_PAGE_4K, MMU_PAGE_4K, old_pte); + return -1; + } + + new_pte |= pte_set_hidx(ptep, rpte, subpg_index, slot, PTRS_PER_PTE); + new_pte |= H_PAGE_HASHPTE; + + if (stress_hpt()) + hpt_do_stress(ea, hpte_group); + + *ptep = __pte(new_pte & ~H_PAGE_BUSY); + return 0; +} + +int __hash_page_64K(unsigned long ea, unsigned long access, + unsigned long vsid, pte_t *ptep, unsigned long trap, + unsigned long flags, int ssize) +{ + real_pte_t rpte; + unsigned long hpte_group; + unsigned long rflags, pa; + unsigned long old_pte, new_pte; + unsigned long vpn, hash, slot; + unsigned long shift = mmu_psize_defs[MMU_PAGE_64K].shift; + + /* + * atomically mark the linux large page PTE busy and dirty + */ + do { + pte_t pte = READ_ONCE(*ptep); + + old_pte = pte_val(pte); + /* If PTE busy, retry the access */ + if (unlikely(old_pte & H_PAGE_BUSY)) + return 0; + /* If PTE permissions don't match, take page fault */ + if (unlikely(!check_pte_access(access, old_pte))) + return 1; + /* + * Check if PTE has the cache-inhibit bit set + * If so, bail out and refault as a 4k page + */ + if (!mmu_has_feature(MMU_FTR_CI_LARGE_PAGE) && + unlikely(pte_ci(pte))) + return 0; + /* + * Try to lock the PTE, add ACCESSED and DIRTY if it was + * a write access. 
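/*
 * [Editor's sketch -- not part of the commit] hpte_soft_invalid() above
 * reserves the hidx value 0xf as a software "no slot" marker, so a
 * perfectly valid hardware slot that happens to encode as 0xf cannot be
 * kept: the code invalidates it and retries, preferring to evict from
 * the primary bucket so the retry is not handed the same reserved slot
 * again. Heavily simplified control flow; the ex_* helpers are
 * hypothetical stand-ins for the mmu_hash_ops calls.
 */
#include <stdbool.h>

#define EX_SOFT_INVALID 0xfUL

static inline bool ex_soft_invalid(long slot)
{
	return ((unsigned long)slot & 0xfUL) == EX_SOFT_INVALID;
}

long ex_try_insert(void);		/* hypothetical: slot, or -1 if full */
void ex_invalidate_slot(long slot);	/* hypothetical */
void ex_evict_from_primary(void);	/* hypothetical */

static long ex_insert_avoiding_soft_invalid(void)
{
	for (;;) {
		long slot = ex_try_insert();

		if (slot >= 0 && !ex_soft_invalid(slot))
			return slot;		  /* usable slot */
		if (slot >= 0)
			ex_invalidate_slot(slot); /* valid but reserved: free it */
		/*
		 * Make room in the primary bucket so the retry is unlikely
		 * to be handed the same reserved slot back.
		 */
		ex_evict_from_primary();
	}
}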
+ */ + new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED; + if (access & _PAGE_WRITE) + new_pte |= _PAGE_DIRTY; + } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); + + rflags = htab_convert_pte_flags(new_pte, flags); + rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE); + + if (cpu_has_feature(CPU_FTR_NOEXECUTE) && + !cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) + rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); + + vpn = hpt_vpn(ea, vsid, ssize); + if (unlikely(old_pte & H_PAGE_HASHPTE)) { + unsigned long gslot; + + /* + * There MIGHT be an HPTE for this pte + */ + gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, 0); + if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, MMU_PAGE_64K, + MMU_PAGE_64K, ssize, + flags) == -1) + old_pte &= ~_PAGE_HPTEFLAGS; + } + + if (likely(!(old_pte & H_PAGE_HASHPTE))) { + + pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; + hash = hpt_hash(vpn, shift, ssize); + +repeat: + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; + + /* Insert into the hash table, primary slot */ + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0, + MMU_PAGE_64K, MMU_PAGE_64K, + ssize); + /* + * Primary is full, try the secondary + */ + if (unlikely(slot == -1)) { + hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, + rflags, + HPTE_V_SECONDARY, + MMU_PAGE_64K, + MMU_PAGE_64K, ssize); + if (slot == -1) { + if (mftb() & 0x1) + hpte_group = (hash & htab_hash_mask) * + HPTES_PER_GROUP; + mmu_hash_ops.hpte_remove(hpte_group); + /* + * FIXME!! Should be try the group from which we removed ? + */ + goto repeat; + } + } + /* + * Hypervisor failure. Restore old pte and return -1 + * similar to __hash_page_* + */ + if (unlikely(slot == -2)) { + *ptep = __pte(old_pte); + hash_failure_debug(ea, access, vsid, trap, ssize, + MMU_PAGE_64K, MMU_PAGE_64K, old_pte); + return -1; + } + + new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; + new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE); + + if (stress_hpt()) + hpt_do_stress(ea, hpte_group); + } + + *ptep = __pte(new_pte & ~H_PAGE_BUSY); + + return 0; +} diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/book3s64/hash_hugepage.c index 34de9e0cdc34..cdfd4fe75edb 100644 --- a/arch/powerpc/mm/hugepage-hash64.c +++ b/arch/powerpc/mm/book3s64/hash_hugepage.c @@ -1,6 +1,6 @@ /* * Copyright IBM Corporation, 2013 - * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + * Author Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> * * This program is free software; you can redistribute it and/or modify it * under the terms of version 2.1 of the GNU Lesser General Public License @@ -19,8 +19,8 @@ #include <asm/machdep.h> int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, - pmd_t *pmdp, unsigned long trap, int local, int ssize, - unsigned int psize) + pmd_t *pmdp, unsigned long trap, unsigned long flags, + int ssize, unsigned int psize) { unsigned int index, valid; unsigned char *hpte_slot_array; @@ -33,70 +33,81 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, * atomically mark the linux large page PMD busy and dirty */ do { - old_pmd = pmd_val(*pmdp); + pmd_t pmd = READ_ONCE(*pmdp); + + old_pmd = pmd_val(pmd); /* If PMD busy, retry the access */ - if (unlikely(old_pmd & _PAGE_BUSY)) - return 0; - /* If PMD is trans splitting retry the access */ - if (unlikely(old_pmd & _PAGE_SPLITTING)) + if (unlikely(old_pmd & H_PAGE_BUSY)) return 0; /* If PMD permissions don't 
match, take page fault */ - if (unlikely(access & ~old_pmd)) + if (unlikely(!check_pte_access(access, old_pmd))) return 1; /* * Try to lock the PTE, add ACCESSED and DIRTY if it was * a write access */ - new_pmd = old_pmd | _PAGE_BUSY | _PAGE_ACCESSED; - if (access & _PAGE_RW) + new_pmd = old_pmd | H_PAGE_BUSY | _PAGE_ACCESSED; + if (access & _PAGE_WRITE) new_pmd |= _PAGE_DIRTY; - } while (old_pmd != __cmpxchg_u64((unsigned long *)pmdp, - old_pmd, new_pmd)); + } while (!pmd_xchg(pmdp, __pmd(old_pmd), __pmd(new_pmd))); + /* - * PP bits. _PAGE_USER is already PP bit 0x2, so we only - * need to add in 0x1 if it's a read-only user page + * Make sure this is thp or devmap entry */ - rflags = new_pmd & _PAGE_USER; - if ((new_pmd & _PAGE_USER) && !((new_pmd & _PAGE_RW) && - (new_pmd & _PAGE_DIRTY))) - rflags |= 0x1; + if (!(old_pmd & H_PAGE_THP_HUGE)) + return 0; + + rflags = htab_convert_pte_flags(new_pmd, flags); + /* - * _PAGE_EXEC -> HW_NO_EXEC since it's inverted + * THPs are only supported on platforms that can do mixed page size + * segments (MPSS) and all such platforms have coherent icache. Hence we + * don't need to do lazy icache flush (hash_page_do_lazy_icache()) on + * noexecute fault. */ - rflags |= ((new_pmd & _PAGE_EXEC) ? 0 : HPTE_R_N); - -#if 0 - if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) { - /* - * No CPU has hugepages but lacks no execute, so we - * don't need to worry about that case - */ - rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); - } -#endif /* * Find the slot index details for this ea, using base page size. */ shift = mmu_psize_defs[psize].shift; index = (ea & ~HPAGE_PMD_MASK) >> shift; - BUG_ON(index >= 4096); + BUG_ON(index >= PTE_FRAG_SIZE); vpn = hpt_vpn(ea, vsid, ssize); - hash = hpt_hash(vpn, shift, ssize); hpte_slot_array = get_hpte_slot_array(pmdp); + if (psize == MMU_PAGE_4K) { + /* + * invalidate the old hpte entry if we have that mapped via 64K + * base page size. This is because demote_segment won't flush + * hash page table entries. + */ + if ((old_pmd & H_PAGE_HASHPTE) && !(old_pmd & H_PAGE_COMBO)) { + flush_hash_hugepage(vsid, ea, pmdp, MMU_PAGE_64K, + ssize, flags); + /* + * With THP, we also clear the slot information with + * respect to all the 64K hash pte mapping the 16MB + * page. They are all invalid now. This make sure we + * don't find the slot valid when we fault with 4k + * base page size. + * + */ + memset(hpte_slot_array, 0, PTE_FRAG_SIZE); + } + } valid = hpte_valid(hpte_slot_array, index); if (valid) { /* update the hpte bits */ + hash = hpt_hash(vpn, shift, ssize); hidx = hpte_hash_index(hpte_slot_array, index); if (hidx & _PTEIDX_SECONDARY) hash = ~hash; slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; slot += hidx & _PTEIDX_GROUP_IX; - ret = ppc_md.hpte_updatepp(slot, rflags, vpn, - psize, lpsize, ssize, local); + ret = mmu_hash_ops.hpte_updatepp(slot, rflags, vpn, + psize, lpsize, ssize, flags); /* * We failed to update, try to insert a new entry. */ @@ -107,46 +118,39 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, * safely update this here. 
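/*
 * [Editor's sketch -- not part of the commit] The hpte_slot_array
 * bookkeeping used by __hash_page_thp() above keeps one byte per subpage
 * of the huge page, recording whether an HPTE exists and in which slot
 * (get_hpte_slot_array / hpte_valid / hpte_hash_index /
 * mark_hpte_slot_valid). A minimal version with an assumed encoding --
 * bit 0 = valid, remaining bits = slot index -- which may differ in
 * detail from the kernel's.
 */
#include <string.h>

static int ex_hpte_valid(const unsigned char *slot_array, unsigned int index)
{
	return slot_array[index] & 0x1;
}

static unsigned int ex_hpte_hash_index(const unsigned char *slot_array,
				       unsigned int index)
{
	return slot_array[index] >> 1;
}

static void ex_mark_hpte_slot_valid(unsigned char *slot_array,
				    unsigned int index, unsigned int slot)
{
	slot_array[index] = (unsigned char)((slot << 1) | 0x1);
}

static void ex_invalidate_all(unsigned char *slot_array, size_t nr)
{
	/* mirrors the memset(hpte_slot_array, 0, PTE_FRAG_SIZE) above */
	memset(slot_array, 0, nr);
}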
*/ valid = 0; - new_pmd &= ~_PAGE_HPTEFLAGS; hpte_slot_array[index] = 0; - } else - /* clear the busy bits and set the hash pte bits */ - new_pmd = (new_pmd & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE; + } } if (!valid) { unsigned long hpte_group; + hash = hpt_hash(vpn, shift, ssize); /* insert new entry */ pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT; -repeat: - hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL; - - /* clear the busy bits and set the hash pte bits */ - new_pmd = (new_pmd & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE; + new_pmd |= H_PAGE_HASHPTE; - /* Add in WIMG bits */ - rflags |= (new_pmd & (_PAGE_WRITETHRU | _PAGE_NO_CACHE | - _PAGE_COHERENT | _PAGE_GUARDED)); +repeat: + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; /* Insert into the hash table, primary slot */ - slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0, - psize, lpsize, ssize); + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0, + psize, lpsize, ssize); /* * Primary is full, try the secondary */ if (unlikely(slot == -1)) { - hpte_group = ((~hash & htab_hash_mask) * - HPTES_PER_GROUP) & ~0x7UL; - slot = ppc_md.hpte_insert(hpte_group, vpn, pa, - rflags, HPTE_V_SECONDARY, - psize, lpsize, ssize); + hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, + rflags, + HPTE_V_SECONDARY, + psize, lpsize, ssize); if (slot == -1) { if (mftb() & 0x1) - hpte_group = ((hash & htab_hash_mask) * - HPTES_PER_GROUP) & ~0x7UL; + hpte_group = (hash & htab_hash_mask) * + HPTES_PER_GROUP; - ppc_md.hpte_remove(hpte_group); + mmu_hash_ops.hpte_remove(hpte_group); goto repeat; } } @@ -168,8 +172,17 @@ repeat: mark_hpte_slot_valid(hpte_slot_array, index, slot); } /* - * No need to use ldarx/stdcx here + * Mark the pte with H_PAGE_COMBO, if we are trying to hash it with + * base page size 4k. + */ + if (psize == MMU_PAGE_4K) + new_pmd |= H_PAGE_COMBO; + /* + * The hpte valid is stored in the pgtable whose address is in the + * second half of the PMD. Order this against clearing of the busy bit in + * huge pmd. */ - *pmdp = __pmd(new_pmd & ~_PAGE_BUSY); + smp_wmb(); + *pmdp = __pmd(new_pmd & ~H_PAGE_BUSY); return 0; } diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/book3s64/hash_native.c index 3f0c30ae4791..e9e2dd70c060 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/book3s64/hash_native.c @@ -1,13 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * native hashtable management. * * SMP scalability work: * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #undef DEBUG_LOW @@ -15,19 +11,21 @@ #include <linux/spinlock.h> #include <linux/bitops.h> #include <linux/of.h> +#include <linux/processor.h> #include <linux/threads.h> #include <linux/smp.h> +#include <linux/pgtable.h> #include <asm/machdep.h> #include <asm/mmu.h> #include <asm/mmu_context.h> -#include <asm/pgtable.h> -#include <asm/tlbflush.h> +#include <asm/trace.h> #include <asm/tlb.h> #include <asm/cputable.h> #include <asm/udbg.h> #include <asm/kexec.h> #include <asm/ppc-opcode.h> +#include <asm/feature-fixups.h> #ifdef DEBUG_LOW #define DBG_LOW(fmt...) udbg_printf(fmt) @@ -35,20 +33,49 @@ #define DBG_LOW(fmt...) 
#endif +#ifdef __BIG_ENDIAN__ #define HPTE_LOCK_BIT 3 +#else +#define HPTE_LOCK_BIT (56+3) +#endif -DEFINE_RAW_SPINLOCK(native_tlbie_lock); +static DEFINE_RAW_SPINLOCK(native_tlbie_lock); -static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize) +#ifdef CONFIG_LOCKDEP +static struct lockdep_map hpte_lock_map = + STATIC_LOCKDEP_MAP_INIT("hpte_lock", &hpte_lock_map); + +static void acquire_hpte_lock(void) +{ + lock_map_acquire(&hpte_lock_map); +} + +static void release_hpte_lock(void) +{ + lock_map_release(&hpte_lock_map); +} +#else +static void acquire_hpte_lock(void) +{ +} + +static void release_hpte_lock(void) +{ +} +#endif + +static inline unsigned long ___tlbie(unsigned long vpn, int psize, + int apsize, int ssize) { unsigned long va; unsigned int penc; + unsigned long sllp; /* * We need 14 to 65 bits of va for a tlibe of 4K page * With vpn we ignore the lower VPN_SHIFT bits already. * And top two bits are already ignored because we can - * only accomadate 76 bits in a 64 bit vpn with a VPN_SHIFT + * only accomodate 76 bits in a 64 bit vpn with a VPN_SHIFT * of 12. */ va = vpn << VPN_SHIFT; @@ -57,14 +84,16 @@ static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize) * Older versions of the architecture (2.02 and earler) require the * masking of the top 16 bits. */ - va &= ~(0xffffULL << 48); + if (mmu_has_feature(MMU_FTR_TLBIE_CROP_VA)) + va &= ~(0xffffULL << 48); switch (psize) { case MMU_PAGE_4K: /* clear out bits after (52) [0....52.....63] */ va &= ~((1ul << (64 - 52)) - 1); va |= ssize << 8; - va |= mmu_psize_defs[apsize].sllp << 6; + sllp = get_sllp_encoding(apsize); + va |= sllp << 5; asm volatile(ASM_FTR_IFCLR("tlbie %0,0", PPC_TLBIE(%1,%0), %2) : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206) : "memory"); @@ -75,29 +104,68 @@ static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize) va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1); va |= penc << 12; va |= ssize << 8; - /* Add AVAL part */ - if (psize != apsize) { - /* - * MPSS, 64K base page size and 16MB parge page size - * We don't need all the bits, but rest of the bits - * must be ignored by the processor. - * vpn cover upto 65 bits of va. (0...65) and we need - * 58..64 bits of va. - */ - va |= (vpn & 0xfe); - } + /* + * AVAL bits: + * We don't need all the bits, but rest of the bits + * must be ignored by the processor. + * vpn cover upto 65 bits of va. (0...65) and we need + * 58..64 bits of va. 
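/*
 * [Editor's sketch -- not part of the commit] The operand that
 * ___tlbie()/__tlbiel() assemble in 'va' around this point packs several
 * fields: the (possibly cropped) virtual address with in-page offset
 * bits cleared, the segment size at bit 8, the page-size encoding (SLLP
 * or PENC), the AVAL bits, and the L bit for large pages. A simplified,
 * purely illustrative assembly of the large-page form; shifts mirror
 * the code here but this is not a complete encoder.
 */
#include <stdint.h>

static uint64_t ex_tlbie_rb_large(uint64_t vpn, unsigned int vpn_shift,
				  unsigned int ap_shift, unsigned int penc,
				  unsigned int ssize)
{
	uint64_t va = vpn << vpn_shift;

	va &= ~((1ULL << ap_shift) - 1);	/* drop in-page offset bits */
	va |= (uint64_t)penc << 12;		/* actual-page-size encoding */
	va |= (uint64_t)ssize << 8;		/* segment size */
	va |= vpn & 0xfe;			/* AVAL: bits 58..64 of the VA */
	va |= 1;				/* L: this is a large page */
	return va;
}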
+ */ + va |= (vpn & 0xfe); /* AVAL */ va |= 1; /* L */ asm volatile(ASM_FTR_IFCLR("tlbie %0,1", PPC_TLBIE(%1,%0), %2) : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206) : "memory"); break; } + return va; +} + +static inline void fixup_tlbie_vpn(unsigned long vpn, int psize, + int apsize, int ssize) +{ + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { + /* Radix flush for a hash guest */ + + unsigned long rb,rs,prs,r,ric; + + rb = PPC_BIT(52); /* IS = 2 */ + rs = 0; /* lpid = 0 */ + prs = 0; /* partition scoped */ + r = 1; /* radix format */ + ric = 0; /* RIC_FLSUH_TLB */ + + /* + * Need the extra ptesync to make sure we don't + * re-order the tlbie + */ + asm volatile("ptesync": : :"memory"); + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), + "i"(ric), "r"(rs) : "memory"); + } + + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { + /* Need the extra ptesync to ensure we don't reorder tlbie*/ + asm volatile("ptesync": : :"memory"); + ___tlbie(vpn, psize, apsize, ssize); + } +} + +static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize) +{ + unsigned long rb; + + rb = ___tlbie(vpn, psize, apsize, ssize); + trace_tlbie(0, 0, rb, 0, 0, 0, 0); } static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize) { unsigned long va; unsigned int penc; + unsigned long sllp; /* VPN_SHIFT can be atmost 12 */ va = vpn << VPN_SHIFT; @@ -106,16 +174,19 @@ static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize) * Older versions of the architecture (2.02 and earler) require the * masking of the top 16 bits. */ - va &= ~(0xffffULL << 48); + if (mmu_has_feature(MMU_FTR_TLBIE_CROP_VA)) + va &= ~(0xffffULL << 48); switch (psize) { case MMU_PAGE_4K: /* clear out bits after(52) [0....52.....63] */ va &= ~((1ul << (64 - 52)) - 1); va |= ssize << 8; - va |= mmu_psize_defs[apsize].sllp << 6; - asm volatile(".long 0x7c000224 | (%0 << 11) | (0 << 21)" - : : "r"(va) : "memory"); + sllp = get_sllp_encoding(apsize); + va |= sllp << 5; + asm volatile(ASM_FTR_IFSET("tlbiel %0", PPC_TLBIEL_v205(%0, 0), %1) + : : "r" (va), "i" (CPU_FTR_ARCH_206) + : "memory"); break; default: /* We need 14 to 14 + i bits of va */ @@ -123,22 +194,21 @@ static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize) va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1); va |= penc << 12; va |= ssize << 8; - /* Add AVAL part */ - if (psize != apsize) { - /* - * MPSS, 64K base page size and 16MB parge page size - * We don't need all the bits, but rest of the bits - * must be ignored by the processor. - * vpn cover upto 65 bits of va. (0...65) and we need - * 58..64 bits of va. - */ - va |= (vpn & 0xfe); - } + /* + * AVAL bits: + * We don't need all the bits, but rest of the bits + * must be ignored by the processor. + * vpn cover upto 65 bits of va. (0...65) and we need + * 58..64 bits of va. 
+ */ + va |= (vpn & 0xfe); va |= 1; /* L */ - asm volatile(".long 0x7c000224 | (%0 << 11) | (1 << 21)" - : : "r"(va) : "memory"); + asm volatile(ASM_FTR_IFSET("tlbiel %0", PPC_TLBIEL_v205(%0, 1), %1) + : : "r" (va), "i" (CPU_FTR_ARCH_206) + : "memory"); break; } + trace_tlbie(0, 1, va, 0, 0, 0, 0); } @@ -155,9 +225,10 @@ static inline void tlbie(unsigned long vpn, int psize, int apsize, asm volatile("ptesync": : :"memory"); if (use_local) { __tlbiel(vpn, psize, apsize, ssize); - asm volatile("ptesync": : :"memory"); + ppc_after_tlbiel_barrier(); } else { __tlbie(vpn, psize, apsize, ssize); + fixup_tlbie_vpn(vpn, psize, apsize, ssize); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } if (lock_tlbie && !use_local) @@ -166,20 +237,24 @@ static inline void tlbie(unsigned long vpn, int psize, int apsize, static inline void native_lock_hpte(struct hash_pte *hptep) { - unsigned long *word = &hptep->v; + unsigned long *word = (unsigned long *)&hptep->v; + acquire_hpte_lock(); while (1) { if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word)) break; + spin_begin(); while(test_bit(HPTE_LOCK_BIT, word)) - cpu_relax(); + spin_cpu_relax(); + spin_end(); } } static inline void native_unlock_hpte(struct hash_pte *hptep) { - unsigned long *word = &hptep->v; + unsigned long *word = (unsigned long *)&hptep->v; + release_hpte_lock(); clear_bit_unlock(HPTE_LOCK_BIT, word); } @@ -189,8 +264,11 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, { struct hash_pte *hptep = htab_address + hpte_group; unsigned long hpte_v, hpte_r; + unsigned long flags; int i; + local_irq_save(flags); + if (!(vflags & HPTE_V_BOLTED)) { DBG_LOW(" insert(group=%lx, vpn=%016lx, pa=%016lx," " rflags=%lx, vflags=%lx, psize=%d)\n", @@ -198,10 +276,10 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, } for (i = 0; i < HPTES_PER_GROUP; i++) { - if (! (hptep->v & HPTE_V_VALID)) { + if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID)) { /* retry with lock held */ native_lock_hpte(hptep); - if (! (hptep->v & HPTE_V_VALID)) + if (! 
(be64_to_cpu(hptep->v) & HPTE_V_VALID)) break; native_unlock_hpte(hptep); } @@ -209,8 +287,10 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, hptep++; } - if (i == HPTES_PER_GROUP) + if (i == HPTES_PER_GROUP) { + local_irq_restore(flags); return -1; + } hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID; hpte_r = hpte_encode_r(pa, psize, apsize) | rflags; @@ -220,26 +300,36 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, i, hpte_v, hpte_r); } - hptep->r = hpte_r; + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + hpte_r = hpte_old_to_new_r(hpte_v, hpte_r); + hpte_v = hpte_old_to_new_v(hpte_v); + } + + hptep->r = cpu_to_be64(hpte_r); /* Guarantee the second dword is visible before the valid bit */ eieio(); /* * Now set the first dword including the valid bit * NOTE: this also unlocks the hpte */ - hptep->v = hpte_v; + release_hpte_lock(); + hptep->v = cpu_to_be64(hpte_v); __asm__ __volatile__ ("ptesync" : : : "memory"); + local_irq_restore(flags); + return i | (!!(vflags & HPTE_V_SECONDARY) << 3); } static long native_hpte_remove(unsigned long hpte_group) { + unsigned long hpte_v, flags; struct hash_pte *hptep; int i; int slot_offset; - unsigned long hpte_v; + + local_irq_save(flags); DBG_LOW(" remove(group=%lx)\n", hpte_group); @@ -248,12 +338,12 @@ static long native_hpte_remove(unsigned long hpte_group) for (i = 0; i < HPTES_PER_GROUP; i++) { hptep = htab_address + hpte_group + slot_offset; - hpte_v = hptep->v; + hpte_v = be64_to_cpu(hptep->v); if ((hpte_v & HPTE_V_VALID) && !(hpte_v & HPTE_V_BOLTED)) { /* retry with lock held */ native_lock_hpte(hptep); - hpte_v = hptep->v; + hpte_v = be64_to_cpu(hptep->v); if ((hpte_v & HPTE_V_VALID) && !(hpte_v & HPTE_V_BOLTED)) break; @@ -264,31 +354,36 @@ static long native_hpte_remove(unsigned long hpte_group) slot_offset &= 0x7; } - if (i == HPTES_PER_GROUP) - return -1; + if (i == HPTES_PER_GROUP) { + i = -1; + goto out; + } /* Invalidate the hpte. NOTE: this also unlocks it */ + release_hpte_lock(); hptep->v = 0; - +out: + local_irq_restore(flags); return i; } static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, unsigned long vpn, int bpsize, - int apsize, int ssize, int local) + int apsize, int ssize, unsigned long flags) { struct hash_pte *hptep = htab_address + slot; unsigned long hpte_v, want_v; - int ret = 0; + int ret = 0, local = 0; + unsigned long irqflags; + + local_irq_save(irqflags); want_v = hpte_encode_avpn(vpn, bpsize, ssize); DBG_LOW(" update(vpn=%016lx, avpnv=%016lx, group=%lx, newpp=%lx)", vpn, want_v & HPTE_V_AVPN, slot, newpp); - native_lock_hpte(hptep); - - hpte_v = hptep->v; + hpte_v = hpte_get_old_v(hptep); /* * We need to invalidate the TLB always because hpte_remove doesn't do * a tlb invalidate. 
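/*
 * [Editor's sketch -- not part of the commit] native_hpte_insert() above
 * returns the slot index within the group with the "secondary" flag
 * folded into bit 3 (the final "i | (!!(vflags & HPTE_V_SECONDARY) << 3)").
 * That packing is what later lets callers recover both the group offset
 * and which hash bucket (primary or complemented) the entry landed in.
 * The masks below stand in for _PTEIDX_GROUP_IX / _PTEIDX_SECONDARY.
 */
#include <stdbool.h>

#define EX_GROUP_IX_MASK	0x7	/* stand-in for _PTEIDX_GROUP_IX */
#define EX_SECONDARY		0x8	/* stand-in for _PTEIDX_SECONDARY */

static long ex_encode_slot(unsigned int i, bool secondary)
{
	return i | ((long)secondary << 3);
}

static unsigned int ex_slot_group_ix(long slot)
{
	return slot & EX_GROUP_IX_MASK;
}

static bool ex_slot_is_secondary(long slot)
{
	return slot & EX_SECONDARY;
}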
If a hash bucket gets full, we "evict" a more/less @@ -300,36 +395,46 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, DBG_LOW(" -> miss\n"); ret = -1; } else { - DBG_LOW(" -> hit\n"); - /* Update the HPTE */ - hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) | - (newpp & (HPTE_R_PP | HPTE_R_N | HPTE_R_C)); + native_lock_hpte(hptep); + /* recheck with locks held */ + hpte_v = hpte_get_old_v(hptep); + if (unlikely(!HPTE_V_COMPARE(hpte_v, want_v) || + !(hpte_v & HPTE_V_VALID))) { + ret = -1; + } else { + DBG_LOW(" -> hit\n"); + /* Update the HPTE */ + hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) & + ~(HPTE_R_PPP | HPTE_R_N)) | + (newpp & (HPTE_R_PPP | HPTE_R_N | + HPTE_R_C))); + } + native_unlock_hpte(hptep); } - native_unlock_hpte(hptep); - /* Ensure it is out of the tlb too. */ - tlbie(vpn, bpsize, apsize, ssize, local); + if (flags & HPTE_LOCAL_UPDATE) + local = 1; + /* + * Ensure it is out of the tlb too if it is not a nohpte fault + */ + if (!(flags & HPTE_NOHPTE_UPDATE)) + tlbie(vpn, bpsize, apsize, ssize, local); + + local_irq_restore(irqflags); return ret; } -static long native_hpte_find(unsigned long vpn, int psize, int ssize) +static long __native_hpte_find(unsigned long want_v, unsigned long slot) { struct hash_pte *hptep; - unsigned long hash; + unsigned long hpte_v; unsigned long i; - long slot; - unsigned long want_v, hpte_v; - hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize); - want_v = hpte_encode_avpn(vpn, psize, ssize); - - /* Bolted mappings are only ever in the primary group */ - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; for (i = 0; i < HPTES_PER_GROUP; i++) { - hptep = htab_address + slot; - hpte_v = hptep->v; + hptep = htab_address + slot; + hpte_v = hpte_get_old_v(hptep); if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) /* HPTE matches */ return slot; @@ -339,6 +444,33 @@ static long native_hpte_find(unsigned long vpn, int psize, int ssize) return -1; } +static long native_hpte_find(unsigned long vpn, int psize, int ssize) +{ + unsigned long hpte_group; + unsigned long want_v; + unsigned long hash; + long slot; + + hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize); + want_v = hpte_encode_avpn(vpn, psize, ssize); + + /* + * We try to keep bolted entries always in primary hash + * But in some case we can find them in secondary too. + */ + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot = __native_hpte_find(want_v, hpte_group); + if (slot < 0) { + /* Try in secondary */ + hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; + slot = __native_hpte_find(want_v, hpte_group); + if (slot < 0) + return -1; + } + + return slot; +} + /* * Update the page protection bits. Intended to be used to create * guard pages for kernel data structures on pages which are bolted @@ -353,6 +485,9 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, unsigned long vsid; long slot; struct hash_pte *hptep; + unsigned long flags; + + local_irq_save(flags); vsid = get_kernel_vsid(ea, ssize); vpn = hpt_vpn(ea, vsid, ssize); @@ -363,15 +498,56 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, hptep = htab_address + slot; /* Update the HPTE */ - hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) | - (newpp & (HPTE_R_PP | HPTE_R_N)); + hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) & + ~(HPTE_R_PPP | HPTE_R_N)) | + (newpp & (HPTE_R_PPP | HPTE_R_N))); /* * Ensure it is out of the tlb too. Bolted entries base and * actual page size will be same. 
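/*
 * [Editor's sketch -- not part of the commit] The reworked
 * native_hpte_updatepp() above avoids taking the per-HPTE lock on a
 * miss: it reads the valid/AVPN word lock-free, and only when the entry
 * matches does it take the lock and re-check before modifying. The same
 * double-checked shape in portable C; a pthread mutex (initialise with
 * PTHREAD_MUTEX_INITIALIZER) stands in for the in-word HPTE lock bit.
 */
#include <stdint.h>
#include <pthread.h>

struct ex_hpte {
	uint64_t v;		/* dword 0: valid bit + AVPN */
	uint64_t r;		/* dword 1: protection bits etc. */
	pthread_mutex_t lock;	/* stand-in for the HPTE_LOCK_BIT */
};

#define EX_V_VALID (1ULL << 0)

static int ex_match(uint64_t v, uint64_t want_v)
{
	return (v & EX_V_VALID) && ((v & ~EX_V_VALID) == want_v);
}

static int ex_updatepp(struct ex_hpte *h, uint64_t want_v, uint64_t newpp)
{
	/* fast path: look without locking; a miss costs no atomics */
	if (!ex_match(__atomic_load_n(&h->v, __ATOMIC_ACQUIRE), want_v))
		return -1;

	pthread_mutex_lock(&h->lock);
	/* re-check under the lock: the entry may have been evicted meanwhile */
	if (!ex_match(h->v, want_v)) {
		pthread_mutex_unlock(&h->lock);
		return -1;
	}
	h->r = newpp;		/* safe: we own the entry */
	pthread_mutex_unlock(&h->lock);
	return 0;
}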
*/ tlbie(vpn, psize, psize, ssize, 0); + + local_irq_restore(flags); +} + +/* + * Remove a bolted kernel entry. Memory hotplug uses this. + * + * No need to lock here because we should be the only user. + */ +static int native_hpte_removebolted(unsigned long ea, int psize, int ssize) +{ + unsigned long vpn; + unsigned long vsid; + long slot; + struct hash_pte *hptep; + unsigned long flags; + + local_irq_save(flags); + + vsid = get_kernel_vsid(ea, ssize); + vpn = hpt_vpn(ea, vsid, ssize); + + slot = native_hpte_find(vpn, psize, ssize); + if (slot == -1) + return -ENOENT; + + hptep = htab_address + slot; + + VM_WARN_ON(!(be64_to_cpu(hptep->v) & HPTE_V_BOLTED)); + + /* Invalidate the hpte */ + hptep->v = 0; + + /* Invalidate the TLB */ + tlbie(vpn, psize, psize, ssize, 0); + + local_irq_restore(flags); + + return 0; } + static void native_hpte_invalidate(unsigned long slot, unsigned long vpn, int bpsize, int apsize, int ssize, int local) { @@ -385,9 +561,20 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn, DBG_LOW(" invalidate(vpn=%016lx, hash: %lx)\n", vpn, slot); want_v = hpte_encode_avpn(vpn, bpsize, ssize); - native_lock_hpte(hptep); - hpte_v = hptep->v; + hpte_v = hpte_get_old_v(hptep); + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { + native_lock_hpte(hptep); + /* recheck with locks held */ + hpte_v = hpte_get_old_v(hptep); + + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { + /* Invalidate the hpte. NOTE: this also unlocks it */ + release_hpte_lock(); + hptep->v = 0; + } else + native_unlock_hpte(hptep); + } /* * We need to invalidate the TLB always because hpte_remove doesn't do * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less @@ -395,30 +582,24 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn, * (hpte_remove) because we assume the old translation is still * technically "valid". */ - if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) - native_unlock_hpte(hptep); - else - /* Invalidate the hpte. 
NOTE: this also unlocks it */ - hptep->v = 0; - - /* Invalidate the TLB */ tlbie(vpn, bpsize, apsize, ssize, local); local_irq_restore(flags); } -static void native_hugepage_invalidate(struct mm_struct *mm, +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void native_hugepage_invalidate(unsigned long vsid, + unsigned long addr, unsigned char *hpte_slot_array, - unsigned long addr, int psize) + int psize, int ssize, int local) { - int ssize = 0, i; - int lock_tlbie; + int i; struct hash_pte *hptep; int actual_psize = MMU_PAGE_16M; unsigned int max_hpte_count, valid; unsigned long flags, s_addr = addr; unsigned long hpte_v, want_v, shift; - unsigned long hidx, vpn = 0, vsid, hash, slot; + unsigned long hidx, vpn = 0, hash, slot; shift = mmu_psize_defs[psize].shift; max_hpte_count = 1U << (PMD_SHIFT - shift); @@ -432,15 +613,6 @@ static void native_hugepage_invalidate(struct mm_struct *mm, /* get the vpn */ addr = s_addr + (i * (1ul << shift)); - if (!is_kernel_addr(addr)) { - ssize = user_segment_size(addr); - vsid = get_vsid(mm->context.id, addr, ssize); - WARN_ON(vsid == 0); - } else { - vsid = get_kernel_vsid(addr, mmu_kernel_ssize); - ssize = mmu_kernel_ssize; - } - vpn = hpt_vpn(addr, vsid, ssize); hash = hpt_hash(vpn, shift, ssize); if (hidx & _PTEIDX_SECONDARY) @@ -451,88 +623,61 @@ static void native_hugepage_invalidate(struct mm_struct *mm, hptep = htab_address + slot; want_v = hpte_encode_avpn(vpn, psize, ssize); - native_lock_hpte(hptep); - hpte_v = hptep->v; + hpte_v = hpte_get_old_v(hptep); /* Even if we miss, we need to invalidate the TLB */ - if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) - native_unlock_hpte(hptep); - else - /* Invalidate the hpte. NOTE: this also unlocks it */ - hptep->v = 0; - } - /* - * Since this is a hugepage, we just need a single tlbie. - * use the last vpn. - */ - lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); - if (lock_tlbie) - raw_spin_lock(&native_tlbie_lock); - - asm volatile("ptesync":::"memory"); - __tlbie(vpn, psize, actual_psize, ssize); - asm volatile("eieio; tlbsync; ptesync":::"memory"); - - if (lock_tlbie) - raw_spin_unlock(&native_tlbie_lock); - - local_irq_restore(flags); -} - -static inline int __hpte_actual_psize(unsigned int lp, int psize) -{ - int i, shift; - unsigned int mask; - - /* start from 1 ignoring MMU_PAGE_4K */ - for (i = 1; i < MMU_PAGE_COUNT; i++) { + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { + /* recheck with locks held */ + native_lock_hpte(hptep); + hpte_v = hpte_get_old_v(hptep); - /* invalid penc */ - if (mmu_psize_defs[psize].penc[i] == -1) - continue; + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { + /* Invalidate the hpte. NOTE: this also unlocks it */ + release_hpte_lock(); + hptep->v = 0; + } else + native_unlock_hpte(hptep); + } /* - * encoding bits per actual page size - * PTE LP actual page size - * rrrr rrrz >=8KB - * rrrr rrzz >=16KB - * rrrr rzzz >=32KB - * rrrr zzzz >=64KB - * ....... 
+ * We need to do tlb invalidate for all the address, tlbie + * instruction compares entry_VA in tlb with the VA specified + * here */ - shift = mmu_psize_defs[i].shift - LP_SHIFT; - if (shift > LP_BITS) - shift = LP_BITS; - mask = (1 << shift) - 1; - if ((lp & mask) == mmu_psize_defs[psize].penc[i]) - return i; + tlbie(vpn, psize, actual_psize, ssize, local); } - return -1; + local_irq_restore(flags); +} +#else +static void native_hugepage_invalidate(unsigned long vsid, + unsigned long addr, + unsigned char *hpte_slot_array, + int psize, int ssize, int local) +{ + WARN(1, "%s called without THP support\n", __func__); } +#endif static void hpte_decode(struct hash_pte *hpte, unsigned long slot, int *psize, int *apsize, int *ssize, unsigned long *vpn) { unsigned long avpn, pteg, vpi; - unsigned long hpte_v = hpte->v; + unsigned long hpte_v = be64_to_cpu(hpte->v); + unsigned long hpte_r = be64_to_cpu(hpte->r); unsigned long vsid, seg_off; int size, a_size, shift; /* Look at the 8 bit LP value */ - unsigned int lp = (hpte->r >> LP_SHIFT) & ((1 << LP_BITS) - 1); + unsigned int lp = (hpte_r >> LP_SHIFT) & ((1 << LP_BITS) - 1); + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + hpte_v = hpte_new_to_old_v(hpte_v, hpte_r); + hpte_r = hpte_new_to_old_r(hpte_r); + } if (!(hpte_v & HPTE_V_LARGE)) { size = MMU_PAGE_4K; a_size = MMU_PAGE_4K; } else { - for (size = 0; size < MMU_PAGE_COUNT; size++) { - - /* valid entries have a shift value */ - if (!mmu_psize_defs[size].shift) - continue; - - a_size = __hpte_actual_psize(lp, size); - if (a_size != -1) - break; - } + size = hpte_page_sizes[lp] & 0xf; + a_size = hpte_page_sizes[lp] >> 4; } /* This works for all page sizes, and for 256M and 1T segments */ *ssize = hpte_v >> HPTE_V_SSIZE_SHIFT; @@ -554,6 +699,7 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot, seg_off |= vpi << shift; } *vpn = vsid << (SID_SHIFT - VPN_SHIFT) | seg_off >> VPN_SHIFT; + break; case MMU_SEGSIZE_1T: /* We only have 40 - 23 bits of seg_off in avpn */ seg_off = (avpn & 0x1ffff) << 23; @@ -563,6 +709,7 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot, seg_off |= vpi << shift; } *vpn = vsid << (SID_SHIFT_1T - VPN_SHIFT) | seg_off >> VPN_SHIFT; + break; default: *vpn = size = 0; } @@ -575,13 +722,21 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot, * be when they isi), and we are the only one left. We rely on our kernel * mapping being 0xC0's and the hardware ignoring those two real bits. * + * This must be called with interrupts disabled. + * + * Taking the native_tlbie_lock is unsafe here due to the possibility of + * lockdep being on. On pre POWER5 hardware, not taking the lock could + * cause deadlock. POWER5 and newer not taking the lock is fine. This only + * gets called during boot before secondary CPUs have come up and during + * crashdump and all bets are off anyway. + * * TODO: add batching support when enabled. remember, no dynamic memory here, - * athough there is the control page available... + * although there is the control page available... */ -static void native_hpte_clear(void) +static notrace void native_hpte_clear(void) { unsigned long vpn = 0; - unsigned long slot, slots, flags; + unsigned long slot, slots; struct hash_pte *hptep = htab_address; unsigned long hpte_v; unsigned long pteg_count; @@ -589,13 +744,6 @@ static void native_hpte_clear(void) pteg_count = htab_hash_mask + 1; - local_irq_save(flags); - - /* we take the tlbie lock and hold it. 
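/*
 * [Editor's sketch -- not part of the commit] hpte_decode() above
 * replaces the old per-size probe loop (__hpte_actual_psize) with a
 * 256-entry lookup indexed by the HPTE's 8-bit LP field, packing the
 * base page size in the low nibble and the actual page size in the high
 * nibble. Decoding then becomes two masks/shifts. The table below is a
 * hypothetical stand-in; the kernel builds hpte_page_sizes[] at boot.
 */
extern const unsigned char ex_hpte_page_sizes[256];	/* hypothetical */

static void ex_decode_lp(unsigned char lp, int *base_psize, int *actual_psize)
{
	*base_psize   = ex_hpte_page_sizes[lp] & 0xf;	/* low nibble  */
	*actual_psize = ex_hpte_page_sizes[lp] >> 4;	/* high nibble */
}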
Some hardware will - * deadlock if we try to tlbie from two processors at once. - */ - raw_spin_lock(&native_tlbie_lock); - slots = pteg_count * HPTES_PER_GROUP; for (slot = 0; slot < slots; slot++, hptep++) { @@ -604,22 +752,20 @@ static void native_hpte_clear(void) * running, right? and for crash dump, we probably * don't want to wait for a maybe bad cpu. */ - hpte_v = hptep->v; + hpte_v = be64_to_cpu(hptep->v); /* - * Call __tlbie() here rather than tlbie() since we - * already hold the native_tlbie_lock. + * Call __tlbie() here rather than tlbie() since we can't take the + * native_tlbie_lock. */ if (hpte_v & HPTE_V_VALID) { hpte_decode(hptep, slot, &psize, &apsize, &ssize, &vpn); hptep->v = 0; - __tlbie(vpn, psize, apsize, ssize); + ___tlbie(vpn, psize, apsize, ssize); } } asm volatile("eieio; tlbsync; ptesync":::"memory"); - raw_spin_unlock(&native_tlbie_lock); - local_irq_restore(flags); } /* @@ -628,14 +774,14 @@ static void native_hpte_clear(void) */ static void native_flush_hash_range(unsigned long number, int local) { - unsigned long vpn; + unsigned long vpn = 0; unsigned long hash, index, hidx, shift, slot; struct hash_pte *hptep; unsigned long hpte_v; unsigned long want_v; unsigned long flags; real_pte_t pte; - struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch); + struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch); unsigned long psize = batch->psize; int ssize = batch->ssize; int i; @@ -655,13 +801,21 @@ static void native_flush_hash_range(unsigned long number, int local) slot += hidx & _PTEIDX_GROUP_IX; hptep = htab_address + slot; want_v = hpte_encode_avpn(vpn, psize, ssize); + hpte_v = hpte_get_old_v(hptep); + + if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) + continue; + /* lock and try again */ native_lock_hpte(hptep); - hpte_v = hptep->v; - if (!HPTE_V_COMPARE(hpte_v, want_v) || - !(hpte_v & HPTE_V_VALID)) + hpte_v = hpte_get_old_v(hptep); + + if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) native_unlock_hpte(hptep); - else + else { + release_hpte_lock(); hptep->v = 0; + } + } pte_iterate_hashed_end(); } @@ -677,7 +831,7 @@ static void native_flush_hash_range(unsigned long number, int local) __tlbiel(vpn, psize, psize, ssize); } pte_iterate_hashed_end(); } - asm volatile("ptesync":::"memory"); + ppc_after_tlbiel_barrier(); } else { int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); @@ -694,6 +848,10 @@ static void native_flush_hash_range(unsigned long number, int local) __tlbie(vpn, psize, psize, ssize); } pte_iterate_hashed_end(); } + /* + * Just do one more with the last used values. 
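/*
 * [Editor's sketch -- not part of the commit] The global-flush tail of
 * native_flush_hash_range() around this point issues one tlbie per VPN
 * in the batch and then, on CPUs needing the P9 tlbie workaround, one
 * extra fixup flush with the last-used values before the closing
 * barrier sequence. Structure only -- the real code is inline asm, and
 * the ex_* barrier/flush hooks here are hypothetical.
 */
#include <stdbool.h>
#include <stddef.h>

void ex_ptesync(void);			/* hypothetical barrier wrappers */
void ex_eieio_tlbsync_ptesync(void);
void ex_tlbie(unsigned long vpn);	/* hypothetical single invalidation */
bool ex_cpu_needs_tlbie_fixup(void);

struct ex_batch {
	size_t n;
	unsigned long vpn[192];		/* arbitrary capacity for the sketch */
};

static void ex_flush_batch_global(const struct ex_batch *b)
{
	unsigned long last = 0;

	ex_ptesync();			/* order prior PTE updates first */
	for (size_t i = 0; i < b->n; i++) {
		last = b->vpn[i];
		ex_tlbie(last);
	}
	/* erratum workaround: "one more with the last used values" */
	if (b->n && ex_cpu_needs_tlbie_fixup())
		ex_tlbie(last);
	ex_eieio_tlbsync_ptesync();	/* make invalidations globally visible */
}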
+ */ + fixup_tlbie_vpn(vpn, psize, psize, ssize); asm volatile("eieio; tlbsync; ptesync":::"memory"); if (lock_tlbie) @@ -705,12 +863,13 @@ static void native_flush_hash_range(unsigned long number, int local) void __init hpte_init_native(void) { - ppc_md.hpte_invalidate = native_hpte_invalidate; - ppc_md.hpte_updatepp = native_hpte_updatepp; - ppc_md.hpte_updateboltedpp = native_hpte_updateboltedpp; - ppc_md.hpte_insert = native_hpte_insert; - ppc_md.hpte_remove = native_hpte_remove; - ppc_md.hpte_clear_all = native_hpte_clear; - ppc_md.flush_hash_range = native_flush_hash_range; - ppc_md.hugepage_invalidate = native_hugepage_invalidate; + mmu_hash_ops.hpte_invalidate = native_hpte_invalidate; + mmu_hash_ops.hpte_updatepp = native_hpte_updatepp; + mmu_hash_ops.hpte_updateboltedpp = native_hpte_updateboltedpp; + mmu_hash_ops.hpte_removebolted = native_hpte_removebolted; + mmu_hash_ops.hpte_insert = native_hpte_insert; + mmu_hash_ops.hpte_remove = native_hpte_remove; + mmu_hash_ops.hpte_clear_all = native_hpte_clear; + mmu_hash_ops.flush_hash_range = native_flush_hash_range; + mmu_hash_ops.hugepage_invalidate = native_hugepage_invalidate; } diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c new file mode 100644 index 000000000000..82d31177630b --- /dev/null +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c @@ -0,0 +1,563 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2005, Paul Mackerras, IBM Corporation. + * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation. + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. + */ + +#include <linux/sched.h> +#include <linux/mm_types.h> +#include <linux/mm.h> +#include <linux/stop_machine.h> + +#include <asm/sections.h> +#include <asm/mmu.h> +#include <asm/tlb.h> +#include <asm/firmware.h> + +#include <mm/mmu_decl.h> + +#include <trace/events/thp.h> + +#if H_PGTABLE_RANGE > (USER_VSID_RANGE * (TASK_SIZE_USER64 / TASK_CONTEXT_SIZE)) +#warning Limited user VSID range means pagetable space is wasted +#endif + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +/* + * vmemmap is the starting address of the virtual address space where + * struct pages are allocated for all possible PFNs present on the system + * including holes and bad memory (hence sparse). These virtual struct + * pages are stored in sequence in this virtual address space irrespective + * of the fact whether the corresponding PFN is valid or not. This achieves + * constant relationship between address of struct page and its PFN. + * + * During boot or memory hotplug operation when a new memory section is + * added, physical memory allocation (including hash table bolting) will + * be performed for the set of struct pages which are part of the memory + * section. This saves memory by not allocating struct pages for PFNs + * which are not valid. 
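/*
 * [Editor's sketch -- not part of the commit] The "constant relationship
 * between address of struct page and its PFN" described above is plain
 * array arithmetic: with a virtually contiguous vmemmap, converting
 * between a struct page and its PFN is an index computation, regardless
 * of which PFNs are actually backed by memory. The base address is the
 * one from the diagram below; the struct layout is a placeholder.
 */
struct ex_page { unsigned long flags; /* ... */ };

static struct ex_page *const ex_vmemmap =
	(struct ex_page *)0xf000000000000000UL;	/* illustrative base */

static struct ex_page *ex_pfn_to_page(unsigned long pfn)
{
	return ex_vmemmap + pfn;		/* &vmemmap[pfn] */
}

static unsigned long ex_page_to_pfn(const struct ex_page *page)
{
	return (unsigned long)(page - ex_vmemmap);
}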
+ * + * ---------------------------------------------- + * | PHYSICAL ALLOCATION OF VIRTUAL STRUCT PAGES| + * ---------------------------------------------- + * + * f000000000000000 c000000000000000 + * vmemmap +--------------+ +--------------+ + * + | page struct | +--------------> | page struct | + * | +--------------+ +--------------+ + * | | page struct | +--------------> | page struct | + * | +--------------+ | +--------------+ + * | | page struct | + +------> | page struct | + * | +--------------+ | +--------------+ + * | | page struct | | +--> | page struct | + * | +--------------+ | | +--------------+ + * | | page struct | | | + * | +--------------+ | | + * | | page struct | | | + * | +--------------+ | | + * | | page struct | | | + * | +--------------+ | | + * | | page struct | | | + * | +--------------+ | | + * | | page struct | +-------+ | + * | +--------------+ | + * | | page struct | +-----------+ + * | +--------------+ + * | | page struct | No mapping + * | +--------------+ + * | | page struct | No mapping + * v +--------------+ + * + * ----------------------------------------- + * | RELATION BETWEEN STRUCT PAGES AND PFNS| + * ----------------------------------------- + * + * vmemmap +--------------+ +---------------+ + * + | page struct | +-------------> | PFN | + * | +--------------+ +---------------+ + * | | page struct | +-------------> | PFN | + * | +--------------+ +---------------+ + * | | page struct | +-------------> | PFN | + * | +--------------+ +---------------+ + * | | page struct | +-------------> | PFN | + * | +--------------+ +---------------+ + * | | | + * | +--------------+ + * | | | + * | +--------------+ + * | | | + * | +--------------+ +---------------+ + * | | page struct | +-------------> | PFN | + * | +--------------+ +---------------+ + * | | | + * | +--------------+ + * | | | + * | +--------------+ +---------------+ + * | | page struct | +-------------> | PFN | + * | +--------------+ +---------------+ + * | | page struct | +-------------> | PFN | + * v +--------------+ +---------------+ + */ +/* + * On hash-based CPUs, the vmemmap is bolted in the hash table. 
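/*
 * [Editor's sketch -- not part of the commit] Because the vmemmap is
 * bolted, the hash__vmemmap_create_mapping() that follows uses a small
 * create-or-unwind pattern: bolt the range, and on failure remove
 * whatever was partially inserted before propagating the error. The
 * ex_* hooks below are hypothetical stand-ins for
 * htab_bolt_mapping()/htab_remove_mapping().
 */
int ex_bolt(unsigned long start, unsigned long end, unsigned long phys);
int ex_unbolt(unsigned long start, unsigned long end);

static int ex_create_mapping(unsigned long start, unsigned long size,
			     unsigned long phys)
{
	int rc = ex_bolt(start, start + size, phys);

	if (rc < 0)
		/*
		 * Undo any partial work; "nothing to remove" (ENOENT in the
		 * real code) is not treated as a further error here.
		 */
		(void)ex_unbolt(start, start + size);
	return rc;
}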
+ * + */ +int __meminit hash__vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys) +{ + int rc; + + if ((start + page_size) >= H_VMEMMAP_END) { + pr_warn("Outside the supported range\n"); + return -1; + } + + rc = htab_bolt_mapping(start, start + page_size, phys, + pgprot_val(PAGE_KERNEL), + mmu_vmemmap_psize, mmu_kernel_ssize); + if (rc < 0) { + int rc2 = htab_remove_mapping(start, start + page_size, + mmu_vmemmap_psize, + mmu_kernel_ssize); + BUG_ON(rc2 && (rc2 != -ENOENT)); + } + return rc; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +void hash__vmemmap_remove_mapping(unsigned long start, + unsigned long page_size) +{ + int rc = htab_remove_mapping(start, start + page_size, + mmu_vmemmap_psize, + mmu_kernel_ssize); + BUG_ON((rc < 0) && (rc != -ENOENT)); + WARN_ON(rc == -ENOENT); +} +#endif +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + +/* + * map_kernel_page currently only called by __ioremap + * map_kernel_page adds an entry to the ioremap page table + * and adds an entry to the HPT, possibly bolting it + */ +int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) +{ + pgd_t *pgdp; + p4d_t *p4dp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE); + if (slab_is_available()) { + pgdp = pgd_offset_k(ea); + p4dp = p4d_offset(pgdp, ea); + pudp = pud_alloc(&init_mm, p4dp, ea); + if (!pudp) + return -ENOMEM; + pmdp = pmd_alloc(&init_mm, pudp, ea); + if (!pmdp) + return -ENOMEM; + ptep = pte_alloc_kernel(pmdp, ea); + if (!ptep) + return -ENOMEM; + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot)); + } else { + /* + * If the mm subsystem is not fully up, we cannot create a + * linux page table entry for this mapping. Simply bolt an + * entry in the hardware page table. + * + */ + if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, pgprot_val(prot), + mmu_io_psize, mmu_kernel_ssize)) { + printk(KERN_ERR "Failed to do bolted mapping IO " + "memory at %016lx !\n", pa); + return -ENOMEM; + } + } + + smp_wmb(); + return 0; +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + +unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, unsigned long clr, + unsigned long set) +{ + __be64 old_be, tmp; + unsigned long old; + +#ifdef CONFIG_DEBUG_VM + WARN_ON(!hash__pmd_trans_huge(*pmdp)); + assert_spin_locked(pmd_lockptr(mm, pmdp)); +#endif + + __asm__ __volatile__( + "1: ldarx %0,0,%3\n\ + and. %1,%0,%6\n\ + bne- 1b \n\ + andc %1,%0,%4 \n\ + or %1,%1,%7\n\ + stdcx. %1,0,%3 \n\ + bne- 1b" + : "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp) + : "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp), + "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set)) + : "cc" ); + + old = be64_to_cpu(old_be); + + trace_hugepage_update_pmd(addr, old, clr, set); + if (old & H_PAGE_HASHPTE) + hpte_do_hugepage_flush(mm, addr, pmdp, old); + return old; +} + +pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + pmd_t pmd; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + VM_BUG_ON(pmd_trans_huge(*pmdp)); + + pmd = *pmdp; + pmd_clear(pmdp); + /* + * Wait for all pending hash_page to finish. This is needed + * in case of subpage collapse. When we collapse normal pages + * to hugepage, we first clear the pmd, then invalidate all + * the PTE entries. The assumption here is that any low level + * page fault will see a none pmd and take the slow path that + * will wait on mmap_lock. But we could very well be in a + * hash_page with local ptep pointer value. 
Such a hash page + * can result in adding new HPTE entries for normal subpages. + * That means we could be modifying the page content as we + * copy them to a huge page. So wait for parallel hash_page + * to finish before invalidating HPTE entries. We can do this + * by sending an IPI to all the cpus and executing a dummy + * function there. + */ + serialize_against_pte_lookup(vma->vm_mm); + /* + * Now invalidate the hpte entries in the range + * covered by pmd. This make sure we take a + * fault and will find the pmd as none, which will + * result in a major fault which takes mmap_lock and + * hence wait for collapse to complete. Without this + * the __collapse_huge_page_copy can result in copying + * the old content. + */ + flush_hash_table_pmd_range(vma->vm_mm, &pmd, address); + return pmd; +} + +/* + * We want to put the pgtable in pmd and use pgtable for tracking + * the base page size hptes + */ +void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable) +{ + pgtable_t *pgtable_slot; + + assert_spin_locked(pmd_lockptr(mm, pmdp)); + /* + * we store the pgtable in the second half of PMD + */ + pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; + *pgtable_slot = pgtable; + /* + * expose the deposited pgtable to other cpus. + * before we set the hugepage PTE at pmd level + * hash fault code looks at the deposted pgtable + * to store hash index values. + */ + smp_wmb(); +} + +pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) +{ + pgtable_t pgtable; + pgtable_t *pgtable_slot; + + assert_spin_locked(pmd_lockptr(mm, pmdp)); + + pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; + pgtable = *pgtable_slot; + /* + * Once we withdraw, mark the entry NULL. + */ + *pgtable_slot = NULL; + /* + * We store HPTE information in the deposited PTE fragment. + * zero out the content on withdraw. + */ + memset(pgtable, 0, PTE_FRAG_SIZE); + return pgtable; +} + +/* + * A linux hugepage PMD was changed and the corresponding hash table entries + * neesd to be flushed. + */ +void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, unsigned long old_pmd) +{ + int ssize; + unsigned int psize; + unsigned long vsid; + unsigned long flags = 0; + + /* get the base page size,vsid and segment size */ +#ifdef CONFIG_DEBUG_VM + psize = get_slice_psize(mm, addr); + BUG_ON(psize == MMU_PAGE_16M); +#endif + if (old_pmd & H_PAGE_COMBO) + psize = MMU_PAGE_4K; + else + psize = MMU_PAGE_64K; + + if (!is_kernel_addr(addr)) { + ssize = user_segment_size(addr); + vsid = get_user_vsid(&mm->context, addr, ssize); + WARN_ON(vsid == 0); + } else { + vsid = get_kernel_vsid(addr, mmu_kernel_ssize); + ssize = mmu_kernel_ssize; + } + + if (mm_is_thread_local(mm)) + flags |= HPTE_LOCAL_UPDATE; + + return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags); +} + +pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t old_pmd; + pgtable_t pgtable; + unsigned long old; + pgtable_t *pgtable_slot; + + old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0); + old_pmd = __pmd(old); + /* + * We have pmd == none and we are holding page_table_lock. + * So we can safely go and clear the pgtable hash + * index info. + */ + pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; + pgtable = *pgtable_slot; + /* + * Let's zero out old valid and hash index details + * hash fault look at them. 
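/*
 * [Editor's sketch -- not part of the commit] The deposit/withdraw
 * helpers above stash a pgtable pointer in the second half of the PMD
 * page so the hash-fault path can find the per-subpage HPTE bookkeeping.
 * Stripped to the pointer arithmetic; the PMD geometry is assumed and
 * the barriers of the real code are reduced to comments.
 */
#include <stddef.h>

#define EX_PTRS_PER_PMD 2048	/* assumed geometry, for illustration */

typedef unsigned long ex_pmd_t;
typedef void *ex_pgtable_t;

static ex_pgtable_t *ex_deposit_slot(ex_pmd_t *pmdp)
{
	/* the stash lives PTRS_PER_PMD entries past the PMD entry itself */
	return (ex_pgtable_t *)pmdp + EX_PTRS_PER_PMD;
}

static void ex_deposit(ex_pmd_t *pmdp, ex_pgtable_t pgtable)
{
	*ex_deposit_slot(pmdp) = pgtable;
	/* the real code issues smp_wmb() so hash faults see the deposit */
}

static ex_pgtable_t ex_withdraw(ex_pmd_t *pmdp)
{
	ex_pgtable_t *slot = ex_deposit_slot(pmdp);
	ex_pgtable_t pg = *slot;

	*slot = NULL;	/* mark empty; the caller zeroes the stale HPTE info */
	return pg;
}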
+ */ + memset(pgtable, 0, PTE_FRAG_SIZE); + return old_pmd; +} + +int hash__has_transparent_hugepage(void) +{ + + if (!mmu_has_feature(MMU_FTR_16M_PAGE)) + return 0; + /* + * We support THP only if PMD_SIZE is 16MB. + */ + if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT) + return 0; + /* + * We need to make sure that we support 16MB hugepage in a segment + * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE + * of 64K. + */ + /* + * If we have 64K HPTE, we will be using that by default + */ + if (mmu_psize_defs[MMU_PAGE_64K].shift && + (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1)) + return 0; + /* + * Ok we only have 4K HPTE + */ + if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1) + return 0; + + return 1; +} +EXPORT_SYMBOL_GPL(hash__has_transparent_hugepage); + +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +#ifdef CONFIG_STRICT_KERNEL_RWX + +struct change_memory_parms { + unsigned long start, end, newpp; + unsigned int step, nr_cpus; + atomic_t master_cpu; + atomic_t cpu_counter; +}; + +// We'd rather this was on the stack but it has to be in the RMO +static struct change_memory_parms chmem_parms; + +// And therefore we need a lock to protect it from concurrent use +static DEFINE_MUTEX(chmem_lock); + +static void change_memory_range(unsigned long start, unsigned long end, + unsigned int step, unsigned long newpp) +{ + unsigned long idx; + + pr_debug("Changing page protection on range 0x%lx-0x%lx, to 0x%lx, step 0x%x\n", + start, end, newpp, step); + + for (idx = start; idx < end; idx += step) + /* Not sure if we can do much with the return value */ + mmu_hash_ops.hpte_updateboltedpp(newpp, idx, mmu_linear_psize, + mmu_kernel_ssize); +} + +static int notrace chmem_secondary_loop(struct change_memory_parms *parms) +{ + unsigned long msr, tmp, flags; + int *p; + + p = &parms->cpu_counter.counter; + + local_irq_save(flags); + hard_irq_disable(); + + asm volatile ( + // Switch to real mode and leave interrupts off + "mfmsr %[msr] ;" + "li %[tmp], %[MSR_IR_DR] ;" + "andc %[tmp], %[msr], %[tmp] ;" + "mtmsrd %[tmp] ;" + + // Tell the master we are in real mode + "1: " + "lwarx %[tmp], 0, %[p] ;" + "addic %[tmp], %[tmp], -1 ;" + "stwcx. %[tmp], 0, %[p] ;" + "bne- 1b ;" + + // Spin until the counter goes to zero + "2: ;" + "lwz %[tmp], 0(%[p]) ;" + "cmpwi %[tmp], 0 ;" + "bne- 2b ;" + + // Switch back to virtual mode + "mtmsrd %[msr] ;" + + : // outputs + [msr] "=&r" (msr), [tmp] "=&b" (tmp), "+m" (*p) + : // inputs + [p] "b" (p), [MSR_IR_DR] "i" (MSR_IR | MSR_DR) + : // clobbers + "cc", "xer" + ); + + local_irq_restore(flags); + + return 0; +} + +static int change_memory_range_fn(void *data) +{ + struct change_memory_parms *parms = data; + + // First CPU goes through, all others wait. 
+ if (atomic_xchg(&parms->master_cpu, 1) == 1) + return chmem_secondary_loop(parms); + + // Wait for all but one CPU (this one) to call-in + while (atomic_read(&parms->cpu_counter) > 1) + barrier(); + + change_memory_range(parms->start, parms->end, parms->step, parms->newpp); + + mb(); + + // Signal the other CPUs that we're done + atomic_dec(&parms->cpu_counter); + + return 0; +} + +static bool hash__change_memory_range(unsigned long start, unsigned long end, + unsigned long newpp) +{ + unsigned int step, shift; + + shift = mmu_psize_defs[mmu_linear_psize].shift; + step = 1 << shift; + + start = ALIGN_DOWN(start, step); + end = ALIGN(end, step); // aligns up + + if (start >= end) + return false; + + if (firmware_has_feature(FW_FEATURE_LPAR)) { + mutex_lock(&chmem_lock); + + chmem_parms.start = start; + chmem_parms.end = end; + chmem_parms.step = step; + chmem_parms.newpp = newpp; + atomic_set(&chmem_parms.master_cpu, 0); + + cpus_read_lock(); + + atomic_set(&chmem_parms.cpu_counter, num_online_cpus()); + + // Ensure state is consistent before we call the other CPUs + mb(); + + stop_machine_cpuslocked(change_memory_range_fn, &chmem_parms, + cpu_online_mask); + + cpus_read_unlock(); + mutex_unlock(&chmem_lock); + } else + change_memory_range(start, end, step, newpp); + + return true; +} + +void hash__mark_rodata_ro(void) +{ + unsigned long start, end, pp; + + start = (unsigned long)_stext; + end = (unsigned long)__end_rodata; + + pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL_ROX), HPTE_USE_KERNEL_KEY); + + WARN_ON(!hash__change_memory_range(start, end, pp)); +} + +void hash__mark_initmem_nx(void) +{ + unsigned long start, end, pp; + + start = (unsigned long)__init_begin; + end = (unsigned long)__init_end; + + pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL), HPTE_USE_KERNEL_KEY); + + WARN_ON(!hash__change_memory_range(start, end, pp)); +} +#endif diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/book3s64/hash_tlb.c index 36e44b4260eb..21fcad97ae80 100644 --- a/arch/powerpc/mm/tlb_hash64.c +++ b/arch/powerpc/mm/book3s64/hash_tlb.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * This file contains the routines for flushing entries from the * TLB and MMU hash table. @@ -14,22 +15,19 @@ * * Dave Engebretsen <engebret@us.ibm.com> * Rework for PPC64 port. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/kernel.h> #include <linux/mm.h> -#include <linux/init.h> #include <linux/percpu.h> #include <linux/hardirq.h> -#include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm/tlb.h> #include <asm/bug.h> +#include <asm/pte-walk.h> + + +#include <trace/events/thp.h> DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch); @@ -48,11 +46,12 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr, unsigned int psize; int ssize; real_pte_t rpte; - int i; + int i, offset; i = batch->index; - /* Get page size (maybe move back to caller). + /* + * Get page size (maybe move back to caller). 
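/*
 * [Editor's sketch -- not part of the commit] change_memory_range_fn()
 * above elects one CPU to do the work while all the others park (in real
 * mode) on a shared counter. The same rendezvous expressed with C11
 * atomics instead of inline asm; the real-mode MSR dance is of course
 * not representable here, and ex_do_work() is a hypothetical payload.
 * cpu_counter must start at the number of participating CPUs.
 */
#include <stdatomic.h>

struct ex_parms {
	atomic_int master_cpu;	/* 0 = not claimed yet */
	atomic_int cpu_counter;	/* initialised to the CPU count */
};

void ex_do_work(struct ex_parms *p);	/* hypothetical payload */

static int ex_rendezvous(struct ex_parms *p)
{
	/* exactly one caller sees 0 here and becomes the master */
	if (atomic_exchange(&p->master_cpu, 1) == 1) {
		/* secondary: check in, then spin until the master is done */
		atomic_fetch_sub(&p->cpu_counter, 1);
		while (atomic_load(&p->cpu_counter) > 0)
			;	/* the kernel spins here in real mode */
		return 0;
	}

	/* master: wait for all but this CPU to check in */
	while (atomic_load(&p->cpu_counter) > 1)
		;
	ex_do_work(p);
	/* final decrement releases the spinning secondaries */
	atomic_fetch_sub(&p->cpu_counter, 1);
	return 0;
}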
* * NOTE: when using special 64K mappings in 4K environment like * for SPEs, we obtain the page size from the slice, which thus @@ -64,40 +63,45 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr, psize = get_slice_psize(mm, addr); /* Mask the address for the correct page size */ addr &= ~((1UL << mmu_psize_defs[psize].shift) - 1); + if (unlikely(psize == MMU_PAGE_16G)) + offset = PTRS_PER_PUD; + else + offset = PTRS_PER_PMD; #else BUG(); psize = pte_pagesize_index(mm, addr, pte); /* shutup gcc */ #endif } else { psize = pte_pagesize_index(mm, addr, pte); - /* Mask the address for the standard page size. If we + /* + * Mask the address for the standard page size. If we * have a 64k page kernel, but the hardware does not * support 64k pages, this might be different from the - * hardware page size encoded in the slice table. */ + * hardware page size encoded in the slice table. + */ addr &= PAGE_MASK; + offset = PTRS_PER_PTE; } /* Build full vaddr */ if (!is_kernel_addr(addr)) { ssize = user_segment_size(addr); - vsid = get_vsid(mm->context.id, addr, ssize); + vsid = get_user_vsid(&mm->context, addr, ssize); } else { vsid = get_kernel_vsid(addr, mmu_kernel_ssize); ssize = mmu_kernel_ssize; } WARN_ON(vsid == 0); vpn = hpt_vpn(addr, vsid, ssize); - rpte = __real_pte(__pte(pte), ptep); + rpte = __real_pte(__pte(pte), ptep, offset); /* * Check if we have an active batch on this CPU. If not, just - * flush now and return. For now, we don global invalidates - * in that case, might be worth testing the mm cpu mask though - * and decide to use local invalidates instead... + * flush now and return. */ if (!batch->active) { - flush_hash_page(vpn, rpte, psize, ssize, 0); + flush_hash_page(vpn, rpte, psize, ssize, mm_is_thread_local(mm)); put_cpu_var(ppc64_tlb_batch); return; } @@ -139,13 +143,10 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr, */ void __flush_tlb_pending(struct ppc64_tlb_batch *batch) { - const struct cpumask *tmp; - int i, local = 0; + int i, local; i = batch->index; - tmp = cpumask_of(smp_processor_id()); - if (cpumask_equal(mm_cpumask(batch->mm), tmp)) - local = 1; + local = mm_is_thread_local(batch->mm); if (i == 1) flush_hash_page(batch->vpn[0], batch->pte[0], batch->psize, batch->ssize, local); @@ -154,11 +155,12 @@ void __flush_tlb_pending(struct ppc64_tlb_batch *batch) batch->index = 0; } -void tlb_flush(struct mmu_gather *tlb) +void hash__tlb_flush(struct mmu_gather *tlb) { struct ppc64_tlb_batch *tlbbatch = &get_cpu_var(ppc64_tlb_batch); - /* If there's a TLB batch pending, then we must flush it because the + /* + * If there's a TLB batch pending, then we must flush it because the * pages are going to be freed and we really don't want to have a CPU * access a freed page because it has a stale TLB */ @@ -173,7 +175,6 @@ void tlb_flush(struct mmu_gather *tlb) * from the hash table (and the TLB). But keeps * the linux PTEs intact. * - * @mm : mm_struct of the target address space (generally init_mm) * @start : starting address * @end : ending address (not included in the flush) * @@ -186,18 +187,17 @@ void tlb_flush(struct mmu_gather *tlb) * Because of that usage pattern, it is implemented for small size rather * than speed. 
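/*
 * [Editor's sketch -- not part of the commit] hpte_need_flush() above
 * queues (vpn, pte) pairs in a per-CPU batch and only issues the
 * hash/TLB flush when the batch fills or __flush_tlb_pending() drains
 * it; with no batch active it flushes immediately. The
 * accumulate-then-flush shape, with hypothetical flush hooks and an
 * assumed capacity standing in for PPC64_TLB_BATCH_NR.
 */
#include <stddef.h>

#define EX_BATCH_MAX 192	/* assumed batch capacity */

struct ex_tlb_batch {
	int active;
	size_t index;
	unsigned long vpn[EX_BATCH_MAX];
	unsigned long pte[EX_BATCH_MAX];
};

void ex_flush_one(unsigned long vpn, unsigned long pte);	/* hypothetical */
void ex_flush_many(struct ex_tlb_batch *b);			/* hypothetical */

static void ex_need_flush(struct ex_tlb_batch *b,
			  unsigned long vpn, unsigned long pte)
{
	if (!b->active) {
		ex_flush_one(vpn, pte);	/* no batch open: flush right away */
		return;
	}
	b->vpn[b->index] = vpn;
	b->pte[b->index] = pte;
	if (++b->index >= EX_BATCH_MAX) {
		ex_flush_many(b);	/* batch full: drain it now */
		b->index = 0;
	}
}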
*/ -void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, - unsigned long end) +void __flush_hash_table_range(unsigned long start, unsigned long end) { int hugepage_shift; unsigned long flags; - start = _ALIGN_DOWN(start, PAGE_SIZE); - end = _ALIGN_UP(end, PAGE_SIZE); + start = ALIGN_DOWN(start, PAGE_SIZE); + end = ALIGN(end, PAGE_SIZE); - BUG_ON(!mm->pgd); - /* Note: Normally, we should only ever use a batch within a + /* + * Note: Normally, we should only ever use a batch within a * PTE locked section. This violates the rule, but will work * since we don't actually modify the PTEs, we just flush the * hash while leaving the PTEs intact (including their reference @@ -207,32 +207,29 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, local_irq_save(flags); arch_enter_lazy_mmu_mode(); for (; start < end; start += PAGE_SIZE) { - pte_t *ptep = find_linux_pte_or_hugepte(mm->pgd, start, - &hugepage_shift); + pte_t *ptep = find_init_mm_pte(start, &hugepage_shift); unsigned long pte; if (ptep == NULL) continue; pte = pte_val(*ptep); - if (!(pte & _PAGE_HASHPTE)) + if (!(pte & H_PAGE_HASHPTE)) continue; - if (unlikely(hugepage_shift && pmd_trans_huge(*(pmd_t *)pte))) - hpte_do_hugepage_flush(mm, start, (pmd_t *)pte); - else - hpte_need_flush(mm, start, ptep, pte, 0); + hpte_need_flush(&init_mm, start, ptep, pte, hugepage_shift); } arch_leave_lazy_mmu_mode(); local_irq_restore(flags); } -void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr) +void flush_hash_table_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr) { pte_t *pte; pte_t *start_pte; unsigned long flags; - addr = _ALIGN_DOWN(addr, PMD_SIZE); - /* Note: Normally, we should only ever use a batch within a + addr = ALIGN_DOWN(addr, PMD_SIZE); + /* + * Note: Normally, we should only ever use a batch within a * PTE locked section. 
This violates the rule, but will work * since we don't actually modify the PTEs, we just flush the * hash while leaving the PTEs intact (including their reference @@ -242,12 +239,16 @@ void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr) local_irq_save(flags); arch_enter_lazy_mmu_mode(); start_pte = pte_offset_map(pmd, addr); + if (!start_pte) + goto out; for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) { unsigned long pteval = pte_val(*pte); - if (pteval & _PAGE_HASHPTE) + if (pteval & H_PAGE_HASHPTE) hpte_need_flush(mm, addr, pte, pteval, 0); addr += PAGE_SIZE; } + pte_unmap(start_pte); +out: arch_leave_lazy_mmu_mode(); local_irq_restore(flags); } diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c new file mode 100644 index 000000000000..9dc5889d6ecb --- /dev/null +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -0,0 +1,2486 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * PowerPC64 port by Mike Corrigan and Dave Engebretsen + * {mikejc|engebret}@us.ibm.com + * + * Copyright (c) 2000 Mike Corrigan <mikejc@us.ibm.com> + * + * SMP scalability work: + * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM + * + * Module name: htab.c + * + * Description: + * PowerPC Hashed Page Table functions + */ + +#undef DEBUG +#undef DEBUG_LOW + +#define pr_fmt(fmt) "hash-mmu: " fmt +#include <linux/spinlock.h> +#include <linux/errno.h> +#include <linux/sched/mm.h> +#include <linux/proc_fs.h> +#include <linux/stat.h> +#include <linux/sysctl.h> +#include <linux/export.h> +#include <linux/ctype.h> +#include <linux/cache.h> +#include <linux/init.h> +#include <linux/signal.h> +#include <linux/memblock.h> +#include <linux/context_tracking.h> +#include <linux/libfdt.h> +#include <linux/pkeys.h> +#include <linux/hugetlb.h> +#include <linux/cpu.h> +#include <linux/pgtable.h> +#include <linux/debugfs.h> +#include <linux/random.h> +#include <linux/elf-randomize.h> +#include <linux/of_fdt.h> +#include <linux/kfence.h> + +#include <asm/interrupt.h> +#include <asm/processor.h> +#include <asm/mmu.h> +#include <asm/mmu_context.h> +#include <asm/page.h> +#include <asm/pgalloc.h> +#include <asm/types.h> +#include <linux/uaccess.h> +#include <asm/machdep.h> +#include <asm/io.h> +#include <asm/eeh.h> +#include <asm/tlb.h> +#include <asm/cacheflush.h> +#include <asm/cputable.h> +#include <asm/sections.h> +#include <asm/spu.h> +#include <asm/udbg.h> +#include <asm/text-patching.h> +#include <asm/fadump.h> +#include <asm/firmware.h> +#include <asm/tm.h> +#include <asm/trace.h> +#include <asm/ps3.h> +#include <asm/pte-walk.h> +#include <asm/asm-prototypes.h> +#include <asm/ultravisor.h> +#include <asm/kfence.h> + +#include <mm/mmu_decl.h> + +#include "internal.h" + + +#ifdef DEBUG +#define DBG(fmt...) udbg_printf(fmt) +#else +#define DBG(fmt...) +#endif + +#ifdef DEBUG_LOW +#define DBG_LOW(fmt...) udbg_printf(fmt) +#else +#define DBG_LOW(fmt...) +#endif + +#define KB (1024) +#define MB (1024*KB) +#define GB (1024L*MB) + +/* + * Note: pte --> Linux PTE + * HPTE --> PowerPC Hashed Page Table Entry + * + * Execution context: + * htab_initialize is called with the MMU off (of course), but + * the kernel has been copied down to zero so it can directly + * reference global data. At this point it is very difficult + * to print debug info. 
+ * + */ + +static unsigned long _SDR1; + +u8 hpte_page_sizes[1 << LP_BITS]; +EXPORT_SYMBOL_GPL(hpte_page_sizes); + +struct hash_pte *htab_address; +unsigned long htab_size_bytes; +unsigned long htab_hash_mask; +EXPORT_SYMBOL_GPL(htab_hash_mask); +int mmu_linear_psize = MMU_PAGE_4K; +EXPORT_SYMBOL_GPL(mmu_linear_psize); +int mmu_virtual_psize = MMU_PAGE_4K; +int mmu_vmalloc_psize = MMU_PAGE_4K; +EXPORT_SYMBOL_GPL(mmu_vmalloc_psize); +int mmu_io_psize = MMU_PAGE_4K; +int mmu_kernel_ssize = MMU_SEGSIZE_256M; +EXPORT_SYMBOL_GPL(mmu_kernel_ssize); +int mmu_highuser_ssize = MMU_SEGSIZE_256M; +u16 mmu_slb_size = 64; +EXPORT_SYMBOL_GPL(mmu_slb_size); +#ifdef CONFIG_PPC_64K_PAGES +int mmu_ci_restrictions; +#endif +struct mmu_hash_ops mmu_hash_ops __ro_after_init; +EXPORT_SYMBOL(mmu_hash_ops); + +/* + * These are definitions of page sizes arrays to be used when none + * is provided by the firmware. + */ + +/* + * Fallback (4k pages only) + */ +static struct mmu_psize_def mmu_psize_defaults[] = { + [MMU_PAGE_4K] = { + .shift = 12, + .sllp = 0, + .penc = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1}, + .avpnm = 0, + .tlbiel = 0, + }, +}; + +/* + * POWER4, GPUL, POWER5 + * + * Support for 16Mb large pages + */ +static struct mmu_psize_def mmu_psize_defaults_gp[] = { + [MMU_PAGE_4K] = { + .shift = 12, + .sllp = 0, + .penc = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1}, + .avpnm = 0, + .tlbiel = 1, + }, + [MMU_PAGE_16M] = { + .shift = 24, + .sllp = SLB_VSID_L, + .penc = {[0 ... MMU_PAGE_16M - 1] = -1, [MMU_PAGE_16M] = 0, + [MMU_PAGE_16M + 1 ... MMU_PAGE_COUNT - 1] = -1 }, + .avpnm = 0x1UL, + .tlbiel = 0, + }, +}; + +static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is) +{ + unsigned long rb; + + rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53)); + + asm volatile("tlbiel %0" : : "r" (rb)); +} + +/* + * tlbiel instruction for hash, set invalidation + * i.e., r=1 and is=01 or is=10 or is=11 + */ +static __always_inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is, + unsigned int pid, + unsigned int ric, unsigned int prs) +{ + unsigned long rb; + unsigned long rs; + unsigned int r = 0; /* hash format */ + + rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53)); + rs = ((unsigned long)pid << PPC_BITLSHIFT(31)); + + asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4) + : : "r"(rb), "r"(rs), "i"(ric), "i"(prs), "i"(r) + : "memory"); +} + + +static void tlbiel_all_isa206(unsigned int num_sets, unsigned int is) +{ + unsigned int set; + + asm volatile("ptesync": : :"memory"); + + for (set = 0; set < num_sets; set++) + tlbiel_hash_set_isa206(set, is); + + ppc_after_tlbiel_barrier(); +} + +static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) +{ + unsigned int set; + + asm volatile("ptesync": : :"memory"); + + /* + * Flush the partition table cache if this is HV mode. + */ + if (early_cpu_has_feature(CPU_FTR_HVMODE)) + tlbiel_hash_set_isa300(0, is, 0, 2, 0); + + /* + * Now invalidate the process table cache. UPRT=0 HPT modes (what + * current hardware implements) do not use the process table, but + * add the flushes anyway. + * + * From ISA v3.0B p. 1078: + * The following forms are invalid. + * * PRS=1, R=0, and RIC!=2 (The only process-scoped + * HPT caching is of the Process Table.) + */ + tlbiel_hash_set_isa300(0, is, 0, 2, 1); + + /* + * Then flush the sets of the TLB proper. Hash mode uses + * partition scoped TLB translations, which may be flushed + * in !HV mode. 
+ */ + for (set = 0; set < num_sets; set++) + tlbiel_hash_set_isa300(set, is, 0, 0, 0); + + ppc_after_tlbiel_barrier(); + + asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT "; isync" : : :"memory"); +} + +void hash__tlbiel_all(unsigned int action) +{ + unsigned int is; + + switch (action) { + case TLB_INVAL_SCOPE_GLOBAL: + is = 3; + break; + case TLB_INVAL_SCOPE_LPID: + is = 2; + break; + default: + BUG(); + } + + if (early_cpu_has_feature(CPU_FTR_ARCH_300)) + tlbiel_all_isa300(POWER9_TLB_SETS_HASH, is); + else if (early_cpu_has_feature(CPU_FTR_ARCH_207S)) + tlbiel_all_isa206(POWER8_TLB_SETS, is); + else if (early_cpu_has_feature(CPU_FTR_ARCH_206)) + tlbiel_all_isa206(POWER7_TLB_SETS, is); + else + WARN(1, "%s called on pre-POWER7 CPU\n", __func__); +} + +#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE) +static void kernel_map_linear_page(unsigned long vaddr, unsigned long idx, + u8 *slots, raw_spinlock_t *lock) +{ + unsigned long hash; + unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize); + unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); + unsigned long mode = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL), HPTE_USE_KERNEL_KEY); + long ret; + + hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); + + /* Don't create HPTE entries for bad address */ + if (!vsid) + return; + + if (slots[idx] & 0x80) + return; + + ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode, + HPTE_V_BOLTED, + mmu_linear_psize, mmu_kernel_ssize); + + BUG_ON (ret < 0); + raw_spin_lock(lock); + BUG_ON(slots[idx] & 0x80); + slots[idx] = ret | 0x80; + raw_spin_unlock(lock); +} + +static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long idx, + u8 *slots, raw_spinlock_t *lock) +{ + unsigned long hash, hslot, slot; + unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize); + unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); + + hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); + raw_spin_lock(lock); + if (!(slots[idx] & 0x80)) { + raw_spin_unlock(lock); + return; + } + hslot = slots[idx] & 0x7f; + slots[idx] = 0; + raw_spin_unlock(lock); + if (hslot & _PTEIDX_SECONDARY) + hash = ~hash; + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hslot & _PTEIDX_GROUP_IX; + mmu_hash_ops.hpte_invalidate(slot, vpn, mmu_linear_psize, + mmu_linear_psize, + mmu_kernel_ssize, 0); +} +#endif + +static inline bool hash_supports_debug_pagealloc(void) +{ + unsigned long max_hash_count = ppc64_rma_size / 4; + unsigned long linear_map_count = memblock_end_of_DRAM() >> PAGE_SHIFT; + + if (!debug_pagealloc_enabled() || linear_map_count > max_hash_count) + return false; + return true; +} + +#ifdef CONFIG_DEBUG_PAGEALLOC +static u8 *linear_map_hash_slots; +static unsigned long linear_map_hash_count; +static DEFINE_RAW_SPINLOCK(linear_map_hash_lock); +static __init void hash_debug_pagealloc_alloc_slots(void) +{ + if (!hash_supports_debug_pagealloc()) + return; + + linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT; + linear_map_hash_slots = memblock_alloc_try_nid( + linear_map_hash_count, 1, MEMBLOCK_LOW_LIMIT, + ppc64_rma_size, NUMA_NO_NODE); + if (!linear_map_hash_slots) + panic("%s: Failed to allocate %lu bytes max_addr=%pa\n", + __func__, linear_map_hash_count, &ppc64_rma_size); +} + +static inline void hash_debug_pagealloc_add_slot(phys_addr_t paddr, + int slot) +{ + if (!debug_pagealloc_enabled() || !linear_map_hash_count) + return; + if ((paddr >> PAGE_SHIFT) < linear_map_hash_count) + linear_map_hash_slots[paddr >> PAGE_SHIFT] = slot | 0x80; +} + +static int 
hash_debug_pagealloc_map_pages(struct page *page, int numpages, + int enable) +{ + unsigned long flags, vaddr, lmi; + int i; + + if (!debug_pagealloc_enabled() || !linear_map_hash_count) + return 0; + + local_irq_save(flags); + for (i = 0; i < numpages; i++, page++) { + vaddr = (unsigned long)page_address(page); + lmi = __pa(vaddr) >> PAGE_SHIFT; + if (lmi >= linear_map_hash_count) + continue; + if (enable) + kernel_map_linear_page(vaddr, lmi, + linear_map_hash_slots, &linear_map_hash_lock); + else + kernel_unmap_linear_page(vaddr, lmi, + linear_map_hash_slots, &linear_map_hash_lock); + } + local_irq_restore(flags); + return 0; +} + +#else /* CONFIG_DEBUG_PAGEALLOC */ +static inline void hash_debug_pagealloc_alloc_slots(void) {} +static inline void hash_debug_pagealloc_add_slot(phys_addr_t paddr, int slot) {} +static int __maybe_unused +hash_debug_pagealloc_map_pages(struct page *page, int numpages, int enable) +{ + return 0; +} +#endif /* CONFIG_DEBUG_PAGEALLOC */ + +#ifdef CONFIG_KFENCE +static u8 *linear_map_kf_hash_slots; +static unsigned long linear_map_kf_hash_count; +static DEFINE_RAW_SPINLOCK(linear_map_kf_hash_lock); + +static phys_addr_t kfence_pool; + +static __init void hash_kfence_alloc_pool(void) +{ + if (!kfence_early_init_enabled()) + goto err; + + /* allocate linear map for kfence within RMA region */ + linear_map_kf_hash_count = KFENCE_POOL_SIZE >> PAGE_SHIFT; + linear_map_kf_hash_slots = memblock_alloc_try_nid( + linear_map_kf_hash_count, 1, + MEMBLOCK_LOW_LIMIT, ppc64_rma_size, + NUMA_NO_NODE); + if (!linear_map_kf_hash_slots) { + pr_err("%s: memblock for linear map (%lu) failed\n", __func__, + linear_map_kf_hash_count); + goto err; + } + + /* allocate kfence pool early */ + kfence_pool = memblock_phys_alloc_range(KFENCE_POOL_SIZE, PAGE_SIZE, + MEMBLOCK_LOW_LIMIT, MEMBLOCK_ALLOC_ANYWHERE); + if (!kfence_pool) { + pr_err("%s: memblock for kfence pool (%lu) failed\n", __func__, + KFENCE_POOL_SIZE); + memblock_free(linear_map_kf_hash_slots, + linear_map_kf_hash_count); + linear_map_kf_hash_count = 0; + goto err; + } + memblock_mark_nomap(kfence_pool, KFENCE_POOL_SIZE); + + return; +err: + pr_info("Disabling kfence\n"); + disable_kfence(); +} + +static __init void hash_kfence_map_pool(void) +{ + unsigned long kfence_pool_start, kfence_pool_end; + unsigned long prot = pgprot_val(PAGE_KERNEL); + unsigned int pshift = mmu_psize_defs[mmu_linear_psize].shift; + + if (!kfence_pool) + return; + + kfence_pool_start = (unsigned long) __va(kfence_pool); + kfence_pool_end = kfence_pool_start + KFENCE_POOL_SIZE; + __kfence_pool = (char *) kfence_pool_start; + BUG_ON(htab_bolt_mapping(kfence_pool_start, kfence_pool_end, + kfence_pool, prot, mmu_linear_psize, + mmu_kernel_ssize)); + update_page_count(mmu_linear_psize, KFENCE_POOL_SIZE >> pshift); + memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE); +} + +static inline void hash_kfence_add_slot(phys_addr_t paddr, int slot) +{ + unsigned long vaddr = (unsigned long) __va(paddr); + unsigned long lmi = (vaddr - (unsigned long)__kfence_pool) + >> PAGE_SHIFT; + + if (!kfence_pool) + return; + BUG_ON(!is_kfence_address((void *)vaddr)); + BUG_ON(lmi >= linear_map_kf_hash_count); + linear_map_kf_hash_slots[lmi] = slot | 0x80; +} + +static int hash_kfence_map_pages(struct page *page, int numpages, int enable) +{ + unsigned long flags, vaddr, lmi; + int i; + + WARN_ON_ONCE(!linear_map_kf_hash_count); + local_irq_save(flags); + for (i = 0; i < numpages; i++, page++) { + vaddr = (unsigned long)page_address(page); + lmi = (vaddr - (unsigned 
long)__kfence_pool) >> PAGE_SHIFT; + + /* Ideally this should never happen */ + if (lmi >= linear_map_kf_hash_count) { + WARN_ON_ONCE(1); + continue; + } + + if (enable) + kernel_map_linear_page(vaddr, lmi, + linear_map_kf_hash_slots, + &linear_map_kf_hash_lock); + else + kernel_unmap_linear_page(vaddr, lmi, + linear_map_kf_hash_slots, + &linear_map_kf_hash_lock); + } + local_irq_restore(flags); + return 0; +} +#else +static inline void hash_kfence_alloc_pool(void) {} +static inline void hash_kfence_map_pool(void) {} +static inline void hash_kfence_add_slot(phys_addr_t paddr, int slot) {} +static int __maybe_unused +hash_kfence_map_pages(struct page *page, int numpages, int enable) +{ + return 0; +} +#endif + +#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE) +int hash__kernel_map_pages(struct page *page, int numpages, int enable) +{ + void *vaddr = page_address(page); + + if (is_kfence_address(vaddr)) + return hash_kfence_map_pages(page, numpages, enable); + else + return hash_debug_pagealloc_map_pages(page, numpages, enable); +} + +static void hash_linear_map_add_slot(phys_addr_t paddr, int slot) +{ + if (is_kfence_address(__va(paddr))) + hash_kfence_add_slot(paddr, slot); + else + hash_debug_pagealloc_add_slot(paddr, slot); +} +#else +static void hash_linear_map_add_slot(phys_addr_t paddr, int slot) {} +#endif + +/* + * 'R' and 'C' update notes: + * - Under pHyp or KVM, the updatepp path will not set C, thus it *will* + * create writeable HPTEs without C set, because the hcall H_PROTECT + * that we use in that case will not update C + * - The above is however not a problem, because we also don't do that + * fancy "no flush" variant of eviction and we use H_REMOVE which will + * do the right thing and thus we don't have the race I described earlier + * + * - Under bare metal, we do have the race, so we need R and C set + * - We make sure R is always set and never lost + * - C is _PAGE_DIRTY, and *should* always be set for a writeable mapping + */ +unsigned long htab_convert_pte_flags(unsigned long pteflags, unsigned long flags) +{ + unsigned long rflags = 0; + + /* _PAGE_EXEC -> NOEXEC */ + if ((pteflags & _PAGE_EXEC) == 0) + rflags |= HPTE_R_N; + /* + * PPP bits: + * Linux uses slb key 0 for kernel and 1 for user. + * kernel RW areas are mapped with PPP=0b000 + * User area is mapped with PPP=0b010 for read/write + * or PPP=0b011 for read-only (including writeable but clean pages). + */ + if (pteflags & _PAGE_PRIVILEGED) { + /* + * Kernel read only mapped with ppp bits 0b110 + */ + if (!(pteflags & _PAGE_WRITE)) { + if (mmu_has_feature(MMU_FTR_KERNEL_RO)) + rflags |= (HPTE_R_PP0 | 0x2); + else + rflags |= 0x3; + } + VM_WARN_ONCE(!(pteflags & _PAGE_RWX), "no-access mapping request"); + } else { + if (pteflags & _PAGE_RWX) + rflags |= 0x2; + /* + * We should never hit this in normal fault handling because + * a permission check (check_pte_access()) will bubble this + * to higher level linux handler even for PAGE_NONE. + */ + VM_WARN_ONCE(!(pteflags & _PAGE_RWX), "no-access mapping request"); + if (!((pteflags & _PAGE_WRITE) && (pteflags & _PAGE_DIRTY))) + rflags |= 0x1; + } + /* + * We can't allow hardware to update hpte bits. 
Hence always + * set 'R' bit and set 'C' if it is a write fault + */ + rflags |= HPTE_R_R; + + if (pteflags & _PAGE_DIRTY) + rflags |= HPTE_R_C; + /* + * Add in WIG bits + */ + + if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_TOLERANT) + rflags |= HPTE_R_I; + else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_NON_IDEMPOTENT) + rflags |= (HPTE_R_I | HPTE_R_G); + else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_SAO) + rflags |= (HPTE_R_W | HPTE_R_I | HPTE_R_M); + else + /* + * Add memory coherence if cache inhibited is not set + */ + rflags |= HPTE_R_M; + + rflags |= pte_to_hpte_pkey_bits(pteflags, flags); + return rflags; +} + +int htab_bolt_mapping(unsigned long vstart, unsigned long vend, + unsigned long pstart, unsigned long prot, + int psize, int ssize) +{ + unsigned long vaddr, paddr; + unsigned int step, shift; + int ret = 0; + + shift = mmu_psize_defs[psize].shift; + step = 1 << shift; + + prot = htab_convert_pte_flags(prot, HPTE_USE_KERNEL_KEY); + + DBG("htab_bolt_mapping(%lx..%lx -> %lx (%lx,%d,%d)\n", + vstart, vend, pstart, prot, psize, ssize); + + /* Carefully map only the possible range */ + vaddr = ALIGN(vstart, step); + paddr = ALIGN(pstart, step); + vend = ALIGN_DOWN(vend, step); + + for (; vaddr < vend; vaddr += step, paddr += step) { + unsigned long hash, hpteg; + unsigned long vsid = get_kernel_vsid(vaddr, ssize); + unsigned long vpn = hpt_vpn(vaddr, vsid, ssize); + unsigned long tprot = prot; + bool secondary_hash = false; + + /* + * If we hit a bad address return error. + */ + if (!vsid) + return -1; + /* Make kernel text executable */ + if (overlaps_kernel_text(vaddr, vaddr + step)) + tprot &= ~HPTE_R_N; + + /* + * If relocatable, check if it overlaps interrupt vectors that + * are copied down to real 0. For relocatable kernel + * (e.g. kdump case) we copy interrupt vectors down to real + * address 0. Mark that region as executable. This is + * because on p8 system with relocation on exception feature + * enabled, exceptions are raised with MMU (IR=DR=1) ON. Hence + * in order to execute the interrupt handlers in virtual + * mode the vector region need to be marked as executable. + */ + if ((PHYSICAL_START > MEMORY_START) && + overlaps_interrupt_vector_text(vaddr, vaddr + step)) + tprot &= ~HPTE_R_N; + + hash = hpt_hash(vpn, shift, ssize); + hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); + + BUG_ON(!mmu_hash_ops.hpte_insert); +repeat: + ret = mmu_hash_ops.hpte_insert(hpteg, vpn, paddr, tprot, + HPTE_V_BOLTED, psize, psize, + ssize); + if (ret == -1) { + /* + * Try to keep bolted entries in primary. + * Remove non bolted entries and try insert again + */ + ret = mmu_hash_ops.hpte_remove(hpteg); + if (ret != -1) + ret = mmu_hash_ops.hpte_insert(hpteg, vpn, paddr, tprot, + HPTE_V_BOLTED, psize, psize, + ssize); + if (ret == -1 && !secondary_hash) { + secondary_hash = true; + hpteg = ((~hash & htab_hash_mask) * HPTES_PER_GROUP); + goto repeat; + } + } + + if (ret < 0) + break; + + cond_resched(); + /* add slot info in debug_pagealloc / kfence linear map */ + hash_linear_map_add_slot(paddr, ret); + } + return ret < 0 ? 
ret : 0; +} + +int htab_remove_mapping(unsigned long vstart, unsigned long vend, + int psize, int ssize) +{ + unsigned long vaddr, time_limit; + unsigned int step, shift; + int rc; + int ret = 0; + + shift = mmu_psize_defs[psize].shift; + step = 1 << shift; + + if (!mmu_hash_ops.hpte_removebolted) + return -ENODEV; + + /* Unmap the full range specificied */ + vaddr = ALIGN_DOWN(vstart, step); + time_limit = jiffies + HZ; + + for (;vaddr < vend; vaddr += step) { + rc = mmu_hash_ops.hpte_removebolted(vaddr, psize, ssize); + + /* + * For large number of mappings introduce a cond_resched() + * to prevent softlockup warnings. + */ + if (time_after(jiffies, time_limit)) { + cond_resched(); + time_limit = jiffies + HZ; + } + if (rc == -ENOENT) { + ret = -ENOENT; + continue; + } + if (rc < 0) + return rc; + } + + return ret; +} + +static bool disable_1tb_segments __ro_after_init; + +static int __init parse_disable_1tb_segments(char *p) +{ + disable_1tb_segments = true; + return 0; +} +early_param("disable_1tb_segments", parse_disable_1tb_segments); + +bool stress_hpt_enabled __initdata; + +static int __init parse_stress_hpt(char *p) +{ + stress_hpt_enabled = true; + return 0; +} +early_param("stress_hpt", parse_stress_hpt); + +__ro_after_init DEFINE_STATIC_KEY_FALSE(stress_hpt_key); + +/* + * per-CPU array allocated if we enable stress_hpt. + */ +#define STRESS_MAX_GROUPS 16 +struct stress_hpt_struct { + unsigned long last_group[STRESS_MAX_GROUPS]; +}; + +static inline int stress_nr_groups(void) +{ + /* + * LPAR H_REMOVE flushes TLB, so need some number > 1 of entries + * to allow practical forward progress. Bare metal returns 1, which + * seems to help uncover more bugs. + */ + if (firmware_has_feature(FW_FEATURE_LPAR)) + return STRESS_MAX_GROUPS; + else + return 1; +} + +static struct stress_hpt_struct *stress_hpt_struct; + +static int __init htab_dt_scan_seg_sizes(unsigned long node, + const char *uname, int depth, + void *data) +{ + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + const __be32 *prop; + int size = 0; + + /* We are scanning "cpu" nodes only */ + if (type == NULL || strcmp(type, "cpu") != 0) + return 0; + + prop = of_get_flat_dt_prop(node, "ibm,processor-segment-sizes", &size); + if (prop == NULL) + return 0; + for (; size >= 4; size -= 4, ++prop) { + if (be32_to_cpu(prop[0]) == 40) { + DBG("1T segment support detected\n"); + + if (disable_1tb_segments) { + DBG("1T segments disabled by command line\n"); + break; + } + + cur_cpu_spec->mmu_features |= MMU_FTR_1T_SEGMENT; + return 1; + } + } + cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B; + return 0; +} + +static int __init get_idx_from_shift(unsigned int shift) +{ + int idx = -1; + + switch (shift) { + case 0xc: + idx = MMU_PAGE_4K; + break; + case 0x10: + idx = MMU_PAGE_64K; + break; + case 0x14: + idx = MMU_PAGE_1M; + break; + case 0x18: + idx = MMU_PAGE_16M; + break; + case 0x22: + idx = MMU_PAGE_16G; + break; + } + return idx; +} + +static int __init htab_dt_scan_page_sizes(unsigned long node, + const char *uname, int depth, + void *data) +{ + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + const __be32 *prop; + int size = 0; + + /* We are scanning "cpu" nodes only */ + if (type == NULL || strcmp(type, "cpu") != 0) + return 0; + + prop = of_get_flat_dt_prop(node, "ibm,segment-page-sizes", &size); + if (!prop) + return 0; + + pr_info("Page sizes from device-tree:\n"); + size /= 4; + cur_cpu_spec->mmu_features &= ~(MMU_FTR_16M_PAGE); + while(size > 0) { + unsigned int base_shift = 
be32_to_cpu(prop[0]); + unsigned int slbenc = be32_to_cpu(prop[1]); + unsigned int lpnum = be32_to_cpu(prop[2]); + struct mmu_psize_def *def; + int idx, base_idx; + + size -= 3; prop += 3; + base_idx = get_idx_from_shift(base_shift); + if (base_idx < 0) { + /* skip the pte encoding also */ + prop += lpnum * 2; size -= lpnum * 2; + continue; + } + def = &mmu_psize_defs[base_idx]; + if (base_idx == MMU_PAGE_16M) + cur_cpu_spec->mmu_features |= MMU_FTR_16M_PAGE; + + def->shift = base_shift; + if (base_shift <= 23) + def->avpnm = 0; + else + def->avpnm = (1 << (base_shift - 23)) - 1; + def->sllp = slbenc; + /* + * We don't know for sure what's up with tlbiel, so + * for now we only set it for 4K and 64K pages + */ + if (base_idx == MMU_PAGE_4K || base_idx == MMU_PAGE_64K) + def->tlbiel = 1; + else + def->tlbiel = 0; + + while (size > 0 && lpnum) { + unsigned int shift = be32_to_cpu(prop[0]); + int penc = be32_to_cpu(prop[1]); + + prop += 2; size -= 2; + lpnum--; + + idx = get_idx_from_shift(shift); + if (idx < 0) + continue; + + if (penc == -1) + pr_err("Invalid penc for base_shift=%d " + "shift=%d\n", base_shift, shift); + + def->penc[idx] = penc; + pr_info("base_shift=%d: shift=%d, sllp=0x%04lx," + " avpnm=0x%08lx, tlbiel=%d, penc=%d\n", + base_shift, shift, def->sllp, + def->avpnm, def->tlbiel, def->penc[idx]); + } + } + + return 1; +} + +#ifdef CONFIG_HUGETLB_PAGE +/* + * Scan for 16G memory blocks that have been set aside for huge pages + * and reserve those blocks for 16G huge pages. + */ +static int __init htab_dt_scan_hugepage_blocks(unsigned long node, + const char *uname, int depth, + void *data) { + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + const __be64 *addr_prop; + const __be32 *page_count_prop; + unsigned int expected_pages; + long unsigned int phys_addr; + long unsigned int block_size; + + /* We are scanning "memory" nodes only */ + if (type == NULL || strcmp(type, "memory") != 0) + return 0; + + /* + * This property is the log base 2 of the number of virtual pages that + * will represent this memory block. + */ + page_count_prop = of_get_flat_dt_prop(node, "ibm,expected#pages", NULL); + if (page_count_prop == NULL) + return 0; + expected_pages = (1 << be32_to_cpu(page_count_prop[0])); + addr_prop = of_get_flat_dt_prop(node, "reg", NULL); + if (addr_prop == NULL) + return 0; + phys_addr = be64_to_cpu(addr_prop[0]); + block_size = be64_to_cpu(addr_prop[1]); + if (block_size != (16 * GB)) + return 0; + pr_info("Huge page(16GB) memory: " + "addr = 0x%lX size = 0x%lX pages = %d\n", + phys_addr, block_size, expected_pages); + if (phys_addr + block_size * expected_pages <= memblock_end_of_DRAM()) { + memblock_reserve(phys_addr, block_size * expected_pages); + pseries_add_gpage(phys_addr, block_size, expected_pages); + } + return 0; +} +#endif /* CONFIG_HUGETLB_PAGE */ + +static void __init mmu_psize_set_default_penc(void) +{ + int bpsize, apsize; + for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++) + for (apsize = 0; apsize < MMU_PAGE_COUNT; apsize++) + mmu_psize_defs[bpsize].penc[apsize] = -1; +} + +#ifdef CONFIG_PPC_64K_PAGES + +static bool __init might_have_hea(void) +{ + /* + * The HEA ethernet adapter requires awareness of the + * GX bus. Without that awareness we can easily assume + * we will never see an HEA ethernet device. 
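htab_dt_scan_page_sizes() above walks the flat ibm,segment-page-sizes property as a sequence of {base-shift, slb-encoding, count} triples, each followed by `count` {page-shift, penc} pairs. A standalone sketch of that walk over a plain array (the property contents are invented for illustration, and the big-endian conversion and skip-on-unknown-base handling are omitted):

#include <stdio.h>

/* Toy stand-in for the flattened property: base_shift, sllp, nr_pencs,
 * then nr_pencs {shift, penc} pairs. Values are made up. */
static const unsigned int prop[] = {
	12, 0x0,   2,   12, 0,   16, 7,	/* 4K base: 4K penc=0, 64K penc=7 */
	24, 0x100, 1,   24, 0,		/* 16M base: 16M penc=0           */
};

int main(void)
{
	int size = sizeof(prop) / sizeof(prop[0]);
	const unsigned int *p = prop;

	while (size > 0) {
		unsigned int base_shift = p[0], sllp = p[1], nr = p[2];

		p += 3;
		size -= 3;
		printf("base shift %u (sllp %#x):\n", base_shift, sllp);
		while (size > 0 && nr--) {
			printf("  actual shift %u -> penc %u\n", p[0], p[1]);
			p += 2;
			size -= 2;
		}
	}
	return 0;
}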
+ */ +#ifdef CONFIG_IBMEBUS + return !cpu_has_feature(CPU_FTR_ARCH_207S) && + firmware_has_feature(FW_FEATURE_SPLPAR); +#else + return false; +#endif +} + +#endif /* #ifdef CONFIG_PPC_64K_PAGES */ + +static void __init htab_scan_page_sizes(void) +{ + int rc; + + /* se the invalid penc to -1 */ + mmu_psize_set_default_penc(); + + /* Default to 4K pages only */ + memcpy(mmu_psize_defs, mmu_psize_defaults, + sizeof(mmu_psize_defaults)); + + /* + * Try to find the available page sizes in the device-tree + */ + rc = of_scan_flat_dt(htab_dt_scan_page_sizes, NULL); + if (rc == 0 && early_mmu_has_feature(MMU_FTR_16M_PAGE)) { + /* + * Nothing in the device-tree, but the CPU supports 16M pages, + * so let's fallback on a known size list for 16M capable CPUs. + */ + memcpy(mmu_psize_defs, mmu_psize_defaults_gp, + sizeof(mmu_psize_defaults_gp)); + } + +#ifdef CONFIG_HUGETLB_PAGE + if (!hugetlb_disabled && !early_radix_enabled() ) { + /* Reserve 16G huge page memory sections for huge pages */ + of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL); + } +#endif /* CONFIG_HUGETLB_PAGE */ +} + +/* + * Fill in the hpte_page_sizes[] array. + * We go through the mmu_psize_defs[] array looking for all the + * supported base/actual page size combinations. Each combination + * has a unique pagesize encoding (penc) value in the low bits of + * the LP field of the HPTE. For actual page sizes less than 1MB, + * some of the upper LP bits are used for RPN bits, meaning that + * we need to fill in several entries in hpte_page_sizes[]. + * + * In diagrammatic form, with r = RPN bits and z = page size bits: + * PTE LP actual page size + * rrrr rrrz >=8KB + * rrrr rrzz >=16KB + * rrrr rzzz >=32KB + * rrrr zzzz >=64KB + * ... + * + * The zzzz bits are implementation-specific but are chosen so that + * no encoding for a larger page size uses the same value in its + * low-order N bits as the encoding for the 2^(12+N) byte page size + * (if it exists). + */ +static void __init init_hpte_page_sizes(void) +{ + long int ap, bp; + long int shift, penc; + + for (bp = 0; bp < MMU_PAGE_COUNT; ++bp) { + if (!mmu_psize_defs[bp].shift) + continue; /* not a supported page size */ + for (ap = bp; ap < MMU_PAGE_COUNT; ++ap) { + penc = mmu_psize_defs[bp].penc[ap]; + if (penc == -1 || !mmu_psize_defs[ap].shift) + continue; + shift = mmu_psize_defs[ap].shift - LP_SHIFT; + if (shift <= 0) + continue; /* should never happen */ + /* + * For page sizes less than 1MB, this loop + * replicates the entry for all possible values + * of the rrrr bits. + */ + while (penc < (1 << LP_BITS)) { + hpte_page_sizes[penc] = (ap << 4) | bp; + penc += 1 << shift; + } + } + } +} + +static void __init htab_init_page_sizes(void) +{ + bool aligned = true; + init_hpte_page_sizes(); + + if (!hash_supports_debug_pagealloc() && !kfence_early_init_enabled()) { + /* + * Pick a size for the linear mapping. Currently, we only + * support 16M, 1M and 4K which is the default + */ + if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) && + (unsigned long)_stext % 0x1000000) { + if (mmu_psize_defs[MMU_PAGE_16M].shift) + pr_warn("Kernel not 16M aligned, disabling 16M linear map alignment\n"); + aligned = false; + } + + if (mmu_psize_defs[MMU_PAGE_16M].shift && aligned) + mmu_linear_psize = MMU_PAGE_16M; + else if (mmu_psize_defs[MMU_PAGE_1M].shift) + mmu_linear_psize = MMU_PAGE_1M; + } + +#ifdef CONFIG_PPC_64K_PAGES + /* + * Pick a size for the ordinary pages. Default is 4K, we support + * 64K for user mappings and vmalloc if supported by the processor. 
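The rrrr/zzzz diagram above is what init_hpte_page_sizes() implements: for an actual page size below 1MB only the low (shift - LP_SHIFT) bits of the LP field carry the encoding, so the same table entry must be replicated for every possible value of the high RPN bits. A standalone sketch with one base/actual pair (LP_BITS = 8 and LP_SHIFT = 12 follow the hash MMU headers; the penc value and array indices are made up):

#include <stdio.h>

#define LP_BITS		8	/* width of the LP index into the table      */
#define LP_SHIFT	12	/* LP field sits above the 4K offset bits    */

static unsigned char page_sizes[1 << LP_BITS];

int main(void)
{
	/* One illustrative combination: 4K base (bp), 64K actual (ap). */
	unsigned int bp_idx = 0, ap_idx = 1;		/* made-up psize indices   */
	unsigned int ap_shift = 16;			/* 64K actual page         */
	unsigned int penc = 1;				/* made-up encoding value  */
	unsigned int shift = ap_shift - LP_SHIFT;	/* 4: low 4 LP bits = penc */

	/* Replicate the entry for every value of the high "rrrr" RPN bits. */
	for (unsigned int lp = penc; lp < (1u << LP_BITS); lp += 1u << shift)
		page_sizes[lp] = (ap_idx << 4) | bp_idx;

	/* 16 entries end up set: 0x01, 0x11, 0x21, ... 0xf1 */
	for (unsigned int lp = 0; lp < (1u << LP_BITS); lp++)
		if (page_sizes[lp])
			printf("LP %#04x -> actual=%u base=%u\n",
			       lp, page_sizes[lp] >> 4, page_sizes[lp] & 0xf);
	return 0;
}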
+ * We only use 64k for ioremap if the processor + * (and firmware) support cache-inhibited large pages. + * If not, we use 4k and set mmu_ci_restrictions so that + * hash_page knows to switch processes that use cache-inhibited + * mappings to 4k pages. + */ + if (mmu_psize_defs[MMU_PAGE_64K].shift) { + mmu_virtual_psize = MMU_PAGE_64K; + mmu_vmalloc_psize = MMU_PAGE_64K; + if (mmu_linear_psize == MMU_PAGE_4K) + mmu_linear_psize = MMU_PAGE_64K; + if (mmu_has_feature(MMU_FTR_CI_LARGE_PAGE)) { + /* + * When running on pSeries using 64k pages for ioremap + * would stop us accessing the HEA ethernet. So if we + * have the chance of ever seeing one, stay at 4k. + */ + if (!might_have_hea()) + mmu_io_psize = MMU_PAGE_64K; + } else + mmu_ci_restrictions = 1; + } +#endif /* CONFIG_PPC_64K_PAGES */ + +#ifdef CONFIG_SPARSEMEM_VMEMMAP + /* + * We try to use 16M pages for vmemmap if that is supported + * and we have at least 1G of RAM at boot + */ + if (mmu_psize_defs[MMU_PAGE_16M].shift && + memblock_phys_mem_size() >= 0x40000000) + mmu_vmemmap_psize = MMU_PAGE_16M; + else + mmu_vmemmap_psize = mmu_virtual_psize; +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + + pr_info("Page orders: linear mapping = %d, " + "virtual = %d, io = %d" +#ifdef CONFIG_SPARSEMEM_VMEMMAP + ", vmemmap = %d" +#endif + "\n", + mmu_psize_defs[mmu_linear_psize].shift, + mmu_psize_defs[mmu_virtual_psize].shift, + mmu_psize_defs[mmu_io_psize].shift +#ifdef CONFIG_SPARSEMEM_VMEMMAP + ,mmu_psize_defs[mmu_vmemmap_psize].shift +#endif + ); +} + +static int __init htab_dt_scan_pftsize(unsigned long node, + const char *uname, int depth, + void *data) +{ + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + const __be32 *prop; + + /* We are scanning "cpu" nodes only */ + if (type == NULL || strcmp(type, "cpu") != 0) + return 0; + + prop = of_get_flat_dt_prop(node, "ibm,pft-size", NULL); + if (prop != NULL) { + /* pft_size[0] is the NUMA CEC cookie */ + ppc64_pft_size = be32_to_cpu(prop[1]); + return 1; + } + return 0; +} + +unsigned htab_shift_for_mem_size(unsigned long mem_size) +{ + unsigned memshift = __ilog2(mem_size); + unsigned pshift = mmu_psize_defs[mmu_virtual_psize].shift; + unsigned pteg_shift; + + /* round mem_size up to next power of 2 */ + if ((1UL << memshift) < mem_size) + memshift += 1; + + /* aim for 2 pages / pteg */ + pteg_shift = memshift - (pshift + 1); + + /* + * 2^11 PTEGS of 128 bytes each, ie. 2^18 bytes is the minimum htab + * size permitted by the architecture. + */ + return max(pteg_shift + 7, 18U); +} + +static unsigned long __init htab_get_table_size(void) +{ + /* + * If hash size isn't already provided by the platform, we try to + * retrieve it from the device-tree. 
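htab_shift_for_mem_size() above aims for two PTEGs per page of RAM, with a floor of 2^18 bytes. Restating the arithmetic standalone with a couple of worked values (the 4K/64K page shifts are the usual ones, and the numbers are easy to check by hand):

#include <stdio.h>

/* Same arithmetic as htab_shift_for_mem_size(), restated for illustration. */
static unsigned int htab_shift(unsigned long mem_size, unsigned int page_shift)
{
	unsigned int memshift = 0;

	while ((1UL << memshift) < mem_size)	/* round up to a power of two */
		memshift++;

	unsigned int pteg_shift = memshift - (page_shift + 1);	/* 2 PTEGs per page */
	unsigned int htab = pteg_shift + 7;			/* 128-byte PTEGs   */

	return htab > 18 ? htab : 18;		/* architectural minimum: 256KB */
}

int main(void)
{
	/* 1 GiB of RAM, 4K pages: 2^17 PTEGs -> 16 MiB hash table. */
	printf("1G/4K : htab = 2^%u bytes\n", htab_shift(1UL << 30, 12));
	/* 1 GiB of RAM, 64K pages: 2^13 PTEGs -> 1 MiB hash table. */
	printf("1G/64K: htab = 2^%u bytes\n", htab_shift(1UL << 30, 16));
	/* Small memory hits the 2^18 floor. */
	printf("16M/4K: htab = 2^%u bytes\n", htab_shift(1UL << 24, 12));
	return 0;
}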
If it's not there neither, we + * calculate it now based on the total RAM size + */ + if (ppc64_pft_size == 0) + of_scan_flat_dt(htab_dt_scan_pftsize, NULL); + if (ppc64_pft_size) + return 1UL << ppc64_pft_size; + + return 1UL << htab_shift_for_mem_size(memblock_phys_mem_size()); +} + +#ifdef CONFIG_MEMORY_HOTPLUG +static int resize_hpt_for_hotplug(unsigned long new_mem_size) +{ + unsigned target_hpt_shift; + + if (!mmu_hash_ops.resize_hpt) + return 0; + + target_hpt_shift = htab_shift_for_mem_size(new_mem_size); + + /* + * To avoid lots of HPT resizes if memory size is fluctuating + * across a boundary, we deliberately have some hysterisis + * here: we immediately increase the HPT size if the target + * shift exceeds the current shift, but we won't attempt to + * reduce unless the target shift is at least 2 below the + * current shift + */ + if (target_hpt_shift > ppc64_pft_size || + target_hpt_shift < ppc64_pft_size - 1) + return mmu_hash_ops.resize_hpt(target_hpt_shift); + + return 0; +} + +int hash__create_section_mapping(unsigned long start, unsigned long end, + int nid, pgprot_t prot) +{ + int rc; + unsigned int pshift = mmu_psize_defs[mmu_linear_psize].shift; + + if (end >= H_VMALLOC_START) { + pr_warn("Outside the supported range\n"); + return -1; + } + + resize_hpt_for_hotplug(memblock_phys_mem_size()); + + rc = htab_bolt_mapping(start, end, __pa(start), + pgprot_val(prot), mmu_linear_psize, + mmu_kernel_ssize); + + if (rc < 0) { + int rc2 = htab_remove_mapping(start, end, mmu_linear_psize, + mmu_kernel_ssize); + BUG_ON(rc2 && (rc2 != -ENOENT)); + } + update_page_count(mmu_linear_psize, (end - start) >> pshift); + return rc; +} + +int hash__remove_section_mapping(unsigned long start, unsigned long end) +{ + unsigned int pshift = mmu_psize_defs[mmu_linear_psize].shift; + + int rc = htab_remove_mapping(start, end, mmu_linear_psize, + mmu_kernel_ssize); + + if (resize_hpt_for_hotplug(memblock_phys_mem_size()) == -ENOSPC) + pr_warn("Hash collision while resizing HPT\n"); + + if (!rc) + update_page_count(mmu_linear_psize, -((end - start) >> pshift)); + return rc; +} +#endif /* CONFIG_MEMORY_HOTPLUG */ + +static void __init hash_init_partition_table(phys_addr_t hash_table, + unsigned long htab_size) +{ + mmu_partition_table_init(); + + /* + * PS field (VRMA page size) is not used for LPID 0, hence set to 0. + * For now, UPRT is 0 and we have no segment table. 
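The hysteresis in resize_hpt_for_hotplug() above keeps the HPT from bouncing when hotplugged memory hovers around a size boundary: grow as soon as the target shift exceeds the current one, but only shrink once the target is at least two below it. The decision reduces to a one-line predicate (a sketch, with illustrative shift values):

#include <stdio.h>

/* Mirror of the resize decision: grow eagerly, shrink reluctantly. */
static int should_resize(unsigned int current_shift, unsigned int target_shift)
{
	return target_shift > current_shift ||
	       target_shift < current_shift - 1;
}

int main(void)
{
	/* current HPT shift 24 (16 MiB) */
	printf("%d\n", should_resize(24, 25));	/* 1: grow                    */
	printf("%d\n", should_resize(24, 24));	/* 0: leave alone             */
	printf("%d\n", should_resize(24, 23));	/* 0: within hysteresis band  */
	printf("%d\n", should_resize(24, 22));	/* 1: shrink                  */
	return 0;
}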
+ */ + htab_size = __ilog2(htab_size) - 18; + mmu_partition_table_set_entry(0, hash_table | htab_size, 0, false); + pr_info("Partition table %p\n", partition_tb); +} + +void hpt_clear_stress(void); +static struct timer_list stress_hpt_timer; +static void stress_hpt_timer_fn(struct timer_list *timer) +{ + int next_cpu; + + hpt_clear_stress(); + if (!firmware_has_feature(FW_FEATURE_LPAR)) + tlbiel_all(); + + next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask); + if (next_cpu >= nr_cpu_ids) + next_cpu = cpumask_first(cpu_online_mask); + stress_hpt_timer.expires = jiffies + msecs_to_jiffies(10); + add_timer_on(&stress_hpt_timer, next_cpu); +} + +static void __init htab_initialize(void) +{ + unsigned long table; + unsigned long pteg_count; + unsigned long prot; + phys_addr_t base = 0, size = 0, end, limit = MEMBLOCK_ALLOC_ANYWHERE; + u64 i; + unsigned int pshift = mmu_psize_defs[mmu_linear_psize].shift; + + DBG(" -> htab_initialize()\n"); + + if (firmware_has_feature(FW_FEATURE_LPAR)) + limit = ppc64_rma_size; + + if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) { + mmu_kernel_ssize = MMU_SEGSIZE_1T; + mmu_highuser_ssize = MMU_SEGSIZE_1T; + pr_info("Using 1TB segments\n"); + } + + if (stress_slb_enabled) + static_branch_enable(&stress_slb_key); + + if (no_slb_preload) + static_branch_enable(&no_slb_preload_key); + + if (stress_hpt_enabled) { + unsigned long tmp; + static_branch_enable(&stress_hpt_key); + // Too early to use nr_cpu_ids, so use NR_CPUS + tmp = memblock_phys_alloc_range(sizeof(struct stress_hpt_struct) * NR_CPUS, + __alignof__(struct stress_hpt_struct), + MEMBLOCK_LOW_LIMIT, limit); + memset((void *)tmp, 0xff, sizeof(struct stress_hpt_struct) * NR_CPUS); + stress_hpt_struct = __va(tmp); + + timer_setup(&stress_hpt_timer, stress_hpt_timer_fn, 0); + stress_hpt_timer.expires = jiffies + msecs_to_jiffies(10); + add_timer(&stress_hpt_timer); + } + + /* + * Calculate the required size of the htab. We want the number of + * PTEGs to equal one half the number of real pages. + */ + htab_size_bytes = htab_get_table_size(); + pteg_count = htab_size_bytes >> 7; + + htab_hash_mask = pteg_count - 1; + + if (firmware_has_feature(FW_FEATURE_LPAR) || + firmware_has_feature(FW_FEATURE_PS3_LV1)) { + /* Using a hypervisor which owns the htab */ + htab_address = NULL; + _SDR1 = 0; +#ifdef CONFIG_FA_DUMP + /* + * If firmware assisted dump is active firmware preserves + * the contents of htab along with entire partition memory. + * Clear the htab if firmware assisted dump is active so + * that we dont end up using old mappings. 
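htab_initialize() above sizes the table so the number of PTEGs is the table size divided by 128 (eight 16-byte HPTEs per group), and the hash mask is simply pteg_count - 1 because the count is a power of two. Worked through for a 16 MiB table (a sketch; the size encoding at the end assumes the __ilog2(size) - 18 formula used by hash_init_partition_table() above):

#include <stdio.h>

int main(void)
{
	unsigned long htab_size_bytes = 1UL << 24;		/* 16 MiB table, say   */
	unsigned long pteg_count = htab_size_bytes >> 7;	/* 128 bytes per PTEG  */
	unsigned long htab_hash_mask = pteg_count - 1;
	unsigned int enc = 24 - 18;	/* encoded size: log2(size) - 18 */

	printf("PTEGs: %lu (2^17), hash mask: %#lx, size encoding: %u\n",
	       pteg_count, htab_hash_mask, enc);
	return 0;
}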
+ */ + if (is_fadump_active() && mmu_hash_ops.hpte_clear_all) + mmu_hash_ops.hpte_clear_all(); +#endif + } else { + + table = memblock_phys_alloc_range(htab_size_bytes, + htab_size_bytes, + MEMBLOCK_LOW_LIMIT, limit); + if (!table) + panic("ERROR: Failed to allocate %pa bytes below %pa\n", + &htab_size_bytes, &limit); + + DBG("Hash table allocated at %lx, size: %lx\n", table, + htab_size_bytes); + + htab_address = __va(table); + + /* htab absolute addr + encoded htabsize */ + _SDR1 = table + __ilog2(htab_size_bytes) - 18; + + /* Initialize the HPT with no entries */ + memset((void *)table, 0, htab_size_bytes); + + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + /* Set SDR1 */ + mtspr(SPRN_SDR1, _SDR1); + else + hash_init_partition_table(table, htab_size_bytes); + } + + prot = pgprot_val(PAGE_KERNEL); + + hash_debug_pagealloc_alloc_slots(); + hash_kfence_alloc_pool(); + /* create bolted the linear mapping in the hash table */ + for_each_mem_range(i, &base, &end) { + size = end - base; + base = (unsigned long)__va(base); + + pr_debug("creating mapping for region: 0x%pa..0x%pa (prot: %lx)\n", + &base, &size, prot); + + if ((base + size) >= H_VMALLOC_START) { + pr_warn("Outside the supported range\n"); + continue; + } + + BUG_ON(htab_bolt_mapping(base, base + size, __pa(base), + prot, mmu_linear_psize, mmu_kernel_ssize)); + + update_page_count(mmu_linear_psize, size >> pshift); + } + hash_kfence_map_pool(); + memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); + + /* + * If we have a memory_limit and we've allocated TCEs then we need to + * explicitly map the TCE area at the top of RAM. We also cope with the + * case that the TCEs start below memory_limit. + * tce_alloc_start/end are 16MB aligned so the mapping should work + * for either 4K or 16MB pages. + */ + if (tce_alloc_start) { + tce_alloc_start = (unsigned long)__va(tce_alloc_start); + tce_alloc_end = (unsigned long)__va(tce_alloc_end); + + if (base + size >= tce_alloc_start) + tce_alloc_start = base + size + 1; + + BUG_ON(htab_bolt_mapping(tce_alloc_start, tce_alloc_end, + __pa(tce_alloc_start), prot, + mmu_linear_psize, mmu_kernel_ssize)); + update_page_count(mmu_linear_psize, + (tce_alloc_end - tce_alloc_start) >> pshift); + } + + + DBG(" <- htab_initialize()\n"); +} +#undef KB +#undef MB + +void __init hash__early_init_devtree(void) +{ + /* Initialize segment sizes */ + of_scan_flat_dt(htab_dt_scan_seg_sizes, NULL); + + /* Initialize page sizes */ + htab_scan_page_sizes(); +} + +static struct hash_mm_context init_hash_mm_context; +void __init hash__early_init_mmu(void) +{ +#ifndef CONFIG_PPC_64K_PAGES + /* + * We have code in __hash_page_4K() and elsewhere, which assumes it can + * do the following: + * new_pte |= (slot << H_PAGE_F_GIX_SHIFT) & (H_PAGE_F_SECOND | H_PAGE_F_GIX); + * + * Where the slot number is between 0-15, and values of 8-15 indicate + * the secondary bucket. For that code to work H_PAGE_F_SECOND and + * H_PAGE_F_GIX must occupy four contiguous bits in the PTE, and + * H_PAGE_F_SECOND must be placed above H_PAGE_F_GIX. Assert that here + * with a BUILD_BUG_ON(). 
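The BUILD_BUG_ON() that follows pins down the layout the 4K hash code relies on: the 4-bit slot number lands in H_PAGE_F_GIX, with its top bit (slot >= 8, i.e. the secondary bucket) falling exactly on H_PAGE_F_SECOND. A compile-checked sketch with made-up bit positions (the real H_PAGE_F_GIX_SHIFT is different):

#include <assert.h>
#include <stdio.h>

/* Hypothetical bit positions, chosen only to illustrate the invariant. */
#define F_GIX_SHIFT	12
#define F_GIX		(7UL << F_GIX_SHIFT)		/* low 3 bits of the slot  */
#define F_SECOND	(1UL << (F_GIX_SHIFT + 3))	/* slot bit 3 = secondary  */

/* Same invariant the kernel asserts with BUILD_BUG_ON(). */
static_assert(F_SECOND == (1UL << (F_GIX_SHIFT + 3)), "SECOND must sit above GIX");

int main(void)
{
	for (unsigned long slot = 0; slot < 16; slot++) {
		unsigned long pte = (slot << F_GIX_SHIFT) & (F_SECOND | F_GIX);

		printf("slot %2lu -> gix %lu, secondary %d\n", slot,
		       (pte & F_GIX) >> F_GIX_SHIFT, (pte & F_SECOND) != 0);
	}
	return 0;
}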
+ */ + BUILD_BUG_ON(H_PAGE_F_SECOND != (1ul << (H_PAGE_F_GIX_SHIFT + 3))); +#endif /* CONFIG_PPC_64K_PAGES */ + + htab_init_page_sizes(); + + /* + * initialize page table size + */ + __pte_frag_nr = H_PTE_FRAG_NR; + __pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT; + __pmd_frag_nr = H_PMD_FRAG_NR; + __pmd_frag_size_shift = H_PMD_FRAG_SIZE_SHIFT; + + __pte_index_size = H_PTE_INDEX_SIZE; + __pmd_index_size = H_PMD_INDEX_SIZE; + __pud_index_size = H_PUD_INDEX_SIZE; + __pgd_index_size = H_PGD_INDEX_SIZE; + __pud_cache_index = H_PUD_CACHE_INDEX; + __pte_table_size = H_PTE_TABLE_SIZE; + __pmd_table_size = H_PMD_TABLE_SIZE; + __pud_table_size = H_PUD_TABLE_SIZE; + __pgd_table_size = H_PGD_TABLE_SIZE; + __pmd_val_bits = HASH_PMD_VAL_BITS; + __pud_val_bits = HASH_PUD_VAL_BITS; + __pgd_val_bits = HASH_PGD_VAL_BITS; + + __kernel_virt_start = H_KERN_VIRT_START; + __vmalloc_start = H_VMALLOC_START; + __vmalloc_end = H_VMALLOC_END; + __kernel_io_start = H_KERN_IO_START; + __kernel_io_end = H_KERN_IO_END; + vmemmap = (struct page *)H_VMEMMAP_START; + ioremap_bot = IOREMAP_BASE; + +#ifdef CONFIG_PCI + pci_io_base = ISA_IO_BASE; +#endif + + /* Select appropriate backend */ + if (firmware_has_feature(FW_FEATURE_PS3_LV1)) + ps3_early_mm_init(); + else if (firmware_has_feature(FW_FEATURE_LPAR)) + hpte_init_pseries(); + else if (IS_ENABLED(CONFIG_PPC_HASH_MMU_NATIVE)) + hpte_init_native(); + + if (!mmu_hash_ops.hpte_insert) + panic("hash__early_init_mmu: No MMU hash ops defined!\n"); + + /* + * Initialize the MMU Hash table and create the linear mapping + * of memory. Has to be done before SLB initialization as this is + * currently where the page size encoding is obtained. + */ + htab_initialize(); + + init_mm.context.hash_context = &init_hash_mm_context; + mm_ctx_set_slb_addr_limit(&init_mm.context, SLB_ADDR_LIMIT_DEFAULT); + + pr_info("Initializing hash mmu with SLB\n"); + /* Initialize SLB management */ + slb_initialize(); + + if (cpu_has_feature(CPU_FTR_ARCH_206) + && cpu_has_feature(CPU_FTR_HVMODE)) + tlbiel_all(); +} + +#ifdef CONFIG_SMP +void hash__early_init_mmu_secondary(void) +{ + /* Initialize hash table for that CPU */ + if (!firmware_has_feature(FW_FEATURE_LPAR)) { + + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + mtspr(SPRN_SDR1, _SDR1); + else + set_ptcr_when_no_uv(__pa(partition_tb) | + (PATB_SIZE_SHIFT - 12)); + } + /* Initialize SLB */ + slb_initialize(); + + if (cpu_has_feature(CPU_FTR_ARCH_206) + && cpu_has_feature(CPU_FTR_HVMODE)) + tlbiel_all(); + +#ifdef CONFIG_PPC_MEM_KEYS + if (mmu_has_feature(MMU_FTR_PKEY)) + mtspr(SPRN_UAMOR, default_uamor); +#endif +} +#endif /* CONFIG_SMP */ + +/* + * Called by asm hashtable.S for doing lazy icache flush + */ +unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) +{ + struct folio *folio; + + if (!pfn_valid(pte_pfn(pte))) + return pp; + + folio = page_folio(pte_page(pte)); + + /* page is dirty */ + if (!test_bit(PG_dcache_clean, &folio->flags.f) && + !folio_test_reserved(folio)) { + if (trap == INTERRUPT_INST_STORAGE) { + flush_dcache_icache_folio(folio); + set_bit(PG_dcache_clean, &folio->flags.f); + } else + pp |= HPTE_R_N; + } + return pp; +} + +static unsigned int get_paca_psize(unsigned long addr) +{ + unsigned char *psizes; + unsigned long index, mask_index; + + if (addr < SLICE_LOW_TOP) { + psizes = get_paca()->mm_ctx_low_slices_psize; + index = GET_LOW_SLICE_INDEX(addr); + } else { + psizes = get_paca()->mm_ctx_high_slices_psize; + index = GET_HIGH_SLICE_INDEX(addr); + } + mask_index = index & 0x1; + return (psizes[index >> 
1] >> (mask_index * 4)) & 0xF; +} + + +/* + * Demote a segment to using 4k pages. + * For now this makes the whole process use 4k pages. + */ +#ifdef CONFIG_PPC_64K_PAGES +void demote_segment_4k(struct mm_struct *mm, unsigned long addr) +{ + if (get_slice_psize(mm, addr) == MMU_PAGE_4K) + return; + slice_set_range_psize(mm, addr, 1, MMU_PAGE_4K); +#ifdef CONFIG_SPU_BASE + spu_flush_all_slbs(mm); +#endif + if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) { + + copy_mm_to_paca(mm); + slb_flush_and_restore_bolted(); + } +} +#endif /* CONFIG_PPC_64K_PAGES */ + +#ifdef CONFIG_PPC_SUBPAGE_PROT +/* + * This looks up a 2-bit protection code for a 4k subpage of a 64k page. + * Userspace sets the subpage permissions using the subpage_prot system call. + * + * Result is 0: full permissions, _PAGE_RW: read-only, + * _PAGE_RWX: no access. + */ +static int subpage_protection(struct mm_struct *mm, unsigned long ea) +{ + struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); + u32 spp = 0; + u32 **sbpm, *sbpp; + + if (!spt) + return 0; + + if (ea >= spt->maxaddr) + return 0; + if (ea < 0x100000000UL) { + /* addresses below 4GB use spt->low_prot */ + sbpm = spt->low_prot; + } else { + sbpm = spt->protptrs[ea >> SBP_L3_SHIFT]; + if (!sbpm) + return 0; + } + sbpp = sbpm[(ea >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)]; + if (!sbpp) + return 0; + spp = sbpp[(ea >> PAGE_SHIFT) & (SBP_L1_COUNT - 1)]; + + /* extract 2-bit bitfield for this 4k subpage */ + spp >>= 30 - 2 * ((ea >> 12) & 0xf); + + /* + * 0 -> full permission + * 1 -> Read only + * 2 -> no access. + * We return the flag that need to be cleared. + */ + spp = ((spp & 2) ? _PAGE_RWX : 0) | ((spp & 1) ? _PAGE_WRITE : 0); + return spp; +} + +#else /* CONFIG_PPC_SUBPAGE_PROT */ +static inline int subpage_protection(struct mm_struct *mm, unsigned long ea) +{ + return 0; +} +#endif + +void hash_failure_debug(unsigned long ea, unsigned long access, + unsigned long vsid, unsigned long trap, + int ssize, int psize, int lpsize, unsigned long pte) +{ + if (!printk_ratelimit()) + return; + pr_info("mm: Hashing failure ! EA=0x%lx access=0x%lx current=%s\n", + ea, access, current->comm); + pr_info(" trap=0x%lx vsid=0x%lx ssize=%d base psize=%d psize %d pte=0x%lx\n", + trap, vsid, ssize, psize, lpsize, pte); +} + +static void check_paca_psize(unsigned long ea, struct mm_struct *mm, + int psize, bool user_region) +{ + if (user_region) { + if (psize != get_paca_psize(ea)) { + copy_mm_to_paca(mm); + slb_flush_and_restore_bolted(); + } + } else if (get_paca()->vmalloc_sllp != + mmu_psize_defs[mmu_vmalloc_psize].sllp) { + get_paca()->vmalloc_sllp = + mmu_psize_defs[mmu_vmalloc_psize].sllp; + slb_vmalloc_update(); + } +} + +/* + * Result code is: + * 0 - handled + * 1 - normal page fault + * -1 - critical hash insertion error + * -2 - access not permitted by subpage protection mechanism + */ +int hash_page_mm(struct mm_struct *mm, unsigned long ea, + unsigned long access, unsigned long trap, + unsigned long flags) +{ + bool is_thp; + pgd_t *pgdir; + unsigned long vsid; + pte_t *ptep; + unsigned hugeshift; + int rc, user_region = 0; + int psize, ssize; + + DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n", + ea, access, trap); + trace_hash_fault(ea, access, trap); + + /* Get region & vsid */ + switch (get_region_id(ea)) { + case USER_REGION_ID: + user_region = 1; + if (! 
mm) { + DBG_LOW(" user region with no mm !\n"); + rc = 1; + goto bail; + } + psize = get_slice_psize(mm, ea); + ssize = user_segment_size(ea); + vsid = get_user_vsid(&mm->context, ea, ssize); + break; + case VMALLOC_REGION_ID: + vsid = get_kernel_vsid(ea, mmu_kernel_ssize); + psize = mmu_vmalloc_psize; + ssize = mmu_kernel_ssize; + flags |= HPTE_USE_KERNEL_KEY; + break; + + case IO_REGION_ID: + vsid = get_kernel_vsid(ea, mmu_kernel_ssize); + psize = mmu_io_psize; + ssize = mmu_kernel_ssize; + flags |= HPTE_USE_KERNEL_KEY; + break; + default: + /* + * Not a valid range + * Send the problem up to do_page_fault() + */ + rc = 1; + goto bail; + } + DBG_LOW(" mm=%p, mm->pgdir=%p, vsid=%016lx\n", mm, mm->pgd, vsid); + + /* Bad address. */ + if (!vsid) { + DBG_LOW("Bad address!\n"); + rc = 1; + goto bail; + } + /* Get pgdir */ + pgdir = mm->pgd; + if (pgdir == NULL) { + rc = 1; + goto bail; + } + + /* Check CPU locality */ + if (user_region && mm_is_thread_local(mm)) + flags |= HPTE_LOCAL_UPDATE; + +#ifndef CONFIG_PPC_64K_PAGES + /* + * If we use 4K pages and our psize is not 4K, then we might + * be hitting a special driver mapping, and need to align the + * address before we fetch the PTE. + * + * It could also be a hugepage mapping, in which case this is + * not necessary, but it's not harmful, either. + */ + if (psize != MMU_PAGE_4K) + ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1); +#endif /* CONFIG_PPC_64K_PAGES */ + + /* Get PTE and page size from page tables */ + ptep = find_linux_pte(pgdir, ea, &is_thp, &hugeshift); + if (ptep == NULL || !pte_present(*ptep)) { + DBG_LOW(" no PTE !\n"); + rc = 1; + goto bail; + } + + if (IS_ENABLED(CONFIG_PPC_4K_PAGES) && !radix_enabled()) { + if (hugeshift == PMD_SHIFT && psize == MMU_PAGE_16M) + hugeshift = mmu_psize_defs[MMU_PAGE_16M].shift; + if (hugeshift == PUD_SHIFT && psize == MMU_PAGE_16G) + hugeshift = mmu_psize_defs[MMU_PAGE_16G].shift; + } + + /* + * Add _PAGE_PRESENT to the required access perm. If there are parallel + * updates to the pte that can possibly clear _PAGE_PTE, catch that too. + * + * We can safely use the return pte address in rest of the function + * because we do set H_PAGE_BUSY which prevents further updates to pte + * from generic code. + */ + access |= _PAGE_PRESENT | _PAGE_PTE; + + /* + * Pre-check access permissions (will be re-checked atomically + * in __hash_page_XX but this pre-check is a fast path + */ + if (!check_pte_access(access, pte_val(*ptep))) { + DBG_LOW(" no access !\n"); + rc = 1; + goto bail; + } + + if (hugeshift) { + if (is_thp) + rc = __hash_page_thp(ea, access, vsid, (pmd_t *)ptep, + trap, flags, ssize, psize); +#ifdef CONFIG_HUGETLB_PAGE + else + rc = __hash_page_huge(ea, access, vsid, ptep, trap, + flags, ssize, hugeshift, psize); +#else + else { + /* + * if we have hugeshift, and is not transhuge with + * hugetlb disabled, something is really wrong. 
+ */ + rc = 1; + WARN_ON(1); + } +#endif + if (current->mm == mm) + check_paca_psize(ea, mm, psize, user_region); + + goto bail; + } + +#ifndef CONFIG_PPC_64K_PAGES + DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep)); +#else + DBG_LOW(" i-pte: %016lx %016lx\n", pte_val(*ptep), + pte_val(*(ptep + PTRS_PER_PTE))); +#endif + /* Do actual hashing */ +#ifdef CONFIG_PPC_64K_PAGES + /* If H_PAGE_4K_PFN is set, make sure this is a 4k segment */ + if ((pte_val(*ptep) & H_PAGE_4K_PFN) && psize == MMU_PAGE_64K) { + demote_segment_4k(mm, ea); + psize = MMU_PAGE_4K; + } + + /* + * If this PTE is non-cacheable and we have restrictions on + * using non cacheable large pages, then we switch to 4k + */ + if (mmu_ci_restrictions && psize == MMU_PAGE_64K && pte_ci(*ptep)) { + if (user_region) { + demote_segment_4k(mm, ea); + psize = MMU_PAGE_4K; + } else if (ea < VMALLOC_END) { + /* + * some driver did a non-cacheable mapping + * in vmalloc space, so switch vmalloc + * to 4k pages + */ + pr_alert("Reducing vmalloc segment " + "to 4kB pages because of " + "non-cacheable mapping\n"); + psize = mmu_vmalloc_psize = MMU_PAGE_4K; +#ifdef CONFIG_SPU_BASE + spu_flush_all_slbs(mm); +#endif + } + } + +#endif /* CONFIG_PPC_64K_PAGES */ + + if (current->mm == mm) + check_paca_psize(ea, mm, psize, user_region); + +#ifdef CONFIG_PPC_64K_PAGES + if (psize == MMU_PAGE_64K) + rc = __hash_page_64K(ea, access, vsid, ptep, trap, + flags, ssize); + else +#endif /* CONFIG_PPC_64K_PAGES */ + { + int spp = subpage_protection(mm, ea); + if (access & spp) + rc = -2; + else + rc = __hash_page_4K(ea, access, vsid, ptep, trap, + flags, ssize, spp); + } + + /* + * Dump some info in case of hash insertion failure, they should + * never happen so it is really useful to know if/when they do + */ + if (rc == -1) + hash_failure_debug(ea, access, vsid, trap, ssize, psize, + psize, pte_val(*ptep)); +#ifndef CONFIG_PPC_64K_PAGES + DBG_LOW(" o-pte: %016lx\n", pte_val(*ptep)); +#else + DBG_LOW(" o-pte: %016lx %016lx\n", pte_val(*ptep), + pte_val(*(ptep + PTRS_PER_PTE))); +#endif + DBG_LOW(" -> rc=%d\n", rc); + +bail: + return rc; +} +EXPORT_SYMBOL_GPL(hash_page_mm); + +int hash_page(unsigned long ea, unsigned long access, unsigned long trap, + unsigned long dsisr) +{ + unsigned long flags = 0; + struct mm_struct *mm = current->mm; + + if ((get_region_id(ea) == VMALLOC_REGION_ID) || + (get_region_id(ea) == IO_REGION_ID)) + mm = &init_mm; + + if (dsisr & DSISR_NOHPTE) + flags |= HPTE_NOHPTE_UPDATE; + + return hash_page_mm(mm, ea, access, trap, flags); +} +EXPORT_SYMBOL_GPL(hash_page); + +DEFINE_INTERRUPT_HANDLER(do_hash_fault) +{ + unsigned long ea = regs->dar; + unsigned long dsisr = regs->dsisr; + unsigned long access = _PAGE_PRESENT | _PAGE_READ; + unsigned long flags = 0; + struct mm_struct *mm; + unsigned int region_id; + long err; + + if (unlikely(dsisr & (DSISR_BAD_FAULT_64S | DSISR_KEYFAULT))) { + hash__do_page_fault(regs); + return; + } + + region_id = get_region_id(ea); + if ((region_id == VMALLOC_REGION_ID) || (region_id == IO_REGION_ID)) + mm = &init_mm; + else + mm = current->mm; + + if (dsisr & DSISR_NOHPTE) + flags |= HPTE_NOHPTE_UPDATE; + + if (dsisr & DSISR_ISSTORE) + access |= _PAGE_WRITE; + /* + * We set _PAGE_PRIVILEGED only when + * kernel mode access kernel space. + * + * _PAGE_PRIVILEGED is NOT set + * 1) when kernel mode access user space + * 2) user space access kernel space. 
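Putting the rule just stated in one place: the fault handler starts from PRESENT|READ, adds WRITE for stores and EXEC for instruction faults, and keeps _PAGE_PRIVILEGED only when kernel mode touches a kernel-region address. A sketch of that decision as a pure function (the flag values and boolean inputs are illustrative, not the kernel's):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative flag values. */
#define F_PRESENT	0x1
#define F_READ		0x2
#define F_WRITE		0x4
#define F_EXEC		0x8
#define F_PRIVILEGED	0x10

static unsigned int fault_access(bool is_store, bool is_exec_fault,
				 bool user_mode, bool user_region)
{
	unsigned int access = F_PRESENT | F_READ;

	if (is_store)
		access |= F_WRITE;
	if (is_exec_fault)
		access |= F_EXEC;
	/* privileged only for kernel mode touching kernel-region addresses */
	if (!user_mode && !user_region)
		access |= F_PRIVILEGED;
	return access;
}

int main(void)
{
	printf("kernel->kernel load : %#x\n", fault_access(false, false, false, false));
	printf("kernel->user   store: %#x\n", fault_access(true,  false, false, true));
	printf("user  ->user   exec : %#x\n", fault_access(false, true,  true,  true));
	return 0;
}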
+ */ + access |= _PAGE_PRIVILEGED; + if (user_mode(regs) || (region_id == USER_REGION_ID)) + access &= ~_PAGE_PRIVILEGED; + + if (TRAP(regs) == INTERRUPT_INST_STORAGE) + access |= _PAGE_EXEC; + + err = hash_page_mm(mm, ea, access, TRAP(regs), flags); + if (unlikely(err < 0)) { + // failed to insert a hash PTE due to an hypervisor error + if (user_mode(regs)) { + if (IS_ENABLED(CONFIG_PPC_SUBPAGE_PROT) && err == -2) + _exception(SIGSEGV, regs, SEGV_ACCERR, ea); + else + _exception(SIGBUS, regs, BUS_ADRERR, ea); + } else { + bad_page_fault(regs, SIGBUS); + } + err = 0; + + } else if (err) { + hash__do_page_fault(regs); + } +} + +static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) +{ + int psize = get_slice_psize(mm, ea); + + /* We only prefault standard pages for now */ + if (unlikely(psize != mm_ctx_user_psize(&mm->context))) + return false; + + /* + * Don't prefault if subpage protection is enabled for the EA. + */ + if (unlikely((psize == MMU_PAGE_4K) && subpage_protection(mm, ea))) + return false; + + return true; +} + +static void hash_preload(struct mm_struct *mm, pte_t *ptep, unsigned long ea, + bool is_exec, unsigned long trap) +{ + unsigned long vsid; + pgd_t *pgdir; + int rc, ssize, update_flags = 0; + unsigned long access = _PAGE_PRESENT | _PAGE_READ | (is_exec ? _PAGE_EXEC : 0); + unsigned long flags; + + BUG_ON(get_region_id(ea) != USER_REGION_ID); + + if (!should_hash_preload(mm, ea)) + return; + + DBG_LOW("hash_preload(mm=%p, mm->pgdir=%p, ea=%016lx, access=%lx," + " trap=%lx\n", mm, mm->pgd, ea, access, trap); + + /* Get Linux PTE if available */ + pgdir = mm->pgd; + if (pgdir == NULL) + return; + + /* Get VSID */ + ssize = user_segment_size(ea); + vsid = get_user_vsid(&mm->context, ea, ssize); + if (!vsid) + return; + +#ifdef CONFIG_PPC_64K_PAGES + /* If either H_PAGE_4K_PFN or cache inhibited is set (and we are on + * a 64K kernel), then we don't preload, hash_page() will take + * care of it once we actually try to access the page. + * That way we don't have to duplicate all of the logic for segment + * page size demotion here + * Called with PTL held, hence can be sure the value won't change in + * between. + */ + if ((pte_val(*ptep) & H_PAGE_4K_PFN) || pte_ci(*ptep)) + return; +#endif /* CONFIG_PPC_64K_PAGES */ + + /* + * __hash_page_* must run with interrupts off, including PMI interrupts + * off, as it sets the H_PAGE_BUSY bit. + * + * It's otherwise possible for perf interrupts to hit at any time and + * may take a hash fault reading the user stack, which could take a + * hash miss and deadlock on the same H_PAGE_BUSY bit. + * + * Interrupts must also be off for the duration of the + * mm_is_thread_local test and update, to prevent preempt running the + * mm on another CPU (XXX: this may be racy vs kthread_use_mm). + */ + powerpc_local_irq_pmu_save(flags); + + /* Is that local to this CPU ? 
*/ + if (mm_is_thread_local(mm)) + update_flags |= HPTE_LOCAL_UPDATE; + + /* Hash it in */ +#ifdef CONFIG_PPC_64K_PAGES + if (mm_ctx_user_psize(&mm->context) == MMU_PAGE_64K) + rc = __hash_page_64K(ea, access, vsid, ptep, trap, + update_flags, ssize); + else +#endif /* CONFIG_PPC_64K_PAGES */ + rc = __hash_page_4K(ea, access, vsid, ptep, trap, update_flags, + ssize, subpage_protection(mm, ea)); + + /* Dump some info in case of hash insertion failure, they should + * never happen so it is really useful to know if/when they do + */ + if (rc == -1) + hash_failure_debug(ea, access, vsid, trap, ssize, + mm_ctx_user_psize(&mm->context), + mm_ctx_user_psize(&mm->context), + pte_val(*ptep)); + + powerpc_local_irq_pmu_restore(flags); +} + +/* + * This is called at the end of handling a user page fault, when the + * fault has been handled by updating a PTE in the linux page tables. + * We use it to preload an HPTE into the hash table corresponding to + * the updated linux PTE. + * + * This must always be called with the pte lock held. + */ +void __update_mmu_cache(struct vm_area_struct *vma, unsigned long address, + pte_t *ptep) +{ + /* + * We don't need to worry about _PAGE_PRESENT here because we are + * called with either mm->page_table_lock held or ptl lock held + */ + unsigned long trap; + bool is_exec; + + /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ + if (!pte_young(*ptep) || address >= TASK_SIZE) + return; + + /* + * We try to figure out if we are coming from an instruction + * access fault and pass that down to __hash_page so we avoid + * double-faulting on execution of fresh text. We have to test + * for regs NULL since init will get here first thing at boot. + * + * We also avoid filling the hash if not coming from a fault. + */ + + trap = current->thread.regs ? TRAP(current->thread.regs) : 0UL; + switch (trap) { + case 0x300: + is_exec = false; + break; + case 0x400: + is_exec = true; + break; + default: + return; + } + + hash_preload(vma->vm_mm, ptep, address, is_exec, trap); +} + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +static inline void tm_flush_hash_page(int local) +{ + /* + * Transactions are not aborted by tlbiel, only tlbie. Without, syncing a + * page back to a block device w/PIO could pick up transactional data + * (bad!) so we force an abort here. Before the sync the page will be + * made read-only, which will flush_hash_page. BIG ISSUE here: if the + * kernel uses a page from userspace without unmapping it first, it may + * see the speculated version. + */ + if (local && cpu_has_feature(CPU_FTR_TM) && current->thread.regs && + MSR_TM_ACTIVE(current->thread.regs->msr)) { + tm_enable(); + tm_abort(TM_CAUSE_TLBI); + } +} +#else +static inline void tm_flush_hash_page(int local) +{ +} +#endif + +/* + * Return the global hash slot, corresponding to the given PTE, which contains + * the HPTE. 
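+ *
+ * The hidx stored in the real PTE records which bucket was used
+ * (_PTEIDX_SECONDARY selects the complemented hash) and the slot
+ * within that group (_PTEIDX_GROUP_IX), so the HPTE can be located
+ * again without searching the group.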
+ */ +unsigned long pte_get_hash_gslot(unsigned long vpn, unsigned long shift, + int ssize, real_pte_t rpte, unsigned int subpg_index) +{ + unsigned long hash, gslot, hidx; + + hash = hpt_hash(vpn, shift, ssize); + hidx = __rpte_to_hidx(rpte, subpg_index); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + gslot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + gslot += hidx & _PTEIDX_GROUP_IX; + return gslot; +} + +void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize, + unsigned long flags) +{ + unsigned long index, shift, gslot; + int local = flags & HPTE_LOCAL_UPDATE; + + DBG_LOW("flush_hash_page(vpn=%016lx)\n", vpn); + pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) { + gslot = pte_get_hash_gslot(vpn, shift, ssize, pte, index); + DBG_LOW(" sub %ld: gslot=%lx\n", index, gslot); + /* + * We use same base page size and actual psize, because we don't + * use these functions for hugepage + */ + mmu_hash_ops.hpte_invalidate(gslot, vpn, psize, psize, + ssize, local); + } pte_iterate_hashed_end(); + + tm_flush_hash_page(local); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void flush_hash_hugepage(unsigned long vsid, unsigned long addr, + pmd_t *pmdp, unsigned int psize, int ssize, + unsigned long flags) +{ + int i, max_hpte_count, valid; + unsigned long s_addr; + unsigned char *hpte_slot_array; + unsigned long hidx, shift, vpn, hash, slot; + int local = flags & HPTE_LOCAL_UPDATE; + + s_addr = addr & HPAGE_PMD_MASK; + hpte_slot_array = get_hpte_slot_array(pmdp); + /* + * IF we try to do a HUGE PTE update after a withdraw is done. + * we will find the below NULL. This happens when we do + * split_huge_pmd + */ + if (!hpte_slot_array) + return; + + if (mmu_hash_ops.hugepage_invalidate) { + mmu_hash_ops.hugepage_invalidate(vsid, s_addr, hpte_slot_array, + psize, ssize, local); + goto tm_abort; + } + /* + * No bluk hpte removal support, invalidate each entry + */ + shift = mmu_psize_defs[psize].shift; + max_hpte_count = HPAGE_PMD_SIZE >> shift; + for (i = 0; i < max_hpte_count; i++) { + /* + * 8 bits per each hpte entries + * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit] + */ + valid = hpte_valid(hpte_slot_array, i); + if (!valid) + continue; + hidx = hpte_hash_index(hpte_slot_array, i); + + /* get the vpn */ + addr = s_addr + (i * (1ul << shift)); + vpn = hpt_vpn(addr, vsid, ssize); + hash = hpt_hash(vpn, shift, ssize); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx & _PTEIDX_GROUP_IX; + mmu_hash_ops.hpte_invalidate(slot, vpn, psize, + MMU_PAGE_16M, ssize, local); + } +tm_abort: + tm_flush_hash_page(local); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +void flush_hash_range(unsigned long number, int local) +{ + if (mmu_hash_ops.flush_hash_range) + mmu_hash_ops.flush_hash_range(number, local); + else { + int i; + struct ppc64_tlb_batch *batch = + this_cpu_ptr(&ppc64_tlb_batch); + + for (i = 0; i < number; i++) + flush_hash_page(batch->vpn[i], batch->pte[i], + batch->psize, batch->ssize, local); + } +} + +long hpte_insert_repeating(unsigned long hash, unsigned long vpn, + unsigned long pa, unsigned long rflags, + unsigned long vflags, int psize, int ssize) +{ + unsigned long hpte_group; + long slot; + +repeat: + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; + + /* Insert into the hash table, primary slot */ + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, vflags, + psize, psize, ssize); + + /* Primary is full, try the secondary */ + if (unlikely(slot == -1)) { + 
hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, + vflags | HPTE_V_SECONDARY, + psize, psize, ssize); + if (slot == -1) { + if (mftb() & 0x1) + hpte_group = (hash & htab_hash_mask) * + HPTES_PER_GROUP; + + mmu_hash_ops.hpte_remove(hpte_group); + goto repeat; + } + } + + return slot; +} + +void hpt_clear_stress(void) +{ + int cpu = raw_smp_processor_id(); + int g; + + for (g = 0; g < stress_nr_groups(); g++) { + unsigned long last_group; + last_group = stress_hpt_struct[cpu].last_group[g]; + + if (last_group != -1UL) { + int i; + for (i = 0; i < HPTES_PER_GROUP; i++) { + if (mmu_hash_ops.hpte_remove(last_group) == -1) + break; + } + stress_hpt_struct[cpu].last_group[g] = -1; + } + } +} + +void hpt_do_stress(unsigned long ea, unsigned long hpte_group) +{ + unsigned long last_group; + int cpu = raw_smp_processor_id(); + + last_group = stress_hpt_struct[cpu].last_group[stress_nr_groups() - 1]; + if (hpte_group == last_group) + return; + + if (last_group != -1UL) { + int i; + /* + * Concurrent CPUs might be inserting into this group, so + * give up after a number of iterations, to prevent a live + * lock. + */ + for (i = 0; i < HPTES_PER_GROUP; i++) { + if (mmu_hash_ops.hpte_remove(last_group) == -1) + break; + } + stress_hpt_struct[cpu].last_group[stress_nr_groups() - 1] = -1; + } + + if (ea >= PAGE_OFFSET) { + /* + * We would really like to prefetch to get the TLB loaded, then + * remove the PTE before returning from fault interrupt, to + * increase the hash fault rate. + * + * Unfortunately QEMU TCG does not model the TLB in a way that + * makes this possible, and systemsim (mambo) emulator does not + * bring in TLBs with prefetches (although loads/stores do + * work for non-CI PTEs). + * + * So remember this PTE and clear it on the next hash fault. + */ + memmove(&stress_hpt_struct[cpu].last_group[1], + &stress_hpt_struct[cpu].last_group[0], + (stress_nr_groups() - 1) * sizeof(unsigned long)); + stress_hpt_struct[cpu].last_group[0] = hpte_group; + } +} + +void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + /* + * We don't currently support the first MEMBLOCK not mapping 0 + * physical on those processors + */ + BUG_ON(first_memblock_base != 0); + + /* + * On virtualized systems the first entry is our RMA region aka VRMA, + * non-virtualized 64-bit hash MMU systems don't have a limitation + * on real mode access. + * + * For guests on platforms before POWER9, we clamp the it limit to 1G + * to avoid some funky things such as RTAS bugs etc... + * + * On POWER9 we limit to 1TB in case the host erroneously told us that + * the RMA was >1TB. Effective address bits 0:23 are treated as zero + * (meaning the access is aliased to zero i.e. addr = addr % 1TB) + * for virtual real mode addressing and so it doesn't make sense to + * have an area larger than 1TB as it can't be addressed. 
+ */ + if (!early_cpu_has_feature(CPU_FTR_HVMODE)) { + ppc64_rma_size = first_memblock_size; + if (!early_cpu_has_feature(CPU_FTR_ARCH_300)) + ppc64_rma_size = min_t(u64, ppc64_rma_size, 0x40000000); + else + ppc64_rma_size = min_t(u64, ppc64_rma_size, + 1UL << SID_SHIFT_1T); + + /* Finally limit subsequent allocations */ + memblock_set_current_limit(ppc64_rma_size); + } else { + ppc64_rma_size = ULONG_MAX; + } +} + +#ifdef CONFIG_DEBUG_FS + +static int hpt_order_get(void *data, u64 *val) +{ + *val = ppc64_pft_size; + return 0; +} + +static int hpt_order_set(void *data, u64 val) +{ + int ret; + + if (!mmu_hash_ops.resize_hpt) + return -ENODEV; + + cpus_read_lock(); + ret = mmu_hash_ops.resize_hpt(val); + cpus_read_unlock(); + + return ret; +} + +DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n"); + +static int __init hash64_debugfs(void) +{ + if (radix_enabled()) + return 0; + debugfs_create_file("hpt_order", 0600, arch_debugfs_dir, NULL, + &fops_hpt_order); + return 0; +} +machine_device_initcall(pseries, hash64_debugfs); +#endif /* CONFIG_DEBUG_FS */ + +void __init print_system_hash_info(void) +{ + pr_info("ppc64_pft_size = 0x%llx\n", ppc64_pft_size); + + if (htab_hash_mask) + pr_info("htab_hash_mask = 0x%lx\n", htab_hash_mask); +} + +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ + /* + * If we are using 1TB segments and we are allowed to randomise + * the heap, we can put it above 1TB so it is backed by a 1TB + * segment. Otherwise the heap will be in the bottom 1TB + * which always uses 256MB segments and this may result in a + * performance penalty. + */ + if (is_32bit_task()) + return randomize_page(mm->brk, SZ_32M); + else if (!radix_enabled() && mmu_highuser_ssize == MMU_SEGSIZE_1T) + return randomize_page(max_t(unsigned long, mm->brk, SZ_1T), SZ_1G); + else + return randomize_page(mm->brk, SZ_1G); +} diff --git a/arch/powerpc/mm/book3s64/hugetlbpage.c b/arch/powerpc/mm/book3s64/hugetlbpage.c new file mode 100644 index 000000000000..2bcbbf9d85ac --- /dev/null +++ b/arch/powerpc/mm/book3s64/hugetlbpage.c @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * PPC64 Huge TLB Page Support for hash based MMUs (POWER4 and later) + * + * Copyright (C) 2003 David Gibson, IBM Corporation. + * + * Based on the IA-32 version: + * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> + */ + +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <asm/cacheflush.h> +#include <asm/machdep.h> + +unsigned int hpage_shift; +EXPORT_SYMBOL(hpage_shift); + +#ifdef CONFIG_PPC_64S_HASH_MMU +int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, + pte_t *ptep, unsigned long trap, unsigned long flags, + int ssize, unsigned int shift, unsigned int mmu_psize) +{ + real_pte_t rpte; + unsigned long vpn; + unsigned long old_pte, new_pte; + unsigned long rflags, pa; + long slot, offset; + + BUG_ON(shift != mmu_psize_defs[mmu_psize].shift); + + /* Search the Linux page table for a match with va */ + vpn = hpt_vpn(ea, vsid, ssize); + + /* + * At this point, we have a pte (old_pte) which can be used to build + * or update an HPTE. There are 2 cases: + * + * 1. There is a valid (present) pte with no associated HPTE (this is + * the most common case) + * 2. There is a valid (present) pte with an associated HPTE. The + * current values of the pp bits in the HPTE prevent access + * because we are doing software DIRTY bit management and the + * page is currently not DIRTY. 
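+ *
+ * Case 2 is handled below by first trying hpte_updatepp() on the
+ * existing slot; if that fails, the stale HPTE flags are cleared and
+ * a fresh HPTE is inserted as in case 1.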
+ */ + + + do { + old_pte = pte_val(*ptep); + /* If PTE busy, retry the access */ + if (unlikely(old_pte & H_PAGE_BUSY)) + return 0; + /* If PTE permissions don't match, take page fault */ + if (unlikely(!check_pte_access(access, old_pte))) + return 1; + /* + * If hash-4k, hugepages use seeral contiguous PxD entries + * so bail out and let mm make the page young or dirty + */ + if (IS_ENABLED(CONFIG_PPC_4K_PAGES)) { + if (!(old_pte & _PAGE_ACCESSED)) + return 1; + if ((access & _PAGE_WRITE) && !(old_pte & _PAGE_DIRTY)) + return 1; + } + + /* + * Try to lock the PTE, add ACCESSED and DIRTY if it was + * a write access + */ + new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED; + if (access & _PAGE_WRITE) + new_pte |= _PAGE_DIRTY; + } while(!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); + + /* Make sure this is a hugetlb entry */ + if (old_pte & H_PAGE_THP_HUGE) + return 0; + + rflags = htab_convert_pte_flags(new_pte, flags); + if (unlikely(mmu_psize == MMU_PAGE_16G)) + offset = PTRS_PER_PUD; + else + offset = PTRS_PER_PMD; + rpte = __real_pte(__pte(old_pte), ptep, offset); + + if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) + /* + * No CPU has hugepages but lacks no execute, so we + * don't need to worry about that case + */ + rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); + + /* Check if pte already has an hpte (case 2) */ + if (unlikely(old_pte & H_PAGE_HASHPTE)) { + /* There MIGHT be an HPTE for this pte */ + unsigned long gslot; + + gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, 0); + if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, mmu_psize, + mmu_psize, ssize, flags) == -1) + old_pte &= ~_PAGE_HPTEFLAGS; + } + + if (likely(!(old_pte & H_PAGE_HASHPTE))) { + unsigned long hash = hpt_hash(vpn, shift, ssize); + + pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; + + /* clear HPTE slot informations in new PTE */ + new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; + + slot = hpte_insert_repeating(hash, vpn, pa, rflags, 0, + mmu_psize, ssize); + + /* + * Hypervisor failure. Restore old pte and return -1 + * similar to __hash_page_* + */ + if (unlikely(slot == -2)) { + *ptep = __pte(old_pte); + hash_failure_debug(ea, access, vsid, trap, ssize, + mmu_psize, mmu_psize, old_pte); + return -1; + } + + new_pte |= pte_set_hidx(ptep, rpte, 0, slot, offset); + } + + /* + * No need to use ldarx/stdcx here + */ + *ptep = __pte(new_pte & ~H_PAGE_BUSY); + return 0; +} +#endif + +pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + unsigned long pte_val; + /* + * Clear the _PAGE_PRESENT so that no hardware parallel update is + * possible. Also keep the pte_present true so that we don't take + * wrong fault. + */ + pte_val = pte_update(vma->vm_mm, addr, ptep, + _PAGE_PRESENT, _PAGE_INVALID, 1); + + return __pte(pte_val); +} + +void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep, pte_t old_pte, pte_t pte) +{ + unsigned long psize; + + if (radix_enabled()) + return radix__huge_ptep_modify_prot_commit(vma, addr, ptep, + old_pte, pte); + + psize = huge_page_size(hstate_vma(vma)); + set_huge_pte_at(vma->vm_mm, addr, ptep, pte, psize); +} + +void __init hugetlbpage_init_defaultsize(void) +{ + /* Set default large page size. 
Currently, we pick 16M or 1M + * depending on what is available + */ + if (mmu_psize_defs[MMU_PAGE_16M].shift) + hpage_shift = mmu_psize_defs[MMU_PAGE_16M].shift; + else if (mmu_psize_defs[MMU_PAGE_1M].shift) + hpage_shift = mmu_psize_defs[MMU_PAGE_1M].shift; + else if (mmu_psize_defs[MMU_PAGE_2M].shift) + hpage_shift = mmu_psize_defs[MMU_PAGE_2M].shift; +} diff --git a/arch/powerpc/mm/book3s64/internal.h b/arch/powerpc/mm/book3s64/internal.h new file mode 100644 index 000000000000..cad08d83369c --- /dev/null +++ b/arch/powerpc/mm/book3s64/internal.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef ARCH_POWERPC_MM_BOOK3S64_INTERNAL_H +#define ARCH_POWERPC_MM_BOOK3S64_INTERNAL_H + +#include <linux/jump_label.h> + +extern bool stress_slb_enabled; + +DECLARE_STATIC_KEY_FALSE(stress_slb_key); + +static inline bool stress_slb(void) +{ + return static_branch_unlikely(&stress_slb_key); +} + +extern bool stress_hpt_enabled; + +DECLARE_STATIC_KEY_FALSE(stress_hpt_key); + +static inline bool stress_hpt(void) +{ + return static_branch_unlikely(&stress_hpt_key); +} + +extern bool no_slb_preload; +DECLARE_STATIC_KEY_FALSE(no_slb_preload_key); +static inline bool slb_preload_disabled(void) +{ + return static_branch_unlikely(&no_slb_preload_key); +} + +void hpt_do_stress(unsigned long ea, unsigned long hpte_group); + +void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush); + +#endif /* ARCH_POWERPC_MM_BOOK3S64_INTERNAL_H */ diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c new file mode 100644 index 000000000000..c0e8d597e4cb --- /dev/null +++ b/arch/powerpc/mm/book3s64/iommu_api.c @@ -0,0 +1,402 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * IOMMU helpers in MMU context. + * + * Copyright (C) 2015 IBM Corp. <aik@ozlabs.ru> + */ + +#include <linux/sched/signal.h> +#include <linux/slab.h> +#include <linux/rculist.h> +#include <linux/vmalloc.h> +#include <linux/mutex.h> +#include <linux/migrate.h> +#include <linux/hugetlb.h> +#include <linux/swap.h> +#include <linux/sizes.h> +#include <linux/mm.h> +#include <asm/mmu_context.h> +#include <asm/pte-walk.h> +#include <linux/mm_inline.h> + +static DEFINE_MUTEX(mem_list_mutex); + +#define MM_IOMMU_TABLE_GROUP_PAGE_DIRTY 0x1 +#define MM_IOMMU_TABLE_GROUP_PAGE_MASK ~(SZ_4K - 1) + +struct mm_iommu_table_group_mem_t { + struct list_head next; + struct rcu_head rcu; + unsigned long used; + atomic64_t mapped; + unsigned int pageshift; + u64 ua; /* userspace address */ + u64 entries; /* number of entries in hpas/hpages[] */ + /* + * in mm_iommu_get we temporarily use this to store + * struct page address. + * + * We need to convert ua to hpa in real mode. Make it + * simpler by storing physical address. 
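+ *
+ * hpages[] and hpas[] share the same storage (see the union below):
+ * once the pages are pinned, each entry is rewritten in place with
+ * the corresponding physical address.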
+ */ + union { + struct page **hpages; /* vmalloc'ed */ + phys_addr_t *hpas; + }; +#define MM_IOMMU_TABLE_INVALID_HPA ((uint64_t)-1) + u64 dev_hpa; /* Device memory base address */ +}; + +bool mm_iommu_preregistered(struct mm_struct *mm) +{ + return !list_empty(&mm->context.iommu_group_mem_list); +} +EXPORT_SYMBOL_GPL(mm_iommu_preregistered); + +static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, + unsigned long entries, unsigned long dev_hpa, + struct mm_iommu_table_group_mem_t **pmem) +{ + struct mm_iommu_table_group_mem_t *mem, *mem2; + long i, ret, locked_entries = 0, pinned = 0; + unsigned int pageshift; + unsigned long entry, chunk; + + if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) { + ret = account_locked_vm(mm, entries, true); + if (ret) + return ret; + + locked_entries = entries; + } + + mem = kzalloc(sizeof(*mem), GFP_KERNEL); + if (!mem) { + ret = -ENOMEM; + goto unlock_exit; + } + + if (dev_hpa != MM_IOMMU_TABLE_INVALID_HPA) { + mem->pageshift = __ffs(dev_hpa | (entries << PAGE_SHIFT)); + mem->dev_hpa = dev_hpa; + goto good_exit; + } + mem->dev_hpa = MM_IOMMU_TABLE_INVALID_HPA; + + /* + * For a starting point for a maximum page size calculation + * we use @ua and @entries natural alignment to allow IOMMU pages + * smaller than huge pages but still bigger than PAGE_SIZE. + */ + mem->pageshift = __ffs(ua | (entries << PAGE_SHIFT)); + mem->hpas = vzalloc(array_size(entries, sizeof(mem->hpas[0]))); + if (!mem->hpas) { + kfree(mem); + ret = -ENOMEM; + goto unlock_exit; + } + + mmap_read_lock(mm); + chunk = (1UL << (PAGE_SHIFT + MAX_PAGE_ORDER)) / + sizeof(struct vm_area_struct *); + chunk = min(chunk, entries); + for (entry = 0; entry < entries; entry += chunk) { + unsigned long n = min(entries - entry, chunk); + + ret = pin_user_pages(ua + (entry << PAGE_SHIFT), n, + FOLL_WRITE | FOLL_LONGTERM, + mem->hpages + entry); + if (ret == n) { + pinned += n; + continue; + } + if (ret > 0) + pinned += ret; + break; + } + mmap_read_unlock(mm); + if (pinned != entries) { + if (!ret) + ret = -EFAULT; + goto free_exit; + } + +good_exit: + atomic64_set(&mem->mapped, 1); + mem->used = 1; + mem->ua = ua; + mem->entries = entries; + + mutex_lock(&mem_list_mutex); + + list_for_each_entry_rcu(mem2, &mm->context.iommu_group_mem_list, next, + lockdep_is_held(&mem_list_mutex)) { + /* Overlap? */ + if ((mem2->ua < (ua + (entries << PAGE_SHIFT))) && + (ua < (mem2->ua + + (mem2->entries << PAGE_SHIFT)))) { + ret = -EINVAL; + mutex_unlock(&mem_list_mutex); + goto free_exit; + } + } + + if (mem->dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) { + /* + * Allow to use larger than 64k IOMMU pages. Only do that + * if we are backed by hugetlb. Skip device memory as it is not + * backed with page structs. + */ + pageshift = PAGE_SHIFT; + for (i = 0; i < entries; ++i) { + struct page *page = mem->hpages[i]; + + if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) + pageshift = page_shift(compound_head(page)); + mem->pageshift = min(mem->pageshift, pageshift); + /* + * We don't need struct page reference any more, switch + * to physical address. 
+ */ + mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT; + } + } + + list_add_rcu(&mem->next, &mm->context.iommu_group_mem_list); + + mutex_unlock(&mem_list_mutex); + + *pmem = mem; + + return 0; + +free_exit: + /* free the references taken */ + unpin_user_pages(mem->hpages, pinned); + + vfree(mem->hpas); + kfree(mem); + +unlock_exit: + account_locked_vm(mm, locked_entries, false); + + return ret; +} + +long mm_iommu_new(struct mm_struct *mm, unsigned long ua, unsigned long entries, + struct mm_iommu_table_group_mem_t **pmem) +{ + return mm_iommu_do_alloc(mm, ua, entries, MM_IOMMU_TABLE_INVALID_HPA, + pmem); +} +EXPORT_SYMBOL_GPL(mm_iommu_new); + +long mm_iommu_newdev(struct mm_struct *mm, unsigned long ua, + unsigned long entries, unsigned long dev_hpa, + struct mm_iommu_table_group_mem_t **pmem) +{ + return mm_iommu_do_alloc(mm, ua, entries, dev_hpa, pmem); +} +EXPORT_SYMBOL_GPL(mm_iommu_newdev); + +static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem) +{ + long i; + struct page *page = NULL; + + if (!mem->hpas) + return; + + for (i = 0; i < mem->entries; ++i) { + if (!mem->hpas[i]) + continue; + + page = pfn_to_page(mem->hpas[i] >> PAGE_SHIFT); + if (!page) + continue; + + if (mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY) + SetPageDirty(page); + + unpin_user_page(page); + + mem->hpas[i] = 0; + } +} + +static void mm_iommu_do_free(struct mm_iommu_table_group_mem_t *mem) +{ + + mm_iommu_unpin(mem); + vfree(mem->hpas); + kfree(mem); +} + +static void mm_iommu_free(struct rcu_head *head) +{ + struct mm_iommu_table_group_mem_t *mem = container_of(head, + struct mm_iommu_table_group_mem_t, rcu); + + mm_iommu_do_free(mem); +} + +static void mm_iommu_release(struct mm_iommu_table_group_mem_t *mem) +{ + list_del_rcu(&mem->next); + call_rcu(&mem->rcu, mm_iommu_free); +} + +long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem) +{ + long ret = 0; + unsigned long unlock_entries = 0; + + mutex_lock(&mem_list_mutex); + + if (mem->used == 0) { + ret = -ENOENT; + goto unlock_exit; + } + + --mem->used; + /* There are still users, exit */ + if (mem->used) + goto unlock_exit; + + /* Are there still mappings? 
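+ * @mapped starts at 1 and is raised by mm_iommu_mapped_inc() for each
+ * active mapping, so the cmpxchg below succeeds only when nothing is
+ * currently mapped.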
*/ + if (atomic64_cmpxchg(&mem->mapped, 1, 0) != 1) { + ++mem->used; + ret = -EBUSY; + goto unlock_exit; + } + + if (mem->dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) + unlock_entries = mem->entries; + + /* @mapped became 0 so now mappings are disabled, release the region */ + mm_iommu_release(mem); + +unlock_exit: + mutex_unlock(&mem_list_mutex); + + account_locked_vm(mm, unlock_entries, false); + + return ret; +} +EXPORT_SYMBOL_GPL(mm_iommu_put); + +struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm, + unsigned long ua, unsigned long size) +{ + struct mm_iommu_table_group_mem_t *mem, *ret = NULL; + + rcu_read_lock(); + list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) { + if ((mem->ua <= ua) && + (ua + size <= mem->ua + + (mem->entries << PAGE_SHIFT))) { + ret = mem; + break; + } + } + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL_GPL(mm_iommu_lookup); + +struct mm_iommu_table_group_mem_t *mm_iommu_get(struct mm_struct *mm, + unsigned long ua, unsigned long entries) +{ + struct mm_iommu_table_group_mem_t *mem, *ret = NULL; + + mutex_lock(&mem_list_mutex); + + list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next, + lockdep_is_held(&mem_list_mutex)) { + if ((mem->ua == ua) && (mem->entries == entries)) { + ret = mem; + ++mem->used; + break; + } + } + + mutex_unlock(&mem_list_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(mm_iommu_get); + +long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem, + unsigned long ua, unsigned int pageshift, unsigned long *hpa) +{ + const long entry = (ua - mem->ua) >> PAGE_SHIFT; + u64 *va; + + if (entry >= mem->entries) + return -EFAULT; + + if (pageshift > mem->pageshift) + return -EFAULT; + + if (!mem->hpas) { + *hpa = mem->dev_hpa + (ua - mem->ua); + return 0; + } + + va = &mem->hpas[entry]; + *hpa = (*va & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK); + + return 0; +} +EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa); + +bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa, + unsigned int pageshift, unsigned long *size) +{ + struct mm_iommu_table_group_mem_t *mem; + unsigned long end; + + rcu_read_lock(); + list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) { + if (mem->dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) + continue; + + end = mem->dev_hpa + (mem->entries << PAGE_SHIFT); + if ((mem->dev_hpa <= hpa) && (hpa < end)) { + /* + * Since the IOMMU page size might be bigger than + * PAGE_SIZE, the amount of preregistered memory + * starting from @hpa might be smaller than 1<<pageshift + * and the caller needs to distinguish this situation. 
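+ * For example, with a 64K IOMMU page whose @hpa lies 4K before the
+ * end of the registered region, *size is reported as 4K rather
+ * than 64K.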
+ */ + *size = min(1UL << pageshift, end - hpa); + return true; + } + } + rcu_read_unlock(); + + return false; +} +EXPORT_SYMBOL_GPL(mm_iommu_is_devmem); + +long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem) +{ + if (atomic64_inc_not_zero(&mem->mapped)) + return 0; + + /* Last mm_iommu_put() has been called, no more mappings allowed() */ + return -ENXIO; +} +EXPORT_SYMBOL_GPL(mm_iommu_mapped_inc); + +void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem) +{ + atomic64_add_unless(&mem->mapped, -1, 1); +} +EXPORT_SYMBOL_GPL(mm_iommu_mapped_dec); + +void mm_iommu_init(struct mm_struct *mm) +{ + INIT_LIST_HEAD_RCU(&mm->context.iommu_group_mem_list); +} diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c new file mode 100644 index 000000000000..fb9dcf9ca599 --- /dev/null +++ b/arch/powerpc/mm/book3s64/mmu_context.c @@ -0,0 +1,347 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * MMU context allocation for 64-bit kernels. + * + * Copyright (C) 2004 Anton Blanchard, IBM Corp. <anton@samba.org> + */ + +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/pkeys.h> +#include <linux/spinlock.h> +#include <linux/idr.h> +#include <linux/export.h> +#include <linux/gfp.h> +#include <linux/slab.h> +#include <linux/cpu.h> + +#include <asm/mmu_context.h> +#include <asm/pgalloc.h> + +#include "internal.h" + +static DEFINE_IDA(mmu_context_ida); + +static int alloc_context_id(int min_id, int max_id) +{ + return ida_alloc_range(&mmu_context_ida, min_id, max_id, GFP_KERNEL); +} + +#ifdef CONFIG_PPC_64S_HASH_MMU +void __init hash__reserve_context_id(int id) +{ + int result = ida_alloc_range(&mmu_context_ida, id, id, GFP_KERNEL); + + WARN(result != id, "mmu: Failed to reserve context id %d (rc %d)\n", id, result); +} + +int hash__alloc_context_id(void) +{ + unsigned long max; + + if (mmu_has_feature(MMU_FTR_68_BIT_VA)) + max = MAX_USER_CONTEXT; + else + max = MAX_USER_CONTEXT_65BIT_VA; + + return alloc_context_id(MIN_USER_CONTEXT, max); +} +EXPORT_SYMBOL_GPL(hash__alloc_context_id); +#endif + +#ifdef CONFIG_PPC_64S_HASH_MMU +static int realloc_context_ids(mm_context_t *ctx) +{ + int i, id; + + /* + * id 0 (aka. ctx->id) is special, we always allocate a new one, even if + * there wasn't one allocated previously (which happens in the exec + * case where ctx is newly allocated). + * + * We have to be a bit careful here. We must keep the existing ids in + * the array, so that we can test if they're non-zero to decide if we + * need to allocate a new one. However in case of error we must free the + * ids we've allocated but *not* any of the existing ones (or risk a + * UAF). That's why we decrement i at the start of the error handling + * loop, to skip the id that we just tested but couldn't reallocate. 
+ */ + for (i = 0; i < ARRAY_SIZE(ctx->extended_id); i++) { + if (i == 0 || ctx->extended_id[i]) { + id = hash__alloc_context_id(); + if (id < 0) + goto error; + + ctx->extended_id[i] = id; + } + } + + /* The caller expects us to return id */ + return ctx->id; + +error: + for (i--; i >= 0; i--) { + if (ctx->extended_id[i]) + ida_free(&mmu_context_ida, ctx->extended_id[i]); + } + + return id; +} + +static int hash__init_new_context(struct mm_struct *mm) +{ + int index; + + mm->context.hash_context = kmalloc(sizeof(struct hash_mm_context), + GFP_KERNEL); + if (!mm->context.hash_context) + return -ENOMEM; + + /* + * The old code would re-promote on fork, we don't do that when using + * slices as it could cause problem promoting slices that have been + * forced down to 4K. + * + * For book3s we have MMU_NO_CONTEXT set to be ~0. Hence check + * explicitly against context.id == 0. This ensures that we properly + * initialize context slice details for newly allocated mm's (which will + * have id == 0) and don't alter context slice inherited via fork (which + * will have id != 0). + * + * We should not be calling init_new_context() on init_mm. Hence a + * check against 0 is OK. + */ + if (mm->context.id == 0) { + memset(mm->context.hash_context, 0, sizeof(struct hash_mm_context)); + slice_init_new_context_exec(mm); + } else { + /* This is fork. Copy hash_context details from current->mm */ + memcpy(mm->context.hash_context, current->mm->context.hash_context, sizeof(struct hash_mm_context)); +#ifdef CONFIG_PPC_SUBPAGE_PROT + /* inherit subpage prot details if we have one. */ + if (current->mm->context.hash_context->spt) { + mm->context.hash_context->spt = kmalloc(sizeof(struct subpage_prot_table), + GFP_KERNEL); + if (!mm->context.hash_context->spt) { + kfree(mm->context.hash_context); + return -ENOMEM; + } + } +#endif + } + + index = realloc_context_ids(&mm->context); + if (index < 0) { +#ifdef CONFIG_PPC_SUBPAGE_PROT + kfree(mm->context.hash_context->spt); +#endif + kfree(mm->context.hash_context); + return index; + } + + pkey_mm_init(mm); + return index; +} + +void hash__setup_new_exec(void) +{ + slice_setup_new_exec(); +} +#else +static inline int hash__init_new_context(struct mm_struct *mm) +{ + BUILD_BUG(); + return 0; +} +#endif + +static int radix__init_new_context(struct mm_struct *mm) +{ + unsigned long rts_field; + int index, max_id; + + max_id = (1 << mmu_pid_bits) - 1; + index = alloc_context_id(mmu_base_pid, max_id); + if (index < 0) + return index; + + /* + * set the process table entry, + */ + rts_field = radix__get_tree_size(); + process_tb[index].prtb0 = cpu_to_be64(rts_field | __pa(mm->pgd) | RADIX_PGD_INDEX_SIZE); + + /* + * Order the above store with subsequent update of the PID + * register (at which point HW can start loading/caching + * the entry) and the corresponding load by the MMU from + * the L2 cache. 
+ */ + asm volatile("ptesync;isync" : : : "memory"); + +#ifdef CONFIG_PPC_64S_HASH_MMU + mm->context.hash_context = NULL; +#endif + + return index; +} + +int init_new_context(struct task_struct *tsk, struct mm_struct *mm) +{ + int index; + + if (radix_enabled()) + index = radix__init_new_context(mm); + else + index = hash__init_new_context(mm); + + if (index < 0) + return index; + + mm->context.id = index; + + mm->context.pte_frag = NULL; + mm->context.pmd_frag = NULL; +#ifdef CONFIG_SPAPR_TCE_IOMMU + mm_iommu_init(mm); +#endif + atomic_set(&mm->context.active_cpus, 0); + atomic_set(&mm->context.copros, 0); + + return 0; +} + +void __destroy_context(int context_id) +{ + ida_free(&mmu_context_ida, context_id); +} +EXPORT_SYMBOL_GPL(__destroy_context); + +static void destroy_contexts(mm_context_t *ctx) +{ + if (radix_enabled()) { + ida_free(&mmu_context_ida, ctx->id); + } else { +#ifdef CONFIG_PPC_64S_HASH_MMU + int index, context_id; + + for (index = 0; index < ARRAY_SIZE(ctx->extended_id); index++) { + context_id = ctx->extended_id[index]; + if (context_id) + ida_free(&mmu_context_ida, context_id); + } + kfree(ctx->hash_context); +#else + BUILD_BUG(); // radix_enabled() should be constant true +#endif + } +} + +static void pmd_frag_destroy(void *pmd_frag) +{ + int count; + struct ptdesc *ptdesc; + + ptdesc = virt_to_ptdesc(pmd_frag); + /* drop all the pending references */ + count = ((unsigned long)pmd_frag & ~PAGE_MASK) >> PMD_FRAG_SIZE_SHIFT; + /* We allow PTE_FRAG_NR fragments from a PTE page */ + if (atomic_sub_and_test(PMD_FRAG_NR - count, &ptdesc->pt_frag_refcount)) { + pagetable_dtor(ptdesc); + pagetable_free(ptdesc); + } +} + +static void destroy_pagetable_cache(struct mm_struct *mm) +{ + void *frag; + + frag = mm->context.pte_frag; + if (frag) + pte_frag_destroy(frag); + + frag = mm->context.pmd_frag; + if (frag) + pmd_frag_destroy(frag); + return; +} + +void destroy_context(struct mm_struct *mm) +{ +#ifdef CONFIG_SPAPR_TCE_IOMMU + WARN_ON_ONCE(!list_empty(&mm->context.iommu_group_mem_list)); +#endif + /* + * For tasks which were successfully initialized we end up calling + * arch_exit_mmap() which clears the process table entry. And + * arch_exit_mmap() is called before the required fullmm TLB flush + * which does a RIC=2 flush. Hence for an initialized task, we do clear + * any cached process table entries. + * + * The condition below handles the error case during task init. We have + * set the process table entry early and if we fail a task + * initialization, we need to ensure the process table entry is zeroed. + * We need not worry about process table entry caches because the task + * never ran with the PID value. + */ + if (radix_enabled()) + process_tb[mm->context.id].prtb0 = 0; + else + subpage_prot_free(mm); + destroy_contexts(&mm->context); + mm->context.id = MMU_NO_CONTEXT; +} + +void arch_exit_mmap(struct mm_struct *mm) +{ + destroy_pagetable_cache(mm); + + if (radix_enabled()) { + /* + * Radix doesn't have a valid bit in the process table + * entries. However we know that at least P9 implementation + * will avoid caching an entry with an invalid RTS field, + * and 0 is invalid. So this will do. + * + * This runs before the "fullmm" tlb flush in exit_mmap, + * which does a RIC=2 tlbie to clear the process table + * entry. See the "fullmm" comments in tlb-radix.c. + * + * No barrier required here after the store because + * this process will do the invalidate, which starts with + * ptesync. 
+ */ + process_tb[mm->context.id].prtb0 = 0; + } +} + +#ifdef CONFIG_PPC_RADIX_MMU +void radix__switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) +{ + mtspr(SPRN_PID, next->context.id); + isync(); +} +#endif + +/** + * cleanup_cpu_mmu_context - Clean up MMU details for this CPU (newly offlined) + * + * This clears the CPU from mm_cpumask for all processes, and then flushes the + * local TLB to ensure TLB coherency in case the CPU is onlined again. + * + * KVM guest translations are not necessarily flushed here. If KVM started + * using mm_cpumask or the Linux APIs which do, this would have to be resolved. + */ +#ifdef CONFIG_HOTPLUG_CPU +void cleanup_cpu_mmu_context(void) +{ + int cpu = smp_processor_id(); + + clear_tasks_mm_cpumask(cpu); + tlbiel_all(); +} +#endif diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c new file mode 100644 index 000000000000..e3485db7de02 --- /dev/null +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -0,0 +1,665 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. + */ + +#include <linux/sched.h> +#include <linux/mm_types.h> +#include <linux/memblock.h> +#include <linux/memremap.h> +#include <linux/pkeys.h> +#include <linux/debugfs.h> +#include <linux/proc_fs.h> + +#include <asm/pgalloc.h> +#include <asm/tlb.h> +#include <asm/trace.h> +#include <asm/powernv.h> +#include <asm/firmware.h> +#include <asm/ultravisor.h> +#include <asm/kexec.h> + +#include <mm/mmu_decl.h> +#include <trace/events/thp.h> + +#include "internal.h" + +struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; +EXPORT_SYMBOL_GPL(mmu_psize_defs); + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +int mmu_vmemmap_psize = MMU_PAGE_4K; +#endif + +unsigned long __pmd_frag_nr; +EXPORT_SYMBOL(__pmd_frag_nr); +unsigned long __pmd_frag_size_shift; +EXPORT_SYMBOL(__pmd_frag_size_shift); + +#ifdef CONFIG_KFENCE +extern bool kfence_early_init; +static int __init parse_kfence_early_init(char *arg) +{ + int val; + + if (get_option(&arg, &val)) + kfence_early_init = !!val; + return 0; +} +early_param("kfence.sample_interval", parse_kfence_early_init); +#endif + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* + * This is called when relaxing access to a hugepage. It's also called in the page + * fault path when we don't hit any of the major fault cases, ie, a minor + * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have + * handled those two for us, we additionally deal with missing execute + * permission here on some processors + */ +int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp, pmd_t entry, int dirty) +{ + int changed; +#ifdef CONFIG_DEBUG_VM + WARN_ON(!pmd_trans_huge(*pmdp)); + assert_spin_locked(pmd_lockptr(vma->vm_mm, pmdp)); +#endif + changed = !pmd_same(*(pmdp), entry); + if (changed) { + /* + * We can use MMU_PAGE_2M here, because only radix + * path look at the psize. + */ + __ptep_set_access_flags(vma, pmdp_ptep(pmdp), + pmd_pte(entry), address, MMU_PAGE_2M); + } + return changed; +} + +int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp, pud_t entry, int dirty) +{ + int changed; +#ifdef CONFIG_DEBUG_VM + assert_spin_locked(pud_lockptr(vma->vm_mm, pudp)); +#endif + changed = !pud_same(*(pudp), entry); + if (changed) { + /* + * We can use MMU_PAGE_1G here, because only radix + * path look at the psize. 
+ */ + __ptep_set_access_flags(vma, pudp_ptep(pudp), + pud_pte(entry), address, MMU_PAGE_1G); + } + return changed; +} + + +int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); +} + +int pudp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp) +{ + return __pudp_test_and_clear_young(vma->vm_mm, address, pudp); +} + +/* + * set a new huge pmd. We should not be called for updating + * an existing pmd entry. That should go via pmd_hugepage_update. + */ +void set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd) +{ +#ifdef CONFIG_DEBUG_VM + /* + * Make sure hardware valid bit is not set. We don't do + * tlb flush for this update. + */ + + WARN_ON(pte_hw_valid(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp))); + assert_spin_locked(pmd_lockptr(mm, pmdp)); + WARN_ON(!(pmd_leaf(pmd))); +#endif + trace_hugepage_set_pmd(addr, pmd_val(pmd)); + return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); +} + +void set_pud_at(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud) +{ +#ifdef CONFIG_DEBUG_VM + /* + * Make sure hardware valid bit is not set. We don't do + * tlb flush for this update. + */ + + WARN_ON(pte_hw_valid(pud_pte(*pudp))); + assert_spin_locked(pud_lockptr(mm, pudp)); + WARN_ON(!(pud_leaf(pud))); +#endif + trace_hugepage_set_pud(addr, pud_val(pud)); + return set_pte_at(mm, addr, pudp_ptep(pudp), pud_pte(pud)); +} + +static void do_serialize(void *arg) +{ + /* We've taken the IPI, so try to trim the mask while here */ + if (radix_enabled()) { + struct mm_struct *mm = arg; + exit_lazy_flush_tlb(mm, false); + } +} + +/* + * Serialize against __find_linux_pte() which does lock-less + * lookup in page tables with local interrupts disabled. For huge pages + * it casts pmd_t to pte_t. Since format of pte_t is different from + * pmd_t we want to prevent transit from pmd pointing to page table + * to pmd pointing to huge page (and back) while interrupts are disabled. + * We clear pmd to possibly replace it with page table pointer in + * different code paths. So make sure we wait for the parallel + * __find_linux_pte() to finish. + */ +void serialize_against_pte_lookup(struct mm_struct *mm) +{ + smp_mb(); + smp_call_function_many(mm_cpumask(mm), do_serialize, mm, 1); +} + +/* + * We use this to invalidate a pmdp entry before switching from a + * hugepte to regular pmd entry. 
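+ *
+ * _PAGE_PRESENT is cleared and _PAGE_INVALID set in a single update,
+ * which keeps pte_present() true for the entry while making it
+ * invalid to the hardware, and the PMD TLB range is then flushed.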
+ */ +pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + unsigned long old_pmd; + + VM_WARN_ON_ONCE(!pmd_present(*pmdp)); + old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID); + flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + return __pmd(old_pmd); +} + +pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp) +{ + unsigned long old_pud; + + VM_WARN_ON_ONCE(!pud_present(*pudp)); + old_pud = pud_hugepage_update(vma->vm_mm, address, pudp, _PAGE_PRESENT, _PAGE_INVALID); + flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE); + return __pud(old_pud); +} + +pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp, int full) +{ + pmd_t pmd; + VM_BUG_ON(addr & ~HPAGE_PMD_MASK); + VM_BUG_ON((pmd_present(*pmdp) && !pmd_trans_huge(*pmdp)) || + !pmd_present(*pmdp)); + pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp); + /* + * if it not a fullmm flush, then we can possibly end up converting + * this PMD pte entry to a regular level 0 PTE by a parallel page fault. + * Make sure we flush the tlb in this case. + */ + if (!full) + flush_pmd_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE); + return pmd; +} + +pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma, + unsigned long addr, pud_t *pudp, int full) +{ + pud_t pud; + + VM_BUG_ON(addr & ~HPAGE_PMD_MASK); + VM_BUG_ON(!pud_present(*pudp)); + pud = pudp_huge_get_and_clear(vma->vm_mm, addr, pudp); + /* + * if it not a fullmm flush, then we can possibly end up converting + * this PMD pte entry to a regular level 0 PTE by a parallel page fault. + * Make sure we flush the tlb in this case. + */ + if (!full) + flush_pud_tlb_range(vma, addr, addr + HPAGE_PUD_SIZE); + return pud; +} + +static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) +{ + return __pmd(pmd_val(pmd) | pgprot_val(pgprot)); +} + +static pud_t pud_set_protbits(pud_t pud, pgprot_t pgprot) +{ + return __pud(pud_val(pud) | pgprot_val(pgprot)); +} + +/* + * At some point we should be able to get rid of + * pmd_mkhuge() and mk_huge_pmd() when we update all the + * other archs to mark the pmd huge in pfn_pmd() + */ +pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot) +{ + unsigned long pmdv; + + pmdv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK; + + return __pmd_mkhuge(pmd_set_protbits(__pmd(pmdv), pgprot)); +} + +pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot) +{ + unsigned long pudv; + + pudv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK; + + return __pud_mkhuge(pud_set_protbits(__pud(pudv), pgprot)); +} + +pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) +{ + unsigned long pmdv; + + pmdv = pmd_val(pmd); + pmdv &= _HPAGE_CHG_MASK; + return pmd_set_protbits(__pmd(pmdv), newprot); +} + +pud_t pud_modify(pud_t pud, pgprot_t newprot) +{ + unsigned long pudv; + + pudv = pud_val(pud); + pudv &= _HPAGE_CHG_MASK; + return pud_set_protbits(__pud(pudv), newprot); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +/* For use by kexec, called with MMU off */ +notrace void mmu_cleanup_all(void) +{ + if (radix_enabled()) + radix__mmu_cleanup_all(); + else if (mmu_hash_ops.hpte_clear_all) + mmu_hash_ops.hpte_clear_all(); + + reset_sprs(); +} + +#ifdef CONFIG_MEMORY_HOTPLUG +int __meminit create_section_mapping(unsigned long start, unsigned long end, + int nid, pgprot_t prot) +{ + if (radix_enabled()) + return radix__create_section_mapping(start, end, nid, prot); + + return hash__create_section_mapping(start, end, nid, prot); +} + +int __meminit 
remove_section_mapping(unsigned long start, unsigned long end) +{ + if (radix_enabled()) + return radix__remove_section_mapping(start, end); + + return hash__remove_section_mapping(start, end); +} +#endif /* CONFIG_MEMORY_HOTPLUG */ + +void __init mmu_partition_table_init(void) +{ + unsigned long patb_size = 1UL << PATB_SIZE_SHIFT; + unsigned long ptcr; + + /* Initialize the Partition Table with no entries */ + partition_tb = memblock_alloc_or_panic(patb_size, patb_size); + ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12); + set_ptcr_when_no_uv(ptcr); + powernv_set_nmmu_ptcr(ptcr); +} + +static void flush_partition(unsigned int lpid, bool radix) +{ + if (radix) { + radix__flush_all_lpid(lpid); + radix__flush_all_lpid_guest(lpid); + } else { + asm volatile("ptesync" : : : "memory"); + asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); + /* do we need fixup here ?*/ + asm volatile("eieio; tlbsync; ptesync" : : : "memory"); + trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 0); + } +} + +void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0, + unsigned long dw1, bool flush) +{ + unsigned long old = be64_to_cpu(partition_tb[lpid].patb0); + + /* + * When ultravisor is enabled, the partition table is stored in secure + * memory and can only be accessed doing an ultravisor call. However, we + * maintain a copy of the partition table in normal memory to allow Nest + * MMU translations to occur (for normal VMs). + * + * Therefore, here we always update partition_tb, regardless of whether + * we are running under an ultravisor or not. + */ + partition_tb[lpid].patb0 = cpu_to_be64(dw0); + partition_tb[lpid].patb1 = cpu_to_be64(dw1); + + /* + * If ultravisor is enabled, we do an ultravisor call to register the + * partition table entry (PATE), which also do a global flush of TLBs + * and partition table caches for the lpid. Otherwise, just do the + * flush. The type of flush (hash or radix) depends on what the previous + * use of the partition ID was, not the new use. + */ + if (firmware_has_feature(FW_FEATURE_ULTRAVISOR)) { + uv_register_pate(lpid, dw0, dw1); + pr_info("PATE registered by ultravisor: dw0 = 0x%lx, dw1 = 0x%lx\n", + dw0, dw1); + } else if (flush) { + /* + * Boot does not need to flush, because MMU is off and each + * CPU does a tlbiel_all() before switching them on, which + * flushes everything. + */ + flush_partition(lpid, (old & PATB_HR)); + } +} +EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry); + +static pmd_t *get_pmd_from_cache(struct mm_struct *mm) +{ + void *pmd_frag, *ret; + + if (PMD_FRAG_NR == 1) + return NULL; + + spin_lock(&mm->page_table_lock); + ret = mm->context.pmd_frag; + if (ret) { + pmd_frag = ret + PMD_FRAG_SIZE; + /* + * If we have taken up all the fragments mark PTE page NULL + */ + if (((unsigned long)pmd_frag & ~PAGE_MASK) == 0) + pmd_frag = NULL; + mm->context.pmd_frag = pmd_frag; + } + spin_unlock(&mm->page_table_lock); + return (pmd_t *)ret; +} + +static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm) +{ + void *ret = NULL; + struct ptdesc *ptdesc; + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO; + + if (mm == &init_mm) + gfp &= ~__GFP_ACCOUNT; + ptdesc = pagetable_alloc(gfp, 0); + if (!ptdesc) + return NULL; + if (!pagetable_pmd_ctor(mm, ptdesc)) { + pagetable_free(ptdesc); + return NULL; + } + + atomic_set(&ptdesc->pt_frag_refcount, 1); + + ret = ptdesc_address(ptdesc); + /* + * if we support only one fragment just return the + * allocated page. 
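+ * Otherwise the page is carved into PMD_FRAG_NR fragments: the first
+ * is returned here and the rest are handed out one PMD_FRAG_SIZE at a
+ * time by get_pmd_from_cache() via mm->context.pmd_frag.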
+ */ + if (PMD_FRAG_NR == 1) + return ret; + + spin_lock(&mm->page_table_lock); + /* + * If we find ptdesc_page set, we return + * the allocated page with single fragment + * count. + */ + if (likely(!mm->context.pmd_frag)) { + atomic_set(&ptdesc->pt_frag_refcount, PMD_FRAG_NR); + mm->context.pmd_frag = ret + PMD_FRAG_SIZE; + } + spin_unlock(&mm->page_table_lock); + + return (pmd_t *)ret; +} + +pmd_t *pmd_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr) +{ + pmd_t *pmd; + + pmd = get_pmd_from_cache(mm); + if (pmd) + return pmd; + + return __alloc_for_pmdcache(mm); +} + +void pmd_fragment_free(unsigned long *pmd) +{ + struct ptdesc *ptdesc = virt_to_ptdesc(pmd); + + if (pagetable_is_reserved(ptdesc)) + return free_reserved_ptdesc(ptdesc); + + BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0); + if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) { + pagetable_dtor(ptdesc); + pagetable_free(ptdesc); + } +} + +static inline void pgtable_free(void *table, int index) +{ + switch (index) { + case PTE_INDEX: + pte_fragment_free(table, 0); + break; + case PMD_INDEX: + pmd_fragment_free(table); + break; + case PUD_INDEX: + __pud_free(table); + break; + /* We don't free pgd table via RCU callback */ + default: + BUG(); + } +} + +void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int index) +{ + unsigned long pgf = (unsigned long)table; + + BUG_ON(index > MAX_PGTABLE_INDEX_SIZE); + pgf |= index; + tlb_remove_table(tlb, (void *)pgf); +} + +void __tlb_remove_table(void *_table) +{ + void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE); + unsigned int index = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE; + + return pgtable_free(table, index); +} + +#ifdef CONFIG_PROC_FS +atomic_long_t direct_pages_count[MMU_PAGE_COUNT]; + +void arch_report_meminfo(struct seq_file *m) +{ + seq_printf(m, "DirectMap4k: %8lu kB\n", + atomic_long_read(&direct_pages_count[MMU_PAGE_4K]) << 2); + seq_printf(m, "DirectMap64k: %8lu kB\n", + atomic_long_read(&direct_pages_count[MMU_PAGE_64K]) << 6); + if (radix_enabled()) { + seq_printf(m, "DirectMap2M: %8lu kB\n", + atomic_long_read(&direct_pages_count[MMU_PAGE_2M]) << 11); + seq_printf(m, "DirectMap1G: %8lu kB\n", + atomic_long_read(&direct_pages_count[MMU_PAGE_1G]) << 20); + } else { + seq_printf(m, "DirectMap16M: %8lu kB\n", + atomic_long_read(&direct_pages_count[MMU_PAGE_16M]) << 14); + seq_printf(m, "DirectMap16G: %8lu kB\n", + atomic_long_read(&direct_pages_count[MMU_PAGE_16G]) << 24); + } +} +#endif /* CONFIG_PROC_FS */ + +pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep) +{ + unsigned long pte_val; + + /* + * Clear the _PAGE_PRESENT so that no hardware parallel update is + * possible. Also keep the pte_present true so that we don't take + * wrong fault. + */ + pte_val = pte_update(vma->vm_mm, addr, ptep, _PAGE_PRESENT, _PAGE_INVALID, 0); + + return __pte(pte_val); + +} + +void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep, pte_t old_pte, pte_t pte) +{ + if (radix_enabled()) + return radix__ptep_modify_prot_commit(vma, addr, + ptep, old_pte, pte); + set_pte_at(vma->vm_mm, addr, ptep, pte); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* + * For hash translation mode, we use the deposited table to store hash slot + * information and they are stored at PTRS_PER_PMD offset from related pmd + * location. Hence a pmd move requires deposit and withdraw. + * + * For radix translation with split pmd ptl, we store the deposited table in the + * pmd page. 
Hence if we have different pmd page we need to withdraw during pmd + * move. + * + * With hash we use deposited table always irrespective of anon or not. + * With radix we use deposited table only for anonymous mapping. + */ +int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, + struct spinlock *old_pmd_ptl, + struct vm_area_struct *vma) +{ + if (radix_enabled()) + return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma); + + return true; +} +#endif + +/* + * Does the CPU support tlbie? + */ +bool tlbie_capable __read_mostly = IS_ENABLED(CONFIG_PPC_RADIX_BROADCAST_TLBIE); +EXPORT_SYMBOL(tlbie_capable); + +/* + * Should tlbie be used for management of CPU TLBs, for kernel and process + * address spaces? tlbie may still be used for nMMU accelerators, and for KVM + * guest address spaces. + */ +bool tlbie_enabled __read_mostly = IS_ENABLED(CONFIG_PPC_RADIX_BROADCAST_TLBIE); + +static int __init setup_disable_tlbie(char *str) +{ + if (!radix_enabled()) { + pr_err("disable_tlbie: Unable to disable TLBIE with Hash MMU.\n"); + return 1; + } + + tlbie_capable = false; + tlbie_enabled = false; + + return 1; +} +__setup("disable_tlbie", setup_disable_tlbie); + +static int __init pgtable_debugfs_setup(void) +{ + if (!tlbie_capable) + return 0; + + /* + * There is no locking vs tlb flushing when changing this value. + * The tlb flushers will see one value or another, and use either + * tlbie or tlbiel with IPIs. In both cases the TLBs will be + * invalidated as expected. + */ + debugfs_create_bool("tlbie_enabled", 0600, + arch_debugfs_dir, + &tlbie_enabled); + + return 0; +} +arch_initcall(pgtable_debugfs_setup); + +#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_HAS_MEMREMAP_COMPAT_ALIGN) +/* + * Override the generic version in mm/memremap.c. + * + * With hash translation, the direct-map range is mapped with just one + * page size selected by htab_init_page_sizes(). Consult + * mmu_psize_defs[] to determine the minimum page size alignment. +*/ +unsigned long memremap_compat_align(void) +{ + if (!radix_enabled()) { + unsigned int shift = mmu_psize_defs[mmu_linear_psize].shift; + return max(SUBSECTION_SIZE, 1UL << shift); + } + + return SUBSECTION_SIZE; +} +EXPORT_SYMBOL_GPL(memremap_compat_align); +#endif + +pgprot_t vm_get_page_prot(vm_flags_t vm_flags) +{ + unsigned long prot; + + /* Radix supports execute-only, but protection_map maps X -> RX */ + if (!radix_enabled() && ((vm_flags & VM_ACCESS_FLAGS) == VM_EXEC)) + vm_flags |= VM_READ; + + prot = pgprot_val(protection_map[vm_flags & (VM_ACCESS_FLAGS | VM_SHARED)]); + + if (vm_flags & VM_SAO) + prot |= _PAGE_SAO; + +#ifdef CONFIG_PPC_MEM_KEYS + prot |= vmflag_to_pte_pkey_bits(vm_flags); +#endif + + return __pgprot(prot); +} +EXPORT_SYMBOL(vm_get_page_prot); diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c new file mode 100644 index 000000000000..a974baf8f327 --- /dev/null +++ b/arch/powerpc/mm/book3s64/pkeys.c @@ -0,0 +1,471 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * PowerPC Memory Protection Keys management + * + * Copyright 2017, Ram Pai, IBM Corporation. 
+ */ + +#include <asm/mman.h> +#include <asm/mmu_context.h> +#include <asm/mmu.h> +#include <asm/setup.h> +#include <asm/smp.h> +#include <asm/firmware.h> + +#include <linux/pkeys.h> +#include <linux/of_fdt.h> + + +int num_pkey; /* Max number of pkeys supported */ +/* + * Keys marked in the reservation list cannot be allocated by userspace + */ +u32 reserved_allocation_mask __ro_after_init; + +/* Bits set for the initially allocated keys */ +static u32 initial_allocation_mask __ro_after_init; + +/* + * Even if we allocate keys with sys_pkey_alloc(), we need to make sure + * other thread still find the access denied using the same keys. + */ +u64 default_amr __ro_after_init = ~0x0UL; +u64 default_iamr __ro_after_init = 0x5555555555555555UL; +u64 default_uamor __ro_after_init; +EXPORT_SYMBOL(default_amr); +/* + * Key used to implement PROT_EXEC mmap. Denies READ/WRITE + * We pick key 2 because 0 is special key and 1 is reserved as per ISA. + */ +static int execute_only_key = 2; +static bool pkey_execute_disable_supported; + + +#define AMR_BITS_PER_PKEY 2 +#define AMR_RD_BIT 0x1UL +#define AMR_WR_BIT 0x2UL +#define IAMR_EX_BIT 0x1UL +#define PKEY_REG_BITS (sizeof(u64) * 8) +#define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey+1) * AMR_BITS_PER_PKEY)) + +static int __init dt_scan_storage_keys(unsigned long node, + const char *uname, int depth, + void *data) +{ + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + const __be32 *prop; + int *pkeys_total = (int *) data; + + /* We are scanning "cpu" nodes only */ + if (type == NULL || strcmp(type, "cpu") != 0) + return 0; + + prop = of_get_flat_dt_prop(node, "ibm,processor-storage-keys", NULL); + if (!prop) + return 0; + *pkeys_total = be32_to_cpu(prop[0]); + return 1; +} + +static int __init scan_pkey_feature(void) +{ + int ret; + int pkeys_total = 0; + + /* + * Pkey is not supported with Radix translation. + */ + if (early_radix_enabled()) + return 0; + + ret = of_scan_flat_dt(dt_scan_storage_keys, &pkeys_total); + if (ret == 0) { + /* + * Let's assume 32 pkeys on P8/P9 bare metal, if its not defined by device + * tree. We make this exception since some version of skiboot forgot to + * expose this property on power8/9. + */ + if (!firmware_has_feature(FW_FEATURE_LPAR)) { + unsigned long pvr = mfspr(SPRN_PVR); + + if (PVR_VER(pvr) == PVR_POWER8 || PVR_VER(pvr) == PVR_POWER8E || + PVR_VER(pvr) == PVR_POWER8NVL || PVR_VER(pvr) == PVR_POWER9 || + PVR_VER(pvr) == PVR_HX_C2000) + pkeys_total = 32; + } + } + +#ifdef CONFIG_PPC_MEM_KEYS + /* + * Adjust the upper limit, based on the number of bits supported by + * arch-neutral code. + */ + pkeys_total = min_t(int, pkeys_total, + ((ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT) + 1)); +#endif + return pkeys_total; +} + +void __init pkey_early_init_devtree(void) +{ + int pkeys_total, i; + +#ifdef CONFIG_PPC_MEM_KEYS + /* + * We define PKEY_DISABLE_EXECUTE in addition to the arch-neutral + * generic defines for PKEY_DISABLE_ACCESS and PKEY_DISABLE_WRITE. + * Ensure that the bits a distinct. + */ + BUILD_BUG_ON(PKEY_DISABLE_EXECUTE & + (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); + + /* + * pkey_to_vmflag_bits() assumes that the pkey bits are contiguous + * in the vmaflag. Make sure that is really the case. 
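+ *
+ * The check below relies on leading-zero count plus population count
+ * summing to 64, which only holds when the shifted mask has the form
+ * (1UL << n) - 1, i.e. one contiguous run of bits starting at
+ * VM_PKEY_SHIFT.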
+ */ + BUILD_BUG_ON(__builtin_clzl(ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT) + + __builtin_popcountl(ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT) + != (sizeof(u64) * BITS_PER_BYTE)); +#endif + /* + * Only P7 and above supports SPRN_AMR update with MSR[PR] = 1 + */ + if (!early_cpu_has_feature(CPU_FTR_ARCH_206)) + return; + + /* scan the device tree for pkey feature */ + pkeys_total = scan_pkey_feature(); + if (!pkeys_total) + goto out; + + /* Allow all keys to be modified by default */ + default_uamor = ~0x0UL; + + cur_cpu_spec->mmu_features |= MMU_FTR_PKEY; + + /* + * The device tree cannot be relied to indicate support for + * execute_disable support. Instead we use a PVR check. + */ + if (pvr_version_is(PVR_POWER7) || pvr_version_is(PVR_POWER7p)) + pkey_execute_disable_supported = false; + else + pkey_execute_disable_supported = true; + +#ifdef CONFIG_PPC_4K_PAGES + /* + * The OS can manage only 8 pkeys due to its inability to represent them + * in the Linux 4K PTE. Mark all other keys reserved. + */ + num_pkey = min(8, pkeys_total); +#else + num_pkey = pkeys_total; +#endif + + if (unlikely(num_pkey <= execute_only_key) || !pkey_execute_disable_supported) { + /* + * Insufficient number of keys to support + * execute only key. Mark it unavailable. + */ + execute_only_key = -1; + } else { + /* + * Mark the execute_only_pkey as not available for + * user allocation via pkey_alloc. + */ + reserved_allocation_mask |= (0x1 << execute_only_key); + + /* + * Deny READ/WRITE for execute_only_key. + * Allow execute in IAMR. + */ + default_amr |= (0x3ul << pkeyshift(execute_only_key)); + default_iamr &= ~(0x1ul << pkeyshift(execute_only_key)); + + /* + * Clear the uamor bits for this key. + */ + default_uamor &= ~(0x3ul << pkeyshift(execute_only_key)); + } + + if (unlikely(num_pkey <= 3)) { + /* + * Insufficient number of keys to support + * KUAP/KUEP feature. + */ + disable_kuep = true; + disable_kuap = true; + WARN(1, "Disabling kernel user protection due to low (%d) max supported keys\n", num_pkey); + } else { + /* handle key which is used by kernel for KAUP */ + reserved_allocation_mask |= (0x1 << 3); + /* + * Mark access for kup_key in default amr so that + * we continue to operate with that AMR in + * copy_to/from_user(). + */ + default_amr &= ~(0x3ul << pkeyshift(3)); + default_iamr &= ~(0x1ul << pkeyshift(3)); + default_uamor &= ~(0x3ul << pkeyshift(3)); + } + + /* + * Allow access for only key 0. And prevent any other modification. + */ + default_amr &= ~(0x3ul << pkeyshift(0)); + default_iamr &= ~(0x1ul << pkeyshift(0)); + default_uamor &= ~(0x3ul << pkeyshift(0)); + /* + * key 0 is special in that we want to consider it an allocated + * key which is preallocated. We don't allow changing AMR bits + * w.r.t key 0. But one can pkey_free(key0) + */ + initial_allocation_mask |= (0x1 << 0); + + /* + * key 1 is recommended not to be used. PowerISA(3.0) page 1015, + * programming note. + */ + reserved_allocation_mask |= (0x1 << 1); + default_uamor &= ~(0x3ul << pkeyshift(1)); + + /* + * Prevent the usage of OS reserved keys. Update UAMOR + * for those keys. Also mark the rest of the bits in the + * 32 bit mask as reserved. + */ + for (i = num_pkey; i < 32 ; i++) { + reserved_allocation_mask |= (0x1 << i); + default_uamor &= ~(0x3ul << pkeyshift(i)); + } + /* + * Prevent the allocation of reserved keys too. 
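+ * (initial_allocation_mask is what pkey_mm_init() seeds each new mm's
+ * allocation map with, so folding the reserved keys in here is what keeps
+ * them from ever being handed out to userspace.)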
+ */ + initial_allocation_mask |= reserved_allocation_mask; + + pr_info("Enabling pkeys with max key count %d\n", num_pkey); +out: + /* + * Setup uamor on boot cpu + */ + mtspr(SPRN_UAMOR, default_uamor); + + return; +} + +#ifdef CONFIG_PPC_KUEP +void setup_kuep(bool disabled) +{ + if (disabled) + return; + /* + * On hash if PKEY feature is not enabled, disable KUAP too. + */ + if (!early_radix_enabled() && !early_mmu_has_feature(MMU_FTR_PKEY)) + return; + + if (smp_processor_id() == boot_cpuid) { + pr_info("Activating Kernel Userspace Execution Prevention\n"); + cur_cpu_spec->mmu_features |= MMU_FTR_BOOK3S_KUEP; + } + + /* + * Radix always uses key0 of the IAMR to determine if an access is + * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction + * fetch. + */ + mtspr(SPRN_IAMR, AMR_KUEP_BLOCKED); + isync(); +} +#endif + +#ifdef CONFIG_PPC_KUAP +void setup_kuap(bool disabled) +{ + if (disabled) + return; + /* + * On hash if PKEY feature is not enabled, disable KUAP too. + */ + if (!early_radix_enabled() && !early_mmu_has_feature(MMU_FTR_PKEY)) + return; + + if (smp_processor_id() == boot_cpuid) { + pr_info("Activating Kernel Userspace Access Prevention\n"); + cur_cpu_spec->mmu_features |= MMU_FTR_KUAP; + } + + /* + * Set the default kernel AMR values on all cpus. + */ + mtspr(SPRN_AMR, AMR_KUAP_BLOCKED); + isync(); +} +#endif + +#ifdef CONFIG_PPC_MEM_KEYS +void pkey_mm_init(struct mm_struct *mm) +{ + if (!mmu_has_feature(MMU_FTR_PKEY)) + return; + mm_pkey_allocation_map(mm) = initial_allocation_mask; + mm->context.execute_only_pkey = execute_only_key; +} + +static inline void init_amr(int pkey, u8 init_bits) +{ + u64 new_amr_bits = (((u64)init_bits & 0x3UL) << pkeyshift(pkey)); + u64 old_amr = current_thread_amr() & ~((u64)(0x3ul) << pkeyshift(pkey)); + + current->thread.regs->amr = old_amr | new_amr_bits; +} + +static inline void init_iamr(int pkey, u8 init_bits) +{ + u64 new_iamr_bits = (((u64)init_bits & 0x1UL) << pkeyshift(pkey)); + u64 old_iamr = current_thread_iamr() & ~((u64)(0x1ul) << pkeyshift(pkey)); + + if (!likely(pkey_execute_disable_supported)) + return; + + current->thread.regs->iamr = old_iamr | new_iamr_bits; +} + +/* + * Set the access rights in AMR IAMR and UAMOR registers for @pkey to that + * specified in @init_val. + */ +int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey, + unsigned long init_val) +{ + u64 new_amr_bits = 0x0ul; + u64 new_iamr_bits = 0x0ul; + u64 pkey_bits, uamor_pkey_bits; + + /* + * Check whether the key is disabled by UAMOR. 
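+ * pkeyshift(pkey) is 64 - (pkey + 1) * AMR_BITS_PER_PKEY, so for key 2,
+ * for example, the shift is 58 and the key's two AMR/UAMOR bits are bits
+ * 58 and 59, counting from the least significant bit.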
+ */ + pkey_bits = 0x3ul << pkeyshift(pkey); + uamor_pkey_bits = (default_uamor & pkey_bits); + + /* + * Both the bits in UAMOR corresponding to the key should be set + */ + if (uamor_pkey_bits != pkey_bits) + return -EINVAL; + + if (init_val & PKEY_DISABLE_EXECUTE) { + if (!pkey_execute_disable_supported) + return -EINVAL; + new_iamr_bits |= IAMR_EX_BIT; + } + init_iamr(pkey, new_iamr_bits); + + /* Set the bits we need in AMR: */ + if (init_val & PKEY_DISABLE_ACCESS) + new_amr_bits |= AMR_RD_BIT | AMR_WR_BIT; + else if (init_val & PKEY_DISABLE_WRITE) + new_amr_bits |= AMR_WR_BIT; + + init_amr(pkey, new_amr_bits); + return 0; +} + +int execute_only_pkey(struct mm_struct *mm) +{ + return mm->context.execute_only_pkey; +} + +static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma) +{ + /* Do this check first since the vm_flags should be hot */ + if ((vma->vm_flags & VM_ACCESS_FLAGS) != VM_EXEC) + return false; + + return (vma_pkey(vma) == vma->vm_mm->context.execute_only_pkey); +} + +/* + * This should only be called for *plain* mprotect calls. + */ +int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, + int pkey) +{ + /* + * If the currently associated pkey is execute-only, but the requested + * protection is not execute-only, move it back to the default pkey. + */ + if (vma_is_pkey_exec_only(vma) && (prot != PROT_EXEC)) + return 0; + + /* + * The requested protection is execute-only. Hence let's use an + * execute-only pkey. + */ + if (prot == PROT_EXEC) { + pkey = execute_only_pkey(vma->vm_mm); + if (pkey > 0) + return pkey; + } + + /* Nothing to override. */ + return vma_pkey(vma); +} + +static bool pkey_access_permitted(int pkey, bool write, bool execute) +{ + int pkey_shift; + u64 amr; + + pkey_shift = pkeyshift(pkey); + if (execute) + return !(current_thread_iamr() & (IAMR_EX_BIT << pkey_shift)); + + amr = current_thread_amr(); + if (write) + return !(amr & (AMR_WR_BIT << pkey_shift)); + + return !(amr & (AMR_RD_BIT << pkey_shift)); +} + +bool arch_pte_access_permitted(u64 pte, bool write, bool execute) +{ + if (!mmu_has_feature(MMU_FTR_PKEY)) + return true; + + return pkey_access_permitted(pte_to_pkey_bits(pte), write, execute); +} + +/* + * We only want to enforce protection keys on the current thread because we + * effectively have no access to AMR/IAMR for other threads or any way to tell + * which AMR/IAMR in a threaded process we could use. + * + * So do not enforce things if the VMA is not from the current mm, or if we are + * in a kernel thread. + */ +bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write, + bool execute, bool foreign) +{ + if (!mmu_has_feature(MMU_FTR_PKEY)) + return true; + /* + * Do not enforce our key-permissions on a foreign vma. 
+ */ + if (foreign || vma_is_foreign(vma)) + return true; + + return pkey_access_permitted(vma_pkey(vma), write, execute); +} + +void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm) +{ + if (!mmu_has_feature(MMU_FTR_PKEY)) + return; + + /* Duplicate the oldmm pkey state in mm: */ + mm_pkey_allocation_map(mm) = mm_pkey_allocation_map(oldmm); + mm->context.execute_only_pkey = oldmm->context.execute_only_pkey; +} + +#endif /* CONFIG_PPC_MEM_KEYS */ diff --git a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c new file mode 100644 index 000000000000..35fd2a95be24 --- /dev/null +++ b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/security.h> +#include <asm/cacheflush.h> +#include <asm/machdep.h> +#include <asm/mman.h> +#include <asm/tlb.h> + +void radix__flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ + int psize; + struct hstate *hstate = hstate_file(vma->vm_file); + + psize = hstate_get_psize(hstate); + radix__flush_tlb_page_psize(vma->vm_mm, vmaddr, psize); +} + +void radix__local_flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ + int psize; + struct hstate *hstate = hstate_file(vma->vm_file); + + psize = hstate_get_psize(hstate); + radix__local_flush_tlb_page_psize(vma->vm_mm, vmaddr, psize); +} + +void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end) +{ + int psize; + struct hstate *hstate = hstate_file(vma->vm_file); + + psize = hstate_get_psize(hstate); + /* + * Flush PWC even if we get PUD_SIZE hugetlb invalidate to keep this simpler. + */ + if (end - start >= PUD_SIZE) + radix__flush_tlb_pwc_range_psize(vma->vm_mm, start, end, psize); + else + radix__flush_tlb_range_psize(vma->vm_mm, start, end, psize); + mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end); +} + +void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t old_pte, pte_t pte) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long psize = huge_page_size(hstate_vma(vma)); + + /* + * POWER9 NMMU must flush the TLB after clearing the PTE before + * installing a PTE with more relaxed access permissions, see + * radix__ptep_set_access_flags. + */ + if (!cpu_has_feature(CPU_FTR_ARCH_31) && + is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) && + atomic_read(&mm->context.copros) > 0) + radix__flush_hugetlb_page(vma, addr); + + set_huge_pte_at(vma->vm_mm, addr, ptep, pte, psize); +} diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c new file mode 100644 index 000000000000..73977dbabcf2 --- /dev/null +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -0,0 +1,1694 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Page table handling routines for radix page table. + * + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. 
+ */ + +#define pr_fmt(fmt) "radix-mmu: " fmt + +#include <linux/io.h> +#include <linux/kernel.h> +#include <linux/sched/mm.h> +#include <linux/memblock.h> +#include <linux/of.h> +#include <linux/of_fdt.h> +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/string_helpers.h> +#include <linux/memory.h> +#include <linux/kfence.h> + +#include <asm/pgalloc.h> +#include <asm/mmu_context.h> +#include <asm/dma.h> +#include <asm/machdep.h> +#include <asm/mmu.h> +#include <asm/firmware.h> +#include <asm/powernv.h> +#include <asm/sections.h> +#include <asm/smp.h> +#include <asm/trace.h> +#include <asm/uaccess.h> +#include <asm/ultravisor.h> +#include <asm/set_memory.h> +#include <asm/kfence.h> + +#include <trace/events/thp.h> + +#include <mm/mmu_decl.h> + +unsigned int mmu_base_pid; + +static __ref void *early_alloc_pgtable(unsigned long size, int nid, + unsigned long region_start, unsigned long region_end) +{ + phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT; + phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE; + void *ptr; + + if (region_start) + min_addr = region_start; + if (region_end) + max_addr = region_end; + + ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid); + + if (!ptr) + panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n", + __func__, size, size, nid, &min_addr, &max_addr); + + return ptr; +} + +/* + * When allocating pud or pmd pointers, we allocate a complete page + * of PAGE_SIZE rather than PUD_TABLE_SIZE or PMD_TABLE_SIZE. This + * is to ensure that the page obtained from the memblock allocator + * can be completely used as page table page and can be freed + * correctly when the page table entries are removed. + */ +static int early_map_kernel_page(unsigned long ea, unsigned long pa, + pgprot_t flags, + unsigned int map_page_size, + int nid, + unsigned long region_start, unsigned long region_end) +{ + unsigned long pfn = pa >> PAGE_SHIFT; + pgd_t *pgdp; + p4d_t *p4dp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + pgdp = pgd_offset_k(ea); + p4dp = p4d_offset(pgdp, ea); + if (p4d_none(*p4dp)) { + pudp = early_alloc_pgtable(PAGE_SIZE, nid, + region_start, region_end); + p4d_populate(&init_mm, p4dp, pudp); + } + pudp = pud_offset(p4dp, ea); + if (map_page_size == PUD_SIZE) { + ptep = (pte_t *)pudp; + goto set_the_pte; + } + if (pud_none(*pudp)) { + pmdp = early_alloc_pgtable(PAGE_SIZE, nid, region_start, + region_end); + pud_populate(&init_mm, pudp, pmdp); + } + pmdp = pmd_offset(pudp, ea); + if (map_page_size == PMD_SIZE) { + ptep = pmdp_ptep(pmdp); + goto set_the_pte; + } + if (!pmd_present(*pmdp)) { + ptep = early_alloc_pgtable(PAGE_SIZE, nid, + region_start, region_end); + pmd_populate_kernel(&init_mm, pmdp, ptep); + } + ptep = pte_offset_kernel(pmdp, ea); + +set_the_pte: + set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags)); + asm volatile("ptesync": : :"memory"); + return 0; +} + +/* + * nid, region_start, and region_end are hints to try to place the page + * table memory in the same node or region. 
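+ * Callers with no placement preference pass nid = -1 and 0 for both
+ * region bounds, as radix__map_kernel_page() below does.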
+ */ +static int __map_kernel_page(unsigned long ea, unsigned long pa, + pgprot_t flags, + unsigned int map_page_size, + int nid, + unsigned long region_start, unsigned long region_end) +{ + unsigned long pfn = pa >> PAGE_SHIFT; + pgd_t *pgdp; + p4d_t *p4dp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + /* + * Make sure task size is correct as per the max adddr + */ + BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE); + +#ifdef CONFIG_PPC_64K_PAGES + BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT)); +#endif + + if (unlikely(!slab_is_available())) + return early_map_kernel_page(ea, pa, flags, map_page_size, + nid, region_start, region_end); + + /* + * Should make page table allocation functions be able to take a + * node, so we can place kernel page tables on the right nodes after + * boot. + */ + pgdp = pgd_offset_k(ea); + p4dp = p4d_offset(pgdp, ea); + pudp = pud_alloc(&init_mm, p4dp, ea); + if (!pudp) + return -ENOMEM; + if (map_page_size == PUD_SIZE) { + ptep = (pte_t *)pudp; + goto set_the_pte; + } + pmdp = pmd_alloc(&init_mm, pudp, ea); + if (!pmdp) + return -ENOMEM; + if (map_page_size == PMD_SIZE) { + ptep = pmdp_ptep(pmdp); + goto set_the_pte; + } + ptep = pte_alloc_kernel(pmdp, ea); + if (!ptep) + return -ENOMEM; + +set_the_pte: + set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags)); + asm volatile("ptesync": : :"memory"); + return 0; +} + +int radix__map_kernel_page(unsigned long ea, unsigned long pa, + pgprot_t flags, + unsigned int map_page_size) +{ + return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0); +} + +#ifdef CONFIG_STRICT_KERNEL_RWX +static void radix__change_memory_range(unsigned long start, unsigned long end, + unsigned long clear) +{ + unsigned long idx; + pgd_t *pgdp; + p4d_t *p4dp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + start = ALIGN_DOWN(start, PAGE_SIZE); + end = PAGE_ALIGN(end); // aligns up + + pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n", + start, end, clear); + + for (idx = start; idx < end; idx += PAGE_SIZE) { + pgdp = pgd_offset_k(idx); + p4dp = p4d_offset(pgdp, idx); + pudp = pud_alloc(&init_mm, p4dp, idx); + if (!pudp) + continue; + if (pud_leaf(*pudp)) { + ptep = (pte_t *)pudp; + goto update_the_pte; + } + pmdp = pmd_alloc(&init_mm, pudp, idx); + if (!pmdp) + continue; + if (pmd_leaf(*pmdp)) { + ptep = pmdp_ptep(pmdp); + goto update_the_pte; + } + ptep = pte_alloc_kernel(pmdp, idx); + if (!ptep) + continue; +update_the_pte: + radix__pte_update(&init_mm, idx, ptep, clear, 0, 0); + } + + radix__flush_tlb_kernel_range(start, end); +} + +void radix__mark_rodata_ro(void) +{ + unsigned long start, end; + + start = (unsigned long)_stext; + end = (unsigned long)__end_rodata; + + radix__change_memory_range(start, end, _PAGE_WRITE); + + for (start = PAGE_OFFSET; start < (unsigned long)_stext; start += PAGE_SIZE) { + end = start + PAGE_SIZE; + if (overlaps_interrupt_vector_text(start, end)) + radix__change_memory_range(start, end, _PAGE_WRITE); + else + break; + } +} + +void radix__mark_initmem_nx(void) +{ + unsigned long start = (unsigned long)__init_begin; + unsigned long end = (unsigned long)__init_end; + + radix__change_memory_range(start, end, _PAGE_EXEC); +} +#endif /* CONFIG_STRICT_KERNEL_RWX */ + +static inline void __meminit +print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec) +{ + char buf[10]; + + if (end <= start) + return; + + string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf)); + + pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf, 
+ exec ? " (exec)" : ""); +} + +static unsigned long next_boundary(unsigned long addr, unsigned long end) +{ +#ifdef CONFIG_STRICT_KERNEL_RWX + unsigned long stext_phys; + + stext_phys = __pa_symbol(_stext); + + // Relocatable kernel running at non-zero real address + if (stext_phys != 0) { + // The end of interrupts code at zero is a rodata boundary + unsigned long end_intr = __pa_symbol(__end_interrupts) - stext_phys; + if (addr < end_intr) + return end_intr; + + // Start of relocated kernel text is a rodata boundary + if (addr < stext_phys) + return stext_phys; + } + + if (addr < __pa_symbol(__srwx_boundary)) + return __pa_symbol(__srwx_boundary); +#endif + return end; +} + +static int __meminit create_physical_mapping(unsigned long start, + unsigned long end, + int nid, pgprot_t _prot, + unsigned long mapping_sz_limit) +{ + unsigned long vaddr, addr, mapping_size = 0; + bool prev_exec, exec = false; + pgprot_t prot; + int psize; + unsigned long max_mapping_size = memory_block_size; + + if (mapping_sz_limit < max_mapping_size) + max_mapping_size = mapping_sz_limit; + + if (debug_pagealloc_enabled()) + max_mapping_size = PAGE_SIZE; + + start = ALIGN(start, PAGE_SIZE); + end = ALIGN_DOWN(end, PAGE_SIZE); + for (addr = start; addr < end; addr += mapping_size) { + unsigned long gap, previous_size; + int rc; + + gap = next_boundary(addr, end) - addr; + if (gap > max_mapping_size) + gap = max_mapping_size; + previous_size = mapping_size; + prev_exec = exec; + + if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE && + mmu_psize_defs[MMU_PAGE_1G].shift) { + mapping_size = PUD_SIZE; + psize = MMU_PAGE_1G; + } else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE && + mmu_psize_defs[MMU_PAGE_2M].shift) { + mapping_size = PMD_SIZE; + psize = MMU_PAGE_2M; + } else { + mapping_size = PAGE_SIZE; + psize = mmu_virtual_psize; + } + + vaddr = (unsigned long)__va(addr); + + if (overlaps_kernel_text(vaddr, vaddr + mapping_size) || + overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) { + prot = PAGE_KERNEL_X; + exec = true; + } else { + prot = _prot; + exec = false; + } + + if (mapping_size != previous_size || exec != prev_exec) { + print_mapping(start, addr, previous_size, prev_exec); + start = addr; + } + + rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end); + if (rc) + return rc; + + update_page_count(psize, 1); + } + + print_mapping(start, addr, mapping_size, exec); + return 0; +} + +#ifdef CONFIG_KFENCE +static __init phys_addr_t alloc_kfence_pool(void) +{ + phys_addr_t kfence_pool; + + /* + * TODO: Support to enable KFENCE after bootup depends on the ability to + * split page table mappings. As such support is not currently + * implemented for radix pagetables, support enabling KFENCE + * only at system startup for now. + * + * After support for splitting mappings is available on radix, + * alloc_kfence_pool() & map_kfence_pool() can be dropped and + * mapping for __kfence_pool memory can be + * split during arch_kfence_init_pool(). 
+ */ + if (!kfence_early_init) + goto no_kfence; + + kfence_pool = memblock_phys_alloc(KFENCE_POOL_SIZE, PAGE_SIZE); + if (!kfence_pool) + goto no_kfence; + + memblock_mark_nomap(kfence_pool, KFENCE_POOL_SIZE); + return kfence_pool; + +no_kfence: + disable_kfence(); + return 0; +} + +static __init void map_kfence_pool(phys_addr_t kfence_pool) +{ + if (!kfence_pool) + return; + + if (create_physical_mapping(kfence_pool, kfence_pool + KFENCE_POOL_SIZE, + -1, PAGE_KERNEL, PAGE_SIZE)) + goto err; + + memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE); + __kfence_pool = __va(kfence_pool); + return; + +err: + memblock_phys_free(kfence_pool, KFENCE_POOL_SIZE); + disable_kfence(); +} +#else +static inline phys_addr_t alloc_kfence_pool(void) { return 0; } +static inline void map_kfence_pool(phys_addr_t kfence_pool) { } +#endif + +static void __init radix_init_pgtable(void) +{ + phys_addr_t kfence_pool; + unsigned long rts_field; + phys_addr_t start, end; + u64 i; + + /* We don't support slb for radix */ + slb_set_size(0); + + kfence_pool = alloc_kfence_pool(); + + /* + * Create the linear mapping + */ + for_each_mem_range(i, &start, &end) { + /* + * The memblock allocator is up at this point, so the + * page tables will be allocated within the range. No + * need or a node (which we don't have yet). + */ + + if (end >= RADIX_VMALLOC_START) { + pr_warn("Outside the supported range\n"); + continue; + } + + WARN_ON(create_physical_mapping(start, end, + -1, PAGE_KERNEL, ~0UL)); + } + + map_kfence_pool(kfence_pool); + + if (!cpu_has_feature(CPU_FTR_HVMODE) && + cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) { + /* + * Older versions of KVM on these machines prefer if the + * guest only uses the low 19 PID bits. + */ + mmu_pid_bits = 19; + } + mmu_base_pid = 1; + + /* + * Allocate Partition table and process table for the + * host. + */ + BUG_ON(PRTB_SIZE_SHIFT > 36); + process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0); + /* + * Fill in the process table. + */ + rts_field = radix__get_tree_size(); + process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE); + + /* + * The init_mm context is given the first available (non-zero) PID, + * which is the "guard PID" and contains no page table. PIDR should + * never be set to zero because that duplicates the kernel address + * space at the 0x0... offset (quadrant 0)! + * + * An arbitrary PID that may later be allocated by the PID allocator + * for userspace processes must not be used either, because that + * would cause stale user mappings for that PID on CPUs outside of + * the TLB invalidation scheme (because it won't be in mm_cpumask). + * + * So permanently carve out one PID for the purpose of a guard PID. 
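+ * With mmu_base_pid set to 1 above, init_mm gets PID 1 as its guard PID
+ * and mmu_base_pid is bumped past it, so the PID allocator never hands it
+ * to a userspace process.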
+ */ + init_mm.context.id = mmu_base_pid; + mmu_base_pid++; +} + +static void __init radix_init_partition_table(void) +{ + unsigned long rts_field, dw0, dw1; + + mmu_partition_table_init(); + rts_field = radix__get_tree_size(); + dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR; + dw1 = __pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR; + mmu_partition_table_set_entry(0, dw0, dw1, false); + + pr_info("Initializing Radix MMU\n"); +} + +static int __init get_idx_from_shift(unsigned int shift) +{ + int idx = -1; + + switch (shift) { + case 0xc: + idx = MMU_PAGE_4K; + break; + case 0x10: + idx = MMU_PAGE_64K; + break; + case 0x15: + idx = MMU_PAGE_2M; + break; + case 0x1e: + idx = MMU_PAGE_1G; + break; + } + return idx; +} + +static int __init radix_dt_scan_page_sizes(unsigned long node, + const char *uname, int depth, + void *data) +{ + int size = 0; + int shift, idx; + unsigned int ap; + const __be32 *prop; + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + + /* We are scanning "cpu" nodes only */ + if (type == NULL || strcmp(type, "cpu") != 0) + return 0; + + /* Grab page size encodings */ + prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size); + if (!prop) + return 0; + + pr_info("Page sizes from device-tree:\n"); + for (; size >= 4; size -= 4, ++prop) { + + struct mmu_psize_def *def; + + /* top 3 bit is AP encoding */ + shift = be32_to_cpu(prop[0]) & ~(0xe << 28); + ap = be32_to_cpu(prop[0]) >> 29; + pr_info("Page size shift = %d AP=0x%x\n", shift, ap); + + idx = get_idx_from_shift(shift); + if (idx < 0) + continue; + + def = &mmu_psize_defs[idx]; + def->shift = shift; + def->ap = ap; + def->h_rpt_pgsize = psize_to_rpti_pgsize(idx); + } + + /* needed ? */ + cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B; + return 1; +} + +void __init radix__early_init_devtree(void) +{ + int rc; + + /* + * Try to find the available page sizes in the device-tree + */ + rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL); + if (!rc) { + /* + * No page size details found in device tree. 
+ * Let's assume we have page 4k and 64k support + */ + mmu_psize_defs[MMU_PAGE_4K].shift = 12; + mmu_psize_defs[MMU_PAGE_4K].ap = 0x0; + mmu_psize_defs[MMU_PAGE_4K].h_rpt_pgsize = + psize_to_rpti_pgsize(MMU_PAGE_4K); + + mmu_psize_defs[MMU_PAGE_64K].shift = 16; + mmu_psize_defs[MMU_PAGE_64K].ap = 0x5; + mmu_psize_defs[MMU_PAGE_64K].h_rpt_pgsize = + psize_to_rpti_pgsize(MMU_PAGE_64K); + } + return; +} + +void __init radix__early_init_mmu(void) +{ + unsigned long lpcr; + +#ifdef CONFIG_PPC_64S_HASH_MMU +#ifdef CONFIG_PPC_64K_PAGES + /* PAGE_SIZE mappings */ + mmu_virtual_psize = MMU_PAGE_64K; +#else + mmu_virtual_psize = MMU_PAGE_4K; +#endif +#endif + /* + * initialize page table size + */ + __pte_index_size = RADIX_PTE_INDEX_SIZE; + __pmd_index_size = RADIX_PMD_INDEX_SIZE; + __pud_index_size = RADIX_PUD_INDEX_SIZE; + __pgd_index_size = RADIX_PGD_INDEX_SIZE; + __pud_cache_index = RADIX_PUD_INDEX_SIZE; + __pte_table_size = RADIX_PTE_TABLE_SIZE; + __pmd_table_size = RADIX_PMD_TABLE_SIZE; + __pud_table_size = RADIX_PUD_TABLE_SIZE; + __pgd_table_size = RADIX_PGD_TABLE_SIZE; + + __pmd_val_bits = RADIX_PMD_VAL_BITS; + __pud_val_bits = RADIX_PUD_VAL_BITS; + __pgd_val_bits = RADIX_PGD_VAL_BITS; + + __kernel_virt_start = RADIX_KERN_VIRT_START; + __vmalloc_start = RADIX_VMALLOC_START; + __vmalloc_end = RADIX_VMALLOC_END; + __kernel_io_start = RADIX_KERN_IO_START; + __kernel_io_end = RADIX_KERN_IO_END; + vmemmap = (struct page *)RADIX_VMEMMAP_START; + ioremap_bot = IOREMAP_BASE; + +#ifdef CONFIG_PCI + pci_io_base = ISA_IO_BASE; +#endif + __pte_frag_nr = RADIX_PTE_FRAG_NR; + __pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT; + __pmd_frag_nr = RADIX_PMD_FRAG_NR; + __pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT; + + radix_init_pgtable(); + + if (!firmware_has_feature(FW_FEATURE_LPAR)) { + lpcr = mfspr(SPRN_LPCR); + mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); + radix_init_partition_table(); + } else { + radix_init_pseries(); + } + + memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); + + /* Switch to the guard PID before turning on MMU */ + radix__switch_mmu_context(NULL, &init_mm); + tlbiel_all(); +} + +void radix__early_init_mmu_secondary(void) +{ + unsigned long lpcr; + /* + * update partition table control register and UPRT + */ + if (!firmware_has_feature(FW_FEATURE_LPAR)) { + lpcr = mfspr(SPRN_LPCR); + mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); + + set_ptcr_when_no_uv(__pa(partition_tb) | + (PATB_SIZE_SHIFT - 12)); + } + + radix__switch_mmu_context(NULL, &init_mm); + tlbiel_all(); + + /* Make sure userspace can't change the AMR */ + mtspr(SPRN_UAMOR, 0); +} + +/* Called during kexec sequence with MMU off */ +notrace void radix__mmu_cleanup_all(void) +{ + unsigned long lpcr; + + if (!firmware_has_feature(FW_FEATURE_LPAR)) { + lpcr = mfspr(SPRN_LPCR); + mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT); + set_ptcr_when_no_uv(0); + powernv_set_nmmu_ptcr(0); + radix__flush_tlb_all(); + } +} + +#ifdef CONFIG_MEMORY_HOTPLUG +static void free_pte_table(pte_t *pte_start, pmd_t *pmd) +{ + pte_t *pte; + int i; + + for (i = 0; i < PTRS_PER_PTE; i++) { + pte = pte_start + i; + if (!pte_none(*pte)) + return; + } + + pte_free_kernel(&init_mm, pte_start); + pmd_clear(pmd); +} + +static void free_pmd_table(pmd_t *pmd_start, pud_t *pud) +{ + pmd_t *pmd; + int i; + + for (i = 0; i < PTRS_PER_PMD; i++) { + pmd = pmd_start + i; + if (!pmd_none(*pmd)) + return; + } + + pmd_free(&init_mm, pmd_start); + pud_clear(pud); +} + +static void free_pud_table(pud_t *pud_start, p4d_t *p4d) +{ + pud_t *pud; + int i; + + for (i = 0; i < 
PTRS_PER_PUD; i++) { + pud = pud_start + i; + if (!pud_none(*pud)) + return; + } + + pud_free(&init_mm, pud_start); + p4d_clear(p4d); +} + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end) +{ + unsigned long start = ALIGN_DOWN(addr, PMD_SIZE); + + return !vmemmap_populated(start, PMD_SIZE); +} + +static bool __meminit vmemmap_page_is_unused(unsigned long addr, unsigned long end) +{ + unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE); + + return !vmemmap_populated(start, PAGE_SIZE); + +} +#endif + +static void __meminit free_vmemmap_pages(struct page *page, + struct vmem_altmap *altmap, + int order) +{ + unsigned int nr_pages = 1 << order; + + if (altmap) { + unsigned long alt_start, alt_end; + unsigned long base_pfn = page_to_pfn(page); + + /* + * with 2M vmemmap mmaping we can have things setup + * such that even though atlmap is specified we never + * used altmap. + */ + alt_start = altmap->base_pfn; + alt_end = altmap->base_pfn + altmap->reserve + altmap->free; + + if (base_pfn >= alt_start && base_pfn < alt_end) { + vmem_altmap_free(altmap, nr_pages); + return; + } + } + + if (PageReserved(page)) { + /* allocated from memblock */ + while (nr_pages--) + free_reserved_page(page++); + } else + __free_pages(page, order); +} + +static void __meminit remove_pte_table(pte_t *pte_start, unsigned long addr, + unsigned long end, bool direct, + struct vmem_altmap *altmap) +{ + unsigned long next, pages = 0; + pte_t *pte; + + pte = pte_start + pte_index(addr); + for (; addr < end; addr = next, pte++) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + if (next > end) + next = end; + + if (!pte_present(*pte)) + continue; + + if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) { + if (!direct) + free_vmemmap_pages(pte_page(*pte), altmap, 0); + pte_clear(&init_mm, addr, pte); + pages++; + } +#ifdef CONFIG_SPARSEMEM_VMEMMAP + else if (!direct && vmemmap_page_is_unused(addr, next)) { + free_vmemmap_pages(pte_page(*pte), altmap, 0); + pte_clear(&init_mm, addr, pte); + } +#endif + } + if (direct) + update_page_count(mmu_virtual_psize, -pages); +} + +static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr, + unsigned long end, bool direct, + struct vmem_altmap *altmap) +{ + unsigned long next, pages = 0; + pte_t *pte_base; + pmd_t *pmd; + + pmd = pmd_start + pmd_index(addr); + for (; addr < end; addr = next, pmd++) { + next = pmd_addr_end(addr, end); + + if (!pmd_present(*pmd)) + continue; + + if (pmd_leaf(*pmd)) { + if (IS_ALIGNED(addr, PMD_SIZE) && + IS_ALIGNED(next, PMD_SIZE)) { + if (!direct) + free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE)); + pte_clear(&init_mm, addr, (pte_t *)pmd); + pages++; + } +#ifdef CONFIG_SPARSEMEM_VMEMMAP + else if (!direct && vmemmap_pmd_is_unused(addr, next)) { + free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE)); + pte_clear(&init_mm, addr, (pte_t *)pmd); + } +#endif + continue; + } + + pte_base = (pte_t *)pmd_page_vaddr(*pmd); + remove_pte_table(pte_base, addr, next, direct, altmap); + free_pte_table(pte_base, pmd); + } + if (direct) + update_page_count(MMU_PAGE_2M, -pages); +} + +static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr, + unsigned long end, bool direct, + struct vmem_altmap *altmap) +{ + unsigned long next, pages = 0; + pmd_t *pmd_base; + pud_t *pud; + + pud = pud_start + pud_index(addr); + for (; addr < end; addr = next, pud++) { + next = pud_addr_end(addr, end); + + if (!pud_present(*pud)) + continue; + + if (pud_leaf(*pud)) { + if 
(!IS_ALIGNED(addr, PUD_SIZE) || + !IS_ALIGNED(next, PUD_SIZE)) { + WARN_ONCE(1, "%s: unaligned range\n", __func__); + continue; + } + pte_clear(&init_mm, addr, (pte_t *)pud); + pages++; + continue; + } + + pmd_base = pud_pgtable(*pud); + remove_pmd_table(pmd_base, addr, next, direct, altmap); + free_pmd_table(pmd_base, pud); + } + if (direct) + update_page_count(MMU_PAGE_1G, -pages); +} + +static void __meminit +remove_pagetable(unsigned long start, unsigned long end, bool direct, + struct vmem_altmap *altmap) +{ + unsigned long addr, next; + pud_t *pud_base; + pgd_t *pgd; + p4d_t *p4d; + + spin_lock(&init_mm.page_table_lock); + + for (addr = start; addr < end; addr = next) { + next = pgd_addr_end(addr, end); + + pgd = pgd_offset_k(addr); + p4d = p4d_offset(pgd, addr); + if (!p4d_present(*p4d)) + continue; + + if (p4d_leaf(*p4d)) { + if (!IS_ALIGNED(addr, P4D_SIZE) || + !IS_ALIGNED(next, P4D_SIZE)) { + WARN_ONCE(1, "%s: unaligned range\n", __func__); + continue; + } + + pte_clear(&init_mm, addr, (pte_t *)pgd); + continue; + } + + pud_base = p4d_pgtable(*p4d); + remove_pud_table(pud_base, addr, next, direct, altmap); + free_pud_table(pud_base, p4d); + } + + spin_unlock(&init_mm.page_table_lock); + radix__flush_tlb_kernel_range(start, end); +} + +int __meminit radix__create_section_mapping(unsigned long start, + unsigned long end, int nid, + pgprot_t prot) +{ + if (end >= RADIX_VMALLOC_START) { + pr_warn("Outside the supported range\n"); + return -1; + } + + return create_physical_mapping(__pa(start), __pa(end), + nid, prot, ~0UL); +} + +int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end) +{ + remove_pagetable(start, end, true, NULL); + return 0; +} +#endif /* CONFIG_MEMORY_HOTPLUG */ + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +static int __map_kernel_page_nid(unsigned long ea, unsigned long pa, + pgprot_t flags, unsigned int map_page_size, + int nid) +{ + return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0); +} + +int __meminit radix__vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys) +{ + /* Create a PTE encoding */ + int nid = early_pfn_to_nid(phys >> PAGE_SHIFT); + int ret; + + if ((start + page_size) >= RADIX_VMEMMAP_END) { + pr_warn("Outside the supported range\n"); + return -1; + } + + ret = __map_kernel_page_nid(start, phys, PAGE_KERNEL, page_size, nid); + BUG_ON(ret); + + return 0; +} + +#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP +bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap) +{ + if (radix_enabled()) + return __vmemmap_can_optimize(altmap, pgmap); + + return false; +} +#endif + +int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node, + unsigned long addr, unsigned long next) +{ + int large = pmd_leaf(*pmdp); + + if (large) + vmemmap_verify(pmdp_ptep(pmdp), node, addr, next); + + return large; +} + +void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node, + unsigned long addr, unsigned long next) +{ + pte_t entry; + pte_t *ptep = pmdp_ptep(pmdp); + + VM_BUG_ON(!IS_ALIGNED(addr, PMD_SIZE)); + entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); + set_pte_at(&init_mm, addr, ptep, entry); + asm volatile("ptesync": : :"memory"); + + vmemmap_verify(ptep, node, addr, next); +} + +static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long addr, + int node, + struct vmem_altmap *altmap, + struct page *reuse) +{ + pte_t *pte = pte_offset_kernel(pmdp, addr); + + if (pte_none(*pte)) { + pte_t entry; + void *p; + + if (!reuse) { + /* + * make sure we 
don't create altmap mappings + * covering things outside the device. + */ + if (altmap && altmap_cross_boundary(altmap, addr, PAGE_SIZE)) + altmap = NULL; + + p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap); + if (!p && altmap) + p = vmemmap_alloc_block_buf(PAGE_SIZE, node, NULL); + if (!p) + return NULL; + pr_debug("PAGE_SIZE vmemmap mapping\n"); + } else { + /* + * When a PTE/PMD entry is freed from the init_mm + * there's a free_pages() call to this page allocated + * above. Thus this get_page() is paired with the + * put_page_testzero() on the freeing path. + * This can only called by certain ZONE_DEVICE path, + * and through vmemmap_populate_compound_pages() when + * slab is available. + */ + get_page(reuse); + p = page_to_virt(reuse); + pr_debug("Tail page reuse vmemmap mapping\n"); + } + + VM_BUG_ON(!PAGE_ALIGNED(addr)); + entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); + set_pte_at(&init_mm, addr, pte, entry); + asm volatile("ptesync": : :"memory"); + } + return pte; +} + +static inline pud_t *vmemmap_pud_alloc(p4d_t *p4dp, int node, + unsigned long address) +{ + pud_t *pud; + + /* All early vmemmap mapping to keep simple do it at PAGE_SIZE */ + if (unlikely(p4d_none(*p4dp))) { + if (unlikely(!slab_is_available())) { + pud = early_alloc_pgtable(PAGE_SIZE, node, 0, 0); + p4d_populate(&init_mm, p4dp, pud); + /* go to the pud_offset */ + } else + return pud_alloc(&init_mm, p4dp, address); + } + return pud_offset(p4dp, address); +} + +static inline pmd_t *vmemmap_pmd_alloc(pud_t *pudp, int node, + unsigned long address) +{ + pmd_t *pmd; + + /* All early vmemmap mapping to keep simple do it at PAGE_SIZE */ + if (unlikely(pud_none(*pudp))) { + if (unlikely(!slab_is_available())) { + pmd = early_alloc_pgtable(PAGE_SIZE, node, 0, 0); + pud_populate(&init_mm, pudp, pmd); + } else + return pmd_alloc(&init_mm, pudp, address); + } + return pmd_offset(pudp, address); +} + +static inline pte_t *vmemmap_pte_alloc(pmd_t *pmdp, int node, + unsigned long address) +{ + pte_t *pte; + + /* All early vmemmap mapping to keep simple do it at PAGE_SIZE */ + if (unlikely(pmd_none(*pmdp))) { + if (unlikely(!slab_is_available())) { + pte = early_alloc_pgtable(PAGE_SIZE, node, 0, 0); + pmd_populate(&init_mm, pmdp, pte); + } else + return pte_alloc_kernel(pmdp, address); + } + return pte_offset_kernel(pmdp, address); +} + + + +int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, int node, + struct vmem_altmap *altmap) +{ + unsigned long addr; + unsigned long next; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + /* + * If altmap is present, Make sure we align the start vmemmap addr + * to PAGE_SIZE so that we calculate the correct start_pfn in + * altmap boundary check to decide whether we should use altmap or + * RAM based backing memory allocation. Also the address need to be + * aligned for set_pte operation. If the start addr is already + * PMD_SIZE aligned and with in the altmap boundary then we will + * try to use a pmd size altmap mapping else we go for page size + * mapping. + * + * If altmap is not present, align the vmemmap addr to PMD_SIZE and + * always allocate a PMD size page for vmemmap backing. 
+ * + */ + + if (altmap) + start = ALIGN_DOWN(start, PAGE_SIZE); + else + start = ALIGN_DOWN(start, PMD_SIZE); + + for (addr = start; addr < end; addr = next) { + next = pmd_addr_end(addr, end); + + pgd = pgd_offset_k(addr); + p4d = p4d_offset(pgd, addr); + pud = vmemmap_pud_alloc(p4d, node, addr); + if (!pud) + return -ENOMEM; + pmd = vmemmap_pmd_alloc(pud, node, addr); + if (!pmd) + return -ENOMEM; + + if (pmd_none(READ_ONCE(*pmd))) { + void *p; + + /* + * keep it simple by checking addr PMD_SIZE alignment + * and verifying the device boundary condition. + * For us to use a pmd mapping, both addr and pfn should + * be aligned. We skip if addr is not aligned and for + * pfn we hope we have extra area in the altmap that + * can help to find an aligned block. This can result + * in altmap block allocation failures, in which case + * we fallback to RAM for vmemmap allocation. + */ + if (altmap && (!IS_ALIGNED(addr, PMD_SIZE) || + altmap_cross_boundary(altmap, addr, PMD_SIZE))) { + /* + * make sure we don't create altmap mappings + * covering things outside the device. + */ + goto base_mapping; + } + + p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap); + if (p) { + vmemmap_set_pmd(pmd, p, node, addr, next); + pr_debug("PMD_SIZE vmemmap mapping\n"); + continue; + } else { + /* + * A vmemmap block allocation can fail due to + * alignment requirements and we trying to align + * things aggressively there by running out of + * space. Try base mapping on failure. + */ + goto base_mapping; + } + } else if (vmemmap_check_pmd(pmd, node, addr, next)) { + /* + * If a huge mapping exist due to early call to + * vmemmap_populate, let's try to use that. + */ + continue; + } +base_mapping: + /* + * Not able allocate higher order memory to back memmap + * or we found a pointer to pte page. Allocate base page + * size vmemmap + */ + pte = vmemmap_pte_alloc(pmd, node, addr); + if (!pte) + return -ENOMEM; + + pte = radix__vmemmap_pte_populate(pmd, addr, node, altmap, NULL); + if (!pte) + return -ENOMEM; + + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + next = addr + PAGE_SIZE; + } + return 0; +} + +static pte_t * __meminit radix__vmemmap_populate_address(unsigned long addr, int node, + struct vmem_altmap *altmap, + struct page *reuse) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset_k(addr); + p4d = p4d_offset(pgd, addr); + pud = vmemmap_pud_alloc(p4d, node, addr); + if (!pud) + return NULL; + pmd = vmemmap_pmd_alloc(pud, node, addr); + if (!pmd) + return NULL; + if (pmd_leaf(*pmd)) + /* + * The second page is mapped as a hugepage due to a nearby request. 
+ * Force our mapping to page size without deduplication + */ + return NULL; + pte = vmemmap_pte_alloc(pmd, node, addr); + if (!pte) + return NULL; + radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL); + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + + return pte; +} + +static pte_t * __meminit vmemmap_compound_tail_page(unsigned long addr, + unsigned long pfn_offset, int node) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + unsigned long map_addr; + + /* the second vmemmap page which we use for duplication */ + map_addr = addr - pfn_offset * sizeof(struct page) + PAGE_SIZE; + pgd = pgd_offset_k(map_addr); + p4d = p4d_offset(pgd, map_addr); + pud = vmemmap_pud_alloc(p4d, node, map_addr); + if (!pud) + return NULL; + pmd = vmemmap_pmd_alloc(pud, node, map_addr); + if (!pmd) + return NULL; + if (pmd_leaf(*pmd)) + /* + * The second page is mapped as a hugepage due to a nearby request. + * Force our mapping to page size without deduplication + */ + return NULL; + pte = vmemmap_pte_alloc(pmd, node, map_addr); + if (!pte) + return NULL; + /* + * Check if there exist a mapping to the left + */ + if (pte_none(*pte)) { + /* + * Populate the head page vmemmap page. + * It can fall in different pmd, hence + * vmemmap_populate_address() + */ + pte = radix__vmemmap_populate_address(map_addr - PAGE_SIZE, node, NULL, NULL); + if (!pte) + return NULL; + /* + * Populate the tail pages vmemmap page + */ + pte = radix__vmemmap_pte_populate(pmd, map_addr, node, NULL, NULL); + if (!pte) + return NULL; + vmemmap_verify(pte, node, map_addr, map_addr + PAGE_SIZE); + return pte; + } + return pte; +} + +int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn, + unsigned long start, + unsigned long end, int node, + struct dev_pagemap *pgmap) +{ + /* + * we want to map things as base page size mapping so that + * we can save space in vmemmap. We could have huge mapping + * covering out both edges. + */ + unsigned long addr; + unsigned long addr_pfn = start_pfn; + unsigned long next; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + for (addr = start; addr < end; addr = next) { + + pgd = pgd_offset_k(addr); + p4d = p4d_offset(pgd, addr); + pud = vmemmap_pud_alloc(p4d, node, addr); + if (!pud) + return -ENOMEM; + pmd = vmemmap_pmd_alloc(pud, node, addr); + if (!pmd) + return -ENOMEM; + + if (pmd_leaf(READ_ONCE(*pmd))) { + /* existing huge mapping. Skip the range */ + addr_pfn += (PMD_SIZE >> PAGE_SHIFT); + next = pmd_addr_end(addr, end); + continue; + } + pte = vmemmap_pte_alloc(pmd, node, addr); + if (!pte) + return -ENOMEM; + if (!pte_none(*pte)) { + /* + * This could be because we already have a compound + * page whose VMEMMAP_RESERVE_NR pages were mapped and + * this request fall in those pages. + */ + addr_pfn += 1; + next = addr + PAGE_SIZE; + continue; + } else { + unsigned long nr_pages = pgmap_vmemmap_nr(pgmap); + unsigned long pfn_offset = addr_pfn - ALIGN_DOWN(addr_pfn, nr_pages); + pte_t *tail_page_pte; + + /* + * if the address is aligned to huge page size it is the + * head mapping. 
+ */ + if (pfn_offset == 0) { + /* Populate the head page vmemmap page */ + pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL); + if (!pte) + return -ENOMEM; + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + + /* + * Populate the tail pages vmemmap page + * It can fall in different pmd, hence + * vmemmap_populate_address() + */ + pte = radix__vmemmap_populate_address(addr + PAGE_SIZE, node, NULL, NULL); + if (!pte) + return -ENOMEM; + + addr_pfn += 2; + next = addr + 2 * PAGE_SIZE; + continue; + } + /* + * get the 2nd mapping details + * Also create it if that doesn't exist + */ + tail_page_pte = vmemmap_compound_tail_page(addr, pfn_offset, node); + if (!tail_page_pte) { + + pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL); + if (!pte) + return -ENOMEM; + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + + addr_pfn += 1; + next = addr + PAGE_SIZE; + continue; + } + + pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, pte_page(*tail_page_pte)); + if (!pte) + return -ENOMEM; + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + + addr_pfn += 1; + next = addr + PAGE_SIZE; + continue; + } + } + return 0; +} + + +#ifdef CONFIG_MEMORY_HOTPLUG +void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size) +{ + remove_pagetable(start, start + page_size, true, NULL); +} + +void __ref radix__vmemmap_free(unsigned long start, unsigned long end, + struct vmem_altmap *altmap) +{ + remove_pagetable(start, end, false, altmap); +} +#endif +#endif + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + +unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, unsigned long clr, + unsigned long set) +{ + unsigned long old; + +#ifdef CONFIG_DEBUG_VM + WARN_ON(!radix__pmd_trans_huge(*pmdp)); + assert_spin_locked(pmd_lockptr(mm, pmdp)); +#endif + + old = radix__pte_update(mm, addr, pmdp_ptep(pmdp), clr, set, 1); + trace_hugepage_update_pmd(addr, old, clr, set); + + return old; +} + +unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, unsigned long clr, + unsigned long set) +{ + unsigned long old; + +#ifdef CONFIG_DEBUG_VM + WARN_ON(!pud_trans_huge(*pudp)); + assert_spin_locked(pud_lockptr(mm, pudp)); +#endif + + old = radix__pte_update(mm, addr, pudp_ptep(pudp), clr, set, 1); + trace_hugepage_update_pud(addr, old, clr, set); + + return old; +} + +pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) + +{ + pmd_t pmd; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + VM_BUG_ON(radix__pmd_trans_huge(*pmdp)); + /* + * khugepaged calls this for normal pmd + */ + pmd = *pmdp; + pmd_clear(pmdp); + + radix__flush_tlb_collapsed_pmd(vma->vm_mm, address); + + return pmd; +} + +/* + * For us pgtable_t is pte_t *. Inorder to save the deposisted + * page table, we consider the allocated page table as a list + * head. On withdraw we need to make sure we zero out the used + * list_head memory area. 
+ */ +void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable) +{ + struct list_head *lh = (struct list_head *) pgtable; + + assert_spin_locked(pmd_lockptr(mm, pmdp)); + + /* FIFO */ + if (!pmd_huge_pte(mm, pmdp)) + INIT_LIST_HEAD(lh); + else + list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp)); + pmd_huge_pte(mm, pmdp) = pgtable; +} + +pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) +{ + pte_t *ptep; + pgtable_t pgtable; + struct list_head *lh; + + assert_spin_locked(pmd_lockptr(mm, pmdp)); + + /* FIFO */ + pgtable = pmd_huge_pte(mm, pmdp); + lh = (struct list_head *) pgtable; + if (list_empty(lh)) + pmd_huge_pte(mm, pmdp) = NULL; + else { + pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next; + list_del(lh); + } + ptep = (pte_t *) pgtable; + *ptep = __pte(0); + ptep++; + *ptep = __pte(0); + return pgtable; +} + +pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t old_pmd; + unsigned long old; + + old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0); + old_pmd = __pmd(old); + return old_pmd; +} + +pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) +{ + pud_t old_pud; + unsigned long old; + + old = radix__pud_hugepage_update(mm, addr, pudp, ~0UL, 0); + old_pud = __pud(old); + return old_pud; +} + +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, + pte_t entry, unsigned long address, int psize) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_SOFT_DIRTY | + _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC); + + unsigned long change = pte_val(entry) ^ pte_val(*ptep); + /* + * On POWER9, the NMMU is not able to relax PTE access permissions + * for a translation with a TLB. The PTE must be invalidated, TLB + * flushed before the new PTE is installed. + * + * This only needs to be done for radix, because hash translation does + * flush when updating the linux pte (and we don't support NMMU + * accelerators on HPT on POWER9 anyway XXX: do we?). + * + * POWER10 (and P9P) NMMU does behave as per ISA. + */ + if (!cpu_has_feature(CPU_FTR_ARCH_31) && (change & _PAGE_RW) && + atomic_read(&mm->context.copros) > 0) { + unsigned long old_pte, new_pte; + + old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID); + new_pte = old_pte | set; + radix__flush_tlb_page_psize(mm, address, psize); + __radix_pte_update(ptep, _PAGE_INVALID, new_pte); + } else { + __radix_pte_update(ptep, 0, set); + /* + * Book3S does not require a TLB flush when relaxing access + * restrictions when the address space (modulo the POWER9 nest + * MMU issue above) because the MMU will reload the PTE after + * taking an access fault, as defined by the architecture. See + * "Setting a Reference or Change Bit or Upgrading Access + * Authority (PTE Subject to Atomic Hardware Updates)" in + * Power ISA Version 3.1B. + */ + } + /* See ptesync comment in radix__set_pte_at */ +} + +void radix__ptep_modify_prot_commit(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t old_pte, pte_t pte) +{ + struct mm_struct *mm = vma->vm_mm; + + /* + * POWER9 NMMU must flush the TLB after clearing the PTE before + * installing a PTE with more relaxed access permissions, see + * radix__ptep_set_access_flags. 
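+ * ISA v3.1 (CPU_FTR_ARCH_31) parts do not need this, which is why the
+ * flush below is gated on !CPU_FTR_ARCH_31 and only done while a
+ * coprocessor context is attached to the mm.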
+ */ + if (!cpu_has_feature(CPU_FTR_ARCH_31) && + is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) && + (atomic_read(&mm->context.copros) > 0)) + radix__flush_tlb_page(vma, addr); + + set_pte_at(mm, addr, ptep, pte); +} + +int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) +{ + pte_t *ptep = (pte_t *)pud; + pte_t new_pud = pfn_pte(__phys_to_pfn(addr), prot); + + if (!radix_enabled()) + return 0; + + set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pud); + + return 1; +} + +int pud_clear_huge(pud_t *pud) +{ + if (pud_leaf(*pud)) { + pud_clear(pud); + return 1; + } + + return 0; +} + +int pud_free_pmd_page(pud_t *pud, unsigned long addr) +{ + pmd_t *pmd; + int i; + + pmd = pud_pgtable(*pud); + pud_clear(pud); + + flush_tlb_kernel_range(addr, addr + PUD_SIZE); + + for (i = 0; i < PTRS_PER_PMD; i++) { + if (!pmd_none(pmd[i])) { + pte_t *pte; + pte = (pte_t *)pmd_page_vaddr(pmd[i]); + + pte_free_kernel(&init_mm, pte); + } + } + + pmd_free(&init_mm, pmd); + + return 1; +} + +int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) +{ + pte_t *ptep = (pte_t *)pmd; + pte_t new_pmd = pfn_pte(__phys_to_pfn(addr), prot); + + if (!radix_enabled()) + return 0; + + set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pmd); + + return 1; +} + +int pmd_clear_huge(pmd_t *pmd) +{ + if (pmd_leaf(*pmd)) { + pmd_clear(pmd); + return 1; + } + + return 0; +} + +int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) +{ + pte_t *pte; + + pte = (pte_t *)pmd_page_vaddr(*pmd); + pmd_clear(pmd); + + flush_tlb_kernel_range(addr, addr + PMD_SIZE); + + pte_free_kernel(&init_mm, pte); + + return 1; +} diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c new file mode 100644 index 000000000000..9e1f6558d026 --- /dev/null +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -0,0 +1,1587 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * TLB flush routines for radix kernels. + * + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. + */ + +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/memblock.h> +#include <linux/mmu_context.h> +#include <linux/sched/mm.h> +#include <linux/debugfs.h> + +#include <asm/ppc-opcode.h> +#include <asm/tlb.h> +#include <asm/tlbflush.h> +#include <asm/trace.h> +#include <asm/cputhreads.h> +#include <asm/plpar_wrappers.h> + +#include "internal.h" + +/* + * tlbiel instruction for radix, set invalidation + * i.e., r=1 and is=01 or is=10 or is=11 + */ +static __always_inline void tlbiel_radix_set_isa300(unsigned int set, unsigned int is, + unsigned int pid, + unsigned int ric, unsigned int prs) +{ + unsigned long rb; + unsigned long rs; + + rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53)); + rs = ((unsigned long)pid << PPC_BITLSHIFT(31)); + + asm volatile(PPC_TLBIEL(%0, %1, %2, %3, 1) + : : "r"(rb), "r"(rs), "i"(ric), "i"(prs) + : "memory"); +} + +static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) +{ + unsigned int set; + + asm volatile("ptesync": : :"memory"); + + /* + * Flush the first set of the TLB, and the entire Page Walk Cache + * and partition table entries. Then flush the remaining sets of the + * TLB. + */ + + if (early_cpu_has_feature(CPU_FTR_HVMODE)) { + /* MSR[HV] should flush partition scope translations first. */ + tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0); + + if (!early_cpu_has_feature(CPU_FTR_ARCH_31)) { + for (set = 1; set < num_sets; set++) + tlbiel_radix_set_isa300(set, is, 0, + RIC_FLUSH_TLB, 0); + } + } + + /* Flush process scoped entries. 
*/ + tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1); + + if (!early_cpu_has_feature(CPU_FTR_ARCH_31)) { + for (set = 1; set < num_sets; set++) + tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1); + } + + ppc_after_tlbiel_barrier(); +} + +void radix__tlbiel_all(unsigned int action) +{ + unsigned int is; + + switch (action) { + case TLB_INVAL_SCOPE_GLOBAL: + is = 3; + break; + case TLB_INVAL_SCOPE_LPID: + is = 2; + break; + default: + BUG(); + } + + if (early_cpu_has_feature(CPU_FTR_ARCH_300)) + tlbiel_all_isa300(POWER9_TLB_SETS_RADIX, is); + else + WARN(1, "%s called on pre-POWER9 CPU\n", __func__); + + asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT "; isync" : : :"memory"); +} + +static __always_inline void __tlbiel_pid(unsigned long pid, int set, + unsigned long ric) +{ + unsigned long rb,rs,prs,r; + + rb = PPC_BIT(53); /* IS = 1 */ + rb |= set << PPC_BITLSHIFT(51); + rs = ((unsigned long)pid) << PPC_BITLSHIFT(31); + prs = 1; /* process scoped */ + r = 1; /* radix format */ + + asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(0, 1, rb, rs, ric, prs, r); +} + +static __always_inline void __tlbie_pid(unsigned long pid, unsigned long ric) +{ + unsigned long rb,rs,prs,r; + + rb = PPC_BIT(53); /* IS = 1 */ + rs = pid << PPC_BITLSHIFT(31); + prs = 1; /* process scoped */ + r = 1; /* radix format */ + + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(0, 0, rb, rs, ric, prs, r); +} + +static __always_inline void __tlbie_lpid(unsigned long lpid, unsigned long ric) +{ + unsigned long rb,rs,prs,r; + + rb = PPC_BIT(52); /* IS = 2 */ + rs = lpid; + prs = 0; /* partition scoped */ + r = 1; /* radix format */ + + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(lpid, 0, rb, rs, ric, prs, r); +} + +static __always_inline void __tlbie_lpid_guest(unsigned long lpid, unsigned long ric) +{ + unsigned long rb,rs,prs,r; + + rb = PPC_BIT(52); /* IS = 2 */ + rs = lpid; + prs = 1; /* process scoped */ + r = 1; /* radix format */ + + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(lpid, 0, rb, rs, ric, prs, r); +} + +static __always_inline void __tlbiel_va(unsigned long va, unsigned long pid, + unsigned long ap, unsigned long ric) +{ + unsigned long rb,rs,prs,r; + + rb = va & ~(PPC_BITMASK(52, 63)); + rb |= ap << PPC_BITLSHIFT(58); + rs = pid << PPC_BITLSHIFT(31); + prs = 1; /* process scoped */ + r = 1; /* radix format */ + + asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(0, 1, rb, rs, ric, prs, r); +} + +static __always_inline void __tlbie_va(unsigned long va, unsigned long pid, + unsigned long ap, unsigned long ric) +{ + unsigned long rb,rs,prs,r; + + rb = va & ~(PPC_BITMASK(52, 63)); + rb |= ap << PPC_BITLSHIFT(58); + rs = pid << PPC_BITLSHIFT(31); + prs = 1; /* process scoped */ + r = 1; /* radix format */ + + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(0, 0, rb, rs, ric, prs, r); +} + +static __always_inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid, + unsigned long ap, unsigned long ric) +{ + unsigned long rb,rs,prs,r; + + rb = va & ~(PPC_BITMASK(52, 63)); + rb |= ap << PPC_BITLSHIFT(58); + rs = lpid; + prs = 0; /* partition scoped */ + r = 1; /* 
radix format */
+
+	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+	trace_tlbie(lpid, 0, rb, rs, ric, prs, r);
+}
+
+
+static inline void fixup_tlbie_va(unsigned long va, unsigned long pid,
+				  unsigned long ap)
+{
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+		asm volatile("ptesync": : :"memory");
+		__tlbie_va(va, 0, ap, RIC_FLUSH_TLB);
+	}
+
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+		asm volatile("ptesync": : :"memory");
+		__tlbie_va(va, pid, ap, RIC_FLUSH_TLB);
+	}
+}
+
+static inline void fixup_tlbie_va_range(unsigned long va, unsigned long pid,
+					unsigned long ap)
+{
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+		asm volatile("ptesync": : :"memory");
+		__tlbie_pid(0, RIC_FLUSH_TLB);
+	}
+
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+		asm volatile("ptesync": : :"memory");
+		__tlbie_va(va, pid, ap, RIC_FLUSH_TLB);
+	}
+}
+
+static inline void fixup_tlbie_pid(unsigned long pid)
+{
+	/*
+	 * We can use any address for the invalidation, pick one which is
+	 * probably unused as an optimisation.
+	 */
+	unsigned long va = ((1UL << 52) - 1);
+
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+		asm volatile("ptesync": : :"memory");
+		__tlbie_pid(0, RIC_FLUSH_TLB);
+	}
+
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+		asm volatile("ptesync": : :"memory");
+		__tlbie_va(va, pid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB);
+	}
+}
+
+static inline void fixup_tlbie_lpid_va(unsigned long va, unsigned long lpid,
+				       unsigned long ap)
+{
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+		asm volatile("ptesync": : :"memory");
+		__tlbie_lpid_va(va, 0, ap, RIC_FLUSH_TLB);
+	}
+
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+		asm volatile("ptesync": : :"memory");
+		__tlbie_lpid_va(va, lpid, ap, RIC_FLUSH_TLB);
+	}
+}
+
+static inline void fixup_tlbie_lpid(unsigned long lpid)
+{
+	/*
+	 * We can use any address for the invalidation, pick one which is
+	 * probably unused as an optimisation.
+	 */
+	unsigned long va = ((1UL << 52) - 1);
+
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+		asm volatile("ptesync": : :"memory");
+		__tlbie_lpid(0, RIC_FLUSH_TLB);
+	}
+
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+		asm volatile("ptesync": : :"memory");
+		__tlbie_lpid_va(va, lpid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB);
+	}
+}
+
+/*
+ * We use 128 sets in radix mode and 256 sets in hpt mode.
+ */
+static inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
+{
+	int set;
+
+	asm volatile("ptesync": : :"memory");
+
+	switch (ric) {
+	case RIC_FLUSH_PWC:
+
+		/* For PWC, only one flush is needed */
+		__tlbiel_pid(pid, 0, RIC_FLUSH_PWC);
+		ppc_after_tlbiel_barrier();
+		return;
+	case RIC_FLUSH_TLB:
+		__tlbiel_pid(pid, 0, RIC_FLUSH_TLB);
+		break;
+	case RIC_FLUSH_ALL:
+	default:
+		/*
+		 * Flush the first set of the TLB, and if
+		 * we're doing a RIC_FLUSH_ALL, also flush
+		 * the entire Page Walk Cache.
+		 */
+		__tlbiel_pid(pid, 0, RIC_FLUSH_ALL);
+	}
+
+	if (!cpu_has_feature(CPU_FTR_ARCH_31)) {
+		/* For the remaining sets, just flush the TLB */
+		for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
+			__tlbiel_pid(pid, set, RIC_FLUSH_TLB);
+	}
+
+	ppc_after_tlbiel_barrier();
+	asm volatile(PPC_RADIX_INVALIDATE_ERAT_USER "; isync" : : :"memory");
+}
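
The "i" operand constraints in the PPC_TLBIEL/PPC_TLBIE_5 sequences above only accept compile-time constants, which is why _tlbie_pid() below re-dispatches its runtime ric argument through a switch whose arms pass literals. A minimal standalone sketch of that pattern follows; example_op() and example_dispatch() are hypothetical names for illustration, not part of this patch, and the sketch relies on inlining plus constant propagation to satisfy the constraint.

	static __always_inline void example_op(const int imm)
	{
		/* "i" requires that imm be a compile-time constant */
		asm volatile("# tlb op, ric=%0" : : "i"(imm));
	}

	static void example_dispatch(int ric)	/* ric known only at run time */
	{
		switch (ric) {
		case 0:
			example_op(0);	/* each arm passes a literal */
			break;
		case 1:
			example_op(1);
			break;
		default:
			example_op(2);
		}
	}
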
+
+static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
+{
+	asm volatile("ptesync": : :"memory");
+
+	/*
+	 * Work around the fact that the "ric" argument to __tlbie_pid
+	 * must be a compile-time constant to match the "i" constraint
+	 * in the asm statement.
+	 */
+	switch (ric) {
+	case RIC_FLUSH_TLB:
+		__tlbie_pid(pid, RIC_FLUSH_TLB);
+		fixup_tlbie_pid(pid);
+		break;
+	case RIC_FLUSH_PWC:
+		__tlbie_pid(pid, RIC_FLUSH_PWC);
+		break;
+	case RIC_FLUSH_ALL:
+	default:
+		__tlbie_pid(pid, RIC_FLUSH_ALL);
+		fixup_tlbie_pid(pid);
+	}
+	asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+struct tlbiel_pid {
+	unsigned long pid;
+	unsigned long ric;
+};
+
+static void do_tlbiel_pid(void *info)
+{
+	struct tlbiel_pid *t = info;
+
+	if (t->ric == RIC_FLUSH_TLB)
+		_tlbiel_pid(t->pid, RIC_FLUSH_TLB);
+	else if (t->ric == RIC_FLUSH_PWC)
+		_tlbiel_pid(t->pid, RIC_FLUSH_PWC);
+	else
+		_tlbiel_pid(t->pid, RIC_FLUSH_ALL);
+}
+
+static inline void _tlbiel_pid_multicast(struct mm_struct *mm,
+					 unsigned long pid, unsigned long ric)
+{
+	struct cpumask *cpus = mm_cpumask(mm);
+	struct tlbiel_pid t = { .pid = pid, .ric = ric };
+
+	on_each_cpu_mask(cpus, do_tlbiel_pid, &t, 1);
+	/*
+	 * Always want the CPU translations to be invalidated with tlbiel in
+	 * these paths, so while coprocessors must use tlbie, we cannot
+	 * optimise away the tlbiel component.
+	 */
+	if (atomic_read(&mm->context.copros) > 0)
+		_tlbie_pid(pid, RIC_FLUSH_ALL);
+}
+
+static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric)
+{
+	asm volatile("ptesync": : :"memory");
+
+	/*
+	 * Work around the fact that the "ric" argument to __tlbie_pid
+	 * must be a compile-time constant to match the "i" constraint
+	 * in the asm statement.
+	 */
+	switch (ric) {
+	case RIC_FLUSH_TLB:
+		__tlbie_lpid(lpid, RIC_FLUSH_TLB);
+		fixup_tlbie_lpid(lpid);
+		break;
+	case RIC_FLUSH_PWC:
+		__tlbie_lpid(lpid, RIC_FLUSH_PWC);
+		break;
+	case RIC_FLUSH_ALL:
+	default:
+		__tlbie_lpid(lpid, RIC_FLUSH_ALL);
+		fixup_tlbie_lpid(lpid);
+	}
+	asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+static __always_inline void _tlbie_lpid_guest(unsigned long lpid, unsigned long ric)
+{
+	/*
+	 * Work around the fact that the "ric" argument to __tlbie_pid
+	 * must be a compile-time constant to match the "i" constraint
+	 * in the asm statement.
+ */ + switch (ric) { + case RIC_FLUSH_TLB: + __tlbie_lpid_guest(lpid, RIC_FLUSH_TLB); + break; + case RIC_FLUSH_PWC: + __tlbie_lpid_guest(lpid, RIC_FLUSH_PWC); + break; + case RIC_FLUSH_ALL: + default: + __tlbie_lpid_guest(lpid, RIC_FLUSH_ALL); + } + fixup_tlbie_lpid(lpid); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); +} + +static inline void __tlbiel_va_range(unsigned long start, unsigned long end, + unsigned long pid, unsigned long page_size, + unsigned long psize) +{ + unsigned long addr; + unsigned long ap = mmu_get_ap(psize); + + for (addr = start; addr < end; addr += page_size) + __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB); +} + +static __always_inline void _tlbiel_va(unsigned long va, unsigned long pid, + unsigned long psize, unsigned long ric) +{ + unsigned long ap = mmu_get_ap(psize); + + asm volatile("ptesync": : :"memory"); + __tlbiel_va(va, pid, ap, ric); + ppc_after_tlbiel_barrier(); +} + +static inline void _tlbiel_va_range(unsigned long start, unsigned long end, + unsigned long pid, unsigned long page_size, + unsigned long psize, bool also_pwc) +{ + asm volatile("ptesync": : :"memory"); + if (also_pwc) + __tlbiel_pid(pid, 0, RIC_FLUSH_PWC); + __tlbiel_va_range(start, end, pid, page_size, psize); + ppc_after_tlbiel_barrier(); +} + +static inline void __tlbie_va_range(unsigned long start, unsigned long end, + unsigned long pid, unsigned long page_size, + unsigned long psize) +{ + unsigned long addr; + unsigned long ap = mmu_get_ap(psize); + + for (addr = start; addr < end; addr += page_size) + __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); + + fixup_tlbie_va_range(addr - page_size, pid, ap); +} + +static __always_inline void _tlbie_va(unsigned long va, unsigned long pid, + unsigned long psize, unsigned long ric) +{ + unsigned long ap = mmu_get_ap(psize); + + asm volatile("ptesync": : :"memory"); + __tlbie_va(va, pid, ap, ric); + fixup_tlbie_va(va, pid, ap); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); +} + +struct tlbiel_va { + unsigned long pid; + unsigned long va; + unsigned long psize; + unsigned long ric; +}; + +static void do_tlbiel_va(void *info) +{ + struct tlbiel_va *t = info; + + if (t->ric == RIC_FLUSH_TLB) + _tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_TLB); + else if (t->ric == RIC_FLUSH_PWC) + _tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_PWC); + else + _tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_ALL); +} + +static inline void _tlbiel_va_multicast(struct mm_struct *mm, + unsigned long va, unsigned long pid, + unsigned long psize, unsigned long ric) +{ + struct cpumask *cpus = mm_cpumask(mm); + struct tlbiel_va t = { .va = va, .pid = pid, .psize = psize, .ric = ric }; + on_each_cpu_mask(cpus, do_tlbiel_va, &t, 1); + if (atomic_read(&mm->context.copros) > 0) + _tlbie_va(va, pid, psize, RIC_FLUSH_TLB); +} + +struct tlbiel_va_range { + unsigned long pid; + unsigned long start; + unsigned long end; + unsigned long page_size; + unsigned long psize; + bool also_pwc; +}; + +static void do_tlbiel_va_range(void *info) +{ + struct tlbiel_va_range *t = info; + + _tlbiel_va_range(t->start, t->end, t->pid, t->page_size, + t->psize, t->also_pwc); +} + +static __always_inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid, + unsigned long psize, unsigned long ric) +{ + unsigned long ap = mmu_get_ap(psize); + + asm volatile("ptesync": : :"memory"); + __tlbie_lpid_va(va, lpid, ap, ric); + fixup_tlbie_lpid_va(va, lpid, ap); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); +} + +static inline void _tlbie_va_range(unsigned long start, unsigned long 
end, + unsigned long pid, unsigned long page_size, + unsigned long psize, bool also_pwc) +{ + asm volatile("ptesync": : :"memory"); + if (also_pwc) + __tlbie_pid(pid, RIC_FLUSH_PWC); + __tlbie_va_range(start, end, pid, page_size, psize); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); +} + +static inline void _tlbiel_va_range_multicast(struct mm_struct *mm, + unsigned long start, unsigned long end, + unsigned long pid, unsigned long page_size, + unsigned long psize, bool also_pwc) +{ + struct cpumask *cpus = mm_cpumask(mm); + struct tlbiel_va_range t = { .start = start, .end = end, + .pid = pid, .page_size = page_size, + .psize = psize, .also_pwc = also_pwc }; + + on_each_cpu_mask(cpus, do_tlbiel_va_range, &t, 1); + if (atomic_read(&mm->context.copros) > 0) + _tlbie_va_range(start, end, pid, page_size, psize, also_pwc); +} + +/* + * Base TLB flushing operations: + * + * - flush_tlb_mm(mm) flushes the specified mm context TLB's + * - flush_tlb_page(vma, vmaddr) flushes one page + * - flush_tlb_range(vma, start, end) flushes a range of pages + * - flush_tlb_kernel_range(start, end) flushes kernel pages + * + * - local_* variants of page and mm only apply to the current + * processor + */ +void radix__local_flush_tlb_mm(struct mm_struct *mm) +{ + unsigned long pid = mm->context.id; + + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) + return; + + preempt_disable(); + _tlbiel_pid(pid, RIC_FLUSH_TLB); + preempt_enable(); +} +EXPORT_SYMBOL(radix__local_flush_tlb_mm); + +#ifndef CONFIG_SMP +void radix__local_flush_all_mm(struct mm_struct *mm) +{ + unsigned long pid = mm->context.id; + + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) + return; + + preempt_disable(); + _tlbiel_pid(pid, RIC_FLUSH_ALL); + preempt_enable(); +} +EXPORT_SYMBOL(radix__local_flush_all_mm); + +static void __flush_all_mm(struct mm_struct *mm, bool fullmm) +{ + radix__local_flush_all_mm(mm); +} +#endif /* CONFIG_SMP */ + +void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, + int psize) +{ + unsigned long pid = mm->context.id; + + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) + return; + + preempt_disable(); + _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); + preempt_enable(); +} + +void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ +#ifdef CONFIG_HUGETLB_PAGE + /* need the return fix for nohash.c */ + if (is_vm_hugetlb_page(vma)) + return radix__local_flush_hugetlb_page(vma, vmaddr); +#endif + radix__local_flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize); +} +EXPORT_SYMBOL(radix__local_flush_tlb_page); + +static bool mm_needs_flush_escalation(struct mm_struct *mm) +{ + /* + * The P9 nest MMU has issues with the page walk cache caching PTEs + * and not flushing them when RIC = 0 for a PID/LPID invalidate. + * + * This may have been fixed in shipping firmware (by disabling PWC + * or preventing it from caching PTEs), but until that is confirmed, + * this workaround is required - escalate all RIC=0 IS=1/2/3 flushes + * to RIC=2. + * + * POWER10 (and P9P) does not have this problem. + */ + if (cpu_has_feature(CPU_FTR_ARCH_31)) + return false; + if (atomic_read(&mm->context.copros) > 0) + return true; + return false; +} + +/* + * If always_flush is true, then flush even if this CPU can't be removed + * from mm_cpumask. 
+ */
+void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush)
+{
+	unsigned long pid = mm->context.id;
+	int cpu = smp_processor_id();
+
+	/*
+	 * A kthread could have done a mmget_not_zero() after the flushing CPU
+	 * checked mm_cpumask, and be in the process of kthread_use_mm when
+	 * interrupted here. In that case, current->mm will be set to mm,
+	 * because kthread_use_mm() setting ->mm and switching to the mm is
+	 * done with interrupts off.
+	 */
+	if (current->mm == mm)
+		goto out;
+
+	if (current->active_mm == mm) {
+		unsigned long flags;
+
+		WARN_ON_ONCE(current->mm != NULL);
+		/*
+		 * It is a kernel thread and is using mm as the lazy tlb, so
+		 * switch it to init_mm. This is not always called from IPI
+		 * (e.g., flush_type_needed), so must disable irqs.
+		 */
+		local_irq_save(flags);
+		mmgrab_lazy_tlb(&init_mm);
+		current->active_mm = &init_mm;
+		switch_mm_irqs_off(mm, &init_mm, current);
+		mmdrop_lazy_tlb(mm);
+		local_irq_restore(flags);
+	}
+
+	/*
+	 * This IPI may be initiated from any source including those not
+	 * running the mm, so there may be a racing IPI that comes after
+	 * this one which finds the cpumask already clear. Check and avoid
+	 * underflowing the active_cpus count in that case. The race should
+	 * not otherwise be a problem, but the TLB must be flushed because
+	 * that's what the caller expects.
+	 */
+	if (cpumask_test_cpu(cpu, mm_cpumask(mm))) {
+		dec_mm_active_cpus(mm);
+		cpumask_clear_cpu(cpu, mm_cpumask(mm));
+		always_flush = true;
+	}
+
+out:
+	if (always_flush)
+		_tlbiel_pid(pid, RIC_FLUSH_ALL);
+}
+
+#ifdef CONFIG_SMP
+static void do_exit_flush_lazy_tlb(void *arg)
+{
+	struct mm_struct *mm = arg;
+	exit_lazy_flush_tlb(mm, true);
+}
+
+static void exit_flush_lazy_tlbs(struct mm_struct *mm)
+{
+	/*
+	 * Would be nice if this was async so it could be run in
+	 * parallel with our local flush, but generic code does not
+	 * give a good API for it. Could extend the generic code or
+	 * make a special powerpc IPI for flushing TLBs.
+	 * For now it's not too performance critical.
+	 */
+	smp_call_function_many(mm_cpumask(mm), do_exit_flush_lazy_tlb,
+			       (void *)mm, 1);
+}
+
+#else /* CONFIG_SMP */
+static inline void exit_flush_lazy_tlbs(struct mm_struct *mm) { }
+#endif /* CONFIG_SMP */
+
+static DEFINE_PER_CPU(unsigned int, mm_cpumask_trim_clock);
+
+/*
+ * Interval between flushes at which we send out IPIs to check whether the
+ * mm_cpumask can be trimmed for the case where it's not a single-threaded
+ * process flushing its own mm. The intent is to reduce the cost of later
+ * flushes. Don't want this to be so low that it adds noticeable cost to TLB
+ * flushing, or so high that it doesn't help reduce global TLBIEs.
+ */
+static unsigned long tlb_mm_cpumask_trim_timer = 1073;
+
+static bool tick_and_test_trim_clock(void)
+{
+	if (__this_cpu_inc_return(mm_cpumask_trim_clock) ==
+			tlb_mm_cpumask_trim_timer) {
+		__this_cpu_write(mm_cpumask_trim_clock, 0);
+		return true;
+	}
+	return false;
+}
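
tick_and_test_trim_clock() above is a per-CPU event counter that fires once every tlb_mm_cpumask_trim_timer invocations on a given CPU, keeping the expensive trimming IPIs off the common path. A reduced sketch of the same idiom, assuming a hypothetical example_trim_work() and a fixed period of 1024:

	#include <linux/percpu.h>

	static DEFINE_PER_CPU(unsigned int, example_clock);

	static void example_trim_work(void)
	{
		/* hypothetical slow-path work, e.g. sending trim IPIs */
	}

	static void example_flush_path(void)
	{
		/* cheap increment; fires on every 1024th call on this CPU */
		if (__this_cpu_inc_return(example_clock) == 1024) {
			__this_cpu_write(example_clock, 0);
			example_trim_work();
		}
	}
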
+enum tlb_flush_type {
+	FLUSH_TYPE_NONE,
+	FLUSH_TYPE_LOCAL,
+	FLUSH_TYPE_GLOBAL,
+};
+
+static enum tlb_flush_type flush_type_needed(struct mm_struct *mm, bool fullmm)
+{
+	int active_cpus = atomic_read(&mm->context.active_cpus);
+	int cpu = smp_processor_id();
+
+	if (active_cpus == 0)
+		return FLUSH_TYPE_NONE;
+	if (active_cpus == 1 && cpumask_test_cpu(cpu, mm_cpumask(mm))) {
+		if (current->mm != mm) {
+			/*
+			 * Asynchronous flush sources may trim down to nothing
+			 * if the process is not running, so occasionally try
+			 * to trim.
+			 */
+			if (tick_and_test_trim_clock()) {
+				exit_lazy_flush_tlb(mm, true);
+				return FLUSH_TYPE_NONE;
+			}
+		}
+		return FLUSH_TYPE_LOCAL;
+	}
+
+	/* Coprocessors require TLBIE to invalidate nMMU. */
+	if (atomic_read(&mm->context.copros) > 0)
+		return FLUSH_TYPE_GLOBAL;
+
+	/*
+	 * In the fullmm case there's no point doing the exit_flush_lazy_tlbs
+	 * because the mm is being taken down anyway, and a TLBIE tends to
+	 * be faster than an IPI+TLBIEL.
+	 */
+	if (fullmm)
+		return FLUSH_TYPE_GLOBAL;
+
+	/*
+	 * If we are running the only thread of a single-threaded process,
+	 * then we should almost always be able to trim off the rest of the
+	 * CPU mask (except in the case of use_mm() races), so always try
+	 * trimming the mask.
+	 */
+	if (atomic_read(&mm->mm_users) <= 1 && current->mm == mm) {
+		exit_flush_lazy_tlbs(mm);
+		/*
+		 * A use_mm() race could prevent IPIs from being able to clear
+		 * the cpumask here; however, those users are established
+		 * after our first check (and so after the PTEs are removed),
+		 * and the TLB still gets flushed by the IPI, so this CPU
+		 * will only require a local flush.
+		 */
+		return FLUSH_TYPE_LOCAL;
+	}
+
+	/*
+	 * Occasionally try to trim down the cpumask. It's possible this can
+	 * bring the mask to zero, which results in no flush.
+	 */
+	if (tick_and_test_trim_clock()) {
+		exit_flush_lazy_tlbs(mm);
+		if (current->mm == mm)
+			return FLUSH_TYPE_LOCAL;
+		if (cpumask_test_cpu(cpu, mm_cpumask(mm)))
+			exit_lazy_flush_tlb(mm, true);
+		return FLUSH_TYPE_NONE;
+	}
+
+	return FLUSH_TYPE_GLOBAL;
+}
+
+#ifdef CONFIG_SMP
+void radix__flush_tlb_mm(struct mm_struct *mm)
+{
+	unsigned long pid;
+	enum tlb_flush_type type;
+
+	pid = mm->context.id;
+	if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
+		return;
+
+	preempt_disable();
+	/*
+	 * Order loads of mm_cpumask (in flush_type_needed) vs previous
+	 * stores to clear ptes before the invalidate.
See barrier in + * switch_mm_irqs_off + */ + smp_mb(); + type = flush_type_needed(mm, false); + if (type == FLUSH_TYPE_LOCAL) { + _tlbiel_pid(pid, RIC_FLUSH_TLB); + } else if (type == FLUSH_TYPE_GLOBAL) { + if (!mmu_has_feature(MMU_FTR_GTSE)) { + unsigned long tgt = H_RPTI_TARGET_CMMU; + + if (atomic_read(&mm->context.copros) > 0) + tgt |= H_RPTI_TARGET_NMMU; + pseries_rpt_invalidate(pid, tgt, H_RPTI_TYPE_TLB, + H_RPTI_PAGE_ALL, 0, -1UL); + } else if (cputlb_use_tlbie()) { + if (mm_needs_flush_escalation(mm)) + _tlbie_pid(pid, RIC_FLUSH_ALL); + else + _tlbie_pid(pid, RIC_FLUSH_TLB); + } else { + _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_TLB); + } + } + preempt_enable(); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); +} +EXPORT_SYMBOL(radix__flush_tlb_mm); + +static void __flush_all_mm(struct mm_struct *mm, bool fullmm) +{ + unsigned long pid; + enum tlb_flush_type type; + + pid = mm->context.id; + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) + return; + + preempt_disable(); + smp_mb(); /* see radix__flush_tlb_mm */ + type = flush_type_needed(mm, fullmm); + if (type == FLUSH_TYPE_LOCAL) { + _tlbiel_pid(pid, RIC_FLUSH_ALL); + } else if (type == FLUSH_TYPE_GLOBAL) { + if (!mmu_has_feature(MMU_FTR_GTSE)) { + unsigned long tgt = H_RPTI_TARGET_CMMU; + unsigned long type = H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC | + H_RPTI_TYPE_PRT; + + if (atomic_read(&mm->context.copros) > 0) + tgt |= H_RPTI_TARGET_NMMU; + pseries_rpt_invalidate(pid, tgt, type, + H_RPTI_PAGE_ALL, 0, -1UL); + } else if (cputlb_use_tlbie()) + _tlbie_pid(pid, RIC_FLUSH_ALL); + else + _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL); + } + preempt_enable(); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); +} + +void radix__flush_all_mm(struct mm_struct *mm) +{ + __flush_all_mm(mm, false); +} +EXPORT_SYMBOL(radix__flush_all_mm); + +void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, + int psize) +{ + unsigned long pid; + enum tlb_flush_type type; + + pid = mm->context.id; + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) + return; + + preempt_disable(); + smp_mb(); /* see radix__flush_tlb_mm */ + type = flush_type_needed(mm, false); + if (type == FLUSH_TYPE_LOCAL) { + _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); + } else if (type == FLUSH_TYPE_GLOBAL) { + if (!mmu_has_feature(MMU_FTR_GTSE)) { + unsigned long tgt, pg_sizes, size; + + tgt = H_RPTI_TARGET_CMMU; + pg_sizes = psize_to_rpti_pgsize(psize); + size = 1UL << mmu_psize_to_shift(psize); + + if (atomic_read(&mm->context.copros) > 0) + tgt |= H_RPTI_TARGET_NMMU; + pseries_rpt_invalidate(pid, tgt, H_RPTI_TYPE_TLB, + pg_sizes, vmaddr, + vmaddr + size); + } else if (cputlb_use_tlbie()) + _tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB); + else + _tlbiel_va_multicast(mm, vmaddr, pid, psize, RIC_FLUSH_TLB); + } + preempt_enable(); +} + +void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ +#ifdef CONFIG_HUGETLB_PAGE + if (is_vm_hugetlb_page(vma)) + return radix__flush_hugetlb_page(vma, vmaddr); +#endif + radix__flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize); +} +EXPORT_SYMBOL(radix__flush_tlb_page); + +#endif /* CONFIG_SMP */ + +static void do_tlbiel_kernel(void *info) +{ + _tlbiel_pid(0, RIC_FLUSH_ALL); +} + +static inline void _tlbiel_kernel_broadcast(void) +{ + on_each_cpu(do_tlbiel_kernel, NULL, 1); + if (tlbie_capable) { + /* + * Coherent accelerators don't refcount kernel memory mappings, + * so have to always issue a tlbie for them. This is quite a + * slow path anyway. 
+ */
+		_tlbie_pid(0, RIC_FLUSH_ALL);
+	}
+}
+
+/*
+ * If kernel TLBIs ever become local rather than global, then
+ * drivers/misc/ocxl/link.c:ocxl_link_add_pe will need some work, as it
+ * assumes kernel TLBIs are global.
+ */
+void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+	if (!mmu_has_feature(MMU_FTR_GTSE)) {
+		unsigned long tgt = H_RPTI_TARGET_CMMU | H_RPTI_TARGET_NMMU;
+		unsigned long type = H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC |
+				     H_RPTI_TYPE_PRT;
+
+		pseries_rpt_invalidate(0, tgt, type, H_RPTI_PAGE_ALL,
+				       start, end);
+	} else if (cputlb_use_tlbie())
+		_tlbie_pid(0, RIC_FLUSH_ALL);
+	else
+		_tlbiel_kernel_broadcast();
+}
+EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
+
+/*
+ * Doesn't appear to be used anywhere. Remove.
+ */
+#define TLB_FLUSH_ALL -1UL
+
+/*
+ * Number of pages above which we invalidate the entire PID rather than
+ * flush individual pages, for local and global flushes respectively.
+ *
+ * tlbie goes out to the interconnect and individual ops are more costly.
+ * It also does not iterate over sets like the local tlbiel variant when
+ * invalidating a full PID, so it has a far lower threshold to change from
+ * individual page flushes to full-pid flushes.
+ */
+static u32 tlb_single_page_flush_ceiling __read_mostly = 33;
+static u32 tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2;
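
__radix__flush_tlb_range() below compares the page count of the range against these ceilings to decide between per-page invalidations and one full-PID flush. A standalone sketch of that decision, using the default values above (choose_full_pid_flush() is a hypothetical name; POWER9_TLB_SETS_RADIX is 128):

	static bool choose_full_pid_flush(unsigned long start, unsigned long end,
					  unsigned int page_shift, bool global)
	{
		unsigned long nr_pages = (end - start) >> page_shift;

		/* global tlbie: >33 pages; local tlbiel: >256 pages (128 * 2) */
		return nr_pages > (global ? 33UL : 128UL * 2);
	}
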
+static inline void __radix__flush_tlb_range(struct mm_struct *mm,
+					    unsigned long start, unsigned long end)
+{
+	unsigned long pid;
+	unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift;
+	unsigned long page_size = 1UL << page_shift;
+	unsigned long nr_pages = (end - start) >> page_shift;
+	bool flush_pid, flush_pwc = false;
+	enum tlb_flush_type type;
+
+	pid = mm->context.id;
+	if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
+		return;
+
+	WARN_ON_ONCE(end == TLB_FLUSH_ALL);
+
+	preempt_disable();
+	smp_mb(); /* see radix__flush_tlb_mm */
+	type = flush_type_needed(mm, false);
+	if (type == FLUSH_TYPE_NONE)
+		goto out;
+
+	if (type == FLUSH_TYPE_GLOBAL)
+		flush_pid = nr_pages > tlb_single_page_flush_ceiling;
+	else
+		flush_pid = nr_pages > tlb_local_single_page_flush_ceiling;
+	/*
+	 * A full PID flush already does the PWC flush. If it is not a full
+	 * PID flush, check whether the range is more than a PMD and force a
+	 * PWC flush; mremap() depends on this behaviour.
+	 */
+	if (!flush_pid && (end - start) >= PMD_SIZE)
+		flush_pwc = true;
+
+	if (!mmu_has_feature(MMU_FTR_GTSE) && type == FLUSH_TYPE_GLOBAL) {
+		unsigned long type = H_RPTI_TYPE_TLB;
+		unsigned long tgt = H_RPTI_TARGET_CMMU;
+		unsigned long pg_sizes = psize_to_rpti_pgsize(mmu_virtual_psize);
+
+		if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+			pg_sizes |= psize_to_rpti_pgsize(MMU_PAGE_2M);
+		if (atomic_read(&mm->context.copros) > 0)
+			tgt |= H_RPTI_TARGET_NMMU;
+		if (flush_pwc)
+			type |= H_RPTI_TYPE_PWC;
+		pseries_rpt_invalidate(pid, tgt, type, pg_sizes, start, end);
+	} else if (flush_pid) {
+		/*
+		 * We are now flushing a range larger than PMD size, so force
+		 * a RIC_FLUSH_ALL.
+		 */
+		if (type == FLUSH_TYPE_LOCAL) {
+			_tlbiel_pid(pid, RIC_FLUSH_ALL);
+		} else {
+			if (cputlb_use_tlbie()) {
+				_tlbie_pid(pid, RIC_FLUSH_ALL);
+			} else {
+				_tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL);
+			}
+		}
+	} else {
+		bool hflush;
+		unsigned long hstart, hend;
+
+		hstart = (start + PMD_SIZE - 1) & PMD_MASK;
+		hend = end & PMD_MASK;
+		hflush = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hstart < hend;
+
+		if (type == FLUSH_TYPE_LOCAL) {
+			asm volatile("ptesync": : :"memory");
+			if (flush_pwc)
+				/* For PWC, only one flush is needed */
+				__tlbiel_pid(pid, 0, RIC_FLUSH_PWC);
+			__tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize);
+			if (hflush)
+				__tlbiel_va_range(hstart, hend, pid,
+						PMD_SIZE, MMU_PAGE_2M);
+			ppc_after_tlbiel_barrier();
+		} else if (cputlb_use_tlbie()) {
+			asm volatile("ptesync": : :"memory");
+			if (flush_pwc)
+				__tlbie_pid(pid, RIC_FLUSH_PWC);
+			__tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize);
+			if (hflush)
+				__tlbie_va_range(hstart, hend, pid,
+						PMD_SIZE, MMU_PAGE_2M);
+			asm volatile("eieio; tlbsync; ptesync": : :"memory");
+		} else {
+			_tlbiel_va_range_multicast(mm,
+					start, end, pid, page_size, mmu_virtual_psize, flush_pwc);
+			if (hflush)
+				_tlbiel_va_range_multicast(mm,
+					hstart, hend, pid, PMD_SIZE, MMU_PAGE_2M, flush_pwc);
+		}
+	}
+out:
+	preempt_enable();
+	mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
+}
+
+void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
+			    unsigned long end)
+
+{
+#ifdef CONFIG_HUGETLB_PAGE
+	if (is_vm_hugetlb_page(vma))
+		return radix__flush_hugetlb_tlb_range(vma, start, end);
+#endif
+
+	__radix__flush_tlb_range(vma->vm_mm, start, end);
+}
+EXPORT_SYMBOL(radix__flush_tlb_range);
+
+static int radix_get_mmu_psize(int page_size)
+{
+	int psize;
+
+	if (page_size == (1UL << mmu_psize_defs[mmu_virtual_psize].shift))
+		psize = mmu_virtual_psize;
+	else if (page_size == (1UL << mmu_psize_defs[MMU_PAGE_2M].shift))
+		psize = MMU_PAGE_2M;
+	else if (page_size == (1UL << mmu_psize_defs[MMU_PAGE_1G].shift))
+		psize = MMU_PAGE_1G;
+	else
+		return -1;
+	return psize;
+}
+
+/*
+ * Flush partition scoped LPID address translation for all CPUs.
+ */
+void radix__flush_tlb_lpid_page(unsigned int lpid,
+				unsigned long addr,
+				unsigned long page_size)
+{
+	int psize = radix_get_mmu_psize(page_size);
+
+	_tlbie_lpid_va(addr, lpid, psize, RIC_FLUSH_TLB);
+}
+EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid_page);
+
+/*
+ * Flush partition scoped PWC from LPID for all CPUs.
+ */ +void radix__flush_pwc_lpid(unsigned int lpid) +{ + _tlbie_lpid(lpid, RIC_FLUSH_PWC); +} +EXPORT_SYMBOL_GPL(radix__flush_pwc_lpid); + +/* + * Flush partition scoped translations from LPID (=LPIDR) + */ +void radix__flush_all_lpid(unsigned int lpid) +{ + _tlbie_lpid(lpid, RIC_FLUSH_ALL); +} +EXPORT_SYMBOL_GPL(radix__flush_all_lpid); + +/* + * Flush process scoped translations from LPID (=LPIDR) + */ +void radix__flush_all_lpid_guest(unsigned int lpid) +{ + _tlbie_lpid_guest(lpid, RIC_FLUSH_ALL); +} + +void radix__tlb_flush(struct mmu_gather *tlb) +{ + int psize = 0; + struct mm_struct *mm = tlb->mm; + int page_size = tlb->page_size; + unsigned long start = tlb->start; + unsigned long end = tlb->end; + + /* + * if page size is not something we understand, do a full mm flush + * + * A "fullmm" flush must always do a flush_all_mm (RIC=2) flush + * that flushes the process table entry cache upon process teardown. + * See the comment for radix in arch_exit_mmap(). + */ + if (tlb->fullmm) { + if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) { + /* + * Shootdown based lazy tlb mm refcounting means we + * have to IPI everyone in the mm_cpumask anyway soon + * when the mm goes away, so might as well do it as + * part of the final flush now. + * + * If lazy shootdown was improved to reduce IPIs (e.g., + * by batching), then it may end up being better to use + * tlbies here instead. + */ + preempt_disable(); + + smp_mb(); /* see radix__flush_tlb_mm */ + exit_flush_lazy_tlbs(mm); + __flush_all_mm(mm, true); + + preempt_enable(); + } else { + __flush_all_mm(mm, true); + } + + } else if ( (psize = radix_get_mmu_psize(page_size)) == -1) { + if (!tlb->freed_tables) + radix__flush_tlb_mm(mm); + else + radix__flush_all_mm(mm); + } else { + if (!tlb->freed_tables) + radix__flush_tlb_range_psize(mm, start, end, psize); + else + radix__flush_tlb_pwc_range_psize(mm, start, end, psize); + } +} + +static void __radix__flush_tlb_range_psize(struct mm_struct *mm, + unsigned long start, unsigned long end, + int psize, bool also_pwc) +{ + unsigned long pid; + unsigned int page_shift = mmu_psize_defs[psize].shift; + unsigned long page_size = 1UL << page_shift; + unsigned long nr_pages = (end - start) >> page_shift; + bool flush_pid; + enum tlb_flush_type type; + + pid = mm->context.id; + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) + return; + + WARN_ON_ONCE(end == TLB_FLUSH_ALL); + + preempt_disable(); + smp_mb(); /* see radix__flush_tlb_mm */ + type = flush_type_needed(mm, false); + if (type == FLUSH_TYPE_NONE) + goto out; + + if (type == FLUSH_TYPE_GLOBAL) + flush_pid = nr_pages > tlb_single_page_flush_ceiling; + else + flush_pid = nr_pages > tlb_local_single_page_flush_ceiling; + + if (!mmu_has_feature(MMU_FTR_GTSE) && type == FLUSH_TYPE_GLOBAL) { + unsigned long tgt = H_RPTI_TARGET_CMMU; + unsigned long type = H_RPTI_TYPE_TLB; + unsigned long pg_sizes = psize_to_rpti_pgsize(psize); + + if (also_pwc) + type |= H_RPTI_TYPE_PWC; + if (atomic_read(&mm->context.copros) > 0) + tgt |= H_RPTI_TARGET_NMMU; + pseries_rpt_invalidate(pid, tgt, type, pg_sizes, start, end); + } else if (flush_pid) { + if (type == FLUSH_TYPE_LOCAL) { + _tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); + } else { + if (cputlb_use_tlbie()) { + if (mm_needs_flush_escalation(mm)) + also_pwc = true; + + _tlbie_pid(pid, + also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); + } else { + _tlbiel_pid_multicast(mm, pid, + also_pwc ? 
RIC_FLUSH_ALL : RIC_FLUSH_TLB);
+			}
+
+		}
+	} else {
+		if (type == FLUSH_TYPE_LOCAL)
+			_tlbiel_va_range(start, end, pid, page_size, psize, also_pwc);
+		else if (cputlb_use_tlbie())
+			_tlbie_va_range(start, end, pid, page_size, psize, also_pwc);
+		else
+			_tlbiel_va_range_multicast(mm,
+					start, end, pid, page_size, psize, also_pwc);
+	}
+out:
+	preempt_enable();
+	mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
+}
+
+void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
+				  unsigned long end, int psize)
+{
+	return __radix__flush_tlb_range_psize(mm, start, end, psize, false);
+}
+
+void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
+				      unsigned long end, int psize)
+{
+	__radix__flush_tlb_range_psize(mm, start, end, psize, true);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
+{
+	unsigned long pid, end;
+	enum tlb_flush_type type;
+
+	pid = mm->context.id;
+	if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
+		return;
+
+	/* 4k page size, just blow the world */
+	if (PAGE_SIZE == 0x1000) {
+		radix__flush_all_mm(mm);
+		return;
+	}
+
+	end = addr + HPAGE_PMD_SIZE;
+
+	/* Otherwise first do the PWC, then iterate the pages. */
+	preempt_disable();
+	smp_mb(); /* see radix__flush_tlb_mm */
+	type = flush_type_needed(mm, false);
+	if (type == FLUSH_TYPE_LOCAL) {
+		_tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
+	} else if (type == FLUSH_TYPE_GLOBAL) {
+		if (!mmu_has_feature(MMU_FTR_GTSE)) {
+			unsigned long tgt, type, pg_sizes;
+
+			tgt = H_RPTI_TARGET_CMMU;
+			type = H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC |
+			       H_RPTI_TYPE_PRT;
+			pg_sizes = psize_to_rpti_pgsize(mmu_virtual_psize);
+
+			if (atomic_read(&mm->context.copros) > 0)
+				tgt |= H_RPTI_TARGET_NMMU;
+			pseries_rpt_invalidate(pid, tgt, type, pg_sizes,
+					       addr, end);
+		} else if (cputlb_use_tlbie())
+			_tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
+		else
+			_tlbiel_va_range_multicast(mm,
+					addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
+	}
+
+	preempt_enable();
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+void radix__flush_pmd_tlb_range(struct vm_area_struct *vma,
+				unsigned long start, unsigned long end)
+{
+	radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_2M);
+}
+EXPORT_SYMBOL(radix__flush_pmd_tlb_range);
+
+void radix__flush_pud_tlb_range(struct vm_area_struct *vma,
+				unsigned long start, unsigned long end)
+{
+	radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_1G);
+}
+EXPORT_SYMBOL(radix__flush_pud_tlb_range);
+
+void radix__flush_tlb_all(void)
+{
+	unsigned long rb, prs, r, rs;
+	unsigned long ric = RIC_FLUSH_ALL;
+
+	rb = 0x3 << PPC_BITLSHIFT(53); /* IS = 3 */
+	prs = 0; /* partition scoped */
+	r = 1; /* radix format */
+	rs = 1 & ((1UL << 32) - 1); /* any LPID value to flush guest mappings */
+
+	asm volatile("ptesync": : :"memory");
+	/*
+	 * Now flush guest entries by passing PRS = 1 and LPID != 0
+	 */
+	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+		     : : "r"(rb), "i"(r), "i"(1), "i"(ric), "r"(rs) : "memory");
+	/*
+	 * Now flush host entries by passing PRS = 0 and LPID == 0
+	 */
+	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(0) : "memory");
+	asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
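
The RB encodings in this file use IBM MSB-0 bit numbering: in the kernel, PPC_BIT(n) is 1UL << (63 - n) and PPC_BITLSHIFT(n) is 63 - n. A small worked sketch of how the IS field lands in the right position (the EX_ macros are standalone re-definitions for illustration only, not the kernel's):

	#define EX_PPC_BITLSHIFT(n)	(63 - (n))	/* shift amount for MSB-0 bit n */
	#define EX_PPC_BIT(n)		(1UL << EX_PPC_BITLSHIFT(n))

	/* IS = 1 (PID-scoped): MSB-0 bit 53 => 1UL << 10 == 0x400 */
	unsigned long rb_is1 = EX_PPC_BIT(53);

	/* IS = 3 (all entries), as in radix__flush_tlb_all(): 0x3 << 10 == 0xc00 */
	unsigned long rb_is3 = 0x3UL << EX_PPC_BITLSHIFT(53);
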
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+static __always_inline void __tlbie_pid_lpid(unsigned long pid,
+					     unsigned long lpid,
+					     unsigned long ric)
+{
+	unsigned long rb, rs, prs, r;
+
+	rb = PPC_BIT(53); /* IS = 1 */
+	rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31)));
+	prs = 1; /* process scoped */
+	r = 1;   /* radix format */
+
+	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+	trace_tlbie(0, 0, rb, rs, ric, prs, r);
+}
+
+static __always_inline void __tlbie_va_lpid(unsigned long va, unsigned long pid,
+					    unsigned long lpid,
+					    unsigned long ap, unsigned long ric)
+{
+	unsigned long rb, rs, prs, r;
+
+	rb = va & ~(PPC_BITMASK(52, 63));
+	rb |= ap << PPC_BITLSHIFT(58);
+	rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31)));
+	prs = 1; /* process scoped */
+	r = 1;   /* radix format */
+
+	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+	trace_tlbie(0, 0, rb, rs, ric, prs, r);
+}
+
+static inline void fixup_tlbie_pid_lpid(unsigned long pid, unsigned long lpid)
+{
+	/*
+	 * We can use any address for the invalidation, pick one which is
+	 * probably unused as an optimisation.
+	 */
+	unsigned long va = ((1UL << 52) - 1);
+
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+		asm volatile("ptesync" : : : "memory");
+		__tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB);
+	}
+
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+		asm volatile("ptesync" : : : "memory");
+		__tlbie_va_lpid(va, pid, lpid, mmu_get_ap(MMU_PAGE_64K),
+				RIC_FLUSH_TLB);
+	}
+}
+
+static inline void _tlbie_pid_lpid(unsigned long pid, unsigned long lpid,
+				   unsigned long ric)
+{
+	asm volatile("ptesync" : : : "memory");
+
+	/*
+	 * Work around the fact that the "ric" argument to __tlbie_pid
+	 * must be a compile-time constant to match the "i" constraint
+	 * in the asm statement.
+	 */
+	switch (ric) {
+	case RIC_FLUSH_TLB:
+		__tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB);
+		fixup_tlbie_pid_lpid(pid, lpid);
+		break;
+	case RIC_FLUSH_PWC:
+		__tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
+		break;
+	case RIC_FLUSH_ALL:
+	default:
+		__tlbie_pid_lpid(pid, lpid, RIC_FLUSH_ALL);
+		fixup_tlbie_pid_lpid(pid, lpid);
+	}
+	asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+}
+
+static inline void fixup_tlbie_va_range_lpid(unsigned long va,
+					     unsigned long pid,
+					     unsigned long lpid,
+					     unsigned long ap)
+{
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+		asm volatile("ptesync" : : : "memory");
+		__tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB);
+	}
+
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+		asm volatile("ptesync" : : : "memory");
+		__tlbie_va_lpid(va, pid, lpid, ap, RIC_FLUSH_TLB);
+	}
+}
+
+static inline void __tlbie_va_range_lpid(unsigned long start, unsigned long end,
+					 unsigned long pid, unsigned long lpid,
+					 unsigned long page_size,
+					 unsigned long psize)
+{
+	unsigned long addr;
+	unsigned long ap = mmu_get_ap(psize);
+
+	for (addr = start; addr < end; addr += page_size)
+		__tlbie_va_lpid(addr, pid, lpid, ap, RIC_FLUSH_TLB);
+
+	fixup_tlbie_va_range_lpid(addr - page_size, pid, lpid, ap);
+}
+
+static inline void _tlbie_va_range_lpid(unsigned long start, unsigned long end,
+					unsigned long pid, unsigned long lpid,
+					unsigned long page_size,
+					unsigned long psize, bool also_pwc)
+{
+	asm volatile("ptesync" : : : "memory");
+	if (also_pwc)
+		__tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
+	__tlbie_va_range_lpid(start, end, pid, lpid, page_size, psize);
+	asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+}
+
+/*
+ * Performs process-scoped invalidations for a given LPID
+ * as part of the H_RPT_INVALIDATE hcall.
+ */ +void do_h_rpt_invalidate_prt(unsigned long pid, unsigned long lpid, + unsigned long type, unsigned long pg_sizes, + unsigned long start, unsigned long end) +{ + unsigned long psize, nr_pages; + struct mmu_psize_def *def; + bool flush_pid; + + /* + * A H_RPTI_TYPE_ALL request implies RIC=3, hence + * do a single IS=1 based flush. + */ + if ((type & H_RPTI_TYPE_ALL) == H_RPTI_TYPE_ALL) { + _tlbie_pid_lpid(pid, lpid, RIC_FLUSH_ALL); + return; + } + + if (type & H_RPTI_TYPE_PWC) + _tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC); + + /* Full PID flush */ + if (start == 0 && end == -1) + return _tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB); + + /* Do range invalidation for all the valid page sizes */ + for (psize = 0; psize < MMU_PAGE_COUNT; psize++) { + def = &mmu_psize_defs[psize]; + if (!(pg_sizes & def->h_rpt_pgsize)) + continue; + + nr_pages = (end - start) >> def->shift; + flush_pid = nr_pages > tlb_single_page_flush_ceiling; + + /* + * If the number of pages spanning the range is above + * the ceiling, convert the request into a full PID flush. + * And since PID flush takes out all the page sizes, there + * is no need to consider remaining page sizes. + */ + if (flush_pid) { + _tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB); + return; + } + _tlbie_va_range_lpid(start, end, pid, lpid, + (1UL << def->shift), psize, false); + } +} +EXPORT_SYMBOL_GPL(do_h_rpt_invalidate_prt); + +#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ + +static int __init create_tlb_single_page_flush_ceiling(void) +{ + debugfs_create_u32("tlb_single_page_flush_ceiling", 0600, + arch_debugfs_dir, &tlb_single_page_flush_ceiling); + debugfs_create_u32("tlb_local_single_page_flush_ceiling", 0600, + arch_debugfs_dir, &tlb_local_single_page_flush_ceiling); + return 0; +} +late_initcall(create_tlb_single_page_flush_ceiling); + diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c new file mode 100644 index 000000000000..15f73abd1506 --- /dev/null +++ b/arch/powerpc/mm/book3s64/slb.c @@ -0,0 +1,795 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * PowerPC64 SLB support. 
+ * + * Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM + * Based on earlier code written by: + * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com + * Copyright (c) 2001 Dave Engebretsen + * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM + */ + +#include <asm/interrupt.h> +#include <asm/mmu.h> +#include <asm/mmu_context.h> +#include <asm/paca.h> +#include <asm/lppaca.h> +#include <asm/ppc-opcode.h> +#include <asm/cputable.h> +#include <asm/cacheflush.h> +#include <asm/smp.h> +#include <linux/compiler.h> +#include <linux/context_tracking.h> +#include <linux/mm_types.h> +#include <linux/pgtable.h> + +#include <asm/udbg.h> +#include <asm/text-patching.h> + +#include "internal.h" + + +static long slb_allocate_user(struct mm_struct *mm, unsigned long ea); + +bool stress_slb_enabled __initdata; + +static int __init parse_stress_slb(char *p) +{ + stress_slb_enabled = true; + return 0; +} +early_param("stress_slb", parse_stress_slb); + +__ro_after_init DEFINE_STATIC_KEY_FALSE(stress_slb_key); + +bool no_slb_preload __initdata; +static int __init parse_no_slb_preload(char *p) +{ + no_slb_preload = true; + return 0; +} +early_param("no_slb_preload", parse_no_slb_preload); +__ro_after_init DEFINE_STATIC_KEY_FALSE(no_slb_preload_key); + +static void assert_slb_presence(bool present, unsigned long ea) +{ +#ifdef CONFIG_DEBUG_VM + unsigned long tmp; + + WARN_ON_ONCE(mfmsr() & MSR_EE); + + if (!cpu_has_feature(CPU_FTR_ARCH_206)) + return; + + /* + * slbfee. requires bit 24 (PPC bit 39) be clear in RB. Hardware + * ignores all other bits from 0-27, so just clear them all. + */ + ea &= ~((1UL << SID_SHIFT) - 1); + asm volatile(__PPC_SLBFEE_DOT(%0, %1) : "=r"(tmp) : "r"(ea) : "cr0"); + + WARN_ON(present == (tmp == 0)); +#endif +} + +static inline void slb_shadow_update(unsigned long ea, int ssize, + unsigned long flags, + enum slb_index index) +{ + struct slb_shadow *p = get_slb_shadow(); + + /* + * Clear the ESID first so the entry is not valid while we are + * updating it. No write barriers are needed here, provided + * we only update the current CPU's SLB shadow buffer. + */ + WRITE_ONCE(p->save_area[index].esid, 0); + WRITE_ONCE(p->save_area[index].vsid, cpu_to_be64(mk_vsid_data(ea, ssize, flags))); + WRITE_ONCE(p->save_area[index].esid, cpu_to_be64(mk_esid_data(ea, ssize, index))); +} + +static inline void slb_shadow_clear(enum slb_index index) +{ + WRITE_ONCE(get_slb_shadow()->save_area[index].esid, cpu_to_be64(index)); +} + +static inline void create_shadowed_slbe(unsigned long ea, int ssize, + unsigned long flags, + enum slb_index index) +{ + /* + * Updating the shadow buffer before writing the SLB ensures + * we don't get a stale entry here if we get preempted by PHYP + * between these two statements. + */ + slb_shadow_update(ea, ssize, flags, index); + + assert_slb_presence(false, ea); + asm volatile("slbmte %0,%1" : + : "r" (mk_vsid_data(ea, ssize, flags)), + "r" (mk_esid_data(ea, ssize, index)) + : "memory" ); +} + +/* + * Insert bolted entries into SLB (which may not be empty, so don't clear + * slb_cache_ptr). + */ +void __slb_restore_bolted_realmode(void) +{ + struct slb_shadow *p = get_slb_shadow(); + enum slb_index index; + + /* No isync needed because realmode. */ + for (index = 0; index < SLB_NUM_BOLTED; index++) { + asm volatile("slbmte %0,%1" : + : "r" (be64_to_cpu(p->save_area[index].vsid)), + "r" (be64_to_cpu(p->save_area[index].esid))); + } + + assert_slb_presence(true, local_paca->kstack); +} + +/* + * Insert the bolted entries into an empty SLB. 
+ */
+void slb_restore_bolted_realmode(void)
+{
+	__slb_restore_bolted_realmode();
+	get_paca()->slb_cache_ptr = 0;
+
+	get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
+	get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
+}
+
+/*
+ * This flushes all SLB entries including 0, so it must be realmode.
+ */
+void slb_flush_all_realmode(void)
+{
+	asm volatile("slbmte %0,%0; slbia" : : "r" (0));
+}
+
+static __always_inline void __slb_flush_and_restore_bolted(bool preserve_kernel_lookaside)
+{
+	struct slb_shadow *p = get_slb_shadow();
+	unsigned long ksp_esid_data, ksp_vsid_data;
+	u32 ih;
+
+	/*
+	 * SLBIA IH=1 on ISA v2.05 and newer processors may preserve lookaside
+	 * information created with Class=0 entries, which we use for kernel
+	 * SLB entries (the SLB entries themselves are still invalidated).
+	 *
+	 * Older processors will ignore this optimisation. Over-invalidation
+	 * is fine because we never rely on lookaside information existing.
+	 */
+	if (preserve_kernel_lookaside)
+		ih = 1;
+	else
+		ih = 0;
+
+	ksp_esid_data = be64_to_cpu(p->save_area[KSTACK_INDEX].esid);
+	ksp_vsid_data = be64_to_cpu(p->save_area[KSTACK_INDEX].vsid);
+
+	asm volatile(PPC_SLBIA(%0)" \n"
+		     "slbmte %1, %2 \n"
+		     :: "i" (ih),
+			"r" (ksp_vsid_data),
+			"r" (ksp_esid_data)
+		     : "memory");
+}
+
+/*
+ * This flushes non-bolted entries; it can be run in virtual mode. Must
+ * be called with interrupts disabled.
+ */
+void slb_flush_and_restore_bolted(void)
+{
+	BUILD_BUG_ON(SLB_NUM_BOLTED != 2);
+
+	WARN_ON(!irqs_disabled());
+
+	/*
+	 * We can't take a PMU exception in the following code, so hard
+	 * disable interrupts.
+	 */
+	hard_irq_disable();
+
+	isync();
+	__slb_flush_and_restore_bolted(false);
+	isync();
+
+	assert_slb_presence(true, get_paca()->kstack);
+
+	get_paca()->slb_cache_ptr = 0;
+
+	get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
+	get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
+}
+
+void slb_save_contents(struct slb_entry *slb_ptr)
+{
+	int i;
+	unsigned long e, v;
+
+	/* Save slb_cache_ptr value. */
+	get_paca()->slb_save_cache_ptr = get_paca()->slb_cache_ptr;
+
+	if (!slb_ptr)
+		return;
+
+	for (i = 0; i < mmu_slb_size; i++) {
+		asm volatile("slbmfee %0,%1" : "=r" (e) : "r" (i));
+		asm volatile("slbmfev %0,%1" : "=r" (v) : "r" (i));
+		slb_ptr->esid = e;
+		slb_ptr->vsid = v;
+		slb_ptr++;
+	}
+}
+
+void slb_dump_contents(struct slb_entry *slb_ptr)
+{
+	int i, n;
+	unsigned long e, v;
+	unsigned long llp;
+
+	if (!slb_ptr)
+		return;
+
+	pr_err("SLB contents of cpu 0x%x\n", smp_processor_id());
+
+	for (i = 0; i < mmu_slb_size; i++) {
+		e = slb_ptr->esid;
+		v = slb_ptr->vsid;
+		slb_ptr++;
+
+		if (!e && !v)
+			continue;
+
+		pr_err("%02d %016lx %016lx %s\n", i, e, v,
+		       (e & SLB_ESID_V) ? "VALID" : "NOT VALID");
+
+		if (!(e & SLB_ESID_V))
+			continue;
+
+		llp = v & SLB_VSID_LLP;
+		if (v & SLB_VSID_B_1T) {
+			pr_err(" 1T ESID=%9lx VSID=%13lx LLP:%3lx\n",
+			       GET_ESID_1T(e),
+			       (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T, llp);
+		} else {
+			pr_err(" 256M ESID=%9lx VSID=%13lx LLP:%3lx\n",
+			       GET_ESID(e),
+			       (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT, llp);
+		}
+	}
+
+	if (!early_cpu_has_feature(CPU_FTR_ARCH_300)) {
+		/* RR is not so useful as it's often not used for allocation */
+		pr_err("SLB RR allocator index %d\n", get_paca()->stab_rr);
+
+		/* Dump slb cache entries as well. */
+		pr_err("SLB cache ptr value = %d\n", get_paca()->slb_save_cache_ptr);
+		pr_err("Valid SLB cache entries:\n");
+		n = min_t(int, get_paca()->slb_save_cache_ptr, SLB_CACHE_ENTRIES);
+		for (i = 0; i < n; i++)
+			pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]);
+		pr_err("Rest of SLB cache entries:\n");
+		for (i = n; i < SLB_CACHE_ENTRIES; i++)
+			pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]);
+	}
+}
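
The preload_hit()/preload_add() helpers below track recently faulted user ESIDs in a small FIFO indexed modulo SLB_PRELOAD_NR, so switch_slb() can re-establish them cheaply. A standalone sketch of the same ring arithmetic; ring_has(), ring_add() and the size 8 are assumptions for illustration, not taken from this patch:

	#include <linux/types.h>

	#define RING_NR 8			/* stands in for SLB_PRELOAD_NR */

	struct esid_ring {
		unsigned long slot[RING_NR];
		unsigned char tail;		/* index of the oldest entry */
		unsigned char nr;		/* number of valid entries */
	};

	static bool ring_has(struct esid_ring *r, unsigned long esid)
	{
		unsigned char i;

		for (i = 0; i < r->nr; i++)
			if (r->slot[(r->tail + i) % RING_NR] == esid)
				return true;
		return false;
	}

	static void ring_add(struct esid_ring *r, unsigned long esid)
	{
		if (ring_has(r, esid))
			return;
		r->slot[(r->tail + r->nr) % RING_NR] = esid;
		if (r->nr == RING_NR)
			r->tail = (r->tail + 1) % RING_NR;	/* evict the oldest */
		else
			r->nr++;
	}
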
+void slb_vmalloc_update(void)
+{
+	/*
+	 * vmalloc is not bolted, so just have to flush non-bolted.
+	 */
+	slb_flush_and_restore_bolted();
+}
+
+static bool preload_hit(struct thread_info *ti, unsigned long esid)
+{
+	unsigned char i;
+
+	for (i = 0; i < ti->slb_preload_nr; i++) {
+		unsigned char idx;
+
+		idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
+		if (esid == ti->slb_preload_esid[idx])
+			return true;
+	}
+	return false;
+}
+
+static void preload_add(struct thread_info *ti, unsigned long ea)
+{
+	unsigned char idx;
+	unsigned long esid;
+
+	if (slb_preload_disabled())
+		return;
+
+	if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) {
+		/* EAs are stored >> 28 so 256MB segments don't need clearing */
+		if (ea & ESID_MASK_1T)
+			ea &= ESID_MASK_1T;
+	}
+
+	esid = ea >> SID_SHIFT;
+
+	if (preload_hit(ti, esid))
+		return;
+
+	idx = (ti->slb_preload_tail + ti->slb_preload_nr) % SLB_PRELOAD_NR;
+	ti->slb_preload_esid[idx] = esid;
+	if (ti->slb_preload_nr == SLB_PRELOAD_NR)
+		ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR;
+	else
+		ti->slb_preload_nr++;
+}
+
+static void preload_age(struct thread_info *ti)
+{
+	if (!ti->slb_preload_nr)
+		return;
+	ti->slb_preload_nr--;
+	ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR;
+}
+
+static void slb_cache_slbie_kernel(unsigned int index)
+{
+	unsigned long slbie_data = get_paca()->slb_cache[index];
+	unsigned long ksp = get_paca()->kstack;
+
+	slbie_data <<= SID_SHIFT;
+	slbie_data |= 0xc000000000000000ULL;
+	if ((ksp & slb_esid_mask(mmu_kernel_ssize)) == slbie_data)
+		return;
+	slbie_data |= mmu_kernel_ssize << SLBIE_SSIZE_SHIFT;
+
+	asm volatile("slbie %0" : : "r" (slbie_data));
+}
+
+static void slb_cache_slbie_user(unsigned int index)
+{
+	unsigned long slbie_data = get_paca()->slb_cache[index];
+
+	slbie_data <<= SID_SHIFT;
+	slbie_data |= user_segment_size(slbie_data) << SLBIE_SSIZE_SHIFT;
+	slbie_data |= SLBIE_C; /* user slbs have C=1 */
+
+	asm volatile("slbie %0" : : "r" (slbie_data));
+}
+
+/* Flush all user entries from the segment table of the current processor. */
+void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
+{
+	struct thread_info *ti = task_thread_info(tsk);
+	unsigned char i;
+
+	/*
+	 * We need interrupts hard-disabled here, not just soft-disabled,
+	 * so that a PMU interrupt can't occur, which might try to access
+	 * user memory (to get a stack trace) and possibly cause an SLB miss
+	 * which would update the slb_cache/slb_cache_ptr fields in the PACA.
+	 */
+	hard_irq_disable();
+	isync();
+	if (stress_slb()) {
+		__slb_flush_and_restore_bolted(false);
+		isync();
+		get_paca()->slb_cache_ptr = 0;
+		get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
+
+	} else if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		/*
+		 * SLBIA IH=3 invalidates all Class=1 SLBEs and their
+		 * associated lookaside structures, which matches what
+		 * switch_slb wants. So ARCH_300 does not use the slb
+		 * cache.
+ */ + asm volatile(PPC_SLBIA(3)); + + } else { + unsigned long offset = get_paca()->slb_cache_ptr; + + if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) && + offset <= SLB_CACHE_ENTRIES) { + /* + * Could assert_slb_presence(true) here, but + * hypervisor or machine check could have come + * in and removed the entry at this point. + */ + + for (i = 0; i < offset; i++) + slb_cache_slbie_user(i); + + /* Workaround POWER5 < DD2.1 issue */ + if (!cpu_has_feature(CPU_FTR_ARCH_207S) && offset == 1) + slb_cache_slbie_user(0); + + } else { + /* Flush but retain kernel lookaside information */ + __slb_flush_and_restore_bolted(true); + isync(); + + get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; + } + + get_paca()->slb_cache_ptr = 0; + } + get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap; + + copy_mm_to_paca(mm); + + if (slb_preload_disabled()) + return; + + /* + * We gradually age out SLBs after a number of context switches to + * reduce reload overhead of unused entries (like we do with FP/VEC + * reload). Each time we wrap 256 switches, take an entry out of the + * SLB preload cache. + */ + tsk->thread.load_slb++; + if (!tsk->thread.load_slb) { + unsigned long pc = KSTK_EIP(tsk); + + preload_age(ti); + preload_add(ti, pc); + } + + for (i = 0; i < ti->slb_preload_nr; i++) { + unsigned char idx; + unsigned long ea; + + idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR; + ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT; + + slb_allocate_user(mm, ea); + } + + /* + * Synchronize slbmte preloads with possible subsequent user memory + * address accesses by the kernel (user mode won't happen until + * rfid, which is safe). + */ + isync(); +} + +void slb_set_size(u16 size) +{ + mmu_slb_size = size; +} + +void slb_initialize(void) +{ + unsigned long linear_llp, vmalloc_llp, io_llp; + unsigned long lflags; + static int slb_encoding_inited; +#ifdef CONFIG_SPARSEMEM_VMEMMAP + unsigned long vmemmap_llp; +#endif + + /* Prepare our SLB miss handler based on our page size */ + linear_llp = mmu_psize_defs[mmu_linear_psize].sllp; + io_llp = mmu_psize_defs[mmu_io_psize].sllp; + vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp; + get_paca()->vmalloc_sllp = SLB_VSID_KERNEL | vmalloc_llp; +#ifdef CONFIG_SPARSEMEM_VMEMMAP + vmemmap_llp = mmu_psize_defs[mmu_vmemmap_psize].sllp; +#endif + if (!slb_encoding_inited) { + slb_encoding_inited = 1; + pr_devel("SLB: linear LLP = %04lx\n", linear_llp); + pr_devel("SLB: io LLP = %04lx\n", io_llp); +#ifdef CONFIG_SPARSEMEM_VMEMMAP + pr_devel("SLB: vmemmap LLP = %04lx\n", vmemmap_llp); +#endif + } + + get_paca()->stab_rr = SLB_NUM_BOLTED - 1; + get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; + get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap; + + lflags = SLB_VSID_KERNEL | linear_llp; + + /* Invalidate the entire SLB (even entry 0) & all the ERATS */ + asm volatile("isync":::"memory"); + asm volatile("slbmte %0,%0"::"r" (0) : "memory"); + asm volatile("isync; slbia; isync":::"memory"); + create_shadowed_slbe(PAGE_OFFSET, mmu_kernel_ssize, lflags, LINEAR_INDEX); + + /* + * For the boot cpu, we're running on the stack in init_thread_union, + * which is in the first segment of the linear mapping, and also + * get_paca()->kstack hasn't been initialized yet. + * For secondary cpus, we need to bolt the kernel stack entry now. 
+ */
+	slb_shadow_clear(KSTACK_INDEX);
+	if (raw_smp_processor_id() != boot_cpuid &&
+	    (get_paca()->kstack & slb_esid_mask(mmu_kernel_ssize)) > PAGE_OFFSET)
+		create_shadowed_slbe(get_paca()->kstack,
+				     mmu_kernel_ssize, lflags, KSTACK_INDEX);
+
+	asm volatile("isync":::"memory");
+}
+
+static void slb_cache_update(unsigned long esid_data)
+{
+	int slb_cache_index;
+
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		return; /* ISAv3.0B and later does not use slb_cache */
+
+	if (stress_slb())
+		return;
+
+	/*
+	 * Now update slb cache entries
+	 */
+	slb_cache_index = local_paca->slb_cache_ptr;
+	if (slb_cache_index < SLB_CACHE_ENTRIES) {
+		/*
+		 * We have space in slb cache for optimized switch_slb().
+		 * Top 36 bits from esid_data as per ISA
+		 */
+		local_paca->slb_cache[slb_cache_index++] = esid_data >> SID_SHIFT;
+		local_paca->slb_cache_ptr++;
+	} else {
+		/*
+		 * Our cache is full and the current cache content strictly
+		 * doesn't indicate the active SLB contents. Bump the ptr
+		 * so that switch_slb() will ignore the cache.
+		 */
+		local_paca->slb_cache_ptr = SLB_CACHE_ENTRIES + 1;
+	}
+}
+
+static enum slb_index alloc_slb_index(bool kernel)
+{
+	enum slb_index index;
+
+	/*
+	 * The allocation bitmaps can become out of sync with the SLB
+	 * when the _switch code does slbie when bolting a new stack
+	 * segment and it must not be anywhere else in the SLB. This leaves
+	 * a kernel allocated entry that is unused in the SLB. With very
+	 * large systems or small segment sizes, the bitmaps could slowly
+	 * fill with these entries. They will eventually be cleared out
+	 * by the round robin allocator in that case, so it's probably not
+	 * worth accounting for.
+	 */
+
+	/*
+	 * SLBs beyond 32 entries are allocated with stab_rr only.
+	 * POWER7/8/9 have 32 SLB entries; this could be expanded if a
+	 * future CPU has more.
+	 */
+	if (local_paca->slb_used_bitmap != U32_MAX) {
+		index = ffz(local_paca->slb_used_bitmap);
+		local_paca->slb_used_bitmap |= 1U << index;
+		if (kernel)
+			local_paca->slb_kern_bitmap |= 1U << index;
+	} else {
+		/* round-robin replacement of slb starting at SLB_NUM_BOLTED. */
+		index = local_paca->stab_rr;
+		if (index < (mmu_slb_size - 1))
+			index++;
+		else
+			index = SLB_NUM_BOLTED;
+		local_paca->stab_rr = index;
+		if (index < 32) {
+			if (kernel)
+				local_paca->slb_kern_bitmap |= 1U << index;
+			else
+				local_paca->slb_kern_bitmap &= ~(1U << index);
+		}
+	}
+	BUG_ON(index < SLB_NUM_BOLTED);
+
+	return index;
+}
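
alloc_slb_index() above prefers a free slot from the 32-bit slb_used_bitmap, found with ffz() (find first zero), and only falls back to round-robin replacement once the bitmap is full. A tiny sketch of the bitmap half of that policy; example_alloc_slot() and example_used_bitmap are hypothetical names for illustration:

	#include <linux/bitops.h>
	#include <linux/limits.h>
	#include <linux/types.h>

	static u32 example_used_bitmap;		/* bit n set => slot n in use */

	static int example_alloc_slot(void)
	{
		int slot;

		if (example_used_bitmap == U32_MAX)
			return -1;	/* full: the real code switches to round-robin */

		slot = ffz(example_used_bitmap);	/* lowest clear bit */
		example_used_bitmap |= 1U << slot;
		return slot;
	}
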
+static long slb_insert_entry(unsigned long ea, unsigned long context,
+			     unsigned long flags, int ssize, bool kernel)
+{
+	unsigned long vsid;
+	unsigned long vsid_data, esid_data;
+	enum slb_index index;
+
+	vsid = get_vsid(context, ea, ssize);
+	if (!vsid)
+		return -EFAULT;
+
+	/*
+	 * There must not be a kernel SLB fault in alloc_slb_index or before
+	 * slbmte here or the allocation bitmaps could get out of whack with
+	 * the SLB.
+	 *
+	 * User SLB faults or preloads take this path which might get inlined
+	 * into the caller, so add compiler barriers here to ensure unsafe
+	 * memory accesses do not come between.
+	 */
+	barrier();
+
+	index = alloc_slb_index(kernel);
+
+	vsid_data = __mk_vsid_data(vsid, ssize, flags);
+	esid_data = mk_esid_data(ea, ssize, index);
+
+	/*
+	 * No need for an isync before or after this slbmte. The exception
+	 * we enter with and the rfid we exit with are context synchronizing.
+	 * User preloads should add isync afterwards in case the kernel
+	 * accesses user memory before it returns to userspace with rfid.
+	 */
+	assert_slb_presence(false, ea);
+	if (stress_slb()) {
+		int slb_cache_index = local_paca->slb_cache_ptr;
+
+		/*
+		 * stress_slb() does not use slb cache, repurpose as a
+		 * cache of inserted (non-bolted) kernel SLB entries. All
+		 * non-bolted kernel entries are flushed on any user fault,
+		 * or if there are already 3 non-bolted kernel entries.
+		 */
+		BUILD_BUG_ON(SLB_CACHE_ENTRIES < 3);
+		if (!kernel || slb_cache_index == 3) {
+			int i;
+
+			for (i = 0; i < slb_cache_index; i++)
+				slb_cache_slbie_kernel(i);
+			slb_cache_index = 0;
+		}
+
+		if (kernel)
+			local_paca->slb_cache[slb_cache_index++] = esid_data >> SID_SHIFT;
+		local_paca->slb_cache_ptr = slb_cache_index;
+	}
+	asm volatile("slbmte %0, %1" : : "r" (vsid_data), "r" (esid_data));
+
+	barrier();
+
+	if (!kernel)
+		slb_cache_update(esid_data);
+
+	return 0;
+}
+
+static long slb_allocate_kernel(unsigned long ea, unsigned long id)
+{
+	unsigned long context;
+	unsigned long flags;
+	int ssize;
+
+	if (id == LINEAR_MAP_REGION_ID) {
+
+		/* We only support up to H_MAX_PHYSMEM_BITS */
+		if ((ea & EA_MASK) > (1UL << H_MAX_PHYSMEM_BITS))
+			return -EFAULT;
+
+		flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_linear_psize].sllp;
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+	} else if (id == VMEMMAP_REGION_ID) {
+
+		if (ea >= H_VMEMMAP_END)
+			return -EFAULT;
+
+		flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmemmap_psize].sllp;
+#endif
+	} else if (id == VMALLOC_REGION_ID) {
+
+		if (ea >= H_VMALLOC_END)
+			return -EFAULT;
+
+		flags = local_paca->vmalloc_sllp;
+
+	} else if (id == IO_REGION_ID) {
+
+		if (ea >= H_KERN_IO_END)
+			return -EFAULT;
+
+		flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_io_psize].sllp;
+
+	} else {
+		return -EFAULT;
+	}
+
+	ssize = MMU_SEGSIZE_1T;
+	if (!mmu_has_feature(MMU_FTR_1T_SEGMENT))
+		ssize = MMU_SEGSIZE_256M;
+
+	context = get_kernel_context(ea);
+
+	return slb_insert_entry(ea, context, flags, ssize, true);
+}
+
+static long slb_allocate_user(struct mm_struct *mm, unsigned long ea)
+{
+	unsigned long context;
+	unsigned long flags;
+	int bpsize;
+	int ssize;
+
+	/*
+	 * Consider this a bad access if we take an SLB miss
+	 * on an address above the addr limit.
+	 */
+	if (ea >= mm_ctx_slb_addr_limit(&mm->context))
+		return -EFAULT;
+
+	context = get_user_context(&mm->context, ea);
+	if (!context)
+		return -EFAULT;
+
+	if (unlikely(ea >= H_PGTABLE_RANGE)) {
+		WARN_ON(1);
+		return -EFAULT;
+	}
+
+	ssize = user_segment_size(ea);
+
+	bpsize = get_slice_psize(mm, ea);
+	flags = SLB_VSID_USER | mmu_psize_defs[bpsize].sllp;
+
+	return slb_insert_entry(ea, context, flags, ssize, false);
+}
+
+DEFINE_INTERRUPT_HANDLER_RAW(do_slb_fault)
+{
+	unsigned long ea = regs->dar;
+	unsigned long id = get_region_id(ea);
+
+	/* IRQs are not reconciled here, so can't check irqs_disabled */
+	VM_WARN_ON(mfmsr() & MSR_EE);
+
+	if (regs_is_unrecoverable(regs))
+		return -EINVAL;
+
+	/*
+	 * SLB kernel faults must be very careful not to touch anything that is
+	 * not bolted. E.g., PACA and global variables are okay, mm->context
+	 * stuff is not. SLB user faults may access all of memory (and induce
+	 * one recursive SLB kernel fault), so the kernel fault must not
+	 * trample on the user fault state at those points.
+	 */
+
+	/*
+	 * This is a raw interrupt handler, for performance, so that
+	 * fast_interrupt_return can be used. The handler must not touch local
+	 * irq state, or schedule.
We could test for usermode and upgrade to a + * normal process context (synchronous) interrupt for those, which + * would make them first-class kernel code and able to be traced and + * instrumented, although performance would suffer a bit, it would + * probably be a good tradeoff. + */ + if (id >= LINEAR_MAP_REGION_ID) { + long err; +#ifdef CONFIG_DEBUG_VM + /* Catch recursive kernel SLB faults. */ + BUG_ON(local_paca->in_kernel_slb_handler); + local_paca->in_kernel_slb_handler = 1; +#endif + err = slb_allocate_kernel(ea, id); +#ifdef CONFIG_DEBUG_VM + local_paca->in_kernel_slb_handler = 0; +#endif + return err; + } else { + struct mm_struct *mm = current->mm; + long err; + + if (unlikely(!mm)) + return -EFAULT; + + err = slb_allocate_user(mm, ea); + if (!err) + preload_add(current_thread_info(), ea); + + return err; + } +} diff --git a/arch/powerpc/mm/book3s64/slice.c b/arch/powerpc/mm/book3s64/slice.c new file mode 100644 index 000000000000..28bec5bc7879 --- /dev/null +++ b/arch/powerpc/mm/book3s64/slice.c @@ -0,0 +1,819 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * address space "slices" (meta-segments) support + * + * Copyright (C) 2007 Benjamin Herrenschmidt, IBM Corporation. + * + * Based on hugetlb implementation + * + * Copyright (C) 2003 David Gibson, IBM Corporation. + */ + +#undef DEBUG + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/err.h> +#include <linux/spinlock.h> +#include <linux/export.h> +#include <linux/hugetlb.h> +#include <linux/sched/mm.h> +#include <linux/security.h> +#include <asm/mman.h> +#include <asm/mmu.h> +#include <asm/spu.h> +#include <asm/hugetlb.h> +#include <asm/mmu_context.h> + +static DEFINE_SPINLOCK(slice_convert_lock); + +#ifdef DEBUG +int _slice_debug = 1; + +static void slice_print_mask(const char *label, const struct slice_mask *mask) +{ + if (!_slice_debug) + return; + pr_devel("%s low_slice: %*pbl\n", label, + (int)SLICE_NUM_LOW, &mask->low_slices); + pr_devel("%s high_slice: %*pbl\n", label, + (int)SLICE_NUM_HIGH, mask->high_slices); +} + +#define slice_dbg(fmt...) do { if (_slice_debug) pr_devel(fmt); } while (0) + +#else + +static void slice_print_mask(const char *label, const struct slice_mask *mask) {} +#define slice_dbg(fmt...) 
+ +#endif + +static inline notrace bool slice_addr_is_low(unsigned long addr) +{ + u64 tmp = (u64)addr; + + return tmp < SLICE_LOW_TOP; +} + +static void slice_range_to_mask(unsigned long start, unsigned long len, + struct slice_mask *ret) +{ + unsigned long end = start + len - 1; + + ret->low_slices = 0; + if (SLICE_NUM_HIGH) + bitmap_zero(ret->high_slices, SLICE_NUM_HIGH); + + if (slice_addr_is_low(start)) { + unsigned long mend = min(end, + (unsigned long)(SLICE_LOW_TOP - 1)); + + ret->low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1)) + - (1u << GET_LOW_SLICE_INDEX(start)); + } + + if (SLICE_NUM_HIGH && !slice_addr_is_low(end)) { + unsigned long start_index = GET_HIGH_SLICE_INDEX(start); + unsigned long align_end = ALIGN(end, (1UL << SLICE_HIGH_SHIFT)); + unsigned long count = GET_HIGH_SLICE_INDEX(align_end) - start_index; + + bitmap_set(ret->high_slices, start_index, count); + } +} + +static int slice_area_is_free(struct mm_struct *mm, unsigned long addr, + unsigned long len) +{ + struct vm_area_struct *vma; + + if ((mm_ctx_slb_addr_limit(&mm->context) - len) < addr) + return 0; + vma = find_vma(mm, addr); + return (!vma || (addr + len) <= vm_start_gap(vma)); +} + +static int slice_low_has_vma(struct mm_struct *mm, unsigned long slice) +{ + return !slice_area_is_free(mm, slice << SLICE_LOW_SHIFT, + 1ul << SLICE_LOW_SHIFT); +} + +static int slice_high_has_vma(struct mm_struct *mm, unsigned long slice) +{ + unsigned long start = slice << SLICE_HIGH_SHIFT; + unsigned long end = start + (1ul << SLICE_HIGH_SHIFT); + + /* Hack, so that each addresses is controlled by exactly one + * of the high or low area bitmaps, the first high area starts + * at 4GB, not 0 */ + if (start == 0) + start = (unsigned long)SLICE_LOW_TOP; + + return !slice_area_is_free(mm, start, end - start); +} + +static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret, + unsigned long high_limit) +{ + unsigned long i; + + ret->low_slices = 0; + if (SLICE_NUM_HIGH) + bitmap_zero(ret->high_slices, SLICE_NUM_HIGH); + + for (i = 0; i < SLICE_NUM_LOW; i++) + if (!slice_low_has_vma(mm, i)) + ret->low_slices |= 1u << i; + + if (slice_addr_is_low(high_limit - 1)) + return; + + for (i = 0; i < GET_HIGH_SLICE_INDEX(high_limit); i++) + if (!slice_high_has_vma(mm, i)) + __set_bit(i, ret->high_slices); +} + +static bool slice_check_range_fits(struct mm_struct *mm, + const struct slice_mask *available, + unsigned long start, unsigned long len) +{ + unsigned long end = start + len - 1; + u64 low_slices = 0; + + if (slice_addr_is_low(start)) { + unsigned long mend = min(end, + (unsigned long)(SLICE_LOW_TOP - 1)); + + low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1)) + - (1u << GET_LOW_SLICE_INDEX(start)); + } + if ((low_slices & available->low_slices) != low_slices) + return false; + + if (SLICE_NUM_HIGH && !slice_addr_is_low(end)) { + unsigned long start_index = GET_HIGH_SLICE_INDEX(start); + unsigned long align_end = ALIGN(end, (1UL << SLICE_HIGH_SHIFT)); + unsigned long count = GET_HIGH_SLICE_INDEX(align_end) - start_index; + unsigned long i; + + for (i = start_index; i < start_index + count; i++) { + if (!test_bit(i, available->high_slices)) + return false; + } + } + + return true; +} + +static void slice_flush_segments(void *parm) +{ +#ifdef CONFIG_PPC64 + struct mm_struct *mm = parm; + unsigned long flags; + + if (mm != current->active_mm) + return; + + copy_mm_to_paca(current->active_mm); + + local_irq_save(flags); + slb_flush_and_restore_bolted(); + local_irq_restore(flags); +#endif +} + +static void 
slice_convert(struct mm_struct *mm, + const struct slice_mask *mask, int psize) +{ + int index, mask_index; + /* Write the new slice psize bits */ + unsigned char *hpsizes, *lpsizes; + struct slice_mask *psize_mask, *old_mask; + unsigned long i, flags; + int old_psize; + + slice_dbg("slice_convert(mm=%p, psize=%d)\n", mm, psize); + slice_print_mask(" mask", mask); + + psize_mask = slice_mask_for_size(&mm->context, psize); + + /* We need to use a spinlock here to protect against + * concurrent 64k -> 4k demotion ... + */ + spin_lock_irqsave(&slice_convert_lock, flags); + + lpsizes = mm_ctx_low_slices(&mm->context); + for (i = 0; i < SLICE_NUM_LOW; i++) { + if (!(mask->low_slices & (1u << i))) + continue; + + mask_index = i & 0x1; + index = i >> 1; + + /* Update the slice_mask */ + old_psize = (lpsizes[index] >> (mask_index * 4)) & 0xf; + old_mask = slice_mask_for_size(&mm->context, old_psize); + old_mask->low_slices &= ~(1u << i); + psize_mask->low_slices |= 1u << i; + + /* Update the sizes array */ + lpsizes[index] = (lpsizes[index] & ~(0xf << (mask_index * 4))) | + (((unsigned long)psize) << (mask_index * 4)); + } + + hpsizes = mm_ctx_high_slices(&mm->context); + for (i = 0; i < GET_HIGH_SLICE_INDEX(mm_ctx_slb_addr_limit(&mm->context)); i++) { + if (!test_bit(i, mask->high_slices)) + continue; + + mask_index = i & 0x1; + index = i >> 1; + + /* Update the slice_mask */ + old_psize = (hpsizes[index] >> (mask_index * 4)) & 0xf; + old_mask = slice_mask_for_size(&mm->context, old_psize); + __clear_bit(i, old_mask->high_slices); + __set_bit(i, psize_mask->high_slices); + + /* Update the sizes array */ + hpsizes[index] = (hpsizes[index] & ~(0xf << (mask_index * 4))) | + (((unsigned long)psize) << (mask_index * 4)); + } + + slice_dbg(" lsps=%lx, hsps=%lx\n", + (unsigned long)mm_ctx_low_slices(&mm->context), + (unsigned long)mm_ctx_high_slices(&mm->context)); + + spin_unlock_irqrestore(&slice_convert_lock, flags); + +#ifdef CONFIG_SPU_BASE + spu_flush_all_slbs(mm); +#endif +} + +/* + * Compute which slice addr is part of; + * set *boundary_addr to the start or end boundary of that slice + * (depending on 'end' parameter); + * return boolean indicating if the slice is marked as available in the + * 'available' slice_mark. + */ +static bool slice_scan_available(unsigned long addr, + const struct slice_mask *available, + int end, unsigned long *boundary_addr) +{ + unsigned long slice; + if (slice_addr_is_low(addr)) { + slice = GET_LOW_SLICE_INDEX(addr); + *boundary_addr = (slice + end) << SLICE_LOW_SHIFT; + return !!(available->low_slices & (1u << slice)); + } else { + slice = GET_HIGH_SLICE_INDEX(addr); + *boundary_addr = (slice + end) ? + ((slice + end) << SLICE_HIGH_SHIFT) : SLICE_LOW_TOP; + return !!test_bit(slice, available->high_slices); + } +} + +static unsigned long slice_find_area_bottomup(struct mm_struct *mm, + unsigned long addr, unsigned long len, + const struct slice_mask *available, + int psize, unsigned long high_limit) +{ + int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); + unsigned long found, next_end; + struct vm_unmapped_area_info info = { + .length = len, + .align_mask = PAGE_MASK & ((1ul << pshift) - 1), + }; + /* + * Check till the allow max value for this mmap request + */ + while (addr < high_limit) { + info.low_limit = addr; + if (!slice_scan_available(addr, available, 1, &addr)) + continue; + + next_slice: + /* + * At this point [info.low_limit; addr) covers + * available slices only and ends at a slice boundary. 
+ * Check if we need to reduce the range, or if we can + * extend it to cover the next available slice. + */ + if (addr >= high_limit) + addr = high_limit; + else if (slice_scan_available(addr, available, 1, &next_end)) { + addr = next_end; + goto next_slice; + } + info.high_limit = addr; + + found = vm_unmapped_area(&info); + if (!(found & ~PAGE_MASK)) + return found; + } + + return -ENOMEM; +} + +static unsigned long slice_find_area_topdown(struct mm_struct *mm, + unsigned long addr, unsigned long len, + const struct slice_mask *available, + int psize, unsigned long high_limit) +{ + int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); + unsigned long found, prev; + struct vm_unmapped_area_info info = { + .flags = VM_UNMAPPED_AREA_TOPDOWN, + .length = len, + .align_mask = PAGE_MASK & ((1ul << pshift) - 1), + }; + unsigned long min_addr = max(PAGE_SIZE, mmap_min_addr); + + /* + * If we are trying to allocate above DEFAULT_MAP_WINDOW + * Add the different to the mmap_base. + * Only for that request for which high_limit is above + * DEFAULT_MAP_WINDOW we should apply this. + */ + if (high_limit > DEFAULT_MAP_WINDOW) + addr += mm_ctx_slb_addr_limit(&mm->context) - DEFAULT_MAP_WINDOW; + + while (addr > min_addr) { + info.high_limit = addr; + if (!slice_scan_available(addr - 1, available, 0, &addr)) + continue; + + prev_slice: + /* + * At this point [addr; info.high_limit) covers + * available slices only and starts at a slice boundary. + * Check if we need to reduce the range, or if we can + * extend it to cover the previous available slice. + */ + if (addr < min_addr) + addr = min_addr; + else if (slice_scan_available(addr - 1, available, 0, &prev)) { + addr = prev; + goto prev_slice; + } + info.low_limit = addr; + + found = vm_unmapped_area(&info); + if (!(found & ~PAGE_MASK)) + return found; + } + + /* + * A failed mmap() very likely causes application failure, + * so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. 
+ */ + return slice_find_area_bottomup(mm, TASK_UNMAPPED_BASE, len, available, psize, high_limit); +} + + +static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len, + const struct slice_mask *mask, int psize, + int topdown, unsigned long high_limit) +{ + if (topdown) + return slice_find_area_topdown(mm, mm->mmap_base, len, mask, psize, high_limit); + else + return slice_find_area_bottomup(mm, mm->mmap_base, len, mask, psize, high_limit); +} + +static inline void slice_copy_mask(struct slice_mask *dst, + const struct slice_mask *src) +{ + dst->low_slices = src->low_slices; + if (!SLICE_NUM_HIGH) + return; + bitmap_copy(dst->high_slices, src->high_slices, SLICE_NUM_HIGH); +} + +static inline void slice_or_mask(struct slice_mask *dst, + const struct slice_mask *src1, + const struct slice_mask *src2) +{ + dst->low_slices = src1->low_slices | src2->low_slices; + if (!SLICE_NUM_HIGH) + return; + bitmap_or(dst->high_slices, src1->high_slices, src2->high_slices, SLICE_NUM_HIGH); +} + +static inline void slice_andnot_mask(struct slice_mask *dst, + const struct slice_mask *src1, + const struct slice_mask *src2) +{ + dst->low_slices = src1->low_slices & ~src2->low_slices; + if (!SLICE_NUM_HIGH) + return; + bitmap_andnot(dst->high_slices, src1->high_slices, src2->high_slices, SLICE_NUM_HIGH); +} + +#ifdef CONFIG_PPC_64K_PAGES +#define MMU_PAGE_BASE MMU_PAGE_64K +#else +#define MMU_PAGE_BASE MMU_PAGE_4K +#endif + +unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, + unsigned long flags, unsigned int psize, + int topdown) +{ + struct slice_mask good_mask; + struct slice_mask potential_mask; + const struct slice_mask *maskp; + const struct slice_mask *compat_maskp = NULL; + int fixed = (flags & MAP_FIXED); + int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); + unsigned long page_size = 1UL << pshift; + struct mm_struct *mm = current->mm; + unsigned long newaddr; + unsigned long high_limit; + + high_limit = DEFAULT_MAP_WINDOW; + if (addr >= high_limit || (fixed && (addr + len > high_limit))) + high_limit = TASK_SIZE; + + if (len > high_limit) + return -ENOMEM; + if (len & (page_size - 1)) + return -EINVAL; + if (fixed) { + if (addr & (page_size - 1)) + return -EINVAL; + if (addr > high_limit - len) + return -ENOMEM; + } + + if (high_limit > mm_ctx_slb_addr_limit(&mm->context)) { + /* + * Increasing the slb_addr_limit does not require + * slice mask cache to be recalculated because it should + * be already initialised beyond the old address limit. 
+ */ + mm_ctx_set_slb_addr_limit(&mm->context, high_limit); + + on_each_cpu(slice_flush_segments, mm, 1); + } + + /* Sanity checks */ + BUG_ON(mm->task_size == 0); + BUG_ON(mm_ctx_slb_addr_limit(&mm->context) == 0); + VM_BUG_ON(radix_enabled()); + + slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize); + slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d\n", + addr, len, flags, topdown); + + /* If hint, make sure it matches our alignment restrictions */ + if (!fixed && addr) { + addr = ALIGN(addr, page_size); + slice_dbg(" aligned addr=%lx\n", addr); + /* Ignore hint if it's too large or overlaps a VMA */ + if (addr > high_limit - len || addr < mmap_min_addr || + !slice_area_is_free(mm, addr, len)) + addr = 0; + } + + /* First make up a "good" mask of slices that have the right size + * already + */ + maskp = slice_mask_for_size(&mm->context, psize); + + /* + * Here "good" means slices that are already the right page size, + * "compat" means slices that have a compatible page size (i.e. + * 4k in a 64k pagesize kernel), and "free" means slices without + * any VMAs. + * + * If MAP_FIXED: + * check if fits in good | compat => OK + * check if fits in good | compat | free => convert free + * else bad + * If have hint: + * check if hint fits in good => OK + * check if hint fits in good | free => convert free + * Otherwise: + * search in good, found => OK + * search in good | free, found => convert free + * search in good | compat | free, found => convert free. + */ + + /* + * If we support combo pages, we can allow 64k pages in 4k slices + * The mask copies could be avoided in most cases here if we had + * a pointer to good mask for the next code to use. + */ + if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && psize == MMU_PAGE_64K) { + compat_maskp = slice_mask_for_size(&mm->context, MMU_PAGE_4K); + if (fixed) + slice_or_mask(&good_mask, maskp, compat_maskp); + else + slice_copy_mask(&good_mask, maskp); + } else { + slice_copy_mask(&good_mask, maskp); + } + + slice_print_mask(" good_mask", &good_mask); + if (compat_maskp) + slice_print_mask(" compat_mask", compat_maskp); + + /* First check hint if it's valid or if we have MAP_FIXED */ + if (addr != 0 || fixed) { + /* Check if we fit in the good mask. If we do, we just return, + * nothing else to do + */ + if (slice_check_range_fits(mm, &good_mask, addr, len)) { + slice_dbg(" fits good !\n"); + newaddr = addr; + goto return_addr; + } + } else { + /* Now let's see if we can find something in the existing + * slices for that size + */ + newaddr = slice_find_area(mm, len, &good_mask, + psize, topdown, high_limit); + if (newaddr != -ENOMEM) { + /* Found within the good mask, we don't have to setup, + * we thus return directly + */ + slice_dbg(" found area at 0x%lx\n", newaddr); + goto return_addr; + } + } + /* + * We don't fit in the good mask, check what other slices are + * empty and thus can be converted + */ + slice_mask_for_free(mm, &potential_mask, high_limit); + slice_or_mask(&potential_mask, &potential_mask, &good_mask); + slice_print_mask(" potential", &potential_mask); + + if (addr != 0 || fixed) { + if (slice_check_range_fits(mm, &potential_mask, addr, len)) { + slice_dbg(" fits potential !\n"); + newaddr = addr; + goto convert; + } + } + + /* If we have MAP_FIXED and failed the above steps, then error out */ + if (fixed) + return -EBUSY; + + slice_dbg(" search...\n"); + + /* If we had a hint that didn't work out, see if we can fit + * anywhere in the good area. 
+ */ + if (addr) { + newaddr = slice_find_area(mm, len, &good_mask, + psize, topdown, high_limit); + if (newaddr != -ENOMEM) { + slice_dbg(" found area at 0x%lx\n", newaddr); + goto return_addr; + } + } + + /* Now let's see if we can find something in the existing slices + * for that size plus free slices + */ + newaddr = slice_find_area(mm, len, &potential_mask, + psize, topdown, high_limit); + + if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && newaddr == -ENOMEM && + psize == MMU_PAGE_64K) { + /* retry the search with 4k-page slices included */ + slice_or_mask(&potential_mask, &potential_mask, compat_maskp); + newaddr = slice_find_area(mm, len, &potential_mask, + psize, topdown, high_limit); + } + + if (newaddr == -ENOMEM) + return -ENOMEM; + + slice_range_to_mask(newaddr, len, &potential_mask); + slice_dbg(" found potential area at 0x%lx\n", newaddr); + slice_print_mask(" mask", &potential_mask); + + convert: + /* + * Try to allocate the context before we do slice convert + * so that we handle the context allocation failure gracefully. + */ + if (need_extra_context(mm, newaddr)) { + if (alloc_extended_context(mm, newaddr) < 0) + return -ENOMEM; + } + + slice_andnot_mask(&potential_mask, &potential_mask, &good_mask); + if (compat_maskp && !fixed) + slice_andnot_mask(&potential_mask, &potential_mask, compat_maskp); + if (potential_mask.low_slices || + (SLICE_NUM_HIGH && + !bitmap_empty(potential_mask.high_slices, SLICE_NUM_HIGH))) { + slice_convert(mm, &potential_mask, psize); + if (psize > MMU_PAGE_BASE) + on_each_cpu(slice_flush_segments, mm, 1); + } + return newaddr; + +return_addr: + if (need_extra_context(mm, newaddr)) { + if (alloc_extended_context(mm, newaddr) < 0) + return -ENOMEM; + } + return newaddr; +} +EXPORT_SYMBOL_GPL(slice_get_unmapped_area); + +#ifdef CONFIG_HUGETLB_PAGE +static int file_to_psize(struct file *file) +{ + struct hstate *hstate = hstate_file(file); + + return shift_to_mmu_psize(huge_page_shift(hstate)); +} +#else +static int file_to_psize(struct file *file) +{ + return 0; +} +#endif + +unsigned long arch_get_unmapped_area(struct file *filp, + unsigned long addr, + unsigned long len, + unsigned long pgoff, + unsigned long flags, + vm_flags_t vm_flags) +{ + unsigned int psize; + + if (radix_enabled()) + return generic_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags); + + if (filp && is_file_hugepages(filp)) + psize = file_to_psize(filp); + else + psize = mm_ctx_user_psize(¤t->mm->context); + + return slice_get_unmapped_area(addr, len, flags, psize, 0); +} + +unsigned long arch_get_unmapped_area_topdown(struct file *filp, + const unsigned long addr0, + const unsigned long len, + const unsigned long pgoff, + const unsigned long flags, + vm_flags_t vm_flags) +{ + unsigned int psize; + + if (radix_enabled()) + return generic_get_unmapped_area_topdown(filp, addr0, len, pgoff, flags, vm_flags); + + if (filp && is_file_hugepages(filp)) + psize = file_to_psize(filp); + else + psize = mm_ctx_user_psize(¤t->mm->context); + + return slice_get_unmapped_area(addr0, len, flags, psize, 1); +} + +unsigned int notrace get_slice_psize(struct mm_struct *mm, unsigned long addr) +{ + unsigned char *psizes; + int index, mask_index; + + VM_BUG_ON(radix_enabled()); + + if (slice_addr_is_low(addr)) { + psizes = mm_ctx_low_slices(&mm->context); + index = GET_LOW_SLICE_INDEX(addr); + } else { + psizes = mm_ctx_high_slices(&mm->context); + index = GET_HIGH_SLICE_INDEX(addr); + } + mask_index = index & 0x1; + return (psizes[index >> 1] >> (mask_index * 4)) & 0xf; +} 
+EXPORT_SYMBOL_GPL(get_slice_psize); + +void slice_init_new_context_exec(struct mm_struct *mm) +{ + unsigned char *hpsizes, *lpsizes; + struct slice_mask *mask; + unsigned int psize = mmu_virtual_psize; + + slice_dbg("slice_init_new_context_exec(mm=%p)\n", mm); + + /* + * In the case of exec, use the default limit. In the + * case of fork it is just inherited from the mm being + * duplicated. + */ + mm_ctx_set_slb_addr_limit(&mm->context, SLB_ADDR_LIMIT_DEFAULT); + mm_ctx_set_user_psize(&mm->context, psize); + + /* + * Set all slice psizes to the default. + */ + lpsizes = mm_ctx_low_slices(&mm->context); + memset(lpsizes, (psize << 4) | psize, SLICE_NUM_LOW >> 1); + + hpsizes = mm_ctx_high_slices(&mm->context); + memset(hpsizes, (psize << 4) | psize, SLICE_NUM_HIGH >> 1); + + /* + * Slice mask cache starts zeroed, fill the default size cache. + */ + mask = slice_mask_for_size(&mm->context, psize); + mask->low_slices = ~0UL; + if (SLICE_NUM_HIGH) + bitmap_fill(mask->high_slices, SLICE_NUM_HIGH); +} + +void slice_setup_new_exec(void) +{ + struct mm_struct *mm = current->mm; + + slice_dbg("slice_setup_new_exec(mm=%p)\n", mm); + + if (!is_32bit_task()) + return; + + mm_ctx_set_slb_addr_limit(&mm->context, DEFAULT_MAP_WINDOW); +} + +void slice_set_range_psize(struct mm_struct *mm, unsigned long start, + unsigned long len, unsigned int psize) +{ + struct slice_mask mask; + + VM_BUG_ON(radix_enabled()); + + slice_range_to_mask(start, len, &mask); + slice_convert(mm, &mask, psize); +} + +#ifdef CONFIG_HUGETLB_PAGE +/* + * is_hugepage_only_range() is used by generic code to verify whether + * a normal mmap mapping (non hugetlbfs) is valid on a given area. + * + * until the generic code provides a more generic hook and/or starts + * calling arch get_unmapped_area for MAP_FIXED (which our implementation + * here knows how to deal with), we hijack it to keep standard mappings + * away from us. + * + * because of that generic code limitation, MAP_FIXED mapping cannot + * "convert" back a slice with no VMAs to the standard page size, only + * get_unmapped_area() can. It would be possible to fix it here but I + * prefer working on fixing the generic code instead. + * + * WARNING: This will not work if hugetlbfs isn't enabled since the + * generic code will redefine that function as 0 in that. This is ok + * for now as we only use slices with hugetlbfs enabled. This should + * be fixed as the generic code gets fixed. 
+ */ +int slice_is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, + unsigned long len) +{ + const struct slice_mask *maskp; + unsigned int psize = mm_ctx_user_psize(&mm->context); + + VM_BUG_ON(radix_enabled()); + + maskp = slice_mask_for_size(&mm->context, psize); + + /* We need to account for 4k slices too */ + if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && psize == MMU_PAGE_64K) { + const struct slice_mask *compat_maskp; + struct slice_mask available; + + compat_maskp = slice_mask_for_size(&mm->context, MMU_PAGE_4K); + slice_or_mask(&available, maskp, compat_maskp); + return !slice_check_range_fits(mm, &available, addr, len); + } + + return !slice_check_range_fits(mm, maskp, addr, len); +} + +unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) +{ + /* With radix we don't use slice, so derive it from vma*/ + if (radix_enabled()) + return vma_kernel_pagesize(vma); + + return 1UL << mmu_psize_to_shift(get_slice_psize(vma->vm_mm, vma->vm_start)); +} +#endif diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c index aa74acb0fdfc..ec98e526167e 100644 --- a/arch/powerpc/mm/subpage-prot.c +++ b/arch/powerpc/mm/book3s64/subpage_prot.c @@ -1,22 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright 2007-2008 Paul Mackerras, IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/gfp.h> #include <linux/types.h> -#include <linux/mm.h> +#include <linux/pagewalk.h> #include <linux/hugetlb.h> +#include <linux/syscalls.h> -#include <asm/pgtable.h> -#include <asm/uaccess.h> -#include <asm/tlbflush.h> +#include <linux/pgtable.h> +#include <linux/uaccess.h> /* * Free all pages allocated for subpage protection maps and pointers. 
@@ -25,10 +21,13 @@ */ void subpage_prot_free(struct mm_struct *mm) { - struct subpage_prot_table *spt = &mm->context.spt; + struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); unsigned long i, j, addr; u32 **p; + if (!spt) + return; + for (i = 0; i < 4; ++i) { if (spt->low_prot[i]) { free_page((unsigned long)spt->low_prot[i]); @@ -36,7 +35,7 @@ void subpage_prot_free(struct mm_struct *mm) } } addr = 0; - for (i = 0; i < 2; ++i) { + for (i = 0; i < (TASK_SIZE_USER64 >> 43); ++i) { p = spt->protptrs[i]; if (!p) continue; @@ -48,37 +47,35 @@ void subpage_prot_free(struct mm_struct *mm) free_page((unsigned long)p); } spt->maxaddr = 0; -} - -void subpage_prot_init_new_context(struct mm_struct *mm) -{ - struct subpage_prot_table *spt = &mm->context.spt; - - memset(spt, 0, sizeof(*spt)); + kfree(spt); } static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, int npages) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; spinlock_t *ptl; pgd = pgd_offset(mm, addr); - if (pgd_none(*pgd)) + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) return; - pud = pud_offset(pgd, addr); + pud = pud_offset(p4d, addr); if (pud_none(*pud)) return; pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) return; pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + if (!pte) + return; arch_enter_lazy_mmu_mode(); for (; npages > 0; --npages) { - pte_update(mm, addr, pte, 0, 0); + pte_update(mm, addr, pte, 0, 0, 0); addr += PAGE_SIZE; ++pte; } @@ -93,19 +90,24 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, static void subpage_prot_clear(unsigned long addr, unsigned long len) { struct mm_struct *mm = current->mm; - struct subpage_prot_table *spt = &mm->context.spt; + struct subpage_prot_table *spt; u32 **spm, *spp; unsigned long i; size_t nw; unsigned long next, limit; - down_write(&mm->mmap_sem); + mmap_write_lock(mm); + + spt = mm_ctx_subpage_prot(&mm->context); + if (!spt) + goto err_out; + limit = addr + len; if (limit > spt->maxaddr) limit = spt->maxaddr; for (; addr < limit; addr = next) { next = pmd_addr_end(addr, limit); - if (addr < 0x100000000) { + if (addr < 0x100000000UL) { spm = spt->low_prot; } else { spm = spt->protptrs[addr >> SBP_L3_SHIFT]; @@ -127,46 +129,38 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len) /* now flush any existing HPTEs for the range */ hpte_flush_range(mm, addr, nw); } - up_write(&mm->mmap_sem); + +err_out: + mmap_write_unlock(mm); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { - struct vm_area_struct *vma = walk->private; - split_huge_page_pmd(vma, addr, pmd); + struct vm_area_struct *vma = walk->vma; + split_huge_pmd(vma, pmd, addr); return 0; } +static const struct mm_walk_ops subpage_walk_ops = { + .pmd_entry = subpage_walk_pmd_entry, + .walk_lock = PGWALK_WRLOCK_VERIFY, +}; + static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, unsigned long len) { struct vm_area_struct *vma; - struct mm_walk subpage_proto_walk = { - .mm = mm, - .pmd_entry = subpage_walk_pmd_entry, - }; + VMA_ITERATOR(vmi, mm, addr); /* * We don't try too hard, we just mark all the vma in that range * VM_NOHUGEPAGE and split them. 
*/ - vma = find_vma(mm, addr); - /* - * If the range is in unmapped range, just return - */ - if (vma && ((addr + len) <= vma->vm_start)) - return; - - while (vma) { - if (vma->vm_start >= (addr + len)) - break; - vma->vm_flags |= VM_NOHUGEPAGE; - subpage_proto_walk.private = vma; - walk_page_range(vma->vm_start, vma->vm_end, - &subpage_proto_walk); - vma = vma->vm_next; + for_each_vma_range(vmi, vma, addr + len) { + vm_flags_set(vma, VM_NOHUGEPAGE); + walk_page_vma(vma, &subpage_walk_ops, NULL); } } #else @@ -187,19 +181,24 @@ static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, * in a 2-bit field won't allow writes to a page that is otherwise * write-protected. */ -long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map) +SYSCALL_DEFINE3(subpage_prot, unsigned long, addr, + unsigned long, len, u32 __user *, map) { struct mm_struct *mm = current->mm; - struct subpage_prot_table *spt = &mm->context.spt; + struct subpage_prot_table *spt; u32 **spm, *spp; unsigned long i; size_t nw; unsigned long next, limit; int err; + if (radix_enabled()) + return -ENOENT; + /* Check parameters */ if ((addr & ~PAGE_MASK) || (len & ~PAGE_MASK) || - addr >= TASK_SIZE || len >= TASK_SIZE || addr + len > TASK_SIZE) + addr >= mm->task_size || len >= mm->task_size || + addr + len > mm->task_size) return -EINVAL; if (is_hugepage_only_range(mm, addr, len)) @@ -211,15 +210,30 @@ long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map) return 0; } - if (!access_ok(VERIFY_READ, map, (len >> PAGE_SHIFT) * sizeof(u32))) + if (!access_ok(map, (len >> PAGE_SHIFT) * sizeof(u32))) return -EFAULT; - down_write(&mm->mmap_sem); + mmap_write_lock(mm); + + spt = mm_ctx_subpage_prot(&mm->context); + if (!spt) { + /* + * Allocate subpage prot table if not already done. + * Do this with mmap_lock held + */ + spt = kzalloc(sizeof(struct subpage_prot_table), GFP_KERNEL); + if (!spt) { + err = -ENOMEM; + goto out; + } + mm->context.hash_context->spt = spt; + } + subpage_mark_vma_nohuge(mm, addr, len); for (limit = addr + len; addr < limit; addr = next) { next = pmd_addr_end(addr, limit); err = -ENOMEM; - if (addr < 0x100000000) { + if (addr < 0x100000000UL) { spm = spt->low_prot; } else { spm = spt->protptrs[addr >> SBP_L3_SHIFT]; @@ -249,12 +263,11 @@ long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map) if (addr + (nw << PAGE_SHIFT) > next) nw = (next - addr) >> PAGE_SHIFT; - up_write(&mm->mmap_sem); - err = -EFAULT; + mmap_write_unlock(mm); if (__copy_from_user(spp, map, nw * sizeof(u32))) - goto out2; + return -EFAULT; map += nw; - down_write(&mm->mmap_sem); + mmap_write_lock(mm); /* now flush any existing HPTEs for the range */ hpte_flush_range(mm, addr, nw); @@ -263,7 +276,6 @@ long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map) spt->maxaddr = limit; err = 0; out: - up_write(&mm->mmap_sem); - out2: + mmap_write_unlock(mm); return err; } diff --git a/arch/powerpc/mm/book3s64/trace.c b/arch/powerpc/mm/book3s64/trace.c new file mode 100644 index 000000000000..ccd64b5e6cac --- /dev/null +++ b/arch/powerpc/mm/book3s64/trace.c @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * This file is for defining trace points and trace related helpers. 
+ */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#include <trace/events/thp.h> +#endif diff --git a/arch/powerpc/mm/cacheflush.c b/arch/powerpc/mm/cacheflush.c new file mode 100644 index 000000000000..7186516eca52 --- /dev/null +++ b/arch/powerpc/mm/cacheflush.c @@ -0,0 +1,221 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/highmem.h> +#include <linux/kprobes.h> + +/** + * flush_coherent_icache() - if a CPU has a coherent icache, flush it + * Return true if the cache was flushed, false otherwise + */ +static inline bool flush_coherent_icache(void) +{ + /* + * For a snooping icache, we still need a dummy icbi to purge all the + * prefetched instructions from the ifetch buffers. We also need a sync + * before the icbi to order the actual stores to memory that might + * have modified instructions with the icbi. + */ + if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) { + mb(); /* sync */ + icbi((void *)PAGE_OFFSET); + mb(); /* sync */ + isync(); + return true; + } + + return false; +} + +/** + * invalidate_icache_range() - Flush the icache by issuing icbi across an address range + * @start: the start address + * @stop: the stop address (exclusive) + */ +static void invalidate_icache_range(unsigned long start, unsigned long stop) +{ + unsigned long shift = l1_icache_shift(); + unsigned long bytes = l1_icache_bytes(); + char *addr = (char *)(start & ~(bytes - 1)); + unsigned long size = stop - (unsigned long)addr + (bytes - 1); + unsigned long i; + + for (i = 0; i < size >> shift; i++, addr += bytes) + icbi(addr); + + mb(); /* sync */ + isync(); +} + +/** + * flush_icache_range: Write any modified data cache blocks out to memory + * and invalidate the corresponding blocks in the instruction cache + * + * Generic code will call this after writing memory, before executing from it. + * + * @start: the start address + * @stop: the stop address (exclusive) + */ +void flush_icache_range(unsigned long start, unsigned long stop) +{ + if (flush_coherent_icache()) + return; + + clean_dcache_range(start, stop); + + if (IS_ENABLED(CONFIG_44x)) { + /* + * Flash invalidate on 44x because we are passed kmapped + * addresses and this doesn't work for userspace pages due to + * the virtually tagged icache. + */ + iccci((void *)start); + mb(); /* sync */ + isync(); + } else + invalidate_icache_range(start, stop); +} +EXPORT_SYMBOL(flush_icache_range); + +#ifdef CONFIG_HIGHMEM +/** + * flush_dcache_icache_phys() - Flush a page by its physical address + * @physaddr: the physical address of the page + */ +static void flush_dcache_icache_phys(unsigned long physaddr) +{ + unsigned long bytes = l1_dcache_bytes(); + unsigned long nb = PAGE_SIZE / bytes; + unsigned long addr = physaddr & PAGE_MASK; + unsigned long msr, msr0; + unsigned long loop1 = addr, loop2 = addr; + + msr0 = mfmsr(); + msr = msr0 & ~MSR_DR; + /* + * This must remain as ASM to prevent potential memory accesses + * while the data MMU is disabled + */ + asm volatile( + " mtctr %2;\n" + " mtmsr %3;\n" + " isync;\n" + "0: dcbst 0, %0;\n" + " addi %0, %0, %4;\n" + " bdnz 0b;\n" + " sync;\n" + " mtctr %2;\n" + "1: icbi 0, %1;\n" + " addi %1, %1, %4;\n" + " bdnz 1b;\n" + " sync;\n" + " mtmsr %5;\n" + " isync;\n" + : "+&r" (loop1), "+&r" (loop2) + : "r" (nb), "r" (msr), "i" (bytes), "r" (msr0) + : "ctr", "memory"); +} +NOKPROBE_SYMBOL(flush_dcache_icache_phys) +#else +static void flush_dcache_icache_phys(unsigned long physaddr) +{ +} +#endif + +/** + * __flush_dcache_icache(): Flush a particular page from the data cache to RAM. 
+ * Note: this is necessary because the instruction cache does *not* + * snoop from the data cache. + * + * @p: the address of the page to flush + */ +static void __flush_dcache_icache(void *p) +{ + unsigned long addr = (unsigned long)p & PAGE_MASK; + + clean_dcache_range(addr, addr + PAGE_SIZE); + + /* + * We don't flush the icache on 44x. Those have a virtual icache and we + * don't have access to the virtual address here (it's not the page + * vaddr but where it's mapped in user space). The flushing of the + * icache on these is handled elsewhere, when a change in the address + * space occurs, before returning to user space. + */ + + if (mmu_has_feature(MMU_FTR_TYPE_44x)) + return; + + invalidate_icache_range(addr, addr + PAGE_SIZE); +} + +void flush_dcache_icache_folio(struct folio *folio) +{ + unsigned int i, nr = folio_nr_pages(folio); + + if (flush_coherent_icache()) + return; + + if (!folio_test_highmem(folio)) { + void *addr = folio_address(folio); + for (i = 0; i < nr; i++) + __flush_dcache_icache(addr + i * PAGE_SIZE); + } else if (IS_ENABLED(CONFIG_BOOKE) || sizeof(phys_addr_t) > sizeof(void *)) { + for (i = 0; i < nr; i++) { + void *start = kmap_local_folio(folio, i * PAGE_SIZE); + + __flush_dcache_icache(start); + kunmap_local(start); + } + } else { + unsigned long pfn = folio_pfn(folio); + for (i = 0; i < nr; i++) + flush_dcache_icache_phys((pfn + i) * PAGE_SIZE); + } +} +EXPORT_SYMBOL(flush_dcache_icache_folio); + +void clear_user_page(void *page, unsigned long vaddr, struct page *pg) +{ + clear_page(page); + + /* + * We shouldn't have to do this, but some versions of glibc + * require it (ld.so assumes zero filled pages are icache clean) + * - Anton + */ + flush_dcache_page(pg); +} +EXPORT_SYMBOL(clear_user_page); + +void copy_user_page(void *vto, void *vfrom, unsigned long vaddr, + struct page *pg) +{ + copy_page(vto, vfrom); + + /* + * We should be able to use the following optimisation, however + * there are two problems. + * Firstly a bug in some versions of binutils meant PLT sections + * were not marked executable. + * Secondly the first word in the GOT section is blrl, used + * to establish the GOT address. Until recently the GOT was + * not marked executable. + * - Anton + */ +#if 0 + if (!vma->vm_file && ((vma->vm_flags & VM_EXEC) == 0)) + return; +#endif + + flush_dcache_page(pg); +} + +void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, + unsigned long addr, int len) +{ + void *maddr; + + maddr = kmap_local_page(page) + (addr & ~PAGE_MASK); + flush_icache_range((unsigned long)maddr, (unsigned long)maddr + len); + kunmap_local(maddr); +} diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c new file mode 100644 index 000000000000..f5f8692e2c69 --- /dev/null +++ b/arch/powerpc/mm/copro_fault.c @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * CoProcessor (SPU/AFU) mm fault handler + * + * (C) Copyright IBM Deutschland Entwicklung GmbH 2007 + * + * Author: Arnd Bergmann <arndb@de.ibm.com> + * Author: Jeremy Kerr <jk@ozlabs.org> + */ +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/export.h> +#include <asm/reg.h> +#include <asm/copro.h> + +/* + * This ought to be kept in sync with the powerpc specific do_page_fault + * function. Currently, there are a few corner cases that we haven't had + * to handle fortunately. 
+ */ +int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea, + unsigned long dsisr, vm_fault_t *flt) +{ + struct vm_area_struct *vma; + unsigned long is_write; + int ret; + + if (mm == NULL) + return -EFAULT; + + if (mm->pgd == NULL) + return -EFAULT; + + vma = lock_mm_and_find_vma(mm, ea, NULL); + if (!vma) + return -EFAULT; + + ret = -EFAULT; + is_write = dsisr & DSISR_ISSTORE; + if (is_write) { + if (!(vma->vm_flags & VM_WRITE)) + goto out_unlock; + } else { + if (!(vma->vm_flags & (VM_READ | VM_EXEC))) + goto out_unlock; + /* + * PROT_NONE is covered by the VMA check above. + * and hash should get a NOHPTE fault instead of + * a PROTFAULT in case fixup is needed for things + * like autonuma. + */ + if (!radix_enabled()) + WARN_ON_ONCE(dsisr & DSISR_PROTFAULT); + } + + ret = 0; + *flt = handle_mm_fault(vma, ea, is_write ? FAULT_FLAG_WRITE : 0, NULL); + + /* The fault is fully completed (including releasing mmap lock) */ + if (*flt & VM_FAULT_COMPLETED) + return 0; + + if (unlikely(*flt & VM_FAULT_ERROR)) { + if (*flt & VM_FAULT_OOM) { + ret = -ENOMEM; + goto out_unlock; + } else if (*flt & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) { + ret = -EFAULT; + goto out_unlock; + } + BUG(); + } + +out_unlock: + mmap_read_unlock(mm); + return ret; +} +EXPORT_SYMBOL_GPL(copro_handle_mm_fault); + +#ifdef CONFIG_PPC_64S_HASH_MMU +int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb) +{ + u64 vsid, vsidkey; + int psize, ssize; + + switch (get_region_id(ea)) { + case USER_REGION_ID: + pr_devel("%s: 0x%llx -- USER_REGION_ID\n", __func__, ea); + if (mm == NULL) + return 1; + psize = get_slice_psize(mm, ea); + ssize = user_segment_size(ea); + vsid = get_user_vsid(&mm->context, ea, ssize); + vsidkey = SLB_VSID_USER; + break; + case VMALLOC_REGION_ID: + pr_devel("%s: 0x%llx -- VMALLOC_REGION_ID\n", __func__, ea); + psize = mmu_vmalloc_psize; + ssize = mmu_kernel_ssize; + vsid = get_kernel_vsid(ea, mmu_kernel_ssize); + vsidkey = SLB_VSID_KERNEL; + break; + case IO_REGION_ID: + pr_devel("%s: 0x%llx -- IO_REGION_ID\n", __func__, ea); + psize = mmu_io_psize; + ssize = mmu_kernel_ssize; + vsid = get_kernel_vsid(ea, mmu_kernel_ssize); + vsidkey = SLB_VSID_KERNEL; + break; + case LINEAR_MAP_REGION_ID: + pr_devel("%s: 0x%llx -- LINEAR_MAP_REGION_ID\n", __func__, ea); + psize = mmu_linear_psize; + ssize = mmu_kernel_ssize; + vsid = get_kernel_vsid(ea, mmu_kernel_ssize); + vsidkey = SLB_VSID_KERNEL; + break; + default: + pr_debug("%s: invalid region access at %016llx\n", __func__, ea); + return 1; + } + /* Bad address */ + if (!vsid) + return 1; + + vsid = (vsid << slb_vsid_shift(ssize)) | vsidkey; + + vsid |= mmu_psize_defs[psize].sllp | + ((ssize == MMU_SEGSIZE_1T) ? SLB_VSID_B_1T : 0); + + slb->esid = (ea & (ssize == MMU_SEGSIZE_1T ? ESID_MASK_1T : ESID_MASK)) | SLB_ESID_V; + slb->vsid = vsid; + + return 0; +} +EXPORT_SYMBOL_GPL(copro_calculate_slb); +#endif diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c index 6747eece84af..30260b5d146d 100644 --- a/arch/powerpc/mm/dma-noncoherent.c +++ b/arch/powerpc/mm/dma-noncoherent.c @@ -1,321 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * PowerPC version derived from arch/arm/mm/consistent.c * Copyright (C) 2001 Dan Malek (dmalek@jlc.net) * * Copyright (C) 2000 Russell King - * - * Consistent memory allocators. Used for DMA devices that want to - * share uncached memory with the processor core. The function return - * is the virtual address and 'dma_handle' is the physical address. 
- * Mostly stolen from the ARM port, with some changes for PowerPC. - * -- Dan - * - * Reorganized to get rid of the arch-specific consistent_* functions - * and provide non-coherent implementations for the DMA API. -Matt - * - * Added in_interrupt() safe dma_alloc_coherent()/dma_free_coherent() - * implementation. This is pulled straight from ARM and barely - * modified. -Matt - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ -#include <linux/sched.h> -#include <linux/slab.h> #include <linux/kernel.h> #include <linux/errno.h> -#include <linux/string.h> #include <linux/types.h> #include <linux/highmem.h> -#include <linux/dma-mapping.h> -#include <linux/export.h> +#include <linux/dma-direct.h> +#include <linux/dma-map-ops.h> #include <asm/tlbflush.h> - -#include "mmu_decl.h" - -/* - * This address range defaults to a value that is safe for all - * platforms which currently set CONFIG_NOT_COHERENT_CACHE. It - * can be further configured for specific applications under - * the "Advanced Setup" menu. -Matt - */ -#define CONSISTENT_BASE (IOREMAP_TOP) -#define CONSISTENT_END (CONSISTENT_BASE + CONFIG_CONSISTENT_SIZE) -#define CONSISTENT_OFFSET(x) (((unsigned long)(x) - CONSISTENT_BASE) >> PAGE_SHIFT) - -/* - * This is the page table (2MB) covering uncached, DMA consistent allocations - */ -static DEFINE_SPINLOCK(consistent_lock); - -/* - * VM region handling support. - * - * This should become something generic, handling VM region allocations for - * vmalloc and similar (ioremap, module space, etc). - * - * I envisage vmalloc()'s supporting vm_struct becoming: - * - * struct vm_struct { - * struct vm_region region; - * unsigned long flags; - * struct page **pages; - * unsigned int nr_pages; - * unsigned long phys_addr; - * }; - * - * get_vm_area() would then call vm_region_alloc with an appropriate - * struct vm_region head (eg): - * - * struct vm_region vmalloc_head = { - * .vm_list = LIST_HEAD_INIT(vmalloc_head.vm_list), - * .vm_start = VMALLOC_START, - * .vm_end = VMALLOC_END, - * }; - * - * However, vmalloc_head.vm_start is variable (typically, it is dependent on - * the amount of RAM found at boot time.) I would imagine that get_vm_area() - * would have to initialise this each time prior to calling vm_region_alloc(). - */ -struct ppc_vm_region { - struct list_head vm_list; - unsigned long vm_start; - unsigned long vm_end; -}; - -static struct ppc_vm_region consistent_head = { - .vm_list = LIST_HEAD_INIT(consistent_head.vm_list), - .vm_start = CONSISTENT_BASE, - .vm_end = CONSISTENT_END, -}; - -static struct ppc_vm_region * -ppc_vm_region_alloc(struct ppc_vm_region *head, size_t size, gfp_t gfp) -{ - unsigned long addr = head->vm_start, end = head->vm_end - size; - unsigned long flags; - struct ppc_vm_region *c, *new; - - new = kmalloc(sizeof(struct ppc_vm_region), gfp); - if (!new) - goto out; - - spin_lock_irqsave(&consistent_lock, flags); - - list_for_each_entry(c, &head->vm_list, vm_list) { - if ((addr + size) < addr) - goto nospc; - if ((addr + size) <= c->vm_start) - goto found; - addr = c->vm_end; - if (addr > end) - goto nospc; - } - - found: - /* - * Insert this entry _before_ the one we found. 
- */ - list_add_tail(&new->vm_list, &c->vm_list); - new->vm_start = addr; - new->vm_end = addr + size; - - spin_unlock_irqrestore(&consistent_lock, flags); - return new; - - nospc: - spin_unlock_irqrestore(&consistent_lock, flags); - kfree(new); - out: - return NULL; -} - -static struct ppc_vm_region *ppc_vm_region_find(struct ppc_vm_region *head, unsigned long addr) -{ - struct ppc_vm_region *c; - - list_for_each_entry(c, &head->vm_list, vm_list) { - if (c->vm_start == addr) - goto out; - } - c = NULL; - out: - return c; -} - -/* - * Allocate DMA-coherent memory space and return both the kernel remapped - * virtual and bus address for that space. - */ -void * -__dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp) -{ - struct page *page; - struct ppc_vm_region *c; - unsigned long order; - u64 mask = ISA_DMA_THRESHOLD, limit; - - if (dev) { - mask = dev->coherent_dma_mask; - - /* - * Sanity check the DMA mask - it must be non-zero, and - * must be able to be satisfied by a DMA allocation. - */ - if (mask == 0) { - dev_warn(dev, "coherent DMA mask is unset\n"); - goto no_page; - } - - if ((~mask) & ISA_DMA_THRESHOLD) { - dev_warn(dev, "coherent DMA mask %#llx is smaller " - "than system GFP_DMA mask %#llx\n", - mask, (unsigned long long)ISA_DMA_THRESHOLD); - goto no_page; - } - } - - - size = PAGE_ALIGN(size); - limit = (mask + 1) & ~mask; - if ((limit && size >= limit) || - size >= (CONSISTENT_END - CONSISTENT_BASE)) { - printk(KERN_WARNING "coherent allocation too big (requested %#x mask %#Lx)\n", - size, mask); - return NULL; - } - - order = get_order(size); - - /* Might be useful if we ever have a real legacy DMA zone... */ - if (mask != 0xffffffff) - gfp |= GFP_DMA; - - page = alloc_pages(gfp, order); - if (!page) - goto no_page; - - /* - * Invalidate any data that might be lurking in the - * kernel direct-mapped region for device DMA. - */ - { - unsigned long kaddr = (unsigned long)page_address(page); - memset(page_address(page), 0, size); - flush_dcache_range(kaddr, kaddr + size); - } - - /* - * Allocate a virtual address in the consistent mapping region. - */ - c = ppc_vm_region_alloc(&consistent_head, size, - gfp & ~(__GFP_DMA | __GFP_HIGHMEM)); - if (c) { - unsigned long vaddr = c->vm_start; - struct page *end = page + (1 << order); - - split_page(page, order); - - /* - * Set the "dma handle" - */ - *handle = page_to_phys(page); - - do { - SetPageReserved(page); - map_page(vaddr, page_to_phys(page), - pgprot_noncached(PAGE_KERNEL)); - page++; - vaddr += PAGE_SIZE; - } while (size -= PAGE_SIZE); - - /* - * Free the otherwise unused pages. - */ - while (page < end) { - __free_page(page); - page++; - } - - return (void *)c->vm_start; - } - - if (page) - __free_pages(page, order); - no_page: - return NULL; -} -EXPORT_SYMBOL(__dma_alloc_coherent); - -/* - * free a page as defined by the above mapping. 
- */ -void __dma_free_coherent(size_t size, void *vaddr) -{ - struct ppc_vm_region *c; - unsigned long flags, addr; - - size = PAGE_ALIGN(size); - - spin_lock_irqsave(&consistent_lock, flags); - - c = ppc_vm_region_find(&consistent_head, (unsigned long)vaddr); - if (!c) - goto no_area; - - if ((c->vm_end - c->vm_start) != size) { - printk(KERN_ERR "%s: freeing wrong coherent size (%ld != %d)\n", - __func__, c->vm_end - c->vm_start, size); - dump_stack(); - size = c->vm_end - c->vm_start; - } - - addr = c->vm_start; - do { - pte_t *ptep; - unsigned long pfn; - - ptep = pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(addr), - addr), - addr), - addr); - if (!pte_none(*ptep) && pte_present(*ptep)) { - pfn = pte_pfn(*ptep); - pte_clear(&init_mm, addr, ptep); - if (pfn_valid(pfn)) { - struct page *page = pfn_to_page(pfn); - - ClearPageReserved(page); - __free_page(page); - } - } - addr += PAGE_SIZE; - } while (size -= PAGE_SIZE); - - flush_tlb_kernel_range(c->vm_start, c->vm_end); - - list_del(&c->vm_list); - - spin_unlock_irqrestore(&consistent_lock, flags); - - kfree(c); - return; - - no_area: - spin_unlock_irqrestore(&consistent_lock, flags); - printk(KERN_ERR "%s: trying to free invalid coherent area: %p\n", - __func__, vaddr); - dump_stack(); -} -EXPORT_SYMBOL(__dma_free_coherent); +#include <asm/dma.h> /* * make an area consistent. */ -void __dma_sync(void *vaddr, size_t size, int direction) +static void __dma_sync(void *vaddr, size_t size, int direction) { unsigned long start = (unsigned long)vaddr; unsigned long end = start + size; @@ -328,7 +32,7 @@ void __dma_sync(void *vaddr, size_t size, int direction) * invalidate only when cache-line aligned otherwise there is * the potential for discarding uncommitted data from the cache */ - if ((start & (L1_CACHE_BYTES - 1)) || (size & (L1_CACHE_BYTES - 1))) + if ((start | end) & (L1_CACHE_BYTES - 1)) flush_dcache_range(start, end); else invalidate_dcache_range(start, end); @@ -341,7 +45,6 @@ void __dma_sync(void *vaddr, size_t size, int direction) break; } } -EXPORT_SYMBOL(__dma_sync); #ifdef CONFIG_HIGHMEM /* @@ -388,34 +91,34 @@ static inline void __dma_sync_page_highmem(struct page *page, * __dma_sync_page makes memory consistent. identical to __dma_sync, but * takes a struct page instead of a virtual address */ -void __dma_sync_page(struct page *page, unsigned long offset, - size_t size, int direction) +static void __dma_sync_page(phys_addr_t paddr, size_t size, int dir) { + struct page *page = pfn_to_page(paddr >> PAGE_SHIFT); + unsigned offset = paddr & ~PAGE_MASK; + #ifdef CONFIG_HIGHMEM - __dma_sync_page_highmem(page, offset, size, direction); + __dma_sync_page_highmem(page, offset, size, dir); #else unsigned long start = (unsigned long)page_address(page) + offset; - __dma_sync((void *)start, size, direction); + __dma_sync((void *)start, size, dir); #endif } -EXPORT_SYMBOL(__dma_sync_page); -/* - * Return the PFN for a given cpu virtual address returned by - * __dma_alloc_coherent. This is used by dma_mmap_coherent() - */ -unsigned long __dma_get_coherent_pfn(unsigned long cpu_addr) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) +{ + __dma_sync_page(paddr, size, dir); +} + +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) +{ + __dma_sync_page(paddr, size, dir); +} + +void arch_dma_prep_coherent(struct page *page, size_t size) { - /* This should always be populated, so we don't test every - * level. 
If that fails, we'll have a nice crash which - * will be as good as a BUG_ON() - */ - pgd_t *pgd = pgd_offset_k(cpu_addr); - pud_t *pud = pud_offset(pgd, cpu_addr); - pmd_t *pmd = pmd_offset(pud, cpu_addr); - pte_t *ptep = pte_offset_kernel(pmd, cpu_addr); + unsigned long kaddr = (unsigned long)page_address(page); - if (pte_none(*ptep) || !pte_present(*ptep)) - return 0; - return pte_pfn(*ptep); + flush_dcache_range(kaddr, kaddr + size); } diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c new file mode 100644 index 000000000000..8dd7b340d51f --- /dev/null +++ b/arch/powerpc/mm/drmem.c @@ -0,0 +1,514 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Dynamic reconfiguration memory support + * + * Copyright 2017 IBM Corporation + */ + +#define pr_fmt(fmt) "drmem: " fmt + +#include <linux/kernel.h> +#include <linux/of.h> +#include <linux/of_fdt.h> +#include <linux/memblock.h> +#include <linux/slab.h> +#include <asm/drmem.h> + +static int n_root_addr_cells, n_root_size_cells; + +static struct drmem_lmb_info __drmem_info; +struct drmem_lmb_info *drmem_info = &__drmem_info; +static bool in_drmem_update; + +u64 drmem_lmb_memory_max(void) +{ + struct drmem_lmb *last_lmb; + + last_lmb = &drmem_info->lmbs[drmem_info->n_lmbs - 1]; + return last_lmb->base_addr + drmem_lmb_size(); +} + +static u32 drmem_lmb_flags(struct drmem_lmb *lmb) +{ + /* + * Return the value of the lmb flags field minus the reserved + * bit used internally for hotplug processing. + */ + return lmb->flags & ~DRMEM_LMB_RESERVED; +} + +static struct property *clone_property(struct property *prop, u32 prop_sz) +{ + struct property *new_prop; + + new_prop = kzalloc(sizeof(*new_prop), GFP_KERNEL); + if (!new_prop) + return NULL; + + new_prop->name = kstrdup(prop->name, GFP_KERNEL); + new_prop->value = kzalloc(prop_sz, GFP_KERNEL); + if (!new_prop->name || !new_prop->value) { + kfree(new_prop->name); + kfree(new_prop->value); + kfree(new_prop); + return NULL; + } + + new_prop->length = prop_sz; +#if defined(CONFIG_OF_DYNAMIC) + of_property_set_flag(new_prop, OF_DYNAMIC); +#endif + return new_prop; +} + +static int drmem_update_dt_v1(struct device_node *memory, + struct property *prop) +{ + struct property *new_prop; + struct of_drconf_cell_v1 *dr_cell; + struct drmem_lmb *lmb; + __be32 *p; + + new_prop = clone_property(prop, prop->length); + if (!new_prop) + return -1; + + p = new_prop->value; + *p++ = cpu_to_be32(drmem_info->n_lmbs); + + dr_cell = (struct of_drconf_cell_v1 *)p; + + for_each_drmem_lmb(lmb) { + dr_cell->base_addr = cpu_to_be64(lmb->base_addr); + dr_cell->drc_index = cpu_to_be32(lmb->drc_index); + dr_cell->aa_index = cpu_to_be32(lmb->aa_index); + dr_cell->flags = cpu_to_be32(drmem_lmb_flags(lmb)); + + dr_cell++; + } + + of_update_property(memory, new_prop); + return 0; +} + +static void init_drconf_v2_cell(struct of_drconf_cell_v2 *dr_cell, + struct drmem_lmb *lmb) +{ + dr_cell->base_addr = cpu_to_be64(lmb->base_addr); + dr_cell->drc_index = cpu_to_be32(lmb->drc_index); + dr_cell->aa_index = cpu_to_be32(lmb->aa_index); + dr_cell->flags = cpu_to_be32(drmem_lmb_flags(lmb)); +} + +static int drmem_update_dt_v2(struct device_node *memory, + struct property *prop) +{ + struct property *new_prop; + struct of_drconf_cell_v2 *dr_cell; + struct drmem_lmb *lmb, *prev_lmb; + u32 lmb_sets, prop_sz, seq_lmbs; + u32 *p; + + /* First pass, determine how many LMB sets are needed. 
*/ + lmb_sets = 0; + prev_lmb = NULL; + for_each_drmem_lmb(lmb) { + if (!prev_lmb) { + prev_lmb = lmb; + lmb_sets++; + continue; + } + + if (prev_lmb->aa_index != lmb->aa_index || + drmem_lmb_flags(prev_lmb) != drmem_lmb_flags(lmb)) + lmb_sets++; + + prev_lmb = lmb; + } + + prop_sz = lmb_sets * sizeof(*dr_cell) + sizeof(__be32); + new_prop = clone_property(prop, prop_sz); + if (!new_prop) + return -1; + + p = new_prop->value; + *p++ = cpu_to_be32(lmb_sets); + + dr_cell = (struct of_drconf_cell_v2 *)p; + + /* Second pass, populate the LMB set data */ + prev_lmb = NULL; + seq_lmbs = 0; + for_each_drmem_lmb(lmb) { + if (prev_lmb == NULL) { + /* Start of first LMB set */ + prev_lmb = lmb; + init_drconf_v2_cell(dr_cell, lmb); + seq_lmbs++; + continue; + } + + if (prev_lmb->aa_index != lmb->aa_index || + drmem_lmb_flags(prev_lmb) != drmem_lmb_flags(lmb)) { + /* end of one set, start of another */ + dr_cell->seq_lmbs = cpu_to_be32(seq_lmbs); + dr_cell++; + + init_drconf_v2_cell(dr_cell, lmb); + seq_lmbs = 1; + } else { + seq_lmbs++; + } + + prev_lmb = lmb; + } + + /* close out last LMB set */ + dr_cell->seq_lmbs = cpu_to_be32(seq_lmbs); + of_update_property(memory, new_prop); + return 0; +} + +int drmem_update_dt(void) +{ + struct device_node *memory; + struct property *prop; + int rc = -1; + + memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); + if (!memory) + return -1; + + /* + * Set in_drmem_update to prevent the notifier callback to process the + * DT property back since the change is coming from the LMB tree. + */ + in_drmem_update = true; + prop = of_find_property(memory, "ibm,dynamic-memory", NULL); + if (prop) { + rc = drmem_update_dt_v1(memory, prop); + } else { + prop = of_find_property(memory, "ibm,dynamic-memory-v2", NULL); + if (prop) + rc = drmem_update_dt_v2(memory, prop); + } + in_drmem_update = false; + + of_node_put(memory); + return rc; +} + +static void read_drconf_v1_cell(struct drmem_lmb *lmb, + const __be32 **prop) +{ + const __be32 *p = *prop; + + lmb->base_addr = of_read_number(p, n_root_addr_cells); + p += n_root_addr_cells; + lmb->drc_index = of_read_number(p++, 1); + + p++; /* skip reserved field */ + + lmb->aa_index = of_read_number(p++, 1); + lmb->flags = of_read_number(p++, 1); + + *prop = p; +} + +static int +__walk_drmem_v1_lmbs(const __be32 *prop, const __be32 *usm, void *data, + int (*func)(struct drmem_lmb *, const __be32 **, void *)) +{ + struct drmem_lmb lmb; + u32 i, n_lmbs; + int ret = 0; + + n_lmbs = of_read_number(prop++, 1); + for (i = 0; i < n_lmbs; i++) { + read_drconf_v1_cell(&lmb, &prop); + ret = func(&lmb, &usm, data); + if (ret) + break; + } + + return ret; +} + +static void read_drconf_v2_cell(struct of_drconf_cell_v2 *dr_cell, + const __be32 **prop) +{ + const __be32 *p = *prop; + + dr_cell->seq_lmbs = of_read_number(p++, 1); + dr_cell->base_addr = of_read_number(p, n_root_addr_cells); + p += n_root_addr_cells; + dr_cell->drc_index = of_read_number(p++, 1); + dr_cell->aa_index = of_read_number(p++, 1); + dr_cell->flags = of_read_number(p++, 1); + + *prop = p; +} + +static int +__walk_drmem_v2_lmbs(const __be32 *prop, const __be32 *usm, void *data, + int (*func)(struct drmem_lmb *, const __be32 **, void *)) +{ + struct of_drconf_cell_v2 dr_cell; + struct drmem_lmb lmb; + u32 i, j, lmb_sets; + int ret = 0; + + lmb_sets = of_read_number(prop++, 1); + for (i = 0; i < lmb_sets; i++) { + read_drconf_v2_cell(&dr_cell, &prop); + + for (j = 0; j < dr_cell.seq_lmbs; j++) { + lmb.base_addr = dr_cell.base_addr; + dr_cell.base_addr += 
drmem_lmb_size(); + + lmb.drc_index = dr_cell.drc_index; + dr_cell.drc_index++; + + lmb.aa_index = dr_cell.aa_index; + lmb.flags = dr_cell.flags; + + ret = func(&lmb, &usm, data); + if (ret) + break; + } + } + + return ret; +} + +#ifdef CONFIG_PPC_PSERIES +int __init walk_drmem_lmbs_early(unsigned long node, void *data, + int (*func)(struct drmem_lmb *, const __be32 **, void *)) +{ + const __be32 *prop, *usm; + int len, ret = -ENODEV; + + prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &len); + if (!prop || len < dt_root_size_cells * sizeof(__be32)) + return ret; + + /* Get the address & size cells */ + n_root_addr_cells = dt_root_addr_cells; + n_root_size_cells = dt_root_size_cells; + + drmem_info->lmb_size = dt_mem_next_cell(dt_root_size_cells, &prop); + + usm = of_get_flat_dt_prop(node, "linux,drconf-usable-memory", &len); + + prop = of_get_flat_dt_prop(node, "ibm,dynamic-memory", &len); + if (prop) { + ret = __walk_drmem_v1_lmbs(prop, usm, data, func); + } else { + prop = of_get_flat_dt_prop(node, "ibm,dynamic-memory-v2", + &len); + if (prop) + ret = __walk_drmem_v2_lmbs(prop, usm, data, func); + } + + memblock_dump_all(); + return ret; +} + +/* + * Update the LMB associativity index. + */ +static int update_lmb(struct drmem_lmb *updated_lmb, + __maybe_unused const __be32 **usm, + __maybe_unused void *data) +{ + struct drmem_lmb *lmb; + + for_each_drmem_lmb(lmb) { + if (lmb->drc_index != updated_lmb->drc_index) + continue; + + lmb->aa_index = updated_lmb->aa_index; + break; + } + return 0; +} + +/* + * Update the LMB associativity index. + * + * This needs to be called when the hypervisor is updating the + * dynamic-reconfiguration-memory node property. + */ +void drmem_update_lmbs(struct property *prop) +{ + /* + * Don't update the LMBs if triggered by the update done in + * drmem_update_dt(), the LMB values have been used to the update the DT + * property in that case. 
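Editor's aside: going the other way, __walk_drmem_v2_lmbs() above expands each set back into individual LMBs; within a set the base address advances by the LMB size and the DRC index increments by one per LMB. A minimal model of that expansion follows; the 256MB LMB size and the starting address/index are assumptions for illustration only.

#include <stdio.h>
#include <stdint.h>

/* Stand-ins for of_drconf_cell_v2 / drmem_lmb with only the fields
 * needed to show the expansion. */
struct cell_v2 { uint32_t seq_lmbs; uint64_t base_addr; uint32_t drc_index; };
struct lmb     { uint64_t base_addr; uint32_t drc_index; };

int main(void)
{
	const uint64_t lmb_size = 256ULL << 20;	/* assumed 256MB LMBs */
	struct cell_v2 cell = { 4, 0x10000000, 0x80000008 };
	struct lmb lmb;
	uint32_t j;

	for (j = 0; j < cell.seq_lmbs; j++) {
		lmb.base_addr = cell.base_addr + j * lmb_size;
		lmb.drc_index = cell.drc_index + j;
		printf("lmb %u: base 0x%llx drc 0x%x\n", j,
		       (unsigned long long)lmb.base_addr, lmb.drc_index);
	}
	return 0;
}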
+ */ + if (in_drmem_update) + return; + if (!strcmp(prop->name, "ibm,dynamic-memory")) + __walk_drmem_v1_lmbs(prop->value, NULL, NULL, update_lmb); + else if (!strcmp(prop->name, "ibm,dynamic-memory-v2")) + __walk_drmem_v2_lmbs(prop->value, NULL, NULL, update_lmb); +} +#endif + +static int init_drmem_lmb_size(struct device_node *dn) +{ + const __be32 *prop; + int len; + + if (drmem_info->lmb_size) + return 0; + + prop = of_get_property(dn, "ibm,lmb-size", &len); + if (!prop || len < n_root_size_cells * sizeof(__be32)) { + pr_info("Could not determine LMB size\n"); + return -1; + } + + drmem_info->lmb_size = of_read_number(prop, n_root_size_cells); + return 0; +} + +/* + * Returns the property linux,drconf-usable-memory if + * it exists (the property exists only in kexec/kdump kernels, + * added by kexec-tools) + */ +static const __be32 *of_get_usable_memory(struct device_node *dn) +{ + const __be32 *prop; + u32 len; + + prop = of_get_property(dn, "linux,drconf-usable-memory", &len); + if (!prop || len < sizeof(unsigned int)) + return NULL; + + return prop; +} + +int walk_drmem_lmbs(struct device_node *dn, void *data, + int (*func)(struct drmem_lmb *, const __be32 **, void *)) +{ + struct device_node *root = of_find_node_by_path("/"); + const __be32 *prop, *usm; + int ret = -ENODEV; + + if (!root) + return ret; + + /* Get the address & size cells */ + n_root_addr_cells = of_n_addr_cells(root); + n_root_size_cells = of_n_size_cells(root); + of_node_put(root); + + if (init_drmem_lmb_size(dn)) + return ret; + + usm = of_get_usable_memory(dn); + + prop = of_get_property(dn, "ibm,dynamic-memory", NULL); + if (prop) { + ret = __walk_drmem_v1_lmbs(prop, usm, data, func); + } else { + prop = of_get_property(dn, "ibm,dynamic-memory-v2", NULL); + if (prop) + ret = __walk_drmem_v2_lmbs(prop, usm, data, func); + } + + return ret; +} + +static void __init init_drmem_v1_lmbs(const __be32 *prop) +{ + struct drmem_lmb *lmb; + + drmem_info->n_lmbs = of_read_number(prop++, 1); + if (drmem_info->n_lmbs == 0) + return; + + drmem_info->lmbs = kcalloc(drmem_info->n_lmbs, sizeof(*lmb), + GFP_KERNEL); + if (!drmem_info->lmbs) + return; + + for_each_drmem_lmb(lmb) + read_drconf_v1_cell(lmb, &prop); +} + +static void __init init_drmem_v2_lmbs(const __be32 *prop) +{ + struct drmem_lmb *lmb; + struct of_drconf_cell_v2 dr_cell; + const __be32 *p; + u32 i, j, lmb_sets; + int lmb_index; + + lmb_sets = of_read_number(prop++, 1); + if (lmb_sets == 0) + return; + + /* first pass, calculate the number of LMBs */ + p = prop; + for (i = 0; i < lmb_sets; i++) { + read_drconf_v2_cell(&dr_cell, &p); + drmem_info->n_lmbs += dr_cell.seq_lmbs; + } + + drmem_info->lmbs = kcalloc(drmem_info->n_lmbs, sizeof(*lmb), + GFP_KERNEL); + if (!drmem_info->lmbs) + return; + + /* second pass, read in the LMB information */ + lmb_index = 0; + p = prop; + + for (i = 0; i < lmb_sets; i++) { + read_drconf_v2_cell(&dr_cell, &p); + + for (j = 0; j < dr_cell.seq_lmbs; j++) { + lmb = &drmem_info->lmbs[lmb_index++]; + + lmb->base_addr = dr_cell.base_addr; + dr_cell.base_addr += drmem_info->lmb_size; + + lmb->drc_index = dr_cell.drc_index; + dr_cell.drc_index++; + + lmb->aa_index = dr_cell.aa_index; + lmb->flags = dr_cell.flags; + } + } +} + +static int __init drmem_init(void) +{ + struct device_node *dn; + const __be32 *prop; + + dn = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); + if (!dn) + return 0; + + if (init_drmem_lmb_size(dn)) { + of_node_put(dn); + return 0; + } + + prop = of_get_property(dn, "ibm,dynamic-memory", NULL); + if 
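Editor's aside: several of the parsers above read numbers that span n_root_addr_cells or n_root_size_cells 32-bit big-endian cells. The helper below mirrors the documented behaviour of of_read_number() in plain userspace C, with a local byte-swap standing in for be32_to_cpu(), so the cell arithmetic is easier to see; the example property bytes are illustrative.

#include <stdio.h>
#include <stdint.h>

/* Load one big-endian 32-bit cell as it appears in the device tree. */
static uint32_t be32_load(const uint8_t *p)
{
	return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
	       ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
}

/* Equivalent of of_read_number(): concatenate 'cells' big-endian cells
 * into one 64-bit value, most significant cell first. */
static uint64_t read_number(const uint8_t *p, int cells)
{
	uint64_t r = 0;

	while (cells--) {
		r = (r << 32) | be32_load(p);
		p += 4;
	}
	return r;
}

int main(void)
{
	/* Two cells encoding the address 0x0000000200000000. */
	const uint8_t prop[] = { 0x00, 0x00, 0x00, 0x02,
				 0x00, 0x00, 0x00, 0x00 };

	printf("0x%llx\n", (unsigned long long)read_number(prop, 2));
	return 0;
}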
(prop) { + init_drmem_v1_lmbs(prop); + } else { + prop = of_get_property(dn, "ibm,dynamic-memory-v2", NULL); + if (prop) + init_drmem_v2_lmbs(prop); + } + + of_node_put(dn); + return 0; +} +late_initcall(drmem_init); diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 8726779e1409..806c74e0d5ab 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * PowerPC version * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) @@ -8,526 +9,689 @@ * Modified by Cort Dougan and Paul Mackerras. * * Modified for PPC64 by Dave Engebretsen (engebret@ibm.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/signal.h> #include <linux/sched.h> +#include <linux/sched/task_stack.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/string.h> +#include <linux/string_choices.h> #include <linux/types.h> +#include <linux/pagemap.h> #include <linux/ptrace.h> #include <linux/mman.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/highmem.h> -#include <linux/module.h> +#include <linux/extable.h> #include <linux/kprobes.h> #include <linux/kdebug.h> #include <linux/perf_event.h> -#include <linux/magic.h> #include <linux/ratelimit.h> #include <linux/context_tracking.h> +#include <linux/hugetlb.h> +#include <linux/uaccess.h> +#include <linux/kfence.h> +#include <linux/pkeys.h> #include <asm/firmware.h> +#include <asm/interrupt.h> #include <asm/page.h> -#include <asm/pgtable.h> #include <asm/mmu.h> #include <asm/mmu_context.h> -#include <asm/uaccess.h> -#include <asm/tlbflush.h> #include <asm/siginfo.h> #include <asm/debug.h> -#include <mm/mmu_decl.h> +#include <asm/kup.h> +#include <asm/inst.h> + -#include "icswx.h" +/* + * do_page_fault error handling helpers + */ -#ifdef CONFIG_KPROBES -static inline int notify_page_fault(struct pt_regs *regs) +static int +__bad_area_nosemaphore(struct pt_regs *regs, unsigned long address, int si_code) { - int ret = 0; + /* + * If we are in kernel mode, bail out with a SEGV, this will + * be caught by the assembly which will restore the non-volatile + * registers before calling bad_page_fault() + */ + if (!user_mode(regs)) + return SIGSEGV; - /* kprobe_running() needs smp_processor_id() */ - if (!user_mode(regs)) { - preempt_disable(); - if (kprobe_running() && kprobe_fault_handler(regs, 11)) - ret = 1; - preempt_enable(); - } + _exception(SIGSEGV, regs, si_code, address); - return ret; + return 0; } -#else -static inline int notify_page_fault(struct pt_regs *regs) + +static noinline int bad_area_nosemaphore(struct pt_regs *regs, unsigned long address) { - return 0; + return __bad_area_nosemaphore(regs, address, SEGV_MAPERR); } -#endif -/* - * Check whether the instruction at regs->nip is a store using - * an update addressing form which will update r1. 
- */ -static int store_updates_sp(struct pt_regs *regs) +static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code, + struct mm_struct *mm, struct vm_area_struct *vma) { - unsigned int inst; - if (get_user(inst, (unsigned int __user *)regs->nip)) - return 0; - /* check for 1 in the rA field */ - if (((inst >> 16) & 0x1f) != 1) - return 0; - /* check major opcode */ - switch (inst >> 26) { - case 37: /* stwu */ - case 39: /* stbu */ - case 45: /* sthu */ - case 53: /* stfsu */ - case 55: /* stfdu */ - return 1; - case 62: /* std or stdu */ - return (inst & 3) == 1; - case 31: - /* check minor opcode */ - switch ((inst >> 1) & 0x3ff) { - case 181: /* stdux */ - case 183: /* stwux */ - case 247: /* stbux */ - case 439: /* sthux */ - case 695: /* stfsux */ - case 759: /* stfdux */ - return 1; - } - } + /* + * Something tried to access memory that isn't in our memory map.. + * Fix it, but check if it's kernel or user first.. + */ + if (mm) + mmap_read_unlock(mm); + else + vma_end_read(vma); + + return __bad_area_nosemaphore(regs, address, si_code); +} + +static noinline int bad_access_pkey(struct pt_regs *regs, unsigned long address, + struct mm_struct *mm, + struct vm_area_struct *vma) +{ + int pkey; + + /* + * We don't try to fetch the pkey from page table because reading + * page table without locking doesn't guarantee stable pte value. + * Hence the pkey value that we return to userspace can be different + * from the pkey that actually caused access error. + * + * It does *not* guarantee that the VMA we find here + * was the one that we faulted on. + * + * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4); + * 2. T1 : set AMR to deny access to pkey=4, touches, page + * 3. T1 : faults... + * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5); + * 5. T1 : enters fault handler, takes mmap_lock, etc... + * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really + * faulted on a pte with its pkey=4. 
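Editor's aside: the removed store_updates_sp() earlier in this hunk keys off the PowerPC D-form encoding, where the primary opcode sits in the top 6 bits and the rA field in bits 16-20, so a store-with-update through r1 can be recognised without emulating the instruction. A small standalone check of that decoding, covering only the stwu case; the encoded instruction is an illustrative stwu r3,-32(r1).

#include <assert.h>
#include <stdint.h>

/* Minimal version of the rA / major-opcode test used by the removed
 * store_updates_sp(); only primary opcode 37 (stwu) is handled here. */
static int is_store_update_to_r1(uint32_t inst)
{
	if (((inst >> 16) & 0x1f) != 1)		/* rA must be r1 */
		return 0;
	return (inst >> 26) == 37;		/* primary opcode 37 = stwu */
}

int main(void)
{
	/* stwu r3,-32(r1): opcode 37, rS=3, rA=1, d=-32 (0xffe0). */
	uint32_t stwu = (37u << 26) | (3u << 21) | (1u << 16) | 0xffe0;

	assert(is_store_update_to_r1(stwu));
	return 0;
}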
+ */ + pkey = vma_pkey(vma); + + if (mm) + mmap_read_unlock(mm); + else + vma_end_read(vma); + + /* + * If we are in kernel mode, bail out with a SEGV, this will + * be caught by the assembly which will restore the non-volatile + * registers before calling bad_page_fault() + */ + if (!user_mode(regs)) + return SIGSEGV; + + _exception_pkey(regs, address, pkey); + return 0; } -/* - * do_page_fault error handling helpers - */ -#define MM_FAULT_RETURN 0 -#define MM_FAULT_CONTINUE -1 -#define MM_FAULT_ERR(sig) (sig) +static noinline int bad_access(struct pt_regs *regs, unsigned long address, + struct mm_struct *mm, struct vm_area_struct *vma) +{ + return __bad_area(regs, address, SEGV_ACCERR, mm, vma); +} -static int do_sigbus(struct pt_regs *regs, unsigned long address) +static int do_sigbus(struct pt_regs *regs, unsigned long address, + vm_fault_t fault) { - siginfo_t info; - - up_read(¤t->mm->mmap_sem); - - if (user_mode(regs)) { - current->thread.trap_nr = BUS_ADRERR; - info.si_signo = SIGBUS; - info.si_errno = 0; - info.si_code = BUS_ADRERR; - info.si_addr = (void __user *)address; - force_sig_info(SIGBUS, &info, current); - return MM_FAULT_RETURN; + if (!user_mode(regs)) + return SIGBUS; + + current->thread.trap_nr = BUS_ADRERR; +#ifdef CONFIG_MEMORY_FAILURE + if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { + unsigned int lsb = 0; /* shutup gcc */ + + pr_err("MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", + current->comm, current->pid, address); + + if (fault & VM_FAULT_HWPOISON_LARGE) + lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); + if (fault & VM_FAULT_HWPOISON) + lsb = PAGE_SHIFT; + + force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb); + return 0; } - return MM_FAULT_ERR(SIGBUS); + +#endif + force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address); + return 0; } -static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault) +static int mm_fault_error(struct pt_regs *regs, unsigned long addr, + vm_fault_t fault) { /* - * Pagefault was interrupted by SIGKILL. We have no reason to - * continue the pagefault. + * Kernel page fault interrupted by SIGKILL. We have no reason to + * continue processing. */ - if (fatal_signal_pending(current)) { - /* - * If we have retry set, the mmap semaphore will have - * alrady been released in __lock_page_or_retry(). Else - * we release it now. - */ - if (!(fault & VM_FAULT_RETRY)) - up_read(¤t->mm->mmap_sem); - /* Coming from kernel, we need to deal with uaccess fixups */ - if (user_mode(regs)) - return MM_FAULT_RETURN; - return MM_FAULT_ERR(SIGKILL); - } - - /* No fault: be happy */ - if (!(fault & VM_FAULT_ERROR)) - return MM_FAULT_CONTINUE; + if (fatal_signal_pending(current) && !user_mode(regs)) + return SIGKILL; /* Out of memory */ if (fault & VM_FAULT_OOM) { - up_read(¤t->mm->mmap_sem); - /* * We ran out of memory, or some other thing happened to us that * made us unable to handle the page fault gracefully. */ if (!user_mode(regs)) - return MM_FAULT_ERR(SIGKILL); + return SIGSEGV; pagefault_out_of_memory(); - return MM_FAULT_RETURN; + } else { + if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| + VM_FAULT_HWPOISON_LARGE)) + return do_sigbus(regs, addr, fault); + else if (fault & VM_FAULT_SIGSEGV) + return bad_area_nosemaphore(regs, addr); + else + BUG(); + } + return 0; +} + +/* Is this a bad kernel fault ? 
*/ +static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code, + unsigned long address, bool is_write) +{ + int is_exec = TRAP(regs) == INTERRUPT_INST_STORAGE; + + if (is_exec) { + pr_crit_ratelimited("kernel tried to execute %s page (%lx) - exploit attempt? (uid: %d)\n", + address >= TASK_SIZE ? "exec-protected" : "user", + address, + from_kuid(&init_user_ns, current_uid())); + + // Kernel exec fault is always bad + return true; + } + + // Kernel fault on kernel address is bad + if (address >= TASK_SIZE) + return true; + + // Read/write fault blocked by KUAP is bad, it can never succeed. + if (bad_kuap_fault(regs, address, is_write)) { + pr_crit_ratelimited("Kernel attempted to %s user page (%lx) - exploit attempt? (uid: %d)\n", + str_write_read(is_write), address, + from_kuid(&init_user_ns, current_uid())); + + // Fault on user outside of certain regions (eg. copy_tofrom_user()) is bad + if (!search_exception_tables(regs->nip)) + return true; + + // Read/write fault in a valid region (the exception table search passed + // above), but blocked by KUAP is bad, it can never succeed. + return WARN(true, "Bug: %s fault blocked by KUAP!", is_write ? "Write" : "Read"); + } + + // What's left? Kernel fault on user and allowed by KUAP in the faulting context. + return false; +} + +static bool access_pkey_error(bool is_write, bool is_exec, bool is_pkey, + struct vm_area_struct *vma) +{ + /* + * Make sure to check the VMA so that we do not perform + * faults just to hit a pkey fault as soon as we fill in a + * page. Only called for current mm, hence foreign == 0 + */ + if (!arch_vma_access_permitted(vma, is_write, is_exec, 0)) + return true; + + return false; +} + +static bool access_error(bool is_write, bool is_exec, struct vm_area_struct *vma) +{ + /* + * Allow execution from readable areas if the MMU does not + * provide separate controls over reading and executing. + * + * Note: That code used to not be enabled for 4xx/BookE. + * It is now as I/D cache coherency for these is done at + * set_pte_at() time and I see no reason why the test + * below wouldn't be valid on those processors. This -may- + * break programs compiled with a really old ABI though. + */ + if (is_exec) { + return !(vma->vm_flags & VM_EXEC) && + (cpu_has_feature(CPU_FTR_NOEXECUTE) || + !(vma->vm_flags & (VM_READ | VM_WRITE))); } - /* Bus error. x86 handles HWPOISON here, we'll add this if/when - * we support the feature in HW + if (is_write) { + if (unlikely(!(vma->vm_flags & VM_WRITE))) + return true; + return false; + } + + /* + * VM_READ, VM_WRITE and VM_EXEC may imply read permissions, as + * defined in protection_map[]. In that case Read faults can only be + * caused by a PROT_NONE mapping. However a non exec access on a + * VM_EXEC only mapping is invalid anyway, so report it as such. + */ + if (unlikely(!vma_is_accessible(vma))) + return true; + + if ((vma->vm_flags & VM_ACCESS_FLAGS) == VM_EXEC) + return true; + + /* + * We should ideally do the vma pkey access check here. But in the + * fault path, handle_mm_fault() also does the same check. To avoid + * these multiple checks, we skip it here and handle access error due + * to pkeys later. 
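Editor's aside: the read branch of access_error() above is the subtle one: a plain load may only legitimately fault on a mapping with no access bits at all (PROT_NONE) or on an execute-only mapping. The sketch below mirrors the structure of that function in userspace; the flag values are stand-ins rather than the kernel's vm_flags, and a plain boolean stands in for cpu_has_feature(CPU_FTR_NOEXECUTE).

#include <assert.h>
#include <stdbool.h>

#define VM_READ  0x1
#define VM_WRITE 0x2
#define VM_EXEC  0x4	/* illustrative flag values */

static bool access_error(bool is_write, bool is_exec, bool has_noexec,
			 unsigned int vm_flags)
{
	if (is_exec)
		return !(vm_flags & VM_EXEC) &&
		       (has_noexec || !(vm_flags & (VM_READ | VM_WRITE)));
	if (is_write)
		return !(vm_flags & VM_WRITE);
	/* Read fault: only PROT_NONE or execute-only mappings are invalid. */
	if (!(vm_flags & (VM_READ | VM_WRITE | VM_EXEC)))
		return true;
	return (vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) == VM_EXEC;
}

int main(void)
{
	assert(access_error(false, false, true, 0));		/* PROT_NONE read */
	assert(access_error(false, false, true, VM_EXEC));	/* exec-only read */
	assert(!access_error(false, false, true, VM_READ));	/* ordinary read */
	assert(!access_error(false, true, false, VM_READ));	/* exec OK: no NX, readable */
	assert(access_error(true, false, true, VM_READ));	/* write to read-only */
	return 0;
}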
+ */ + return false; +} + +#ifdef CONFIG_PPC_SMLPAR +static inline void cmo_account_page_fault(void) +{ + if (firmware_has_feature(FW_FEATURE_CMO)) { + u32 page_ins; + + preempt_disable(); + page_ins = be32_to_cpu(get_lppaca()->page_ins); + page_ins += 1 << PAGE_FACTOR; + get_lppaca()->page_ins = cpu_to_be32(page_ins); + preempt_enable(); + } +} +#else +static inline void cmo_account_page_fault(void) { } +#endif /* CONFIG_PPC_SMLPAR */ + +static void sanity_check_fault(bool is_write, bool is_user, + unsigned long error_code, unsigned long address) +{ + /* + * Userspace trying to access kernel address, we get PROTFAULT for that. + */ + if (is_user && address >= TASK_SIZE) { + if ((long)address == -1) + return; + + pr_crit_ratelimited("%s[%d]: User access of kernel address (%lx) - exploit attempt? (uid: %d)\n", + current->comm, current->pid, address, + from_kuid(&init_user_ns, current_uid())); + return; + } + + if (!IS_ENABLED(CONFIG_PPC_BOOK3S)) + return; + + /* + * For hash translation mode, we should never get a + * PROTFAULT. Any update to pte to reduce access will result in us + * removing the hash page table entry, thus resulting in a DSISR_NOHPTE + * fault instead of DSISR_PROTFAULT. + * + * A pte update to relax the access will not result in a hash page table + * entry invalidate and hence can result in DSISR_PROTFAULT. + * ptep_set_access_flags() doesn't do a hpte flush. This is why we have + * the special !is_write in the below conditional. + * + * For platforms that doesn't supports coherent icache and do support + * per page noexec bit, we do setup things such that we do the + * sync between D/I cache via fault. But that is handled via low level + * hash fault code (hash_page_do_lazy_icache()) and we should not reach + * here in such case. + * + * For wrong access that can result in PROTFAULT, the above vma->vm_flags + * check should handle those and hence we should fall to the bad_area + * handling correctly. + * + * For embedded with per page exec support that doesn't support coherent + * icache we do get PROTFAULT and we handle that D/I cache sync in + * set_pte_at while taking the noexec/prot fault. Hence this is WARN_ON + * is conditional for server MMU. + * + * For radix, we can get prot fault for autonuma case, because radix + * page table will have them marked noaccess for user. + */ + if (radix_enabled() || is_write) + return; + + WARN_ON_ONCE(error_code & DSISR_PROTFAULT); +} + +/* + * Define the correct "is_write" bit in error_code based + * on the processor family + */ +#ifdef CONFIG_BOOKE +#define page_fault_is_write(__err) ((__err) & ESR_DST) +#else +#define page_fault_is_write(__err) ((__err) & DSISR_ISSTORE) +#endif + +#ifdef CONFIG_BOOKE +#define page_fault_is_bad(__err) (0) +#elif defined(CONFIG_PPC_8xx) +#define page_fault_is_bad(__err) ((__err) & DSISR_NOEXEC_OR_G) +#elif defined(CONFIG_PPC64) +static int page_fault_is_bad(unsigned long err) +{ + unsigned long flag = DSISR_BAD_FAULT_64S; + + /* + * PAPR+ v2.11 § 14.15.3.4.1 (unreleased) + * If byte 0, bit 3 of pi-attribute-specifier-type in + * ibm,pi-features property is defined, ignore the DSI error + * which is caused by the paste instruction on the + * suspended NX window. 
*/ - if (fault & VM_FAULT_SIGBUS) - return do_sigbus(regs, addr); + if (mmu_has_feature(MMU_FTR_NX_DSI)) + flag &= ~DSISR_BAD_COPYPASTE; - /* We don't understand the fault code, this is fatal */ - BUG(); - return MM_FAULT_CONTINUE; + return err & flag; } +#else +#define page_fault_is_bad(__err) ((__err) & DSISR_BAD_FAULT_32S) +#endif /* * For 600- and 800-family processors, the error_code parameter is DSISR - * for a data fault, SRR1 for an instruction fault. For 400-family processors - * the error_code parameter is ESR for a data fault, 0 for an instruction - * fault. - * For 64-bit processors, the error_code parameter is - * - DSISR for a non-SLB data access fault, - * - SRR1 & 0x08000000 for a non-SLB instruction access fault - * - 0 any SLB fault. + * for a data fault, SRR1 for an instruction fault. + * For 400-family processors the error_code parameter is ESR for a data fault, + * 0 for an instruction fault. + * For 64-bit processors, the error_code parameter is DSISR for a data access + * fault, SRR1 & 0x08000000 for an instruction access fault. * * The return value is 0 if the fault was handled, or the signal * number if this is a kernel fault that can't be handled here. */ -int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, - unsigned long error_code) +static int ___do_page_fault(struct pt_regs *regs, unsigned long address, + unsigned long error_code) { - enum ctx_state prev_state = exception_enter(); struct vm_area_struct * vma; struct mm_struct *mm = current->mm; - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; - int code = SEGV_MAPERR; - int is_write = 0; - int trap = TRAP(regs); - int is_exec = trap == 0x400; - int fault; - int rc = 0; - -#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE)) + unsigned int flags = FAULT_FLAG_DEFAULT; + int is_exec = TRAP(regs) == INTERRUPT_INST_STORAGE; + int is_user = user_mode(regs); + int is_write = page_fault_is_write(error_code); + vm_fault_t fault, major = 0; + bool kprobe_fault = kprobe_page_fault(regs, 11); + + if (unlikely(debugger_fault_handler(regs) || kprobe_fault)) + return 0; + + if (unlikely(page_fault_is_bad(error_code))) { + if (is_user) { + _exception(SIGBUS, regs, BUS_OBJERR, address); + return 0; + } + return SIGBUS; + } + + /* Additional sanity check(s) */ + sanity_check_fault(is_write, is_user, error_code, address); + /* - * Fortunately the bit assignments in SRR1 for an instruction - * fault and DSISR for a data fault are mostly the same for the - * bits we are interested in. But there are some bits which - * indicate errors in DSISR but can validly be set in SRR1. + * The kernel should never take an execute fault nor should it + * take a page fault to a kernel address or a page fault to a user + * address outside of dedicated places. + * + * Rather than kfence directly reporting false negatives, search whether + * the NIP belongs to the fixup table for cases where fault could come + * from functions like copy_from_kernel_nofault(). 
*/ - if (trap == 0x400) - error_code &= 0x48200000; - else - is_write = error_code & DSISR_ISSTORE; -#else - is_write = error_code & ESR_DST; -#endif /* CONFIG_4xx || CONFIG_BOOKE */ + if (unlikely(!is_user && bad_kernel_fault(regs, error_code, address, is_write))) { + if (is_kfence_address((void *)address) && + !search_exception_tables(instruction_pointer(regs)) && + kfence_handle_page_fault(address, is_write, regs)) + return 0; - if (is_write) - flags |= FAULT_FLAG_WRITE; + return SIGSEGV; + } -#ifdef CONFIG_PPC_ICSWX /* - * we need to do this early because this "data storage - * interrupt" does not update the DAR/DEAR so we don't want to - * look at it + * If we're in an interrupt, have no user context or are running + * in a region with pagefaults disabled then we must not take the fault */ - if (error_code & ICSWX_DSI_UCT) { - rc = acop_handle_fault(regs, address, error_code); - if (rc) - goto bail; + if (unlikely(faulthandler_disabled() || !mm)) { + if (is_user) + printk_ratelimited(KERN_ERR "Page fault in user mode" + " with faulthandler_disabled()=%d" + " mm=%p\n", + faulthandler_disabled(), mm); + return bad_area_nosemaphore(regs, address); } -#endif /* CONFIG_PPC_ICSWX */ - if (notify_page_fault(regs)) - goto bail; + interrupt_cond_local_irq_enable(regs); + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); - if (unlikely(debugger_fault_handler(regs))) - goto bail; + /* + * We want to do this outside mmap_lock, because reading code around nip + * can result in fault, which will cause a deadlock when called with + * mmap_lock held + */ + if (is_user) + flags |= FAULT_FLAG_USER; + if (is_write) + flags |= FAULT_FLAG_WRITE; + if (is_exec) + flags |= FAULT_FLAG_INSTRUCTION; - /* On a kernel SLB miss we can only check for a valid exception entry */ - if (!user_mode(regs) && (address >= TASK_SIZE)) { - rc = SIGSEGV; - goto bail; + if (!(flags & FAULT_FLAG_USER)) + goto lock_mmap; + + vma = lock_vma_under_rcu(mm, address); + if (!vma) + goto lock_mmap; + + if (unlikely(access_pkey_error(is_write, is_exec, + (error_code & DSISR_KEYFAULT), vma))) { + count_vm_vma_lock_event(VMA_LOCK_SUCCESS); + return bad_access_pkey(regs, address, NULL, vma); } -#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE) || \ - defined(CONFIG_PPC_BOOK3S_64)) - if (error_code & DSISR_DABRMATCH) { - /* breakpoint match */ - do_break(regs, address, error_code); - goto bail; + if (unlikely(access_error(is_write, is_exec, vma))) { + count_vm_vma_lock_event(VMA_LOCK_SUCCESS); + return bad_access(regs, address, NULL, vma); } -#endif - /* We restore the interrupt state now */ - if (!arch_irq_disabled_regs(regs)) - local_irq_enable(); + fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); - if (in_atomic() || mm == NULL) { - if (!user_mode(regs)) { - rc = SIGSEGV; - goto bail; - } - /* in_atomic() in user mode is really bad, - as is current->mm == NULL. */ - printk(KERN_EMERG "Page fault in user mode with " - "in_atomic() = %d mm = %p\n", in_atomic(), mm); - printk(KERN_EMERG "NIP = %lx MSR = %lx\n", - regs->nip, regs->msr); - die("Weird page fault", regs, SIGSEGV); + if (!(fault & VM_FAULT_RETRY)) { + count_vm_vma_lock_event(VMA_LOCK_SUCCESS); + goto done; } + count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + flags |= FAULT_FLAG_TRIED; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + if (fault_signal_pending(fault, regs)) + return user_mode(regs) ? 
0 : SIGBUS; + +lock_mmap: /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the * kernel and should generate an OOPS. Unfortunately, in the case of an - * erroneous fault occurring in a code path which already holds mmap_sem + * erroneous fault occurring in a code path which already holds mmap_lock * we will deadlock attempting to validate the fault against the * address space. Luckily the kernel only validly references user * space from well defined areas of code, which are listed in the - * exceptions table. - * - * As the vast majority of faults will be valid we will only perform - * the source reference check when there is a possibility of a deadlock. - * Attempt to lock the address space, if we cannot we then validate the - * source. If this is invalid we can skip the address space check, - * thus avoiding the deadlock. + * exceptions table. lock_mm_and_find_vma() handles that logic. */ - if (!down_read_trylock(&mm->mmap_sem)) { - if (!user_mode(regs) && !search_exception_tables(regs->nip)) - goto bad_area_nosemaphore; - retry: - down_read(&mm->mmap_sem); - } else { - /* - * The above down_read_trylock() might have succeeded in - * which case we'll have missed the might_sleep() from - * down_read(): - */ - might_sleep(); - } + vma = lock_mm_and_find_vma(mm, address, regs); + if (unlikely(!vma)) + return bad_area_nosemaphore(regs, address); - vma = find_vma(mm, address); - if (!vma) - goto bad_area; - if (vma->vm_start <= address) - goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; + if (unlikely(access_pkey_error(is_write, is_exec, + (error_code & DSISR_KEYFAULT), vma))) + return bad_access_pkey(regs, address, mm, vma); - /* - * N.B. The POWER/Open ABI allows programs to access up to - * 288 bytes below the stack pointer. - * The kernel signal delivery code writes up to about 1.5kB - * below the stack pointer (r1) before decrementing it. - * The exec code can write slightly over 640kB to the stack - * before setting the user r1. Thus we allow the stack to - * expand to 1MB without further checks. - */ - if (address + 0x100000 < vma->vm_end) { - /* get user regs even if this fault is in kernel mode */ - struct pt_regs *uregs = current->thread.regs; - if (uregs == NULL) - goto bad_area; - - /* - * A user-mode access to an address a long way below - * the stack pointer is only valid if the instruction - * is one which would update the stack pointer to the - * address accessed if the instruction completed, - * i.e. either stwu rs,n(r1) or stwux rs,r1,rb - * (or the byte, halfword, float or double forms). - * - * If we don't check this then any write to the area - * between the last mapped region and the stack will - * expand the stack rather than segfaulting. - */ - if (address + 2048 < uregs->gpr[1] - && (!user_mode(regs) || !store_updates_sp(regs))) - goto bad_area; - } - if (expand_stack(vma, address)) - goto bad_area; - -good_area: - code = SEGV_ACCERR; -#if defined(CONFIG_6xx) - if (error_code & 0x95700000) - /* an error such as lwarx to I/O controller space, - address matching DABR, eciwx, etc. */ - goto bad_area; -#endif /* CONFIG_6xx */ -#if defined(CONFIG_8xx) - /* 8xx sometimes need to load a invalid/non-present TLBs. - * These must be invalidated separately as linux mm don't. - */ - if (error_code & 0x40000000) /* no translation? */ - _tlbil_va(address, 0, 0, 0); - - /* The MPC8xx seems to always set 0x80000000, which is - * "undefined". 
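Editor's aside: the removed stack-expansion heuristic above reads more easily as a single predicate: faults within 1MB of the stack VMA may always try to grow it, while faults far below the user stack pointer are only honoured when the faulting instruction is a store-with-update through r1. The sketch below restates the user-mode part of that removed logic; the thresholds come from the removed comment (the 288-byte red zone plus signal-frame slack rounded to 2048 bytes, 1MB of unchecked growth), expand_stack() itself is omitted, and the addresses in main() are made up.

#include <assert.h>
#include <stdbool.h>

#define STACK_SLACK	2048UL		/* red zone + signal/exec frame slack */
#define STACK_EXPAND	0x100000UL	/* up to 1MB of growth needs no check */

/* May the fault at 'addr' grow the stack VMA ending at 'vm_end', given the
 * user stack pointer 'sp' and whether the instruction updates r1? */
static bool may_expand_stack(unsigned long addr, unsigned long vm_end,
			     unsigned long sp, bool store_updates_sp)
{
	if (addr + STACK_EXPAND >= vm_end)
		return true;			/* close enough to the VMA */
	if (addr + STACK_SLACK >= sp)
		return true;			/* just below r1 */
	return store_updates_sp;		/* far below r1: only stwu/stdu via r1 */
}

int main(void)
{
	assert(may_expand_stack(0x9ff00000, 0x9ff10000, 0x9ff00100, false));
	assert(!may_expand_stack(0x90000000, 0xa0000000, 0x9ff00000, false));
	assert(may_expand_stack(0x90000000, 0xa0000000, 0x9ff00000, true));
	return 0;
}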
Of those that can be set, this is the only - * one which seems bad. - */ - if (error_code & 0x10000000) - /* Guarded storage error. */ - goto bad_area; -#endif /* CONFIG_8xx */ - - if (is_exec) { -#ifdef CONFIG_PPC_STD_MMU - /* Protection fault on exec go straight to failure on - * Hash based MMUs as they either don't support per-page - * execute permission, or if they do, it's handled already - * at the hash level. This test would probably have to - * be removed if we change the way this works to make hash - * processors use the same I/D cache coherency mechanism - * as embedded. - */ - if (error_code & DSISR_PROTFAULT) - goto bad_area; -#endif /* CONFIG_PPC_STD_MMU */ - - /* - * Allow execution from readable areas if the MMU does not - * provide separate controls over reading and executing. - * - * Note: That code used to not be enabled for 4xx/BookE. - * It is now as I/D cache coherency for these is done at - * set_pte_at() time and I see no reason why the test - * below wouldn't be valid on those processors. This -may- - * break programs compiled with a really old ABI though. - */ - if (!(vma->vm_flags & VM_EXEC) && - (cpu_has_feature(CPU_FTR_NOEXECUTE) || - !(vma->vm_flags & (VM_READ | VM_WRITE)))) - goto bad_area; - /* a write */ - } else if (is_write) { - if (!(vma->vm_flags & VM_WRITE)) - goto bad_area; - /* a read */ - } else { - /* protection fault */ - if (error_code & 0x08000000) - goto bad_area; - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) - goto bad_area; - } + if (unlikely(access_error(is_write, is_exec, vma))) + return bad_access(regs, address, mm, vma); /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(mm, vma, address, flags); - if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) { - rc = mm_fault_error(regs, address, fault); - if (rc >= MM_FAULT_RETURN) - goto bail; - else - rc = 0; - } + fault = handle_mm_fault(vma, address, flags, regs); + + major |= fault & VM_FAULT_MAJOR; + + if (fault_signal_pending(fault, regs)) + return user_mode(regs) ? 0 : SIGBUS; + + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) + goto out; /* - * Major/minor page fault accounting is only done on the - * initial attempt. If we go through a retry, it is extremely - * likely that the page will be found in page cache at that point. + * Handle the retry right now, the mmap_lock has been released in that + * case. */ - if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_MAJOR) { - current->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, - regs, address); -#ifdef CONFIG_PPC_SMLPAR - if (firmware_has_feature(FW_FEATURE_CMO)) { - preempt_disable(); - get_lppaca()->page_ins += (1 << PAGE_FACTOR); - preempt_enable(); - } -#endif /* CONFIG_PPC_SMLPAR */ - } else { - current->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, - regs, address); - } - if (fault & VM_FAULT_RETRY) { - /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk - * of starvation. 
*/ - flags &= ~FAULT_FLAG_ALLOW_RETRY; - flags |= FAULT_FLAG_TRIED; - goto retry; - } + if (unlikely(fault & VM_FAULT_RETRY)) { + flags |= FAULT_FLAG_TRIED; + goto retry; } - up_read(&mm->mmap_sem); - goto bail; + mmap_read_unlock(current->mm); -bad_area: - up_read(&mm->mmap_sem); +done: + if (unlikely(fault & VM_FAULT_ERROR)) + return mm_fault_error(regs, address, fault); -bad_area_nosemaphore: - /* User mode accesses cause a SIGSEGV */ - if (user_mode(regs)) { - _exception(SIGSEGV, regs, code, address); - goto bail; - } +out: + /* + * Major/minor page fault accounting. + */ + if (major) + cmo_account_page_fault(); - if (is_exec && (error_code & DSISR_PROTFAULT)) - printk_ratelimited(KERN_CRIT "kernel tried to execute NX-protected" - " page (%lx) - exploit attempt? (uid: %d)\n", - address, from_kuid(&init_user_ns, current_uid())); + return 0; +} +NOKPROBE_SYMBOL(___do_page_fault); - rc = SIGSEGV; +static __always_inline void __do_page_fault(struct pt_regs *regs) +{ + long err; -bail: - exception_exit(prev_state); - return rc; + err = ___do_page_fault(regs, regs->dar, regs->dsisr); + if (unlikely(err)) + bad_page_fault(regs, err); +} +DEFINE_INTERRUPT_HANDLER(do_page_fault) +{ + __do_page_fault(regs); } +#ifdef CONFIG_PPC_BOOK3S_64 +/* Same as do_page_fault but interrupt entry has already run in do_hash_fault */ +void hash__do_page_fault(struct pt_regs *regs) +{ + __do_page_fault(regs); +} +NOKPROBE_SYMBOL(hash__do_page_fault); +#endif + /* * bad_page_fault is called when we have a bad access from the kernel. * It is called from the DSI and ISI handlers in head.S and from some * of the procedures in traps.c. */ -void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) +static void __bad_page_fault(struct pt_regs *regs, int sig) { - const struct exception_table_entry *entry; - unsigned long *stackend; - - /* Are we prepared to handle this fault? */ - if ((entry = search_exception_tables(regs->nip)) != NULL) { - regs->nip = entry->fixup; - return; - } + int is_write = page_fault_is_write(regs->dsisr); + const char *msg; /* kernel has accessed a bad area */ - switch (regs->trap) { - case 0x300: - case 0x380: - printk(KERN_ALERT "Unable to handle kernel paging request for " - "data at address 0x%08lx\n", regs->dar); + if (regs->dar < PAGE_SIZE) + msg = "Kernel NULL pointer dereference"; + else + msg = "Unable to handle kernel data access"; + + switch (TRAP(regs)) { + case INTERRUPT_DATA_STORAGE: + case INTERRUPT_H_DATA_STORAGE: + pr_alert("BUG: %s on %s at 0x%08lx\n", msg, + str_write_read(is_write), regs->dar); + break; + case INTERRUPT_DATA_SEGMENT: + pr_alert("BUG: %s at 0x%08lx\n", msg, regs->dar); break; - case 0x400: - case 0x480: - printk(KERN_ALERT "Unable to handle kernel paging request for " - "instruction fetch\n"); + case INTERRUPT_INST_STORAGE: + case INTERRUPT_INST_SEGMENT: + pr_alert("BUG: Unable to handle kernel instruction fetch%s", + regs->nip < PAGE_SIZE ? 
" (NULL pointer?)\n" : "\n"); + break; + case INTERRUPT_ALIGNMENT: + pr_alert("BUG: Unable to handle kernel unaligned access at 0x%08lx\n", + regs->dar); break; default: - printk(KERN_ALERT "Unable to handle kernel paging request for " - "unknown fault\n"); + pr_alert("BUG: Unable to handle unknown paging fault at 0x%08lx\n", + regs->dar); break; } printk(KERN_ALERT "Faulting instruction address: 0x%08lx\n", regs->nip); - stackend = end_of_stack(current); - if (current != &init_task && *stackend != STACK_END_MAGIC) + if (task_stack_end_corrupted(current)) printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); die("Kernel access of bad area", regs, sig); } + +void bad_page_fault(struct pt_regs *regs, int sig) +{ + const struct exception_table_entry *entry; + + /* Are we prepared to handle this fault? */ + entry = search_exception_tables(instruction_pointer(regs)); + if (entry) + instruction_pointer_set(regs, extable_fixup(entry)); + else + __bad_page_fault(regs, sig); +} + +#ifdef CONFIG_PPC_BOOK3S_64 +DEFINE_INTERRUPT_HANDLER(do_bad_page_fault_segv) +{ + bad_page_fault(regs, SIGSEGV); +} + +/* + * In radix, segment interrupts indicate the EA is not addressable by the + * page table geometry, so they are always sent here. + * + * In hash, this is called if do_slb_fault returns error. Typically it is + * because the EA was outside the region allowed by software. + */ +DEFINE_INTERRUPT_HANDLER(do_bad_segment_interrupt) +{ + int err = regs->result; + + if (err == -EFAULT) { + if (user_mode(regs)) + _exception(SIGSEGV, regs, SEGV_BNDERR, regs->dar); + else + bad_page_fault(regs, SIGSEGV); + } else if (err == -EINVAL) { + unrecoverable_exception(regs); + } else { + BUG(); + } +} +#endif diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c deleted file mode 100644 index 49822d90ea96..000000000000 --- a/arch/powerpc/mm/gup.c +++ /dev/null @@ -1,216 +0,0 @@ -/* - * Lockless get_user_pages_fast for powerpc - * - * Copyright (C) 2008 Nick Piggin - * Copyright (C) 2008 Novell Inc. - */ -#undef DEBUG - -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/hugetlb.h> -#include <linux/vmstat.h> -#include <linux/pagemap.h> -#include <linux/rwsem.h> -#include <asm/pgtable.h> - -#ifdef __HAVE_ARCH_PTE_SPECIAL - -/* - * The performance critical leaf functions are made noinline otherwise gcc - * inlines everything into a single function which results in too much - * register pressure. - */ -static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - unsigned long mask, result; - pte_t *ptep; - - result = _PAGE_PRESENT|_PAGE_USER; - if (write) - result |= _PAGE_RW; - mask = result | _PAGE_SPECIAL; - - ptep = pte_offset_kernel(&pmd, addr); - do { - pte_t pte = ACCESS_ONCE(*ptep); - struct page *page; - - if ((pte_val(pte) & mask) != result) - return 0; - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - page = pte_page(pte); - if (!page_cache_get_speculative(page)) - return 0; - if (unlikely(pte_val(pte) != pte_val(*ptep))) { - put_page(page); - return 0; - } - pages[*nr] = page; - (*nr)++; - - } while (ptep++, addr += PAGE_SIZE, addr != end); - - return 1; -} - -static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) -{ - unsigned long next; - pmd_t *pmdp; - - pmdp = pmd_offset(&pud, addr); - do { - pmd_t pmd = ACCESS_ONCE(*pmdp); - - next = pmd_addr_end(addr, end); - /* - * If we find a splitting transparent hugepage we - * return zero. 
That will result in taking the slow - * path which will call wait_split_huge_page() - * if the pmd is still in splitting state - */ - if (pmd_none(pmd) || pmd_trans_splitting(pmd)) - return 0; - if (pmd_huge(pmd) || pmd_large(pmd)) { - if (!gup_hugepte((pte_t *)pmdp, PMD_SIZE, addr, next, - write, pages, nr)) - return 0; - } else if (is_hugepd(pmdp)) { - if (!gup_hugepd((hugepd_t *)pmdp, PMD_SHIFT, - addr, next, write, pages, nr)) - return 0; - } else if (!gup_pte_range(pmd, addr, next, write, pages, nr)) - return 0; - } while (pmdp++, addr = next, addr != end); - - return 1; -} - -static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) -{ - unsigned long next; - pud_t *pudp; - - pudp = pud_offset(&pgd, addr); - do { - pud_t pud = ACCESS_ONCE(*pudp); - - next = pud_addr_end(addr, end); - if (pud_none(pud)) - return 0; - if (pud_huge(pud)) { - if (!gup_hugepte((pte_t *)pudp, PUD_SIZE, addr, next, - write, pages, nr)) - return 0; - } else if (is_hugepd(pudp)) { - if (!gup_hugepd((hugepd_t *)pudp, PUD_SHIFT, - addr, next, write, pages, nr)) - return 0; - } else if (!gup_pmd_range(pud, addr, next, write, pages, nr)) - return 0; - } while (pudp++, addr = next, addr != end); - - return 1; -} - -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) -{ - struct mm_struct *mm = current->mm; - unsigned long addr, len, end; - unsigned long next; - pgd_t *pgdp; - int nr = 0; - - pr_devel("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read"); - - start &= PAGE_MASK; - addr = start; - len = (unsigned long) nr_pages << PAGE_SHIFT; - end = start + len; - - if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, - start, len))) - goto slow_irqon; - - pr_devel(" aligned: %lx .. %lx\n", start, end); - - /* - * XXX: batch / limit 'nr', to avoid large irq off latency - * needs some instrumenting to determine the common sizes used by - * important workloads (eg. DB2), and whether limiting the batch size - * will decrease performance. - * - * It seems like we're in the clear for the moment. Direct-IO is - * the main guy that batches up lots of get_user_pages, and even - * they are limited to 64-at-a-time which is not so many. - */ - /* - * This doesn't prevent pagetable teardown, but does prevent - * the pagetables from being freed on powerpc. - * - * So long as we atomically load page table pointers versus teardown, - * we can follow the address down to the the page and take a ref on it. - */ - local_irq_disable(); - - pgdp = pgd_offset(mm, addr); - do { - pgd_t pgd = ACCESS_ONCE(*pgdp); - - pr_devel(" %016lx: normal pgd %p\n", addr, - (void *)pgd_val(pgd)); - next = pgd_addr_end(addr, end); - if (pgd_none(pgd)) - goto slow; - if (pgd_huge(pgd)) { - if (!gup_hugepte((pte_t *)pgdp, PGDIR_SIZE, addr, next, - write, pages, &nr)) - goto slow; - } else if (is_hugepd(pgdp)) { - if (!gup_hugepd((hugepd_t *)pgdp, PGDIR_SHIFT, - addr, next, write, pages, &nr)) - goto slow; - } else if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) - goto slow; - } while (pgdp++, addr = next, addr != end); - - local_irq_enable(); - - VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT); - return nr; - - { - int ret; - -slow: - local_irq_enable(); -slow_irqon: - pr_devel(" slow path ! 
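Editor's aside: when the fast path bails out part-way, the removed slow path (continuing just below) pins the remaining pages with get_user_pages() and then has to merge the two results, which is what the "careful with return values" comment is about: a slow-path error must not hide pages the fast path already pinned. A minimal restatement of that merge, with an arbitrary -14 (-EFAULT) as the illustrative error:

#include <assert.h>

/* 'nr' pages were pinned by the fast path; 'ret' is what the slow path
 * returned for the rest (a count, or a negative errno). */
static int merge_gup_results(int nr, int ret)
{
	if (nr > 0)
		return ret < 0 ? nr : nr + ret;
	return ret;
}

int main(void)
{
	assert(merge_gup_results(3, 2) == 5);	/* both paths pinned pages */
	assert(merge_gup_results(3, -14) == 3);	/* slow path failed: report partial */
	assert(merge_gup_results(0, -14) == -14);	/* nothing pinned at all */
	return 0;
}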
nr = %d\n", nr); - - /* Try to get the remaining pages with get_user_pages */ - start += nr << PAGE_SHIFT; - pages += nr; - - down_read(&mm->mmap_sem); - ret = get_user_pages(current, mm, start, - (end - start) >> PAGE_SHIFT, write, 0, pages, NULL); - up_read(&mm->mmap_sem); - - /* Have to be a bit careful with return values */ - if (nr > 0) { - if (ret < 0) - ret = nr; - else - ret += nr; - } - - return ret; - } -} - -#endif /* __HAVE_ARCH_PTE_SPECIAL */ diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S deleted file mode 100644 index d3cbda62857b..000000000000 --- a/arch/powerpc/mm/hash_low_64.S +++ /dev/null @@ -1,981 +0,0 @@ -/* - * ppc64 MMU hashtable management routines - * - * (c) Copyright IBM Corp. 2003, 2005 - * - * Maintained by: Benjamin Herrenschmidt - * <benh@kernel.crashing.org> - * - * This file is covered by the GNU Public Licence v2 as - * described in the kernel's COPYING file. - */ - -#include <asm/reg.h> -#include <asm/pgtable.h> -#include <asm/mmu.h> -#include <asm/page.h> -#include <asm/types.h> -#include <asm/ppc_asm.h> -#include <asm/asm-offsets.h> -#include <asm/cputable.h> - - .text - -/* - * Stackframe: - * - * +-> Back chain (SP + 256) - * | General register save area (SP + 112) - * | Parameter save area (SP + 48) - * | TOC save area (SP + 40) - * | link editor doubleword (SP + 32) - * | compiler doubleword (SP + 24) - * | LR save area (SP + 16) - * | CR save area (SP + 8) - * SP ---> +-- Back chain (SP + 0) - */ - -#ifndef CONFIG_PPC_64K_PAGES - -/***************************************************************************** - * * - * 4K SW & 4K HW pages implementation * - * * - *****************************************************************************/ - - -/* - * _hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, - * pte_t *ptep, unsigned long trap, int local, int ssize) - * - * Adds a 4K page to the hash table in a segment of 4K pages only - */ - -_GLOBAL(__hash_page_4K) - mflr r0 - std r0,16(r1) - stdu r1,-STACKFRAMESIZE(r1) - /* Save all params that we need after a function call */ - std r6,STK_PARAM(R6)(r1) - std r8,STK_PARAM(R8)(r1) - std r9,STK_PARAM(R9)(r1) - - /* Save non-volatile registers. - * r31 will hold "old PTE" - * r30 is "new PTE" - * r29 is vpn - * r28 is a hash value - * r27 is hashtab mask (maybe dynamic patched instead ?) - */ - std r27,STK_REG(R27)(r1) - std r28,STK_REG(R28)(r1) - std r29,STK_REG(R29)(r1) - std r30,STK_REG(R30)(r1) - std r31,STK_REG(R31)(r1) - - /* Step 1: - * - * Check permissions, atomically mark the linux PTE busy - * and hashed. - */ -1: - ldarx r31,0,r6 - /* Check access rights (access & ~(pte_val(*ptep))) */ - andc. r0,r4,r31 - bne- htab_wrong_access - /* Check if PTE is busy */ - andi. r0,r31,_PAGE_BUSY - /* If so, just bail out and refault if needed. Someone else - * is changing this PTE anyway and might hash it. - */ - bne- htab_bail_ok - - /* Prepare new PTE value (turn access RW into DIRTY, then - * add BUSY,HASHPTE and ACCESSED) - */ - rlwinm r30,r4,32-9+7,31-7,31-7 /* _PAGE_RW -> _PAGE_DIRTY */ - or r30,r30,r31 - ori r30,r30,_PAGE_BUSY | _PAGE_ACCESSED | _PAGE_HASHPTE - /* Write the linux PTE atomically (setting busy) */ - stdcx. r30,0,r6 - bne- 1b - isync - - /* Step 2: - * - * Insert/Update the HPTE in the hash table. 
At this point, - * r4 (access) is re-useable, we use it for the new HPTE flags - */ - -BEGIN_FTR_SECTION - cmpdi r9,0 /* check segment size */ - bne 3f -END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) - /* Calc vpn and put it in r29 */ - sldi r29,r5,SID_SHIFT - VPN_SHIFT - rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT - VPN_SHIFT) - or r29,r28,r29 - /* - * Calculate hash value for primary slot and store it in r28 - * r3 = va, r5 = vsid - * r0 = (va >> 12) & ((1ul << (28 - 12)) -1) - */ - rldicl r0,r3,64-12,48 - xor r28,r5,r0 /* hash */ - b 4f - -3: /* Calc vpn and put it in r29 */ - sldi r29,r5,SID_SHIFT_1T - VPN_SHIFT - rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT_1T - VPN_SHIFT) - or r29,r28,r29 - - /* - * calculate hash value for primary slot and - * store it in r28 for 1T segment - * r3 = va, r5 = vsid - */ - sldi r28,r5,25 /* vsid << 25 */ - /* r0 = (va >> 12) & ((1ul << (40 - 12)) -1) */ - rldicl r0,r3,64-12,36 - xor r28,r28,r5 /* vsid ^ ( vsid << 25) */ - xor r28,r28,r0 /* hash */ - - /* Convert linux PTE bits into HW equivalents */ -4: andi. r3,r30,0x1fe /* Get basic set of flags */ - xori r3,r3,HPTE_R_N /* _PAGE_EXEC -> NOEXEC */ - rlwinm r0,r30,32-9+1,30,30 /* _PAGE_RW -> _PAGE_USER (r0) */ - rlwinm r4,r30,32-7+1,30,30 /* _PAGE_DIRTY -> _PAGE_USER (r4) */ - and r0,r0,r4 /* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/ - andc r0,r30,r0 /* r0 = pte & ~r0 */ - rlwimi r3,r0,32-1,31,31 /* Insert result into PP lsb */ - ori r3,r3,HPTE_R_C /* Always add "C" bit for perf. */ - - /* We eventually do the icache sync here (maybe inline that - * code rather than call a C function...) - */ -BEGIN_FTR_SECTION - mr r4,r30 - mr r5,r7 - bl .hash_page_do_lazy_icache -END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE) - - /* At this point, r3 contains new PP bits, save them in - * place of "access" in the param area (sic) - */ - std r3,STK_PARAM(R4)(r1) - - /* Get htab_hash_mask */ - ld r4,htab_hash_mask@got(2) - ld r27,0(r4) /* htab_hash_mask -> r27 */ - - /* Check if we may already be in the hashtable, in this case, we - * go to out-of-line code to try to modify the HPTE - */ - andi. r0,r31,_PAGE_HASHPTE - bne htab_modify_pte - -htab_insert_pte: - /* Clear hpte bits in new pte (we also clear BUSY btw) and - * add _PAGE_HASHPTE - */ - lis r0,_PAGE_HPTEFLAGS@h - ori r0,r0,_PAGE_HPTEFLAGS@l - andc r30,r30,r0 - ori r30,r30,_PAGE_HASHPTE - - /* physical address r5 */ - rldicl r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT - sldi r5,r5,PAGE_SHIFT - - /* Calculate primary group hash */ - and r0,r28,r27 - rldicr r3,r0,3,63-3 /* r3 = (hash & mask) << 3 */ - - /* Call ppc_md.hpte_insert */ - ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */ - mr r4,r29 /* Retrieve vpn */ - li r7,0 /* !bolted, !secondary */ - li r8,MMU_PAGE_4K /* page size */ - li r9,MMU_PAGE_4K /* actual page size */ - ld r10,STK_PARAM(R9)(r1) /* segment size */ -_GLOBAL(htab_call_hpte_insert1) - bl . 
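Editor's aside: the handful of rldicl/xor instructions above compute the classic hash-MMU bucket selection. For 256MB segments and 4K pages the same computation in C is roughly the following; htab_hash_mask and the VSID/EA in the example are illustrative values, (hash & mask) << 3 selects the first of the group's 8 slots, and the secondary group simply uses the inverted hash.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t vsid = 0x123456;		/* illustrative segment id */
	uint64_t ea   = 0x10003000;		/* illustrative effective address */
	uint64_t htab_hash_mask = 0xffff;	/* depends on hash table size */

	/* hash = vsid XOR (page index within the 256MB segment) */
	uint64_t hash = vsid ^ ((ea >> 12) & ((1UL << (28 - 12)) - 1));

	uint64_t primary   = (hash & htab_hash_mask) << 3;
	uint64_t secondary = (~hash & htab_hash_mask) << 3;

	printf("primary group slot 0x%llx, secondary 0x%llx\n",
	       (unsigned long long)primary, (unsigned long long)secondary);
	return 0;
}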
/* Patched by htab_finish_init() */ - cmpdi 0,r3,0 - bge htab_pte_insert_ok /* Insertion successful */ - cmpdi 0,r3,-2 /* Critical failure */ - beq- htab_pte_insert_failure - - /* Now try secondary slot */ - - /* physical address r5 */ - rldicl r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT - sldi r5,r5,PAGE_SHIFT - - /* Calculate secondary group hash */ - andc r0,r27,r28 - rldicr r3,r0,3,63-3 /* r0 = (~hash & mask) << 3 */ - - /* Call ppc_md.hpte_insert */ - ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */ - mr r4,r29 /* Retrieve vpn */ - li r7,HPTE_V_SECONDARY /* !bolted, secondary */ - li r8,MMU_PAGE_4K /* page size */ - li r9,MMU_PAGE_4K /* actual page size */ - ld r10,STK_PARAM(R9)(r1) /* segment size */ -_GLOBAL(htab_call_hpte_insert2) - bl . /* Patched by htab_finish_init() */ - cmpdi 0,r3,0 - bge+ htab_pte_insert_ok /* Insertion successful */ - cmpdi 0,r3,-2 /* Critical failure */ - beq- htab_pte_insert_failure - - /* Both are full, we need to evict something */ - mftb r0 - /* Pick a random group based on TB */ - andi. r0,r0,1 - mr r5,r28 - bne 2f - not r5,r5 -2: and r0,r5,r27 - rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */ - /* Call ppc_md.hpte_remove */ -_GLOBAL(htab_call_hpte_remove) - bl . /* Patched by htab_finish_init() */ - - /* Try all again */ - b htab_insert_pte - -htab_bail_ok: - li r3,0 - b htab_bail - -htab_pte_insert_ok: - /* Insert slot number & secondary bit in PTE */ - rldimi r30,r3,12,63-15 - - /* Write out the PTE with a normal write - * (maybe add eieio may be good still ?) - */ -htab_write_out_pte: - ld r6,STK_PARAM(R6)(r1) - std r30,0(r6) - li r3, 0 -htab_bail: - ld r27,STK_REG(R27)(r1) - ld r28,STK_REG(R28)(r1) - ld r29,STK_REG(R29)(r1) - ld r30,STK_REG(R30)(r1) - ld r31,STK_REG(R31)(r1) - addi r1,r1,STACKFRAMESIZE - ld r0,16(r1) - mtlr r0 - blr - -htab_modify_pte: - /* Keep PP bits in r4 and slot idx from the PTE around in r3 */ - mr r4,r3 - rlwinm r3,r31,32-12,29,31 - - /* Secondary group ? if yes, get a inverted hash value */ - mr r5,r28 - andi. r0,r31,_PAGE_SECONDARY - beq 1f - not r5,r5 -1: - /* Calculate proper slot value for ppc_md.hpte_updatepp */ - and r0,r5,r27 - rldicr r0,r0,3,63-3 /* r0 = (hash & mask) << 3 */ - add r3,r0,r3 /* add slot idx */ - - /* Call ppc_md.hpte_updatepp */ - mr r5,r29 /* vpn */ - li r6,MMU_PAGE_4K /* base page size */ - li r7,MMU_PAGE_4K /* actual page size */ - ld r8,STK_PARAM(R9)(r1) /* segment size */ - ld r9,STK_PARAM(R8)(r1) /* get "local" param */ -_GLOBAL(htab_call_hpte_updatepp) - bl . /* Patched by htab_finish_init() */ - - /* if we failed because typically the HPTE wasn't really here - * we try an insertion. - */ - cmpdi 0,r3,-1 - beq- htab_insert_pte - - /* Clear the BUSY bit and Write out the PTE */ - li r0,_PAGE_BUSY - andc r30,r30,r0 - b htab_write_out_pte - -htab_wrong_access: - /* Bail out clearing reservation */ - stdcx. 
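Editor's aside: when both the primary and the secondary PTE group are full, the assembly above evicts an entry from a pseudo-randomly chosen group (the timebase low bit picks which of the two) and retries the whole insertion. The toy model below captures only that control flow; the in-memory slot table and rand() stand in for the real hash table and the timebase, and nothing here is a kernel API.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define GROUPS		16	/* toy hash table: 16 groups of 8 slots */
#define SLOTS_PER_GROUP	8

static bool slot_used[GROUPS][SLOTS_PER_GROUP];

static long try_insert(uint64_t group)
{
	for (int i = 0; i < SLOTS_PER_GROUP; i++) {
		if (!slot_used[group][i]) {
			slot_used[group][i] = true;
			return i;
		}
	}
	return -1;			/* group full */
}

/* Primary group, then secondary group, then evict one entry from a
 * pseudo-randomly chosen group and retry until the insert succeeds. */
static long insert_hpte(uint64_t hash)
{
	const uint64_t mask = GROUPS - 1;

	for (;;) {
		long slot = try_insert(hash & mask);
		if (slot >= 0)
			return slot;
		slot = try_insert(~hash & mask);
		if (slot >= 0)
			return slot;

		uint64_t victim = (rand() & 1 ? hash : ~hash) & mask;
		slot_used[victim][rand() % SLOTS_PER_GROUP] = false;
	}
}

int main(void)
{
	for (int i = 0; i < 200; i++)	/* overfill to exercise eviction */
		insert_hpte((uint64_t)rand());
	puts("done");
	return 0;
}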
r31,0,r6 - li r3,1 - b htab_bail - -htab_pte_insert_failure: - /* Bail out restoring old PTE */ - ld r6,STK_PARAM(R6)(r1) - std r31,0(r6) - li r3,-1 - b htab_bail - - -#else /* CONFIG_PPC_64K_PAGES */ - - -/***************************************************************************** - * * - * 64K SW & 4K or 64K HW in a 4K segment pages implementation * - * * - *****************************************************************************/ - -/* _hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, - * pte_t *ptep, unsigned long trap, int local, int ssize, - * int subpg_prot) - */ - -/* - * For now, we do NOT implement Admixed pages - */ -_GLOBAL(__hash_page_4K) - mflr r0 - std r0,16(r1) - stdu r1,-STACKFRAMESIZE(r1) - /* Save all params that we need after a function call */ - std r6,STK_PARAM(R6)(r1) - std r8,STK_PARAM(R8)(r1) - std r9,STK_PARAM(R9)(r1) - - /* Save non-volatile registers. - * r31 will hold "old PTE" - * r30 is "new PTE" - * r29 is vpn - * r28 is a hash value - * r27 is hashtab mask (maybe dynamic patched instead ?) - * r26 is the hidx mask - * r25 is the index in combo page - */ - std r25,STK_REG(R25)(r1) - std r26,STK_REG(R26)(r1) - std r27,STK_REG(R27)(r1) - std r28,STK_REG(R28)(r1) - std r29,STK_REG(R29)(r1) - std r30,STK_REG(R30)(r1) - std r31,STK_REG(R31)(r1) - - /* Step 1: - * - * Check permissions, atomically mark the linux PTE busy - * and hashed. - */ -1: - ldarx r31,0,r6 - /* Check access rights (access & ~(pte_val(*ptep))) */ - andc. r0,r4,r31 - bne- htab_wrong_access - /* Check if PTE is busy */ - andi. r0,r31,_PAGE_BUSY - /* If so, just bail out and refault if needed. Someone else - * is changing this PTE anyway and might hash it. - */ - bne- htab_bail_ok - /* Prepare new PTE value (turn access RW into DIRTY, then - * add BUSY and ACCESSED) - */ - rlwinm r30,r4,32-9+7,31-7,31-7 /* _PAGE_RW -> _PAGE_DIRTY */ - or r30,r30,r31 - ori r30,r30,_PAGE_BUSY | _PAGE_ACCESSED - oris r30,r30,_PAGE_COMBO@h - /* Write the linux PTE atomically (setting busy) */ - stdcx. r30,0,r6 - bne- 1b - isync - - /* Step 2: - * - * Insert/Update the HPTE in the hash table. At this point, - * r4 (access) is re-useable, we use it for the new HPTE flags - */ - - /* Load the hidx index */ - rldicl r25,r3,64-12,60 - -BEGIN_FTR_SECTION - cmpdi r9,0 /* check segment size */ - bne 3f -END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) - /* Calc vpn and put it in r29 */ - sldi r29,r5,SID_SHIFT - VPN_SHIFT - /* - * clrldi r3,r3,64 - SID_SHIFT --> ea & 0xfffffff - * srdi r28,r3,VPN_SHIFT - */ - rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT - VPN_SHIFT) - or r29,r28,r29 - /* - * Calculate hash value for primary slot and store it in r28 - * r3 = va, r5 = vsid - * r0 = (va >> 12) & ((1ul << (28 - 12)) -1) - */ - rldicl r0,r3,64-12,48 - xor r28,r5,r0 /* hash */ - b 4f - -3: /* Calc vpn and put it in r29 */ - sldi r29,r5,SID_SHIFT_1T - VPN_SHIFT - /* - * clrldi r3,r3,64 - SID_SHIFT_1T --> ea & 0xffffffffff - * srdi r28,r3,VPN_SHIFT - */ - rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT_1T - VPN_SHIFT) - or r29,r28,r29 - - /* - * Calculate hash value for primary slot and - * store it in r28 for 1T segment - * r3 = va, r5 = vsid - */ - sldi r28,r5,25 /* vsid << 25 */ - /* r0 = (va >> 12) & ((1ul << (40 - 12)) -1) */ - rldicl r0,r3,64-12,36 - xor r28,r28,r5 /* vsid ^ ( vsid << 25) */ - xor r28,r28,r0 /* hash */ - - /* Convert linux PTE bits into HW equivalents */ -4: -#ifdef CONFIG_PPC_SUBPAGE_PROT - andc r10,r30,r10 - andi. 
r3,r10,0x1fe /* Get basic set of flags */ - rlwinm r0,r10,32-9+1,30,30 /* _PAGE_RW -> _PAGE_USER (r0) */ -#else - andi. r3,r30,0x1fe /* Get basic set of flags */ - rlwinm r0,r30,32-9+1,30,30 /* _PAGE_RW -> _PAGE_USER (r0) */ -#endif - xori r3,r3,HPTE_R_N /* _PAGE_EXEC -> NOEXEC */ - rlwinm r4,r30,32-7+1,30,30 /* _PAGE_DIRTY -> _PAGE_USER (r4) */ - and r0,r0,r4 /* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/ - andc r0,r3,r0 /* r0 = pte & ~r0 */ - rlwimi r3,r0,32-1,31,31 /* Insert result into PP lsb */ - ori r3,r3,HPTE_R_C /* Always add "C" bit for perf. */ - - /* We eventually do the icache sync here (maybe inline that - * code rather than call a C function...) - */ -BEGIN_FTR_SECTION - mr r4,r30 - mr r5,r7 - bl .hash_page_do_lazy_icache -END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE) - - /* At this point, r3 contains new PP bits, save them in - * place of "access" in the param area (sic) - */ - std r3,STK_PARAM(R4)(r1) - - /* Get htab_hash_mask */ - ld r4,htab_hash_mask@got(2) - ld r27,0(r4) /* htab_hash_mask -> r27 */ - - /* Check if we may already be in the hashtable, in this case, we - * go to out-of-line code to try to modify the HPTE. We look for - * the bit at (1 >> (index + 32)) - */ - rldicl. r0,r31,64-12,48 - li r26,0 /* Default hidx */ - beq htab_insert_pte - - /* - * Check if the pte was already inserted into the hash table - * as a 64k HW page, and invalidate the 64k HPTE if so. - */ - andis. r0,r31,_PAGE_COMBO@h - beq htab_inval_old_hpte - - ld r6,STK_PARAM(R6)(r1) - ori r26,r6,PTE_PAGE_HIDX_OFFSET /* Load the hidx mask. */ - ld r26,0(r26) - addi r5,r25,36 /* Check actual HPTE_SUB bit, this */ - rldcr. r0,r31,r5,0 /* must match pgtable.h definition */ - bne htab_modify_pte - -htab_insert_pte: - /* real page number in r5, PTE RPN value + index */ - andis. r0,r31,_PAGE_4K_PFN@h - srdi r5,r31,PTE_RPN_SHIFT - bne- htab_special_pfn - sldi r5,r5,PAGE_SHIFT-HW_PAGE_SHIFT - add r5,r5,r25 -htab_special_pfn: - sldi r5,r5,HW_PAGE_SHIFT - - /* Calculate primary group hash */ - and r0,r28,r27 - rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */ - - /* Call ppc_md.hpte_insert */ - ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */ - mr r4,r29 /* Retrieve vpn */ - li r7,0 /* !bolted, !secondary */ - li r8,MMU_PAGE_4K /* page size */ - li r9,MMU_PAGE_4K /* actual page size */ - ld r10,STK_PARAM(R9)(r1) /* segment size */ -_GLOBAL(htab_call_hpte_insert1) - bl . /* patched by htab_finish_init() */ - cmpdi 0,r3,0 - bge htab_pte_insert_ok /* Insertion successful */ - cmpdi 0,r3,-2 /* Critical failure */ - beq- htab_pte_insert_failure - - /* Now try secondary slot */ - - /* real page number in r5, PTE RPN value + index */ - andis. r0,r31,_PAGE_4K_PFN@h - srdi r5,r31,PTE_RPN_SHIFT - bne- 3f - sldi r5,r5,PAGE_SHIFT-HW_PAGE_SHIFT - add r5,r5,r25 -3: sldi r5,r5,HW_PAGE_SHIFT - - /* Calculate secondary group hash */ - andc r0,r27,r28 - rldicr r3,r0,3,63-3 /* r0 = (~hash & mask) << 3 */ - - /* Call ppc_md.hpte_insert */ - ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */ - mr r4,r29 /* Retrieve vpn */ - li r7,HPTE_V_SECONDARY /* !bolted, secondary */ - li r8,MMU_PAGE_4K /* page size */ - li r9,MMU_PAGE_4K /* actual page size */ - ld r10,STK_PARAM(R9)(r1) /* segment size */ -_GLOBAL(htab_call_hpte_insert2) - bl . 
/* patched by htab_finish_init() */
-	cmpdi	0,r3,0
-	bge+	htab_pte_insert_ok	/* Insertion successful */
-	cmpdi	0,r3,-2			/* Critical failure */
-	beq-	htab_pte_insert_failure
-
-	/* Both are full, we need to evict something */
-	mftb	r0
-	/* Pick a random group based on TB */
-	andi.	r0,r0,1
-	mr	r5,r28
-	bne	2f
-	not	r5,r5
-2:	and	r0,r5,r27
-	rldicr	r3,r0,3,63-3	/* r0 = (hash & mask) << 3 */
-	/* Call ppc_md.hpte_remove */
-_GLOBAL(htab_call_hpte_remove)
-	bl	.	/* patched by htab_finish_init() */
-
-	/* Try all again */
-	b	htab_insert_pte
-
-	/*
-	 * Call out to C code to invalidate a 64k HW HPTE that is
-	 * useless now that the segment has been switched to 4k pages.
-	 */
-htab_inval_old_hpte:
-	mr	r3,r29			/* vpn */
-	mr	r4,r31			/* PTE.pte */
-	li	r5,0			/* PTE.hidx */
-	li	r6,MMU_PAGE_64K		/* psize */
-	ld	r7,STK_PARAM(R9)(r1)	/* ssize */
-	ld	r8,STK_PARAM(R8)(r1)	/* local */
-	bl	.flush_hash_page
-	/* Clear out _PAGE_HPTE_SUB bits in the new linux PTE */
-	lis	r0,_PAGE_HPTE_SUB@h
-	ori	r0,r0,_PAGE_HPTE_SUB@l
-	andc	r30,r30,r0
-	b	htab_insert_pte
-
-htab_bail_ok:
-	li	r3,0
-	b	htab_bail
-
-htab_pte_insert_ok:
-	/* Insert slot number & secondary bit in PTE second half,
-	 * clear _PAGE_BUSY and set appropriate HPTE slot bit
-	 */
-	ld	r6,STK_PARAM(R6)(r1)
-	li	r0,_PAGE_BUSY
-	andc	r30,r30,r0
-	/* HPTE SUB bit */
-	li	r0,1
-	subfic	r5,r25,27	/* Must match bit position in */
-	sld	r0,r0,r5	/* pgtable.h */
-	or	r30,r30,r0
-	/* hindx */
-	sldi	r5,r25,2
-	sld	r3,r3,r5
-	li	r4,0xf
-	sld	r4,r4,r5
-	andc	r26,r26,r4
-	or	r26,r26,r3
-	ori	r5,r6,PTE_PAGE_HIDX_OFFSET
-	std	r26,0(r5)
-	lwsync
-	std	r30,0(r6)
-	li	r3, 0
-htab_bail:
-	ld	r25,STK_REG(R25)(r1)
-	ld	r26,STK_REG(R26)(r1)
-	ld	r27,STK_REG(R27)(r1)
-	ld	r28,STK_REG(R28)(r1)
-	ld	r29,STK_REG(R29)(r1)
-	ld	r30,STK_REG(R30)(r1)
-	ld	r31,STK_REG(R31)(r1)
-	addi	r1,r1,STACKFRAMESIZE
-	ld	r0,16(r1)
-	mtlr	r0
-	blr
-
-htab_modify_pte:
-	/* Keep PP bits in r4 and slot idx from the PTE around in r3 */
-	mr	r4,r3
-	sldi	r5,r25,2
-	srd	r3,r26,r5
-
-	/* Secondary group ? if yes, get an inverted hash value */
-	mr	r5,r28
-	andi.	r0,r3,0x8 /* page secondary ? */
-	beq	1f
-	not	r5,r5
-1:	andi.	r3,r3,0x7 /* extract idx alone */
-
-	/* Calculate proper slot value for ppc_md.hpte_updatepp */
-	and	r0,r5,r27
-	rldicr	r0,r0,3,63-3	/* r0 = (hash & mask) << 3 */
-	add	r3,r0,r3	/* add slot idx */
-
-	/* Call ppc_md.hpte_updatepp */
-	mr	r5,r29			/* vpn */
-	li	r6,MMU_PAGE_4K		/* base page size */
-	li	r7,MMU_PAGE_4K		/* actual page size */
-	ld	r8,STK_PARAM(R9)(r1)	/* segment size */
-	ld	r9,STK_PARAM(R8)(r1)	/* get "local" param */
-_GLOBAL(htab_call_hpte_updatepp)
-	bl	.	/* patched by htab_finish_init() */
-
-	/* if we failed, it's typically because the HPTE wasn't really
-	 * here, so we try an insertion.
-	 */
-	cmpdi	0,r3,-1
-	beq-	htab_insert_pte
-
-	/* Clear the BUSY bit and Write out the PTE */
-	li	r0,_PAGE_BUSY
-	andc	r30,r30,r0
-	ld	r6,STK_PARAM(R6)(r1)
-	std	r30,0(r6)
-	li	r3,0
-	b	htab_bail
-
-htab_wrong_access:
-	/* Bail out clearing reservation */
-	stdcx.
r31,0,r6 - li r3,1 - b htab_bail - -htab_pte_insert_failure: - /* Bail out restoring old PTE */ - ld r6,STK_PARAM(R6)(r1) - std r31,0(r6) - li r3,-1 - b htab_bail - -#endif /* CONFIG_PPC_64K_PAGES */ - -#ifdef CONFIG_PPC_HAS_HASH_64K - -/***************************************************************************** - * * - * 64K SW & 64K HW in a 64K segment pages implementation * - * * - *****************************************************************************/ - -_GLOBAL(__hash_page_64K) - mflr r0 - std r0,16(r1) - stdu r1,-STACKFRAMESIZE(r1) - /* Save all params that we need after a function call */ - std r6,STK_PARAM(R6)(r1) - std r8,STK_PARAM(R8)(r1) - std r9,STK_PARAM(R9)(r1) - - /* Save non-volatile registers. - * r31 will hold "old PTE" - * r30 is "new PTE" - * r29 is vpn - * r28 is a hash value - * r27 is hashtab mask (maybe dynamic patched instead ?) - */ - std r27,STK_REG(R27)(r1) - std r28,STK_REG(R28)(r1) - std r29,STK_REG(R29)(r1) - std r30,STK_REG(R30)(r1) - std r31,STK_REG(R31)(r1) - - /* Step 1: - * - * Check permissions, atomically mark the linux PTE busy - * and hashed. - */ -1: - ldarx r31,0,r6 - /* Check access rights (access & ~(pte_val(*ptep))) */ - andc. r0,r4,r31 - bne- ht64_wrong_access - /* Check if PTE is busy */ - andi. r0,r31,_PAGE_BUSY - /* If so, just bail out and refault if needed. Someone else - * is changing this PTE anyway and might hash it. - */ - bne- ht64_bail_ok -BEGIN_FTR_SECTION - /* Check if PTE has the cache-inhibit bit set */ - andi. r0,r31,_PAGE_NO_CACHE - /* If so, bail out and refault as a 4k page */ - bne- ht64_bail_ok -END_MMU_FTR_SECTION_IFCLR(MMU_FTR_CI_LARGE_PAGE) - /* Prepare new PTE value (turn access RW into DIRTY, then - * add BUSY and ACCESSED) - */ - rlwinm r30,r4,32-9+7,31-7,31-7 /* _PAGE_RW -> _PAGE_DIRTY */ - or r30,r30,r31 - ori r30,r30,_PAGE_BUSY | _PAGE_ACCESSED - /* Write the linux PTE atomically (setting busy) */ - stdcx. r30,0,r6 - bne- 1b - isync - - /* Step 2: - * - * Insert/Update the HPTE in the hash table. At this point, - * r4 (access) is re-useable, we use it for the new HPTE flags - */ - -BEGIN_FTR_SECTION - cmpdi r9,0 /* check segment size */ - bne 3f -END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) - /* Calc vpn and put it in r29 */ - sldi r29,r5,SID_SHIFT - VPN_SHIFT - rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT - VPN_SHIFT) - or r29,r28,r29 - - /* Calculate hash value for primary slot and store it in r28 - * r3 = va, r5 = vsid - * r0 = (va >> 16) & ((1ul << (28 - 16)) -1) - */ - rldicl r0,r3,64-16,52 - xor r28,r5,r0 /* hash */ - b 4f - -3: /* Calc vpn and put it in r29 */ - sldi r29,r5,SID_SHIFT_1T - VPN_SHIFT - rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT_1T - VPN_SHIFT) - or r29,r28,r29 - /* - * calculate hash value for primary slot and - * store it in r28 for 1T segment - * r3 = va, r5 = vsid - */ - sldi r28,r5,25 /* vsid << 25 */ - /* r0 = (va >> 16) & ((1ul << (40 - 16)) -1) */ - rldicl r0,r3,64-16,40 - xor r28,r28,r5 /* vsid ^ ( vsid << 25) */ - xor r28,r28,r0 /* hash */ - - /* Convert linux PTE bits into HW equivalents */ -4: andi. r3,r30,0x1fe /* Get basic set of flags */ - xori r3,r3,HPTE_R_N /* _PAGE_EXEC -> NOEXEC */ - rlwinm r0,r30,32-9+1,30,30 /* _PAGE_RW -> _PAGE_USER (r0) */ - rlwinm r4,r30,32-7+1,30,30 /* _PAGE_DIRTY -> _PAGE_USER (r4) */ - and r0,r0,r4 /* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/ - andc r0,r30,r0 /* r0 = pte & ~r0 */ - rlwimi r3,r0,32-1,31,31 /* Insert result into PP lsb */ - ori r3,r3,HPTE_R_C /* Always add "C" bit for perf. 
*/ - - /* We eventually do the icache sync here (maybe inline that - * code rather than call a C function...) - */ -BEGIN_FTR_SECTION - mr r4,r30 - mr r5,r7 - bl .hash_page_do_lazy_icache -END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE) - - /* At this point, r3 contains new PP bits, save them in - * place of "access" in the param area (sic) - */ - std r3,STK_PARAM(R4)(r1) - - /* Get htab_hash_mask */ - ld r4,htab_hash_mask@got(2) - ld r27,0(r4) /* htab_hash_mask -> r27 */ - - /* Check if we may already be in the hashtable, in this case, we - * go to out-of-line code to try to modify the HPTE - */ - rldicl. r0,r31,64-12,48 - bne ht64_modify_pte - -ht64_insert_pte: - /* Clear hpte bits in new pte (we also clear BUSY btw) and - * add _PAGE_HPTE_SUB0 - */ - lis r0,_PAGE_HPTEFLAGS@h - ori r0,r0,_PAGE_HPTEFLAGS@l - andc r30,r30,r0 -#ifdef CONFIG_PPC_64K_PAGES - oris r30,r30,_PAGE_HPTE_SUB0@h -#else - ori r30,r30,_PAGE_HASHPTE -#endif - /* Phyical address in r5 */ - rldicl r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT - sldi r5,r5,PAGE_SHIFT - - /* Calculate primary group hash */ - and r0,r28,r27 - rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */ - - /* Call ppc_md.hpte_insert */ - ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */ - mr r4,r29 /* Retrieve vpn */ - li r7,0 /* !bolted, !secondary */ - li r8,MMU_PAGE_64K - li r9,MMU_PAGE_64K /* actual page size */ - ld r10,STK_PARAM(R9)(r1) /* segment size */ -_GLOBAL(ht64_call_hpte_insert1) - bl . /* patched by htab_finish_init() */ - cmpdi 0,r3,0 - bge ht64_pte_insert_ok /* Insertion successful */ - cmpdi 0,r3,-2 /* Critical failure */ - beq- ht64_pte_insert_failure - - /* Now try secondary slot */ - - /* Phyical address in r5 */ - rldicl r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT - sldi r5,r5,PAGE_SHIFT - - /* Calculate secondary group hash */ - andc r0,r27,r28 - rldicr r3,r0,3,63-3 /* r0 = (~hash & mask) << 3 */ - - /* Call ppc_md.hpte_insert */ - ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */ - mr r4,r29 /* Retrieve vpn */ - li r7,HPTE_V_SECONDARY /* !bolted, secondary */ - li r8,MMU_PAGE_64K - li r9,MMU_PAGE_64K /* actual page size */ - ld r10,STK_PARAM(R9)(r1) /* segment size */ -_GLOBAL(ht64_call_hpte_insert2) - bl . /* patched by htab_finish_init() */ - cmpdi 0,r3,0 - bge+ ht64_pte_insert_ok /* Insertion successful */ - cmpdi 0,r3,-2 /* Critical failure */ - beq- ht64_pte_insert_failure - - /* Both are full, we need to evict something */ - mftb r0 - /* Pick a random group based on TB */ - andi. r0,r0,1 - mr r5,r28 - bne 2f - not r5,r5 -2: and r0,r5,r27 - rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */ - /* Call ppc_md.hpte_remove */ -_GLOBAL(ht64_call_hpte_remove) - bl . /* patched by htab_finish_init() */ - - /* Try all again */ - b ht64_insert_pte - -ht64_bail_ok: - li r3,0 - b ht64_bail - -ht64_pte_insert_ok: - /* Insert slot number & secondary bit in PTE */ - rldimi r30,r3,12,63-15 - - /* Write out the PTE with a normal write - * (maybe add eieio may be good still ?) - */ -ht64_write_out_pte: - ld r6,STK_PARAM(R6)(r1) - std r30,0(r6) - li r3, 0 -ht64_bail: - ld r27,STK_REG(R27)(r1) - ld r28,STK_REG(R28)(r1) - ld r29,STK_REG(R29)(r1) - ld r30,STK_REG(R30)(r1) - ld r31,STK_REG(R31)(r1) - addi r1,r1,STACKFRAMESIZE - ld r0,16(r1) - mtlr r0 - blr - -ht64_modify_pte: - /* Keep PP bits in r4 and slot idx from the PTE around in r3 */ - mr r4,r3 - rlwinm r3,r31,32-12,29,31 - - /* Secondary group ? if yes, get a inverted hash value */ - mr r5,r28 - andi. 
r0,r31,_PAGE_F_SECOND - beq 1f - not r5,r5 -1: - /* Calculate proper slot value for ppc_md.hpte_updatepp */ - and r0,r5,r27 - rldicr r0,r0,3,63-3 /* r0 = (hash & mask) << 3 */ - add r3,r0,r3 /* add slot idx */ - - /* Call ppc_md.hpte_updatepp */ - mr r5,r29 /* vpn */ - li r6,MMU_PAGE_64K /* base page size */ - li r7,MMU_PAGE_64K /* actual page size */ - ld r8,STK_PARAM(R9)(r1) /* segment size */ - ld r9,STK_PARAM(R8)(r1) /* get "local" param */ -_GLOBAL(ht64_call_hpte_updatepp) - bl . /* patched by htab_finish_init() */ - - /* if we failed because typically the HPTE wasn't really here - * we try an insertion. - */ - cmpdi 0,r3,-1 - beq- ht64_insert_pte - - /* Clear the BUSY bit and Write out the PTE */ - li r0,_PAGE_BUSY - andc r30,r30,r0 - b ht64_write_out_pte - -ht64_wrong_access: - /* Bail out clearing reservation */ - stdcx. r31,0,r6 - li r3,1 - b ht64_bail - -ht64_pte_insert_failure: - /* Bail out restoring old PTE */ - ld r6,STK_PARAM(R6)(r1) - std r31,0(r6) - li r3,-1 - b ht64_bail - - -#endif /* CONFIG_PPC_HAS_HASH_64K */ - - -/***************************************************************************** - * * - * Huge pages implementation is in hugetlbpage.c * - * * - *****************************************************************************/ diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c deleted file mode 100644 index 6ecc38bd5b24..000000000000 --- a/arch/powerpc/mm/hash_utils_64.c +++ /dev/null @@ -1,1436 +0,0 @@ -/* - * PowerPC64 port by Mike Corrigan and Dave Engebretsen - * {mikejc|engebret}@us.ibm.com - * - * Copyright (c) 2000 Mike Corrigan <mikejc@us.ibm.com> - * - * SMP scalability work: - * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM - * - * Module name: htab.c - * - * Description: - * PowerPC Hashed Page Table functions - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#undef DEBUG -#undef DEBUG_LOW - -#include <linux/spinlock.h> -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/proc_fs.h> -#include <linux/stat.h> -#include <linux/sysctl.h> -#include <linux/export.h> -#include <linux/ctype.h> -#include <linux/cache.h> -#include <linux/init.h> -#include <linux/signal.h> -#include <linux/memblock.h> -#include <linux/context_tracking.h> - -#include <asm/processor.h> -#include <asm/pgtable.h> -#include <asm/mmu.h> -#include <asm/mmu_context.h> -#include <asm/page.h> -#include <asm/types.h> -#include <asm/uaccess.h> -#include <asm/machdep.h> -#include <asm/prom.h> -#include <asm/tlbflush.h> -#include <asm/io.h> -#include <asm/eeh.h> -#include <asm/tlb.h> -#include <asm/cacheflush.h> -#include <asm/cputable.h> -#include <asm/sections.h> -#include <asm/spu.h> -#include <asm/udbg.h> -#include <asm/code-patching.h> -#include <asm/fadump.h> -#include <asm/firmware.h> -#include <asm/tm.h> - -#ifdef DEBUG -#define DBG(fmt...) udbg_printf(fmt) -#else -#define DBG(fmt...) -#endif - -#ifdef DEBUG_LOW -#define DBG_LOW(fmt...) udbg_printf(fmt) -#else -#define DBG_LOW(fmt...) -#endif - -#define KB (1024) -#define MB (1024*KB) -#define GB (1024L*MB) - -/* - * Note: pte --> Linux PTE - * HPTE --> PowerPC Hashed Page Table Entry - * - * Execution context: - * htab_initialize is called with the MMU off (of course), but - * the kernel has been copied down to zero so it can directly - * reference global data. 
At this point it is very difficult - * to print debug info. - * - */ - -#ifdef CONFIG_U3_DART -extern unsigned long dart_tablebase; -#endif /* CONFIG_U3_DART */ - -static unsigned long _SDR1; -struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; - -struct hash_pte *htab_address; -unsigned long htab_size_bytes; -unsigned long htab_hash_mask; -EXPORT_SYMBOL_GPL(htab_hash_mask); -int mmu_linear_psize = MMU_PAGE_4K; -int mmu_virtual_psize = MMU_PAGE_4K; -int mmu_vmalloc_psize = MMU_PAGE_4K; -#ifdef CONFIG_SPARSEMEM_VMEMMAP -int mmu_vmemmap_psize = MMU_PAGE_4K; -#endif -int mmu_io_psize = MMU_PAGE_4K; -int mmu_kernel_ssize = MMU_SEGSIZE_256M; -int mmu_highuser_ssize = MMU_SEGSIZE_256M; -u16 mmu_slb_size = 64; -EXPORT_SYMBOL_GPL(mmu_slb_size); -#ifdef CONFIG_PPC_64K_PAGES -int mmu_ci_restrictions; -#endif -#ifdef CONFIG_DEBUG_PAGEALLOC -static u8 *linear_map_hash_slots; -static unsigned long linear_map_hash_count; -static DEFINE_SPINLOCK(linear_map_hash_lock); -#endif /* CONFIG_DEBUG_PAGEALLOC */ - -/* There are definitions of page sizes arrays to be used when none - * is provided by the firmware. - */ - -/* Pre-POWER4 CPUs (4k pages only) - */ -static struct mmu_psize_def mmu_psize_defaults_old[] = { - [MMU_PAGE_4K] = { - .shift = 12, - .sllp = 0, - .penc = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1}, - .avpnm = 0, - .tlbiel = 0, - }, -}; - -/* POWER4, GPUL, POWER5 - * - * Support for 16Mb large pages - */ -static struct mmu_psize_def mmu_psize_defaults_gp[] = { - [MMU_PAGE_4K] = { - .shift = 12, - .sllp = 0, - .penc = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1}, - .avpnm = 0, - .tlbiel = 1, - }, - [MMU_PAGE_16M] = { - .shift = 24, - .sllp = SLB_VSID_L, - .penc = {[0 ... MMU_PAGE_16M - 1] = -1, [MMU_PAGE_16M] = 0, - [MMU_PAGE_16M + 1 ... MMU_PAGE_COUNT - 1] = -1 }, - .avpnm = 0x1UL, - .tlbiel = 0, - }, -}; - -static unsigned long htab_convert_pte_flags(unsigned long pteflags) -{ - unsigned long rflags = pteflags & 0x1fa; - - /* _PAGE_EXEC -> NOEXEC */ - if ((pteflags & _PAGE_EXEC) == 0) - rflags |= HPTE_R_N; - - /* PP bits. PAGE_USER is already PP bit 0x2, so we only - * need to add in 0x1 if it's a read-only user page - */ - if ((pteflags & _PAGE_USER) && !((pteflags & _PAGE_RW) && - (pteflags & _PAGE_DIRTY))) - rflags |= 1; - - /* Always add C */ - return rflags | HPTE_R_C; -} - -int htab_bolt_mapping(unsigned long vstart, unsigned long vend, - unsigned long pstart, unsigned long prot, - int psize, int ssize) -{ - unsigned long vaddr, paddr; - unsigned int step, shift; - int ret = 0; - - shift = mmu_psize_defs[psize].shift; - step = 1 << shift; - - prot = htab_convert_pte_flags(prot); - - DBG("htab_bolt_mapping(%lx..%lx -> %lx (%lx,%d,%d)\n", - vstart, vend, pstart, prot, psize, ssize); - - for (vaddr = vstart, paddr = pstart; vaddr < vend; - vaddr += step, paddr += step) { - unsigned long hash, hpteg; - unsigned long vsid = get_kernel_vsid(vaddr, ssize); - unsigned long vpn = hpt_vpn(vaddr, vsid, ssize); - unsigned long tprot = prot; - - /* - * If we hit a bad address return error. 
- */ - if (!vsid) - return -1; - /* Make kernel text executable */ - if (overlaps_kernel_text(vaddr, vaddr + step)) - tprot &= ~HPTE_R_N; - - hash = hpt_hash(vpn, shift, ssize); - hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); - - BUG_ON(!ppc_md.hpte_insert); - ret = ppc_md.hpte_insert(hpteg, vpn, paddr, tprot, - HPTE_V_BOLTED, psize, psize, ssize); - - if (ret < 0) - break; -#ifdef CONFIG_DEBUG_PAGEALLOC - if ((paddr >> PAGE_SHIFT) < linear_map_hash_count) - linear_map_hash_slots[paddr >> PAGE_SHIFT] = ret | 0x80; -#endif /* CONFIG_DEBUG_PAGEALLOC */ - } - return ret < 0 ? ret : 0; -} - -#ifdef CONFIG_MEMORY_HOTPLUG -static int htab_remove_mapping(unsigned long vstart, unsigned long vend, - int psize, int ssize) -{ - unsigned long vaddr; - unsigned int step, shift; - - shift = mmu_psize_defs[psize].shift; - step = 1 << shift; - - if (!ppc_md.hpte_removebolted) { - printk(KERN_WARNING "Platform doesn't implement " - "hpte_removebolted\n"); - return -EINVAL; - } - - for (vaddr = vstart; vaddr < vend; vaddr += step) - ppc_md.hpte_removebolted(vaddr, psize, ssize); - - return 0; -} -#endif /* CONFIG_MEMORY_HOTPLUG */ - -static int __init htab_dt_scan_seg_sizes(unsigned long node, - const char *uname, int depth, - void *data) -{ - char *type = of_get_flat_dt_prop(node, "device_type", NULL); - u32 *prop; - unsigned long size = 0; - - /* We are scanning "cpu" nodes only */ - if (type == NULL || strcmp(type, "cpu") != 0) - return 0; - - prop = (u32 *)of_get_flat_dt_prop(node, "ibm,processor-segment-sizes", - &size); - if (prop == NULL) - return 0; - for (; size >= 4; size -= 4, ++prop) { - if (prop[0] == 40) { - DBG("1T segment support detected\n"); - cur_cpu_spec->mmu_features |= MMU_FTR_1T_SEGMENT; - return 1; - } - } - cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B; - return 0; -} - -static void __init htab_init_seg_sizes(void) -{ - of_scan_flat_dt(htab_dt_scan_seg_sizes, NULL); -} - -static int __init get_idx_from_shift(unsigned int shift) -{ - int idx = -1; - - switch (shift) { - case 0xc: - idx = MMU_PAGE_4K; - break; - case 0x10: - idx = MMU_PAGE_64K; - break; - case 0x14: - idx = MMU_PAGE_1M; - break; - case 0x18: - idx = MMU_PAGE_16M; - break; - case 0x22: - idx = MMU_PAGE_16G; - break; - } - return idx; -} - -static int __init htab_dt_scan_page_sizes(unsigned long node, - const char *uname, int depth, - void *data) -{ - char *type = of_get_flat_dt_prop(node, "device_type", NULL); - u32 *prop; - unsigned long size = 0; - - /* We are scanning "cpu" nodes only */ - if (type == NULL || strcmp(type, "cpu") != 0) - return 0; - - prop = (u32 *)of_get_flat_dt_prop(node, - "ibm,segment-page-sizes", &size); - if (prop != NULL) { - pr_info("Page sizes from device-tree:\n"); - size /= 4; - cur_cpu_spec->mmu_features &= ~(MMU_FTR_16M_PAGE); - while(size > 0) { - unsigned int base_shift = prop[0]; - unsigned int slbenc = prop[1]; - unsigned int lpnum = prop[2]; - struct mmu_psize_def *def; - int idx, base_idx; - - size -= 3; prop += 3; - base_idx = get_idx_from_shift(base_shift); - if (base_idx < 0) { - /* - * skip the pte encoding also - */ - prop += lpnum * 2; size -= lpnum * 2; - continue; - } - def = &mmu_psize_defs[base_idx]; - if (base_idx == MMU_PAGE_16M) - cur_cpu_spec->mmu_features |= MMU_FTR_16M_PAGE; - - def->shift = base_shift; - if (base_shift <= 23) - def->avpnm = 0; - else - def->avpnm = (1 << (base_shift - 23)) - 1; - def->sllp = slbenc; - /* - * We don't know for sure what's up with tlbiel, so - * for now we only set it for 4K and 64K pages - */ - if (base_idx == MMU_PAGE_4K 
|| base_idx == MMU_PAGE_64K) - def->tlbiel = 1; - else - def->tlbiel = 0; - - while (size > 0 && lpnum) { - unsigned int shift = prop[0]; - int penc = prop[1]; - - prop += 2; size -= 2; - lpnum--; - - idx = get_idx_from_shift(shift); - if (idx < 0) - continue; - - if (penc == -1) - pr_err("Invalid penc for base_shift=%d " - "shift=%d\n", base_shift, shift); - - def->penc[idx] = penc; - pr_info("base_shift=%d: shift=%d, sllp=0x%04lx," - " avpnm=0x%08lx, tlbiel=%d, penc=%d\n", - base_shift, shift, def->sllp, - def->avpnm, def->tlbiel, def->penc[idx]); - } - } - return 1; - } - return 0; -} - -#ifdef CONFIG_HUGETLB_PAGE -/* Scan for 16G memory blocks that have been set aside for huge pages - * and reserve those blocks for 16G huge pages. - */ -static int __init htab_dt_scan_hugepage_blocks(unsigned long node, - const char *uname, int depth, - void *data) { - char *type = of_get_flat_dt_prop(node, "device_type", NULL); - unsigned long *addr_prop; - u32 *page_count_prop; - unsigned int expected_pages; - long unsigned int phys_addr; - long unsigned int block_size; - - /* We are scanning "memory" nodes only */ - if (type == NULL || strcmp(type, "memory") != 0) - return 0; - - /* This property is the log base 2 of the number of virtual pages that - * will represent this memory block. */ - page_count_prop = of_get_flat_dt_prop(node, "ibm,expected#pages", NULL); - if (page_count_prop == NULL) - return 0; - expected_pages = (1 << page_count_prop[0]); - addr_prop = of_get_flat_dt_prop(node, "reg", NULL); - if (addr_prop == NULL) - return 0; - phys_addr = addr_prop[0]; - block_size = addr_prop[1]; - if (block_size != (16 * GB)) - return 0; - printk(KERN_INFO "Huge page(16GB) memory: " - "addr = 0x%lX size = 0x%lX pages = %d\n", - phys_addr, block_size, expected_pages); - if (phys_addr + (16 * GB) <= memblock_end_of_DRAM()) { - memblock_reserve(phys_addr, block_size * expected_pages); - add_gpage(phys_addr, block_size, expected_pages); - } - return 0; -} -#endif /* CONFIG_HUGETLB_PAGE */ - -static void mmu_psize_set_default_penc(void) -{ - int bpsize, apsize; - for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++) - for (apsize = 0; apsize < MMU_PAGE_COUNT; apsize++) - mmu_psize_defs[bpsize].penc[apsize] = -1; -} - -static void __init htab_init_page_sizes(void) -{ - int rc; - - /* se the invalid penc to -1 */ - mmu_psize_set_default_penc(); - - /* Default to 4K pages only */ - memcpy(mmu_psize_defs, mmu_psize_defaults_old, - sizeof(mmu_psize_defaults_old)); - - /* - * Try to find the available page sizes in the device-tree - */ - rc = of_scan_flat_dt(htab_dt_scan_page_sizes, NULL); - if (rc != 0) /* Found */ - goto found; - - /* - * Not in the device-tree, let's fallback on known size - * list for 16M capable GP & GR - */ - if (mmu_has_feature(MMU_FTR_16M_PAGE)) - memcpy(mmu_psize_defs, mmu_psize_defaults_gp, - sizeof(mmu_psize_defaults_gp)); - found: -#ifndef CONFIG_DEBUG_PAGEALLOC - /* - * Pick a size for the linear mapping. Currently, we only support - * 16M, 1M and 4K which is the default - */ - if (mmu_psize_defs[MMU_PAGE_16M].shift) - mmu_linear_psize = MMU_PAGE_16M; - else if (mmu_psize_defs[MMU_PAGE_1M].shift) - mmu_linear_psize = MMU_PAGE_1M; -#endif /* CONFIG_DEBUG_PAGEALLOC */ - -#ifdef CONFIG_PPC_64K_PAGES - /* - * Pick a size for the ordinary pages. Default is 4K, we support - * 64K for user mappings and vmalloc if supported by the processor. - * We only use 64k for ioremap if the processor - * (and firmware) support cache-inhibited large pages. 
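(As an aside: the linear-mapping size selection in htab_init_page_sizes() above is just a largest-first preference scan over the available page sizes. A stand-alone sketch of that selection, using toy types rather than the kernel's mmu_psize_defs:)

#include <stdio.h>

/* Toy model: a page size is "available" when its shift is non-zero,
 * mirroring the mmu_psize_defs[].shift tests above. */
struct toy_psize { const char *name; int shift; };

static int pick_linear_psize(const struct toy_psize *defs, int n)
{
	int i;

	/* scan from largest to smallest, take the first available size */
	for (i = 0; i < n; i++)
		if (defs[i].shift)
			return i;
	return n - 1; /* fall back to the smallest (always present) */
}

int main(void)
{
	struct toy_psize defs[] = { { "16M", 24 }, { "1M", 0 }, { "4K", 12 } };

	printf("linear mapping uses %s pages\n",
	       defs[pick_linear_psize(defs, 3)].name);
	return 0;
}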
- * If not, we use 4k and set mmu_ci_restrictions so that - * hash_page knows to switch processes that use cache-inhibited - * mappings to 4k pages. - */ - if (mmu_psize_defs[MMU_PAGE_64K].shift) { - mmu_virtual_psize = MMU_PAGE_64K; - mmu_vmalloc_psize = MMU_PAGE_64K; - if (mmu_linear_psize == MMU_PAGE_4K) - mmu_linear_psize = MMU_PAGE_64K; - if (mmu_has_feature(MMU_FTR_CI_LARGE_PAGE)) { - /* - * Don't use 64k pages for ioremap on pSeries, since - * that would stop us accessing the HEA ethernet. - */ - if (!machine_is(pseries)) - mmu_io_psize = MMU_PAGE_64K; - } else - mmu_ci_restrictions = 1; - } -#endif /* CONFIG_PPC_64K_PAGES */ - -#ifdef CONFIG_SPARSEMEM_VMEMMAP - /* We try to use 16M pages for vmemmap if that is supported - * and we have at least 1G of RAM at boot - */ - if (mmu_psize_defs[MMU_PAGE_16M].shift && - memblock_phys_mem_size() >= 0x40000000) - mmu_vmemmap_psize = MMU_PAGE_16M; - else if (mmu_psize_defs[MMU_PAGE_64K].shift) - mmu_vmemmap_psize = MMU_PAGE_64K; - else - mmu_vmemmap_psize = MMU_PAGE_4K; -#endif /* CONFIG_SPARSEMEM_VMEMMAP */ - - printk(KERN_DEBUG "Page orders: linear mapping = %d, " - "virtual = %d, io = %d" -#ifdef CONFIG_SPARSEMEM_VMEMMAP - ", vmemmap = %d" -#endif - "\n", - mmu_psize_defs[mmu_linear_psize].shift, - mmu_psize_defs[mmu_virtual_psize].shift, - mmu_psize_defs[mmu_io_psize].shift -#ifdef CONFIG_SPARSEMEM_VMEMMAP - ,mmu_psize_defs[mmu_vmemmap_psize].shift -#endif - ); - -#ifdef CONFIG_HUGETLB_PAGE - /* Reserve 16G huge page memory sections for huge pages */ - of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL); -#endif /* CONFIG_HUGETLB_PAGE */ -} - -static int __init htab_dt_scan_pftsize(unsigned long node, - const char *uname, int depth, - void *data) -{ - char *type = of_get_flat_dt_prop(node, "device_type", NULL); - u32 *prop; - - /* We are scanning "cpu" nodes only */ - if (type == NULL || strcmp(type, "cpu") != 0) - return 0; - - prop = (u32 *)of_get_flat_dt_prop(node, "ibm,pft-size", NULL); - if (prop != NULL) { - /* pft_size[0] is the NUMA CEC cookie */ - ppc64_pft_size = prop[1]; - return 1; - } - return 0; -} - -static unsigned long __init htab_get_table_size(void) -{ - unsigned long mem_size, rnd_mem_size, pteg_count, psize; - - /* If hash size isn't already provided by the platform, we try to - * retrieve it from the device-tree. 
If it's not there either, we
- * calculate it now based on the total RAM size
- */
-	if (ppc64_pft_size == 0)
-		of_scan_flat_dt(htab_dt_scan_pftsize, NULL);
-	if (ppc64_pft_size)
-		return 1UL << ppc64_pft_size;
-
-	/* round mem_size up to next power of 2 */
-	mem_size = memblock_phys_mem_size();
-	rnd_mem_size = 1UL << __ilog2(mem_size);
-	if (rnd_mem_size < mem_size)
-		rnd_mem_size <<= 1;
-
-	/* # pages / 2 */
-	psize = mmu_psize_defs[mmu_virtual_psize].shift;
-	pteg_count = max(rnd_mem_size >> (psize + 1), 1UL << 11);
-
-	return pteg_count << 7;
-}
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-int create_section_mapping(unsigned long start, unsigned long end)
-{
-	return htab_bolt_mapping(start, end, __pa(start),
-				 pgprot_val(PAGE_KERNEL), mmu_linear_psize,
-				 mmu_kernel_ssize);
-}
-
-int remove_section_mapping(unsigned long start, unsigned long end)
-{
-	return htab_remove_mapping(start, end, mmu_linear_psize,
-				   mmu_kernel_ssize);
-}
-#endif /* CONFIG_MEMORY_HOTPLUG */
-
-#define FUNCTION_TEXT(A)	((*(unsigned long *)(A)))
-
-static void __init htab_finish_init(void)
-{
-	extern unsigned int *htab_call_hpte_insert1;
-	extern unsigned int *htab_call_hpte_insert2;
-	extern unsigned int *htab_call_hpte_remove;
-	extern unsigned int *htab_call_hpte_updatepp;
-
-#ifdef CONFIG_PPC_HAS_HASH_64K
-	extern unsigned int *ht64_call_hpte_insert1;
-	extern unsigned int *ht64_call_hpte_insert2;
-	extern unsigned int *ht64_call_hpte_remove;
-	extern unsigned int *ht64_call_hpte_updatepp;
-
-	patch_branch(ht64_call_hpte_insert1,
-		FUNCTION_TEXT(ppc_md.hpte_insert),
-		BRANCH_SET_LINK);
-	patch_branch(ht64_call_hpte_insert2,
-		FUNCTION_TEXT(ppc_md.hpte_insert),
-		BRANCH_SET_LINK);
-	patch_branch(ht64_call_hpte_remove,
-		FUNCTION_TEXT(ppc_md.hpte_remove),
-		BRANCH_SET_LINK);
-	patch_branch(ht64_call_hpte_updatepp,
-		FUNCTION_TEXT(ppc_md.hpte_updatepp),
-		BRANCH_SET_LINK);
-
-#endif /* CONFIG_PPC_HAS_HASH_64K */
-
-	patch_branch(htab_call_hpte_insert1,
-		FUNCTION_TEXT(ppc_md.hpte_insert),
-		BRANCH_SET_LINK);
-	patch_branch(htab_call_hpte_insert2,
-		FUNCTION_TEXT(ppc_md.hpte_insert),
-		BRANCH_SET_LINK);
-	patch_branch(htab_call_hpte_remove,
-		FUNCTION_TEXT(ppc_md.hpte_remove),
-		BRANCH_SET_LINK);
-	patch_branch(htab_call_hpte_updatepp,
-		FUNCTION_TEXT(ppc_md.hpte_updatepp),
-		BRANCH_SET_LINK);
-}
-
-static void __init htab_initialize(void)
-{
-	unsigned long table;
-	unsigned long pteg_count;
-	unsigned long prot;
-	unsigned long base = 0, size = 0, limit;
-	struct memblock_region *reg;
-
-	DBG(" -> htab_initialize()\n");
-
-	/* Initialize segment sizes */
-	htab_init_seg_sizes();
-
-	/* Initialize page sizes */
-	htab_init_page_sizes();
-
-	if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) {
-		mmu_kernel_ssize = MMU_SEGSIZE_1T;
-		mmu_highuser_ssize = MMU_SEGSIZE_1T;
-		printk(KERN_INFO "Using 1TB segments\n");
-	}
-
-	/*
-	 * Calculate the required size of the htab.  We want the number of
-	 * PTEGs to equal one half the number of real pages.
-	 */
-	htab_size_bytes = htab_get_table_size();
-	pteg_count = htab_size_bytes >> 7;
-
-	htab_hash_mask = pteg_count - 1;
-
-	if (firmware_has_feature(FW_FEATURE_LPAR)) {
-		/* Using a hypervisor which owns the htab */
-		htab_address = NULL;
-		_SDR1 = 0;
-#ifdef CONFIG_FA_DUMP
-		/*
-		 * If firmware assisted dump is active firmware preserves
-		 * the contents of htab along with entire partition memory.
-		 * Clear the htab if firmware assisted dump is active so
-		 * that we don't end up using old mappings.
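(Stepping back to htab_get_table_size() above: the sizing rule is one 128-byte PTEG, holding 8 HPTEs, per two pages of RAM, with a floor of 2^11 PTEGs. A stand-alone re-derivation of that arithmetic, assuming nothing from the kernel:)

#include <stdio.h>

static unsigned long toy_htab_size(unsigned long mem_size, int page_shift)
{
	unsigned long rnd = 1UL;
	unsigned long pteg_count;

	/* round mem_size up to the next power of 2 */
	while (rnd < mem_size)
		rnd <<= 1;

	/* # pages / 2, but never fewer than 1 << 11 PTEGs */
	pteg_count = rnd >> (page_shift + 1);
	if (pteg_count < (1UL << 11))
		pteg_count = 1UL << 11;

	return pteg_count << 7;	/* 128 bytes per PTEG */
}

int main(void)
{
	/* e.g. 1GB of RAM with 4K pages -> a 16MB hash table */
	printf("htab = %lu MB\n", toy_htab_size(1UL << 30, 12) >> 20);
	return 0;
}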
- */ - if (is_fadump_active() && ppc_md.hpte_clear_all) - ppc_md.hpte_clear_all(); -#endif - } else { - /* Find storage for the HPT. Must be contiguous in - * the absolute address space. On cell we want it to be - * in the first 2 Gig so we can use it for IOMMU hacks. - */ - if (machine_is(cell)) - limit = 0x80000000; - else - limit = MEMBLOCK_ALLOC_ANYWHERE; - - table = memblock_alloc_base(htab_size_bytes, htab_size_bytes, limit); - - DBG("Hash table allocated at %lx, size: %lx\n", table, - htab_size_bytes); - - htab_address = __va(table); - - /* htab absolute addr + encoded htabsize */ - _SDR1 = table + __ilog2(pteg_count) - 11; - - /* Initialize the HPT with no entries */ - memset((void *)table, 0, htab_size_bytes); - - /* Set SDR1 */ - mtspr(SPRN_SDR1, _SDR1); - } - - prot = pgprot_val(PAGE_KERNEL); - -#ifdef CONFIG_DEBUG_PAGEALLOC - linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT; - linear_map_hash_slots = __va(memblock_alloc_base(linear_map_hash_count, - 1, ppc64_rma_size)); - memset(linear_map_hash_slots, 0, linear_map_hash_count); -#endif /* CONFIG_DEBUG_PAGEALLOC */ - - /* On U3 based machines, we need to reserve the DART area and - * _NOT_ map it to avoid cache paradoxes as it's remapped non - * cacheable later on - */ - - /* create bolted the linear mapping in the hash table */ - for_each_memblock(memory, reg) { - base = (unsigned long)__va(reg->base); - size = reg->size; - - DBG("creating mapping for region: %lx..%lx (prot: %lx)\n", - base, size, prot); - -#ifdef CONFIG_U3_DART - /* Do not map the DART space. Fortunately, it will be aligned - * in such a way that it will not cross two memblock regions and - * will fit within a single 16Mb page. - * The DART space is assumed to be a full 16Mb region even if - * we only use 2Mb of that space. We will use more of it later - * for AGP GART. We have to use a full 16Mb large page. - */ - DBG("DART base: %lx\n", dart_tablebase); - - if (dart_tablebase != 0 && dart_tablebase >= base - && dart_tablebase < (base + size)) { - unsigned long dart_table_end = dart_tablebase + 16 * MB; - if (base != dart_tablebase) - BUG_ON(htab_bolt_mapping(base, dart_tablebase, - __pa(base), prot, - mmu_linear_psize, - mmu_kernel_ssize)); - if ((base + size) > dart_table_end) - BUG_ON(htab_bolt_mapping(dart_tablebase+16*MB, - base + size, - __pa(dart_table_end), - prot, - mmu_linear_psize, - mmu_kernel_ssize)); - continue; - } -#endif /* CONFIG_U3_DART */ - BUG_ON(htab_bolt_mapping(base, base + size, __pa(base), - prot, mmu_linear_psize, mmu_kernel_ssize)); - } - memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); - - /* - * If we have a memory_limit and we've allocated TCEs then we need to - * explicitly map the TCE area at the top of RAM. We also cope with the - * case that the TCEs start below memory_limit. - * tce_alloc_start/end are 16MB aligned so the mapping should work - * for either 4K or 16MB pages. 
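(As an illustration of the SDR1 value computed in htab_initialize() above, table + __ilog2(pteg_count) - 11: the table is aligned to its own size, so the low bits of the address are free to carry the size encoding. A stand-alone sketch, not the kernel code itself:)

/* The low bits of SDR1 hold log2(pteg_count) - 11, i.e. 0 for the
 * minimum table of 2^11 PTEGs; the size-aligned physical address of
 * the table occupies the high bits, so the two never collide. */
static unsigned long toy_sdr1(unsigned long htab_pa, unsigned long pteg_count)
{
	unsigned long log = 0;

	while ((1UL << (log + 1)) <= pteg_count)
		log++;			/* log = __ilog2(pteg_count) */
	return htab_pa + (log - 11);
}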
- */ - if (tce_alloc_start) { - tce_alloc_start = (unsigned long)__va(tce_alloc_start); - tce_alloc_end = (unsigned long)__va(tce_alloc_end); - - if (base + size >= tce_alloc_start) - tce_alloc_start = base + size + 1; - - BUG_ON(htab_bolt_mapping(tce_alloc_start, tce_alloc_end, - __pa(tce_alloc_start), prot, - mmu_linear_psize, mmu_kernel_ssize)); - } - - htab_finish_init(); - - DBG(" <- htab_initialize()\n"); -} -#undef KB -#undef MB - -void __init early_init_mmu(void) -{ - /* Setup initial STAB address in the PACA */ - get_paca()->stab_real = __pa((u64)&initial_stab); - get_paca()->stab_addr = (u64)&initial_stab; - - /* Initialize the MMU Hash table and create the linear mapping - * of memory. Has to be done before stab/slb initialization as - * this is currently where the page size encoding is obtained - */ - htab_initialize(); - - /* Initialize stab / SLB management */ - if (mmu_has_feature(MMU_FTR_SLB)) - slb_initialize(); - else - stab_initialize(get_paca()->stab_real); -} - -#ifdef CONFIG_SMP -void early_init_mmu_secondary(void) -{ - /* Initialize hash table for that CPU */ - if (!firmware_has_feature(FW_FEATURE_LPAR)) - mtspr(SPRN_SDR1, _SDR1); - - /* Initialize STAB/SLB. We use a virtual address as it works - * in real mode on pSeries. - */ - if (mmu_has_feature(MMU_FTR_SLB)) - slb_initialize(); - else - stab_initialize(get_paca()->stab_addr); -} -#endif /* CONFIG_SMP */ - -/* - * Called by asm hashtable.S for doing lazy icache flush - */ -unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) -{ - struct page *page; - - if (!pfn_valid(pte_pfn(pte))) - return pp; - - page = pte_page(pte); - - /* page is dirty */ - if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) { - if (trap == 0x400) { - flush_dcache_icache_page(page); - set_bit(PG_arch_1, &page->flags); - } else - pp |= HPTE_R_N; - } - return pp; -} - -#ifdef CONFIG_PPC_MM_SLICES -unsigned int get_paca_psize(unsigned long addr) -{ - u64 lpsizes; - unsigned char *hpsizes; - unsigned long index, mask_index; - - if (addr < SLICE_LOW_TOP) { - lpsizes = get_paca()->context.low_slices_psize; - index = GET_LOW_SLICE_INDEX(addr); - return (lpsizes >> (index * 4)) & 0xF; - } - hpsizes = get_paca()->context.high_slices_psize; - index = GET_HIGH_SLICE_INDEX(addr); - mask_index = index & 0x1; - return (hpsizes[index >> 1] >> (mask_index * 4)) & 0xF; -} - -#else -unsigned int get_paca_psize(unsigned long addr) -{ - return get_paca()->context.user_psize; -} -#endif - -/* - * Demote a segment to using 4k pages. - * For now this makes the whole process use 4k pages. - */ -#ifdef CONFIG_PPC_64K_PAGES -void demote_segment_4k(struct mm_struct *mm, unsigned long addr) -{ - if (get_slice_psize(mm, addr) == MMU_PAGE_4K) - return; - slice_set_range_psize(mm, addr, 1, MMU_PAGE_4K); -#ifdef CONFIG_SPU_BASE - spu_flush_all_slbs(mm); -#endif - if (get_paca_psize(addr) != MMU_PAGE_4K) { - get_paca()->context = mm->context; - slb_flush_and_rebolt(); - } -} -#endif /* CONFIG_PPC_64K_PAGES */ - -#ifdef CONFIG_PPC_SUBPAGE_PROT -/* - * This looks up a 2-bit protection code for a 4k subpage of a 64k page. - * Userspace sets the subpage permissions using the subpage_prot system call. - * - * Result is 0: full permissions, _PAGE_RW: read-only, - * _PAGE_USER or _PAGE_USER|_PAGE_RW: no access. 
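(The 2-bit lookup that the comment above describes can be checked in isolation. A stand-alone sketch of the bitfield extraction used by subpage_protection() below, with the same 30 - 2*index shift:)

#include <stdio.h>

/* Decode one 2-bit subpage code from a 32-bit word that holds the
 * codes for the sixteen 4K subpages of a 64K page. */
static unsigned int toy_subpage_bits(unsigned int spp_word, unsigned long ea)
{
	unsigned int idx = (ea >> 12) & 0xf;	/* which 4K subpage */

	return (spp_word >> (30 - 2 * idx)) & 0x3;
}

int main(void)
{
	/* subpage 0 occupies the top two bits of the word */
	printf("%u\n", toy_subpage_bits(0xC0000000u, 0x0000)); /* -> 3 */
	printf("%u\n", toy_subpage_bits(0xC0000000u, 0x1000)); /* -> 0 */
	return 0;
}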
- */ -static int subpage_protection(struct mm_struct *mm, unsigned long ea) -{ - struct subpage_prot_table *spt = &mm->context.spt; - u32 spp = 0; - u32 **sbpm, *sbpp; - - if (ea >= spt->maxaddr) - return 0; - if (ea < 0x100000000) { - /* addresses below 4GB use spt->low_prot */ - sbpm = spt->low_prot; - } else { - sbpm = spt->protptrs[ea >> SBP_L3_SHIFT]; - if (!sbpm) - return 0; - } - sbpp = sbpm[(ea >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)]; - if (!sbpp) - return 0; - spp = sbpp[(ea >> PAGE_SHIFT) & (SBP_L1_COUNT - 1)]; - - /* extract 2-bit bitfield for this 4k subpage */ - spp >>= 30 - 2 * ((ea >> 12) & 0xf); - - /* turn 0,1,2,3 into combination of _PAGE_USER and _PAGE_RW */ - spp = ((spp & 2) ? _PAGE_USER : 0) | ((spp & 1) ? _PAGE_RW : 0); - return spp; -} - -#else /* CONFIG_PPC_SUBPAGE_PROT */ -static inline int subpage_protection(struct mm_struct *mm, unsigned long ea) -{ - return 0; -} -#endif - -void hash_failure_debug(unsigned long ea, unsigned long access, - unsigned long vsid, unsigned long trap, - int ssize, int psize, int lpsize, unsigned long pte) -{ - if (!printk_ratelimit()) - return; - pr_info("mm: Hashing failure ! EA=0x%lx access=0x%lx current=%s\n", - ea, access, current->comm); - pr_info(" trap=0x%lx vsid=0x%lx ssize=%d base psize=%d psize %d pte=0x%lx\n", - trap, vsid, ssize, psize, lpsize, pte); -} - -/* Result code is: - * 0 - handled - * 1 - normal page fault - * -1 - critical hash insertion error - * -2 - access not permitted by subpage protection mechanism - */ -int hash_page(unsigned long ea, unsigned long access, unsigned long trap) -{ - enum ctx_state prev_state = exception_enter(); - pgd_t *pgdir; - unsigned long vsid; - struct mm_struct *mm; - pte_t *ptep; - unsigned hugeshift; - const struct cpumask *tmp; - int rc, user_region = 0, local = 0; - int psize, ssize; - - DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n", - ea, access, trap); - - /* Get region & vsid */ - switch (REGION_ID(ea)) { - case USER_REGION_ID: - user_region = 1; - mm = current->mm; - if (! mm) { - DBG_LOW(" user region with no mm !\n"); - rc = 1; - goto bail; - } - psize = get_slice_psize(mm, ea); - ssize = user_segment_size(ea); - vsid = get_vsid(mm->context.id, ea, ssize); - break; - case VMALLOC_REGION_ID: - mm = &init_mm; - vsid = get_kernel_vsid(ea, mmu_kernel_ssize); - if (ea < VMALLOC_END) - psize = mmu_vmalloc_psize; - else - psize = mmu_io_psize; - ssize = mmu_kernel_ssize; - break; - default: - /* Not a valid range - * Send the problem up to do_page_fault - */ - rc = 1; - goto bail; - } - DBG_LOW(" mm=%p, mm->pgdir=%p, vsid=%016lx\n", mm, mm->pgd, vsid); - - /* Bad address. */ - if (!vsid) { - DBG_LOW("Bad address!\n"); - rc = 1; - goto bail; - } - /* Get pgdir */ - pgdir = mm->pgd; - if (pgdir == NULL) { - rc = 1; - goto bail; - } - - /* Check CPU locality */ - tmp = cpumask_of(smp_processor_id()); - if (user_region && cpumask_equal(mm_cpumask(mm), tmp)) - local = 1; - -#ifndef CONFIG_PPC_64K_PAGES - /* If we use 4K pages and our psize is not 4K, then we might - * be hitting a special driver mapping, and need to align the - * address before we fetch the PTE. - * - * It could also be a hugepage mapping, in which case this is - * not necessary, but it's not harmful, either. 
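(The alignment step described above is a plain power-of-two round-down; a one-line stand-alone equivalent of the masking done right below:)

/* Round an effective address down to the start of its (large) page
 * before the PTE lookup, e.g.
 * toy_align_ea(0x1234567, 24) == 0x1000000, the 16M page base. */
static unsigned long toy_align_ea(unsigned long ea, unsigned int shift)
{
	return ea & ~((1UL << shift) - 1);
}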
- */ - if (psize != MMU_PAGE_4K) - ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1); -#endif /* CONFIG_PPC_64K_PAGES */ - - /* Get PTE and page size from page tables */ - ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugeshift); - if (ptep == NULL || !pte_present(*ptep)) { - DBG_LOW(" no PTE !\n"); - rc = 1; - goto bail; - } - - /* Add _PAGE_PRESENT to the required access perm */ - access |= _PAGE_PRESENT; - - /* Pre-check access permissions (will be re-checked atomically - * in __hash_page_XX but this pre-check is a fast path - */ - if (access & ~pte_val(*ptep)) { - DBG_LOW(" no access !\n"); - rc = 1; - goto bail; - } - - if (hugeshift) { - if (pmd_trans_huge(*(pmd_t *)ptep)) - rc = __hash_page_thp(ea, access, vsid, (pmd_t *)ptep, - trap, local, ssize, psize); -#ifdef CONFIG_HUGETLB_PAGE - else - rc = __hash_page_huge(ea, access, vsid, ptep, trap, - local, ssize, hugeshift, psize); -#else - else { - /* - * if we have hugeshift, and is not transhuge with - * hugetlb disabled, something is really wrong. - */ - rc = 1; - WARN_ON(1); - } -#endif - goto bail; - } - -#ifndef CONFIG_PPC_64K_PAGES - DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep)); -#else - DBG_LOW(" i-pte: %016lx %016lx\n", pte_val(*ptep), - pte_val(*(ptep + PTRS_PER_PTE))); -#endif - /* Do actual hashing */ -#ifdef CONFIG_PPC_64K_PAGES - /* If _PAGE_4K_PFN is set, make sure this is a 4k segment */ - if ((pte_val(*ptep) & _PAGE_4K_PFN) && psize == MMU_PAGE_64K) { - demote_segment_4k(mm, ea); - psize = MMU_PAGE_4K; - } - - /* If this PTE is non-cacheable and we have restrictions on - * using non cacheable large pages, then we switch to 4k - */ - if (mmu_ci_restrictions && psize == MMU_PAGE_64K && - (pte_val(*ptep) & _PAGE_NO_CACHE)) { - if (user_region) { - demote_segment_4k(mm, ea); - psize = MMU_PAGE_4K; - } else if (ea < VMALLOC_END) { - /* - * some driver did a non-cacheable mapping - * in vmalloc space, so switch vmalloc - * to 4k pages - */ - printk(KERN_ALERT "Reducing vmalloc segment " - "to 4kB pages because of " - "non-cacheable mapping\n"); - psize = mmu_vmalloc_psize = MMU_PAGE_4K; -#ifdef CONFIG_SPU_BASE - spu_flush_all_slbs(mm); -#endif - } - } - if (user_region) { - if (psize != get_paca_psize(ea)) { - get_paca()->context = mm->context; - slb_flush_and_rebolt(); - } - } else if (get_paca()->vmalloc_sllp != - mmu_psize_defs[mmu_vmalloc_psize].sllp) { - get_paca()->vmalloc_sllp = - mmu_psize_defs[mmu_vmalloc_psize].sllp; - slb_vmalloc_update(); - } -#endif /* CONFIG_PPC_64K_PAGES */ - -#ifdef CONFIG_PPC_HAS_HASH_64K - if (psize == MMU_PAGE_64K) - rc = __hash_page_64K(ea, access, vsid, ptep, trap, local, ssize); - else -#endif /* CONFIG_PPC_HAS_HASH_64K */ - { - int spp = subpage_protection(mm, ea); - if (access & spp) - rc = -2; - else - rc = __hash_page_4K(ea, access, vsid, ptep, trap, - local, ssize, spp); - } - - /* Dump some info in case of hash insertion failure, they should - * never happen so it is really useful to know if/when they do - */ - if (rc == -1) - hash_failure_debug(ea, access, vsid, trap, ssize, psize, - psize, pte_val(*ptep)); -#ifndef CONFIG_PPC_64K_PAGES - DBG_LOW(" o-pte: %016lx\n", pte_val(*ptep)); -#else - DBG_LOW(" o-pte: %016lx %016lx\n", pte_val(*ptep), - pte_val(*(ptep + PTRS_PER_PTE))); -#endif - DBG_LOW(" -> rc=%d\n", rc); - -bail: - exception_exit(prev_state); - return rc; -} -EXPORT_SYMBOL_GPL(hash_page); - -void hash_preload(struct mm_struct *mm, unsigned long ea, - unsigned long access, unsigned long trap) -{ - int hugepage_shift; - unsigned long vsid; - pgd_t *pgdir; - pte_t *ptep; - 
unsigned long flags; - int rc, ssize, local = 0; - - BUG_ON(REGION_ID(ea) != USER_REGION_ID); - -#ifdef CONFIG_PPC_MM_SLICES - /* We only prefault standard pages for now */ - if (unlikely(get_slice_psize(mm, ea) != mm->context.user_psize)) - return; -#endif - - DBG_LOW("hash_preload(mm=%p, mm->pgdir=%p, ea=%016lx, access=%lx," - " trap=%lx\n", mm, mm->pgd, ea, access, trap); - - /* Get Linux PTE if available */ - pgdir = mm->pgd; - if (pgdir == NULL) - return; - - /* Get VSID */ - ssize = user_segment_size(ea); - vsid = get_vsid(mm->context.id, ea, ssize); - if (!vsid) - return; - /* - * Hash doesn't like irqs. Walking linux page table with irq disabled - * saves us from holding multiple locks. - */ - local_irq_save(flags); - - /* - * THP pages use update_mmu_cache_pmd. We don't do - * hash preload there. Hence can ignore THP here - */ - ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugepage_shift); - if (!ptep) - goto out_exit; - - WARN_ON(hugepage_shift); -#ifdef CONFIG_PPC_64K_PAGES - /* If either _PAGE_4K_PFN or _PAGE_NO_CACHE is set (and we are on - * a 64K kernel), then we don't preload, hash_page() will take - * care of it once we actually try to access the page. - * That way we don't have to duplicate all of the logic for segment - * page size demotion here - */ - if (pte_val(*ptep) & (_PAGE_4K_PFN | _PAGE_NO_CACHE)) - goto out_exit; -#endif /* CONFIG_PPC_64K_PAGES */ - - /* Is that local to this CPU ? */ - if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) - local = 1; - - /* Hash it in */ -#ifdef CONFIG_PPC_HAS_HASH_64K - if (mm->context.user_psize == MMU_PAGE_64K) - rc = __hash_page_64K(ea, access, vsid, ptep, trap, local, ssize); - else -#endif /* CONFIG_PPC_HAS_HASH_64K */ - rc = __hash_page_4K(ea, access, vsid, ptep, trap, local, ssize, - subpage_protection(mm, ea)); - - /* Dump some info in case of hash insertion failure, they should - * never happen so it is really useful to know if/when they do - */ - if (rc == -1) - hash_failure_debug(ea, access, vsid, trap, ssize, - mm->context.user_psize, - mm->context.user_psize, - pte_val(*ptep)); -out_exit: - local_irq_restore(flags); -} - -/* WARNING: This is called from hash_low_64.S, if you change this prototype, - * do not forget to update the assembly call site ! - */ -void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize, - int local) -{ - unsigned long hash, index, shift, hidx, slot; - - DBG_LOW("flush_hash_page(vpn=%016lx)\n", vpn); - pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) { - hash = hpt_hash(vpn, shift, ssize); - hidx = __rpte_to_hidx(pte, index); - if (hidx & _PTEIDX_SECONDARY) - hash = ~hash; - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += hidx & _PTEIDX_GROUP_IX; - DBG_LOW(" sub %ld: hash=%lx, hidx=%lx\n", index, slot, hidx); - /* - * We use same base page size and actual psize, because we don't - * use these functions for hugepage - */ - ppc_md.hpte_invalidate(slot, vpn, psize, psize, ssize, local); - } pte_iterate_hashed_end(); - -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM - /* Transactions are not aborted by tlbiel, only tlbie. - * Without, syncing a page back to a block device w/ PIO could pick up - * transactional data (bad!) so we force an abort here. Before the - * sync the page will be made read-only, which will flush_hash_page. - * BIG ISSUE here: if the kernel uses a page from userspace without - * unmapping it first, it may see the speculated version. 
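(For reference, the slot arithmetic in flush_hash_page() above, pulled out into a stand-alone helper; the constants are toy stand-ins mirroring HPTES_PER_GROUP and the _PTEIDX_* masks:)

#define TOY_HPTES_PER_GROUP	8
#define TOY_PTEIDX_SECONDARY	0x8
#define TOY_PTEIDX_GROUP_IX	0x7

/* A hashed-in PTE remembers its index within the group and whether it
 * landed in the secondary group; the global slot number is rebuilt
 * from those plus the hash. */
static unsigned long toy_hpte_slot(unsigned long hash, unsigned long hidx,
				   unsigned long htab_hash_mask)
{
	if (hidx & TOY_PTEIDX_SECONDARY)
		hash = ~hash;	/* secondary group uses the inverted hash */
	return (hash & htab_hash_mask) * TOY_HPTES_PER_GROUP
		+ (hidx & TOY_PTEIDX_GROUP_IX);
}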
- */
-	if (local && cpu_has_feature(CPU_FTR_TM) &&
-	    current->thread.regs &&
-	    MSR_TM_ACTIVE(current->thread.regs->msr)) {
-		tm_enable();
-		tm_abort(TM_CAUSE_TLBI);
-	}
-#endif
-}
-
-void flush_hash_range(unsigned long number, int local)
-{
-	if (ppc_md.flush_hash_range)
-		ppc_md.flush_hash_range(number, local);
-	else {
-		int i;
-		struct ppc64_tlb_batch *batch =
-			&__get_cpu_var(ppc64_tlb_batch);
-
-		for (i = 0; i < number; i++)
-			flush_hash_page(batch->vpn[i], batch->pte[i],
-					batch->psize, batch->ssize, local);
-	}
-}
-
-/*
- * low_hash_fault is called when the low-level hash code failed
- * to insert a PTE due to a hypervisor error
- */
-void low_hash_fault(struct pt_regs *regs, unsigned long address, int rc)
-{
-	enum ctx_state prev_state = exception_enter();
-
-	if (user_mode(regs)) {
-#ifdef CONFIG_PPC_SUBPAGE_PROT
-		if (rc == -2)
-			_exception(SIGSEGV, regs, SEGV_ACCERR, address);
-		else
-#endif
-			_exception(SIGBUS, regs, BUS_ADRERR, address);
-	} else
-		bad_page_fault(regs, address, SIGBUS);
-
-	exception_exit(prev_state);
-}
-
-long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
-			   unsigned long pa, unsigned long rflags,
-			   unsigned long vflags, int psize, int ssize)
-{
-	unsigned long hpte_group;
-	long slot;
-
-repeat:
-	hpte_group = ((hash & htab_hash_mask) *
-		       HPTES_PER_GROUP) & ~0x7UL;
-
-	/* Insert into the hash table, primary slot */
-	slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, vflags,
-				  psize, psize, ssize);
-
-	/* Primary is full, try the secondary */
-	if (unlikely(slot == -1)) {
-		hpte_group = ((~hash & htab_hash_mask) *
-			      HPTES_PER_GROUP) & ~0x7UL;
-		slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags,
-					  vflags | HPTE_V_SECONDARY,
-					  psize, psize, ssize);
-		if (slot == -1) {
-			if (mftb() & 0x1)
-				hpte_group = ((hash & htab_hash_mask) *
-					      HPTES_PER_GROUP)&~0x7UL;
-
-			ppc_md.hpte_remove(hpte_group);
-			goto repeat;
-		}
-	}
-
-	return slot;
-}
-
-#ifdef CONFIG_DEBUG_PAGEALLOC
-static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
-{
-	unsigned long hash;
-	unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
-	unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
-	unsigned long mode = htab_convert_pte_flags(PAGE_KERNEL);
-	long ret;
-
-	hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
-
-	/* Don't create HPTE entries for bad address */
-	if (!vsid)
-		return;
-
-	ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode,
-				    HPTE_V_BOLTED,
-				    mmu_linear_psize, mmu_kernel_ssize);
-
-	BUG_ON (ret < 0);
-	spin_lock(&linear_map_hash_lock);
-	BUG_ON(linear_map_hash_slots[lmi] & 0x80);
-	linear_map_hash_slots[lmi] = ret | 0x80;
-	spin_unlock(&linear_map_hash_lock);
-}
-
-static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
-{
-	unsigned long hash, hidx, slot;
-	unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
-	unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
-
-	hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
-	spin_lock(&linear_map_hash_lock);
-	BUG_ON(!(linear_map_hash_slots[lmi] & 0x80));
-	hidx = linear_map_hash_slots[lmi] & 0x7f;
-	linear_map_hash_slots[lmi] = 0;
-	spin_unlock(&linear_map_hash_lock);
-	if (hidx & _PTEIDX_SECONDARY)
-		hash = ~hash;
-	slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-	slot += hidx & _PTEIDX_GROUP_IX;
-	ppc_md.hpte_invalidate(slot, vpn, mmu_linear_psize, mmu_linear_psize,
-			       mmu_kernel_ssize, 0);
-}
-
-void kernel_map_pages(struct page *page, int numpages, int enable)
-{
-	unsigned long flags, vaddr, lmi;
-	int i;
-
-
local_irq_save(flags); - for (i = 0; i < numpages; i++, page++) { - vaddr = (unsigned long)page_address(page); - lmi = __pa(vaddr) >> PAGE_SHIFT; - if (lmi >= linear_map_hash_count) - continue; - if (enable) - kernel_map_linear_page(vaddr, lmi); - else - kernel_unmap_linear_page(vaddr, lmi); - } - local_irq_restore(flags); -} -#endif /* CONFIG_DEBUG_PAGEALLOC */ - -void setup_initial_memory_limit(phys_addr_t first_memblock_base, - phys_addr_t first_memblock_size) -{ - /* We don't currently support the first MEMBLOCK not mapping 0 - * physical on those processors - */ - BUG_ON(first_memblock_base != 0); - - /* On LPAR systems, the first entry is our RMA region, - * non-LPAR 64-bit hash MMU systems don't have a limitation - * on real mode access, but using the first entry works well - * enough. We also clamp it to 1G to avoid some funky things - * such as RTAS bugs etc... - */ - ppc64_rma_size = min_t(u64, first_memblock_size, 0x40000000); - - /* Finally limit subsequent allocations */ - memblock_set_current_limit(ppc64_rma_size); -} diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c deleted file mode 100644 index e7450bdbe83a..000000000000 --- a/arch/powerpc/mm/highmem.c +++ /dev/null @@ -1,86 +0,0 @@ -/* - * highmem.c: virtual kernel memory mappings for high memory - * - * PowerPC version, stolen from the i386 version. - * - * Used in CONFIG_HIGHMEM systems for memory pages which - * are not addressable by direct kernel virtual addresses. - * - * Copyright (C) 1999 Gerhard Wichert, Siemens AG - * Gerhard.Wichert@pdb.siemens.de - * - * - * Redesigned the x86 32-bit VM architecture to deal with - * up to 16 Terrabyte physical memory. With current x86 CPUs - * we now support up to 64 Gigabytes physical RAM. - * - * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> - * - * Reworked for PowerPC by various contributors. Moved from - * highmem.h by Benjamin Herrenschmidt (c) 2009 IBM Corp. - */ - -#include <linux/highmem.h> -#include <linux/module.h> - -/* - * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap - * gives a more generic (and caching) interface. But kmap_atomic can - * be used in IRQ contexts, so in some (very limited) cases we need - * it. 
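(A minimal caller of the atomic kmap API implemented below might look like this. Hypothetical kernel-context sketch, not taken from this file; toy_zero_highmem_page is an invented name:)

static void toy_zero_highmem_page(struct page *page)
{
	void *dst = kmap_atomic(page);	/* map; also disables pagefaults */

	memset(dst, 0, PAGE_SIZE);
	kunmap_atomic(dst);		/* unmap in strict LIFO order */
}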
- */
-void *kmap_atomic_prot(struct page *page, pgprot_t prot)
-{
-	unsigned long vaddr;
-	int idx, type;
-
-	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
-	pagefault_disable();
-	if (!PageHighMem(page))
-		return page_address(page);
-
-	type = kmap_atomic_idx_push();
-	idx = type + KM_TYPE_NR*smp_processor_id();
-	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
-#ifdef CONFIG_DEBUG_HIGHMEM
-	BUG_ON(!pte_none(*(kmap_pte-idx)));
-#endif
-	__set_pte_at(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot), 1);
-	local_flush_tlb_page(NULL, vaddr);
-
-	return (void*) vaddr;
-}
-EXPORT_SYMBOL(kmap_atomic_prot);
-
-void __kunmap_atomic(void *kvaddr)
-{
-	unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
-	int type;
-
-	if (vaddr < __fix_to_virt(FIX_KMAP_END)) {
-		pagefault_enable();
-		return;
-	}
-
-	type = kmap_atomic_idx();
-
-#ifdef CONFIG_DEBUG_HIGHMEM
-	{
-		unsigned int idx;
-
-		idx = type + KM_TYPE_NR * smp_processor_id();
-		BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx));
-
-		/*
-		 * force other mappings to Oops if they'll try to access
-		 * this pte without first remapping it
-		 */
-		pte_clear(&init_mm, vaddr, kmap_pte-idx);
-		local_flush_tlb_page(NULL, vaddr);
-	}
-#endif
-
-	kmap_atomic_idx_pop();
-	pagefault_enable();
-}
-EXPORT_SYMBOL(__kunmap_atomic);
diff --git a/arch/powerpc/mm/hugetlbpage-book3e.c b/arch/powerpc/mm/hugetlbpage-book3e.c
deleted file mode 100644
index 3bc700655fc8..000000000000
--- a/arch/powerpc/mm/hugetlbpage-book3e.c
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * PPC Huge TLB Page Support for Book3E MMU
- *
- * Copyright (C) 2009 David Gibson, IBM Corporation.
- * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
- *
- */
-#include <linux/mm.h>
-#include <linux/hugetlb.h>
-
-static inline int mmu_get_tsize(int psize)
-{
-	return mmu_psize_defs[psize].enc;
-}
-
-static inline int book3e_tlb_exists(unsigned long ea, unsigned long pid)
-{
-	int found = 0;
-
-	mtspr(SPRN_MAS6, pid << 16);
-	if (mmu_has_feature(MMU_FTR_USE_TLBRSRV)) {
-		asm volatile(
-			"li %0,0\n"
-			"tlbsx. 0,%1\n"
-			"bne 1f\n"
-			"li %0,1\n"
-			"1:\n"
-			: "=&r"(found) : "r"(ea));
-	} else {
-		asm volatile(
-			"tlbsx 0,%1\n"
-			"mfspr %0,0x271\n"
-			"srwi %0,%0,31\n"
-			: "=&r"(found) : "r"(ea));
-	}
-
-	return found;
-}
-
-void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea,
-			    pte_t pte)
-{
-	unsigned long mas1, mas2;
-	u64 mas7_3;
-	unsigned long psize, tsize, shift;
-	unsigned long flags;
-	struct mm_struct *mm;
-
-#ifdef CONFIG_PPC_FSL_BOOK3E
-	int index, ncams;
-#endif
-
-	if (unlikely(is_kernel_addr(ea)))
-		return;
-
-	mm = vma->vm_mm;
-
-#ifdef CONFIG_PPC_MM_SLICES
-	psize = get_slice_psize(mm, ea);
-	tsize = mmu_get_tsize(psize);
-	shift = mmu_psize_defs[psize].shift;
-#else
-	psize = vma_mmu_pagesize(vma);
-	shift = __ilog2(psize);
-	tsize = shift - 10;
-#endif
-
-	/*
-	 * We can't be interrupted while we're setting up the MAS
-	 * registers or after we've confirmed that no tlb exists.
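(The non-slice tsize computation above assumes MAS1.TSIZE encodes the page size as 2^tsize KB, which is what tsize = shift - 10 implies. A stand-alone sketch of that conversion:)

/* toy_tsize(1UL << 22) == 12, i.e. a 4M page. */
static unsigned int toy_tsize(unsigned long page_size)
{
	unsigned int shift = 0;

	while ((1UL << (shift + 1)) <= page_size)
		shift++;	/* shift = __ilog2(page_size) */
	return shift - 10;	/* 2^tsize KB == 2^shift bytes */
}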
- */ - local_irq_save(flags); - - if (unlikely(book3e_tlb_exists(ea, mm->context.id))) { - local_irq_restore(flags); - return; - } - -#ifdef CONFIG_PPC_FSL_BOOK3E - ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; - - /* We have to use the CAM(TLB1) on FSL parts for hugepages */ - index = __get_cpu_var(next_tlbcam_idx); - mtspr(SPRN_MAS0, MAS0_ESEL(index) | MAS0_TLBSEL(1)); - - /* Just round-robin the entries and wrap when we hit the end */ - if (unlikely(index == ncams - 1)) - __get_cpu_var(next_tlbcam_idx) = tlbcam_index; - else - __get_cpu_var(next_tlbcam_idx)++; -#endif - mas1 = MAS1_VALID | MAS1_TID(mm->context.id) | MAS1_TSIZE(tsize); - mas2 = ea & ~((1UL << shift) - 1); - mas2 |= (pte_val(pte) >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK; - mas7_3 = (u64)pte_pfn(pte) << PAGE_SHIFT; - mas7_3 |= (pte_val(pte) >> PTE_BAP_SHIFT) & MAS3_BAP_MASK; - if (!pte_dirty(pte)) - mas7_3 &= ~(MAS3_SW|MAS3_UW); - - mtspr(SPRN_MAS1, mas1); - mtspr(SPRN_MAS2, mas2); - - if (mmu_has_feature(MMU_FTR_USE_PAIRED_MAS)) { - mtspr(SPRN_MAS7_MAS3, mas7_3); - } else { - mtspr(SPRN_MAS7, upper_32_bits(mas7_3)); - mtspr(SPRN_MAS3, lower_32_bits(mas7_3)); - } - - asm volatile ("tlbwe"); - - local_irq_restore(flags); -} - -void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) -{ - struct hstate *hstate = hstate_file(vma->vm_file); - unsigned long tsize = huge_page_shift(hstate) - 10; - - __flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, tsize, 0); - -} diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c deleted file mode 100644 index 0b7fb6761015..000000000000 --- a/arch/powerpc/mm/hugetlbpage-hash64.c +++ /dev/null @@ -1,125 +0,0 @@ -/* - * PPC64 Huge TLB Page Support for hash based MMUs (POWER4 and later) - * - * Copyright (C) 2003 David Gibson, IBM Corporation. - * - * Based on the IA-32 version: - * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> - */ - -#include <linux/mm.h> -#include <linux/hugetlb.h> -#include <asm/pgtable.h> -#include <asm/pgalloc.h> -#include <asm/cacheflush.h> -#include <asm/machdep.h> - -extern long hpte_insert_repeating(unsigned long hash, unsigned long vpn, - unsigned long pa, unsigned long rlags, - unsigned long vflags, int psize, int ssize); - -int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, - pte_t *ptep, unsigned long trap, int local, int ssize, - unsigned int shift, unsigned int mmu_psize) -{ - unsigned long vpn; - unsigned long old_pte, new_pte; - unsigned long rflags, pa, sz; - long slot; - - BUG_ON(shift != mmu_psize_defs[mmu_psize].shift); - - /* Search the Linux page table for a match with va */ - vpn = hpt_vpn(ea, vsid, ssize); - - /* At this point, we have a pte (old_pte) which can be used to build - * or update an HPTE. There are 2 cases: - * - * 1. There is a valid (present) pte with no associated HPTE (this is - * the most common case) - * 2. There is a valid (present) pte with an associated HPTE. The - * current values of the pp bits in the HPTE prevent access - * because we are doing software DIRTY bit management and the - * page is currently not DIRTY. 
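(The retry loop that follows implements, in effect, a compare-and-swap lock on the PTE's busy bit. A generic user-space sketch of the same pattern, with toy flag values rather than the real _PAGE_* bits:)

#include <stdatomic.h>

#define TOY_BUSY	0x1UL
#define TOY_ACCESSED	0x2UL

/* Read, test, then publish with compare-and-swap, retrying if another
 * CPU changed the entry in between; returns 0 when someone else
 * already holds the busy bit (mirroring the "retry the access" case). */
static int toy_lock_pte(_Atomic unsigned long *pte)
{
	unsigned long old, new;

	do {
		old = atomic_load(pte);
		if (old & TOY_BUSY)
			return 0;	/* someone else owns it */
		new = old | TOY_BUSY | TOY_ACCESSED;
	} while (!atomic_compare_exchange_weak(pte, &old, new));

	return 1;			/* we hold the busy bit now */
}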
- */ - - - do { - old_pte = pte_val(*ptep); - /* If PTE busy, retry the access */ - if (unlikely(old_pte & _PAGE_BUSY)) - return 0; - /* If PTE permissions don't match, take page fault */ - if (unlikely(access & ~old_pte)) - return 1; - /* Try to lock the PTE, add ACCESSED and DIRTY if it was - * a write access */ - new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED; - if (access & _PAGE_RW) - new_pte |= _PAGE_DIRTY; - } while(old_pte != __cmpxchg_u64((unsigned long *)ptep, - old_pte, new_pte)); - - rflags = 0x2 | (!(new_pte & _PAGE_RW)); - /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */ - rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N); - sz = ((1UL) << shift); - if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) - /* No CPU has hugepages but lacks no execute, so we - * don't need to worry about that case */ - rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); - - /* Check if pte already has an hpte (case 2) */ - if (unlikely(old_pte & _PAGE_HASHPTE)) { - /* There MIGHT be an HPTE for this pte */ - unsigned long hash, slot; - - hash = hpt_hash(vpn, shift, ssize); - if (old_pte & _PAGE_F_SECOND) - hash = ~hash; - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += (old_pte & _PAGE_F_GIX) >> 12; - - if (ppc_md.hpte_updatepp(slot, rflags, vpn, mmu_psize, - mmu_psize, ssize, local) == -1) - old_pte &= ~_PAGE_HPTEFLAGS; - } - - if (likely(!(old_pte & _PAGE_HASHPTE))) { - unsigned long hash = hpt_hash(vpn, shift, ssize); - - pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; - - /* clear HPTE slot informations in new PTE */ -#ifdef CONFIG_PPC_64K_PAGES - new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0; -#else - new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE; -#endif - /* Add in WIMG bits */ - rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE | - _PAGE_COHERENT | _PAGE_GUARDED)); - - slot = hpte_insert_repeating(hash, vpn, pa, rflags, 0, - mmu_psize, ssize); - - /* - * Hypervisor failure. Restore old pte and return -1 - * similar to __hash_page_* - */ - if (unlikely(slot == -2)) { - *ptep = __pte(old_pte); - hash_failure_debug(ea, access, vsid, trap, ssize, - mmu_psize, mmu_psize, old_pte); - return -1; - } - - new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX); - } - - /* - * No need to use ldarx/stdcx here - */ - *ptep = __pte(new_pte & ~_PAGE_BUSY); - return 0; -} diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 834ca8eb38f2..d3c1b749dcfc 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -15,393 +15,84 @@ #include <linux/export.h> #include <linux/of_fdt.h> #include <linux/memblock.h> -#include <linux/bootmem.h> #include <linux/moduleparam.h> -#include <asm/pgtable.h> +#include <linux/swap.h> +#include <linux/swapops.h> +#include <linux/kmemleak.h> #include <asm/pgalloc.h> #include <asm/tlb.h> #include <asm/setup.h> #include <asm/hugetlb.h> +#include <asm/pte-walk.h> +#include <asm/firmware.h> -#ifdef CONFIG_HUGETLB_PAGE +bool hugetlb_disabled = false; -#define PAGE_SHIFT_64K 16 -#define PAGE_SHIFT_16M 24 -#define PAGE_SHIFT_16G 34 +#define PTE_T_ORDER (__builtin_ffs(sizeof(pte_basic_t)) - \ + __builtin_ffs(sizeof(void *))) -unsigned int HPAGE_SHIFT; - -/* - * Tracks gpages after the device tree is scanned and before the - * huge_boot_pages list is ready. On non-Freescale implementations, this is - * just used to track 16G pages and so is a single array. 
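/*
 * A minimal standalone sketch of the "lock the PTE with a busy bit" retry
 * loop that __hash_page_huge() above relies on.  C11 atomics stand in for
 * the kernel's __cmpxchg_u64(), and the PTE_* bit values below are invented
 * for the example; only the retry structure is the point.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define PTE_BUSY	0x1UL	/* assumed bit layout, for illustration only */
#define PTE_ACCESSED	0x2UL
#define PTE_DIRTY	0x4UL

static bool lock_pte(_Atomic unsigned long *ptep, bool write)
{
	unsigned long old, new;

	do {
		old = atomic_load(ptep);
		if (old & PTE_BUSY)
			return false;	/* busy: caller retries the access */
		new = old | PTE_BUSY | PTE_ACCESSED;
		if (write)
			new |= PTE_DIRTY;	/* a write access also marks the page dirty */
	} while (!atomic_compare_exchange_weak(ptep, &old, new));

	return true;
}

int main(void)
{
	_Atomic unsigned long pte = 0x1000UL;

	if (lock_pte(&pte, true))
		printf("pte is now %#lx\n", (unsigned long)atomic_load(&pte));
	return 0;
}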
FSL-based - * implementations may have more than one gpage size, so we need multiple - * arrays - */ -#ifdef CONFIG_PPC_FSL_BOOK3E -#define MAX_NUMBER_GPAGES 128 -struct psize_gpages { - u64 gpage_list[MAX_NUMBER_GPAGES]; - unsigned int nr_gpages; -}; -static struct psize_gpages gpage_freearray[MMU_PAGE_COUNT]; -#else -#define MAX_NUMBER_GPAGES 1024 -static u64 gpage_freearray[MAX_NUMBER_GPAGES]; -static unsigned nr_gpages; -#endif - -#define hugepd_none(hpd) ((hpd).pd == 0) - -#ifdef CONFIG_PPC_BOOK3S_64 -/* - * At this point we do the placement change only for BOOK3S 64. This would - * possibly work on other subarchs. - */ - -/* - * We have PGD_INDEX_SIZ = 12 and PTE_INDEX_SIZE = 8, so that we can have - * 16GB hugepage pte in PGD and 16MB hugepage pte at PMD; - */ -int pmd_huge(pmd_t pmd) -{ - /* - * leaf pte for huge page, bottom two bits != 00 - */ - return ((pmd_val(pmd) & 0x3) != 0x0); -} - -int pud_huge(pud_t pud) -{ - /* - * leaf pte for huge page, bottom two bits != 00 - */ - return ((pud_val(pud) & 0x3) != 0x0); -} - -int pgd_huge(pgd_t pgd) +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) { /* - * leaf pte for huge page, bottom two bits != 00 + * Only called for hugetlbfs pages, hence can ignore THP and the + * irq disabled walk. */ - return ((pgd_val(pgd) & 0x3) != 0x0); -} -#else -int pmd_huge(pmd_t pmd) -{ - return 0; + return __find_linux_pte(mm->pgd, addr, NULL, NULL); } -int pud_huge(pud_t pud) +pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, unsigned long sz) { - return 0; -} - -int pgd_huge(pgd_t pgd) -{ - return 0; -} -#endif - -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) -{ - /* Only called for hugetlbfs pages, hence can ignore THP */ - return find_linux_pte_or_hugepte(mm->pgd, addr, NULL); -} - -static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, - unsigned long address, unsigned pdshift, unsigned pshift) -{ - struct kmem_cache *cachep; - pte_t *new; - -#ifdef CONFIG_PPC_FSL_BOOK3E - int i; - int num_hugepd = 1 << (pshift - pdshift); - cachep = hugepte_cache; -#else - cachep = PGT_CACHE(pdshift - pshift); -#endif - - new = kmem_cache_zalloc(cachep, GFP_KERNEL|__GFP_REPEAT); + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; - BUG_ON(pshift > HUGEPD_SHIFT_MASK); - BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK); + addr &= ~(sz - 1); - if (! new) - return -ENOMEM; + p4d = p4d_offset(pgd_offset(mm, addr), addr); + if (!mm_pud_folded(mm) && sz >= P4D_SIZE) + return (pte_t *)p4d; - spin_lock(&mm->page_table_lock); -#ifdef CONFIG_PPC_FSL_BOOK3E - /* - * We have multiple higher-level entries that point to the same - * actual pte location. Fill in each as we go and backtrack on error. - * We need all of these so the DTLB pgtable walk code can find the - * right higher-level entry without knowing if it's a hugepage or not. 
- */ - for (i = 0; i < num_hugepd; i++, hpdp++) { - if (unlikely(!hugepd_none(*hpdp))) - break; - else - /* We use the old format for PPC_FSL_BOOK3E */ - hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift; - } - /* If we bailed from the for loop early, an error occurred, clean up */ - if (i < num_hugepd) { - for (i = i - 1 ; i >= 0; i--, hpdp--) - hpdp->pd = 0; - kmem_cache_free(cachep, new); - } -#else - if (!hugepd_none(*hpdp)) - kmem_cache_free(cachep, new); - else { -#ifdef CONFIG_PPC_BOOK3S_64 - hpdp->pd = (unsigned long)new | - (shift_to_mmu_psize(pshift) << 2); -#else - hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift; -#endif - } -#endif - spin_unlock(&mm->page_table_lock); - return 0; -} - -/* - * These macros define how to determine which level of the page table holds - * the hpdp. - */ -#ifdef CONFIG_PPC_FSL_BOOK3E -#define HUGEPD_PGD_SHIFT PGDIR_SHIFT -#define HUGEPD_PUD_SHIFT PUD_SHIFT -#else -#define HUGEPD_PGD_SHIFT PUD_SHIFT -#define HUGEPD_PUD_SHIFT PMD_SHIFT -#endif - -#ifdef CONFIG_PPC_BOOK3S_64 -/* - * At this point we do the placement change only for BOOK3S 64. This would - * possibly work on other subarchs. - */ -pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) -{ - pgd_t *pg; - pud_t *pu; - pmd_t *pm; - hugepd_t *hpdp = NULL; - unsigned pshift = __ffs(sz); - unsigned pdshift = PGDIR_SHIFT; - - addr &= ~(sz-1); - pg = pgd_offset(mm, addr); - - if (pshift == PGDIR_SHIFT) - /* 16GB huge page */ - return (pte_t *) pg; - else if (pshift > PUD_SHIFT) - /* - * We need to use hugepd table - */ - hpdp = (hugepd_t *)pg; - else { - pdshift = PUD_SHIFT; - pu = pud_alloc(mm, pg, addr); - if (pshift == PUD_SHIFT) - return (pte_t *)pu; - else if (pshift > PMD_SHIFT) - hpdp = (hugepd_t *)pu; - else { - pdshift = PMD_SHIFT; - pm = pmd_alloc(mm, pu, addr); - if (pshift == PMD_SHIFT) - /* 16MB hugepage */ - return (pte_t *)pm; - else - hpdp = (hugepd_t *)pm; - } - } - if (!hpdp) + pud = pud_alloc(mm, p4d, addr); + if (!pud) return NULL; + if (!mm_pmd_folded(mm) && sz >= PUD_SIZE) + return (pte_t *)pud; - BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp)); - - if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift)) + pmd = pmd_alloc(mm, pud, addr); + if (!pmd) return NULL; - return hugepte_offset(hpdp, addr, pdshift); -} - -#else - -pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) -{ - pgd_t *pg; - pud_t *pu; - pmd_t *pm; - hugepd_t *hpdp = NULL; - unsigned pshift = __ffs(sz); - unsigned pdshift = PGDIR_SHIFT; - - addr &= ~(sz-1); - - pg = pgd_offset(mm, addr); + if (sz >= PMD_SIZE) { + /* On 8xx, all hugepages are handled as contiguous PTEs */ + if (IS_ENABLED(CONFIG_PPC_8xx)) { + int i; - if (pshift >= HUGEPD_PGD_SHIFT) { - hpdp = (hugepd_t *)pg; - } else { - pdshift = PUD_SHIFT; - pu = pud_alloc(mm, pg, addr); - if (pshift >= HUGEPD_PUD_SHIFT) { - hpdp = (hugepd_t *)pu; - } else { - pdshift = PMD_SHIFT; - pm = pmd_alloc(mm, pu, addr); - hpdp = (hugepd_t *)pm; + for (i = 0; i < sz / PMD_SIZE; i++) { + if (!pte_alloc_huge(mm, pmd + i, addr)) + return NULL; + } } + return (pte_t *)pmd; } - if (!hpdp) - return NULL; - - BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp)); - - if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift)) - return NULL; - - return hugepte_offset(hpdp, addr, pdshift); -} -#endif - -#ifdef CONFIG_PPC_FSL_BOOK3E -/* Build list of addresses of gigantic pages. This function is used in early - * boot before the buddy or bootmem allocator is setup. 
- */ -void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages) -{ - unsigned int idx = shift_to_mmu_psize(__ffs(page_size)); - int i; - - if (addr == 0) - return; - - gpage_freearray[idx].nr_gpages = number_of_pages; - - for (i = 0; i < number_of_pages; i++) { - gpage_freearray[idx].gpage_list[i] = addr; - addr += page_size; - } + return pte_alloc_huge(mm, pmd, addr); } +#ifdef CONFIG_PPC_BOOK3S_64 /* - * Moves the gigantic page addresses from the temporary list to the - * huge_boot_pages list. - */ -int alloc_bootmem_huge_page(struct hstate *hstate) -{ - struct huge_bootmem_page *m; - int idx = shift_to_mmu_psize(huge_page_shift(hstate)); - int nr_gpages = gpage_freearray[idx].nr_gpages; - - if (nr_gpages == 0) - return 0; - -#ifdef CONFIG_HIGHMEM - /* - * If gpages can be in highmem we can't use the trick of storing the - * data structure in the page; allocate space for this - */ - m = alloc_bootmem(sizeof(struct huge_bootmem_page)); - m->phys = gpage_freearray[idx].gpage_list[--nr_gpages]; -#else - m = phys_to_virt(gpage_freearray[idx].gpage_list[--nr_gpages]); -#endif - - list_add(&m->list, &huge_boot_pages); - gpage_freearray[idx].nr_gpages = nr_gpages; - gpage_freearray[idx].gpage_list[nr_gpages] = 0; - m->hstate = hstate; - - return 1; -} -/* - * Scan the command line hugepagesz= options for gigantic pages; store those in - * a list that we use to allocate the memory once all options are parsed. + * Tracks gpages after the device tree is scanned and before the + * huge_boot_pages list is ready on pseries. */ - -unsigned long gpage_npages[MMU_PAGE_COUNT]; - -static int __init do_gpage_early_setup(char *param, char *val, - const char *unused) -{ - static phys_addr_t size; - unsigned long npages; - - /* - * The hugepagesz and hugepages cmdline options are interleaved. We - * use the size variable to keep track of whether or not this was done - * properly and skip over instances where it is incorrect. Other - * command-line parsing code will issue warnings, so we don't need to. - * - */ - if ((strcmp(param, "default_hugepagesz") == 0) || - (strcmp(param, "hugepagesz") == 0)) { - size = memparse(val, NULL); - } else if (strcmp(param, "hugepages") == 0) { - if (size != 0) { - if (sscanf(val, "%lu", &npages) <= 0) - npages = 0; - gpage_npages[shift_to_mmu_psize(__ffs(size))] = npages; - size = 0; - } - } - return 0; -} - +#define MAX_NUMBER_GPAGES 1024 +__initdata static u64 gpage_freearray[MAX_NUMBER_GPAGES]; +__initdata static unsigned nr_gpages; /* - * This function allocates physical space for pages that are larger than the - * buddy allocator can handle. We want to allocate these in highmem because - * the amount of lowmem is limited. This means that this function MUST be - * called before lowmem_end_addr is set up in MMU_init() in order for the lmb - * allocate to grab highmem. + * Build list of addresses of gigantic pages. This function is used in early + * boot before the buddy allocator is setup. */ -void __init reserve_hugetlb_gpages(void) -{ - static __initdata char cmdline[COMMAND_LINE_SIZE]; - phys_addr_t size, base; - int i; - - strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE); - parse_args("hugetlb gpages", cmdline, NULL, 0, 0, 0, - &do_gpage_early_setup); - - /* - * Walk gpage list in reverse, allocating larger page sizes first. - * Skip over unsupported sizes, or sizes that have 0 gpages allocated. - * When we reach the point in the list where pages are no longer - * considered gpages, we're done. 
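/*
 * A minimal standalone sketch of the interleaved option parsing that
 * do_gpage_early_setup() above performs: a "hugepagesz=" value arms the
 * current size, and the "hugepages=" value that follows records a count
 * for that size.  The memparse_simple() helper and the fixed option list
 * below are assumptions made for this example only.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static unsigned long long memparse_simple(const char *s)
{
	char *end;
	unsigned long long v = strtoull(s, &end, 0);

	switch (*end) {
	case 'G': case 'g': v <<= 10;	/* fall through */
	case 'M': case 'm': v <<= 10;	/* fall through */
	case 'K': case 'k': v <<= 10; break;
	}
	return v;
}

int main(void)
{
	/* Pretend command line, already split into param/value pairs. */
	const char *params[][2] = {
		{ "hugepagesz", "16M" }, { "hugepages", "8" },
		{ "hugepagesz", "1G"  }, { "hugepages", "2" },
	};
	unsigned long long size = 0;
	unsigned long npages[64] = { 0 };	/* indexed by log2(page size) */

	for (size_t i = 0; i < sizeof(params) / sizeof(params[0]); i++) {
		if (!strcmp(params[i][0], "hugepagesz"))
			size = memparse_simple(params[i][1]);
		else if (!strcmp(params[i][0], "hugepages") && size) {
			npages[__builtin_ctzll(size)] = strtoul(params[i][1], NULL, 10);
			size = 0;	/* a count must follow its size */
		}
	}

	for (int shift = 0; shift < 64; shift++)
		if (npages[shift])
			printf("%lu gigantic pages of order %d\n", npages[shift], shift);
	return 0;
}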
- */ - for (i = MMU_PAGE_COUNT-1; i >= 0; i--) { - if (mmu_psize_defs[i].shift == 0 || gpage_npages[i] == 0) - continue; - else if (mmu_psize_to_shift(i) < (MAX_ORDER + PAGE_SHIFT)) - break; - - size = (phys_addr_t)(1ULL << mmu_psize_to_shift(i)); - base = memblock_alloc_base(size * gpage_npages[i], size, - MEMBLOCK_ALLOC_ANYWHERE); - add_gpage(base, size, gpage_npages[i]); - } -} - -#else /* !PPC_FSL_BOOK3E */ - -/* Build list of addresses of gigantic pages. This function is used in early - * boot before the buddy or bootmem allocator is setup. - */ -void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages) +void __init pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages) { if (!addr) return; @@ -413,458 +104,82 @@ void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages) } } -/* Moves the gigantic page addresses from the temporary list to the - * huge_boot_pages list. - */ -int alloc_bootmem_huge_page(struct hstate *hstate) +static int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate) { struct huge_bootmem_page *m; if (nr_gpages == 0) return 0; m = phys_to_virt(gpage_freearray[--nr_gpages]); gpage_freearray[nr_gpages] = 0; - list_add(&m->list, &huge_boot_pages); + list_add(&m->list, &huge_boot_pages[0]); m->hstate = hstate; + m->flags = 0; return 1; } -#endif - -int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) -{ - return 0; -} - -#ifdef CONFIG_PPC_FSL_BOOK3E -#define HUGEPD_FREELIST_SIZE \ - ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t)) - -struct hugepd_freelist { - struct rcu_head rcu; - unsigned int index; - void *ptes[0]; -}; - -static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur); - -static void hugepd_free_rcu_callback(struct rcu_head *head) -{ - struct hugepd_freelist *batch = - container_of(head, struct hugepd_freelist, rcu); - unsigned int i; - - for (i = 0; i < batch->index; i++) - kmem_cache_free(hugepte_cache, batch->ptes[i]); - - free_page((unsigned long)batch); -} - -static void hugepd_free(struct mmu_gather *tlb, void *hugepte) -{ - struct hugepd_freelist **batchp; - - batchp = &__get_cpu_var(hugepd_freelist_cur); - - if (atomic_read(&tlb->mm->mm_users) < 2 || - cpumask_equal(mm_cpumask(tlb->mm), - cpumask_of(smp_processor_id()))) { - kmem_cache_free(hugepte_cache, hugepte); - return; - } - - if (*batchp == NULL) { - *batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC); - (*batchp)->index = 0; - } - - (*batchp)->ptes[(*batchp)->index++] = hugepte; - if ((*batchp)->index == HUGEPD_FREELIST_SIZE) { - call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback); - *batchp = NULL; - } -} -#endif - -static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift, - unsigned long start, unsigned long end, - unsigned long floor, unsigned long ceiling) -{ - pte_t *hugepte = hugepd_page(*hpdp); - int i; - - unsigned long pdmask = ~((1UL << pdshift) - 1); - unsigned int num_hugepd = 1; - -#ifdef CONFIG_PPC_FSL_BOOK3E - /* Note: On fsl the hpdp may be the first of several */ - num_hugepd = (1 << (hugepd_shift(*hpdp) - pdshift)); -#else - unsigned int shift = hugepd_shift(*hpdp); -#endif - - start &= pdmask; - if (start < floor) - return; - if (ceiling) { - ceiling &= pdmask; - if (! 
ceiling) - return; - } - if (end - 1 > ceiling - 1) - return; - - for (i = 0; i < num_hugepd; i++, hpdp++) - hpdp->pd = 0; - - tlb->need_flush = 1; - -#ifdef CONFIG_PPC_FSL_BOOK3E - hugepd_free(tlb, hugepte); -#else - pgtable_free_tlb(tlb, hugepte, pdshift - shift); -#endif -} -static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, - unsigned long addr, unsigned long end, - unsigned long floor, unsigned long ceiling) +bool __init hugetlb_node_alloc_supported(void) { - pmd_t *pmd; - unsigned long next; - unsigned long start; - - start = addr; - do { - pmd = pmd_offset(pud, addr); - next = pmd_addr_end(addr, end); - if (!is_hugepd(pmd)) { - /* - * if it is not hugepd pointer, we should already find - * it cleared. - */ - WARN_ON(!pmd_none_or_clear_bad(pmd)); - continue; - } -#ifdef CONFIG_PPC_FSL_BOOK3E - /* - * Increment next by the size of the huge mapping since - * there may be more than one entry at this level for a - * single hugepage, but all of them point to - * the same kmem cache that holds the hugepte. - */ - next = addr + (1 << hugepd_shift(*(hugepd_t *)pmd)); -#endif - free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT, - addr, next, floor, ceiling); - } while (addr = next, addr != end); - - start &= PUD_MASK; - if (start < floor) - return; - if (ceiling) { - ceiling &= PUD_MASK; - if (!ceiling) - return; - } - if (end - 1 > ceiling - 1) - return; - - pmd = pmd_offset(pud, start); - pud_clear(pud); - pmd_free_tlb(tlb, pmd, start); + return false; } - -static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, - unsigned long addr, unsigned long end, - unsigned long floor, unsigned long ceiling) -{ - pud_t *pud; - unsigned long next; - unsigned long start; - - start = addr; - do { - pud = pud_offset(pgd, addr); - next = pud_addr_end(addr, end); - if (!is_hugepd(pud)) { - if (pud_none_or_clear_bad(pud)) - continue; - hugetlb_free_pmd_range(tlb, pud, addr, next, floor, - ceiling); - } else { -#ifdef CONFIG_PPC_FSL_BOOK3E - /* - * Increment next by the size of the huge mapping since - * there may be more than one entry at this level for a - * single hugepage, but all of them point to - * the same kmem cache that holds the hugepte. - */ - next = addr + (1 << hugepd_shift(*(hugepd_t *)pud)); #endif - free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT, - addr, next, floor, ceiling); - } - } while (addr = next, addr != end); - start &= PGDIR_MASK; - if (start < floor) - return; - if (ceiling) { - ceiling &= PGDIR_MASK; - if (!ceiling) - return; - } - if (end - 1 > ceiling - 1) - return; - pud = pud_offset(pgd, start); - pgd_clear(pgd); - pud_free_tlb(tlb, pud, start); -} - -/* - * This function frees user-level page tables of a process. - * - * Must be called with pagetable lock held. - */ -void hugetlb_free_pgd_range(struct mmu_gather *tlb, - unsigned long addr, unsigned long end, - unsigned long floor, unsigned long ceiling) +int __init alloc_bootmem_huge_page(struct hstate *h, int nid) { - pgd_t *pgd; - unsigned long next; - - /* - * Because there are a number of different possible pagetable - * layouts for hugepage ranges, we limit knowledge of how - * things should be laid out to the allocation path - * (huge_pte_alloc(), above). Everything else works out the - * structure as it goes from information in the hugepd - * pointers. 
That means that we can't here use the - * optimization used in the normal page free_pgd_range(), of - * checking whether we're actually covering a large enough - * range to have to do anything at the top level of the walk - * instead of at the bottom. - * - * To make sense of this, you should probably go read the big - * block comment at the top of the normal free_pgd_range(), - * too. - */ - do { - next = pgd_addr_end(addr, end); - pgd = pgd_offset(tlb->mm, addr); - if (!is_hugepd(pgd)) { - if (pgd_none_or_clear_bad(pgd)) - continue; - hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling); - } else { -#ifdef CONFIG_PPC_FSL_BOOK3E - /* - * Increment next by the size of the huge mapping since - * there may be more than one entry at the pgd level - * for a single hugepage, but all of them point to the - * same kmem cache that holds the hugepte. - */ - next = addr + (1 << hugepd_shift(*(hugepd_t *)pgd)); -#endif - free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT, - addr, next, floor, ceiling); - } - } while (addr = next, addr != end); -} - -struct page * -follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) -{ - pte_t *ptep; - struct page *page; - unsigned shift; - unsigned long mask; - /* - * Transparent hugepages are handled by generic code. We can skip them - * here. - */ - ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift); - - /* Verify it is a huge page else bail. */ - if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep)) - return ERR_PTR(-EINVAL); - - mask = (1UL << shift) - 1; - page = pte_page(*ptep); - if (page) - page += (address & mask) / PAGE_SIZE; - - return page; -} - -struct page * -follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int write) -{ - BUG(); - return NULL; -} - -static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, - unsigned long sz) -{ - unsigned long __boundary = (addr + sz) & ~(sz-1); - return (__boundary - 1 < end - 1) ? __boundary : end; -} - -int gup_hugepd(hugepd_t *hugepd, unsigned pdshift, - unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) -{ - pte_t *ptep; - unsigned long sz = 1UL << hugepd_shift(*hugepd); - unsigned long next; - - ptep = hugepte_offset(hugepd, addr, pdshift); - do { - next = hugepte_addr_end(addr, end, sz); - if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr)) - return 0; - } while (ptep++, addr = next, addr != end); - - return 1; -} - -#ifdef CONFIG_PPC_MM_SLICES -unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags) -{ - struct hstate *hstate = hstate_file(file); - int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate)); - - return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1); -} -#endif - -unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) -{ -#ifdef CONFIG_PPC_MM_SLICES - unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start); - - return 1UL << mmu_psize_to_shift(psize); -#else - if (!is_vm_hugetlb_page(vma)) - return PAGE_SIZE; - - return huge_page_size(hstate_vma(vma)); +#ifdef CONFIG_PPC_BOOK3S_64 + if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled()) + return pseries_alloc_bootmem_huge_page(h); #endif + return __alloc_bootmem_huge_page(h, nid); } -static inline bool is_power_of_4(unsigned long x) -{ - if (is_power_of_2(x)) - return (__ilog2(x) % 2) ? 
false : true; - return false; -} - -static int __init add_huge_page_size(unsigned long long size) +bool __init arch_hugetlb_valid_size(unsigned long size) { int shift = __ffs(size); int mmu_psize; /* Check that it is a page size supported by the hardware and * that it fits within pagetable and slice limits. */ -#ifdef CONFIG_PPC_FSL_BOOK3E - if ((size < PAGE_SIZE) || !is_power_of_4(size)) - return -EINVAL; -#else - if (!is_power_of_2(size) - || (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT)) - return -EINVAL; -#endif + if (size <= PAGE_SIZE || !is_power_of_2(size)) + return false; - if ((mmu_psize = shift_to_mmu_psize(shift)) < 0) - return -EINVAL; - -#ifdef CONFIG_SPU_FS_64K_LS - /* Disable support for 64K huge pages when 64K SPU local store - * support is enabled as the current implementation conflicts. - */ - if (shift == PAGE_SHIFT_64K) - return -EINVAL; -#endif /* CONFIG_SPU_FS_64K_LS */ + mmu_psize = check_and_get_huge_psize(shift); + if (mmu_psize < 0) + return false; BUG_ON(mmu_psize_defs[mmu_psize].shift != shift); - /* Return if huge page size has already been setup */ - if (size_to_hstate(size)) - return 0; - - hugetlb_add_hstate(shift - PAGE_SHIFT); - - return 0; + return true; } -static int __init hugepage_setup_sz(char *str) +static int __init add_huge_page_size(unsigned long long size) { - unsigned long long size; - - size = memparse(str, &str); + int shift = __ffs(size); - if (add_huge_page_size(size) != 0) - printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size); + if (!arch_hugetlb_valid_size((unsigned long)size)) + return -EINVAL; - return 1; + hugetlb_add_hstate(shift - PAGE_SHIFT); + return 0; } -__setup("hugepagesz=", hugepage_setup_sz); -#ifdef CONFIG_PPC_FSL_BOOK3E -struct kmem_cache *hugepte_cache; static int __init hugetlbpage_init(void) { + bool configured = false; int psize; - for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { - unsigned shift; - - if (!mmu_psize_defs[psize].shift) - continue; - - shift = mmu_psize_to_shift(psize); - - /* Don't treat normal page sizes as huge... */ - if (shift != PAGE_SHIFT) - if (add_huge_page_size(1ULL << shift) < 0) - continue; + if (hugetlb_disabled) { + pr_info("HugeTLB support is disabled!\n"); + return 0; } - /* - * Create a kmem cache for hugeptes. The bottom bits in the pte have - * size information encoded in them, so align them to allow this - */ - hugepte_cache = kmem_cache_create("hugepte-cache", sizeof(pte_t), - HUGEPD_SHIFT_MASK + 1, 0, NULL); - if (hugepte_cache == NULL) - panic("%s: Unable to create kmem cache for hugeptes\n", - __func__); - - /* Default hpage size = 4M */ - if (mmu_psize_defs[MMU_PAGE_4M].shift) - HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift; - else - panic("%s: Unable to set default huge page size\n", __func__); - - - return 0; -} -#else -static int __init hugetlbpage_init(void) -{ - int psize; - - if (!mmu_has_feature(MMU_FTR_16M_PAGE)) + if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !radix_enabled() && + !mmu_has_feature(MMU_FTR_16M_PAGE)) return -ENODEV; for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { unsigned shift; - unsigned pdshift; if (!mmu_psize_defs[psize].shift) continue; @@ -874,216 +189,29 @@ static int __init hugetlbpage_init(void) if (add_huge_page_size(1ULL << shift) < 0) continue; - if (shift < PMD_SHIFT) - pdshift = PMD_SHIFT; - else if (shift < PUD_SHIFT) - pdshift = PUD_SHIFT; - else - pdshift = PGDIR_SHIFT; - /* - * if we have pdshift and shift value same, we don't - * use pgt cache for hugepd. 
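/*
 * A standalone sketch of the size validity checks used above: the generic
 * path accepts any power-of-two size larger than the base page, while the
 * old FSL Book3E path additionally required a power of four (even log2).
 * PAGE_SHIFT_EXAMPLE is an assumed value chosen for the illustration.
 */
#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT_EXAMPLE 12	/* assume 4K base pages */

static bool is_power_of_2(unsigned long long x)
{
	return x && !(x & (x - 1));
}

static bool is_power_of_4(unsigned long long x)
{
	/* power of two whose log2 is even: 4, 16, 64, ... */
	return is_power_of_2(x) && !(__builtin_ctzll(x) & 1);
}

int main(void)
{
	unsigned long long sizes[] = { 1ULL << 16, 1ULL << 22, 1ULL << 24, 3ULL << 20 };

	for (int i = 0; i < 4; i++) {
		unsigned long long sz = sizes[i];
		bool generic = sz > (1ULL << PAGE_SHIFT_EXAMPLE) && is_power_of_2(sz);
		bool fsl = sz > (1ULL << PAGE_SHIFT_EXAMPLE) && is_power_of_4(sz);

		printf("%#llx: generic %s, fsl %s\n", sz,
		       generic ? "ok" : "no", fsl ? "ok" : "no");
	}
	return 0;
}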
- */ - if (pdshift != shift) { - pgtable_cache_add(pdshift - shift, NULL); - if (!PGT_CACHE(pdshift - shift)) - panic("hugetlbpage_init(): could not create " - "pgtable cache for %d bit pagesize\n", shift); - } + configured = true; } - /* Set default large page size. Currently, we pick 16M or 1M - * depending on what is available - */ - if (mmu_psize_defs[MMU_PAGE_16M].shift) - HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift; - else if (mmu_psize_defs[MMU_PAGE_1M].shift) - HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift; + if (!configured) + pr_info("Failed to initialize. Disabling HugeTLB"); return 0; } -#endif -module_init(hugetlbpage_init); - -void flush_dcache_icache_hugepage(struct page *page) -{ - int i; - void *start; - - BUG_ON(!PageCompound(page)); - for (i = 0; i < (1UL << compound_order(page)); i++) { - if (!PageHighMem(page)) { - __flush_dcache_icache(page_address(page+i)); - } else { - start = kmap_atomic(page+i); - __flush_dcache_icache(start); - kunmap_atomic(start); - } - } -} +arch_initcall(hugetlbpage_init); -#endif /* CONFIG_HUGETLB_PAGE */ - -/* - * We have 4 cases for pgds and pmds: - * (1) invalid (all zeroes) - * (2) pointer to next table, as normal; bottom 6 bits == 0 - * (3) leaf pte for huge page, bottom two bits != 00 - * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table - * - * So long as we atomically load page table pointers we are safe against teardown, - * we can follow the address down to the the page and take a ref on it. - */ - -pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift) +void __init gigantic_hugetlb_cma_reserve(void) { - pgd_t pgd, *pgdp; - pud_t pud, *pudp; - pmd_t pmd, *pmdp; - pte_t *ret_pte; - hugepd_t *hpdp = NULL; - unsigned pdshift = PGDIR_SHIFT; - - if (shift) - *shift = 0; + unsigned long order = 0; - pgdp = pgdir + pgd_index(ea); - pgd = ACCESS_ONCE(*pgdp); - /* - * Always operate on the local stack value. This make sure the - * value don't get updated by a parallel THP split/collapse, - * page fault or a page unmap. The return pte_t * is still not - * stable. So should be checked there for above conditions. - */ - if (pgd_none(pgd)) - return NULL; - else if (pgd_huge(pgd)) { - ret_pte = (pte_t *) pgdp; - goto out; - } else if (is_hugepd(&pgd)) - hpdp = (hugepd_t *)&pgd; - else { + if (radix_enabled()) + order = PUD_SHIFT - PAGE_SHIFT; + else if (!firmware_has_feature(FW_FEATURE_LPAR) && mmu_psize_defs[MMU_PAGE_16G].shift) /* - * Even if we end up with an unmap, the pgtable will not - * be freed, because we do an rcu free and here we are - * irq disabled + * For pseries we do use ibm,expected#pages for reserving 16G pages. */ - pdshift = PUD_SHIFT; - pudp = pud_offset(&pgd, ea); - pud = ACCESS_ONCE(*pudp); + order = mmu_psize_to_shift(MMU_PAGE_16G) - PAGE_SHIFT; - if (pud_none(pud)) - return NULL; - else if (pud_huge(pud)) { - ret_pte = (pte_t *) pudp; - goto out; - } else if (is_hugepd(&pud)) - hpdp = (hugepd_t *)&pud; - else { - pdshift = PMD_SHIFT; - pmdp = pmd_offset(&pud, ea); - pmd = ACCESS_ONCE(*pmdp); - /* - * A hugepage collapse is captured by pmd_none, because - * it mark the pmd none and do a hpte invalidate. 
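/*
 * A small standalone illustration of the "4 cases for pgds and pmds"
 * classification described in the comment above: the decision is made
 * purely from the low bits of the directory entry.  The enum names,
 * masks and sample values below are chosen for the example, following
 * that comment rather than any particular kernel configuration.
 */
#include <stdio.h>

enum entry_kind { ENTRY_NONE, ENTRY_TABLE, ENTRY_HUGE_LEAF, ENTRY_HUGEPD };

static enum entry_kind classify(unsigned long val)
{
	if (val == 0)
		return ENTRY_NONE;	/* (1) invalid: all zeroes              */
	if (val & 0x3)
		return ENTRY_HUGE_LEAF;	/* (3) leaf pte: bottom two bits != 00  */
	if (val & 0x3c)
		return ENTRY_HUGEPD;	/* (4) hugepd: size kept in bits 2-5    */
	return ENTRY_TABLE;		/* (2) pointer to next-level table      */
}

int main(void)
{
	unsigned long samples[] = {
		0x0UL, 0xc000000000de0000UL, 0xc000000000de0003UL, 0xc000000000de002cUL
	};
	static const char * const names[] = {
		"invalid", "next-level table", "huge-page leaf", "hugepd"
	};

	for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("%#lx -> %s\n", samples[i], names[classify(samples[i])]);
	return 0;
}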
- * - * A hugepage split is captured by pmd_trans_splitting - * because we mark the pmd trans splitting and do a - * hpte invalidate - * - */ - if (pmd_none(pmd) || pmd_trans_splitting(pmd)) - return NULL; - - if (pmd_huge(pmd) || pmd_large(pmd)) { - ret_pte = (pte_t *) pmdp; - goto out; - } else if (is_hugepd(&pmd)) - hpdp = (hugepd_t *)&pmd; - else - return pte_offset_kernel(&pmd, ea); - } - } - if (!hpdp) - return NULL; - - ret_pte = hugepte_offset(hpdp, ea, pdshift); - pdshift = hugepd_shift(*hpdp); -out: - if (shift) - *shift = pdshift; - return ret_pte; -} -EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte); - -int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - unsigned long mask; - unsigned long pte_end; - struct page *head, *page, *tail; - pte_t pte; - int refs; - - pte_end = (addr + sz) & ~(sz-1); - if (pte_end < end) - end = pte_end; - - pte = ACCESS_ONCE(*ptep); - mask = _PAGE_PRESENT | _PAGE_USER; - if (write) - mask |= _PAGE_RW; - - if ((pte_val(pte) & mask) != mask) - return 0; - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - /* - * check for splitting here - */ - if (pmd_trans_splitting(pte_pmd(pte))) - return 0; -#endif - - /* hugepages are never "special" */ - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - - refs = 0; - head = pte_page(pte); - - page = head + ((addr & (sz-1)) >> PAGE_SHIFT); - tail = page; - do { - VM_BUG_ON(compound_head(page) != head); - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (addr += PAGE_SIZE, addr != end); - - if (!page_cache_add_speculative(head, refs)) { - *nr -= refs; - return 0; - } - - if (unlikely(pte_val(pte) != pte_val(*ptep))) { - /* Could be optimized better */ - *nr -= refs; - while (refs--) - put_page(head); - return 0; - } - - /* - * Any tail page need their mapcount reference taken before we - * return. - */ - while (refs--) { - if (PageTail(tail)) - get_huge_page_tail(tail); - tail++; - } - - return 1; + if (order) + hugetlb_cma_reserve(order); } diff --git a/arch/powerpc/mm/icswx.c b/arch/powerpc/mm/icswx.c deleted file mode 100644 index 915412e4d5ba..000000000000 --- a/arch/powerpc/mm/icswx.c +++ /dev/null @@ -1,292 +0,0 @@ -/* - * ICSWX and ACOP Management - * - * Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/types.h> -#include <linux/mm.h> -#include <linux/spinlock.h> -#include <linux/module.h> -#include <linux/uaccess.h> - -#include "icswx.h" - -/* - * The processor and its L2 cache cause the icswx instruction to - * generate a COP_REQ transaction on PowerBus. The transaction has no - * address, and the processor does not perform an MMU access to - * authenticate the transaction. The command portion of the PowerBus - * COP_REQ transaction includes the LPAR_ID (LPID) and the coprocessor - * Process ID (PID), which the coprocessor compares to the authorized - * LPID and PID held in the coprocessor, to determine if the process - * is authorized to generate the transaction. The data of the COP_REQ - * transaction is 128-byte or less in size and is placed in cacheable - * memory on a 128-byte cache line boundary. 
- * - * The task to use a coprocessor should use use_cop() to mark the use - * of the Coprocessor Type (CT) and context switching. On a server - * class processor, the PID register is used only for coprocessor - * management + * and so a coprocessor PID is allocated before - * executing icswx + * instruction. Drop_cop() is used to free the - * coprocessor PID. - * - * Example: - * Host Fabric Interface (HFI) is a PowerPC network coprocessor. - * Each HFI have multiple windows. Each HFI window serves as a - * network device sending to and receiving from HFI network. - * HFI immediate send function uses icswx instruction. The immediate - * send function allows small (single cache-line) packets be sent - * without using the regular HFI send FIFO and doorbell, which are - * much slower than immediate send. - * - * For each task intending to use HFI immediate send, the HFI driver - * calls use_cop() to obtain a coprocessor PID for the task. - * The HFI driver then allocate a free HFI window and save the - * coprocessor PID to the HFI window to allow the task to use the - * HFI window. - * - * The HFI driver repeatedly creates immediate send packets and - * issues icswx instruction to send data through the HFI window. - * The HFI compares the coprocessor PID in the CPU PID register - * to the PID held in the HFI window to determine if the transaction - * is allowed. - * - * When the task to release the HFI window, the HFI driver calls - * drop_cop() to release the coprocessor PID. - */ - -void switch_cop(struct mm_struct *next) -{ -#ifdef CONFIG_PPC_ICSWX_PID - mtspr(SPRN_PID, next->context.cop_pid); -#endif - mtspr(SPRN_ACOP, next->context.acop); -} - -/** - * Start using a coprocessor. - * @acop: mask of coprocessor to be used. - * @mm: The mm the coprocessor to associate with. Most likely current mm. - * - * Return a positive PID if successful. Negative errno otherwise. - * The returned PID will be fed to the coprocessor to determine if an - * icswx transaction is authenticated. - */ -int use_cop(unsigned long acop, struct mm_struct *mm) -{ - int ret; - - if (!cpu_has_feature(CPU_FTR_ICSWX)) - return -ENODEV; - - if (!mm || !acop) - return -EINVAL; - - /* The page_table_lock ensures mm_users won't change under us */ - spin_lock(&mm->page_table_lock); - spin_lock(mm->context.cop_lockp); - - ret = get_cop_pid(mm); - if (ret < 0) - goto out; - - /* update acop */ - mm->context.acop |= acop; - - sync_cop(mm); - - /* - * If this is a threaded process then there might be other threads - * running. We need to send an IPI to force them to pick up any - * change in PID and ACOP. - */ - if (atomic_read(&mm->mm_users) > 1) - smp_call_function(sync_cop, mm, 1); - -out: - spin_unlock(mm->context.cop_lockp); - spin_unlock(&mm->page_table_lock); - - return ret; -} -EXPORT_SYMBOL_GPL(use_cop); - -/** - * Stop using a coprocessor. - * @acop: mask of coprocessor to be stopped. - * @mm: The mm the coprocessor associated with. - */ -void drop_cop(unsigned long acop, struct mm_struct *mm) -{ - int free_pid; - - if (!cpu_has_feature(CPU_FTR_ICSWX)) - return; - - if (WARN_ON_ONCE(!mm)) - return; - - /* The page_table_lock ensures mm_users won't change under us */ - spin_lock(&mm->page_table_lock); - spin_lock(mm->context.cop_lockp); - - mm->context.acop &= ~acop; - - free_pid = disable_cop_pid(mm); - sync_cop(mm); - - /* - * If this is a threaded process then there might be other threads - * running. We need to send an IPI to force them to pick up any - * change in PID and ACOP. 
- */ - if (atomic_read(&mm->mm_users) > 1) - smp_call_function(sync_cop, mm, 1); - - if (free_pid != COP_PID_NONE) - free_cop_pid(free_pid); - - spin_unlock(mm->context.cop_lockp); - spin_unlock(&mm->page_table_lock); -} -EXPORT_SYMBOL_GPL(drop_cop); - -static int acop_use_cop(int ct) -{ - /* There is no alternate policy, yet */ - return -1; -} - -/* - * Get the instruction word at the NIP - */ -static u32 acop_get_inst(struct pt_regs *regs) -{ - u32 inst; - u32 __user *p; - - p = (u32 __user *)regs->nip; - if (!access_ok(VERIFY_READ, p, sizeof(*p))) - return 0; - - if (__get_user(inst, p)) - return 0; - - return inst; -} - -/** - * @regs: regsiters at time of interrupt - * @address: storage address - * @error_code: Fault code, usually the DSISR or ESR depending on - * processor type - * - * Return 0 if we are able to resolve the data storage fault that - * results from a CT miss in the ACOP register. - */ -int acop_handle_fault(struct pt_regs *regs, unsigned long address, - unsigned long error_code) -{ - int ct; - u32 inst = 0; - - if (!cpu_has_feature(CPU_FTR_ICSWX)) { - pr_info("No coprocessors available"); - _exception(SIGILL, regs, ILL_ILLOPN, address); - } - - if (!user_mode(regs)) { - /* this could happen if the HV denies the - * kernel access, for now we just die */ - die("ICSWX from kernel failed", regs, SIGSEGV); - } - - /* Some implementations leave us a hint for the CT */ - ct = ICSWX_GET_CT_HINT(error_code); - if (ct < 0) { - /* we have to peek at the instruction word to figure out CT */ - u32 ccw; - u32 rs; - - inst = acop_get_inst(regs); - if (inst == 0) - return -1; - - rs = (inst >> (31 - 10)) & 0x1f; - ccw = regs->gpr[rs]; - ct = (ccw >> 16) & 0x3f; - } - - /* - * We could be here because another thread has enabled acop - * but the ACOP register has yet to be updated. - * - * This should have been taken care of by the IPI to sync all - * the threads (see smp_call_function(sync_cop, mm, 1)), but - * that could take forever if there are a significant amount - * of threads. - * - * Given the number of threads on some of these systems, - * perhaps this is the best way to sync ACOP rather than whack - * every thread with an IPI. - */ - if ((acop_copro_type_bit(ct) & current->active_mm->context.acop) != 0) { - sync_cop(current->active_mm); - return 0; - } - - /* check for alternate policy */ - if (!acop_use_cop(ct)) - return 0; - - /* at this point the CT is unknown to the system */ - pr_warn("%s[%d]: Coprocessor %d is unavailable\n", - current->comm, current->pid, ct); - - /* get inst if we don't already have it */ - if (inst == 0) { - inst = acop_get_inst(regs); - if (inst == 0) - return -1; - } - - /* Check if the instruction is the "record form" */ - if (inst & 1) { - /* - * the instruction is "record" form so we can reject - * using CR0 - */ - regs->ccr &= ~(0xful << 28); - regs->ccr |= ICSWX_RC_NOT_FOUND << 28; - - /* Move on to the next instruction */ - regs->nip += 4; - } else { - /* - * There is no architected mechanism to report a bad - * CT so we could either SIGILL or report nothing. - * Since the non-record version should only bu used - * for "hints" or "don't care" we should probably do - * nothing. However, I could see how some people - * might want an SIGILL so it here if you want it. 
- */ -#ifdef CONFIG_PPC_ICSWX_USE_SIGILL - _exception(SIGILL, regs, ILL_ILLOPN, address); -#else - regs->nip += 4; -#endif - } - - return 0; -} -EXPORT_SYMBOL_GPL(acop_handle_fault); diff --git a/arch/powerpc/mm/icswx.h b/arch/powerpc/mm/icswx.h deleted file mode 100644 index 6dedc08e62c8..000000000000 --- a/arch/powerpc/mm/icswx.h +++ /dev/null @@ -1,68 +0,0 @@ -#ifndef _ARCH_POWERPC_MM_ICSWX_H_ -#define _ARCH_POWERPC_MM_ICSWX_H_ - -/* - * ICSWX and ACOP Management - * - * Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include <asm/mmu_context.h> - -/* also used to denote that PIDs are not used */ -#define COP_PID_NONE 0 - -static inline void sync_cop(void *arg) -{ - struct mm_struct *mm = arg; - - if (mm == current->active_mm) - switch_cop(current->active_mm); -} - -#ifdef CONFIG_PPC_ICSWX_PID -extern int get_cop_pid(struct mm_struct *mm); -extern int disable_cop_pid(struct mm_struct *mm); -extern void free_cop_pid(int free_pid); -#else -#define get_cop_pid(m) (COP_PID_NONE) -#define disable_cop_pid(m) (COP_PID_NONE) -#define free_cop_pid(p) -#endif - -/* - * These are implementation bits for architected registers. If this - * ever becomes architecture the should be moved to reg.h et. al. - */ -/* UCT is the same bit for Server and Embedded */ -#define ICSWX_DSI_UCT 0x00004000 /* Unavailable Coprocessor Type */ - -#ifdef CONFIG_PPC_BOOK3E -/* Embedded implementation gives us no hints as to what the CT is */ -#define ICSWX_GET_CT_HINT(x) (-1) -#else -/* Server implementation contains the CT value in the DSISR */ -#define ICSWX_DSISR_CTMASK 0x00003f00 -#define ICSWX_GET_CT_HINT(x) (((x) & ICSWX_DSISR_CTMASK) >> 8) -#endif - -#define ICSWX_RC_STARTED 0x8 /* The request has been started */ -#define ICSWX_RC_NOT_IDLE 0x4 /* No coprocessor found idle */ -#define ICSWX_RC_NOT_FOUND 0x2 /* No coprocessor found */ -#define ICSWX_RC_UNDEFINED 0x1 /* Reserved */ - -extern int acop_handle_fault(struct pt_regs *regs, unsigned long address, - unsigned long error_code); - -static inline u64 acop_copro_type_bit(unsigned int type) -{ - return 1ULL << (63 - type); -} - -#endif /* !_ARCH_POWERPC_MM_ICSWX_H_ */ diff --git a/arch/powerpc/mm/icswx_pid.c b/arch/powerpc/mm/icswx_pid.c deleted file mode 100644 index 91e30eb7d054..000000000000 --- a/arch/powerpc/mm/icswx_pid.c +++ /dev/null @@ -1,87 +0,0 @@ -/* - * ICSWX and ACOP/PID Management - * - * Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- * - */ - -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/types.h> -#include <linux/mm.h> -#include <linux/spinlock.h> -#include <linux/idr.h> -#include <linux/module.h> -#include "icswx.h" - -#define COP_PID_MIN (COP_PID_NONE + 1) -#define COP_PID_MAX (0xFFFF) - -static DEFINE_SPINLOCK(mmu_context_acop_lock); -static DEFINE_IDA(cop_ida); - -static int new_cop_pid(struct ida *ida, int min_id, int max_id, - spinlock_t *lock) -{ - int index; - int err; - -again: - if (!ida_pre_get(ida, GFP_KERNEL)) - return -ENOMEM; - - spin_lock(lock); - err = ida_get_new_above(ida, min_id, &index); - spin_unlock(lock); - - if (err == -EAGAIN) - goto again; - else if (err) - return err; - - if (index > max_id) { - spin_lock(lock); - ida_remove(ida, index); - spin_unlock(lock); - return -ENOMEM; - } - - return index; -} - -int get_cop_pid(struct mm_struct *mm) -{ - int pid; - - if (mm->context.cop_pid == COP_PID_NONE) { - pid = new_cop_pid(&cop_ida, COP_PID_MIN, COP_PID_MAX, - &mmu_context_acop_lock); - if (pid >= 0) - mm->context.cop_pid = pid; - } - return mm->context.cop_pid; -} - -int disable_cop_pid(struct mm_struct *mm) -{ - int free_pid = COP_PID_NONE; - - if ((!mm->context.acop) && (mm->context.cop_pid != COP_PID_NONE)) { - free_pid = mm->context.cop_pid; - mm->context.cop_pid = COP_PID_NONE; - } - return free_pid; -} - -void free_cop_pid(int free_pid) -{ - spin_lock(&mmu_context_acop_lock); - ida_remove(&cop_ida, free_pid); - spin_unlock(&mmu_context_acop_lock); -} diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c new file mode 100644 index 000000000000..745097554bea --- /dev/null +++ b/arch/powerpc/mm/init-common.c @@ -0,0 +1,167 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * PowerPC version + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * Dave Engebretsen <engebret@us.ibm.com> + * Rework for PPC64 port. 
+ */ + +#undef DEBUG + +#include <linux/string.h> +#include <linux/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/kup.h> +#include <asm/smp.h> + +phys_addr_t memstart_addr __ro_after_init = (phys_addr_t)~0ull; +EXPORT_SYMBOL_GPL(memstart_addr); +phys_addr_t kernstart_addr __ro_after_init; +EXPORT_SYMBOL_GPL(kernstart_addr); +unsigned long kernstart_virt_addr __ro_after_init = KERNELBASE; +EXPORT_SYMBOL_GPL(kernstart_virt_addr); + +bool disable_kuep = !IS_ENABLED(CONFIG_PPC_KUEP); +bool disable_kuap = !IS_ENABLED(CONFIG_PPC_KUAP); +#ifdef CONFIG_KFENCE +bool __ro_after_init kfence_disabled; +bool __ro_after_init kfence_early_init = !!CONFIG_KFENCE_SAMPLE_INTERVAL; +#endif + +static int __init parse_nosmep(char *p) +{ + if (!IS_ENABLED(CONFIG_PPC_BOOK3S_64)) + return 0; + + disable_kuep = true; + pr_warn("Disabling Kernel Userspace Execution Prevention\n"); + return 0; +} +early_param("nosmep", parse_nosmep); + +static int __init parse_nosmap(char *p) +{ + disable_kuap = true; + pr_warn("Disabling Kernel Userspace Access Protection\n"); + return 0; +} +early_param("nosmap", parse_nosmap); + +void __weak setup_kuep(bool disabled) +{ + if (!IS_ENABLED(CONFIG_PPC_KUEP) || disabled) + return; + + if (smp_processor_id() != boot_cpuid) + return; + + pr_info("Activating Kernel Userspace Execution Prevention\n"); +} + +void setup_kup(void) +{ + setup_kuap(disable_kuap); + setup_kuep(disable_kuep); +} + +#define CTOR(shift) static void ctor_##shift(void *addr) \ +{ \ + memset(addr, 0, sizeof(pgd_t) << (shift)); \ +} + +CTOR(0); CTOR(1); CTOR(2); CTOR(3); CTOR(4); CTOR(5); CTOR(6); CTOR(7); +CTOR(8); CTOR(9); CTOR(10); CTOR(11); CTOR(12); CTOR(13); CTOR(14); CTOR(15); + +static inline void (*ctor(int shift))(void *) +{ + BUILD_BUG_ON(MAX_PGTABLE_INDEX_SIZE != 15); + + switch (shift) { + case 0: return ctor_0; + case 1: return ctor_1; + case 2: return ctor_2; + case 3: return ctor_3; + case 4: return ctor_4; + case 5: return ctor_5; + case 6: return ctor_6; + case 7: return ctor_7; + case 8: return ctor_8; + case 9: return ctor_9; + case 10: return ctor_10; + case 11: return ctor_11; + case 12: return ctor_12; + case 13: return ctor_13; + case 14: return ctor_14; + case 15: return ctor_15; + } + return NULL; +} + +struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE + 1]; +EXPORT_SYMBOL_GPL(pgtable_cache); /* used by kvm_hv module */ + +/* + * Create a kmem_cache() for pagetables. This is not used for PTE + * pages - they're linked to struct page, come from the normal free + * pages pool and have a different entry size (see real_pte_t) to + * everything else. Caches created by this function are used for all + * the higher level pagetables, and for hugepage pagetables. + */ +void pgtable_cache_add(unsigned int shift) +{ + char *name; + unsigned long table_size = sizeof(pgd_t) << shift; + unsigned long align = table_size; + + /* When batching pgtable pointers for RCU freeing, we store + * the index size in the low bits. Table alignment must be + * big enough to fit it. + */ + unsigned long minalign = MAX_PGTABLE_INDEX_SIZE + 1; + struct kmem_cache *new = NULL; + + /* It would be nice if this was a BUILD_BUG_ON(), but at the + * moment, gcc doesn't seem to recognize is_power_of_2 as a + * constant expression, so so much for that. 
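/*
 * A standalone reduction of the CTOR()/ctor() pattern that init-common.c
 * above introduces: the macro stamps out one constructor per table order
 * at compile time, and ctor() maps a runtime shift to the matching
 * function pointer so a slab cache can be created with the table size
 * baked into its constructor.  ENTRY_SIZE is an assumed stand-in for
 * sizeof(pgd_t), and only four orders are generated here for brevity.
 */
#include <stdio.h>
#include <string.h>

#define ENTRY_SIZE 8UL			/* assumed sizeof(pgd_t) */

#define CTOR(shift) static void ctor_##shift(void *addr) \
{ \
	memset(addr, 0, ENTRY_SIZE << (shift)); \
}

CTOR(0) CTOR(1) CTOR(2) CTOR(3)

static void (*ctor(int shift))(void *)
{
	switch (shift) {
	case 0: return ctor_0;
	case 1: return ctor_1;
	case 2: return ctor_2;
	case 3: return ctor_3;
	}
	return NULL;
}

int main(void)
{
	unsigned char table[ENTRY_SIZE << 3] = { 0xff };
	void (*init)(void *) = ctor(3);

	init(table);			/* zeroes ENTRY_SIZE << 3 bytes */
	printf("first byte after ctor: %#x\n", table[0]);
	return 0;
}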
*/ + BUG_ON(!is_power_of_2(minalign)); + BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); + + if (PGT_CACHE(shift)) + return; /* Already have a cache of this size */ + + align = max_t(unsigned long, align, minalign); + name = kasprintf(GFP_KERNEL, "pgtable-2^%d", shift); + if (name) + new = kmem_cache_create(name, table_size, align, 0, ctor(shift)); + if (!new) + panic("Could not allocate pgtable cache for order %d", shift); + + kfree(name); + pgtable_cache[shift] = new; + + pr_debug("Allocated pgtable cache for order %d\n", shift); +} +EXPORT_SYMBOL_GPL(pgtable_cache_add); /* used by kvm_hv module */ + +void pgtable_cache_init(void) +{ + pgtable_cache_add(PGD_INDEX_SIZE); + + if (PMD_CACHE_INDEX) + pgtable_cache_add(PMD_CACHE_INDEX); + /* + * In all current configs, when the PUD index exists it's the + * same size as either the pgd or pmd index except with THP enabled + * on book3s 64 + */ + if (PUD_CACHE_INDEX) + pgtable_cache_add(PUD_CACHE_INDEX); +} diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index 01e2db97a210..4e71dfe7d026 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * PowerPC version * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) @@ -9,12 +10,6 @@ * * Derived from "arch/i386/mm/init.c" * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * */ #include <linux/module.h> @@ -26,7 +21,6 @@ #include <linux/mm.h> #include <linux/stddef.h> #include <linux/init.h> -#include <linux/bootmem.h> #include <linux/highmem.h> #include <linux/initrd.h> #include <linux/pagemap.h> @@ -35,10 +29,7 @@ #include <linux/slab.h> #include <linux/hugetlb.h> -#include <asm/pgalloc.h> -#include <asm/prom.h> #include <asm/io.h> -#include <asm/pgtable.h> #include <asm/mmu.h> #include <asm/smp.h> #include <asm/machdep.h> @@ -46,13 +37,16 @@ #include <asm/tlb.h> #include <asm/sections.h> #include <asm/hugetlb.h> +#include <asm/kup.h> +#include <asm/kasan.h> +#include <asm/fixmap.h> -#include "mmu_decl.h" +#include <mm/mmu_decl.h> #if defined(CONFIG_KERNEL_START_BOOL) || defined(CONFIG_LOWMEM_SIZE_BOOL) /* The amount of lowmem must be within 0xF0000000 - KERNELBASE. */ #if (CONFIG_LOWMEM_SIZE > (0xF0000000 - PAGE_OFFSET)) -#error "You must adjust CONFIG_LOWMEM_SIZE or CONFIG_START_KERNEL" +#error "You must adjust CONFIG_LOWMEM_SIZE or CONFIG_KERNEL_START" #endif #endif #define MAX_LOW_MEM CONFIG_LOWMEM_SIZE @@ -60,12 +54,7 @@ phys_addr_t total_memory; phys_addr_t total_lowmem; -phys_addr_t memstart_addr = (phys_addr_t)~0ull; -EXPORT_SYMBOL(memstart_addr); -phys_addr_t kernstart_addr; -EXPORT_SYMBOL(kernstart_addr); - -#ifdef CONFIG_RELOCATABLE_PPC32 +#ifdef CONFIG_RELOCATABLE /* Used in __va()/__pa() */ long long virt_phys_offset; EXPORT_SYMBOL(virt_phys_offset); @@ -81,45 +70,10 @@ EXPORT_SYMBOL(agp_special_page); void MMU_init(void); -/* XXX should be in current.h -- paulus */ -extern struct task_struct *current_set[NR_CPUS]; - -/* - * this tells the system to map all of ram with the segregs - * (i.e. page tables) instead of the bats. - * -- Cort - */ -int __map_without_bats; -int __map_without_ltlbs; - -/* - * This tells the system to allow ioremapping memory marked as reserved. 
- */ -int __allow_ioremap_reserved; - /* max amount of low RAM to map in */ unsigned long __max_low_memory = MAX_LOW_MEM; /* - * Check for command-line options that affect what MMU_init will do. - */ -void MMU_setup(void) -{ - /* Check for nobats option (used in mapin_ram). */ - if (strstr(cmd_line, "nobats")) { - __map_without_bats = 1; - } - - if (strstr(cmd_line, "noltlbs")) { - __map_without_ltlbs = 1; - } -#ifdef CONFIG_DEBUG_PAGEALLOC - __map_without_bats = 1; - __map_without_ltlbs = 1; -#endif -} - -/* * MMU_init sets up the basic memory mappings for the kernel, * including both RAM and possibly some I/O regions, * and sets up the page tables and the MMU hardware ready to go. @@ -129,33 +83,15 @@ void __init MMU_init(void) if (ppc_md.progress) ppc_md.progress("MMU:enter", 0x111); - /* parse args from command line */ - MMU_setup(); - - /* - * Reserve gigantic pages for hugetlb. This MUST occur before - * lowmem_end_addr is initialized below. - */ - reserve_hugetlb_gpages(); - - if (memblock.memory.cnt > 1) { -#ifndef CONFIG_WII - memblock_enforce_memory_limit(memblock.memory.regions[0].size); - printk(KERN_WARNING "Only using first contiguous memory region"); -#else - wii_memory_fixups(); -#endif - } - total_lowmem = total_memory = memblock_end_of_DRAM() - memstart_addr; lowmem_end_addr = memstart_addr + total_lowmem; -#ifdef CONFIG_FSL_BOOKE +#ifdef CONFIG_PPC_85xx /* Freescale Book-E parts expect lowmem to be mapped by fixed TLB * entries, so we need to adjust lowmem to match the amount we can map * in the fixed entries */ adjust_total_lowmem(); -#endif /* CONFIG_FSL_BOOKE */ +#endif /* CONFIG_PPC_85xx */ if (total_lowmem > __max_low_memory) { total_lowmem = __max_low_memory; @@ -179,10 +115,6 @@ void __init MMU_init(void) /* Initialize early top-down ioremap allocator */ ioremap_bot = IOREMAP_TOP; - /* Map in I/O resources */ - if (ppc_md.progress) - ppc_md.progress("MMU:setio", 0x302); - if (ppc_md.progress) ppc_md.progress("MMU:exit", 0x211); @@ -191,29 +123,12 @@ void __init MMU_init(void) btext_unmap(); #endif - /* Shortly after that, the entire linear mapping will be available */ - memblock_set_current_limit(lowmem_end_addr); -} + kasan_mmu_init(); -/* This is only called until mem_init is done. */ -void __init *early_get_page(void) -{ - if (init_bootmem_done) - return alloc_bootmem_pages(PAGE_SIZE); - else - return __va(memblock_alloc(PAGE_SIZE, PAGE_SIZE)); -} + setup_kup(); -#ifdef CONFIG_8xx /* No 8xx specific .c file to put that in ... */ -void setup_initial_memory_limit(phys_addr_t first_memblock_base, - phys_addr_t first_memblock_size) -{ - /* We don't currently support the first MEMBLOCK not mapping 0 - * physical on those processors - */ - BUG_ON(first_memblock_base != 0); + update_mmu_feature_fixups(MMU_FTR_KUAP); - /* 8xx can only access 8MB at the moment */ - memblock_set_current_limit(min_t(u64, first_memblock_size, 0x00800000)); + /* Shortly after that, the entire linear mapping will be available */ + memblock_set_current_limit(lowmem_end_addr); } -#endif /* CONFIG_8xx */ diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index d0cd9e4c6837..b6f3ae03ca9e 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * PowerPC version * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) @@ -11,12 +12,6 @@ * * Dave Engebretsen <engebret@us.ibm.com> * Rework for PPC64 port. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * */ #undef DEBUG @@ -34,7 +29,6 @@ #include <linux/vmalloc.h> #include <linux/init.h> #include <linux/delay.h> -#include <linux/bootmem.h> #include <linux/highmem.h> #include <linux/idr.h> #include <linux/nodemask.h> @@ -43,6 +37,11 @@ #include <linux/memblock.h> #include <linux/hugetlb.h> #include <linux/slab.h> +#include <linux/of_fdt.h> +#include <linux/libfdt.h> +#include <linux/memremap.h> +#include <linux/memory.h> +#include <linux/bootmem_info.h> #include <asm/pgalloc.h> #include <asm/page.h> @@ -50,9 +49,8 @@ #include <asm/rtas.h> #include <asm/io.h> #include <asm/mmu_context.h> -#include <asm/pgtable.h> #include <asm/mmu.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/smp.h> #include <asm/machdep.h> #include <asm/tlb.h> @@ -63,177 +61,97 @@ #include <asm/sections.h> #include <asm/iommu.h> #include <asm/vdso.h> +#include <asm/hugetlb.h> -#include "mmu_decl.h" - -#ifdef CONFIG_PPC_STD_MMU_64 -#if PGTABLE_RANGE > USER_VSID_RANGE -#warning Limited user VSID range means pagetable space is wasted -#endif - -#if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE) -#warning TASK_SIZE is smaller than it needs to be. -#endif -#endif /* CONFIG_PPC_STD_MMU_64 */ - -phys_addr_t memstart_addr = ~0; -EXPORT_SYMBOL_GPL(memstart_addr); -phys_addr_t kernstart_addr; -EXPORT_SYMBOL_GPL(kernstart_addr); - -static void pgd_ctor(void *addr) -{ - memset(addr, 0, PGD_TABLE_SIZE); -} - -static void pmd_ctor(void *addr) -{ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - memset(addr, 0, PMD_TABLE_SIZE * 2); -#else - memset(addr, 0, PMD_TABLE_SIZE); -#endif -} - -struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE]; - -/* - * Create a kmem_cache() for pagetables. This is not used for PTE - * pages - they're linked to struct page, come from the normal free - * pages pool and have a different entry size (see real_pte_t) to - * everything else. Caches created by this function are used for all - * the higher level pagetables, and for hugepage pagetables. - */ -void pgtable_cache_add(unsigned shift, void (*ctor)(void *)) -{ - char *name; - unsigned long table_size = sizeof(void *) << shift; - unsigned long align = table_size; - - /* When batching pgtable pointers for RCU freeing, we store - * the index size in the low bits. Table alignment must be - * big enough to fit it. - * - * Likewise, hugeapge pagetable pointers contain a (different) - * shift value in the low bits. All tables must be aligned so - * as to leave enough 0 bits in the address to contain it. */ - unsigned long minalign = max(MAX_PGTABLE_INDEX_SIZE + 1, - HUGEPD_SHIFT_MASK + 1); - struct kmem_cache *new; - - /* It would be nice if this was a BUILD_BUG_ON(), but at the - * moment, gcc doesn't seem to recognize is_power_of_2 as a - * constant expression, so so much for that. 
*/ - BUG_ON(!is_power_of_2(minalign)); - BUG_ON((shift < 1) || (shift > MAX_PGTABLE_INDEX_SIZE)); - - if (PGT_CACHE(shift)) - return; /* Already have a cache of this size */ - - align = max_t(unsigned long, align, minalign); - name = kasprintf(GFP_KERNEL, "pgtable-2^%d", shift); - new = kmem_cache_create(name, table_size, align, 0, ctor); - pgtable_cache[shift - 1] = new; - pr_debug("Allocated pgtable cache for order %d\n", shift); -} - - -void pgtable_cache_init(void) -{ - pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor); - pgtable_cache_add(PMD_CACHE_INDEX, pmd_ctor); - if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_CACHE_INDEX)) - panic("Couldn't allocate pgtable caches"); - /* In all current configs, when the PUD index exists it's the - * same size as either the pgd or pmd index. Verify that the - * initialization above has also created a PUD cache. This - * will need re-examiniation if we add new possibilities for - * the pagetable layout. */ - BUG_ON(PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE)); -} +#include <mm/mmu_decl.h> #ifdef CONFIG_SPARSEMEM_VMEMMAP /* - * Given an address within the vmemmap, determine the pfn of the page that - * represents the start of the section it is within. Note that we have to + * Given an address within the vmemmap, determine the page that + * represents the start of the subsection it is within. Note that we have to * do this by hand as the proffered address may not be correctly aligned. * Subtraction of non-aligned pointers produces undefined results. */ -static unsigned long __meminit vmemmap_section_start(unsigned long page) +static struct page * __meminit vmemmap_subsection_start(unsigned long vmemmap_addr) { - unsigned long offset = page - ((unsigned long)(vmemmap)); + unsigned long start_pfn; + unsigned long offset = vmemmap_addr - ((unsigned long)(vmemmap)); /* Return the pfn of the start of the section. */ - return (offset / sizeof(struct page)) & PAGE_SECTION_MASK; + start_pfn = (offset / sizeof(struct page)) & PAGE_SUBSECTION_MASK; + return pfn_to_page(start_pfn); } /* - * Check if this vmemmap page is already initialised. If any section - * which overlaps this vmemmap page is initialised then this page is - * initialised already. + * Since memory is added in sub-section chunks, before creating a new vmemmap + * mapping, the kernel should check whether there is an existing memmap mapping + * covering the new subsection added. This is needed because kernel can map + * vmemmap area using 16MB pages which will cover a memory range of 16G. Such + * a range covers multiple subsections (2M) + * + * If any subsection in the 16G range mapped by vmemmap is valid we consider the + * vmemmap populated (There is a page table entry already present). We can't do + * a page table lookup here because with the hash translation we don't keep + * vmemmap details in linux page table. */ -static int __meminit vmemmap_populated(unsigned long start, int page_size) +int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size) { - unsigned long end = start + page_size; - - for (; start < end; start += (PAGES_PER_SECTION * sizeof(struct page))) - if (pfn_valid(vmemmap_section_start(start))) + struct page *start; + unsigned long vmemmap_end = vmemmap_addr + vmemmap_map_size; + start = vmemmap_subsection_start(vmemmap_addr); + + for (; (unsigned long)start < vmemmap_end; start += PAGES_PER_SUBSECTION) + /* + * pfn valid check here is intended to really check + * whether we have any subsection already initialized + * in this range. 
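+ * (Illustrative numbers, not part of this patch: with 64K base pages
+ * and a 64-byte struct page, one 16MB vmemmap page holds
+ * 16M / 64 = 256K struct pages, which describe 256K * 64K = 16G of
+ * memory, hence the 16G figure above.)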
+ */ + if (pfn_valid(page_to_pfn(start))) return 1; return 0; } -/* On hash-based CPUs, the vmemmap is bolted in the hash table. - * - * On Book3E CPUs, the vmemmap is currently mapped in the top half of - * the vmalloc space using normal page tables, though the size of - * pages encoded in the PTEs can be different +/* + * vmemmap virtual address space management does not have a traditional page + * table to track which virtual struct pages are backed by physical mapping. + * The virtual to physical mappings are tracked in a simple linked list + * format. 'vmemmap_list' maintains the entire vmemmap physical mapping at + * all times, whereas the 'next' list maintains the available + * vmemmap_backing structures which have been deleted from + * 'vmemmap_list' during system runtime (memory hotplug remove + * operation). The freed 'vmemmap_backing' structures are reused later when + * new requests come in without allocating fresh memory. This pointer also + * tracks the allocated 'vmemmap_backing' structures as we allocate one + * full page of memory at a time when we don't have any. */ - -#ifdef CONFIG_PPC_BOOK3E -static void __meminit vmemmap_create_mapping(unsigned long start, - unsigned long page_size, - unsigned long phys) -{ - /* Create a PTE encoding without page size */ - unsigned long i, flags = _PAGE_PRESENT | _PAGE_ACCESSED | - _PAGE_KERNEL_RW; - - /* PTEs only contain page size encodings up to 32M */ - BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf); - - /* Encode the size in the PTE */ - flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8; - - /* For each PTE for that area, map things. Note that we don't - * increment phys because all PTEs are of the large size and - * thus must have the low bits clear - */ - for (i = 0; i < page_size; i += PAGE_SIZE) - BUG_ON(map_kernel_page(start + i, phys, flags)); -} -#else /* CONFIG_PPC_BOOK3E */ -static void __meminit vmemmap_create_mapping(unsigned long start, - unsigned long page_size, - unsigned long phys) -{ - int mapped = htab_bolt_mapping(start, start + page_size, phys, - pgprot_val(PAGE_KERNEL), - mmu_vmemmap_psize, - mmu_kernel_ssize); - BUG_ON(mapped < 0); -} -#endif /* CONFIG_PPC_BOOK3E */ - struct vmemmap_backing *vmemmap_list; +static struct vmemmap_backing *next; + +/* + * The same pointer 'next' tracks individual chunks inside the allocated + * full page during boot and then tracks the freed nodes during + * runtime. This is racy, but the two uses never overlap because they are + * separated by the boot process. It would create a problem if a memory + * hotplug operation could somehow happen during boot!
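+ * (Concretely, vmemmap_list_alloc() below carves vmemmap_backing
+ * entries out of a single page while num_left is non-zero; once
+ * vmemmap_list_free() has returned entries, num_freed lets those be
+ * recycled before any new page is allocated.)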
+ */ +static int num_left; +static int num_freed; static __meminit struct vmemmap_backing * vmemmap_list_alloc(int node) { - static struct vmemmap_backing *next; - static int num_left; + struct vmemmap_backing *vmem_back; + /* get from freed entries first */ + if (num_freed) { + num_freed--; + vmem_back = next; + next = next->list; + + return vmem_back; + } /* allocate a page when required and hand out chunks */ - if (!next || !num_left) { + if (!num_left) { next = vmemmap_alloc_block(PAGE_SIZE, node); if (unlikely(!next)) { WARN_ON(1); @@ -247,16 +165,16 @@ static __meminit struct vmemmap_backing * vmemmap_list_alloc(int node) return next++; } -static __meminit void vmemmap_list_populate(unsigned long phys, - unsigned long start, - int node) +static __meminit int vmemmap_list_populate(unsigned long phys, + unsigned long start, + int node) { struct vmemmap_backing *vmem_back; vmem_back = vmemmap_list_alloc(node); if (unlikely(!vmem_back)) { - WARN_ON(1); - return; + pr_debug("vmemmap list allocation failed\n"); + return -ENOMEM; } vmem_back->phys = phys; @@ -264,41 +182,499 @@ static __meminit void vmemmap_list_populate(unsigned long phys, vmem_back->list = vmemmap_list; vmemmap_list = vmem_back; + return 0; } -int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) +bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start, + unsigned long page_size) { + unsigned long nr_pfn = page_size / sizeof(struct page); + unsigned long start_pfn = page_to_pfn((struct page *)start); + + if ((start_pfn + nr_pfn - 1) > altmap->end_pfn) + return true; + + if (start_pfn < altmap->base_pfn) + return true; + + return false; +} + +static int __meminit __vmemmap_populate(unsigned long start, unsigned long end, int node, + struct vmem_altmap *altmap) +{ + bool altmap_alloc; unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; /* Align to the page size of the linear mapping. */ - start = _ALIGN_DOWN(start, page_size); + start = ALIGN_DOWN(start, page_size); pr_debug("vmemmap_populate %lx..%lx, node %d\n", start, end, node); for (; start < end; start += page_size) { - void *p; - + void *p = NULL; + int rc; + + /* + * This vmemmap range is backing different subsections. If any + * of those subsections is marked valid, that means we already + * have initialized a page table covering this range and hence + * the vmemmap range is populated. + */ if (vmemmap_populated(start, page_size)) continue; - p = vmemmap_alloc_block(page_size, node); + /* + * Allocate from the altmap first if we have one. This may + * fail due to alignment issues when using 16MB hugepages, so + * fall back to system memory if the altmap allocation fails. + */ + if (altmap && !altmap_cross_boundary(altmap, start, page_size)) { + p = vmemmap_alloc_block_buf(page_size, node, altmap); + if (!p) + pr_debug("altmap block allocation failed, falling back to system memory"); + else + altmap_alloc = true; + } + if (!p) { + p = vmemmap_alloc_block_buf(page_size, node, NULL); + altmap_alloc = false; + } if (!p) return -ENOMEM; - vmemmap_list_populate(__pa(p), start, node); + if (vmemmap_list_populate(__pa(p), start, node)) { + /* + * If we don't populate the vmemmap list, we don't have + * the ability to free the allocated vmemmap + * pages in section_deactivate. Hence free them + * here.
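+ * (The error path below mirrors the allocation path: pages obtained
+ * from the altmap are returned with vmem_altmap_free(), anything else
+ * goes back to the page allocator via free_pages().)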
+ */ + int nr_pfns = page_size >> PAGE_SHIFT; + unsigned long page_order = get_order(page_size); + + if (altmap_alloc) + vmem_altmap_free(altmap, nr_pfns); + else + free_pages((unsigned long)p, page_order); + return -ENOMEM; + } pr_debug(" * %016lx..%016lx allocated at %p\n", start, start + page_size, p); - vmemmap_create_mapping(start, page_size, __pa(p)); + rc = vmemmap_create_mapping(start, page_size, __pa(p)); + if (rc < 0) { + pr_warn("%s: Unable to create vmemmap mapping: %d\n", + __func__, rc); + return -EFAULT; + } } return 0; } -void vmemmap_free(unsigned long start, unsigned long end) +int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, + struct vmem_altmap *altmap) +{ + +#ifdef CONFIG_PPC_BOOK3S_64 + if (radix_enabled()) + return radix__vmemmap_populate(start, end, node, altmap); +#endif + + return __vmemmap_populate(start, end, node, altmap); +} + +#ifdef CONFIG_MEMORY_HOTPLUG +static unsigned long vmemmap_list_free(unsigned long start) +{ + struct vmemmap_backing *vmem_back, *vmem_back_prev; + + vmem_back_prev = vmem_back = vmemmap_list; + + /* look for it with prev pointer recorded */ + for (; vmem_back; vmem_back = vmem_back->list) { + if (vmem_back->virt_addr == start) + break; + vmem_back_prev = vmem_back; + } + + if (unlikely(!vmem_back)) + return 0; + + /* remove it from vmemmap_list */ + if (vmem_back == vmemmap_list) /* remove head */ + vmemmap_list = vmem_back->list; + else + vmem_back_prev->list = vmem_back->list; + + /* next point to this freed entry */ + vmem_back->list = next; + next = vmem_back; + num_freed++; + + return vmem_back->phys; +} + +static void __ref __vmemmap_free(unsigned long start, unsigned long end, + struct vmem_altmap *altmap) +{ + unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; + unsigned long page_order = get_order(page_size); + unsigned long alt_start = ~0, alt_end = ~0; + unsigned long base_pfn; + + start = ALIGN_DOWN(start, page_size); + if (altmap) { + alt_start = altmap->base_pfn; + alt_end = altmap->base_pfn + altmap->reserve + altmap->free; + } + + pr_debug("vmemmap_free %lx...%lx\n", start, end); + + for (; start < end; start += page_size) { + unsigned long nr_pages, addr; + struct page *page; + + /* + * We have already marked the subsection we are trying to remove + * invalid. So if we want to remove the vmemmap range, we + * need to make sure there is no subsection marked valid + * in this range. 
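+ * (This is the mirror image of the populate-time check: if
+ * vmemmap_populated() is still true here, some live subsection still
+ * shares this vmemmap page, so the mapping and its backing page must
+ * be kept.)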
+ */ + if (vmemmap_populated(start, page_size)) + continue; + + addr = vmemmap_list_free(start); + if (!addr) + continue; + + page = pfn_to_page(addr >> PAGE_SHIFT); + nr_pages = 1 << page_order; + base_pfn = PHYS_PFN(addr); + + if (base_pfn >= alt_start && base_pfn < alt_end) { + vmem_altmap_free(altmap, nr_pages); + } else if (PageReserved(page)) { + /* allocated from bootmem */ + if (page_size < PAGE_SIZE) { + /* + * this shouldn't happen, but if it is + * the case, leave the memory there + */ + WARN_ON_ONCE(1); + } else { + while (nr_pages--) + free_reserved_page(page++); + } + } else { + free_pages((unsigned long)(__va(addr)), page_order); + } + + vmemmap_remove_mapping(start, page_size); + } +} + +void __ref vmemmap_free(unsigned long start, unsigned long end, + struct vmem_altmap *altmap) { +#ifdef CONFIG_PPC_BOOK3S_64 + if (radix_enabled()) + return radix__vmemmap_free(start, end, altmap); +#endif + return __vmemmap_free(start, end, altmap); } +#endif + +#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE +void register_page_bootmem_memmap(unsigned long section_nr, + struct page *start_page, unsigned long size) +{ +} +#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ + #endif /* CONFIG_SPARSEMEM_VMEMMAP */ +#ifdef CONFIG_PPC_BOOK3S_64 +unsigned int mmu_lpid_bits; +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +EXPORT_SYMBOL_GPL(mmu_lpid_bits); +#endif +unsigned int mmu_pid_bits; + +static bool disable_radix = !IS_ENABLED(CONFIG_PPC_RADIX_MMU_DEFAULT); + +static int __init parse_disable_radix(char *p) +{ + bool val; + + if (!p) + val = true; + else if (kstrtobool(p, &val)) + return -EINVAL; + + disable_radix = val; + + return 0; +} +early_param("disable_radix", parse_disable_radix); + +/* + * If we're running under a hypervisor, we need to check the contents of + * /chosen/ibm,architecture-vec-5 to see if the hypervisor is willing to do + * radix. If not, we clear the radix feature bit so we fall back to hash. 
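+ * (Background, for reference: /chosen/ibm,architecture-vec-5 records
+ * the option vector negotiated with the hypervisor at
+ * client-architecture-support (CAS) time; the byte at
+ * OV5_INDX(OV5_MMU_SUPPORT), tested below, encodes which translation
+ * modes the hypervisor is prepared to offer.)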
+ */ +static void __init early_check_vec5(void) +{ + unsigned long root, chosen; + int size; + const u8 *vec5; + u8 mmu_supported; + + root = of_get_flat_dt_root(); + chosen = of_get_flat_dt_subnode_by_name(root, "chosen"); + if (chosen == -FDT_ERR_NOTFOUND) { + cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX; + return; + } + vec5 = of_get_flat_dt_prop(chosen, "ibm,architecture-vec-5", &size); + if (!vec5) { + cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX; + return; + } + if (size <= OV5_INDX(OV5_MMU_SUPPORT)) { + cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX; + return; + } + + /* Check for supported configuration */ + mmu_supported = vec5[OV5_INDX(OV5_MMU_SUPPORT)] & + OV5_FEAT(OV5_MMU_SUPPORT); + if (mmu_supported == OV5_FEAT(OV5_MMU_RADIX)) { + /* Hypervisor only supports radix - check enabled && GTSE */ + if (!early_radix_enabled()) { + pr_warn("WARNING: Ignoring cmdline option disable_radix\n"); + } + if (!(vec5[OV5_INDX(OV5_RADIX_GTSE)] & + OV5_FEAT(OV5_RADIX_GTSE))) { + cur_cpu_spec->mmu_features &= ~MMU_FTR_GTSE; + } else + cur_cpu_spec->mmu_features |= MMU_FTR_GTSE; + /* Do radix anyway - the hypervisor said we had to */ + cur_cpu_spec->mmu_features |= MMU_FTR_TYPE_RADIX; + } else if (mmu_supported == OV5_FEAT(OV5_MMU_HASH)) { + /* Hypervisor only supports hash - disable radix */ + cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX; + cur_cpu_spec->mmu_features &= ~MMU_FTR_GTSE; + } +} + +static int __init dt_scan_mmu_pid_width(unsigned long node, + const char *uname, int depth, + void *data) +{ + int size = 0; + const __be32 *prop; + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + + /* We are scanning "cpu" nodes only */ + if (type == NULL || strcmp(type, "cpu") != 0) + return 0; + + /* Find MMU LPID, PID register size */ + prop = of_get_flat_dt_prop(node, "ibm,mmu-lpid-bits", &size); + if (prop && size == 4) + mmu_lpid_bits = be32_to_cpup(prop); + + prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size); + if (prop && size == 4) + mmu_pid_bits = be32_to_cpup(prop); + + if (!mmu_pid_bits && !mmu_lpid_bits) + return 0; + + return 1; +} + +/* + * Outside hotplug the kernel uses this value to map the kernel direct map + * with radix. To be compatible with older kernels, let's keep this value + * as 16M which is also SECTION_SIZE with SPARSEMEM. We can ideally map + * things with 1GB size in the case where we don't support hotplug. + */ +#ifndef CONFIG_MEMORY_HOTPLUG +#define DEFAULT_MEMORY_BLOCK_SIZE SZ_16M +#else +#define DEFAULT_MEMORY_BLOCK_SIZE MIN_MEMORY_BLOCK_SIZE +#endif + +static void update_memory_block_size(unsigned long *block_size, unsigned long mem_size) +{ + unsigned long min_memory_block_size = DEFAULT_MEMORY_BLOCK_SIZE; + + for (; *block_size > min_memory_block_size; *block_size >>= 2) { + if ((mem_size & *block_size) == 0) + break; + } +} + +static int __init probe_memory_block_size(unsigned long node, const char *uname, int + depth, void *data) +{ + const char *type; + unsigned long *block_size = (unsigned long *)data; + const __be32 *reg, *endp; + int l; + + if (depth != 1) + return 0; + /* + * If we have dynamic-reconfiguration-memory node, use the + * lmb value. 
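+ * (On such systems "ibm,lmb-size" is the granularity at which the
+ * hypervisor adds and removes memory, so it can be used directly
+ * rather than probed from the memory nodes.)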
+ */ + if (strcmp(uname, "ibm,dynamic-reconfiguration-memory") == 0) { + + const __be32 *prop; + + prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &l); + + if (!prop || l < dt_root_size_cells * sizeof(__be32)) + /* + * Nothing in the device tree + */ + *block_size = DEFAULT_MEMORY_BLOCK_SIZE; + else + *block_size = of_read_number(prop, dt_root_size_cells); + /* + * We have found the final value. Don't probe further. + */ + return 1; + } + /* + * Find all the device tree nodes of memory type and make sure + * the area can be mapped using the memory block size value + * we end up using. We start with 1G value and keep reducing + * it such that we can map the entire area using memory_block_size. + * This will be used on powernv and older pseries that don't + * have ibm,lmb-size node. + * For ex: with P5 we can end up with + * memory@0 -> 128MB + * memory@128M -> 64M + * This will end up using 64MB memory block size value. + */ + type = of_get_flat_dt_prop(node, "device_type", NULL); + if (type == NULL || strcmp(type, "memory") != 0) + return 0; + + reg = of_get_flat_dt_prop(node, "linux,usable-memory", &l); + if (!reg) + reg = of_get_flat_dt_prop(node, "reg", &l); + if (!reg) + return 0; + + endp = reg + (l / sizeof(__be32)); + while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) { + const char *compatible; + u64 size; + + dt_mem_next_cell(dt_root_addr_cells, ®); + size = dt_mem_next_cell(dt_root_size_cells, ®); + + if (size) { + update_memory_block_size(block_size, size); + continue; + } + /* + * ibm,coherent-device-memory with linux,usable-memory = 0 + * Force 256MiB block size. Work around for GPUs on P9 PowerNV + * linux,usable-memory == 0 implies driver managed memory and + * we can't use large memory block size due to hotplug/unplug + * limitations. + */ + compatible = of_get_flat_dt_prop(node, "compatible", NULL); + if (compatible && !strcmp(compatible, "ibm,coherent-device-memory")) { + if (*block_size > SZ_256M) + *block_size = SZ_256M; + /* + * We keep 256M as the upper limit with GPU present. + */ + return 0; + } + } + /* continue looking for other memory device types */ + return 0; +} + +/* + * start with 1G memory block size. Early init will + * fix this with correct value. + */ +unsigned long memory_block_size __ro_after_init = 1UL << 30; +static void __init early_init_memory_block_size(void) +{ + /* + * We need to do memory_block_size probe early so that + * radix__early_init_mmu() can use this as limit for + * mapping page size. + */ + of_scan_flat_dt(probe_memory_block_size, &memory_block_size); +} + +void __init mmu_early_init_devtree(void) +{ + bool hvmode = !!(mfmsr() & MSR_HV); + + /* Disable radix mode based on kernel command line. */ + if (disable_radix) { + if (IS_ENABLED(CONFIG_PPC_64S_HASH_MMU)) + cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX; + else + pr_warn("WARNING: Ignoring cmdline option disable_radix\n"); + } + + of_scan_flat_dt(dt_scan_mmu_pid_width, NULL); + if (hvmode && !mmu_lpid_bits) { + if (early_cpu_has_feature(CPU_FTR_ARCH_207S)) + mmu_lpid_bits = 12; /* POWER8-10 */ + else + mmu_lpid_bits = 10; /* POWER7 */ + } + if (!mmu_pid_bits) { + if (early_cpu_has_feature(CPU_FTR_ARCH_300)) + mmu_pid_bits = 20; /* POWER9-10 */ + } + + /* + * Check /chosen/ibm,architecture-vec-5 if running as a guest. + * When running bare-metal, we can use radix if we like + * even though the ibm,architecture-vec-5 property created by + * skiboot doesn't have the necessary bits set. 
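+ * (hvmode above comes from MSR_HV: if it is set we are the hypervisor
+ * ourselves, so there is no host to negotiate an MMU mode with.)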
+ */ + if (!hvmode) + early_check_vec5(); + + early_init_memory_block_size(); + + if (early_radix_enabled()) { + radix__early_init_devtree(); + + /* + * We have finalized the translation we are going to use by now. + * Radix mode is not limited by RMA / VRMA addressing. + * Hence don't limit memblock allocations. + */ + ppc64_rma_size = ULONG_MAX; + memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); + } else + hash__early_init_devtree(); + + if (IS_ENABLED(CONFIG_HUGETLB_PAGE_SIZE_VARIABLE)) + hugetlbpage_init_defaultsize(); + + if (!(cur_cpu_spec->mmu_features & MMU_FTR_HPTE_TABLE) && + !(cur_cpu_spec->mmu_features & MMU_FTR_TYPE_RADIX)) + panic("kernel does not support any MMU type offered by platform"); +} +#endif /* CONFIG_PPC_BOOK3S_64 */ diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c new file mode 100644 index 000000000000..4b4feba9873b --- /dev/null +++ b/arch/powerpc/mm/ioremap.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/io.h> +#include <linux/slab.h> +#include <linux/mmzone.h> +#include <linux/vmalloc.h> + +unsigned long ioremap_bot; +EXPORT_SYMBOL(ioremap_bot); + +void __iomem *ioremap(phys_addr_t addr, unsigned long size) +{ + pgprot_t prot = pgprot_noncached(PAGE_KERNEL); + void *caller = __builtin_return_address(0); + + return __ioremap_caller(addr, size, prot, caller); +} +EXPORT_SYMBOL(ioremap); + +void __iomem *ioremap_wc(phys_addr_t addr, unsigned long size) +{ + pgprot_t prot = pgprot_noncached_wc(PAGE_KERNEL); + void *caller = __builtin_return_address(0); + + return __ioremap_caller(addr, size, prot, caller); +} +EXPORT_SYMBOL(ioremap_wc); + +void __iomem *ioremap_coherent(phys_addr_t addr, unsigned long size) +{ + pgprot_t prot = pgprot_cached(PAGE_KERNEL); + void *caller = __builtin_return_address(0); + + return __ioremap_caller(addr, size, prot, caller); +} + +void __iomem *ioremap_prot(phys_addr_t addr, size_t size, pgprot_t prot) +{ + pte_t pte = __pte(pgprot_val(prot)); + void *caller = __builtin_return_address(0); + + /* writeable implies dirty for kernel addresses */ + if (pte_write(pte)) + pte = pte_mkdirty(pte); + + return __ioremap_caller(addr, size, pte_pgprot(pte), caller); +} +EXPORT_SYMBOL(ioremap_prot); + +int early_ioremap_range(unsigned long ea, phys_addr_t pa, + unsigned long size, pgprot_t prot) +{ + unsigned long i; + + for (i = 0; i < size; i += PAGE_SIZE) { + int err = map_kernel_page(ea + i, pa + i, pgprot_nx(prot)); + + if (WARN_ON_ONCE(err)) /* Should clean up */ + return err; + } + + return 0; +} diff --git a/arch/powerpc/mm/ioremap_32.c b/arch/powerpc/mm/ioremap_32.c new file mode 100644 index 000000000000..ca5bc6be3e6f --- /dev/null +++ b/arch/powerpc/mm/ioremap_32.c @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/io.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> + +#include <mm/mmu_decl.h> + +void __iomem *ioremap_wt(phys_addr_t addr, unsigned long size) +{ + pgprot_t prot = pgprot_cached_wthru(PAGE_KERNEL); + + return __ioremap_caller(addr, size, prot, __builtin_return_address(0)); +} +EXPORT_SYMBOL(ioremap_wt); + +void __iomem * +__ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *caller) +{ + unsigned long v; + phys_addr_t p, offset; + int err; + + /* + * If the address lies within the first 16 MB, assume it's in ISA + * memory space + */ + if (addr < SZ_16M) + addr += _ISA_MEM_BASE; + + /* + * Choose an address to map it to. + * Once the vmalloc system is running, we use it. 
+ * Before then, we use space going down from IOREMAP_TOP + * (ioremap_bot records where we're up to). + */ + p = addr & PAGE_MASK; + offset = addr & ~PAGE_MASK; + size = PAGE_ALIGN(addr + size) - p; + +#ifndef CONFIG_CRASH_DUMP + /* + * Don't allow anybody to remap normal RAM that we're using. + * mem_init() sets high_memory so only do the check after that. + */ + if (slab_is_available() && p <= virt_to_phys(high_memory - 1) && + page_is_ram(__phys_to_pfn(p))) { + pr_warn("%s(): phys addr 0x%llx is RAM lr %ps\n", __func__, + (unsigned long long)p, __builtin_return_address(0)); + return NULL; + } +#endif + + if (size == 0) + return NULL; + + /* + * Is it already mapped? Perhaps overlapped by a previous + * mapping. + */ + v = p_block_mapped(p); + if (v) + return (void __iomem *)v + offset; + + if (slab_is_available()) + return generic_ioremap_prot(addr, size, prot); + + /* + * Should check if it is a candidate for a BAT mapping + */ + pr_warn("ioremap() called early from %pS. Use early_ioremap() instead\n", caller); + + err = early_ioremap_range(ioremap_bot - size - PAGE_SIZE, p, size, prot); + if (err) + return NULL; + ioremap_bot -= size + PAGE_SIZE; + + return (void __iomem *)ioremap_bot + offset; +} + +void iounmap(volatile void __iomem *addr) +{ + /* + * If mapped by BATs then there is nothing to do. + * Calling vfree() generates a benign warning. + */ + if (v_block_mapped((unsigned long)addr)) + return; + + generic_iounmap(addr); +} +EXPORT_SYMBOL(iounmap); diff --git a/arch/powerpc/mm/ioremap_64.c b/arch/powerpc/mm/ioremap_64.c new file mode 100644 index 000000000000..fb8b55bd2cd5 --- /dev/null +++ b/arch/powerpc/mm/ioremap_64.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/io.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> + +void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size, + pgprot_t prot, void *caller) +{ + phys_addr_t paligned, offset; + void __iomem *ret; + int err; + + /* We don't support the 4K PFN hack with ioremap */ + if (pgprot_val(prot) & H_PAGE_4K_PFN) + return NULL; + + /* + * Choose an address to map it to. Once the vmalloc system is running, + * we use it. Before that, we map using addresses going up from + * ioremap_bot. vmalloc will use the addresses from IOREMAP_BASE + * through ioremap_bot. + */ + paligned = addr & PAGE_MASK; + offset = addr & ~PAGE_MASK; + size = PAGE_ALIGN(addr + size) - paligned; + + if (size == 0 || paligned == 0) + return NULL; + + if (slab_is_available()) + return generic_ioremap_prot(addr, size, prot); + + pr_warn("ioremap() called early from %pS. Use early_ioremap() instead\n", caller); + + err = early_ioremap_range(ioremap_bot, paligned, size, prot); + if (err) + return NULL; + + ret = (void __iomem *)ioremap_bot + offset; + ioremap_bot += size + PAGE_SIZE; + + return ret; +} + +/* + * Unmap an IO region and remove it from vmalloc'd list. + * Access to IO memory should be serialized by driver. 
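+ * (Mappings created before slab is available are never entered in the
+ * vmalloc tree, which is why iounmap() below simply returns early for
+ * them instead of calling generic_iounmap().)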
+ */ +void iounmap(volatile void __iomem *token) +{ + if (!slab_is_available()) + return; + + generic_iounmap(token); +} +EXPORT_SYMBOL(iounmap); diff --git a/arch/powerpc/mm/kasan/8xx.c b/arch/powerpc/mm/kasan/8xx.c new file mode 100644 index 000000000000..989d6cdf4141 --- /dev/null +++ b/arch/powerpc/mm/kasan/8xx.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define DISABLE_BRANCH_PROFILING + +#include <linux/kasan.h> +#include <linux/memblock.h> +#include <linux/hugetlb.h> + +#include <asm/pgalloc.h> + +static int __init +kasan_init_shadow_8M(unsigned long k_start, unsigned long k_end, void *block) +{ + pmd_t *pmd = pmd_off_k(k_start); + unsigned long k_cur, k_next; + + for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pmd++, block += SZ_4M) { + pte_t *ptep; + int i; + + k_next = pgd_addr_end(k_cur, k_end); + if ((void *)pmd_page_vaddr(*pmd) != kasan_early_shadow_pte) + continue; + + ptep = memblock_alloc(PTE_FRAG_SIZE, PTE_FRAG_SIZE); + if (!ptep) + return -ENOMEM; + + for (i = 0; i < PTRS_PER_PTE; i++) { + pte_t pte = pte_mkhuge(pfn_pte(PHYS_PFN(__pa(block + i * PAGE_SIZE)), PAGE_KERNEL)); + + __set_pte_at(&init_mm, k_cur, ptep + i, pte, 1); + } + pmd_populate_kernel(&init_mm, pmd, ptep); + *pmd = __pmd(pmd_val(*pmd) | _PMD_PAGE_8M); + } + return 0; +} + +int __init kasan_init_region(void *start, size_t size) +{ + unsigned long k_start = (unsigned long)kasan_mem_to_shadow(start); + unsigned long k_end = (unsigned long)kasan_mem_to_shadow(start + size); + unsigned long k_cur; + int ret; + void *block; + + block = memblock_alloc(k_end - k_start, SZ_8M); + if (!block) + return -ENOMEM; + + if (IS_ALIGNED(k_start, SZ_8M)) { + kasan_init_shadow_8M(k_start, ALIGN_DOWN(k_end, SZ_8M), block); + k_cur = ALIGN_DOWN(k_end, SZ_8M); + if (k_cur == k_end) + goto finish; + } else { + k_cur = k_start; + } + + ret = kasan_init_shadow_page_tables(k_start, k_end); + if (ret) + return ret; + + for (; k_cur < k_end; k_cur += PAGE_SIZE) { + pmd_t *pmd = pmd_off_k(k_cur); + void *va = block + k_cur - k_start; + pte_t pte = pfn_pte(PHYS_PFN(__pa(va)), PAGE_KERNEL); + + if (k_cur < ALIGN_DOWN(k_end, SZ_512K)) + pte = pte_mkhuge(pte); + + __set_pte_at(&init_mm, k_cur, pte_offset_kernel(pmd, k_cur), pte, 0); + } +finish: + flush_tlb_kernel_range(k_start, k_end); + return 0; +} diff --git a/arch/powerpc/mm/kasan/Makefile b/arch/powerpc/mm/kasan/Makefile new file mode 100644 index 000000000000..f9522fd70b2f --- /dev/null +++ b/arch/powerpc/mm/kasan/Makefile @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0 + +KASAN_SANITIZE := n +KCOV_INSTRUMENT := n + +obj-$(CONFIG_PPC32) += init_32.o +obj-$(CONFIG_PPC_8xx) += 8xx.o +obj-$(CONFIG_PPC_BOOK3S_32) += book3s_32.o +obj-$(CONFIG_PPC_BOOK3S_64) += init_book3s_64.o +obj-$(CONFIG_PPC_BOOK3E_64) += init_book3e_64.o diff --git a/arch/powerpc/mm/kasan/book3s_32.c b/arch/powerpc/mm/kasan/book3s_32.c new file mode 100644 index 000000000000..450a67ef0bbe --- /dev/null +++ b/arch/powerpc/mm/kasan/book3s_32.c @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define DISABLE_BRANCH_PROFILING + +#include <linux/kasan.h> +#include <linux/memblock.h> +#include <mm/mmu_decl.h> + +int __init kasan_init_region(void *start, size_t size) +{ + unsigned long k_start = (unsigned long)kasan_mem_to_shadow(start); + unsigned long k_end = (unsigned long)kasan_mem_to_shadow(start + size); + unsigned long k_nobat = k_start; + unsigned long k_cur; + phys_addr_t phys; + int ret; + + while (k_nobat < k_end) { + unsigned int k_size = bat_block_size(k_nobat, k_end); + int 
idx = find_free_bat(); + + if (idx == -1) + break; + if (k_size < SZ_128K) + break; + phys = memblock_phys_alloc_range(k_size, k_size, 0, + MEMBLOCK_ALLOC_ANYWHERE); + if (!phys) + break; + + setbat(idx, k_nobat, phys, k_size, PAGE_KERNEL); + k_nobat += k_size; + } + if (k_nobat != k_start) + update_bats(); + + if (k_nobat < k_end) { + phys = memblock_phys_alloc_range(k_end - k_nobat, PAGE_SIZE, 0, + MEMBLOCK_ALLOC_ANYWHERE); + if (!phys) + return -ENOMEM; + } + + ret = kasan_init_shadow_page_tables(k_start, k_end); + if (ret) + return ret; + + kasan_update_early_region(k_start, k_nobat, __pte(0)); + + for (k_cur = k_nobat; k_cur < k_end; k_cur += PAGE_SIZE) { + pmd_t *pmd = pmd_off_k(k_cur); + pte_t pte = pfn_pte(PHYS_PFN(phys + k_cur - k_nobat), PAGE_KERNEL); + + __set_pte_at(&init_mm, k_cur, pte_offset_kernel(pmd, k_cur), pte, 0); + } + flush_tlb_kernel_range(k_start, k_end); + memset(kasan_mem_to_shadow(start), 0, k_end - k_start); + + return 0; +} diff --git a/arch/powerpc/mm/kasan/init_32.c b/arch/powerpc/mm/kasan/init_32.c new file mode 100644 index 000000000000..1d083597464f --- /dev/null +++ b/arch/powerpc/mm/kasan/init_32.c @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define DISABLE_BRANCH_PROFILING + +#include <linux/kasan.h> +#include <linux/printk.h> +#include <linux/memblock.h> +#include <linux/sched/task.h> +#include <asm/pgalloc.h> +#include <asm/text-patching.h> +#include <mm/mmu_decl.h> + +static pgprot_t __init kasan_prot_ro(void) +{ + if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) + return PAGE_READONLY; + + return PAGE_KERNEL_RO; +} + +static void __init kasan_populate_pte(pte_t *ptep, pgprot_t prot) +{ + unsigned long va = (unsigned long)kasan_early_shadow_page; + phys_addr_t pa = __pa(kasan_early_shadow_page); + int i; + + for (i = 0; i < PTRS_PER_PTE; i++, ptep++) + __set_pte_at(&init_mm, va, ptep, pfn_pte(PHYS_PFN(pa), prot), 1); +} + +int __init kasan_init_shadow_page_tables(unsigned long k_start, unsigned long k_end) +{ + pmd_t *pmd; + unsigned long k_cur, k_next; + + pmd = pmd_off_k(k_start); + + for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pmd++) { + pte_t *new; + + k_next = pgd_addr_end(k_cur, k_end); + if ((void *)pmd_page_vaddr(*pmd) != kasan_early_shadow_pte) + continue; + + new = memblock_alloc(PTE_FRAG_SIZE, PTE_FRAG_SIZE); + + if (!new) + return -ENOMEM; + kasan_populate_pte(new, PAGE_KERNEL); + pmd_populate_kernel(&init_mm, pmd, new); + } + return 0; +} + +int __init __weak kasan_init_region(void *start, size_t size) +{ + unsigned long k_start = (unsigned long)kasan_mem_to_shadow(start); + unsigned long k_end = (unsigned long)kasan_mem_to_shadow(start + size); + unsigned long k_cur; + int ret; + void *block; + + ret = kasan_init_shadow_page_tables(k_start, k_end); + if (ret) + return ret; + + k_start = k_start & PAGE_MASK; + block = memblock_alloc(k_end - k_start, PAGE_SIZE); + if (!block) + return -ENOMEM; + + for (k_cur = k_start & PAGE_MASK; k_cur < k_end; k_cur += PAGE_SIZE) { + pmd_t *pmd = pmd_off_k(k_cur); + void *va = block + k_cur - k_start; + pte_t pte = pfn_pte(PHYS_PFN(__pa(va)), PAGE_KERNEL); + + __set_pte_at(&init_mm, k_cur, pte_offset_kernel(pmd, k_cur), pte, 0); + } + flush_tlb_kernel_range(k_start, k_end); + return 0; +} + +void __init +kasan_update_early_region(unsigned long k_start, unsigned long k_end, pte_t pte) +{ + unsigned long k_cur; + + for (k_cur = k_start; k_cur != k_end; k_cur += PAGE_SIZE) { + pmd_t *pmd = pmd_off_k(k_cur); + pte_t *ptep = pte_offset_kernel(pmd, k_cur); + + if (pte_page(*ptep) != 
virt_to_page(lm_alias(kasan_early_shadow_page))) + continue; + + __set_pte_at(&init_mm, k_cur, ptep, pte, 0); + } + + flush_tlb_kernel_range(k_start, k_end); +} + +static void __init kasan_remap_early_shadow_ro(void) +{ + pgprot_t prot = kasan_prot_ro(); + phys_addr_t pa = __pa(kasan_early_shadow_page); + + kasan_populate_pte(kasan_early_shadow_pte, prot); + + kasan_update_early_region(KASAN_SHADOW_START, KASAN_SHADOW_END, + pfn_pte(PHYS_PFN(pa), prot)); +} + +static void __init kasan_unmap_early_shadow_vmalloc(void) +{ + unsigned long k_start = (unsigned long)kasan_mem_to_shadow((void *)VMALLOC_START); + unsigned long k_end = (unsigned long)kasan_mem_to_shadow((void *)VMALLOC_END); + + kasan_update_early_region(k_start, k_end, __pte(0)); + +#ifdef MODULES_VADDR + k_start = (unsigned long)kasan_mem_to_shadow((void *)MODULES_VADDR); + k_end = (unsigned long)kasan_mem_to_shadow((void *)MODULES_END); + kasan_update_early_region(k_start, k_end, __pte(0)); +#endif +} + +void __init kasan_mmu_init(void) +{ + int ret; + + if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) { + ret = kasan_init_shadow_page_tables(KASAN_SHADOW_START, KASAN_SHADOW_END); + + if (ret) + panic("kasan: kasan_init_shadow_page_tables() failed"); + } +} + +void __init kasan_init(void) +{ + phys_addr_t base, end; + u64 i; + int ret; + + for_each_mem_range(i, &base, &end) { + phys_addr_t top = min(end, total_lowmem); + + if (base >= top) + continue; + + ret = kasan_init_region(__va(base), top - base); + if (ret) + panic("kasan: kasan_init_region() failed"); + } + + if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) { + ret = kasan_init_shadow_page_tables(KASAN_SHADOW_START, KASAN_SHADOW_END); + + if (ret) + panic("kasan: kasan_init_shadow_page_tables() failed"); + } + + kasan_remap_early_shadow_ro(); + + clear_page(kasan_early_shadow_page); + + /* At this point kasan is fully initialized. 
Enable error messages */ + init_task.kasan_depth = 0; + kasan_init_generic(); +} + +void __init kasan_late_init(void) +{ + if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) + kasan_unmap_early_shadow_vmalloc(); +} + +void __init kasan_early_init(void) +{ + unsigned long addr = KASAN_SHADOW_START; + unsigned long end = KASAN_SHADOW_END; + unsigned long next; + pmd_t *pmd = pmd_off_k(addr); + + BUILD_BUG_ON(KASAN_SHADOW_START & ~PGDIR_MASK); + + kasan_populate_pte(kasan_early_shadow_pte, PAGE_KERNEL); + + do { + next = pgd_addr_end(addr, end); + pmd_populate_kernel(&init_mm, pmd, kasan_early_shadow_pte); + } while (pmd++, addr = next, addr != end); +} diff --git a/arch/powerpc/mm/kasan/init_book3e_64.c b/arch/powerpc/mm/kasan/init_book3e_64.c new file mode 100644 index 000000000000..0d3a73d6d4b0 --- /dev/null +++ b/arch/powerpc/mm/kasan/init_book3e_64.c @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KASAN for 64-bit Book3e powerpc + * + * Copyright 2022, Christophe Leroy, CS GROUP France + */ + +#define DISABLE_BRANCH_PROFILING + +#include <linux/kasan.h> +#include <linux/printk.h> +#include <linux/memblock.h> +#include <linux/set_memory.h> + +#include <asm/pgalloc.h> + +static inline bool kasan_pud_table(p4d_t p4d) +{ + return p4d_page(p4d) == virt_to_page(lm_alias(kasan_early_shadow_pud)); +} + +static inline bool kasan_pmd_table(pud_t pud) +{ + return pud_page(pud) == virt_to_page(lm_alias(kasan_early_shadow_pmd)); +} + +static inline bool kasan_pte_table(pmd_t pmd) +{ + return pmd_page(pmd) == virt_to_page(lm_alias(kasan_early_shadow_pte)); +} + +static int __init kasan_map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) +{ + pgd_t *pgdp; + p4d_t *p4dp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + pgdp = pgd_offset_k(ea); + p4dp = p4d_offset(pgdp, ea); + if (kasan_pud_table(*p4dp)) { + pudp = memblock_alloc_or_panic(PUD_TABLE_SIZE, PUD_TABLE_SIZE); + memcpy(pudp, kasan_early_shadow_pud, PUD_TABLE_SIZE); + p4d_populate(&init_mm, p4dp, pudp); + } + pudp = pud_offset(p4dp, ea); + if (kasan_pmd_table(*pudp)) { + pmdp = memblock_alloc_or_panic(PMD_TABLE_SIZE, PMD_TABLE_SIZE); + memcpy(pmdp, kasan_early_shadow_pmd, PMD_TABLE_SIZE); + pud_populate(&init_mm, pudp, pmdp); + } + pmdp = pmd_offset(pudp, ea); + if (kasan_pte_table(*pmdp)) { + ptep = memblock_alloc_or_panic(PTE_TABLE_SIZE, PTE_TABLE_SIZE); + memcpy(ptep, kasan_early_shadow_pte, PTE_TABLE_SIZE); + pmd_populate_kernel(&init_mm, pmdp, ptep); + } + ptep = pte_offset_kernel(pmdp, ea); + + __set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot), 0); + + return 0; +} + +static void __init kasan_init_phys_region(void *start, void *end) +{ + unsigned long k_start, k_end, k_cur; + void *va; + + if (start >= end) + return; + + k_start = ALIGN_DOWN((unsigned long)kasan_mem_to_shadow(start), PAGE_SIZE); + k_end = ALIGN((unsigned long)kasan_mem_to_shadow(end), PAGE_SIZE); + + va = memblock_alloc_or_panic(k_end - k_start, PAGE_SIZE); + for (k_cur = k_start; k_cur < k_end; k_cur += PAGE_SIZE, va += PAGE_SIZE) + kasan_map_kernel_page(k_cur, __pa(va), PAGE_KERNEL); +} + +void __init kasan_early_init(void) +{ + int i; + unsigned long addr; + pgd_t *pgd = pgd_offset_k(KASAN_SHADOW_START); + pte_t zero_pte = pfn_pte(virt_to_pfn(kasan_early_shadow_page), PAGE_KERNEL); + + BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, PGDIR_SIZE)); + BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, PGDIR_SIZE)); + + for (i = 0; i < PTRS_PER_PTE; i++) + __set_pte_at(&init_mm, (unsigned long)kasan_early_shadow_page, + &kasan_early_shadow_pte[i], 
zero_pte, 0); + + for (i = 0; i < PTRS_PER_PMD; i++) + pmd_populate_kernel(&init_mm, &kasan_early_shadow_pmd[i], + kasan_early_shadow_pte); + + for (i = 0; i < PTRS_PER_PUD; i++) + pud_populate(&init_mm, &kasan_early_shadow_pud[i], + kasan_early_shadow_pmd); + + for (addr = KASAN_SHADOW_START; addr != KASAN_SHADOW_END; addr += PGDIR_SIZE) + p4d_populate(&init_mm, p4d_offset(pgd++, addr), kasan_early_shadow_pud); +} + +void __init kasan_init(void) +{ + phys_addr_t start, end; + u64 i; + pte_t zero_pte = pfn_pte(virt_to_pfn(kasan_early_shadow_page), PAGE_KERNEL_RO); + + for_each_mem_range(i, &start, &end) + kasan_init_phys_region(phys_to_virt(start), phys_to_virt(end)); + + if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) + kasan_remove_zero_shadow((void *)VMALLOC_START, VMALLOC_SIZE); + + for (i = 0; i < PTRS_PER_PTE; i++) + __set_pte_at(&init_mm, (unsigned long)kasan_early_shadow_page, + &kasan_early_shadow_pte[i], zero_pte, 0); + + flush_tlb_kernel_range(KASAN_SHADOW_START, KASAN_SHADOW_END); + + memset(kasan_early_shadow_page, 0, PAGE_SIZE); + + /* Enable error messages */ + init_task.kasan_depth = 0; + kasan_init_generic(); +} + +void __init kasan_late_init(void) { } diff --git a/arch/powerpc/mm/kasan/init_book3s_64.c b/arch/powerpc/mm/kasan/init_book3s_64.c new file mode 100644 index 000000000000..dcafa641804c --- /dev/null +++ b/arch/powerpc/mm/kasan/init_book3s_64.c @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KASAN for 64-bit Book3S powerpc + * + * Copyright 2019-2022, Daniel Axtens, IBM Corporation. + */ + +/* + * ppc64 turns on virtual memory late in boot, after calling into generic code + * like the device-tree parser, so it uses this in conjunction with a hook in + * outline mode to avoid invalid access early in boot. + */ + +#define DISABLE_BRANCH_PROFILING + +#include <linux/kasan.h> +#include <linux/printk.h> +#include <linux/sched/task.h> +#include <linux/memblock.h> +#include <asm/pgalloc.h> + +static void __init kasan_init_phys_region(void *start, void *end) +{ + unsigned long k_start, k_end, k_cur; + void *va; + + if (start >= end) + return; + + k_start = ALIGN_DOWN((unsigned long)kasan_mem_to_shadow(start), PAGE_SIZE); + k_end = ALIGN((unsigned long)kasan_mem_to_shadow(end), PAGE_SIZE); + + va = memblock_alloc_or_panic(k_end - k_start, PAGE_SIZE); + for (k_cur = k_start; k_cur < k_end; k_cur += PAGE_SIZE, va += PAGE_SIZE) + map_kernel_page(k_cur, __pa(va), PAGE_KERNEL); +} + +void __init kasan_init(void) +{ + /* + * We want to do the following things: + * 1) Map real memory into the shadow for all physical memblocks + * This takes us from c000... to c008... + * 2) Leave a hole over the shadow of vmalloc space. KASAN_VMALLOC + * will manage this for us. + * This takes us from c008... to c00a... + * 3) Map the 'early shadow'/zero page over iomap and vmemmap space. + * This takes us up to where we start at c00e... 
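+ * (The shadow bounds used just below follow from this layout: k_start
+ * is the shadow of RADIX_VMALLOC_END and k_end the shadow of
+ * RADIX_VMEMMAP_END.)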
+ */ + + void *k_start = kasan_mem_to_shadow((void *)RADIX_VMALLOC_END); + void *k_end = kasan_mem_to_shadow((void *)RADIX_VMEMMAP_END); + phys_addr_t start, end; + u64 i; + pte_t zero_pte = pfn_pte(virt_to_pfn(kasan_early_shadow_page), PAGE_KERNEL); + + if (!early_radix_enabled()) { + pr_warn("KASAN not enabled as it requires radix!"); + return; + } + + for_each_mem_range(i, &start, &end) + kasan_init_phys_region(phys_to_virt(start), phys_to_virt(end)); + + for (i = 0; i < PTRS_PER_PTE; i++) + __set_pte_at(&init_mm, (unsigned long)kasan_early_shadow_page, + &kasan_early_shadow_pte[i], zero_pte, 0); + + for (i = 0; i < PTRS_PER_PMD; i++) + pmd_populate_kernel(&init_mm, &kasan_early_shadow_pmd[i], + kasan_early_shadow_pte); + + for (i = 0; i < PTRS_PER_PUD; i++) + pud_populate(&init_mm, &kasan_early_shadow_pud[i], + kasan_early_shadow_pmd); + + /* map the early shadow over the iomap and vmemmap space */ + kasan_populate_early_shadow(k_start, k_end); + + /* mark early shadow region as RO and wipe it */ + zero_pte = pfn_pte(virt_to_pfn(kasan_early_shadow_page), PAGE_KERNEL_RO); + for (i = 0; i < PTRS_PER_PTE; i++) + __set_pte_at(&init_mm, (unsigned long)kasan_early_shadow_page, + &kasan_early_shadow_pte[i], zero_pte, 0); + + /* + * clear_page relies on some cache info that hasn't been set up yet. + * It ends up looping ~forever and blows up other data. + * Use memset instead. + */ + memset(kasan_early_shadow_page, 0, PAGE_SIZE); + + /* Enable error messages */ + init_task.kasan_depth = 0; + kasan_init_generic(); +} + +void __init kasan_early_init(void) { } + +void __init kasan_late_init(void) { } diff --git a/arch/powerpc/mm/maccess.c b/arch/powerpc/mm/maccess.c new file mode 100644 index 000000000000..ea821d0ffe16 --- /dev/null +++ b/arch/powerpc/mm/maccess.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/uaccess.h> +#include <linux/kernel.h> + +#include <asm/disassemble.h> +#include <asm/inst.h> +#include <asm/ppc-opcode.h> + +bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) +{ + return is_kernel_addr((unsigned long)unsafe_src); +} diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 7f4bea162026..3ddbfdbfa941 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * PowerPC version * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) @@ -9,251 +10,218 @@ * * Derived from "arch/i386/mm/init.c" * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- * */ -#include <linux/export.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/gfp.h> -#include <linux/types.h> -#include <linux/mm.h> -#include <linux/stddef.h> -#include <linux/init.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/highmem.h> -#include <linux/initrd.h> -#include <linux/pagemap.h> #include <linux/suspend.h> -#include <linux/memblock.h> -#include <linux/hugetlb.h> -#include <linux/slab.h> - -#include <asm/pgalloc.h> -#include <asm/prom.h> -#include <asm/io.h> -#include <asm/mmu_context.h> -#include <asm/pgtable.h> -#include <asm/mmu.h> -#include <asm/smp.h> -#include <asm/machdep.h> -#include <asm/btext.h> -#include <asm/tlb.h> -#include <asm/sections.h> -#include <asm/sparsemem.h> -#include <asm/vdso.h> -#include <asm/fixmap.h> +#include <linux/dma-direct.h> +#include <linux/execmem.h> +#include <linux/vmalloc.h> + #include <asm/swiotlb.h> +#include <asm/machdep.h> #include <asm/rtas.h> +#include <asm/kasan.h> +#include <asm/svm.h> +#include <asm/mmzone.h> +#include <asm/ftrace.h> +#include <asm/text-patching.h> +#include <asm/setup.h> +#include <asm/fixmap.h> -#include "mmu_decl.h" - -#ifndef CPU_FTR_COHERENT_ICACHE -#define CPU_FTR_COHERENT_ICACHE 0 /* XXX for now */ -#define CPU_FTR_NOEXECUTE 0 -#endif - -int init_bootmem_done; -int mem_init_done; -unsigned long long memory_limit; - -#ifdef CONFIG_HIGHMEM -pte_t *kmap_pte; -EXPORT_SYMBOL(kmap_pte); -pgprot_t kmap_prot; -EXPORT_SYMBOL(kmap_prot); - -static inline pte_t *virt_to_kpte(unsigned long vaddr) -{ - return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), - vaddr), vaddr), vaddr); -} -#endif +#include <mm/mmu_decl.h> -int page_is_ram(unsigned long pfn) -{ -#ifndef CONFIG_PPC64 /* XXX for now */ - return pfn < max_pfn; -#else - unsigned long paddr = (pfn << PAGE_SHIFT); - struct memblock_region *reg; +unsigned long long memory_limit __initdata; - for_each_memblock(memory, reg) - if (paddr >= reg->base && paddr < (reg->base + reg->size)) - return 1; - return 0; -#endif -} +unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; +EXPORT_SYMBOL(empty_zero_page); -pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, - unsigned long size, pgprot_t vma_prot) +pgprot_t __phys_mem_access_prot(unsigned long pfn, unsigned long size, + pgprot_t vma_prot) { if (ppc_md.phys_mem_access_prot) - return ppc_md.phys_mem_access_prot(file, pfn, size, vma_prot); + return ppc_md.phys_mem_access_prot(pfn, size, vma_prot); if (!page_is_ram(pfn)) vma_prot = pgprot_noncached(vma_prot); return vma_prot; } -EXPORT_SYMBOL(phys_mem_access_prot); +EXPORT_SYMBOL(__phys_mem_access_prot); #ifdef CONFIG_MEMORY_HOTPLUG +static DEFINE_MUTEX(linear_mapping_mutex); #ifdef CONFIG_NUMA int memory_add_physaddr_to_nid(u64 start) { return hot_add_scn_to_nid(start); } +EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif -int arch_add_memory(int nid, u64 start, u64 size) +int __weak create_section_mapping(unsigned long start, unsigned long end, + int nid, pgprot_t prot) { - struct pglist_data *pgdata; - struct zone *zone; - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long nr_pages = size >> PAGE_SHIFT; - - pgdata = NODE_DATA(nid); + return -ENODEV; +} - start = (unsigned long)__va(start); - if (create_section_mapping(start, start + size)) - return -EINVAL; +int __weak remove_section_mapping(unsigned long start, unsigned long end) +{ + return -ENODEV; +} - /* this should work for most 
non-highmem platforms */ - zone = pgdata->node_zones; +int __ref arch_create_linear_mapping(int nid, u64 start, u64 size, + struct mhp_params *params) +{ + int rc; - return __add_pages(nid, zone, start_pfn, nr_pages); + start = (unsigned long)__va(start); + mutex_lock(&linear_mapping_mutex); + rc = create_section_mapping(start, start + size, nid, + params->pgprot); + mutex_unlock(&linear_mapping_mutex); + if (rc) { + pr_warn("Unable to create linear mapping for 0x%llx..0x%llx: %d\n", + start, start + size, rc); + return -EFAULT; + } + return 0; } -#ifdef CONFIG_MEMORY_HOTREMOVE -int arch_remove_memory(u64 start, u64 size) +void __ref arch_remove_linear_mapping(u64 start, u64 size) { - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long nr_pages = size >> PAGE_SHIFT; - struct zone *zone; + int ret; - zone = page_zone(pfn_to_page(start_pfn)); - return __remove_pages(zone, start_pfn, nr_pages); + /* Remove htab bolted mappings for this section of memory */ + start = (unsigned long)__va(start); + + mutex_lock(&linear_mapping_mutex); + ret = remove_section_mapping(start, start + size); + mutex_unlock(&linear_mapping_mutex); + if (ret) + pr_warn("Unable to remove linear mapping for 0x%llx..0x%llx: %d\n", + start, start + size, ret); + + /* Ensure all vmalloc mappings are flushed in case they also + * hit that section of memory + */ + vm_unmap_aliases(); } -#endif -#endif /* CONFIG_MEMORY_HOTPLUG */ /* - * walk_memory_resource() needs to make sure there is no holes in a given - * memory range. PPC64 does not maintain the memory layout in /proc/iomem. - * Instead it maintains it in memblock.memory structures. Walk through the - * memory regions, find holes and callback for contiguous regions. + * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need + * updating. */ -int -walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, - void *arg, int (*func)(unsigned long, unsigned long, void *)) +static void update_end_of_memory_vars(u64 start, u64 size) { - struct memblock_region *reg; - unsigned long end_pfn = start_pfn + nr_pages; - unsigned long tstart, tend; - int ret = -1; - - for_each_memblock(memory, reg) { - tstart = max(start_pfn, memblock_region_memory_base_pfn(reg)); - tend = min(end_pfn, memblock_region_memory_end_pfn(reg)); - if (tstart >= tend) - continue; - ret = (*func)(tstart, tend - tstart, arg); - if (ret) - break; + unsigned long end_pfn = PFN_UP(start + size); + + if (end_pfn > max_pfn) { + max_pfn = end_pfn; + max_low_pfn = end_pfn; + high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; } +} + +int __ref add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, + struct mhp_params *params) +{ + int ret; + + ret = __add_pages(nid, start_pfn, nr_pages, params); + if (ret) + return ret; + + /* update max_pfn, max_low_pfn and high_memory */ + update_end_of_memory_vars(start_pfn << PAGE_SHIFT, + nr_pages << PAGE_SHIFT); + return ret; } -EXPORT_SYMBOL_GPL(walk_system_ram_range); -/* - * Initialize the bootmem system and give it all the memory we - * have available. If we are using highmem, we only put the - * lowmem into the bootmem system. 
- */ -#ifndef CONFIG_NEED_MULTIPLE_NODES -void __init do_init_bootmem(void) +int __ref arch_add_memory(int nid, u64 start, u64 size, + struct mhp_params *params) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + int rc; + + rc = arch_create_linear_mapping(nid, start, size, params); + if (rc) + return rc; + rc = add_pages(nid, start_pfn, nr_pages, params); + if (rc) + arch_remove_linear_mapping(start, size); + return rc; +} + +void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { - unsigned long start, bootmap_pages; - unsigned long total_pages; - struct memblock_region *reg; - int boot_mapsize; + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + __remove_pages(start_pfn, nr_pages, altmap); + arch_remove_linear_mapping(start, size); +} +#endif +#ifndef CONFIG_NUMA +void __init mem_topology_setup(void) +{ max_low_pfn = max_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; - total_pages = (memblock_end_of_DRAM() - memstart_addr) >> PAGE_SHIFT; + min_low_pfn = MEMORY_START >> PAGE_SHIFT; #ifdef CONFIG_HIGHMEM - total_pages = total_lowmem >> PAGE_SHIFT; max_low_pfn = lowmem_end_addr >> PAGE_SHIFT; #endif - /* - * Find an area to use for the bootmem bitmap. Calculate the size of - * bitmap required as (Total Memory) / PAGE_SIZE / BITS_PER_BYTE. - * Add 1 additional page in case the address isn't page-aligned. - */ - bootmap_pages = bootmem_bootmap_pages(total_pages); - - start = memblock_alloc(bootmap_pages << PAGE_SHIFT, PAGE_SIZE); - - min_low_pfn = MEMORY_START >> PAGE_SHIFT; - boot_mapsize = init_bootmem_node(NODE_DATA(0), start >> PAGE_SHIFT, min_low_pfn, max_low_pfn); - /* Place all memblock_regions in the same node and merge contiguous * memblock_regions */ - memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); - - /* Add all physical memory to the bootmem map, mark each area - * present. - */ -#ifdef CONFIG_HIGHMEM - free_bootmem_with_active_regions(0, lowmem_end_addr >> PAGE_SHIFT); - - /* reserve the sections we're already using */ - for_each_memblock(reserved, reg) { - unsigned long top = reg->base + reg->size - 1; - if (top < lowmem_end_addr) - reserve_bootmem(reg->base, reg->size, BOOTMEM_DEFAULT); - else if (reg->base < lowmem_end_addr) { - unsigned long trunc_size = lowmem_end_addr - reg->base; - reserve_bootmem(reg->base, trunc_size, BOOTMEM_DEFAULT); - } - } -#else - free_bootmem_with_active_regions(0, max_pfn); - - /* reserve the sections we're already using */ - for_each_memblock(reserved, reg) - reserve_bootmem(reg->base, reg->size, BOOTMEM_DEFAULT); -#endif - /* XXX need to clip this if using highmem? 
*/ - sparse_memory_present_with_active_regions(0); + memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0); +} - init_bootmem_done = 1; +void __init initmem_init(void) +{ + sparse_init(); } /* mark pages that don't exist as nosave */ static int __init mark_nonram_nosave(void) { - struct memblock_region *reg, *prev = NULL; - - for_each_memblock(memory, reg) { - if (prev && - memblock_region_memory_end_pfn(prev) < memblock_region_memory_base_pfn(reg)) - register_nosave_region(memblock_region_memory_end_pfn(prev), - memblock_region_memory_base_pfn(reg)); - prev = reg; + unsigned long spfn, epfn, prev = 0; + int i; + + for_each_mem_pfn_range(i, MAX_NUMNODES, &spfn, &epfn, NULL) { + if (prev && prev < spfn) + register_nosave_region(prev, spfn); + + prev = epfn; } + return 0; } +#else /* CONFIG_NUMA */ +static int __init mark_nonram_nosave(void) +{ + return 0; +} +#endif + +/* + * Zones usage: + * + * We setup ZONE_DMA to be 31-bits on all platforms and ZONE_NORMAL to be + * everything else. GFP_DMA32 page allocations automatically fall back to + * ZONE_DMA. + * + * By using 31-bit unconditionally, we can exploit zone_dma_limit to inform the + * generic DMA mapping code. 32-bit only devices (if not handled by an IOMMU + * anyway) will take a first dip into ZONE_NORMAL and get otherwise served by + * ZONE_DMA. + */ +static unsigned long max_zone_pfns[MAX_NR_ZONES]; /* * paging_init() sets up the page tables - in fact we've already done this. @@ -262,66 +230,72 @@ void __init paging_init(void) { unsigned long long total_ram = memblock_phys_mem_size(); phys_addr_t top_of_ram = memblock_end_of_DRAM(); - unsigned long max_zone_pfns[MAX_NR_ZONES]; + int zone_dma_bits; -#ifdef CONFIG_PPC32 - unsigned long v = __fix_to_virt(__end_of_fixed_addresses - 1); - unsigned long end = __fix_to_virt(FIX_HOLE); +#ifdef CONFIG_HIGHMEM + unsigned long v = __fix_to_virt(FIX_KMAP_END); + unsigned long end = __fix_to_virt(FIX_KMAP_BEGIN); for (; v < end; v += PAGE_SIZE) - map_page(v, 0, 0); /* XXX gross */ -#endif + map_kernel_page(v, 0, __pgprot(0)); /* XXX gross */ -#ifdef CONFIG_HIGHMEM - map_page(PKMAP_BASE, 0, 0); /* XXX gross */ + map_kernel_page(PKMAP_BASE, 0, __pgprot(0)); /* XXX gross */ pkmap_page_table = virt_to_kpte(PKMAP_BASE); - - kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN)); - kmap_prot = PAGE_KERNEL; #endif /* CONFIG_HIGHMEM */ printk(KERN_DEBUG "Top of RAM: 0x%llx, Total RAM: 0x%llx\n", (unsigned long long)top_of_ram, total_ram); printk(KERN_DEBUG "Memory hole size: %ldMB\n", (long int)((top_of_ram - total_ram) >> 20)); - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); + + /* + * Allow 30-bit DMA for very limited Broadcom wifi chips on many + * powerbooks. + */ + if (IS_ENABLED(CONFIG_PPC32)) + zone_dma_bits = 30; + else + zone_dma_bits = 31; + + zone_dma_limit = DMA_BIT_MASK(zone_dma_bits); + +#ifdef CONFIG_ZONE_DMA + max_zone_pfns[ZONE_DMA] = min(max_low_pfn, + 1UL << (zone_dma_bits - PAGE_SHIFT)); +#endif + max_zone_pfns[ZONE_NORMAL] = max_low_pfn; #ifdef CONFIG_HIGHMEM - max_zone_pfns[ZONE_DMA] = lowmem_end_addr >> PAGE_SHIFT; - max_zone_pfns[ZONE_HIGHMEM] = top_of_ram >> PAGE_SHIFT; -#else - max_zone_pfns[ZONE_DMA] = top_of_ram >> PAGE_SHIFT; + max_zone_pfns[ZONE_HIGHMEM] = max_pfn; #endif - free_area_init_nodes(max_zone_pfns); + + free_area_init(max_zone_pfns); mark_nonram_nosave(); } -#endif /* ! 
CONFIG_NEED_MULTIPLE_NODES */ -void __init mem_init(void) +void __init arch_mm_preinit(void) { + /* + * book3s is limited to 16 page sizes due to encoding this in + * a 4-bit field for slices. + */ + BUILD_BUG_ON(MMU_PAGE_COUNT > 16); + #ifdef CONFIG_SWIOTLB - swiotlb_init(0); + /* + * Some platforms (e.g. 85xx) limit DMA-able memory way below + * 4G. We force memblock to bottom-up mode to ensure that the + * memory allocated in swiotlb_init() is DMA-able. + * As it's the last memblock allocation, no need to reset it + * back to top-down. + */ + memblock_set_bottom_up(true); + swiotlb_init(ppc_swiotlb_enable, ppc_swiotlb_flags); #endif - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); - set_max_mapnr(max_pfn); - free_all_bootmem(); - -#ifdef CONFIG_HIGHMEM - { - unsigned long pfn, highmem_mapnr; - - highmem_mapnr = lowmem_end_addr >> PAGE_SHIFT; - for (pfn = highmem_mapnr; pfn < max_mapnr; ++pfn) { - phys_addr_t paddr = (phys_addr_t)pfn << PAGE_SHIFT; - struct page *page = pfn_to_page(pfn); - if (!memblock_is_reserved(paddr)) - free_highmem_page(page); - } - } -#endif /* CONFIG_HIGHMEM */ + kasan_late_init(); -#if defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_SMP) +#if defined(CONFIG_PPC_E500) && !defined(CONFIG_SMP) /* * If smp is enabled, next_tlbcam_idx is initialized in the cpu up * functions... do it here for the non-smp case. @@ -329,193 +303,42 @@ void __init mem_init(void) per_cpu(next_tlbcam_idx, smp_processor_id()) = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) - 1; #endif - - mem_init_print_info(NULL); -#ifdef CONFIG_PPC32 - pr_info("Kernel virtual memory layout:\n"); - pr_info(" * 0x%08lx..0x%08lx : fixmap\n", FIXADDR_START, FIXADDR_TOP); -#ifdef CONFIG_HIGHMEM - pr_info(" * 0x%08lx..0x%08lx : highmem PTEs\n", - PKMAP_BASE, PKMAP_ADDR(LAST_PKMAP)); -#endif /* CONFIG_HIGHMEM */ -#ifdef CONFIG_NOT_COHERENT_CACHE - pr_info(" * 0x%08lx..0x%08lx : consistent mem\n", - IOREMAP_TOP, IOREMAP_TOP + CONFIG_CONSISTENT_SIZE); -#endif /* CONFIG_NOT_COHERENT_CACHE */ - pr_info(" * 0x%08lx..0x%08lx : early ioremap\n", - ioremap_bot, IOREMAP_TOP); - pr_info(" * 0x%08lx..0x%08lx : vmalloc & ioremap\n", - VMALLOC_START, VMALLOC_END); -#endif /* CONFIG_PPC32 */ - - mem_init_done = 1; } void free_initmem(void) { ppc_md.progress = ppc_printk_progress; + mark_initmem_nx(); free_initmem_default(POISON_FREE_INITMEM); -} - -#ifdef CONFIG_BLK_DEV_INITRD -void __init free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - -/* - * This is called when a page has been modified by the kernel. - * It just marks the page as not i-cache clean. We do the i-cache - * flush later when the page is given to a user process, if necessary. 
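The bottom-up memblock pattern used in arch_mm_preinit() above is worth spelling out. A minimal sketch, with an explicit allocation standing in for the one swiotlb_init() performs internally (the buffer size is illustrative):

	void *buf;

	memblock_set_bottom_up(true);	/* allocations now grow from low addresses */
	buf = memblock_alloc(SZ_64M, PAGE_SIZE);	/* lands in DMA-able low memory */
	memblock_set_bottom_up(false);	/* restore the default top-down policy */

As the comment above notes, the real code skips the final restore because the swiotlb buffer is the last memblock allocation made.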
- */ -void flush_dcache_page(struct page *page) -{ - if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) - return; - /* avoid an atomic op if possible */ - if (test_bit(PG_arch_1, &page->flags)) - clear_bit(PG_arch_1, &page->flags); -} -EXPORT_SYMBOL(flush_dcache_page); - -void flush_dcache_icache_page(struct page *page) -{ -#ifdef CONFIG_HUGETLB_PAGE - if (PageCompound(page)) { - flush_dcache_icache_hugepage(page); - return; - } -#endif -#ifdef CONFIG_BOOKE - { - void *start = kmap_atomic(page); - __flush_dcache_icache(start); - kunmap_atomic(start); - } -#elif defined(CONFIG_8xx) || defined(CONFIG_PPC64) - /* On 8xx there is no need to kmap since highmem is not supported */ - __flush_dcache_icache(page_address(page)); -#else - __flush_dcache_icache_phys(page_to_pfn(page) << PAGE_SHIFT); -#endif -} -EXPORT_SYMBOL(flush_dcache_icache_page); - -void clear_user_page(void *page, unsigned long vaddr, struct page *pg) -{ - clear_page(page); - - /* - * We shouldn't have to do this, but some versions of glibc - * require it (ld.so assumes zero filled pages are icache clean) - * - Anton - */ - flush_dcache_page(pg); -} -EXPORT_SYMBOL(clear_user_page); - -void copy_user_page(void *vto, void *vfrom, unsigned long vaddr, - struct page *pg) -{ - copy_page(vto, vfrom); - - /* - * We should be able to use the following optimisation, however - * there are two problems. - * Firstly a bug in some versions of binutils meant PLT sections - * were not marked executable. - * Secondly the first word in the GOT section is blrl, used - * to establish the GOT address. Until recently the GOT was - * not marked executable. - * - Anton - */ -#if 0 - if (!vma->vm_file && ((vma->vm_flags & VM_EXEC) == 0)) - return; -#endif - - flush_dcache_page(pg); -} - -void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, - unsigned long addr, int len) -{ - unsigned long maddr; - - maddr = (unsigned long) kmap(page) + (addr & ~PAGE_MASK); - flush_icache_range(maddr, maddr + len); - kunmap(page); -} -EXPORT_SYMBOL(flush_icache_user_range); - -/* - * This is called at the end of handling a user page fault, when the - * fault has been handled by updating a PTE in the linux page tables. - * We use it to preload an HPTE into the hash table corresponding to - * the updated linux PTE. - * - * This must always be called with the pte lock held. - */ -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, - pte_t *ptep) -{ -#ifdef CONFIG_PPC_STD_MMU - /* - * We don't need to worry about _PAGE_PRESENT here because we are - * called with either mm->page_table_lock held or ptl lock held - */ - unsigned long access = 0, trap; - - /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ - if (!pte_young(*ptep) || address >= TASK_SIZE) - return; - - /* We try to figure out if we are coming from an instruction - * access fault and pass that down to __hash_page so we avoid - * double-faulting on execution of fresh text. 
We have to test - * for regs NULL since init will get here first thing at boot - * - * We also avoid filling the hash if not coming from a fault - */ - if (current->thread.regs == NULL) - return; - trap = TRAP(current->thread.regs); - if (trap == 0x400) - access |= _PAGE_EXEC; - else if (trap != 0x300) - return; - hash_preload(vma->vm_mm, address, access, trap); -#endif /* CONFIG_PPC_STD_MMU */ -#if (defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_FSL_BOOK3E)) \ - && defined(CONFIG_HUGETLB_PAGE) - if (is_vm_hugetlb_page(vma)) - book3e_hugetlb_preload(vma, address, *ptep); -#endif + ftrace_free_init_tramp(); } /* * System memory should not be in /proc/iomem but various tools expect it * (e.g. kdump). */ -static int add_system_ram_resources(void) +static int __init add_system_ram_resources(void) { - struct memblock_region *reg; + phys_addr_t start, end; + u64 i; - for_each_memblock(memory, reg) { + for_each_mem_range(i, &start, &end) { struct resource *res; - unsigned long base = reg->base; - unsigned long size = reg->size; res = kzalloc(sizeof(struct resource), GFP_KERNEL); WARN_ON(!res); if (res) { res->name = "System RAM"; - res->start = base; - res->end = base + size - 1; - res->flags = IORESOURCE_MEM; - WARN_ON(request_resource(&iomem_resource, res) < 0); + res->start = start; + /* + * In memblock, end points to the first byte after + * the range while in resources, end points to the + * last byte in the range. + */ + res->end = end - 1; + res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + WARN_ON(insert_resource(&iomem_resource, res) < 0); } } @@ -533,12 +356,95 @@ subsys_initcall(add_system_ram_resources); */ int devmem_is_allowed(unsigned long pfn) { - if (iomem_is_exclusive(pfn << PAGE_SHIFT)) + if (page_is_rtas_user_buf(pfn)) + return 1; + if (iomem_is_exclusive(PFN_PHYS(pfn))) return 0; if (!page_is_ram(pfn)) return 1; - if (page_is_rtas_user_buf(pfn)) - return 1; return 0; } #endif /* CONFIG_STRICT_DEVMEM */ + +/* + * This is defined in kernel/resource.c but only powerpc needs to export it, for + * the EHEA driver. Drop this when drivers/net/ethernet/ibm/ehea is removed. + */ +EXPORT_SYMBOL_GPL(walk_system_ram_range); + +#ifdef CONFIG_EXECMEM +static struct execmem_info execmem_info __ro_after_init; + +#if defined(CONFIG_PPC_8xx) || defined(CONFIG_PPC_BOOK3S_603) +static void prealloc_execmem_pgtable(void) +{ + unsigned long va; + + for (va = ALIGN_DOWN(MODULES_VADDR, PGDIR_SIZE); va < MODULES_END; va += PGDIR_SIZE) + pte_alloc_kernel(pmd_off_k(va), va); +} +#else +static void prealloc_execmem_pgtable(void) { } +#endif + +struct execmem_info __init *execmem_arch_setup(void) +{ + pgprot_t kprobes_prot = strict_module_rwx_enabled() ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC; + pgprot_t prot = strict_module_rwx_enabled() ? 
PAGE_KERNEL : PAGE_KERNEL_EXEC; + unsigned long fallback_start = 0, fallback_end = 0; + unsigned long start, end; + + /* + * BOOK3S_32 and 8xx define MODULES_VADDR for text allocations and + * allow allocating data in the entire vmalloc space + */ +#ifdef MODULES_VADDR + unsigned long limit = (unsigned long)_etext - SZ_32M; + + BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR); + + /* First try within 32M limit from _etext to avoid branch trampolines */ + if (MODULES_VADDR < PAGE_OFFSET && MODULES_END > limit) { + start = limit; + fallback_start = MODULES_VADDR; + fallback_end = MODULES_END; + } else { + start = MODULES_VADDR; + } + + end = MODULES_END; +#else + start = VMALLOC_START; + end = VMALLOC_END; +#endif + + prealloc_execmem_pgtable(); + + execmem_info = (struct execmem_info){ + .ranges = { + [EXECMEM_DEFAULT] = { + .start = start, + .end = end, + .pgprot = prot, + .alignment = 1, + .fallback_start = fallback_start, + .fallback_end = fallback_end, + }, + [EXECMEM_KPROBES] = { + .start = VMALLOC_START, + .end = VMALLOC_END, + .pgprot = kprobes_prot, + .alignment = 1, + }, + [EXECMEM_MODULE_DATA] = { + .start = VMALLOC_START, + .end = VMALLOC_END, + .pgprot = PAGE_KERNEL, + .alignment = 1, + }, + }, + }; + + return &execmem_info; +} +#endif /* CONFIG_EXECMEM */ diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c deleted file mode 100644 index cb8bdbe4972f..000000000000 --- a/arch/powerpc/mm/mmap.c +++ /dev/null @@ -1,99 +0,0 @@ -/* - * flexible mmap layout support - * - * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * - * Started by Ingo Molnar <mingo@elte.hu> - */ - -#include <linux/personality.h> -#include <linux/mm.h> -#include <linux/random.h> -#include <linux/sched.h> - -/* - * Top of mmap area (just below the process stack). - * - * Leave at least a ~128 MB hole on 32bit applications. - * - * On 64bit applications we randomise the stack by 1GB so we need to - * space our mmap start address by a further 1GB, otherwise there is a - * chance the mmap area will end up closer to the stack than our ulimit - * requires. - */ -#define MIN_GAP32 (128*1024*1024) -#define MIN_GAP64 ((128 + 1024)*1024*1024UL) -#define MIN_GAP ((is_32bit_task()) ? 
MIN_GAP32 : MIN_GAP64) -#define MAX_GAP (TASK_SIZE/6*5) - -static inline int mmap_is_legacy(void) -{ - if (current->personality & ADDR_COMPAT_LAYOUT) - return 1; - - if (rlimit(RLIMIT_STACK) == RLIM_INFINITY) - return 1; - - return sysctl_legacy_va_layout; -} - -static unsigned long mmap_rnd(void) -{ - unsigned long rnd = 0; - - if (current->flags & PF_RANDOMIZE) { - /* 8MB for 32bit, 1GB for 64bit */ - if (is_32bit_task()) - rnd = (long)(get_random_int() % (1<<(23-PAGE_SHIFT))); - else - rnd = (long)(get_random_int() % (1<<(30-PAGE_SHIFT))); - } - return rnd << PAGE_SHIFT; -} - -static inline unsigned long mmap_base(void) -{ - unsigned long gap = rlimit(RLIMIT_STACK); - - if (gap < MIN_GAP) - gap = MIN_GAP; - else if (gap > MAX_GAP) - gap = MAX_GAP; - - return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd()); -} - -/* - * This function, called very early during the creation of a new - * process VM image, sets up which VM layout function to use: - */ -void arch_pick_mmap_layout(struct mm_struct *mm) -{ - /* - * Fall back to the standard layout if the personality - * bit is set, or if the expected stack growth is unlimited: - */ - if (mmap_is_legacy()) { - mm->mmap_base = TASK_UNMAPPED_BASE; - mm->get_unmapped_area = arch_get_unmapped_area; - } else { - mm->mmap_base = mmap_base(); - mm->get_unmapped_area = arch_get_unmapped_area_topdown; - } -} diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c new file mode 100644 index 000000000000..3e3af29b4523 --- /dev/null +++ b/arch/powerpc/mm/mmu_context.c @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Common implementation of switch_mm_irqs_off + * + * Copyright IBM Corp. 2017 + */ + +#include <linux/mm.h> +#include <linux/cpu.h> +#include <linux/sched/mm.h> + +#include <asm/mmu_context.h> +#include <asm/pgalloc.h> + +#if defined(CONFIG_PPC32) +static inline void switch_mm_pgdir(struct task_struct *tsk, + struct mm_struct *mm) +{ + /* 32-bit keeps track of the current PGDIR in the thread struct */ + tsk->thread.pgdir = mm->pgd; +#ifdef CONFIG_PPC_BOOK3S_32 + tsk->thread.sr0 = mm->context.sr0; +#endif +#if defined(CONFIG_BOOKE) && defined(CONFIG_PPC_KUAP) + tsk->thread.pid = mm->context.id; +#endif +} +#elif defined(CONFIG_PPC_BOOK3E_64) +static inline void switch_mm_pgdir(struct task_struct *tsk, + struct mm_struct *mm) +{ + /* 64-bit Book3E keeps track of current PGD in the PACA */ + get_paca()->pgd = mm->pgd; +#ifdef CONFIG_PPC_KUAP + tsk->thread.pid = mm->context.id; +#endif +} +#else +static inline void switch_mm_pgdir(struct task_struct *tsk, + struct mm_struct *mm) { } +#endif + +void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + int cpu = smp_processor_id(); + bool new_on_cpu = false; + + /* Mark this context as having been used on the new CPU */ + if (!cpumask_test_cpu(cpu, mm_cpumask(next))) { + VM_WARN_ON_ONCE(next == &init_mm); + cpumask_set_cpu(cpu, mm_cpumask(next)); + inc_mm_active_cpus(next); + + /* + * This full barrier orders the store to the cpumask above vs + * a subsequent load which allows this CPU/MMU to begin loading + * translations for 'next' from page table PTEs into the TLB. + * + * When using the radix MMU, that operation is the load of the + * MMU context id, which is then moved to SPRN_PID. + * + * For the hash MMU it is either the first load from slb_cache + * in switch_slb() to preload the SLBs, or the load of + * get_user_context which loads the context for the VSID hash + * to insert a new SLB, in the SLB fault handler. 
+ * + * On the other side, the barrier is in mm/tlb-radix.c for + * radix which orders earlier stores to clear the PTEs before + * the load of mm_cpumask to check which CPU TLBs should be + * flushed. For hash, pte_xchg to clear the PTE includes the + * barrier. + * + * This full barrier is also needed by membarrier when + * switching between processes after store to rq->curr, before + * user-space memory accesses. + */ + smp_mb(); + + new_on_cpu = true; + } + + /* Some subarchs need to track the PGD elsewhere */ + switch_mm_pgdir(tsk, next); + + /* Nothing else to do if we aren't actually switching */ + if (prev == next) + return; + + /* + * We must stop all altivec streams before changing the HW + * context + */ + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + asm volatile (PPC_DSSALL); + + if (!new_on_cpu) + membarrier_arch_switch_mm(prev, next, tsk); + + /* + * The actual HW switching method differs between the various + * sub architectures. Out of line for now + */ + switch_mmu_context(prev, next, tsk); + + VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(prev))); +} + +#ifndef CONFIG_PPC_BOOK3S_64 +void arch_exit_mmap(struct mm_struct *mm) +{ + void *frag = pte_frag_get(&mm->context); + + if (frag) + pte_frag_destroy(frag); +} +#endif diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c deleted file mode 100644 index 178876aef40f..000000000000 --- a/arch/powerpc/mm/mmu_context_hash64.c +++ /dev/null @@ -1,146 +0,0 @@ -/* - * MMU context allocation for 64-bit kernels. - * - * Copyright (C) 2004 Anton Blanchard, IBM Corp. <anton@samba.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
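The file removed below still used the two-step ida_pre_get()/ida_get_new_above() loop under a private spinlock. For comparison, a sketch of the same context-id allocation against the current, internally locked IDA API, reusing the file's mmu_context_ida and MAX_USER_CONTEXT (one behavioural difference: ida_alloc_range() returns -ENOSPC rather than -ENOMEM when the range is exhausted):

	static DEFINE_IDA(mmu_context_ida);

	int __init_new_context(void)
	{
		/* allocates an ID in [1, MAX_USER_CONTEXT] or returns a -errno */
		return ida_alloc_range(&mmu_context_ida, 1, MAX_USER_CONTEXT, GFP_KERNEL);
	}

	void __destroy_context(int context_id)
	{
		ida_free(&mmu_context_ida, context_id);
	}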
- * - */ - -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/types.h> -#include <linux/mm.h> -#include <linux/spinlock.h> -#include <linux/idr.h> -#include <linux/export.h> -#include <linux/gfp.h> -#include <linux/slab.h> - -#include <asm/mmu_context.h> -#include <asm/pgalloc.h> - -#include "icswx.h" - -static DEFINE_SPINLOCK(mmu_context_lock); -static DEFINE_IDA(mmu_context_ida); - -int __init_new_context(void) -{ - int index; - int err; - -again: - if (!ida_pre_get(&mmu_context_ida, GFP_KERNEL)) - return -ENOMEM; - - spin_lock(&mmu_context_lock); - err = ida_get_new_above(&mmu_context_ida, 1, &index); - spin_unlock(&mmu_context_lock); - - if (err == -EAGAIN) - goto again; - else if (err) - return err; - - if (index > MAX_USER_CONTEXT) { - spin_lock(&mmu_context_lock); - ida_remove(&mmu_context_ida, index); - spin_unlock(&mmu_context_lock); - return -ENOMEM; - } - - return index; -} -EXPORT_SYMBOL_GPL(__init_new_context); - -int init_new_context(struct task_struct *tsk, struct mm_struct *mm) -{ - int index; - - index = __init_new_context(); - if (index < 0) - return index; - - /* The old code would re-promote on fork, we don't do that - * when using slices as it could cause problem promoting slices - * that have been forced down to 4K - */ - if (slice_mm_new_context(mm)) - slice_set_user_psize(mm, mmu_virtual_psize); - subpage_prot_init_new_context(mm); - mm->context.id = index; -#ifdef CONFIG_PPC_ICSWX - mm->context.cop_lockp = kmalloc(sizeof(spinlock_t), GFP_KERNEL); - if (!mm->context.cop_lockp) { - __destroy_context(index); - subpage_prot_free(mm); - mm->context.id = MMU_NO_CONTEXT; - return -ENOMEM; - } - spin_lock_init(mm->context.cop_lockp); -#endif /* CONFIG_PPC_ICSWX */ - -#ifdef CONFIG_PPC_64K_PAGES - mm->context.pte_frag = NULL; -#endif - return 0; -} - -void __destroy_context(int context_id) -{ - spin_lock(&mmu_context_lock); - ida_remove(&mmu_context_ida, context_id); - spin_unlock(&mmu_context_lock); -} -EXPORT_SYMBOL_GPL(__destroy_context); - -#ifdef CONFIG_PPC_64K_PAGES -static void destroy_pagetable_page(struct mm_struct *mm) -{ - int count; - void *pte_frag; - struct page *page; - - pte_frag = mm->context.pte_frag; - if (!pte_frag) - return; - - page = virt_to_page(pte_frag); - /* drop all the pending references */ - count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT; - /* We allow PTE_FRAG_NR fragments from a PTE page */ - count = atomic_sub_return(PTE_FRAG_NR - count, &page->_count); - if (!count) { - pgtable_page_dtor(page); - free_hot_cold_page(page, 0); - } -} - -#else -static inline void destroy_pagetable_page(struct mm_struct *mm) -{ - return; -} -#endif - - -void destroy_context(struct mm_struct *mm) -{ - -#ifdef CONFIG_PPC_ICSWX - drop_cop(mm->context.acop, mm); - kfree(mm->context.cop_lockp); - mm->context.cop_lockp = NULL; -#endif /* CONFIG_PPC_ICSWX */ - - destroy_pagetable_page(mm); - __destroy_context(mm->context.id); - subpage_prot_free(mm); - mm->context.id = MMU_NO_CONTEXT; -} diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 83eb5d5f53d5..b2d1eea09761 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Declarations of procedures and variables shared between files * in arch/ppc/mm/. 
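The reference arithmetic in the destroy_pagetable_page() removal above is easiest to follow with numbers: the low bits of the cached pte_frag pointer encode how many fragments of the page were handed out, so the references held for the never-allocated remainder can be dropped in one step. A worked example (the constants are illustrative and depend on the page-size configuration):

	/*
	 * Assume 64K pages carved into 16 x 4K PTE fragments, i.e.
	 * PTE_FRAG_SIZE_SHIFT = 12 and PTE_FRAG_NR = 16. A cached pte_frag
	 * pointer at page offset 0x3000 means fragments 0..2 were handed out:
	 */
	count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT; /* = 3 */
	/* drop the 16 - 3 = 13 references held for the unallocated fragments */
	count = atomic_sub_return(PTE_FRAG_NR - count, &page->_count);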
@@ -11,53 +12,50 @@ * * Derived from "arch/i386/mm/init.c" * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * */ #include <linux/mm.h> -#include <asm/tlbflush.h> #include <asm/mmu.h> #ifdef CONFIG_PPC_MMU_NOHASH +#include <asm/trace.h> /* - * On 40x and 8xx, we directly inline tlbia and tlbivax + * On 8xx, we directly inline tlbia */ -#if defined(CONFIG_40x) || defined(CONFIG_8xx) +#ifdef CONFIG_PPC_8xx static inline void _tlbil_all(void) { asm volatile ("sync; tlbia; isync" : : : "memory"); + trace_tlbia(MMU_NO_CONTEXT); } static inline void _tlbil_pid(unsigned int pid) { asm volatile ("sync; tlbia; isync" : : : "memory"); + trace_tlbia(pid); } #define _tlbil_pid_noind(pid) _tlbil_pid(pid) -#else /* CONFIG_40x || CONFIG_8xx */ +#else /* CONFIG_PPC_8xx */ extern void _tlbil_all(void); extern void _tlbil_pid(unsigned int pid); -#ifdef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_BOOK3E_64 extern void _tlbil_pid_noind(unsigned int pid); #else #define _tlbil_pid_noind(pid) _tlbil_pid(pid) #endif -#endif /* !(CONFIG_40x || CONFIG_8xx) */ +#endif /* !CONFIG_PPC_8xx */ /* * On 8xx, we directly inline tlbie, on others, it's extern */ -#ifdef CONFIG_8xx +#ifdef CONFIG_PPC_8xx static inline void _tlbil_va(unsigned long address, unsigned int pid, unsigned int tsize, unsigned int ind) { asm volatile ("tlbie %0; sync" : : "r" (address) : "memory"); + trace_tlbie(0, 0, address, pid, 0, 0, 0); } -#elif defined(CONFIG_PPC_BOOK3E) +#elif defined(CONFIG_PPC_BOOK3E_64) extern void _tlbil_va(unsigned long address, unsigned int pid, unsigned int tsize, unsigned int ind); #else @@ -67,9 +65,9 @@ static inline void _tlbil_va(unsigned long address, unsigned int pid, { __tlbil_va(address, pid); } -#endif /* CONIFG_8xx */ +#endif /* CONFIG_PPC_8xx */ -#if defined(CONFIG_PPC_BOOK3E) || defined(CONFIG_PPC_47x) +#if defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_47x) extern void _tlbivax_bcast(unsigned long address, unsigned int pid, unsigned int tsize, unsigned int ind); #else @@ -80,76 +78,62 @@ static inline void _tlbivax_bcast(unsigned long address, unsigned int pid, } #endif -#else /* CONFIG_PPC_MMU_NOHASH */ - -extern void hash_preload(struct mm_struct *mm, unsigned long ea, - unsigned long access, unsigned long trap); +static inline void print_system_hash_info(void) {} +#else /* CONFIG_PPC_MMU_NOHASH */ -extern void _tlbie(unsigned long address); -extern void _tlbia(void); +void print_system_hash_info(void); #endif /* CONFIG_PPC_MMU_NOHASH */ #ifdef CONFIG_PPC32 extern void mapin_ram(void); -extern int map_page(unsigned long va, phys_addr_t pa, int flags); extern void setbat(int index, unsigned long virt, phys_addr_t phys, - unsigned int size, int flags); + unsigned int size, pgprot_t prot); -extern int __map_without_bats; -extern int __allow_ioremap_reserved; -extern unsigned long ioremap_base; -extern unsigned int rtas_data, rtas_size; - -struct hash_pte; -extern struct hash_pte *Hash, *Hash_end; -extern unsigned long Hash_size, Hash_mask; +extern u8 early_hash[]; #endif /* CONFIG_PPC32 */ -#ifdef CONFIG_PPC64 -extern int map_kernel_page(unsigned long ea, unsigned long pa, int flags); -#endif /* CONFIG_PPC64 */ - -extern unsigned long ioremap_bot; extern unsigned long __max_low_memory; -extern phys_addr_t __initial_memory_limit_addr; extern 
phys_addr_t total_memory; extern phys_addr_t total_lowmem; extern phys_addr_t memstart_addr; extern phys_addr_t lowmem_end_addr; -#ifdef CONFIG_WII -extern unsigned long wii_hole_start; -extern unsigned long wii_hole_size; - -extern unsigned long wii_mmu_mapin_mem2(unsigned long top); -extern void wii_memory_fixups(void); -#endif - /* ...and now those things that may be slightly different between processor * architectures. -- Dan */ -#if defined(CONFIG_8xx) -#define MMU_init_hw() do { } while(0) -#define mmu_mapin_ram(top) (0UL) - -#elif defined(CONFIG_4xx) +#ifdef CONFIG_PPC32 extern void MMU_init_hw(void); -extern unsigned long mmu_mapin_ram(unsigned long top); +void MMU_init_hw_patch(void); +unsigned long mmu_mapin_ram(unsigned long base, unsigned long top); +#endif +void mmu_init_secondary(int cpu); -#elif defined(CONFIG_PPC_FSL_BOOK3E) -extern unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx); -extern unsigned long calc_cam_sz(unsigned long ram, unsigned long virt, - phys_addr_t phys); +#ifdef CONFIG_PPC_E500 +extern unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx, + bool dryrun, bool init); #ifdef CONFIG_PPC32 -extern void MMU_init_hw(void); -extern unsigned long mmu_mapin_ram(unsigned long top); extern void adjust_total_lowmem(void); +extern int switch_to_as1(void); +extern void restore_to_as0(int esel, int offset, void *dt_ptr, int bootcpu); +void create_kaslr_tlb_entry(int entry, unsigned long virt, phys_addr_t phys); +void reloc_kernel_entry(void *fdt, int addr); +void relocate_init(u64 dt_ptr, phys_addr_t start); +extern int is_second_reloc; #endif extern void loadcam_entry(unsigned int index); +extern void loadcam_multi(int first_idx, int num, int tmp_idx); + +#ifdef CONFIG_RANDOMIZE_BASE +void kaslr_early_init(void *dt_ptr, phys_addr_t size); +void kaslr_late_init(void); +#else +static inline void kaslr_early_init(void *dt_ptr, phys_addr_t size) {} +static inline void kaslr_late_init(void) {} +#endif struct tlbcam { u32 MAS0; @@ -158,8 +142,43 @@ struct tlbcam { u32 MAS3; u32 MAS7; }; -#elif defined(CONFIG_PPC32) -/* anything 32-bit except 4xx or 8xx */ -extern void MMU_init_hw(void); -extern unsigned long mmu_mapin_ram(unsigned long top); + +#define NUM_TLBCAMS 64 + +extern struct tlbcam TLBCAM[NUM_TLBCAMS]; +#endif + +#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_PPC_85xx) || defined(CONFIG_PPC_8xx) +/* 6xx have BATS */ +/* PPC_85xx have TLBCAM */ +/* 8xx have LTLB */ +phys_addr_t v_block_mapped(unsigned long va); +unsigned long p_block_mapped(phys_addr_t pa); +#else +static inline phys_addr_t v_block_mapped(unsigned long va) { return 0; } +static inline unsigned long p_block_mapped(phys_addr_t pa) { return 0; } +#endif + +#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_PPC_8xx) || defined(CONFIG_PPC_E500) +int mmu_mark_initmem_nx(void); +int mmu_mark_rodata_ro(void); +#else +static inline int mmu_mark_initmem_nx(void) { return 0; } +static inline int mmu_mark_rodata_ro(void) { return 0; } #endif + +#ifdef CONFIG_PPC_8xx +void __init mmu_mapin_immr(void); +#endif + +static inline bool debug_pagealloc_enabled_or_kfence(void) +{ + return IS_ENABLED(CONFIG_KFENCE) || debug_pagealloc_enabled(); +} + +#ifdef CONFIG_MEMORY_HOTPLUG +int create_section_mapping(unsigned long start, unsigned long end, + int nid, pgprot_t prot); +#endif + +int hash__kernel_map_pages(struct page *page, int numpages, int enable); diff --git a/arch/powerpc/mm/44x_mmu.c b/arch/powerpc/mm/nohash/44x.c index 82b1ff759e26..6d10c6d8be71 100644 --- 
a/arch/powerpc/mm/44x_mmu.c +++ b/arch/powerpc/mm/nohash/44x.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Modifications by Matt Porter (mporter@mvista.com) to support * PPC44x Book E processors. @@ -15,12 +16,6 @@ * * Derived from "arch/i386/mm/init.c" * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * */ #include <linux/init.h> @@ -29,8 +24,10 @@ #include <asm/mmu.h> #include <asm/page.h> #include <asm/cacheflush.h> +#include <asm/text-patching.h> +#include <asm/smp.h> -#include "mmu_decl.h" +#include <mm/mmu_decl.h> /* Used by the 44x TLB replacement exception handler. * Just needed it declared someplace. @@ -41,24 +38,15 @@ int icache_44x_need_flush; unsigned long tlb_47x_boltmap[1024/8]; -static void ppc44x_update_tlb_hwater(void) +static void __init ppc44x_update_tlb_hwater(void) { - extern unsigned int tlb_44x_patch_hwater_D[]; - extern unsigned int tlb_44x_patch_hwater_I[]; - /* The TLB miss handlers hard-code the watermark in a cmpli * instruction to improve performance rather than loading it * from the global variable. Thus, we patch the instructions * in the 2 TLB miss handlers when updating the value */ - tlb_44x_patch_hwater_D[0] = (tlb_44x_patch_hwater_D[0] & 0xffff0000) | - tlb_44x_hwater; - flush_icache_range((unsigned long)&tlb_44x_patch_hwater_D[0], - (unsigned long)&tlb_44x_patch_hwater_D[1]); - tlb_44x_patch_hwater_I[0] = (tlb_44x_patch_hwater_I[0] & 0xffff0000) | - tlb_44x_hwater; - flush_icache_range((unsigned long)&tlb_44x_patch_hwater_I[0], - (unsigned long)&tlb_44x_patch_hwater_I[1]); + modify_instruction_site(&patch__tlb_44x_hwater_D, 0xffff, tlb_44x_hwater); + modify_instruction_site(&patch__tlb_44x_hwater_I, 0xffff, tlb_44x_hwater); } /* @@ -134,7 +122,7 @@ static void __init ppc47x_update_boltmap(void) /* * "Pins" a 256MB TLB entry in AS0 for kernel lowmem for 47x type MMU */ -static void ppc47x_pin_tlb(unsigned int virt, unsigned int phys) +static void __init ppc47x_pin_tlb(unsigned int virt, unsigned int phys) { unsigned int rA; int bolted; @@ -178,7 +166,7 @@ void __init MMU_init_hw(void) flush_instruction_cache(); } -unsigned long __init mmu_mapin_ram(unsigned long top) +unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) { unsigned long addr; unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1); @@ -229,7 +217,7 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base, } #ifdef CONFIG_SMP -void mmu_init_secondary(int cpu) +void __init mmu_init_secondary(int cpu) { unsigned long addr; unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1); diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c new file mode 100644 index 000000000000..ab1505cf42bf --- /dev/null +++ b/arch/powerpc/mm/nohash/8xx.c @@ -0,0 +1,224 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * This file contains the routines for initializing the MMU + * on the 8xx series of chips. 
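The modify_instruction_site() conversion in the 44x hunk above is behaviour-preserving; conceptually the helper performs the same read-modify-write plus icache flush that the removed open-coded sequence did (a sketch of its effect, not the helper's actual body):

	/* modify_instruction_site(site, clr, set), roughly: */
	insn = (insn & ~clr) | set;	/* clr = 0xffff here: keep the opcode, swap
					 * in the new 16-bit cmpli immediate */
	patch_instruction(addr, insn);	/* also takes care of the icache flush */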
+ * -- christophe + * + * Derived from arch/powerpc/mm/40x_mmu.c: + */ + +#include <linux/memblock.h> +#include <linux/hugetlb.h> + +#include <asm/fixmap.h> +#include <asm/pgalloc.h> + +#include <mm/mmu_decl.h> + +#define IMMR_SIZE (FIX_IMMR_SIZE << PAGE_SHIFT) + +static unsigned long block_mapped_ram; + +/* + * Return PA for this VA if it is in an area mapped with LTLBs or fixmap. + * Otherwise, returns 0 + */ +phys_addr_t v_block_mapped(unsigned long va) +{ + unsigned long p = PHYS_IMMR_BASE; + + if (va >= VIRT_IMMR_BASE && va < VIRT_IMMR_BASE + IMMR_SIZE) + return p + va - VIRT_IMMR_BASE; + if (va >= PAGE_OFFSET && va < PAGE_OFFSET + block_mapped_ram) + return __pa(va); + return 0; +} + +/* + * Return VA for a given PA mapped with LTLBs or fixmap + * Return 0 if not mapped + */ +unsigned long p_block_mapped(phys_addr_t pa) +{ + unsigned long p = PHYS_IMMR_BASE; + + if (pa >= p && pa < p + IMMR_SIZE) + return VIRT_IMMR_BASE + pa - p; + if (pa < block_mapped_ram) + return (unsigned long)__va(pa); + return 0; +} + +static int __ref __early_map_kernel_hugepage(unsigned long va, phys_addr_t pa, + pgprot_t prot, int psize, bool new) +{ + pmd_t *pmdp = pmd_off_k(va); + pte_t *ptep; + unsigned int shift = mmu_psize_to_shift(psize); + + if (new) { + if (WARN_ON(slab_is_available())) + return -EINVAL; + + if (psize == MMU_PAGE_8M) { + if (WARN_ON(!pmd_none(*pmdp) || !pmd_none(*(pmdp + 1)))) + return -EINVAL; + + ptep = early_alloc_pgtable(PTE_FRAG_SIZE); + pmd_populate_kernel(&init_mm, pmdp, ptep); + + ptep = early_alloc_pgtable(PTE_FRAG_SIZE); + pmd_populate_kernel(&init_mm, pmdp + 1, ptep); + + ptep = (pte_t *)pmdp; + } else { + ptep = early_pte_alloc_kernel(pmdp, va); + /* The PTE should never be already present */ + if (WARN_ON(pte_present(*ptep) && pgprot_val(prot))) + return -EINVAL; + } + } else { + if (psize == MMU_PAGE_8M) + ptep = (pte_t *)pmdp; + else + ptep = pte_offset_kernel(pmdp, va); + } + + if (WARN_ON(!ptep)) + return -ENOMEM; + + set_huge_pte_at(&init_mm, va, ptep, + arch_make_huge_pte(pfn_pte(pa >> PAGE_SHIFT, prot), shift, 0), + 1UL << shift); + + return 0; +} + +/* + * MMU_init_hw does the chip-specific initialization of the MMU hardware. 
+ */ +void __init MMU_init_hw(void) +{ +} + +static bool immr_is_mapped __initdata; + +void __init mmu_mapin_immr(void) +{ + if (immr_is_mapped) + return; + + immr_is_mapped = true; + + __early_map_kernel_hugepage(VIRT_IMMR_BASE, PHYS_IMMR_BASE, + PAGE_KERNEL_NCG, MMU_PAGE_512K, true); +} + +static int mmu_mapin_ram_chunk(unsigned long offset, unsigned long top, + pgprot_t prot, bool new) +{ + unsigned long v = PAGE_OFFSET + offset; + unsigned long p = offset; + int err = 0; + + WARN_ON(!IS_ALIGNED(offset, SZ_16K) || !IS_ALIGNED(top, SZ_16K)); + + for (; p < ALIGN(p, SZ_512K) && p < top && !err; p += SZ_16K, v += SZ_16K) + err = __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_16K, new); + for (; p < ALIGN(p, SZ_8M) && p < top && !err; p += SZ_512K, v += SZ_512K) + err = __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_512K, new); + for (; p < ALIGN_DOWN(top, SZ_8M) && p < top && !err; p += SZ_8M, v += SZ_8M) + err = __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_8M, new); + for (; p < ALIGN_DOWN(top, SZ_512K) && p < top && !err; p += SZ_512K, v += SZ_512K) + err = __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_512K, new); + for (; p < ALIGN_DOWN(top, SZ_16K) && p < top && !err; p += SZ_16K, v += SZ_16K) + err = __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_16K, new); + + if (!new) + flush_tlb_kernel_range(PAGE_OFFSET + v, PAGE_OFFSET + top); + + return err; +} + +unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) +{ + unsigned long etext8 = ALIGN(__pa(_etext), SZ_8M); + unsigned long sinittext = __pa(_sinittext); + bool strict_boundary = strict_kernel_rwx_enabled() || debug_pagealloc_enabled_or_kfence(); + unsigned long boundary = strict_boundary ? sinittext : etext8; + unsigned long einittext8 = ALIGN(__pa(_einittext), SZ_8M); + + WARN_ON(top < einittext8); + + mmu_mapin_immr(); + + mmu_mapin_ram_chunk(0, boundary, PAGE_KERNEL_X, true); + if (debug_pagealloc_enabled_or_kfence()) { + top = boundary; + } else { + mmu_mapin_ram_chunk(boundary, einittext8, PAGE_KERNEL_X, true); + mmu_mapin_ram_chunk(einittext8, top, PAGE_KERNEL, true); + } + + if (top > SZ_32M) + memblock_set_current_limit(top); + + block_mapped_ram = top; + + return top; +} + +int mmu_mark_initmem_nx(void) +{ + unsigned long etext8 = ALIGN(__pa(_etext), SZ_8M); + unsigned long sinittext = __pa(_sinittext); + unsigned long boundary = strict_kernel_rwx_enabled() ? 
sinittext : etext8; + unsigned long einittext8 = ALIGN(__pa(_einittext), SZ_8M); + int err = 0; + + if (!debug_pagealloc_enabled_or_kfence()) + err = mmu_mapin_ram_chunk(boundary, einittext8, PAGE_KERNEL, false); + + if (IS_ENABLED(CONFIG_PIN_TLB_TEXT)) + mmu_pin_tlb(block_mapped_ram, false); + + return err; +} + +#ifdef CONFIG_STRICT_KERNEL_RWX +int mmu_mark_rodata_ro(void) +{ + unsigned long sinittext = __pa(_sinittext); + int err; + + err = mmu_mapin_ram_chunk(0, sinittext, PAGE_KERNEL_ROX, false); + if (IS_ENABLED(CONFIG_PIN_TLB_DATA)) + mmu_pin_tlb(block_mapped_ram, true); + + return err; +} +#endif + +void __init setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + /* We don't currently support the first MEMBLOCK not mapping 0 + * physical on those processors + */ + BUG_ON(first_memblock_base != 0); + + /* 8xx can only access 32MB at the moment */ + memblock_set_current_limit(min_t(u64, first_memblock_size, SZ_32M)); + + BUILD_BUG_ON(ALIGN_DOWN(MODULES_VADDR, PGDIR_SIZE) < TASK_SIZE); +} + +int pud_clear_huge(pud_t *pud) +{ + return 0; +} + +int pmd_clear_huge(pmd_t *pmd) +{ + return 0; +} diff --git a/arch/powerpc/mm/nohash/Makefile b/arch/powerpc/mm/nohash/Makefile new file mode 100644 index 000000000000..cf60c776c883 --- /dev/null +++ b/arch/powerpc/mm/nohash/Makefile @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-y += mmu_context.o tlb.o tlb_low.o kup.o +obj-$(CONFIG_PPC_BOOK3E_64) += tlb_64e.o tlb_low_64e.o book3e_pgtable.o +obj-$(CONFIG_44x) += 44x.o +obj-$(CONFIG_PPC_8xx) += 8xx.o +obj-$(CONFIG_PPC_E500) += e500.o +obj-$(CONFIG_RANDOMIZE_BASE) += kaslr_booke.o +ifdef CONFIG_HUGETLB_PAGE +obj-$(CONFIG_PPC_E500) += e500_hugetlbpage.o +endif + +# Disable kcov instrumentation on sensitive code +# This is necessary for booting with kcov enabled on book3e machines +KCOV_INSTRUMENT_tlb.o := n +KCOV_INSTRUMENT_e500.o := n diff --git a/arch/powerpc/mm/nohash/book3e_pgtable.c b/arch/powerpc/mm/nohash/book3e_pgtable.c new file mode 100644 index 000000000000..062e8785c1bb --- /dev/null +++ b/arch/powerpc/mm/nohash/book3e_pgtable.c @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2005, Paul Mackerras, IBM Corporation. + * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation. + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. + */ + +#include <linux/sched.h> +#include <linux/memblock.h> +#include <asm/pgalloc.h> +#include <asm/tlb.h> +#include <asm/dma.h> +#include <asm/text-patching.h> + +#include <mm/mmu_decl.h> + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +/* + * On Book3E CPUs, the vmemmap is currently mapped in the top half of + * the vmalloc space using normal page tables, though the size of + * pages encoded in the PTEs can be different + */ +int __meminit vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys) +{ + /* Create a PTE encoding without page size */ + unsigned long i, flags = _PAGE_PRESENT | _PAGE_ACCESSED | + _PAGE_KERNEL_RW; + + /* PTEs only contain page size encodings up to 32M */ + BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].shift - 10 > 0xf); + + /* Encode the size in the PTE */ + flags |= (mmu_psize_defs[mmu_vmemmap_psize].shift - 10) << 8; + + /* For each PTE for that area, map things. 
Note that we don't + * increment phys because all PTEs are of the large size and + * thus must have the low bits clear + */ + for (i = 0; i < page_size; i += PAGE_SIZE) + BUG_ON(map_kernel_page(start + i, phys, __pgprot(flags))); + + return 0; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +void vmemmap_remove_mapping(unsigned long start, + unsigned long page_size) +{ +} +#endif +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + +static void __init *early_alloc_pgtable(unsigned long size) +{ + void *ptr; + + ptr = memblock_alloc_try_nid(size, size, MEMBLOCK_LOW_LIMIT, + __pa(MAX_DMA_ADDRESS), NUMA_NO_NODE); + + if (!ptr) + panic("%s: Failed to allocate %lu bytes align=0x%lx max_addr=%lx\n", + __func__, size, size, __pa(MAX_DMA_ADDRESS)); + + return ptr; +} + +/* + * map_kernel_page is currently only called by __ioremap + * map_kernel_page adds an entry to the ioremap page table + * and adds an entry to the HPT, possibly bolting it + */ +int __ref map_kernel_page(unsigned long ea, phys_addr_t pa, pgprot_t prot) +{ + pgd_t *pgdp; + p4d_t *p4dp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + BUILD_BUG_ON(TASK_SIZE_USER64 > PGTABLE_RANGE); + if (slab_is_available()) { + pgdp = pgd_offset_k(ea); + p4dp = p4d_offset(pgdp, ea); + pudp = pud_alloc(&init_mm, p4dp, ea); + if (!pudp) + return -ENOMEM; + pmdp = pmd_alloc(&init_mm, pudp, ea); + if (!pmdp) + return -ENOMEM; + ptep = pte_alloc_kernel(pmdp, ea); + if (!ptep) + return -ENOMEM; + } else { + pgdp = pgd_offset_k(ea); + p4dp = p4d_offset(pgdp, ea); + if (p4d_none(*p4dp)) { + pudp = early_alloc_pgtable(PUD_TABLE_SIZE); + p4d_populate(&init_mm, p4dp, pudp); + } + pudp = pud_offset(p4dp, ea); + if (pud_none(*pudp)) { + pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); + pud_populate(&init_mm, pudp, pmdp); + } + pmdp = pmd_offset(pudp, ea); + if (!pmd_present(*pmdp)) { + ptep = early_alloc_pgtable(PTE_TABLE_SIZE); + pmd_populate_kernel(&init_mm, pmdp, ptep); + } + ptep = pte_offset_kernel(pmdp, ea); + } + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot)); + + smp_wmb(); + return 0; +} + +void __patch_exception(int exc, unsigned long addr) +{ + unsigned int *ibase = &interrupt_base_book3e; + + /* + * Our exception vectors start with a NOP and -then- a branch + * to deal with single stepping from userspace which stops on + * the second instruction. Thus we need to patch the second + * instruction of the exception, not the first one. + */ + + patch_branch(ibase + (exc / 4) + 1, addr, 0); +} diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/nohash/e500.c index 07ba45b0f07c..266fb22131fc 100644 --- a/arch/powerpc/mm/fsl_booke_mmu.c +++ b/arch/powerpc/mm/nohash/e500.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Modifications by Kumar Gala (galak@kernel.crashing.org) to support * E500 Book E processors. @@ -17,12 +18,6 @@ * * Derived from "arch/i386/mm/init.c" * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
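map_kernel_page() above serves both early-boot and post-slab callers through its two allocation paths. A typical early use is bolting a device register window into the kernel page tables before the buddy allocator is up (the physical address and the uncached-guarded pgprot are illustrative):

	int rc;

	/* map a hypothetical 4K register window, uncached and guarded */
	rc = map_kernel_page(ioremap_bot, 0xf8000000ULL, PAGE_KERNEL_NCG);
	if (rc)
		pr_err("bolting device window failed: %d\n", rc);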
- * */ #include <linux/signal.h> @@ -41,42 +36,34 @@ #include <linux/delay.h> #include <linux/highmem.h> #include <linux/memblock.h> +#include <linux/of_fdt.h> -#include <asm/pgalloc.h> -#include <asm/prom.h> #include <asm/io.h> #include <asm/mmu_context.h> -#include <asm/pgtable.h> #include <asm/mmu.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/smp.h> #include <asm/machdep.h> #include <asm/setup.h> +#include <asm/paca.h> -#include "mmu_decl.h" +#include <mm/mmu_decl.h> unsigned int tlbcam_index; -#define NUM_TLBCAMS (64) struct tlbcam TLBCAM[NUM_TLBCAMS]; -struct tlbcamrange { +static struct { unsigned long start; unsigned long limit; phys_addr_t phys; } tlbcam_addrs[NUM_TLBCAMS]; -extern unsigned int tlbcam_index; - -unsigned long tlbcam_sz(int idx) -{ - return tlbcam_addrs[idx].limit - tlbcam_addrs[idx].start + 1; -} - +#ifdef CONFIG_PPC_85xx /* * Return PA for this VA if it is mapped by a CAM, or 0 */ -phys_addr_t v_mapped_by_tlbcam(unsigned long va) +phys_addr_t v_block_mapped(unsigned long va) { int b; for (b = 0; b < tlbcam_index; ++b) @@ -88,7 +75,7 @@ phys_addr_t v_mapped_by_tlbcam(unsigned long va) /* * Return VA for a given PA or 0 if not mapped */ -unsigned long p_mapped_by_tlbcam(phys_addr_t pa) +unsigned long p_block_mapped(phys_addr_t pa) { int b; for (b = 0; b < tlbcam_index; ++b) @@ -98,6 +85,7 @@ unsigned long p_mapped_by_tlbcam(phys_addr_t pa) return tlbcam_addrs[b].start+(pa-tlbcam_addrs[b].phys); return 0; } +#endif /* * Set up a variable-size TLB entry (tlbcam). The parameters are not checked; @@ -113,7 +101,7 @@ static void settlbcam(int index, unsigned long virt, phys_addr_t phys, tsize = __ilog2(size) - 10; -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PPC_E500MC) if ((flags & _PAGE_NO_CACHE) == 0) flags |= _PAGE_COHERENT; #endif @@ -128,26 +116,27 @@ static void settlbcam(int index, unsigned long virt, phys_addr_t phys, TLBCAM[index].MAS2 |= (flags & _PAGE_GUARDED) ? MAS2_G : 0; TLBCAM[index].MAS2 |= (flags & _PAGE_ENDIAN) ? MAS2_E : 0; - TLBCAM[index].MAS3 = (phys & MAS3_RPN) | MAS3_SX | MAS3_SR; - TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_SW : 0); + TLBCAM[index].MAS3 = (phys & MAS3_RPN) | MAS3_SR; + TLBCAM[index].MAS3 |= (flags & _PAGE_WRITE) ? MAS3_SW : 0; if (mmu_has_feature(MMU_FTR_BIG_PHYS)) TLBCAM[index].MAS7 = (u64)phys >> 32; /* Below is unlikely -- only for large user pages or similar */ - if (pte_user(flags)) { - TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR; - TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_UW : 0); + if (!is_kernel_addr(virt)) { + TLBCAM[index].MAS3 |= MAS3_UR; + TLBCAM[index].MAS3 |= (flags & _PAGE_EXEC) ? MAS3_UX : 0; + TLBCAM[index].MAS3 |= (flags & _PAGE_WRITE) ? MAS3_UW : 0; + } else { + TLBCAM[index].MAS3 |= (flags & _PAGE_EXEC) ? 
MAS3_SX : 0; } tlbcam_addrs[index].start = virt; tlbcam_addrs[index].limit = virt + size - 1; tlbcam_addrs[index].phys = phys; - - loadcam_entry(index); } -unsigned long calc_cam_sz(unsigned long ram, unsigned long virt, - phys_addr_t phys) +static unsigned long calc_cam_sz(unsigned long ram, unsigned long virt, + phys_addr_t phys) { unsigned int camsize = __ilog2(ram); unsigned int align = __ffs(virt | phys); @@ -171,41 +160,96 @@ unsigned long calc_cam_sz(unsigned long ram, unsigned long virt, return 1UL << camsize; } -unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx) +static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt, + unsigned long ram, int max_cam_idx, + bool dryrun, bool init) { int i; - unsigned long virt = PAGE_OFFSET; - phys_addr_t phys = memstart_addr; unsigned long amount_mapped = 0; + unsigned long boundary; + + if (strict_kernel_rwx_enabled()) + boundary = (unsigned long)(_sinittext - _stext); + else + boundary = ram; /* Calculate CAM values */ - for (i = 0; ram && i < max_cam_idx; i++) { + for (i = 0; boundary && i < max_cam_idx; i++) { + unsigned long cam_sz; + pgprot_t prot = init ? PAGE_KERNEL_X : PAGE_KERNEL_ROX; + + cam_sz = calc_cam_sz(boundary, virt, phys); + if (!dryrun) + settlbcam(i, virt, phys, cam_sz, pgprot_val(prot), 0); + + boundary -= cam_sz; + amount_mapped += cam_sz; + virt += cam_sz; + phys += cam_sz; + } + for (ram -= amount_mapped; ram && i < max_cam_idx; i++) { unsigned long cam_sz; + pgprot_t prot = init ? PAGE_KERNEL_X : PAGE_KERNEL; cam_sz = calc_cam_sz(ram, virt, phys); - settlbcam(i, virt, phys, cam_sz, PAGE_KERNEL_X, 0); + if (!dryrun) + settlbcam(i, virt, phys, cam_sz, pgprot_val(prot), 0); ram -= cam_sz; amount_mapped += cam_sz; virt += cam_sz; phys += cam_sz; } - tlbcam_index = i; + + if (dryrun) + return amount_mapped; + + if (init) { + loadcam_multi(0, i, max_cam_idx); + tlbcam_index = i; + } else { + loadcam_multi(0, i, 0); + WARN_ON(i > tlbcam_index); + } + +#ifdef CONFIG_PPC64 + get_paca()->tcd.esel_next = i; + get_paca()->tcd.esel_max = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; + get_paca()->tcd.esel_first = i; +#endif return amount_mapped; } +unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx, bool dryrun, bool init) +{ + unsigned long virt = PAGE_OFFSET; + phys_addr_t phys = memstart_addr; + + return map_mem_in_cams_addr(phys, virt, ram, max_cam_idx, dryrun, init); +} + #ifdef CONFIG_PPC32 #if defined(CONFIG_LOWMEM_CAM_NUM_BOOL) && (CONFIG_LOWMEM_CAM_NUM >= NUM_TLBCAMS) #error "LOWMEM_CAM_NUM must be less than NUM_TLBCAMS" #endif -unsigned long __init mmu_mapin_ram(unsigned long top) +unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) { return tlbcam_addrs[tlbcam_index - 1].limit - PAGE_OFFSET + 1; } +void flush_instruction_cache(void) +{ + unsigned long tmp; + + tmp = mfspr(SPRN_L1CSR1); + tmp |= L1CSR1_ICFI | L1CSR1_ICLFR; + mtspr(SPRN_L1CSR1, tmp); + isync(); +} + /* * MMU_init_hw does the chip-specific initialization of the MMU hardware. 
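A worked example makes the CAM sizing above concrete. calc_cam_sz() picks the largest entry that fits both the remaining RAM and the alignment of virt | phys; on e500v2-class parts the valid entry sizes are powers of four (4K, 16K, ..., 64M, 256M). With illustrative numbers:

	unsigned long mapped;

	/*
	 * For 192 MB of lowmem at PAGE_OFFSET / phys 0, the largest fitting
	 * power-of-four entry is 64 MB, so the loop above settles on three
	 * entries (64M + 64M + 64M) before the remaining size hits zero.
	 */
	mapped = map_mem_in_cams(192 << 20, CONFIG_LOWMEM_CAM_NUM, false, true);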
 */ @@ -214,6 +258,11 @@ void __init MMU_init_hw(void) { flush_instruction_cache(); } +static unsigned long __init tlbcam_sz(int idx) +{ + return tlbcam_addrs[idx].limit - tlbcam_addrs[idx].start + 1; +} + void __init adjust_total_lowmem(void) { unsigned long ram; @@ -222,7 +271,9 @@ void __init adjust_total_lowmem(void) /* adjust lowmem size to __max_low_memory */ ram = min((phys_addr_t)__max_low_memory, (phys_addr_t)total_lowmem); - __max_low_memory = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM); + i = switch_to_as1(); + __max_low_memory = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, false, true); + restore_to_as0(i, 0, NULL, 1); pr_info("Memory CAM mapping: "); for (i = 0; i < tlbcam_index - 1; i++) @@ -233,6 +284,26 @@ memblock_set_current_limit(memstart_addr + __max_low_memory); } +#ifdef CONFIG_STRICT_KERNEL_RWX +int mmu_mark_rodata_ro(void) +{ + unsigned long remapped; + + remapped = map_mem_in_cams(__max_low_memory, CONFIG_LOWMEM_CAM_NUM, false, false); + + if (WARN_ON(__max_low_memory != remapped)) + return -EINVAL; + + return 0; +} +#endif + +int mmu_mark_initmem_nx(void) +{ + /* Everything is done in mmu_mark_rodata_ro() */ + return 0; +} + void setup_initial_memory_limit(phys_addr_t first_memblock_base, phys_addr_t first_memblock_size) { @@ -241,4 +312,68 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base, /* 64M mapped initially according to head_fsl_booke.S */ memblock_set_current_limit(min_t(u64, limit, 0x04000000)); } + +#ifdef CONFIG_RELOCATABLE +int __initdata is_second_reloc; +notrace void __init relocate_init(u64 dt_ptr, phys_addr_t start) +{ + unsigned long base = kernstart_virt_addr; + phys_addr_t size; + + kernstart_addr = start; + if (is_second_reloc) { + virt_phys_offset = PAGE_OFFSET - memstart_addr; + kaslr_late_init(); + return; + } + + /* + * Relocatable kernel support based on processing of dynamic + * relocation entries. Before we get the real memstart_addr, + * we will compute the virt_phys_offset like this: + * virt_phys_offset = stext.run - kernstart_addr + * + * stext.run = (KERNELBASE & ~0x3ffffff) + + * (kernstart_addr & 0x3ffffff) + * When we relocate, we have: + * + * (kernstart_addr & 0x3ffffff) = (stext.run & 0x3ffffff) + * + * hence: + * virt_phys_offset = (KERNELBASE & ~0x3ffffff) - + * (kernstart_addr & ~0x3ffffff) + * + */ + start &= ~0x3ffffff; + base &= ~0x3ffffff; + virt_phys_offset = base - start; + early_get_first_memblock_info(__va(dt_ptr), &size); + /* + * We now get the memstart_addr, then we should check if this + * address is the same as what PAGE_OFFSET maps to now. If + * not, we have to change the map of PAGE_OFFSET to memstart_addr + * and do a second relocation. 
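The masked arithmetic above is easier to check with concrete values. Assuming KERNELBASE = 0xc0000000 and the kernel loaded at physical 0x05000000 (illustrative addresses):

	/*
	 * start &= ~0x3ffffff  ->  0x04000000 (64M-aligned load address)
	 * base  &= ~0x3ffffff  ->  0xc0000000
	 * virt_phys_offset = 0xc0000000 - 0x04000000 = 0xbc000000,
	 * so until the second relocation a virtual address va translates
	 * to the physical address va - 0xbc000000.
	 */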
+ */ + if (start != memstart_addr) { + int n; + long offset = start - memstart_addr; + + is_second_reloc = 1; + n = switch_to_as1(); + /* map a 64M area for the second relocation */ + if (memstart_addr > start) + map_mem_in_cams(0x4000000, CONFIG_LOWMEM_CAM_NUM, + false, true); + else + map_mem_in_cams_addr(start, PAGE_OFFSET + offset, + 0x4000000, CONFIG_LOWMEM_CAM_NUM, + false, true); + restore_to_as0(n, offset, __va(dt_ptr), 1); + /* We should never reach here */ + panic("Relocation error"); + } + + kaslr_early_init(__va(dt_ptr), size); +} +#endif #endif diff --git a/arch/powerpc/mm/nohash/e500_hugetlbpage.c b/arch/powerpc/mm/nohash/e500_hugetlbpage.c new file mode 100644 index 000000000000..a134d28a0e4d --- /dev/null +++ b/arch/powerpc/mm/nohash/e500_hugetlbpage.c @@ -0,0 +1,193 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * PPC Huge TLB Page Support for Book3E MMU + * + * Copyright (C) 2009 David Gibson, IBM Corporation. + * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor + * + */ +#include <linux/mm.h> +#include <linux/hugetlb.h> + +#include <asm/mmu.h> + +#ifdef CONFIG_PPC64 +#include <asm/paca.h> + +static inline int tlb1_next(void) +{ + struct paca_struct *paca = get_paca(); + struct tlb_core_data *tcd; + int this, next; + + tcd = paca->tcd_ptr; + this = tcd->esel_next; + + next = this + 1; + if (next >= tcd->esel_max) + next = tcd->esel_first; + + tcd->esel_next = next; + return this; +} + +static inline void book3e_tlb_lock(void) +{ + struct paca_struct *paca = get_paca(); + unsigned long tmp; + int token = smp_processor_id() + 1; + + /* + * Besides being unnecessary in the absence of SMT, this + * check prevents trying to do lbarx/stbcx. on e5500 which + * doesn't implement either feature. + */ + if (!cpu_has_feature(CPU_FTR_SMT)) + return; + + asm volatile(".machine push;" + ".machine e6500;" + "1: lbarx %0, 0, %1;" + "cmpwi %0, 0;" + "bne 2f;" + "stbcx. %2, 0, %1;" + "bne 1b;" + "b 3f;" + "2: lbzx %0, 0, %1;" + "cmpwi %0, 0;" + "bne 2b;" + "b 1b;" + "3:" + ".machine pop;" + : "=&r" (tmp) + : "r" (&paca->tcd_ptr->lock), "r" (token) + : "memory"); +} + +static inline void book3e_tlb_unlock(void) +{ + struct paca_struct *paca = get_paca(); + + if (!cpu_has_feature(CPU_FTR_SMT)) + return; + + isync(); + paca->tcd_ptr->lock = 0; +} +#else +static inline int tlb1_next(void) +{ + int index, ncams; + + ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; + + index = this_cpu_read(next_tlbcam_idx); + + /* Just round-robin the entries and wrap when we hit the end */ + if (unlikely(index == ncams - 1)) + __this_cpu_write(next_tlbcam_idx, tlbcam_index); + else + __this_cpu_inc(next_tlbcam_idx); + + return index; +} + +static inline void book3e_tlb_lock(void) +{ +} + +static inline void book3e_tlb_unlock(void) +{ +} +#endif + +static inline int book3e_tlb_exists(unsigned long ea, unsigned long pid) +{ + int found = 0; + + mtspr(SPRN_MAS6, pid << 16); + asm volatile( + "tlbsx 0,%1\n" + "mfspr %0,0x271\n" + "srwi %0,%0,31\n" + : "=&r"(found) : "r"(ea)); + + return found; +} + +static void +book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, pte_t pte) +{ + unsigned long mas1, mas2; + u64 mas7_3; + unsigned long psize, tsize, shift; + unsigned long flags; + struct mm_struct *mm; + int index; + + if (unlikely(is_kernel_addr(ea))) + return; + + mm = vma->vm_mm; + + psize = vma_mmu_pagesize(vma); + shift = __ilog2(psize); + tsize = shift - 10; + /* + * We can't be interrupted while we're setting up the MAS + * registers or after we've confirmed that no tlb exists. 
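tlb1_next() above is a plain round-robin over the TLB1 entries left after the bolted ones. With the lowmem CAMs occupying entries 0..3, so esel_first = 4 and esel_max = 64 (illustrative values), successive calls return:

	index = tlb1_next();	/* 4, 5, ..., 63, then wrapping back to 4, so
				 * hugepage entries never evict the bolted CAMs */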
+ */ + local_irq_save(flags); + + book3e_tlb_lock(); + + if (unlikely(book3e_tlb_exists(ea, mm->context.id))) { + book3e_tlb_unlock(); + local_irq_restore(flags); + return; + } + + /* We have to use the CAM(TLB1) on FSL parts for hugepages */ + index = tlb1_next(); + mtspr(SPRN_MAS0, MAS0_ESEL(index) | MAS0_TLBSEL(1)); + + mas1 = MAS1_VALID | MAS1_TID(mm->context.id) | MAS1_TSIZE(tsize); + mas2 = ea & ~((1UL << shift) - 1); + mas2 |= (pte_val(pte) >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK; + mas7_3 = (u64)pte_pfn(pte) << PAGE_SHIFT; + mas7_3 |= (pte_val(pte) >> PTE_BAP_SHIFT) & MAS3_BAP_MASK; + if (!pte_dirty(pte)) + mas7_3 &= ~(MAS3_SW|MAS3_UW); + + mtspr(SPRN_MAS1, mas1); + mtspr(SPRN_MAS2, mas2); + + if (mmu_has_feature(MMU_FTR_BIG_PHYS)) + mtspr(SPRN_MAS7, upper_32_bits(mas7_3)); + mtspr(SPRN_MAS3, lower_32_bits(mas7_3)); + + asm volatile ("tlbwe"); + + book3e_tlb_unlock(); + local_irq_restore(flags); +} + +/* + * This is called at the end of handling a user page fault, when the + * fault has been handled by updating a PTE in the linux page tables. + * + * This must always be called with the pte lock held. + */ +void __update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) +{ + if (is_vm_hugetlb_page(vma)) + book3e_hugetlb_preload(vma, address, *ptep); +} + +void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ + struct hstate *hstate = hstate_file(vma->vm_file); + unsigned long tsize = huge_page_shift(hstate) - 10; + + __flush_tlb_page(vma->vm_mm, vmaddr, tsize, 0); +} diff --git a/arch/powerpc/mm/nohash/kaslr_booke.c b/arch/powerpc/mm/nohash/kaslr_booke.c new file mode 100644 index 000000000000..5e4897daaaea --- /dev/null +++ b/arch/powerpc/mm/nohash/kaslr_booke.c @@ -0,0 +1,395 @@ +// SPDX-License-Identifier: GPL-2.0-only +// +// Copyright (C) 2019 Jason Yan <yanaijie@huawei.com> + +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/stddef.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/memblock.h> +#include <linux/libfdt.h> +#include <linux/crash_reserve.h> +#include <linux/of.h> +#include <linux/of_fdt.h> +#include <asm/cacheflush.h> +#include <asm/kdump.h> +#include <mm/mmu_decl.h> + +struct regions { + unsigned long pa_start; + unsigned long pa_end; + unsigned long kernel_size; + unsigned long dtb_start; + unsigned long dtb_end; + unsigned long initrd_start; + unsigned long initrd_end; + unsigned long crash_start; + unsigned long crash_end; + int reserved_mem; + int reserved_mem_addr_cells; + int reserved_mem_size_cells; +}; + +struct regions __initdata regions; + +static __init void kaslr_get_cmdline(void *fdt) +{ + early_init_dt_scan_chosen(boot_command_line); +} + +static unsigned long __init rotate_xor(unsigned long hash, const void *area, + size_t size) +{ + size_t i; + const unsigned long *ptr = area; + + for (i = 0; i < size / sizeof(hash); i++) { + /* Rotate by an odd number of bits and XOR. */ + hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7); + hash ^= ptr[i]; + } + + return hash; + } + +/* Attempt to create a simple starting entropy. This can make it different for + * every build but it is still not enough. Stronger entropy should + * be added to make it change for every boot. + */ +static unsigned long __init get_boot_seed(void *fdt) +{ + unsigned long hash = 0; + + /* build-specific string for starting entropy. 
*/ + hash = rotate_xor(hash, linux_banner, strlen(linux_banner)); + hash = rotate_xor(hash, fdt, fdt_totalsize(fdt)); + + return hash; +} + +static __init u64 get_kaslr_seed(void *fdt) +{ + int node, len; + fdt64_t *prop; + u64 ret; + + node = fdt_path_offset(fdt, "/chosen"); + if (node < 0) + return 0; + + prop = fdt_getprop_w(fdt, node, "kaslr-seed", &len); + if (!prop || len != sizeof(u64)) + return 0; + + ret = fdt64_to_cpu(*prop); + *prop = 0; + return ret; +} + +static __init bool regions_overlap(u32 s1, u32 e1, u32 s2, u32 e2) +{ + return e1 >= s2 && e2 >= s1; +} + +static __init bool overlaps_reserved_region(const void *fdt, u32 start, + u32 end) +{ + int subnode, len, i; + u64 base, size; + + /* check for overlap with /memreserve/ entries */ + for (i = 0; i < fdt_num_mem_rsv(fdt); i++) { + if (fdt_get_mem_rsv(fdt, i, &base, &size) < 0) + continue; + if (regions_overlap(start, end, base, base + size)) + return true; + } + + if (regions.reserved_mem < 0) + return false; + + /* check for overlap with static reservations in /reserved-memory */ + for (subnode = fdt_first_subnode(fdt, regions.reserved_mem); + subnode >= 0; + subnode = fdt_next_subnode(fdt, subnode)) { + const fdt32_t *reg; + u64 rsv_end; + + len = 0; + reg = fdt_getprop(fdt, subnode, "reg", &len); + while (len >= (regions.reserved_mem_addr_cells + + regions.reserved_mem_size_cells)) { + base = fdt32_to_cpu(reg[0]); + if (regions.reserved_mem_addr_cells == 2) + base = (base << 32) | fdt32_to_cpu(reg[1]); + + reg += regions.reserved_mem_addr_cells; + len -= 4 * regions.reserved_mem_addr_cells; + + size = fdt32_to_cpu(reg[0]); + if (regions.reserved_mem_size_cells == 2) + size = (size << 32) | fdt32_to_cpu(reg[1]); + + reg += regions.reserved_mem_size_cells; + len -= 4 * regions.reserved_mem_size_cells; + + if (base >= regions.pa_end) + continue; + + rsv_end = min(base + size, (u64)U32_MAX); + + if (regions_overlap(start, end, base, rsv_end)) + return true; + } + } + return false; +} + +static __init bool overlaps_region(const void *fdt, u32 start, + u32 end) +{ + if (regions_overlap(start, end, __pa(_stext), __pa(_end))) + return true; + + if (regions_overlap(start, end, regions.dtb_start, + regions.dtb_end)) + return true; + + if (regions_overlap(start, end, regions.initrd_start, + regions.initrd_end)) + return true; + + if (regions_overlap(start, end, regions.crash_start, + regions.crash_end)) + return true; + + return overlaps_reserved_region(fdt, start, end); +} + +static void __init get_crash_kernel(void *fdt, unsigned long size) +{ +#ifdef CONFIG_CRASH_RESERVE + unsigned long long crash_size, crash_base; + int ret; + + ret = parse_crashkernel(boot_command_line, size, &crash_size, + &crash_base, NULL, NULL, NULL); + if (ret != 0 || crash_size == 0) + return; + if (crash_base == 0) + crash_base = KDUMP_KERNELBASE; + + regions.crash_start = (unsigned long)crash_base; + regions.crash_end = (unsigned long)(crash_base + crash_size); + + pr_debug("crash_base=0x%llx crash_size=0x%llx\n", crash_base, crash_size); +#endif +} + +static void __init get_initrd_range(void *fdt) +{ + u64 start, end; + int node, len; + const __be32 *prop; + + node = fdt_path_offset(fdt, "/chosen"); + if (node < 0) + return; + + prop = fdt_getprop(fdt, node, "linux,initrd-start", &len); + if (!prop) + return; + start = of_read_number(prop, len / 4); + + prop = fdt_getprop(fdt, node, "linux,initrd-end", &len); + if (!prop) + return; + end = of_read_number(prop, len / 4); + + regions.initrd_start = (unsigned long)start; + regions.initrd_end = 
(unsigned long)end; + + pr_debug("initrd_start=0x%llx initrd_end=0x%llx\n", start, end); +} + +static __init unsigned long get_usable_address(const void *fdt, + unsigned long start, + unsigned long offset) +{ + unsigned long pa; + unsigned long pa_end; + + for (pa = offset; (long)pa > (long)start; pa -= SZ_16K) { + pa_end = pa + regions.kernel_size; + if (overlaps_region(fdt, pa, pa_end)) + continue; + + return pa; + } + return 0; +} + +static __init void get_cell_sizes(const void *fdt, int node, int *addr_cells, + int *size_cells) +{ + const int *prop; + int len; + + /* + * Retrieve the #address-cells and #size-cells properties + * from the 'node', or use the default if not provided. + */ + *addr_cells = *size_cells = 1; + + prop = fdt_getprop(fdt, node, "#address-cells", &len); + if (len == 4) + *addr_cells = fdt32_to_cpu(*prop); + prop = fdt_getprop(fdt, node, "#size-cells", &len); + if (len == 4) + *size_cells = fdt32_to_cpu(*prop); +} + +static unsigned long __init kaslr_legal_offset(void *dt_ptr, unsigned long index, + unsigned long offset) +{ + unsigned long koffset = 0; + unsigned long start; + + while ((long)index >= 0) { + offset = memstart_addr + index * SZ_64M + offset; + start = memstart_addr + index * SZ_64M; + koffset = get_usable_address(dt_ptr, start, offset); + if (koffset) + break; + index--; + } + + if (koffset != 0) + koffset -= memstart_addr; + + return koffset; +} + +static inline __init bool kaslr_disabled(void) +{ + return strstr(boot_command_line, "nokaslr") != NULL; +} + +static unsigned long __init kaslr_choose_location(void *dt_ptr, phys_addr_t size, + unsigned long kernel_sz) +{ + unsigned long offset, random; + unsigned long ram, linear_sz; + u64 seed; + unsigned long index; + + kaslr_get_cmdline(dt_ptr); + if (kaslr_disabled()) + return 0; + + random = get_boot_seed(dt_ptr); + + seed = get_tb() << 32; + seed ^= get_tb(); + random = rotate_xor(random, &seed, sizeof(seed)); + + /* + * Retrieve (and wipe) the seed from the FDT + */ + seed = get_kaslr_seed(dt_ptr); + if (seed) + random = rotate_xor(random, &seed, sizeof(seed)); + else + pr_warn("KASLR: No safe seed for randomizing the kernel base.\n"); + + ram = min_t(phys_addr_t, __max_low_memory, size); + ram = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, true, true); + linear_sz = min_t(unsigned long, ram, SZ_512M); + + /* If the linear size is smaller than 64M, do not randomize */ + if (linear_sz < SZ_64M) + return 0; + + /* check for a reserved-memory node and record its cell sizes */ + regions.reserved_mem = fdt_path_offset(dt_ptr, "/reserved-memory"); + if (regions.reserved_mem >= 0) + get_cell_sizes(dt_ptr, regions.reserved_mem, + ®ions.reserved_mem_addr_cells, + ®ions.reserved_mem_size_cells); + + regions.pa_start = memstart_addr; + regions.pa_end = memstart_addr + linear_sz; + regions.dtb_start = __pa(dt_ptr); + regions.dtb_end = __pa(dt_ptr) + fdt_totalsize(dt_ptr); + regions.kernel_size = kernel_sz; + + get_initrd_range(dt_ptr); + get_crash_kernel(dt_ptr, ram); + + /* + * Decide which 64M we want to start + * Only use the low 8 bits of the random seed + */ + index = random & 0xFF; + index %= linear_sz / SZ_64M; + + /* Decide offset inside 64M */ + offset = random % (SZ_64M - kernel_sz); + offset = round_down(offset, SZ_16K); + + return kaslr_legal_offset(dt_ptr, index, offset); +} + +/* + * To see if we need to relocate the kernel to a random offset + * void *dt_ptr - address of the device tree + * phys_addr_t size - size of the first memory block + */ +notrace void __init kaslr_early_init(void 
*dt_ptr, phys_addr_t size) +{ + unsigned long tlb_virt; + phys_addr_t tlb_phys; + unsigned long offset; + unsigned long kernel_sz; + + kernel_sz = (unsigned long)_end - (unsigned long)_stext; + + offset = kaslr_choose_location(dt_ptr, size, kernel_sz); + if (offset == 0) + return; + + kernstart_virt_addr += offset; + kernstart_addr += offset; + + is_second_reloc = 1; + + if (offset >= SZ_64M) { + tlb_virt = round_down(kernstart_virt_addr, SZ_64M); + tlb_phys = round_down(kernstart_addr, SZ_64M); + + /* Create kernel map to relocate in */ + create_kaslr_tlb_entry(1, tlb_virt, tlb_phys); + } + + /* Copy the kernel to its new location and run */ + memcpy((void *)kernstart_virt_addr, (void *)_stext, kernel_sz); + flush_icache_range(kernstart_virt_addr, kernstart_virt_addr + kernel_sz); + + reloc_kernel_entry(dt_ptr, kernstart_virt_addr); +} + +void __init kaslr_late_init(void) +{ + /* If randomized, clear the original kernel */ + if (kernstart_virt_addr != KERNELBASE) { + unsigned long kernel_sz; + + kernel_sz = (unsigned long)_end - kernstart_virt_addr; + memzero_explicit((void *)KERNELBASE, kernel_sz); + } +} diff --git a/arch/powerpc/mm/nohash/kup.c b/arch/powerpc/mm/nohash/kup.c new file mode 100644 index 000000000000..c20c4f357fbf --- /dev/null +++ b/arch/powerpc/mm/nohash/kup.c @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * This file contains the routines for initializing kernel userspace protection + */ + +#include <linux/export.h> +#include <linux/init.h> +#include <linux/printk.h> +#include <linux/smp.h> + +#include <asm/kup.h> +#include <asm/smp.h> + +#ifdef CONFIG_PPC_KUAP +void setup_kuap(bool disabled) +{ + if (disabled) { + if (smp_processor_id() == boot_cpuid) + cur_cpu_spec->mmu_features &= ~MMU_FTR_KUAP; + return; + } + + pr_info("Activating Kernel Userspace Access Protection\n"); + + prevent_user_access(KUAP_READ_WRITE); +} +#endif diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/nohash/mmu_context.c index af3d78e19302..28a96a10c907 100644 --- a/arch/powerpc/mm/mmu_context_nohash.c +++ b/arch/powerpc/mm/nohash/mmu_context.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * This file contains the routines for handling the MMU on those * PowerPC implementations where the MMU is not using the hash @@ -9,11 +10,6 @@ * Derived from previous arch/powerpc/mm/mmu_context.c * and arch/powerpc/include/asm/mmu_context.h * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * TODO: * * - The global context lock will not scale very well @@ -25,34 +21,54 @@ * also clear mm->cpu_vm_mask bits when processes are migrated */ -//#define DEBUG_MAP_CONSISTENCY -//#define DEBUG_CLAMP_LAST_CONTEXT 31 -//#define DEBUG_HARDER - -/* We don't use DEBUG because it tends to be compiled in always nowadays - * and this would generate way too much output - */ -#ifdef DEBUG_HARDER -#define pr_hard(args...) printk(KERN_DEBUG args) -#define pr_hardcont(args...) printk(KERN_CONT args) -#else -#define pr_hard(args...) do { } while(0) -#define pr_hardcont(args...) 
do { } while(0) -#endif - #include <linux/kernel.h> #include <linux/mm.h> #include <linux/init.h> #include <linux/spinlock.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/notifier.h> #include <linux/cpu.h> #include <linux/slab.h> #include <asm/mmu_context.h> #include <asm/tlbflush.h> +#include <asm/smp.h> +#include <asm/kup.h> + +#include <mm/mmu_decl.h> + +/* + * Room for two PTE table pointers, usually the kernel and current user + * pointer to their respective root page table (pgdir). + */ +void *abatron_pteptrs[2]; + +/* + * The MPC8xx has only 16 contexts. We rotate through them on each task switch. + * A better way would be to keep track of tasks that own contexts, and implement + * an LRU usage. That way very active tasks don't always have to pay the TLB + * reload overhead. The kernel pages are mapped shared, so the kernel can run on + * behalf of any task that makes a kernel entry. Shared does not mean they are + * not protected, just that the ASID comparison is not performed. -- Dan + * + * The IBM4xx has 256 contexts, so we can just rotate through these as a way of + * "switching" contexts. If the TID of the TLB is zero, the PID/TID comparison + * is disabled, so we can use a TID of zero to represent all kernel pages as + * shared among all contexts. -- Dan + * + * The IBM 47x core supports 16-bit PIDs, thus 65535 contexts. We should + * normally never have to steal though the facility is present if needed. + * -- BenH + */ +#define FIRST_CONTEXT 1 +#if defined(CONFIG_PPC_8xx) +#define LAST_CONTEXT 16 +#elif defined(CONFIG_PPC_47x) +#define LAST_CONTEXT 65535 +#else +#define LAST_CONTEXT 255 +#endif -static unsigned int first_context, last_context; static unsigned int next_context, nr_free_contexts; static unsigned long *context_map; static unsigned long *stale_map[NR_CPUS]; @@ -60,7 +76,7 @@ static struct mm_struct **context_mm; static DEFINE_RAW_SPINLOCK(context_lock); #define CTX_MAP_SIZE \ - (sizeof(unsigned long) * (last_context / BITS_PER_LONG + 1)) + (sizeof(unsigned long) * (LAST_CONTEXT / BITS_PER_LONG + 1)) /* Steal a context from a task that has one at the moment. 
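 * A "context" here is a hardware PID: when the small ID space runs
 * out, we revoke the ID of a live mm and make sure any TLB entries
 * tagged with it are flushed before the ID is reused.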
@@ -78,13 +94,12 @@ static DEFINE_RAW_SPINLOCK(context_lock); * the stale map as we can just flush the local CPU * -- benh */ -#ifdef CONFIG_SMP static unsigned int steal_context_smp(unsigned int id) { struct mm_struct *mm; unsigned int cpu, max, i; - max = last_context - first_context; + max = LAST_CONTEXT - FIRST_CONTEXT; /* Attempt to free next_context first and then loop until we manage */ while (max--) { @@ -96,11 +111,10 @@ static unsigned int steal_context_smp(unsigned int id) */ if (mm->context.active) { id++; - if (id > last_context) - id = first_context; + if (id > LAST_CONTEXT) + id = FIRST_CONTEXT; continue; } - pr_hardcont(" | steal %d from 0x%p", id, mm); /* Mark this mm has having no context anymore */ mm->context.id = MMU_NO_CONTEXT; @@ -131,7 +145,34 @@ static unsigned int steal_context_smp(unsigned int id) /* This will cause the caller to try again */ return MMU_NO_CONTEXT; } -#endif /* CONFIG_SMP */ + +static unsigned int steal_all_contexts(void) +{ + struct mm_struct *mm; + int cpu = smp_processor_id(); + unsigned int id; + + for (id = FIRST_CONTEXT; id <= LAST_CONTEXT; id++) { + /* Pick up the victim mm */ + mm = context_mm[id]; + + /* Mark this mm as having no context anymore */ + mm->context.id = MMU_NO_CONTEXT; + if (id != FIRST_CONTEXT) { + context_mm[id] = NULL; + __clear_bit(id, context_map); + } + if (IS_ENABLED(CONFIG_SMP)) + __clear_bit(id, stale_map[cpu]); + } + + /* Flush the TLB for all contexts (not to be used on SMP) */ + _tlbil_all(); + + nr_free_contexts = LAST_CONTEXT - FIRST_CONTEXT; + + return FIRST_CONTEXT; +} /* Note that this will also be called on SMP if all other CPUs are * offlined, which means that it may be called for cpu != 0. For @@ -146,8 +187,6 @@ static unsigned int steal_context_up(unsigned int id) /* Pick up the victim mm */ mm = context_mm[id]; - pr_hardcont(" | steal %d from 0x%p", id, mm); - /* Flush the TLB for that context */ local_flush_tlb_mm(mm); @@ -155,120 +194,93 @@ static unsigned int steal_context_up(unsigned int id) mm->context.id = MMU_NO_CONTEXT; /* XXX This clear should ultimately be part of local_flush_tlb_mm */ - __clear_bit(id, stale_map[cpu]); + if (IS_ENABLED(CONFIG_SMP)) + __clear_bit(id, stale_map[cpu]); return id; } -#ifdef DEBUG_MAP_CONSISTENCY -static void context_check_map(void) +static void set_context(unsigned long id, pgd_t *pgd) { - unsigned int id, nrf, nact; - - nrf = nact = 0; - for (id = first_context; id <= last_context; id++) { - int used = test_bit(id, context_map); - if (!used) - nrf++; - if (used != (context_mm[id] != NULL)) - pr_err("MMU: Context %d is %s and MM is %p !\n", - id, used ? "used" : "free", context_mm[id]); - if (context_mm[id] != NULL) - nact += context_mm[id]->context.active; - } - if (nrf != nr_free_contexts) { - pr_err("MMU: Free context count out of sync ! (%d vs %d)\n", - nr_free_contexts, nrf); - nr_free_contexts = nrf; + if (IS_ENABLED(CONFIG_PPC_8xx)) { + mtspr(SPRN_M_TWB, __pa(pgd)); + + /* Update context */ + mtspr(SPRN_M_CASID, id - 1); + + /* sync */ + mb(); + } else if (kuap_is_disabled()) { + mtspr(SPRN_PID, id); + isync(); } - if (nact > num_online_cpus()) - pr_err("MMU: More active contexts than CPUs ! 
(%d vs %d)\n", - nact, num_online_cpus()); - if (first_context > 0 && !test_bit(0, context_map)) - pr_err("MMU: Context 0 has been freed !!!\n"); } -#else -static void context_check_map(void) { } -#endif -void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) +void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) { - unsigned int i, id, cpu = smp_processor_id(); + unsigned int id; + unsigned int i, cpu = smp_processor_id(); unsigned long *map; /* No lockless fast path .. yet */ raw_spin_lock(&context_lock); - pr_hard("[%d] activating context for mm @%p, active=%d, id=%d", - cpu, next, next->context.active, next->context.id); - -#ifdef CONFIG_SMP - /* Mark us active and the previous one not anymore */ - next->context.active++; - if (prev) { - pr_hardcont(" (old=0x%p a=%d)", prev, prev->context.active); - WARN_ON(prev->context.active < 1); - prev->context.active--; + if (IS_ENABLED(CONFIG_SMP)) { + /* Mark us active and the previous one not anymore */ + next->context.active++; + if (prev) { + WARN_ON(prev->context.active < 1); + prev->context.active--; + } } again: -#endif /* CONFIG_SMP */ /* If we already have a valid assigned context, skip all that */ id = next->context.id; - if (likely(id != MMU_NO_CONTEXT)) { -#ifdef DEBUG_MAP_CONSISTENCY - if (context_mm[id] != next) - pr_err("MMU: mm 0x%p has id %d but context_mm[%d] says 0x%p\n", - next, id, id, context_mm[id]); -#endif + if (likely(id != MMU_NO_CONTEXT)) goto ctxt_ok; - } /* We really don't have a context, let's try to acquire one */ id = next_context; - if (id > last_context) - id = first_context; + if (id > LAST_CONTEXT) + id = FIRST_CONTEXT; map = context_map; /* No more free contexts, let's try to steal one */ if (nr_free_contexts == 0) { -#ifdef CONFIG_SMP if (num_online_cpus() > 1) { id = steal_context_smp(id); if (id == MMU_NO_CONTEXT) goto again; goto stolen; } -#endif /* CONFIG_SMP */ - id = steal_context_up(id); + if (IS_ENABLED(CONFIG_PPC_8xx)) + id = steal_all_contexts(); + else + id = steal_context_up(id); goto stolen; } nr_free_contexts--; /* We know there's at least one free context, try to find it */ while (__test_and_set_bit(id, map)) { - id = find_next_zero_bit(map, last_context+1, id); - if (id > last_context) - id = first_context; + id = find_next_zero_bit(map, LAST_CONTEXT+1, id); + if (id > LAST_CONTEXT) + id = FIRST_CONTEXT; } stolen: next_context = id + 1; context_mm[id] = next; next->context.id = id; - pr_hardcont(" | new id=%d,nrf=%d", id, nr_free_contexts); - context_check_map(); ctxt_ok: /* If that context got marked stale on this CPU, then flush the * local TLB for it and unmark it before we use it */ - if (test_bit(id, stale_map[cpu])) { - pr_hardcont(" | stale flush %d [%d..%d]", - id, cpu_first_thread_sibling(cpu), - cpu_last_thread_sibling(cpu)); - + if (IS_ENABLED(CONFIG_SMP) && test_bit(id, stale_map[cpu])) { local_flush_tlb_mm(next); /* XXX This clear should ultimately be part of local_flush_tlb_mm */ @@ -280,8 +292,12 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) } /* Flick the MMU and release lock */ - pr_hardcont(" -> %d\n", id); + if (IS_ENABLED(CONFIG_BDI_SWITCH)) + abatron_pteptrs[1] = next->pgd; set_context(id, next->pgd); +#if defined(CONFIG_BOOKE) && defined(CONFIG_PPC_KUAP) + tsk->thread.pid = id; +#endif raw_spin_unlock(&context_lock); } @@ -290,16 +306,9 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) */ int init_new_context(struct task_struct *t, struct mm_struct *mm) { 
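+	/*
+	 * No hardware context is assigned here: a fresh mm starts out as
+	 * MMU_NO_CONTEXT and only receives a real PID on its first
+	 * switch_mmu_context().
+	 */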
- pr_hard("initing context for mm @%p\n", mm); - mm->context.id = MMU_NO_CONTEXT; mm->context.active = 0; - -#ifdef CONFIG_PPC_MM_SLICES - if (slice_mm_new_context(mm)) - slice_set_user_psize(mm, mmu_virtual_psize); -#endif - + pte_frag_set(&mm->context, NULL); return 0; } @@ -321,56 +330,38 @@ void destroy_context(struct mm_struct *mm) if (id != MMU_NO_CONTEXT) { __clear_bit(id, context_map); mm->context.id = MMU_NO_CONTEXT; -#ifdef DEBUG_MAP_CONSISTENCY - mm->context.active = 0; -#endif context_mm[id] = NULL; nr_free_contexts++; } raw_spin_unlock_irqrestore(&context_lock, flags); } -#ifdef CONFIG_SMP - -static int mmu_context_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +static int mmu_ctx_cpu_prepare(unsigned int cpu) { - unsigned int cpu = (unsigned int)(long)hcpu; - /* We don't touch CPU 0 map, it's allocated at aboot and kept * around forever */ if (cpu == boot_cpuid) - return NOTIFY_OK; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - pr_devel("MMU: Allocating stale context map for CPU %d\n", cpu); - stale_map[cpu] = kzalloc(CTX_MAP_SIZE, GFP_KERNEL); - break; -#ifdef CONFIG_HOTPLUG_CPU - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - pr_devel("MMU: Freeing stale context map for CPU %d\n", cpu); - kfree(stale_map[cpu]); - stale_map[cpu] = NULL; - - /* We also clear the cpu_vm_mask bits of CPUs going away */ - clear_tasks_mm_cpumask(cpu); - break; -#endif /* CONFIG_HOTPLUG_CPU */ - } - return NOTIFY_OK; + return 0; + + stale_map[cpu] = kzalloc(CTX_MAP_SIZE, GFP_KERNEL); + return 0; } -static struct notifier_block mmu_context_cpu_nb = { - .notifier_call = mmu_context_cpu_notify, -}; +static int mmu_ctx_cpu_dead(unsigned int cpu) +{ +#ifdef CONFIG_HOTPLUG_CPU + if (cpu == boot_cpuid) + return 0; -#endif /* CONFIG_SMP */ + kfree(stale_map[cpu]); + stale_map[cpu] = NULL; + + /* We also clear the cpu_vm_mask bits of CPUs going away */ + clear_tasks_mm_cpumask(cpu); +#endif + return 0; +} /* * Initialize the context management stuff. @@ -384,76 +375,30 @@ void __init mmu_context_init(void) init_mm.context.active = NR_CPUS; /* - * The MPC8xx has only 16 contexts. We rotate through them on each - * task switch. A better way would be to keep track of tasks that - * own contexts, and implement an LRU usage. That way very active - * tasks don't always have to pay the TLB reload overhead. The - * kernel pages are mapped shared, so the kernel can run on behalf - * of any task that makes a kernel entry. Shared does not mean they - * are not protected, just that the ASID comparison is not performed. - * -- Dan - * - * The IBM4xx has 256 contexts, so we can just rotate through these - * as a way of "switching" contexts. If the TID of the TLB is zero, - * the PID/TID comparison is disabled, so we can use a TID of zero - * to represent all kernel pages as shared among all contexts. - * -- Dan - * - * The IBM 47x core supports 16-bit PIDs, thus 65535 contexts. We - * should normally never have to steal though the facility is - * present if needed. 
- * -- BenH
- */
-	if (mmu_has_feature(MMU_FTR_TYPE_8xx)) {
-		first_context = 0;
-		last_context = 15;
-	} else if (mmu_has_feature(MMU_FTR_TYPE_47x)) {
-		first_context = 1;
-		last_context = 65535;
-	} else
-#ifdef CONFIG_PPC_BOOK3E_MMU
-	if (mmu_has_feature(MMU_FTR_TYPE_3E)) {
-		u32 mmucfg = mfspr(SPRN_MMUCFG);
-		u32 pid_bits = (mmucfg & MMUCFG_PIDSIZE_MASK)
-				>> MMUCFG_PIDSIZE_SHIFT;
-		first_context = 1;
-		last_context = (1UL << (pid_bits + 1)) - 1;
-	} else
-#endif
-	{
-		first_context = 1;
-		last_context = 255;
-	}
-
-#ifdef DEBUG_CLAMP_LAST_CONTEXT
-	last_context = DEBUG_CLAMP_LAST_CONTEXT;
-#endif
-
	/*
	 * Allocate the maps used by context management
	 */
-	context_map = alloc_bootmem(CTX_MAP_SIZE);
-	context_mm = alloc_bootmem(sizeof(void *) * (last_context + 1));
-#ifndef CONFIG_SMP
-	stale_map[0] = alloc_bootmem(CTX_MAP_SIZE);
-#else
-	stale_map[boot_cpuid] = alloc_bootmem(CTX_MAP_SIZE);
-
-	register_cpu_notifier(&mmu_context_cpu_nb);
-#endif
+	context_map = memblock_alloc_or_panic(CTX_MAP_SIZE, SMP_CACHE_BYTES);
+	context_mm = memblock_alloc_or_panic(sizeof(void *) * (LAST_CONTEXT + 1),
+					     SMP_CACHE_BYTES);
+	if (IS_ENABLED(CONFIG_SMP)) {
+		stale_map[boot_cpuid] = memblock_alloc_or_panic(CTX_MAP_SIZE, SMP_CACHE_BYTES);
+		cpuhp_setup_state_nocalls(CPUHP_POWERPC_MMU_CTX_PREPARE,
+					  "powerpc/mmu/ctx:prepare",
+					  mmu_ctx_cpu_prepare, mmu_ctx_cpu_dead);
+	}

	printk(KERN_INFO "MMU: Allocated %zu bytes of context maps for %d contexts\n",
-	       2 * CTX_MAP_SIZE + (sizeof(void *) * (last_context + 1)),
-	       last_context - first_context + 1);
+	       2 * CTX_MAP_SIZE + (sizeof(void *) * (LAST_CONTEXT + 1)),
+	       LAST_CONTEXT - FIRST_CONTEXT + 1);

	/*
	 * Some processors have too few contexts to reserve one for
	 * init_mm, and require using context 0 for a normal task.
	 * Other processors reserve the use of context zero for the kernel.
-	 * This code assumes first_context < 32.
+	 * This code assumes FIRST_CONTEXT < 32.
	 */
-	context_map[0] = (1 << first_context) - 1;
-	next_context = first_context;
-	nr_free_contexts = last_context - first_context + 1;
+	context_map[0] = (1 << FIRST_CONTEXT) - 1;
+	next_context = FIRST_CONTEXT;
+	nr_free_contexts = LAST_CONTEXT - FIRST_CONTEXT + 1;
}
-
diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c
new file mode 100644
index 000000000000..0a650742f3a0
--- /dev/null
+++ b/arch/powerpc/mm/nohash/tlb.c
@@ -0,0 +1,341 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * This file contains the routines for TLB flushing.
+ * On machines where the MMU does not use a hash table to store virtual to
+ * physical translations (ie, SW loaded TLBs or Book3E compliant processors,
+ * this does -not- include 603 however which shares the implementation with
+ * hash based processors)
+ *
+ * -- BenH
+ *
+ * Copyright 2008,2009 Ben Herrenschmidt <benh@kernel.crashing.org>
+ *                     IBM Corp.
+ *
+ * Derived from arch/ppc/mm/init.c:
+ *   Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ * and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ * Copyright (C) 1996 Paul Mackerras
+ *
+ * Derived from "arch/i386/mm/init.c"
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ */
+
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/preempt.h>
+#include <linux/spinlock.h>
+#include <linux/memblock.h>
+#include <linux/of_fdt.h>
+#include <linux/hugetlb.h>
+
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+#include <asm/text-patching.h>
+#include <asm/cputhreads.h>
+#include <asm/hugetlb.h>
+#include <asm/paca.h>
+
+#include <mm/mmu_decl.h>
+
+/*
+ * This struct lists the sw-supported page sizes. The hardware MMU may support
+ * other sizes not listed here. The .ind field is only used on MMUs that have
+ * indirect page table entries.
+ */
+#ifdef CONFIG_PPC_E500
+struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
+	[MMU_PAGE_4K] = {
+		.shift	= 12,
+	},
+	[MMU_PAGE_2M] = {
+		.shift	= 21,
+	},
+	[MMU_PAGE_4M] = {
+		.shift	= 22,
+	},
+	[MMU_PAGE_16M] = {
+		.shift	= 24,
+	},
+	[MMU_PAGE_64M] = {
+		.shift	= 26,
+	},
+	[MMU_PAGE_256M] = {
+		.shift	= 28,
+	},
+	[MMU_PAGE_1G] = {
+		.shift	= 30,
+	},
+};
+
+static inline int mmu_get_tsize(int psize)
+{
+	return mmu_psize_defs[psize].shift - 10;
+}
+#else
+static inline int mmu_get_tsize(int psize)
+{
+	/* This isn't used on !Book3E for now */
+	return 0;
+}
+#endif
+
+#ifdef CONFIG_PPC_8xx
+struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
+	[MMU_PAGE_4K] = {
+		.shift	= 12,
+	},
+	[MMU_PAGE_16K] = {
+		.shift	= 14,
+	},
+	[MMU_PAGE_512K] = {
+		.shift	= 19,
+	},
+	[MMU_PAGE_8M] = {
+		.shift	= 23,
+	},
+};
+#endif
+
+#ifdef CONFIG_PPC_E500
+/* next_tlbcam_idx is used to round-robin tlbcam entry assignment */
+DEFINE_PER_CPU(int, next_tlbcam_idx);
+EXPORT_PER_CPU_SYMBOL(next_tlbcam_idx);
+#endif
+
+/*
+ * Base TLB flushing operations:
+ *
+ *  - flush_tlb_mm(mm) flushes the specified mm context TLB's
+ *  - flush_tlb_page(vma, vmaddr) flushes one page
+ *  - flush_tlb_range(vma, start, end) flushes a range of pages
+ *  - flush_tlb_kernel_range(start, end) flushes kernel pages
+ *
+ *  - local_* variants of page and mm only apply to the current
+ *    processor
+ */
+
+#ifndef CONFIG_PPC_8xx
+/*
+ * These are the base non-SMP variants of page and mm flushing
+ */
+void local_flush_tlb_mm(struct mm_struct *mm)
+{
+	unsigned int pid;
+
+	preempt_disable();
+	pid = mm->context.id;
+	if (pid != MMU_NO_CONTEXT)
+		_tlbil_pid(pid);
+	preempt_enable();
+}
+EXPORT_SYMBOL(local_flush_tlb_mm);
+
+void __local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
+			    int tsize, int ind)
+{
+	unsigned int pid;
+
+	preempt_disable();
+	pid = mm ? mm->context.id : 0;
+	if (pid != MMU_NO_CONTEXT)
+		_tlbil_va(vmaddr, pid, tsize, ind);
+	preempt_enable();
+}
+
+void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+	__local_flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr,
+			       mmu_get_tsize(mmu_virtual_psize), 0);
+}
+EXPORT_SYMBOL(local_flush_tlb_page);
+
+void local_flush_tlb_page_psize(struct mm_struct *mm,
+				unsigned long vmaddr, int psize)
+{
+	__local_flush_tlb_page(mm, vmaddr, mmu_get_tsize(psize), 0);
+}
+EXPORT_SYMBOL(local_flush_tlb_page_psize);
+
+#endif
+
+/*
+ * And here are the SMP non-local implementations
+ */
+#ifdef CONFIG_SMP
+
+static DEFINE_RAW_SPINLOCK(tlbivax_lock);
+
+struct tlb_flush_param {
+	unsigned long addr;
+	unsigned int pid;
+	unsigned int tsize;
+	unsigned int ind;
+};
+
+static void do_flush_tlb_mm_ipi(void *param)
+{
+	struct tlb_flush_param *p = param;
+
+	_tlbil_pid(p ? p->pid : 0);
+}
+
+static void do_flush_tlb_page_ipi(void *param)
+{
+	struct tlb_flush_param *p = param;
+
+	_tlbil_va(p->addr, p->pid, p->tsize, p->ind);
+}
+
+
+/* Note on invalidations and PID:
+ *
+ * We snapshot the PID with preempt disabled. At this point, it can still
+ * change either because:
+ * - our context is being stolen (PID -> NO_CONTEXT) on another CPU
+ * - we are invalidating some target that isn't currently running here
+ *   and is concurrently acquiring a new PID on another CPU
+ * - some other CPU is re-acquiring a lost PID for this mm
+ * etc...
+ *
+ * However, this shouldn't be a problem as we only guarantee
+ * invalidation of TLB entries present prior to this call, so we
+ * don't care about the PID changing, and invalidating a stale PID
+ * is generally harmless.
+ */
+
+void flush_tlb_mm(struct mm_struct *mm)
+{
+	unsigned int pid;
+
+	preempt_disable();
+	pid = mm->context.id;
+	if (unlikely(pid == MMU_NO_CONTEXT))
+		goto no_context;
+	if (!mm_is_core_local(mm)) {
+		struct tlb_flush_param p = { .pid = pid };
+		/* Ignores smp_processor_id() even if set. */
+		smp_call_function_many(mm_cpumask(mm),
+				       do_flush_tlb_mm_ipi, &p, 1);
+	}
+	_tlbil_pid(pid);
+ no_context:
+	preempt_enable();
+}
+EXPORT_SYMBOL(flush_tlb_mm);
+
+void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
+		      int tsize, int ind)
+{
+	struct cpumask *cpu_mask;
+	unsigned int pid;
+
+	/*
+	 * This function as well as __local_flush_tlb_page() must only be called
+	 * for user contexts.
+	 */
+	if (WARN_ON(!mm))
+		return;
+
+	preempt_disable();
+	pid = mm->context.id;
+	if (unlikely(pid == MMU_NO_CONTEXT))
+		goto bail;
+	cpu_mask = mm_cpumask(mm);
+	if (!mm_is_core_local(mm)) {
+		/* If broadcast tlbivax is supported, use it */
+		if (mmu_has_feature(MMU_FTR_USE_TLBIVAX_BCAST)) {
+			int lock = mmu_has_feature(MMU_FTR_LOCK_BCAST_INVAL);
+			if (lock)
+				raw_spin_lock(&tlbivax_lock);
+			_tlbivax_bcast(vmaddr, pid, tsize, ind);
+			if (lock)
+				raw_spin_unlock(&tlbivax_lock);
+			goto bail;
+		} else {
+			struct tlb_flush_param p = {
+				.pid = pid,
+				.addr = vmaddr,
+				.tsize = tsize,
+				.ind = ind,
+			};
+			/* Ignores smp_processor_id() even if set in cpu_mask */
+			smp_call_function_many(cpu_mask,
+					       do_flush_tlb_page_ipi, &p, 1);
+		}
+	}
+	_tlbil_va(vmaddr, pid, tsize, ind);
+ bail:
+	preempt_enable();
+}
+
+void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+#ifdef CONFIG_HUGETLB_PAGE
+	if (vma && is_vm_hugetlb_page(vma))
+		flush_hugetlb_page(vma, vmaddr);
+#endif
+
+	__flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr,
+			 mmu_get_tsize(mmu_virtual_psize), 0);
+}
+EXPORT_SYMBOL(flush_tlb_page);
+
+#endif /* CONFIG_SMP */
+
+/*
+ * Flush kernel TLB entries in the given range
+ */
+#ifndef CONFIG_PPC_8xx
+void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+#ifdef CONFIG_SMP
+	preempt_disable();
+	smp_call_function(do_flush_tlb_mm_ipi, NULL, 1);
+	_tlbil_pid(0);
+	preempt_enable();
+#else
+	_tlbil_pid(0);
+#endif
+}
+EXPORT_SYMBOL(flush_tlb_kernel_range);
+#endif
+
+/*
+ * Currently, for range flushing, we just do a full mm flush. This should
+ * be optimized based on a threshold on the size of the range, since
+ * some implementations can stack multiple tlbivax before a tlbsync, but
+ * for now, we keep it that way.
+ */
+void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
+		     unsigned long end)
+{
+	if (end - start == PAGE_SIZE && !(start & ~PAGE_MASK))
+		flush_tlb_page(vma, start);
+	else
+		flush_tlb_mm(vma->vm_mm);
+}
+EXPORT_SYMBOL(flush_tlb_range);
+
+void tlb_flush(struct mmu_gather *tlb)
+{
+	flush_tlb_mm(tlb->mm);
+}
+
+#ifndef CONFIG_PPC64
+void __init early_init_mmu(void)
+{
+	unsigned long root = of_get_flat_dt_root();
+
+	if (IS_ENABLED(CONFIG_PPC_47x) && IS_ENABLED(CONFIG_SMP) &&
+	    of_get_flat_dt_prop(root, "cooperative-partition", NULL))
+		mmu_clear_feature(MMU_FTR_USE_TLBIVAX_BCAST);
+}
+#endif /* CONFIG_PPC64 */
diff --git a/arch/powerpc/mm/nohash/tlb_64e.c b/arch/powerpc/mm/nohash/tlb_64e.c
new file mode 100644
index 000000000000..4f925adf2695
--- /dev/null
+++ b/arch/powerpc/mm/nohash/tlb_64e.c
@@ -0,0 +1,314 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2008,2009 Ben Herrenschmidt <benh@kernel.crashing.org>
+ *                     IBM Corp.
+ *
+ * Derived from arch/ppc/mm/init.c:
+ *   Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ * and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ * Copyright (C) 1996 Paul Mackerras
+ *
+ * Derived from "arch/i386/mm/init.c"
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ */
+
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/pagemap.h>
+#include <linux/memblock.h>
+
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+#include <asm/text-patching.h>
+#include <asm/cputhreads.h>
+
+#include <mm/mmu_decl.h>
+
+/* The variables below are currently only used on 64-bit Book3E
+ * though this will probably be made common with other nohash
+ * implementations at some point
+ */
+static int mmu_pte_psize;	/* Page size used for PTE pages */
+int mmu_vmemmap_psize;		/* Page size used for the virtual mem map */
+int book3e_htw_mode;		/* HW tablewalk?  Value is PPC_HTW_* */
+unsigned long linear_map_top;	/* Top of linear mapping */
+
+
+/*
+ * Number of bytes to add to SPRN_SPRG_TLB_EXFRAME on crit/mcheck/debug
+ * exceptions.  This is used for bolted and e6500 TLB miss handlers which
+ * do not modify this SPRG in the TLB miss code; for other TLB miss handlers,
+ * this is set to zero.
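+ * (Crit/mcheck/debug entry bumps the SPRG by this amount, so a TLB
+ * miss taken from inside one of those handlers saves its registers
+ * into a separate frame instead of overwriting the frame of the miss
+ * it interrupted.)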
+ */ +int extlb_level_exc; + +/* + * Handling of virtual linear page tables or indirect TLB entries + * flushing when PTE pages are freed + */ +void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address) +{ + int tsize = mmu_psize_defs[mmu_pte_psize].shift - 10; + + if (book3e_htw_mode != PPC_HTW_NONE) { + unsigned long start = address & PMD_MASK; + unsigned long end = address + PMD_SIZE; + unsigned long size = 1UL << mmu_psize_defs[mmu_pte_psize].shift; + + /* This isn't the most optimal, ideally we would factor out the + * while preempt & CPU mask mucking around, or even the IPI but + * it will do for now + */ + while (start < end) { + __flush_tlb_page(tlb->mm, start, tsize, 1); + start += size; + } + } else { + unsigned long rmask = 0xf000000000000000ul; + unsigned long rid = (address & rmask) | 0x1000000000000000ul; + unsigned long vpte = address & ~rmask; + + vpte = (vpte >> (PAGE_SHIFT - 3)) & ~0xffful; + vpte |= rid; + __flush_tlb_page(tlb->mm, vpte, tsize, 0); + } +} + +static void __init setup_page_sizes(void) +{ + unsigned int tlb0cfg; + unsigned int eptcfg; + int psize; + + unsigned int mmucfg = mfspr(SPRN_MMUCFG); + + if ((mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V1) { + unsigned int tlb1cfg = mfspr(SPRN_TLB1CFG); + unsigned int min_pg, max_pg; + + min_pg = (tlb1cfg & TLBnCFG_MINSIZE) >> TLBnCFG_MINSIZE_SHIFT; + max_pg = (tlb1cfg & TLBnCFG_MAXSIZE) >> TLBnCFG_MAXSIZE_SHIFT; + + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { + struct mmu_psize_def *def; + unsigned int shift; + + def = &mmu_psize_defs[psize]; + shift = def->shift; + + if (shift == 0 || shift & 1) + continue; + + /* adjust to be in terms of 4^shift Kb */ + shift = (shift - 10) >> 1; + + if ((shift >= min_pg) && (shift <= max_pg)) + def->flags |= MMU_PAGE_SIZE_DIRECT; + } + + goto out; + } + + if ((mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2) { + u32 tlb1cfg, tlb1ps; + + tlb0cfg = mfspr(SPRN_TLB0CFG); + tlb1cfg = mfspr(SPRN_TLB1CFG); + tlb1ps = mfspr(SPRN_TLB1PS); + eptcfg = mfspr(SPRN_EPTCFG); + + if ((tlb1cfg & TLBnCFG_IND) && (tlb0cfg & TLBnCFG_PT)) + book3e_htw_mode = PPC_HTW_E6500; + + /* + * We expect 4K subpage size and unrestricted indirect size. + * The lack of a restriction on indirect size is a Freescale + * extension, indicated by PSn = 0 but SPSn != 0. 
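+		 * (An EPTCFG value of 2 is precisely that encoding:
+		 * SPS0 = 2, i.e. 4K subpages, with PS0 = 0; anything
+		 * else and we fall back to software tablewalk.)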
+		 */
+		if (eptcfg != 2)
+			book3e_htw_mode = PPC_HTW_NONE;
+
+		for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+			struct mmu_psize_def *def = &mmu_psize_defs[psize];
+
+			if (!def->shift)
+				continue;
+
+			if (tlb1ps & (1U << (def->shift - 10))) {
+				def->flags |= MMU_PAGE_SIZE_DIRECT;
+
+				if (book3e_htw_mode && psize == MMU_PAGE_2M)
+					def->flags |= MMU_PAGE_SIZE_INDIRECT;
+			}
+		}
+
+		goto out;
+	}
+out:
+	/* Cleanup array and print summary */
+	pr_info("MMU: Supported page sizes\n");
+	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+		struct mmu_psize_def *def = &mmu_psize_defs[psize];
+		const char *__page_type_names[] = {
+			"unsupported",
+			"direct",
+			"indirect",
+			"direct & indirect"
+		};
+		if (def->flags == 0) {
+			def->shift = 0;
+			continue;
+		}
+		pr_info("  %8ld KB as %s\n", 1ul << (def->shift - 10),
+			__page_type_names[def->flags & 0x3]);
+	}
+}
+
+/*
+ * Early initialization of the MMU TLB code
+ */
+static void early_init_this_mmu(void)
+{
+	unsigned int mas4;
+
+	/* Set MAS4 based on page table setting */
+
+	mas4 = 0x4 << MAS4_WIMGED_SHIFT;
+	switch (book3e_htw_mode) {
+	case PPC_HTW_E6500:
+		mas4 |= MAS4_INDD;
+		mas4 |= BOOK3E_PAGESZ_2M << MAS4_TSIZED_SHIFT;
+		mas4 |= MAS4_TLBSELD(1);
+		mmu_pte_psize = MMU_PAGE_2M;
+		break;
+
+	case PPC_HTW_NONE:
+		mas4 |= BOOK3E_PAGESZ_4K << MAS4_TSIZED_SHIFT;
+		mmu_pte_psize = mmu_virtual_psize;
+		break;
+	}
+	mtspr(SPRN_MAS4, mas4);
+
+	unsigned int num_cams;
+	bool map = true;
+
+	/* use a quarter of the TLBCAM for bolted linear map */
+	num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4;
+
+	/*
+	 * Only do the mapping once per core, or else the
+	 * transient mapping would cause problems.
+	 */
+#ifdef CONFIG_SMP
+	if (hweight32(get_tensr()) > 1)
+		map = false;
+#endif
+
+	if (map)
+		linear_map_top = map_mem_in_cams(linear_map_top,
+						 num_cams, false, true);
+
+	/* A sync won't hurt us after mucking around with
+	 * the MMU configuration
+	 */
+	mb();
+}
+
+static void __init early_init_mmu_global(void)
+{
+	/*
+	 * Freescale booke only supports 4K pages in TLB0, so use that.
+	 */
+	mmu_vmemmap_psize = MMU_PAGE_4K;
+
+	/* XXX This code only checks for TLB 0 capabilities and doesn't
+	 *     check what page size combos are supported by the HW.  It
+	 *     also doesn't handle the case where a separate array holds
+	 *     the IND entries from the array loaded by the PT.
+	 */
+	/* Look for supported page sizes */
+	setup_page_sizes();
+
+	/*
+	 * If we want to use HW tablewalk, enable it by patching the TLB miss
+	 * handlers to branch to the one dedicated to it.
+	 */
+	extlb_level_exc = EX_TLB_SIZE;
+	switch (book3e_htw_mode) {
+	case PPC_HTW_E6500:
+		patch_exception(0x1c0, exc_data_tlb_miss_e6500_book3e);
+		patch_exception(0x1e0, exc_instruction_tlb_miss_e6500_book3e);
+		break;
+	}
+
+	pr_info("MMU: Book3E HW tablewalk %s\n",
+		book3e_htw_mode != PPC_HTW_NONE ? "enabled" : "not supported");
+
+	/* Set the global containing the top of the linear mapping
+	 * for use by the TLB miss code
+	 */
+	linear_map_top = memblock_end_of_DRAM();
+
+	ioremap_bot = IOREMAP_BASE;
+}
+
+static void __init early_mmu_set_memory_limit(void)
+{
+	/*
+	 * Limit memory so we don't have linear faults.
+	 * Unlike memblock_set_current_limit, which limits
+	 * memory available during early boot, this permanently
+	 * reduces the memory available to Linux.  We need to
+	 * do this because highmem is not supported on 64-bit.
+ */ + memblock_enforce_memory_limit(linear_map_top); + + memblock_set_current_limit(linear_map_top); +} + +/* boot cpu only */ +void __init early_init_mmu(void) +{ + early_init_mmu_global(); + early_init_this_mmu(); + early_mmu_set_memory_limit(); +} + +void early_init_mmu_secondary(void) +{ + early_init_this_mmu(); +} + +void setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + /* + * On FSL Embedded 64-bit, usually all RAM is bolted, but with + * unusual memory sizes it's possible for some RAM to not be mapped + * (such RAM is not used at all by Linux, since we don't support + * highmem on 64-bit). We limit ppc64_rma_size to what would be + * mappable if this memblock is the only one. Additional memblocks + * can only increase, not decrease, the amount that ends up getting + * mapped. We still limit max to 1G even if we'll eventually map + * more. This is due to what the early init code is set up to do. + * + * We crop it to the size of the first MEMBLOCK to + * avoid going over total available memory just in case... + */ + unsigned long linear_sz; + unsigned int num_cams; + + /* use a quarter of the TLBCAM for bolted linear map */ + num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4; + + linear_sz = map_mem_in_cams(first_memblock_size, num_cams, true, true); + ppc64_rma_size = min_t(u64, linear_sz, 0x40000000); + + /* Finally limit subsequent allocations */ + memblock_set_current_limit(first_memblock_base + ppc64_rma_size); +} diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/nohash/tlb_low.S index 626ad081639f..c4d296e73731 100644 --- a/arch/powerpc/mm/tlb_nohash_low.S +++ b/arch/powerpc/mm/nohash/tlb_low.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * This file contains low-level functions for performing various * types of TLB invalidations on various processors with no hash @@ -18,12 +19,6 @@ * * Partially rewritten by Cort Dougan (cort@cs.nmt.edu) * Paul Mackerras, Kumar Gala and Benjamin Herrenschmidt. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * */ #include <asm/reg.h> @@ -34,33 +29,10 @@ #include <asm/asm-offsets.h> #include <asm/processor.h> #include <asm/bug.h> +#include <asm/asm-compat.h> +#include <asm/feature-fixups.h> -#if defined(CONFIG_40x) - -/* - * 40x implementation needs only tlbil_va - */ -_GLOBAL(__tlbil_va) - /* We run the search with interrupts disabled because we have to change - * the PID and I don't want to preempt when that happens. - */ - mfmsr r5 - mfspr r6,SPRN_PID - wrteei 0 - mtspr SPRN_PID,r4 - tlbsx. r3, 0, r3 - mtspr SPRN_PID,r6 - wrtee r5 - bne 1f - sync - /* There are only 64 TLB entries, so r3 < 64, which means bit 25 is - * clear. Since 25 is the V bit in the TLB_TAG, loading this value - * will invalidate the TLB entry. */ - tlbwe r3, r3, TLB_TAG - isync -1: blr - -#elif defined(CONFIG_8xx) +#if defined(CONFIG_PPC_8xx) /* * Nothing to do for 8xx, everything is inline @@ -95,36 +67,25 @@ _GLOBAL(__tlbil_va) tlbsx. r6,0,r3 bne 10f sync -BEGIN_MMU_FTR_SECTION - b 2f -END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x) +#ifndef CONFIG_PPC_47x /* On 440 There are only 64 TLB entries, so r3 < 64, which means bit * 22, is clear. Since 22 is the V bit in the TLB_PAGEID, loading this * value will invalidate the TLB entry. 
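 * (That is, writing the entry's own index back as its PAGEID word
 * stores V = 0, so no separate "invalid" bit pattern is needed.)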
*/ tlbwe r6,r6,PPC44x_TLB_PAGEID - isync -10: wrtee r10 - blr -2: -#ifdef CONFIG_PPC_47x - oris r7,r6,0x8000 /* specify way explicitely */ +#else + oris r7,r6,0x8000 /* specify way explicitly */ clrrwi r4,r3,12 /* get an EPN for the hashing with V = 0 */ ori r4,r4,PPC47x_TLBE_SIZE tlbwe r4,r7,0 /* write it */ +#endif /* !CONFIG_PPC_47x */ isync - wrtee r10 +10: wrtee r10 blr -#else /* CONFIG_PPC_47x */ -1: trap - EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0; -#endif /* !CONFIG_PPC_47x */ _GLOBAL(_tlbil_all) _GLOBAL(_tlbil_pid) -BEGIN_MMU_FTR_SECTION - b 2f -END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x) +#ifndef CONFIG_PPC_47x li r3,0 sync @@ -139,8 +100,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x) isync blr -2: -#ifdef CONFIG_PPC_47x +#else /* 476 variant. There's not simple way to do this, hopefully we'll * try to limit the amount of such full invalidates */ @@ -149,7 +109,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x) li r3,-1 /* Current set */ lis r10,tlb_47x_boltmap@h ori r10,r10,tlb_47x_boltmap@l - lis r7,0x8000 /* Specify way explicitely */ + lis r7,0x8000 /* Specify way explicitly */ b 9f /* For each set */ @@ -182,11 +142,8 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x) b 1b /* Then loop */ 1: isync /* Sync shadows */ wrtee r11 -#else /* CONFIG_PPC_47x */ -1: trap - EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0; -#endif /* !CONFIG_PPC_47x */ blr +#endif /* !CONFIG_PPC_47x */ #ifdef CONFIG_PPC_47x @@ -204,7 +161,7 @@ _GLOBAL(_tlbivax_bcast) isync PPC_TLBIVAX(0, R3) isync - eieio + mbar tlbsync BEGIN_FTR_SECTION b 1f @@ -217,7 +174,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_476_DD2) * Touch enough instruction cache lines to ensure cache hits */ 1: mflr r9 - bl 2f + bcl 20,31,$+4 2: mflr r6 li r7,32 PPC_ICBT(0,R6,R7) /* touch next cache line */ @@ -239,7 +196,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_476_DD2) blr #endif /* CONFIG_PPC_47x */ -#elif defined(CONFIG_FSL_BOOKE) +#elif defined(CONFIG_PPC_85xx) /* * FSL BookE implementations. * @@ -312,7 +269,7 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_TLBILX) isync 1: wrtee r10 blr -#elif defined(CONFIG_PPC_BOOK3E) +#elif defined(CONFIG_PPC_BOOK3E_64) /* * New Book3E (>= 2.06) implementation * @@ -373,36 +330,26 @@ _GLOBAL(_tlbivax_bcast) rlwimi r4,r6,MAS6_SIND_SHIFT,MAS6_SIND 1: mtspr SPRN_MAS6,r4 /* assume AS=0 for now */ PPC_TLBIVAX(0,R3) - eieio + mbar tlbsync sync wrtee r10 blr - -_GLOBAL(set_context) -#ifdef CONFIG_BDI_SWITCH - /* Context switch the PTE pointer for the Abatron BDI2000. - * The PGDIR is the second parameter. - */ - lis r5, abatron_pteptrs@h - ori r5, r5, abatron_pteptrs@l - stw r4, 0x4(r5) -#endif - mtspr SPRN_PID,r3 - isync /* Force context change */ - blr #else #error Unsupported processor type ! #endif -#if defined(CONFIG_PPC_FSL_BOOK3E) +#if defined(CONFIG_PPC_E500) /* * extern void loadcam_entry(unsigned int index) * * Load TLBCAM[index] entry in to the L2 CAM MMU + * Must preserve r7, r8, r9, r10, r11, r12 */ _GLOBAL(loadcam_entry) - LOAD_REG_ADDR(r4, TLBCAM) + mflr r5 + LOAD_REG_ADDR_PIC(r4, TLBCAM) + mtlr r5 mulli r5,r3,TLBCAM_SIZE add r3,r5,r4 lwz r4,TLBCAM_MAS0(r3) @@ -421,4 +368,80 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS) tlbwe isync blr + +/* + * Load multiple TLB entries at once, using an alternate-space + * trampoline so that we don't have to care about whether the same + * TLB entry maps us before and after. 
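+ * (The trampoline is a copy of the current mapping installed in AS=1;
+ * we run from it while the AS=0 CAM entries are rewritten, so we never
+ * invalidate the translation we are executing from.)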
+ * + * r3 = first entry to write + * r4 = number of entries to write + * r5 = temporary tlb entry (0 means no switch to AS1) + */ +_GLOBAL(loadcam_multi) + mflr r8 + /* Don't switch to AS=1 if already there */ + mfmsr r11 + andi. r11,r11,MSR_IS + bne 10f + mr. r12, r5 + beq 10f + + /* + * Set up temporary TLB entry that is the same as what we're + * running from, but in AS=1. + */ + bcl 20,31,$+4 +1: mflr r6 + tlbsx 0,r8 + mfspr r6,SPRN_MAS1 + ori r6,r6,MAS1_TS + mtspr SPRN_MAS1,r6 + mfspr r6,SPRN_MAS0 + rlwimi r6,r5,MAS0_ESEL_SHIFT,MAS0_ESEL_MASK + mr r7,r5 + mtspr SPRN_MAS0,r6 + isync + tlbwe + isync + + /* Switch to AS=1 */ + mfmsr r6 + ori r6,r6,MSR_IS|MSR_DS + mtmsr r6 + isync + +10: + mr r9,r3 + add r10,r3,r4 +2: bl loadcam_entry + addi r9,r9,1 + cmpw r9,r10 + mr r3,r9 + blt 2b + + /* Don't return to AS=0 if we were in AS=1 at function start */ + andi. r11,r11,MSR_IS + bne 3f + cmpwi r12, 0 + beq 3f + + /* Return to AS=0 and clear the temporary entry */ + mfmsr r6 + rlwinm. r6,r6,0,~(MSR_IS|MSR_DS) + mtmsr r6 + isync + + li r6,0 + mtspr SPRN_MAS1,r6 + rlwinm r6,r7,MAS0_ESEL_SHIFT,MAS0_ESEL_MASK + oris r6,r6,MAS0_TLBSEL(1)@h + mtspr SPRN_MAS0,r6 + isync + tlbwe + isync + +3: + mtlr r8 + blr #endif diff --git a/arch/powerpc/mm/nohash/tlb_low_64e.S b/arch/powerpc/mm/nohash/tlb_low_64e.S new file mode 100644 index 000000000000..de568297d5c5 --- /dev/null +++ b/arch/powerpc/mm/nohash/tlb_low_64e.S @@ -0,0 +1,743 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Low level TLB miss handlers for Book3E + * + * Copyright (C) 2008-2009 + * Ben. Herrenschmidt (benh@kernel.crashing.org), IBM Corp. + */ + +#include <linux/pgtable.h> +#include <asm/processor.h> +#include <asm/reg.h> +#include <asm/page.h> +#include <asm/mmu.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/cputable.h> +#include <asm/exception-64e.h> +#include <asm/ppc-opcode.h> +#include <asm/kvm_asm.h> +#include <asm/kvm_booke_hv_asm.h> +#include <asm/feature-fixups.h> + +#define VPTE_PMD_SHIFT (PTE_INDEX_SIZE) +#define VPTE_PUD_SHIFT (VPTE_PMD_SHIFT + PMD_INDEX_SIZE) +#define VPTE_PGD_SHIFT (VPTE_PUD_SHIFT + PUD_INDEX_SIZE) +#define VPTE_INDEX_SIZE (VPTE_PGD_SHIFT + PGD_INDEX_SIZE) + +/********************************************************************** + * * + * TLB miss handling for Book3E with a bolted linear mapping * + * No virtual page table, no nested TLB misses * + * * + **********************************************************************/ + +/* + * Note that, unlike non-bolted handlers, TLB_EXFRAME is not + * modified by the TLB miss handlers themselves, since the TLB miss + * handler code will not itself cause a recursive TLB miss. + * + * TLB_EXFRAME will be modified when crit/mc/debug exceptions are + * entered/exited. + */ +.macro tlb_prolog_bolted intnum addr + mtspr SPRN_SPRG_GEN_SCRATCH,r12 + mfspr r12,SPRN_SPRG_TLB_EXFRAME + std r13,EX_TLB_R13(r12) + std r10,EX_TLB_R10(r12) + mfspr r13,SPRN_SPRG_PACA + + mfcr r10 + std r11,EX_TLB_R11(r12) +#ifdef CONFIG_KVM_BOOKE_HV +BEGIN_FTR_SECTION + mfspr r11, SPRN_SRR1 +END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) +#endif + DO_KVM \intnum, SPRN_SRR1 + std r16,EX_TLB_R16(r12) + mfspr r16,\addr /* get faulting address */ + std r14,EX_TLB_R14(r12) + ld r14,PACAPGD(r13) + std r15,EX_TLB_R15(r12) + std r10,EX_TLB_CR(r12) +START_BTB_FLUSH_SECTION + mfspr r11, SPRN_SRR1 + andi. 
r10,r11,MSR_PR + beq 1f + BTB_FLUSH(r10) +1: +END_BTB_FLUSH_SECTION + std r7,EX_TLB_R7(r12) +.endm + +.macro tlb_epilog_bolted + ld r14,EX_TLB_CR(r12) + ld r7,EX_TLB_R7(r12) + ld r10,EX_TLB_R10(r12) + ld r11,EX_TLB_R11(r12) + ld r13,EX_TLB_R13(r12) + mtcr r14 + ld r14,EX_TLB_R14(r12) + ld r15,EX_TLB_R15(r12) + ld r16,EX_TLB_R16(r12) + mfspr r12,SPRN_SPRG_GEN_SCRATCH +.endm + +/* Data TLB miss */ + START_EXCEPTION(data_tlb_miss_bolted) + tlb_prolog_bolted BOOKE_INTERRUPT_DTLB_MISS SPRN_DEAR + + /* We need _PAGE_PRESENT and _PAGE_ACCESSED set */ + + /* We do the user/kernel test for the PID here along with the RW test + */ + /* We pre-test some combination of permissions to avoid double + * faults: + * + * We move the ESR:ST bit into the position of _PAGE_BAP_SW in the PTE + * ESR_ST is 0x00800000 + * _PAGE_BAP_SW is 0x00000010 + * So the shift is >> 19. This tests for supervisor writeability. + * If the page happens to be supervisor writeable and not user + * writeable, we will take a new fault later, but that should be + * a rare enough case. + * + * We also move ESR_ST in _PAGE_DIRTY position + * _PAGE_DIRTY is 0x00001000 so the shift is >> 11 + * + * MAS1 is preset for all we need except for TID that needs to + * be cleared for kernel translations + */ + + mfspr r11,SPRN_ESR + + srdi r15,r16,60 /* get region */ + rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4 + bne- dtlb_miss_fault_bolted /* Bail if fault addr is invalid */ + + rlwinm r10,r11,32-19,27,27 + rlwimi r10,r11,32-16,19,19 + cmpwi r15,0 /* user vs kernel check */ + ori r10,r10,_PAGE_PRESENT + oris r11,r10,_PAGE_ACCESSED@h + + bne tlb_miss_kernel_bolted + +tlb_miss_user_bolted: +#ifdef CONFIG_PPC_KUAP + mfspr r10,SPRN_MAS1 + rlwinm. r10,r10,0,0x3fff0000 + beq- tlb_miss_fault_bolted /* KUAP fault */ +#endif + +tlb_miss_common_bolted: +/* + * This is the guts of the TLB miss handler for bolted-linear. + * We are entered with: + * + * r16 = faulting address + * r15 = crap (free to use) + * r14 = page table base + * r13 = PACA + * r11 = PTE permission mask + * r10 = crap (free to use) + */ + rldicl r15,r16,64-PGDIR_SHIFT+3,64-PGD_INDEX_SIZE-3 + cmpldi cr0,r14,0 + clrrdi r15,r15,3 + beq tlb_miss_fault_bolted /* No PGDIR, bail */ + + ldx r14,r14,r15 /* grab pgd entry */ + + rldicl r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3 + clrrdi r15,r15,3 + cmpdi cr0,r14,0 + bge tlb_miss_fault_bolted /* Bad pgd entry or hugepage; bail */ + ldx r14,r14,r15 /* grab pud entry */ + + rldicl r15,r16,64-PMD_SHIFT+3,64-PMD_INDEX_SIZE-3 + clrrdi r15,r15,3 + cmpdi cr0,r14,0 + bge tlb_miss_fault_bolted + ldx r14,r14,r15 /* Grab pmd entry */ + + rldicl r15,r16,64-PAGE_SHIFT+3,64-PTE_INDEX_SIZE-3 + clrrdi r15,r15,3 + cmpdi cr0,r14,0 + bge tlb_miss_fault_bolted + ldx r14,r14,r15 /* Grab PTE, normal (!huge) page */ + + /* Check if required permissions are met */ + andc. r15,r11,r14 + rldicr r15,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT + bne- tlb_miss_fault_bolted + + /* Now we build the MAS: + * + * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG + * MAS 1 : Almost fully setup + * - PID already updated by caller if necessary + * - TSIZE need change if !base page size, not + * yet implemented for now + * MAS 2 : Defaults not useful, need to be redone + * MAS 3+7 : Needs to be done + */ + clrrdi r11,r16,12 /* Clear low crap in EA */ + clrldi r15,r15,12 /* Clear crap at the top */ + rlwimi r11,r14,32-19,27,31 /* Insert WIMGE */ + rlwimi r15,r14,32-8,22,25 /* Move in U bits */ + mtspr SPRN_MAS2,r11 + andi. 
r11,r14,_PAGE_DIRTY + rlwimi r15,r14,32-2,26,31 /* Move in BAP bits */ + + /* Mask out SW and UW if !DIRTY (XXX optimize this !) */ + bne 1f + li r11,MAS3_SW|MAS3_UW + andc r15,r15,r11 +1: + mtspr SPRN_MAS7_MAS3,r15 + tlbwe + +tlb_miss_done_bolted: + tlb_epilog_bolted + rfi + +itlb_miss_kernel_bolted: + li r11,_PAGE_PRESENT|_PAGE_BAP_SX /* Base perm */ + oris r11,r11,_PAGE_ACCESSED@h +tlb_miss_kernel_bolted: + mfspr r10,SPRN_MAS1 + ld r14,PACA_KERNELPGD(r13) + srdi r15,r16,44 /* get kernel region */ + andi. r15,r15,1 /* Check for vmalloc region */ + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 + bne+ tlb_miss_common_bolted + +tlb_miss_fault_bolted: + /* We need to check if it was an instruction miss */ + andi. r10,r11,_PAGE_BAP_UX|_PAGE_BAP_SX + bne itlb_miss_fault_bolted +dtlb_miss_fault_bolted: + tlb_epilog_bolted + b exc_data_storage_book3e +itlb_miss_fault_bolted: + tlb_epilog_bolted + b exc_instruction_storage_book3e + +/* Instruction TLB miss */ + START_EXCEPTION(instruction_tlb_miss_bolted) + tlb_prolog_bolted BOOKE_INTERRUPT_ITLB_MISS SPRN_SRR0 + + rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4 + srdi r15,r16,60 /* get region */ + bne- itlb_miss_fault_bolted + + li r11,_PAGE_PRESENT|_PAGE_BAP_UX /* Base perm */ + + /* We do the user/kernel test for the PID here along with the RW test + */ + + cmpldi cr0,r15,0 /* Check for user region */ + oris r11,r11,_PAGE_ACCESSED@h + beq tlb_miss_user_bolted + b itlb_miss_kernel_bolted + +/* + * TLB miss handling for e6500 and derivatives, using hardware tablewalk. + * + * Linear mapping is bolted: no virtual page table or nested TLB misses + * Indirect entries in TLB1, hardware loads resulting direct entries + * into TLB0 + * No HES or NV hint on TLB1, so we need to do software round-robin + * No tlbsrx. so we need a spinlock, and we have to deal + * with MAS-damage caused by tlbsx + * 4K pages only + */ + + START_EXCEPTION(instruction_tlb_miss_e6500) + tlb_prolog_bolted BOOKE_INTERRUPT_ITLB_MISS SPRN_SRR0 + + ld r11,PACA_TCD_PTR(r13) + srdi. r15,r16,60 /* get region */ + ori r16,r16,1 + + bne tlb_miss_kernel_e6500 /* user/kernel test */ + + b tlb_miss_common_e6500 + + START_EXCEPTION(data_tlb_miss_e6500) + tlb_prolog_bolted BOOKE_INTERRUPT_DTLB_MISS SPRN_DEAR + + ld r11,PACA_TCD_PTR(r13) + srdi. r15,r16,60 /* get region */ + rldicr r16,r16,0,62 + + bne tlb_miss_kernel_e6500 /* user vs kernel check */ + +/* + * This is the guts of the TLB miss handler for e6500 and derivatives. + * We are entered with: + * + * r16 = page of faulting address (low bit 0 if data, 1 if instruction) + * r15 = crap (free to use) + * r14 = page table base + * r13 = PACA + * r11 = tlb_per_core ptr + * r10 = crap (free to use) + * r7 = esel_next + */ +tlb_miss_common_e6500: + crmove cr2*4+2,cr0*4+2 /* cr2.eq != 0 if kernel address */ + +BEGIN_FTR_SECTION /* CPU_FTR_SMT */ + /* + * Search if we already have an indirect entry for that virtual + * address, and if we do, bail out. + * + * MAS6:IND should be already set based on MAS4 + */ + lhz r10,PACAPACAINDEX(r13) + addi r10,r10,1 + crclr cr1*4+eq /* set cr1.eq = 0 for non-recursive */ +1: lbarx r15,0,r11 + cmpdi r15,0 + bne 2f + stbcx. r10,0,r11 + bne 1b +3: + .subsection 1 +2: cmpd cr1,r15,r10 /* recursive lock due to mcheck/crit/etc? 
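+	 * (finding our own token means this thread re-entered via
+	 * crit/mcheck while already holding the lock; cr1.eq is left
+	 * set so that tlb_unlock_e6500 skips the unlock)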
*/ + beq cr1,3b /* unlock will happen if cr1.eq = 0 */ +10: lbz r15,0(r11) + cmpdi r15,0 + bne 10b + b 1b + .previous +END_FTR_SECTION_IFSET(CPU_FTR_SMT) + + lbz r7,TCD_ESEL_NEXT(r11) + +BEGIN_FTR_SECTION /* CPU_FTR_SMT */ + /* + * Erratum A-008139 says that we can't use tlbwe to change + * an indirect entry in any way (including replacing or + * invalidating) if the other thread could be in the process + * of a lookup. The workaround is to invalidate the entry + * with tlbilx before overwriting. + */ + + rlwinm r10,r7,16,0xff0000 + oris r10,r10,MAS0_TLBSEL(1)@h + mtspr SPRN_MAS0,r10 + isync + tlbre + mfspr r15,SPRN_MAS1 + andis. r15,r15,MAS1_VALID@h + beq 5f + +BEGIN_FTR_SECTION_NESTED(532) + mfspr r10,SPRN_MAS8 + rlwinm r10,r10,0,0x80000fff /* tgs,tlpid -> sgs,slpid */ + mtspr SPRN_MAS5,r10 +END_FTR_SECTION_NESTED(CPU_FTR_EMB_HV,CPU_FTR_EMB_HV,532) + + mfspr r10,SPRN_MAS1 + rlwinm r15,r10,0,0x3fff0000 /* tid -> spid */ + rlwimi r15,r10,20,0x00000003 /* ind,ts -> sind,sas */ + mfspr r10,SPRN_MAS6 + mtspr SPRN_MAS6,r15 + + mfspr r15,SPRN_MAS2 + isync + PPC_TLBILX_VA(0,R15) + isync + + mtspr SPRN_MAS6,r10 + +5: +BEGIN_FTR_SECTION_NESTED(532) + li r10,0 + mtspr SPRN_MAS8,r10 + mtspr SPRN_MAS5,r10 +END_FTR_SECTION_NESTED(CPU_FTR_EMB_HV,CPU_FTR_EMB_HV,532) + + tlbsx 0,r16 + mfspr r10,SPRN_MAS1 + andis. r15,r10,MAS1_VALID@h + bne tlb_miss_done_e6500 +FTR_SECTION_ELSE + mfspr r10,SPRN_MAS1 +ALT_FTR_SECTION_END_IFSET(CPU_FTR_SMT) + + oris r10,r10,MAS1_VALID@h + beq cr2,4f + rlwinm r10,r10,0,16,1 /* Clear TID */ +4: mtspr SPRN_MAS1,r10 + + /* Now, we need to walk the page tables. First check if we are in + * range. + */ + rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4 + bne- tlb_miss_fault_e6500 + + rldicl r15,r16,64-PGDIR_SHIFT+3,64-PGD_INDEX_SIZE-3 + cmpldi cr0,r14,0 + clrrdi r15,r15,3 + beq- tlb_miss_fault_e6500 /* No PGDIR, bail */ + ldx r14,r14,r15 /* grab pgd entry */ + + rldicl r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3 + clrrdi r15,r15,3 + cmpdi cr0,r14,0 + bge tlb_miss_huge_e6500 /* Bad pgd entry or hugepage; bail */ + ldx r14,r14,r15 /* grab pud entry */ + + rldicl r15,r16,64-PMD_SHIFT+3,64-PMD_INDEX_SIZE-3 + clrrdi r15,r15,3 + cmpdi cr0,r14,0 + bge tlb_miss_huge_e6500 + ldx r14,r14,r15 /* Grab pmd entry */ + + mfspr r10,SPRN_MAS0 + cmpdi cr0,r14,0 + bge tlb_miss_huge_e6500 + + /* Now we build the MAS for a 2M indirect page: + * + * MAS 0 : ESEL needs to be filled by software round-robin + * MAS 1 : Fully set up + * - PID already updated by caller if necessary + * - TSIZE for now is base ind page size always + * - TID already cleared if necessary + * MAS 2 : Default not 2M-aligned, need to be redone + * MAS 3+7 : Needs to be done + */ + + ori r14,r14,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT) + mtspr SPRN_MAS7_MAS3,r14 + + clrrdi r15,r16,21 /* make EA 2M-aligned */ + mtspr SPRN_MAS2,r15 + +tlb_miss_huge_done_e6500: + lbz r16,TCD_ESEL_MAX(r11) + lbz r14,TCD_ESEL_FIRST(r11) + rlwimi r10,r7,16,0x00ff0000 /* insert esel_next into MAS0 */ + addi r7,r7,1 /* increment esel_next */ + mtspr SPRN_MAS0,r10 + cmpw r7,r16 + iseleq r7,r14,r7 /* if next == last use first */ + stb r7,TCD_ESEL_NEXT(r11) + + tlbwe + +tlb_miss_done_e6500: + .macro tlb_unlock_e6500 +BEGIN_FTR_SECTION + beq cr1,1f /* no unlock if lock was recursively grabbed */ + li r15,0 + isync + stb r15,0(r11) +1: +END_FTR_SECTION_IFSET(CPU_FTR_SMT) + .endm + + tlb_unlock_e6500 + tlb_epilog_bolted + rfi + +tlb_miss_huge_e6500: + beq tlb_miss_fault_e6500 + rlwinm r15,r14,32-_PAGE_PSIZE_SHIFT,0x1e + + /* + * Now we build the MAS 
for a huge page. + * + * MAS 0 : ESEL needs to be filled by software round-robin + * - can be handled by indirect code + * MAS 1 : Need to clear IND and set TSIZE + * MAS 2,3+7: Needs to be redone similar to non-tablewalk handler + */ + + mfspr r10,SPRN_MAS1 + rlwinm r10,r10,0,~MAS1_IND + rlwimi r10,r15,MAS1_TSIZE_SHIFT,MAS1_TSIZE_MASK + mtspr SPRN_MAS1,r10 + + li r10,-0x400 + sld r15,r10,r15 /* Generate mask based on size */ + and r10,r16,r15 + rldicr r15,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT + rlwimi r10,r14,32-19,27,31 /* Insert WIMGE */ + clrldi r15,r15,PAGE_SHIFT /* Clear crap at the top */ + rlwimi r15,r14,32-8,22,25 /* Move in U bits */ + mtspr SPRN_MAS2,r10 + andi. r10,r14,_PAGE_DIRTY + rlwimi r15,r14,32-2,26,31 /* Move in BAP bits */ + + /* Mask out SW and UW if !DIRTY (XXX optimize this !) */ + bne 1f + li r10,MAS3_SW|MAS3_UW + andc r15,r15,r10 +1: + mtspr SPRN_MAS7_MAS3,r15 + + mfspr r10,SPRN_MAS0 + b tlb_miss_huge_done_e6500 + +tlb_miss_kernel_e6500: + ld r14,PACA_KERNELPGD(r13) + srdi r15,r16,44 /* get kernel region */ + xoris r15,r15,0xc /* Check for vmalloc region */ + cmplwi cr1,r15,1 + beq+ cr1,tlb_miss_common_e6500 + +tlb_miss_fault_e6500: + tlb_unlock_e6500 + /* We need to check if it was an instruction miss */ + andi. r16,r16,1 + bne itlb_miss_fault_e6500 +dtlb_miss_fault_e6500: + tlb_epilog_bolted + b exc_data_storage_book3e +itlb_miss_fault_e6500: + tlb_epilog_bolted + b exc_instruction_storage_book3e + +/* + * This is the guts of the second-level TLB miss handler for direct + * misses. We are entered with: + * + * r16 = virtual page table faulting address + * r15 = region (top 4 bits of address) + * r14 = crap (free to use) + * r13 = PACA + * r12 = TLB exception frame in PACA + * r11 = crap (free to use) + * r10 = crap (free to use) + * + * Note that this should only ever be called as a second level handler + * with the current scheme when using SW load. + * That means we can always get the original fault DEAR at + * EX_TLB_DEAR-EX_TLB_SIZE(r12) + * + * It can be re-entered by the linear mapping miss handler. However, to + * avoid too much complication, it will restart the whole fault at level + * 0 so we don't care too much about clobbers + * + * XXX That code was written back when we couldn't clobber r14. We can now, + * so we could probably optimize things a bit + */ +virt_page_table_tlb_miss: + /* Are we hitting a kernel page table ? */ + srdi r15,r16,60 + andi. r10,r15,0x8 + + /* The cool thing now is that r10 contains 0 for user and 8 for kernel, + * and we happen to have the swapper_pg_dir at offset 8 from the user + * pgdir in the PACA :-). + */ + add r11,r10,r13 + + /* If kernel, we need to clear MAS1 TID */ + beq 1f + /* XXX replace the RMW cycles with immediate loads + writes */ + mfspr r10,SPRN_MAS1 + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 +#ifdef CONFIG_PPC_KUAP + b 2f +1: + mfspr r10,SPRN_MAS1 + rlwinm. r10,r10,0,0x3fff0000 + beq- virt_page_table_tlb_miss_fault /* KUAP fault */ +2: +#else +1: +#endif + + /* Now, we need to walk the page tables. First check if we are in + * range. 
+ */ + rldicl r10,r16,64-(VPTE_INDEX_SIZE+3),VPTE_INDEX_SIZE+3+4 + cmpldi r10,0x80 + bne- virt_page_table_tlb_miss_fault + + /* Get the PGD pointer */ + ld r15,PACAPGD(r11) + cmpldi cr0,r15,0 + beq- virt_page_table_tlb_miss_fault + + /* Get to PGD entry */ + rldicl r11,r16,64-VPTE_PGD_SHIFT,64-PGD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpdi cr0,r15,0 + bge virt_page_table_tlb_miss_fault + + /* Get to PUD entry */ + rldicl r11,r16,64-VPTE_PUD_SHIFT,64-PUD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpdi cr0,r15,0 + bge virt_page_table_tlb_miss_fault + + /* Get to PMD entry */ + rldicl r11,r16,64-VPTE_PMD_SHIFT,64-PMD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpdi cr0,r15,0 + bge virt_page_table_tlb_miss_fault + + /* Ok, we're all right, we can now create a kernel translation for + * a 4K or 64K page from r16 -> r15. + */ + /* Now we build the MAS: + * + * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG + * MAS 1 : Almost fully setup + * - PID already updated by caller if necessary + * - TSIZE for now is base page size always + * MAS 2 : Use defaults + * MAS 3+7 : Needs to be done + * + * So we only do MAS 2 and 3 for now... + */ + clrldi r11,r15,4 /* remove region ID from RPN */ + ori r10,r11,1 /* Or-in SR */ + + srdi r16,r10,32 + mtspr SPRN_MAS3,r10 + mtspr SPRN_MAS7,r16 + + tlbwe + + /* Return to caller, normal case */ + TLB_MISS_EPILOG_SUCCESS + rfi + +virt_page_table_tlb_miss_fault: + /* If we fault here, things are a little bit tricky. We need to call + * either data or instruction store fault, and we need to retrieve + * the original fault address and ESR (for data). + * + * The thing is, we know that in normal circumstances, this is + * always called as a second level tlb miss for SW load or as a first + * level TLB miss for HW load, so we should be able to peek at the + * relevant information in the first exception frame in the PACA. + * + * However, we do need to double check that, because we may just hit + * a stray kernel pointer or a userland attack trying to hit those + * areas. If that is the case, we do a data fault. (We can't get here + * from an instruction tlb miss anyway). + * + * Note also that when going to a fault, we must unwind the previous + * level as well. Since we are doing that, we don't need to clear or + * restore the TLB reservation neither. + */ + subf r10,r13,r12 + cmpldi cr0,r10,PACA_EXTLB+EX_TLB_SIZE + bne- virt_page_table_tlb_miss_whacko_fault + + /* We dig the original DEAR and ESR from slot 0 */ + ld r15,EX_TLB_DEAR+PACA_EXTLB(r13) + ld r16,EX_TLB_ESR+PACA_EXTLB(r13) + + /* We check for the "special" ESR value for instruction faults */ + cmpdi cr0,r16,-1 + beq 1f + mtspr SPRN_DEAR,r15 + mtspr SPRN_ESR,r16 + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e +1: TLB_MISS_EPILOG_ERROR + b exc_instruction_storage_book3e + +virt_page_table_tlb_miss_whacko_fault: + /* The linear fault will restart everything so ESR and DEAR will + * not have been clobbered, let's just fault with what we have + */ + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e + +/* + * This is the guts of "any" level TLB miss handler for kernel linear + * mapping misses. We are entered with: + * + * + * r16 = faulting address + * r15 = crap (free to use) + * r14 = ESR (data) or -1 (instruction) + * r13 = PACA + * r12 = TLB exception frame in PACA + * r11 = crap (free to use) + * r10 = crap (free to use) + * + * In addition we know that we will not re-enter, so in theory, we could + * use a simpler epilog not restoring SRR0/1 etc.. 
but we'll do that later. + * + * We also need to be careful about MAS registers here & TLB reservation, + * as we know we'll have clobbered them if we interrupt the main TLB miss + * handlers in which case we probably want to do a full restart at level + * 0 rather than saving / restoring the MAS. + * + * Note: If we care about performance of that core, we can easily shuffle + * a few things around + */ +tlb_load_linear: + /* For now, we assume the linear mapping is contiguous and stops at + * linear_map_top. We also assume the size is a multiple of 1G, thus + * we only use 1G pages for now. That might have to be changed in a + * final implementation, especially when dealing with hypervisors + */ + __LOAD_PACA_TOC(r11) + LOAD_REG_ADDR_ALTTOC(r11, r11, linear_map_top) + ld r10,0(r11) + tovirt(10,10) + cmpld cr0,r16,r10 + bge tlb_load_linear_fault + + /* MAS1 need whole new setup. */ + li r15,(BOOK3E_PAGESZ_1GB<<MAS1_TSIZE_SHIFT) + oris r15,r15,MAS1_VALID@h /* MAS1 needs V and TSIZE */ + mtspr SPRN_MAS1,r15 + + /* Already somebody there ? */ + PPC_TLBSRX_DOT(0,R16) + beq tlb_load_linear_done + + /* Now we build the remaining MAS. MAS0 and 2 should be fine + * with their defaults, which leaves us with MAS 3 and 7. The + * mapping is linear, so we just take the address, clear the + * region bits, and or in the permission bits which are currently + * hard wired + */ + clrrdi r10,r16,30 /* 1G page index */ + clrldi r10,r10,4 /* clear region bits */ + ori r10,r10,MAS3_SR|MAS3_SW|MAS3_SX + + srdi r16,r10,32 + mtspr SPRN_MAS3,r10 + mtspr SPRN_MAS7,r16 + + tlbwe + +tlb_load_linear_done: + /* We use the "error" epilog for success as we do want to + * restore to the initial faulting context, whatever it was. + * We do that because we can't resume a fault within a TLB + * miss handler, due to MAS and TLB reservation being clobbered. + */ + TLB_MISS_EPILOG_ERROR + rfi + +tlb_load_linear_fault: + /* We keep the DEAR and ESR around, this shouldn't have happened */ + cmpdi cr0,r14,-1 + beq 1f + TLB_MISS_EPILOG_ERROR_SPECIAL + b exc_data_storage_book3e +1: TLB_MISS_EPILOG_ERROR_SPECIAL + b exc_instruction_storage_book3e diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 08397217e8ac..603a0f652ba6 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1,15 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * pSeries NUMA support * * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
*/ +#define pr_fmt(fmt) "numa: " fmt + #include <linux/threads.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/init.h> #include <linux/mm.h> #include <linux/mmzone.h> @@ -17,8 +15,8 @@ #include <linux/nodemask.h> #include <linux/cpu.h> #include <linux/notifier.h> -#include <linux/memblock.h> #include <linux/of.h> +#include <linux/of_address.h> #include <linux/pfn.h> #include <linux/cpuset.h> #include <linux/node.h> @@ -27,38 +25,44 @@ #include <linux/seq_file.h> #include <linux/uaccess.h> #include <linux/slab.h> +#include <asm/cputhreads.h> #include <asm/sparsemem.h> -#include <asm/prom.h> #include <asm/smp.h> +#include <asm/topology.h> #include <asm/firmware.h> #include <asm/paca.h> #include <asm/hvcall.h> #include <asm/setup.h> #include <asm/vdso.h> +#include <asm/vphn.h> +#include <asm/drmem.h> static int numa_enabled = 1; static char *cmdline __initdata; -static int numa_debug; -#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); } - int numa_cpu_lookup_table[NR_CPUS]; cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; -struct pglist_data *node_data[MAX_NUMNODES]; EXPORT_SYMBOL(numa_cpu_lookup_table); EXPORT_SYMBOL(node_to_cpumask_map); -EXPORT_SYMBOL(node_data); -static int min_common_depth; +static int primary_domain_index; static int n_mem_addr_cells, n_mem_size_cells; -static int form1_affinity; + +#define FORM0_AFFINITY 0 +#define FORM1_AFFINITY 1 +#define FORM2_AFFINITY 2 +static int affinity_form; #define MAX_DISTANCE_REF_POINTS 4 static int distance_ref_points_depth; -static const unsigned int *distance_ref_points; +static const __be32 *distance_ref_points; static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS]; +static int numa_distance_table[MAX_NUMNODES][MAX_NUMNODES] = { + [0 ... MAX_NUMNODES - 1] = { [0 ... MAX_NUMNODES - 1] = -1 } +}; +static int numa_id_index_table[MAX_NUMNODES] = { [0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE }; /* * Allocate node_to_cpumask_map based on number of available nodes @@ -75,11 +79,11 @@ static void __init setup_node_to_cpumask_map(void) setup_nr_node_ids(); /* allocate the map */ - for (node = 0; node < nr_node_ids; node++) + for_each_node(node) alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]); /* cpumask_of_node() will now work */ - dbg("Node to cpumask map for %d nodes\n", nr_node_ids); + pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids); } static int __init fake_numa_create_new_node(unsigned long end_pfn, @@ -123,145 +127,152 @@ static int __init fake_numa_create_new_node(unsigned long end_pfn, cmdline = p; fake_nid++; *nid = fake_nid; - dbg("created new fake_node with id %d\n", fake_nid); + pr_debug("created new fake_node with id %d\n", fake_nid); return 1; } return 0; } -/* - * get_node_active_region - Return active region containing pfn - * Active range returned is empty if none found. 
- * @pfn: The page to return the region for - * @node_ar: Returned set to the active region containing @pfn - */ -static void __init get_node_active_region(unsigned long pfn, - struct node_active_region *node_ar) +static void __init reset_numa_cpu_lookup_table(void) { - unsigned long start_pfn, end_pfn; - int i, nid; + unsigned int cpu; - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { - if (pfn >= start_pfn && pfn < end_pfn) { - node_ar->nid = nid; - node_ar->start_pfn = start_pfn; - node_ar->end_pfn = end_pfn; - break; - } - } + for_each_possible_cpu(cpu) + numa_cpu_lookup_table[cpu] = -1; } -static void map_cpu_to_node(int cpu, int node) +void map_cpu_to_node(int cpu, int node) { - numa_cpu_lookup_table[cpu] = node; + update_numa_cpu_lookup_table(cpu, node); - dbg("adding cpu %d to node %d\n", cpu, node); - - if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node]))) + if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node]))) { + pr_debug("adding cpu %d to node %d\n", cpu, node); cpumask_set_cpu(cpu, node_to_cpumask_map[node]); + } } #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR) -static void unmap_cpu_from_node(unsigned long cpu) +void unmap_cpu_from_node(unsigned long cpu) { int node = numa_cpu_lookup_table[cpu]; - dbg("removing cpu %lu from node %d\n", cpu, node); - if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) { cpumask_clear_cpu(cpu, node_to_cpumask_map[node]); + pr_debug("removing cpu %lu from node %d\n", cpu, node); } else { - printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n", - cpu, node); + pr_warn("Warning: cpu %lu not found in node %d\n", cpu, node); } } #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */ -/* must hold reference to node during call */ -static const int *of_get_associativity(struct device_node *dev) +static int __associativity_to_nid(const __be32 *associativity, + int max_array_sz) { - return of_get_property(dev, "ibm,associativity", NULL); -} + int nid; + /* + * primary_domain_index is 1 based array index. + */ + int index = primary_domain_index - 1; + if (!numa_enabled || index >= max_array_sz) + return NUMA_NO_NODE; + + nid = of_read_number(&associativity[index], 1); + + /* POWER4 LPAR uses 0xffff as invalid node */ + if (nid == 0xffff || nid >= nr_node_ids) + nid = NUMA_NO_NODE; + return nid; +} /* - * Returns the property linux,drconf-usable-memory if - * it exists (the property exists only in kexec/kdump kernels, - * added by kexec-tools) + * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA + * info is found. 
*/ -static const u32 *of_get_usable_memory(struct device_node *memory) +static int associativity_to_nid(const __be32 *associativity) { - const u32 *prop; - u32 len; - prop = of_get_property(memory, "linux,drconf-usable-memory", &len); - if (!prop || len < sizeof(unsigned int)) + int array_sz = of_read_number(associativity, 1); + + /* Skip the first element in the associativity array */ + return __associativity_to_nid((associativity + 1), array_sz); +} + +static int __cpu_form2_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) +{ + int dist; + int node1, node2; + + node1 = associativity_to_nid(cpu1_assoc); + node2 = associativity_to_nid(cpu2_assoc); + + dist = numa_distance_table[node1][node2]; + if (dist <= LOCAL_DISTANCE) return 0; - return prop; + else if (dist <= REMOTE_DISTANCE) + return 1; + else + return 2; } -int __node_distance(int a, int b) +static int __cpu_form1_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) { - int i; - int distance = LOCAL_DISTANCE; + int dist = 0; - if (!form1_affinity) - return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE); + int i, index; for (i = 0; i < distance_ref_points_depth; i++) { - if (distance_lookup_table[a][i] == distance_lookup_table[b][i]) + index = be32_to_cpu(distance_ref_points[i]); + if (cpu1_assoc[index] == cpu2_assoc[index]) break; - - /* Double the distance for each NUMA level */ - distance *= 2; + dist++; } - return distance; + return dist; } -static void initialize_distance_lookup_table(int nid, - const unsigned int *associativity) +int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) { - int i; - - if (!form1_affinity) - return; - - for (i = 0; i < distance_ref_points_depth; i++) { - distance_lookup_table[nid][i] = - associativity[distance_ref_points[i]]; - } + /* We should not get called with FORM0 */ + VM_WARN_ON(affinity_form == FORM0_AFFINITY); + if (affinity_form == FORM1_AFFINITY) + return __cpu_form1_relative_distance(cpu1_assoc, cpu2_assoc); + return __cpu_form2_relative_distance(cpu1_assoc, cpu2_assoc); } -/* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa - * info is found. - */ -static int associativity_to_nid(const unsigned int *associativity) +/* must hold reference to node during call */ +static const __be32 *of_get_associativity(struct device_node *dev) { - int nid = -1; + return of_get_property(dev, "ibm,associativity", NULL); +} - if (min_common_depth == -1) - goto out; +int __node_distance(int a, int b) +{ + int i; + int distance = LOCAL_DISTANCE; - if (associativity[0] >= min_common_depth) - nid = associativity[min_common_depth]; + if (affinity_form == FORM2_AFFINITY) + return numa_distance_table[a][b]; + else if (affinity_form == FORM0_AFFINITY) + return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE); - /* POWER4 LPAR uses 0xffff as invalid node */ - if (nid == 0xffff || nid >= MAX_NUMNODES) - nid = -1; + for (i = 0; i < distance_ref_points_depth; i++) { + if (distance_lookup_table[a][i] == distance_lookup_table[b][i]) + break; - if (nid > 0 && associativity[0] >= distance_ref_points_depth) - initialize_distance_lookup_table(nid, associativity); + /* Double the distance for each NUMA level */ + distance *= 2; + } -out: - return nid; + return distance; } +EXPORT_SYMBOL(__node_distance); /* Returns the nid associated with the given device tree node, * or -1 if not found. 
*/ static int of_node_to_nid_single(struct device_node *device) { - int nid = -1; - const unsigned int *tmp; + int nid = NUMA_NO_NODE; + const __be32 *tmp; tmp = of_get_associativity(device); if (tmp) @@ -272,8 +283,7 @@ static int of_node_to_nid_single(struct device_node *device) /* Walk the device tree upwards, looking for an associativity id */ int of_node_to_nid(struct device_node *device) { - struct device_node *tmp; - int nid = -1; + int nid = NUMA_NO_NODE; of_node_get(device); while (device) { @@ -281,20 +291,163 @@ int of_node_to_nid(struct device_node *device) if (nid != -1) break; - tmp = device; - device = of_get_parent(tmp); - of_node_put(tmp); + device = of_get_next_parent(device); } of_node_put(device); return nid; } -EXPORT_SYMBOL_GPL(of_node_to_nid); +EXPORT_SYMBOL(of_node_to_nid); -static int __init find_min_common_depth(void) +static void __initialize_form1_numa_distance(const __be32 *associativity, + int max_array_sz) { - int depth; + int i, nid; + + if (affinity_form != FORM1_AFFINITY) + return; + + nid = __associativity_to_nid(associativity, max_array_sz); + if (nid != NUMA_NO_NODE) { + for (i = 0; i < distance_ref_points_depth; i++) { + const __be32 *entry; + int index = be32_to_cpu(distance_ref_points[i]) - 1; + + /* + * broken hierarchy, return with broken distance table + */ + if (WARN(index >= max_array_sz, "Broken ibm,associativity property")) + return; + + entry = &associativity[index]; + distance_lookup_table[nid][i] = of_read_number(entry, 1); + } + } +} + +static void initialize_form1_numa_distance(const __be32 *associativity) +{ + int array_sz; + + array_sz = of_read_number(associativity, 1); + /* Skip the first element in the associativity array */ + __initialize_form1_numa_distance(associativity + 1, array_sz); +} + +/* + * Used to update distance information w.r.t newly added node. + */ +void update_numa_distance(struct device_node *node) +{ + int nid; + + if (affinity_form == FORM0_AFFINITY) + return; + else if (affinity_form == FORM1_AFFINITY) { + const __be32 *associativity; + + associativity = of_get_associativity(node); + if (!associativity) + return; + + initialize_form1_numa_distance(associativity); + return; + } + + /* FORM2 affinity */ + nid = of_node_to_nid_single(node); + if (nid == NUMA_NO_NODE) + return; + + /* + * With FORM2 we expect NUMA distance of all possible NUMA + * nodes to be provided during boot. + */ + WARN(numa_distance_table[nid][nid] == -1, + "NUMA distance details for node %d not provided\n", nid); +} +EXPORT_SYMBOL_GPL(update_numa_distance); + +/* + * ibm,numa-lookup-index-table= {N, domainid1, domainid2, ..... domainidN} + * ibm,numa-distance-table = { N, 1, 2, 4, 5, 1, 6, .... 
N elements} + */ +static void __init initialize_form2_numa_distance_lookup_table(void) +{ + int i, j; struct device_node *root; + const __u8 *form2_distances; + const __be32 *numa_lookup_index; + int form2_distances_length; + int max_numa_index, distance_index; + + if (firmware_has_feature(FW_FEATURE_OPAL)) + root = of_find_node_by_path("/ibm,opal"); + else + root = of_find_node_by_path("/rtas"); + if (!root) + root = of_find_node_by_path("/"); + + numa_lookup_index = of_get_property(root, "ibm,numa-lookup-index-table", NULL); + max_numa_index = of_read_number(&numa_lookup_index[0], 1); + + /* first element of the array is the size and is encode-int */ + form2_distances = of_get_property(root, "ibm,numa-distance-table", NULL); + form2_distances_length = of_read_number((const __be32 *)&form2_distances[0], 1); + /* Skip the size which is encoded int */ + form2_distances += sizeof(__be32); + + pr_debug("form2_distances_len = %d, numa_dist_indexes_len = %d\n", + form2_distances_length, max_numa_index); + + for (i = 0; i < max_numa_index; i++) + /* +1 skip the max_numa_index in the property */ + numa_id_index_table[i] = of_read_number(&numa_lookup_index[i + 1], 1); + + + if (form2_distances_length != max_numa_index * max_numa_index) { + WARN(1, "Wrong NUMA distance information\n"); + form2_distances = NULL; // don't use it + } + distance_index = 0; + for (i = 0; i < max_numa_index; i++) { + for (j = 0; j < max_numa_index; j++) { + int nodeA = numa_id_index_table[i]; + int nodeB = numa_id_index_table[j]; + int dist; + + if (form2_distances) + dist = form2_distances[distance_index++]; + else if (nodeA == nodeB) + dist = LOCAL_DISTANCE; + else + dist = REMOTE_DISTANCE; + numa_distance_table[nodeA][nodeB] = dist; + pr_debug("dist[%d][%d]=%d ", nodeA, nodeB, dist); + } + } + + of_node_put(root); +} + +static int __init find_primary_domain_index(void) +{ + int index; + struct device_node *root; + + /* + * Check for which form of affinity. + */ + if (firmware_has_feature(FW_FEATURE_OPAL)) { + affinity_form = FORM1_AFFINITY; + } else if (firmware_has_feature(FW_FEATURE_FORM2_AFFINITY)) { + pr_debug("Using form 2 affinity\n"); + affinity_form = FORM2_AFFINITY; + } else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) { + pr_debug("Using form 1 affinity\n"); + affinity_form = FORM1_AFFINITY; + } else + affinity_form = FORM0_AFFINITY; if (firmware_has_feature(FW_FEATURE_OPAL)) root = of_find_node_by_path("/ibm,opal"); @@ -320,42 +473,37 @@ static int __init find_min_common_depth(void) &distance_ref_points_depth); if (!distance_ref_points) { - dbg("NUMA: ibm,associativity-reference-points not found.\n"); + pr_debug("ibm,associativity-reference-points not found.\n"); goto err; } distance_ref_points_depth /= sizeof(int); - - if (firmware_has_feature(FW_FEATURE_OPAL) || - firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) { - dbg("Using form 1 affinity\n"); - form1_affinity = 1; - } - - if (form1_affinity) { - depth = distance_ref_points[0]; - } else { + if (affinity_form == FORM0_AFFINITY) { if (distance_ref_points_depth < 2) { - printk(KERN_WARNING "NUMA: " - "short ibm,associativity-reference-points\n"); + pr_warn("short ibm,associativity-reference-points\n"); goto err; } - depth = distance_ref_points[1]; + index = of_read_number(&distance_ref_points[1], 1); + } else { + /* + * Both FORM1 and FORM2 affinity find the primary domain details + * at the same offset. 
+ */ + index = of_read_number(distance_ref_points, 1); } - /* * Warn and cap if the hardware supports more than * MAX_DISTANCE_REF_POINTS domains. */ if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) { - printk(KERN_WARNING "NUMA: distance array capped at " - "%d entries\n", MAX_DISTANCE_REF_POINTS); + pr_warn("distance array capped at %d entries\n", + MAX_DISTANCE_REF_POINTS); distance_ref_points_depth = MAX_DISTANCE_REF_POINTS; } of_node_put(root); - return depth; + return index; err: of_node_put(root); @@ -375,84 +523,21 @@ static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells) of_node_put(memory); } -static unsigned long read_n_cells(int n, const unsigned int **buf) +static unsigned long read_n_cells(int n, const __be32 **buf) { unsigned long result = 0; while (n--) { - result = (result << 32) | **buf; + result = (result << 32) | of_read_number(*buf, 1); (*buf)++; } return result; } -/* - * Read the next memblock list entry from the ibm,dynamic-memory property - * and return the information in the provided of_drconf_cell structure. - */ -static void read_drconf_cell(struct of_drconf_cell *drmem, const u32 **cellp) -{ - const u32 *cp; - - drmem->base_addr = read_n_cells(n_mem_addr_cells, cellp); - - cp = *cellp; - drmem->drc_index = cp[0]; - drmem->reserved = cp[1]; - drmem->aa_index = cp[2]; - drmem->flags = cp[3]; - - *cellp = cp + 4; -} - -/* - * Retrieve and validate the ibm,dynamic-memory property of the device tree. - * - * The layout of the ibm,dynamic-memory property is a number N of memblock - * list entries followed by N memblock list entries. Each memblock list entry - * contains information as laid out in the of_drconf_cell struct above. - */ -static int of_get_drconf_memory(struct device_node *memory, const u32 **dm) -{ - const u32 *prop; - u32 len, entries; - - prop = of_get_property(memory, "ibm,dynamic-memory", &len); - if (!prop || len < sizeof(unsigned int)) - return 0; - - entries = *prop++; - - /* Now that we know the number of entries, revalidate the size - * of the property read in to ensure we have everything - */ - if (len < (entries * (n_mem_addr_cells + 4) + 1) * sizeof(unsigned int)) - return 0; - - *dm = prop; - return entries; -} - -/* - * Retrieve and validate the ibm,lmb-size property for drconf memory - * from the device tree. - */ -static u64 of_get_lmb_size(struct device_node *memory) -{ - const u32 *prop; - u32 len; - - prop = of_get_property(memory, "ibm,lmb-size", &len); - if (!prop || len < sizeof(unsigned int)) - return 0; - - return read_n_cells(n_mem_size_cells, &prop); -} - struct assoc_arrays { u32 n_arrays; u32 array_sz; - const u32 *arrays; + const __be32 *arrays; }; /* @@ -465,18 +550,26 @@ struct assoc_arrays { * indicating the size of each associativity array, followed by a list * of N associativity arrays. 
*/ -static int of_get_assoc_arrays(struct device_node *memory, - struct assoc_arrays *aa) +static int of_get_assoc_arrays(struct assoc_arrays *aa) { - const u32 *prop; + struct device_node *memory; + const __be32 *prop; u32 len; + memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); + if (!memory) + return -1; + prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len); - if (!prop || len < 2 * sizeof(unsigned int)) + if (!prop || len < 2 * sizeof(unsigned int)) { + of_node_put(memory); return -1; + } + + aa->n_arrays = of_read_number(prop++, 1); + aa->array_sz = of_read_number(prop++, 1); - aa->n_arrays = *prop++; - aa->array_sz = *prop++; + of_node_put(memory); /* Now that we know the number of arrays and size of each array, * revalidate the size of the property read in. @@ -488,79 +581,222 @@ static int of_get_assoc_arrays(struct device_node *memory, return 0; } +static int __init get_nid_and_numa_distance(struct drmem_lmb *lmb) +{ + struct assoc_arrays aa = { .arrays = NULL }; + int default_nid = NUMA_NO_NODE; + int nid = default_nid; + int rc, index; + + if ((primary_domain_index < 0) || !numa_enabled) + return default_nid; + + rc = of_get_assoc_arrays(&aa); + if (rc) + return default_nid; + + if (primary_domain_index <= aa.array_sz && + !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) { + const __be32 *associativity; + + index = lmb->aa_index * aa.array_sz; + associativity = &aa.arrays[index]; + nid = __associativity_to_nid(associativity, aa.array_sz); + if (nid > 0 && affinity_form == FORM1_AFFINITY) { + /* + * lookup array associativity entries have + * no length of the array as the first element. + */ + __initialize_form1_numa_distance(associativity, aa.array_sz); + } + } + return nid; +} + /* * This is like of_node_to_nid_single() for memory represented in the * ibm,dynamic-reconfiguration-memory node. */ -static int of_drconf_to_nid_single(struct of_drconf_cell *drmem, - struct assoc_arrays *aa) +int of_drconf_to_nid_single(struct drmem_lmb *lmb) { - int default_nid = 0; + struct assoc_arrays aa = { .arrays = NULL }; + int default_nid = NUMA_NO_NODE; int nid = default_nid; - int index; + int rc, index; - if (min_common_depth > 0 && min_common_depth <= aa->array_sz && - !(drmem->flags & DRCONF_MEM_AI_INVALID) && - drmem->aa_index < aa->n_arrays) { - index = drmem->aa_index * aa->array_sz + min_common_depth - 1; - nid = aa->arrays[index]; + if ((primary_domain_index < 0) || !numa_enabled) + return default_nid; - if (nid == 0xffff || nid >= MAX_NUMNODES) - nid = default_nid; - } + rc = of_get_assoc_arrays(&aa); + if (rc) + return default_nid; + + if (primary_domain_index <= aa.array_sz && + !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) { + const __be32 *associativity; + index = lmb->aa_index * aa.array_sz; + associativity = &aa.arrays[index]; + nid = __associativity_to_nid(associativity, aa.array_sz); + } return nid; } +#ifdef CONFIG_PPC_SPLPAR + +static int __vphn_get_associativity(long lcpu, __be32 *associativity) +{ + long rc, hwid; + + /* + * On a shared lpar, device tree will not have node associativity. + * At this time lppaca, or its __old_status field may not be + * updated. Hence kernel cannot detect if its on a shared lpar. So + * request an explicit associativity irrespective of whether the + * lpar is shared or dedicated. Use the device tree property as a + * fallback. cpu_to_phys_id is only valid between + * smp_setup_cpu_maps() and smp_setup_pacas(). 
+ */ + if (firmware_has_feature(FW_FEATURE_VPHN)) { + if (cpu_to_phys_id) + hwid = cpu_to_phys_id[lcpu]; + else + hwid = get_hard_smp_processor_id(lcpu); + + rc = hcall_vphn(hwid, VPHN_FLAG_VCPU, associativity); + if (rc == H_SUCCESS) + return 0; + } + + return -1; +} + +static int vphn_get_nid(long lcpu) +{ + __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0}; + + + if (!__vphn_get_associativity(lcpu, associativity)) + return associativity_to_nid(associativity); + + return NUMA_NO_NODE; + +} +#else + +static int __vphn_get_associativity(long lcpu, __be32 *associativity) +{ + return -1; +} + +static int vphn_get_nid(long unused) +{ + return NUMA_NO_NODE; +} +#endif /* CONFIG_PPC_SPLPAR */ + /* * Figure out to which domain a cpu belongs and stick it there. * Return the id of the domain used. */ static int numa_setup_cpu(unsigned long lcpu) { - int nid = 0; - struct device_node *cpu = of_get_cpu_node(lcpu, NULL); + struct device_node *cpu; + int fcpu = cpu_first_thread_sibling(lcpu); + int nid = NUMA_NO_NODE; + + if (!cpu_present(lcpu)) { + set_cpu_numa_node(lcpu, first_online_node); + return first_online_node; + } + + /* + * If a valid cpu-to-node mapping is already available, use it + * directly instead of querying the firmware, since it represents + * the most recent mapping notified to us by the platform (eg: VPHN). + * Since cpu_to_node binding remains the same for all threads in the + * core. If a valid cpu-to-node mapping is already available, for + * the first thread in the core, use it. + */ + nid = numa_cpu_lookup_table[fcpu]; + if (nid >= 0) { + map_cpu_to_node(lcpu, nid); + return nid; + } + + nid = vphn_get_nid(lcpu); + if (nid != NUMA_NO_NODE) + goto out_present; + + cpu = of_get_cpu_node(lcpu, NULL); if (!cpu) { WARN_ON(1); - goto out; + if (cpu_present(lcpu)) + goto out_present; + else + goto out; } nid = of_node_to_nid_single(cpu); + of_node_put(cpu); - if (nid < 0 || !node_online(nid)) +out_present: + if (nid < 0 || !node_possible(nid)) nid = first_online_node; -out: - map_cpu_to_node(lcpu, nid); - of_node_put(cpu); + /* + * Update for the first thread of the core. All threads of a core + * have to be part of the same node. This not only avoids querying + * for every other thread in the core, but always avoids a case + * where virtual node associativity change causes subsequent threads + * of a core to be associated with different nid. However if first + * thread is already online, expect it to have a valid mapping. 
+ */ + if (fcpu != lcpu) { + WARN_ON(cpu_online(fcpu)); + map_cpu_to_node(fcpu, nid); + } + map_cpu_to_node(lcpu, nid); +out: return nid; } -static int cpu_numa_callback(struct notifier_block *nfb, unsigned long action, - void *hcpu) +static void verify_cpu_node_mapping(int cpu, int node) { - unsigned long lcpu = (unsigned long)hcpu; - int ret = NOTIFY_DONE; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - numa_setup_cpu(lcpu); - ret = NOTIFY_OK; - break; -#ifdef CONFIG_HOTPLUG_CPU - case CPU_DEAD: - case CPU_DEAD_FROZEN: - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - unmap_cpu_from_node(lcpu); - break; - ret = NOTIFY_OK; -#endif + int base, sibling, i; + + /* Verify that all the threads in the core belong to the same node */ + base = cpu_first_thread_sibling(cpu); + + for (i = 0; i < threads_per_core; i++) { + sibling = base + i; + + if (sibling == cpu || cpu_is_offline(sibling)) + continue; + + if (cpu_to_node(sibling) != node) { + WARN(1, "CPU thread siblings %d and %d don't belong" + " to the same node!\n", cpu, sibling); + break; + } } - return ret; +} + +/* Must run before sched domains notifier. */ +static int ppc_numa_cpu_prepare(unsigned int cpu) +{ + int nid; + + nid = numa_setup_cpu(cpu); + verify_cpu_node_mapping(cpu, nid); + return 0; +} + +static int ppc_numa_cpu_dead(unsigned int cpu) +{ + return 0; } /* @@ -594,7 +830,7 @@ static unsigned long __init numa_enforce_memory_limit(unsigned long start, * Reads the counter for a given entry in * linux,drconf-usable-memory property */ -static inline int __init read_usm_ranges(const u32 **usm) +static inline int __init read_usm_ranges(const __be32 **usm) { /* * For each lmb in ibm,dynamic-memory a corresponding @@ -609,85 +845,83 @@ static inline int __init read_usm_ranges(const u32 **usm) * Extract NUMA information from the ibm,dynamic-reconfiguration-memory * node. This assumes n_mem_{addr,size}_cells have been set. 
*/ -static void __init parse_drconf_memory(struct device_node *memory) +static int __init numa_setup_drmem_lmb(struct drmem_lmb *lmb, + const __be32 **usm, + void *data) { - const u32 *uninitialized_var(dm), *usm; - unsigned int n, rc, ranges, is_kexec_kdump = 0; - unsigned long lmb_size, base, size, sz; + unsigned int ranges, is_kexec_kdump = 0; + unsigned long base, size, sz; int nid; - struct assoc_arrays aa = { .arrays = NULL }; - - n = of_get_drconf_memory(memory, &dm); - if (!n) - return; - lmb_size = of_get_lmb_size(memory); - if (!lmb_size) - return; - - rc = of_get_assoc_arrays(memory, &aa); - if (rc) - return; + /* + * Skip this block if the reserved bit is set in flags (0x80) + * or if the block is not assigned to this partition (0x8) + */ + if ((lmb->flags & DRCONF_MEM_RESERVED) + || !(lmb->flags & DRCONF_MEM_ASSIGNED)) + return 0; - /* check if this is a kexec/kdump kernel */ - usm = of_get_usable_memory(memory); - if (usm != NULL) + if (*usm) is_kexec_kdump = 1; - for (; n != 0; --n) { - struct of_drconf_cell drmem; + base = lmb->base_addr; + size = drmem_lmb_size(); + ranges = 1; - read_drconf_cell(&drmem, &dm); - - /* skip this block if the reserved bit is set in flags (0x80) - or if the block is not assigned to this partition (0x8) */ - if ((drmem.flags & DRCONF_MEM_RESERVED) - || !(drmem.flags & DRCONF_MEM_ASSIGNED)) - continue; - - base = drmem.base_addr; - size = lmb_size; - ranges = 1; + if (is_kexec_kdump) { + ranges = read_usm_ranges(usm); + if (!ranges) /* there are no (base, size) duple */ + return 0; + } + do { if (is_kexec_kdump) { - ranges = read_usm_ranges(&usm); - if (!ranges) /* there are no (base, size) duple */ - continue; + base = read_n_cells(n_mem_addr_cells, usm); + size = read_n_cells(n_mem_size_cells, usm); } - do { - if (is_kexec_kdump) { - base = read_n_cells(n_mem_addr_cells, &usm); - size = read_n_cells(n_mem_size_cells, &usm); - } - nid = of_drconf_to_nid_single(&drmem, &aa); - fake_numa_create_new_node( - ((base + size) >> PAGE_SHIFT), - &nid); - node_set_online(nid); - sz = numa_enforce_memory_limit(base, size); - if (sz) - memblock_set_node(base, sz, nid); - } while (--ranges); - } + + nid = get_nid_and_numa_distance(lmb); + fake_numa_create_new_node(((base + size) >> PAGE_SHIFT), + &nid); + node_set_online(nid); + sz = numa_enforce_memory_limit(base, size); + if (sz) + memblock_set_node(base, sz, &memblock.memory, nid); + } while (--ranges); + + return 0; } static int __init parse_numa_properties(void) { - struct device_node *memory; + struct device_node *memory, *pci; int default_nid = 0; unsigned long i; + const __be32 *associativity; if (numa_enabled == 0) { - printk(KERN_WARNING "NUMA disabled by user\n"); + pr_warn("disabled by user\n"); return -1; } - min_common_depth = find_min_common_depth(); + primary_domain_index = find_primary_domain_index(); - if (min_common_depth < 0) - return min_common_depth; + if (primary_domain_index < 0) { + /* + * if we fail to parse primary_domain_index from device tree + * mark the numa disabled, boot with numa disabled. + */ + numa_enabled = false; + return primary_domain_index; + } - dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth); + pr_debug("associativity depth for CPU/Memory: %d\n", primary_domain_index); + + /* + * If it is FORM2 initialize the distance table here. 
+ */ + if (affinity_form == FORM2_AFFINITY) + initialize_form2_numa_distance_lookup_table(); /* * Even though we connect cpus to numa domains later in SMP @@ -695,22 +929,36 @@ static int __init parse_numa_properties(void) * each node to be onlined must have NODE_DATA etc backing it. */ for_each_present_cpu(i) { + __be32 vphn_assoc[VPHN_ASSOC_BUFSIZE]; struct device_node *cpu; - int nid; + int nid = NUMA_NO_NODE; - cpu = of_get_cpu_node(i, NULL); - BUG_ON(!cpu); - nid = of_node_to_nid_single(cpu); - of_node_put(cpu); + memset(vphn_assoc, 0, VPHN_ASSOC_BUFSIZE * sizeof(__be32)); - /* - * Don't fall back to default_nid yet -- we will plug - * cpus into nodes once the memory scan has discovered - * the topology. - */ - if (nid < 0) - continue; - node_set_online(nid); + if (__vphn_get_associativity(i, vphn_assoc) == 0) { + nid = associativity_to_nid(vphn_assoc); + initialize_form1_numa_distance(vphn_assoc); + } else { + + /* + * Don't fall back to default_nid yet -- we will plug + * cpus into nodes once the memory scan has discovered + * the topology. + */ + cpu = of_get_cpu_node(i, NULL); + BUG_ON(!cpu); + + associativity = of_get_associativity(cpu); + if (associativity) { + nid = associativity_to_nid(associativity); + initialize_form1_numa_distance(associativity); + } + of_node_put(cpu); + } + + /* node_set_online() is an UB if 'nid' is negative */ + if (likely(nid >= 0)) + node_set_online(nid); } get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells); @@ -720,7 +968,7 @@ static int __init parse_numa_properties(void) unsigned long size; int nid; int ranges; - const unsigned int *memcell_buf; + const __be32 *memcell_buf; unsigned int len; memcell_buf = of_get_property(memory, @@ -742,34 +990,46 @@ new_range: * have associativity properties. If none, then * everything goes to default_nid. */ - nid = of_node_to_nid_single(memory); - if (nid < 0) + associativity = of_get_associativity(memory); + if (associativity) { + nid = associativity_to_nid(associativity); + initialize_form1_numa_distance(associativity); + } else nid = default_nid; fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid); node_set_online(nid); - if (!(size = numa_enforce_memory_limit(start, size))) { - if (--ranges) - goto new_range; - else - continue; - } - - memblock_set_node(start, size, nid); + size = numa_enforce_memory_limit(start, size); + if (size) + memblock_set_node(start, size, &memblock.memory, nid); if (--ranges) goto new_range; } + for_each_node_by_name(pci, "pci") { + int nid = NUMA_NO_NODE; + + associativity = of_get_associativity(pci); + if (associativity) { + nid = associativity_to_nid(associativity); + initialize_form1_numa_distance(associativity); + } + if (likely(nid >= 0) && !node_online(nid)) + node_set_online(nid); + } + /* * Now do the same thing for each MEMBLOCK listed in the * ibm,dynamic-memory property in the * ibm,dynamic-reconfiguration-memory node. 
*/ memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); - if (memory) - parse_drconf_memory(memory); + if (memory) { + walk_drmem_lmbs(memory, NULL, numa_setup_drmem_lmb); + of_node_put(memory); + } return 0; } @@ -780,20 +1040,16 @@ static void __init setup_nonnuma(void) unsigned long total_ram = memblock_phys_mem_size(); unsigned long start_pfn, end_pfn; unsigned int nid = 0; - struct memblock_region *reg; - - printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n", - top_of_ram, total_ram); - printk(KERN_DEBUG "Memory hole size: %ldMB\n", - (top_of_ram - total_ram) >> 20); + int i; - for_each_memblock(memory, reg) { - start_pfn = memblock_region_memory_base_pfn(reg); - end_pfn = memblock_region_memory_end_pfn(reg); + pr_debug("Top of RAM: 0x%lx, Total RAM: 0x%lx\n", top_of_ram, total_ram); + pr_debug("Memory hole size: %ldMB\n", (top_of_ram - total_ram) >> 20); + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { fake_numa_create_new_node(end_pfn, &nid); memblock_set_node(PFN_PHYS(start_pfn), - PFN_PHYS(end_pfn - start_pfn), nid); + PFN_PHYS(end_pfn - start_pfn), + &memblock.memory, nid); node_set_online(nid); } } @@ -803,11 +1059,11 @@ void __init dump_numa_cpu_topology(void) unsigned int node; unsigned int cpu, count; - if (min_common_depth == -1 || !numa_enabled) + if (!numa_enabled) return; for_each_online_node(node) { - printk(KERN_DEBUG "Node %d CPUs:", node); + pr_info("Node %d CPUs:", node); count = 0; /* @@ -818,263 +1074,156 @@ void __init dump_numa_cpu_topology(void) if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) { if (count == 0) - printk(" %u", cpu); + pr_cont(" %u", cpu); ++count; } else { if (count > 1) - printk("-%u", cpu - 1); + pr_cont("-%u", cpu - 1); count = 0; } } if (count > 1) - printk("-%u", nr_cpu_ids - 1); - printk("\n"); + pr_cont("-%u", nr_cpu_ids - 1); + pr_cont("\n"); } } -static void __init dump_numa_memory_topology(void) +/* Initialize NODE_DATA for a node on the local memory */ +static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn) { - unsigned int node; - unsigned int count; + u64 spanned_pages = end_pfn - start_pfn; - if (min_common_depth == -1 || !numa_enabled) - return; + alloc_node_data(nid); - for_each_online_node(node) { - unsigned long i; + NODE_DATA(nid)->node_id = nid; + NODE_DATA(nid)->node_start_pfn = start_pfn; + NODE_DATA(nid)->node_spanned_pages = spanned_pages; +} - printk(KERN_DEBUG "Node %d Memory:", node); +static void __init find_possible_nodes(void) +{ + struct device_node *rtas, *root; + const __be32 *domains = NULL; + int prop_length, max_nodes; + u32 i; - count = 0; + if (!numa_enabled) + return; - for (i = 0; i < memblock_end_of_DRAM(); - i += (1 << SECTION_SIZE_BITS)) { - if (early_pfn_to_nid(i >> PAGE_SHIFT) == node) { - if (count == 0) - printk(" 0x%lx", i); - ++count; - } else { - if (count > 0) - printk("-0x%lx", i); - count = 0; - } - } + rtas = of_find_node_by_path("/rtas"); + if (!rtas) + return; - if (count > 0) - printk("-0x%lx", i); - printk("\n"); + /* + * ibm,current-associativity-domains is a fairly recent property. If + * it doesn't exist, then fallback on ibm,max-associativity-domains. + * Current denotes what the platform can support compared to max + * which denotes what the Hypervisor can support. + * + * If the LPAR is migratable, new nodes might be activated after a LPM, + * so we should consider the max number in that case. 
+ */ + root = of_find_node_by_path("/"); + if (!of_get_property(root, "ibm,migratable-partition", NULL)) + domains = of_get_property(rtas, + "ibm,current-associativity-domains", + &prop_length); + of_node_put(root); + if (!domains) { + domains = of_get_property(rtas, "ibm,max-associativity-domains", + &prop_length); + if (!domains) + goto out; } -} -/* - * Allocate some memory, satisfying the memblock or bootmem allocator where - * required. nid is the preferred node and end is the physical address of - * the highest address in the node. - * - * Returns the virtual address of the memory. - */ -static void __init *careful_zallocation(int nid, unsigned long size, - unsigned long align, - unsigned long end_pfn) -{ - void *ret; - int new_nid; - unsigned long ret_paddr; + max_nodes = of_read_number(&domains[primary_domain_index], 1); + pr_info("Partition configured for %d NUMA nodes.\n", max_nodes); - ret_paddr = __memblock_alloc_base(size, align, end_pfn << PAGE_SHIFT); + for (i = 0; i < max_nodes; i++) { + if (!node_possible(i)) + node_set(i, node_possible_map); + } - /* retry over all memory */ - if (!ret_paddr) - ret_paddr = __memblock_alloc_base(size, align, memblock_end_of_DRAM()); + prop_length /= sizeof(int); + if (prop_length > primary_domain_index + 2) + coregroup_enabled = 1; - if (!ret_paddr) - panic("numa.c: cannot allocate %lu bytes for node %d", - size, nid); +out: + of_node_put(rtas); +} + +void __init mem_topology_setup(void) +{ + int cpu; - ret = __va(ret_paddr); + max_low_pfn = max_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; + min_low_pfn = MEMORY_START >> PAGE_SHIFT; /* - * We initialize the nodes in numeric order: 0, 1, 2... - * and hand over control from the MEMBLOCK allocator to the - * bootmem allocator. If this function is called for - * node 5, then we know that all nodes <5 are using the - * bootmem allocator instead of the MEMBLOCK allocator. - * - * So, check the nid from which this allocation came - * and double check to see if we need to use bootmem - * instead of the MEMBLOCK. We don't free the MEMBLOCK memory - * since it would be useless. + * Linux/mm assumes node 0 to be online at boot. However this is not + * true on PowerPC, where node 0 is similar to any other node, it + * could be cpuless, memoryless node. So force node 0 to be offline + * for now. This will prevent cpuless, memoryless node 0 showing up + * unnecessarily as online. If a node has cpus or memory that need + * to be online, then node will anyway be marked online. */ - new_nid = early_pfn_to_nid(ret_paddr >> PAGE_SHIFT); - if (new_nid < nid) { - ret = __alloc_bootmem_node(NODE_DATA(new_nid), - size, align, 0); + node_set_offline(0); - dbg("alloc_bootmem %p %lx\n", ret, size); - } + if (parse_numa_properties()) + setup_nonnuma(); - memset(ret, 0, size); - return ret; -} + /* + * Modify the set of possible NUMA nodes to reflect information + * available about the set of online nodes, and the set of nodes + * that we expect to make use of for this platform's affinity + * calculations. + */ + nodes_and(node_possible_map, node_possible_map, node_online_map); -static struct notifier_block ppc64_numa_nb = { - .notifier_call = cpu_numa_callback, - .priority = 1 /* Must run before sched domains notifier. 
*/ -}; + find_possible_nodes(); -static void __init mark_reserved_regions_for_nid(int nid) -{ - struct pglist_data *node = NODE_DATA(nid); - struct memblock_region *reg; - - for_each_memblock(reserved, reg) { - unsigned long physbase = reg->base; - unsigned long size = reg->size; - unsigned long start_pfn = physbase >> PAGE_SHIFT; - unsigned long end_pfn = PFN_UP(physbase + size); - struct node_active_region node_ar; - unsigned long node_end_pfn = node->node_start_pfn + - node->node_spanned_pages; + setup_node_to_cpumask_map(); + + reset_numa_cpu_lookup_table(); + for_each_possible_cpu(cpu) { /* - * Check to make sure that this memblock.reserved area is - * within the bounds of the node that we care about. - * Checking the nid of the start and end points is not - * sufficient because the reserved area could span the - * entire node. + * Powerpc with CONFIG_NUMA always used to have a node 0, + * even if it was memoryless or cpuless. For all cpus that + * are possible but not present, cpu_to_node() would point + * to node 0. To remove a cpuless, memoryless dummy node, + * powerpc need to make sure all possible but not present + * cpu_to_node are set to a proper node. */ - if (end_pfn <= node->node_start_pfn || - start_pfn >= node_end_pfn) - continue; - - get_node_active_region(start_pfn, &node_ar); - while (start_pfn < end_pfn && - node_ar.start_pfn < node_ar.end_pfn) { - unsigned long reserve_size = size; - /* - * if reserved region extends past active region - * then trim size to active region - */ - if (end_pfn > node_ar.end_pfn) - reserve_size = (node_ar.end_pfn << PAGE_SHIFT) - - physbase; - /* - * Only worry about *this* node, others may not - * yet have valid NODE_DATA(). - */ - if (node_ar.nid == nid) { - dbg("reserve_bootmem %lx %lx nid=%d\n", - physbase, reserve_size, node_ar.nid); - reserve_bootmem_node(NODE_DATA(node_ar.nid), - physbase, reserve_size, - BOOTMEM_DEFAULT); - } - /* - * if reserved region is contained in the active region - * then done. - */ - if (end_pfn <= node_ar.end_pfn) - break; - - /* - * reserved region extends past the active region - * get next active region that contains this - * reserved region - */ - start_pfn = node_ar.end_pfn; - physbase = start_pfn << PAGE_SHIFT; - size = size - reserve_size; - get_node_active_region(start_pfn, &node_ar); - } + numa_setup_cpu(cpu); } } - -void __init do_init_bootmem(void) +void __init initmem_init(void) { int nid; - min_low_pfn = 0; - max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; - max_pfn = max_low_pfn; - - if (parse_numa_properties()) - setup_nonnuma(); - else - dump_numa_memory_topology(); + memblock_dump_all(); for_each_online_node(nid) { unsigned long start_pfn, end_pfn; - void *bootmem_vaddr; - unsigned long bootmap_pages; get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); - - /* - * Allocate the node structure node local if possible - * - * Be careful moving this around, as it relies on all - * previous nodes' bootmem to be initialized and have - * all reserved areas marked. 
- */ - NODE_DATA(nid) = careful_zallocation(nid, - sizeof(struct pglist_data), - SMP_CACHE_BYTES, end_pfn); - - dbg("node %d\n", nid); - dbg("NODE_DATA() = %p\n", NODE_DATA(nid)); - - NODE_DATA(nid)->bdata = &bootmem_node_data[nid]; - NODE_DATA(nid)->node_start_pfn = start_pfn; - NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn; - - if (NODE_DATA(nid)->node_spanned_pages == 0) - continue; - - dbg("start_paddr = %lx\n", start_pfn << PAGE_SHIFT); - dbg("end_paddr = %lx\n", end_pfn << PAGE_SHIFT); - - bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); - bootmem_vaddr = careful_zallocation(nid, - bootmap_pages << PAGE_SHIFT, - PAGE_SIZE, end_pfn); - - dbg("bootmap_vaddr = %p\n", bootmem_vaddr); - - init_bootmem_node(NODE_DATA(nid), - __pa(bootmem_vaddr) >> PAGE_SHIFT, - start_pfn, end_pfn); - - free_bootmem_with_active_regions(nid, end_pfn); - /* - * Be very careful about moving this around. Future - * calls to careful_zallocation() depend on this getting - * done correctly. - */ - mark_reserved_regions_for_nid(nid); - sparse_memory_present_with_active_regions(nid); + setup_node_data(nid, start_pfn, end_pfn); } - init_bootmem_done = 1; + sparse_init(); /* - * Now bootmem is initialised we can create the node to cpumask - * lookup tables and setup the cpu callback to populate them. + * We need the numa_cpu_lookup_table to be accurate for all CPUs, + * even before we online them, so that we can use cpu_to_{node,mem} + * early in boot, cf. smp_prepare_cpus(). + * _nocalls() + manual invocation is used because cpuhp is not yet + * initialized for the boot CPU. */ - setup_node_to_cpumask_map(); - - register_cpu_notifier(&ppc64_numa_nb); - cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE, - (void *)(unsigned long)boot_cpuid); -} - -void __init paging_init(void) -{ - unsigned long max_zone_pfns[MAX_NR_ZONES]; - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - max_zone_pfns[ZONE_DMA] = memblock_end_of_DRAM() >> PAGE_SHIFT; - free_area_init_nodes(max_zone_pfns); + cpuhp_setup_state_nocalls(CPUHP_POWER_NUMA_PREPARE, "powerpc/numa:prepare", + ppc_numa_cpu_prepare, ppc_numa_cpu_dead); } static int __init early_numa(char *p) @@ -1085,9 +1234,6 @@ static int __init early_numa(char *p) if (strstr(p, "off")) numa_enabled = 0; - if (strstr(p, "debug")) - numa_debug = 1; - p = strstr(p, "fake="); if (p) cmdline = p + strlen("fake="); @@ -1102,43 +1248,26 @@ early_param("numa", early_numa); * memory represented in the device tree by the property * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory. 
*/ -static int hot_add_drconf_scn_to_nid(struct device_node *memory, - unsigned long scn_addr) +static int hot_add_drconf_scn_to_nid(unsigned long scn_addr) { - const u32 *dm; - unsigned int drconf_cell_cnt, rc; + struct drmem_lmb *lmb; unsigned long lmb_size; - struct assoc_arrays aa; - int nid = -1; + int nid = NUMA_NO_NODE; - drconf_cell_cnt = of_get_drconf_memory(memory, &dm); - if (!drconf_cell_cnt) - return -1; - - lmb_size = of_get_lmb_size(memory); - if (!lmb_size) - return -1; - - rc = of_get_assoc_arrays(memory, &aa); - if (rc) - return -1; - - for (; drconf_cell_cnt != 0; --drconf_cell_cnt) { - struct of_drconf_cell drmem; - - read_drconf_cell(&drmem, &dm); + lmb_size = drmem_lmb_size(); + for_each_drmem_lmb(lmb) { /* skip this block if it is reserved or not assigned to * this partition */ - if ((drmem.flags & DRCONF_MEM_RESERVED) - || !(drmem.flags & DRCONF_MEM_ASSIGNED)) + if ((lmb->flags & DRCONF_MEM_RESERVED) + || !(lmb->flags & DRCONF_MEM_ASSIGNED)) continue; - if ((scn_addr < drmem.base_addr) - || (scn_addr >= (drmem.base_addr + lmb_size))) + if ((scn_addr < lmb->base_addr) + || (scn_addr >= (lmb->base_addr + lmb_size))) continue; - nid = of_drconf_to_nid_single(&drmem, &aa); + nid = of_drconf_to_nid_single(lmb); break; } @@ -1150,29 +1279,21 @@ static int hot_add_drconf_scn_to_nid(struct device_node *memory, * represented in the device tree as a node (i.e. memory@XXXX) for * each memblock. */ -int hot_add_node_scn_to_nid(unsigned long scn_addr) +static int hot_add_node_scn_to_nid(unsigned long scn_addr) { struct device_node *memory; - int nid = -1; + int nid = NUMA_NO_NODE; for_each_node_by_type(memory, "memory") { - unsigned long start, size; - int ranges; - const unsigned int *memcell_buf; - unsigned int len; + int i = 0; - memcell_buf = of_get_property(memory, "reg", &len); - if (!memcell_buf || len <= 0) - continue; - - /* ranges in cell */ - ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells); + while (1) { + struct resource res; - while (ranges--) { - start = read_n_cells(n_mem_addr_cells, &memcell_buf); - size = read_n_cells(n_mem_size_cells, &memcell_buf); + if (of_address_to_resource(memory, i++, &res)) + break; - if ((scn_addr < start) || (scn_addr >= (start + size))) + if ((scn_addr < res.start) || (scn_addr > res.end)) continue; nid = of_node_to_nid_single(memory); @@ -1196,50 +1317,45 @@ int hot_add_node_scn_to_nid(unsigned long scn_addr) int hot_add_scn_to_nid(unsigned long scn_addr) { struct device_node *memory = NULL; - int nid, found = 0; + int nid; - if (!numa_enabled || (min_common_depth < 0)) + if (!numa_enabled) return first_online_node; memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); if (memory) { - nid = hot_add_drconf_scn_to_nid(memory, scn_addr); + nid = hot_add_drconf_scn_to_nid(scn_addr); of_node_put(memory); } else { nid = hot_add_node_scn_to_nid(scn_addr); } - if (nid < 0 || !node_online(nid)) + if (nid < 0 || !node_possible(nid)) nid = first_online_node; - if (NODE_DATA(nid)->node_spanned_pages) - return nid; - - for_each_online_node(nid) { - if (NODE_DATA(nid)->node_spanned_pages) { - found = 1; - break; - } - } - - BUG_ON(!found); return nid; } -static u64 hot_add_drconf_memory_max(void) +u64 hot_add_drconf_memory_max(void) { - struct device_node *memory = NULL; - unsigned int drconf_cell_cnt = 0; - u64 lmb_size = 0; - const u32 *dm = 0; - - memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); - if (memory) { - drconf_cell_cnt = of_get_drconf_memory(memory, &dm); - lmb_size = 
of_get_lmb_size(memory); - of_node_put(memory); - } - return lmb_size * drconf_cell_cnt; + struct device_node *memory = NULL; + struct device_node *dn = NULL; + const __be64 *lrdr = NULL; + + dn = of_find_node_by_path("/rtas"); + if (dn) { + lrdr = of_get_property(dn, "ibm,lrdr-capacity", NULL); + of_node_put(dn); + if (lrdr) + return be64_to_cpup(lrdr); + } + + memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); + if (memory) { + of_node_put(memory); + return drmem_lmb_memory_max(); + } + return 0; } /* @@ -1256,417 +1372,95 @@ u64 memory_hotplug_max(void) /* Virtual Processor Home Node (VPHN) support */ #ifdef CONFIG_PPC_SPLPAR -struct topology_update_data { - struct topology_update_data *next; - unsigned int cpu; - int old_nid; - int new_nid; -}; - -static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS]; -static cpumask_t cpu_associativity_changes_mask; -static int vphn_enabled; -static int prrn_enabled; -static void reset_topology_timer(void); - -/* - * Store the current values of the associativity change counters in the - * hypervisor. - */ -static void setup_cpu_associativity_change_counters(void) -{ - int cpu; - - /* The VPHN feature supports a maximum of 8 reference points */ - BUILD_BUG_ON(MAX_DISTANCE_REF_POINTS > 8); - - for_each_possible_cpu(cpu) { - int i; - u8 *counts = vphn_cpu_change_counts[cpu]; - volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts; - - for (i = 0; i < distance_ref_points_depth; i++) - counts[i] = hypervisor_counts[i]; - } -} - -/* - * The hypervisor maintains a set of 8 associativity change counters in - * the VPA of each cpu that correspond to the associativity levels in the - * ibm,associativity-reference-points property. When an associativity - * level changes, the corresponding counter is incremented. - * - * Set a bit in cpu_associativity_changes_mask for each cpu whose home - * node associativity levels have changed. - * - * Returns the number of cpus with unhandled associativity changes. - */ -static int update_cpu_associativity_changes_mask(void) -{ - int cpu; - cpumask_t *changes = &cpu_associativity_changes_mask; - - for_each_possible_cpu(cpu) { - int i, changed = 0; - u8 *counts = vphn_cpu_change_counts[cpu]; - volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts; - - for (i = 0; i < distance_ref_points_depth; i++) { - if (hypervisor_counts[i] != counts[i]) { - counts[i] = hypervisor_counts[i]; - changed = 1; - } - } - if (changed) { - cpumask_set_cpu(cpu, changes); - } - } - - return cpumask_weight(changes); -} - -/* - * 6 64-bit registers unpacked into 12 32-bit associativity values. To form - * the complete property we have to add the length in the first cell. - */ -#define VPHN_ASSOC_BUFSIZE (6*sizeof(u64)/sizeof(u32) + 1) - -/* - * Convert the associativity domain numbers returned from the hypervisor - * to the sequence they would appear in the ibm,associativity property. - */ -static int vphn_unpack_associativity(const long *packed, unsigned int *unpacked) -{ - int i, nr_assoc_doms = 0; - const u16 *field = (const u16*) packed; - -#define VPHN_FIELD_UNUSED (0xffff) -#define VPHN_FIELD_MSB (0x8000) -#define VPHN_FIELD_MASK (~VPHN_FIELD_MSB) - - for (i = 1; i < VPHN_ASSOC_BUFSIZE; i++) { - if (*field == VPHN_FIELD_UNUSED) { - /* All significant fields processed, and remaining - * fields contain the reserved value of all 1's. - * Just store them. 
- */ - unpacked[i] = *((u32*)field); - field += 2; - } else if (*field & VPHN_FIELD_MSB) { - /* Data is in the lower 15 bits of this field */ - unpacked[i] = *field & VPHN_FIELD_MASK; - field++; - nr_assoc_doms++; - } else { - /* Data is in the lower 15 bits of this field - * concatenated with the next 16 bit field - */ - unpacked[i] = *((u32*)field); - field += 2; - nr_assoc_doms++; - } - } - - /* The first cell contains the length of the property */ - unpacked[0] = nr_assoc_doms; - - return nr_assoc_doms; -} +static int topology_inited; /* * Retrieve the new associativity information for a virtual processor's * home node. */ -static long hcall_vphn(unsigned long cpu, unsigned int *associativity) -{ - long rc; - long retbuf[PLPAR_HCALL9_BUFSIZE] = {0}; - u64 flags = 1; - int hwcpu = get_hard_smp_processor_id(cpu); - - rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, retbuf, flags, hwcpu); - vphn_unpack_associativity(retbuf, associativity); - - return rc; -} - static long vphn_get_associativity(unsigned long cpu, - unsigned int *associativity) + __be32 *associativity) { long rc; - rc = hcall_vphn(cpu, associativity); + rc = hcall_vphn(get_hard_smp_processor_id(cpu), + VPHN_FLAG_VCPU, associativity); switch (rc) { + case H_SUCCESS: + pr_debug("VPHN hcall succeeded. Reset polling...\n"); + goto out; + case H_FUNCTION: - printk(KERN_INFO - "VPHN is not supported. Disabling polling...\n"); - stop_topology_update(); + pr_err_ratelimited("VPHN unsupported. Disabling polling...\n"); break; case H_HARDWARE: - printk(KERN_ERR - "hcall_vphn() experienced a hardware fault " + pr_err_ratelimited("hcall_vphn() experienced a hardware fault " "preventing VPHN. Disabling polling...\n"); - stop_topology_update(); - } - - return rc; -} - -/* - * Update the CPU maps and sysfs entries for a single CPU when its NUMA - * characteristics change. This function doesn't perform any locking and is - * only safe to call from stop_machine(). - */ -static int update_cpu_topology(void *data) -{ - struct topology_update_data *update; - unsigned long cpu; - - if (!data) - return -EINVAL; - - cpu = get_cpu(); - - for (update = data; update; update = update->next) { - if (cpu != update->cpu) - continue; - - unmap_cpu_from_node(update->cpu); - map_cpu_to_node(update->cpu, update->new_nid); - vdso_getcpu_init(); - } - - return 0; -} - -/* - * Update the node maps and sysfs entries for each cpu whose home node - * has changed. Returns 1 when the topology has changed, and 0 otherwise. 
- */ -int arch_update_cpu_topology(void) -{ - unsigned int cpu, changed = 0; - struct topology_update_data *updates, *ud; - unsigned int associativity[VPHN_ASSOC_BUFSIZE] = {0}; - cpumask_t updated_cpus; - struct device *dev; - int weight, i = 0; - - weight = cpumask_weight(&cpu_associativity_changes_mask); - if (!weight) - return 0; - - updates = kzalloc(weight * (sizeof(*updates)), GFP_KERNEL); - if (!updates) - return 0; - - cpumask_clear(&updated_cpus); - - for_each_cpu(cpu, &cpu_associativity_changes_mask) { - ud = &updates[i++]; - ud->cpu = cpu; - vphn_get_associativity(cpu, associativity); - ud->new_nid = associativity_to_nid(associativity); - - if (ud->new_nid < 0 || !node_online(ud->new_nid)) - ud->new_nid = first_online_node; - - ud->old_nid = numa_cpu_lookup_table[cpu]; - cpumask_set_cpu(cpu, &updated_cpus); - - if (i < weight) - ud->next = &updates[i]; - } - - stop_machine(update_cpu_topology, &updates[0], &updated_cpus); - - for (ud = &updates[0]; ud; ud = ud->next) { - unregister_cpu_under_node(ud->cpu, ud->old_nid); - register_cpu_under_node(ud->cpu, ud->new_nid); - - dev = get_cpu_device(ud->cpu); - if (dev) - kobject_uevent(&dev->kobj, KOBJ_CHANGE); - cpumask_clear_cpu(ud->cpu, &cpu_associativity_changes_mask); - changed = 1; - } - - kfree(updates); - return changed; -} - -static void topology_work_fn(struct work_struct *work) -{ - rebuild_sched_domains(); -} -static DECLARE_WORK(topology_work, topology_work_fn); - -void topology_schedule_update(void) -{ - schedule_work(&topology_work); -} - -static void topology_timer_fn(unsigned long ignored) -{ - if (prrn_enabled && cpumask_weight(&cpu_associativity_changes_mask)) - topology_schedule_update(); - else if (vphn_enabled) { - if (update_cpu_associativity_changes_mask() > 0) - topology_schedule_update(); - reset_topology_timer(); - } -} -static struct timer_list topology_timer = - TIMER_INITIALIZER(topology_timer_fn, 0, 0); - -static void reset_topology_timer(void) -{ - topology_timer.data = 0; - topology_timer.expires = jiffies + 60 * HZ; - mod_timer(&topology_timer, topology_timer.expires); -} - -#ifdef CONFIG_SMP - -static void stage_topology_update(int core_id) -{ - cpumask_or(&cpu_associativity_changes_mask, - &cpu_associativity_changes_mask, cpu_sibling_mask(core_id)); - reset_topology_timer(); -} - -static int dt_update_callback(struct notifier_block *nb, - unsigned long action, void *data) -{ - struct of_prop_reconfig *update; - int rc = NOTIFY_DONE; - - switch (action) { - case OF_RECONFIG_UPDATE_PROPERTY: - update = (struct of_prop_reconfig *)data; - if (!of_prop_cmp(update->dn->type, "cpu") && - !of_prop_cmp(update->prop->name, "ibm,associativity")) { - u32 core_id; - of_property_read_u32(update->dn, "reg", &core_id); - stage_topology_update(core_id); - rc = NOTIFY_OK; - } + break; + case H_PARAMETER: + pr_err_ratelimited("hcall_vphn() was passed an invalid parameter. " + "Disabling polling...\n"); + break; + default: + pr_err_ratelimited("hcall_vphn() returned %ld. Disabling polling...\n" + , rc); break; } - - return rc; -} - -static struct notifier_block dt_update_nb = { - .notifier_call = dt_update_callback, -}; - -#endif - -/* - * Start polling for associativity changes. 
- */ -int start_topology_update(void) -{ - int rc = 0; - - if (firmware_has_feature(FW_FEATURE_PRRN)) { - if (!prrn_enabled) { - prrn_enabled = 1; - vphn_enabled = 0; -#ifdef CONFIG_SMP - rc = of_reconfig_notifier_register(&dt_update_nb); -#endif - } - } else if (firmware_has_feature(FW_FEATURE_VPHN) && - get_lppaca()->shared_proc) { - if (!vphn_enabled) { - prrn_enabled = 0; - vphn_enabled = 1; - setup_cpu_associativity_change_counters(); - init_timer_deferrable(&topology_timer); - reset_topology_timer(); - } - } - +out: return rc; } -/* - * Disable polling for VPHN associativity changes. - */ -int stop_topology_update(void) +void find_and_update_cpu_nid(int cpu) { - int rc = 0; - - if (prrn_enabled) { - prrn_enabled = 0; -#ifdef CONFIG_SMP - rc = of_reconfig_notifier_unregister(&dt_update_nb); -#endif - } else if (vphn_enabled) { - vphn_enabled = 0; - rc = del_timer_sync(&topology_timer); - } + __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0}; + int new_nid; - return rc; -} + /* Use associativity from first thread for all siblings */ + if (vphn_get_associativity(cpu, associativity)) + return; -int prrn_is_enabled(void) -{ - return prrn_enabled; -} + /* Do not have previous associativity, so find it now. */ + new_nid = associativity_to_nid(associativity); -static int topology_read(struct seq_file *file, void *v) -{ - if (vphn_enabled || prrn_enabled) - seq_puts(file, "on\n"); + if (new_nid < 0 || !node_possible(new_nid)) + new_nid = first_online_node; else - seq_puts(file, "off\n"); + // Associate node <-> cpu, so cpu_up() calls + // try_online_node() on the right node. + set_cpu_numa_node(cpu, new_nid); - return 0; + pr_debug("%s:%d cpu %d nid %d\n", __func__, __LINE__, cpu, new_nid); } -static int topology_open(struct inode *inode, struct file *file) +int cpu_to_coregroup_id(int cpu) { - return single_open(file, topology_read, NULL); -} + __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0}; + int index; -static ssize_t topology_write(struct file *file, const char __user *buf, - size_t count, loff_t *off) -{ - char kbuf[4]; /* "on" or "off" plus null. */ - int read_len; + if (cpu < 0 || cpu > nr_cpu_ids) + return -1; - read_len = count < 3 ? count : 3; - if (copy_from_user(kbuf, buf, read_len)) - return -EINVAL; + if (!coregroup_enabled) + goto out; - kbuf[read_len] = '\0'; + if (!firmware_has_feature(FW_FEATURE_VPHN)) + goto out; - if (!strncmp(kbuf, "on", 2)) - start_topology_update(); - else if (!strncmp(kbuf, "off", 3)) - stop_topology_update(); - else - return -EINVAL; + if (vphn_get_associativity(cpu, associativity)) + goto out; - return count; -} + index = of_read_number(associativity, 1); + if (index > primary_domain_index + 1) + return of_read_number(&associativity[index - 1], 1); -static const struct file_operations topology_ops = { - .read = seq_read, - .write = topology_write, - .open = topology_open, - .release = single_release -}; +out: + return cpu_to_core_id(cpu); +} static int topology_update_init(void) { - start_topology_update(); - proc_create("powerpc/topology_updates", 644, NULL, &topology_ops); - + topology_inited = 1; return 0; } device_initcall(topology_update_init); diff --git a/arch/powerpc/mm/pageattr.c b/arch/powerpc/mm/pageattr.c new file mode 100644 index 000000000000..ac22bf28086f --- /dev/null +++ b/arch/powerpc/mm/pageattr.c @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * MMU-generic set_memory implementation for powerpc + * + * Copyright 2019-2021, IBM Corporation. 
+ */ + +#include <linux/mm.h> +#include <linux/vmalloc.h> +#include <linux/set_memory.h> + +#include <asm/mmu.h> +#include <asm/page.h> +#include <asm/pgtable.h> + +#include <mm/mmu_decl.h> + +static pte_basic_t pte_update_delta(pte_t *ptep, unsigned long addr, + unsigned long old, unsigned long new) +{ + return pte_update(&init_mm, addr, ptep, old & ~new, new & ~old, 0); +} + +/* + * Updates the attributes of a page atomically. + * + * This sequence is safe against concurrent updates, and also allows updating the + * attributes of a page currently being executed or accessed. + */ +static int change_page_attr(pte_t *ptep, unsigned long addr, void *data) +{ + long action = (long)data; + + addr &= PAGE_MASK; + /* modify the PTE bits as desired */ + switch (action) { + case SET_MEMORY_RO: + /* Don't clear DIRTY bit */ + pte_update_delta(ptep, addr, _PAGE_KERNEL_RW & ~_PAGE_DIRTY, _PAGE_KERNEL_RO); + break; + case SET_MEMORY_ROX: + /* Don't clear DIRTY bit */ + pte_update_delta(ptep, addr, _PAGE_KERNEL_RW & ~_PAGE_DIRTY, _PAGE_KERNEL_ROX); + break; + case SET_MEMORY_RW: + pte_update_delta(ptep, addr, _PAGE_KERNEL_RO, _PAGE_KERNEL_RW); + break; + case SET_MEMORY_NX: + pte_update_delta(ptep, addr, _PAGE_KERNEL_ROX, _PAGE_KERNEL_RO); + break; + case SET_MEMORY_X: + pte_update_delta(ptep, addr, _PAGE_KERNEL_RO, _PAGE_KERNEL_ROX); + break; + case SET_MEMORY_NP: + pte_update(&init_mm, addr, ptep, _PAGE_PRESENT, 0, 0); + break; + case SET_MEMORY_P: + pte_update(&init_mm, addr, ptep, 0, _PAGE_PRESENT, 0); + break; + default: + WARN_ON_ONCE(1); + break; + } + + /* See ptesync comment in radix__set_pte_at() */ + if (radix_enabled()) + asm volatile("ptesync": : :"memory"); + + flush_tlb_kernel_range(addr, addr + PAGE_SIZE); + + return 0; +} + +int change_memory_attr(unsigned long addr, int numpages, long action) +{ + unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE); + unsigned long size = numpages * PAGE_SIZE; + + if (!numpages) + return 0; + + if (WARN_ON_ONCE(is_vmalloc_or_module_addr((void *)addr) && + is_vm_area_hugepages((void *)addr))) + return -EINVAL; + +#ifdef CONFIG_PPC_BOOK3S_64 + /* + * On hash, the linear mapping is not in the Linux page table so + * apply_to_existing_page_range() will have no effect. If in the future + * the set_memory_* functions are used on the linear map this will need + * to be updated. 
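The switch above is reached through change_memory_attr(), which the set_memory_*() helpers wrap on powerpc. The sketch below is illustrative only and not part of the patch; it assumes the usual asm/set_memory.h wrappers (set_memory_ro()/set_memory_rw()) and a hypothetical demo_protect_page() caller.

#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/set_memory.h>
#include <linux/vmalloc.h>

/*
 * Hypothetical caller: write-protect one vmalloc'd page, then restore it.
 * set_memory_ro()/set_memory_rw() funnel into change_memory_attr() with
 * SET_MEMORY_RO/SET_MEMORY_RW, which walks the range one PTE at a time
 * through apply_to_existing_page_range() and flushes the kernel TLB range.
 */
static int demo_protect_page(void)
{
	void *buf = vmalloc(PAGE_SIZE);
	int err;

	if (!buf)
		return -ENOMEM;

	err = set_memory_ro((unsigned long)buf, 1);	/* numpages == 1 */
	if (!err) {
		/* buf is now read-only from kernel context */
		err = set_memory_rw((unsigned long)buf, 1);
	}
	vfree(buf);
	return err;
}

Callers pass a page count rather than a byte length; change_memory_attr() aligns the start down to a page boundary and sizes the walk as numpages * PAGE_SIZE.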
+ */ + if (!radix_enabled()) { + int region = get_region_id(addr); + + if (WARN_ON_ONCE(region != VMALLOC_REGION_ID && region != IO_REGION_ID)) + return -EINVAL; + } +#endif + + return apply_to_existing_page_range(&init_mm, start, size, + change_page_attr, (void *)action); +} + +#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE) +#ifdef CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC +void __kernel_map_pages(struct page *page, int numpages, int enable) +{ + int err; + unsigned long addr = (unsigned long)page_address(page); + + if (PageHighMem(page)) + return; + + if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !radix_enabled()) + err = hash__kernel_map_pages(page, numpages, enable); + else if (enable) + err = set_memory_p(addr, numpages); + else + err = set_memory_np(addr, numpages); + + if (err) + panic("%s: changing memory protections failed\n", __func__); +} +#endif +#endif diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c new file mode 100644 index 000000000000..77e55eac16e4 --- /dev/null +++ b/arch/powerpc/mm/pgtable-frag.c @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Handling Page Tables through page fragments + * + */ + +#include <linux/kernel.h> +#include <linux/gfp.h> +#include <linux/mm.h> +#include <linux/percpu.h> +#include <linux/hardirq.h> +#include <linux/hugetlb.h> +#include <asm/pgalloc.h> +#include <asm/tlbflush.h> +#include <asm/tlb.h> + +void pte_frag_destroy(void *pte_frag) +{ + int count; + struct ptdesc *ptdesc; + + ptdesc = virt_to_ptdesc(pte_frag); + /* drop all the pending references */ + count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT; + /* We allow PTE_FRAG_NR fragments from a PTE page */ + if (atomic_sub_and_test(PTE_FRAG_NR - count, &ptdesc->pt_frag_refcount)) { + pagetable_dtor(ptdesc); + pagetable_free(ptdesc); + } +} + +static pte_t *get_pte_from_cache(struct mm_struct *mm) +{ + void *pte_frag, *ret; + + if (PTE_FRAG_NR == 1) + return NULL; + + spin_lock(&mm->page_table_lock); + ret = pte_frag_get(&mm->context); + if (ret) { + pte_frag = ret + PTE_FRAG_SIZE; + /* + * If we have taken up all the fragments mark PTE page NULL + */ + if (((unsigned long)pte_frag & ~PAGE_MASK) == 0) + pte_frag = NULL; + pte_frag_set(&mm->context, pte_frag); + } + spin_unlock(&mm->page_table_lock); + return (pte_t *)ret; +} + +static pte_t *__alloc_for_ptecache(struct mm_struct *mm, int kernel) +{ + void *ret = NULL; + struct ptdesc *ptdesc; + gfp_t gfp = PGALLOC_GFP; + + if (!kernel) + gfp |= __GFP_ACCOUNT; + + ptdesc = pagetable_alloc(gfp, 0); + if (!ptdesc) + return NULL; + if (!pagetable_pte_ctor(mm, ptdesc)) { + pagetable_free(ptdesc); + return NULL; + } + + atomic_set(&ptdesc->pt_frag_refcount, 1); + + ret = ptdesc_address(ptdesc); + /* + * if we support only one fragment just return the + * allocated page. + */ + if (PTE_FRAG_NR == 1) + return ret; + spin_lock(&mm->page_table_lock); + /* + * If we find ptdesc_page set, we return + * the allocated page with single fragment + * count. 
+ */ + if (likely(!pte_frag_get(&mm->context))) { + atomic_set(&ptdesc->pt_frag_refcount, PTE_FRAG_NR); + pte_frag_set(&mm->context, ret + PTE_FRAG_SIZE); + } + spin_unlock(&mm->page_table_lock); + + return (pte_t *)ret; +} + +pte_t *pte_fragment_alloc(struct mm_struct *mm, int kernel) +{ + pte_t *pte; + + pte = get_pte_from_cache(mm); + if (pte) + return pte; + + return __alloc_for_ptecache(mm, kernel); +} + +static void pte_free_now(struct rcu_head *head) +{ + struct ptdesc *ptdesc; + + ptdesc = container_of(head, struct ptdesc, pt_rcu_head); + pagetable_dtor(ptdesc); + pagetable_free(ptdesc); +} + +void pte_fragment_free(unsigned long *table, int kernel) +{ + struct ptdesc *ptdesc = virt_to_ptdesc(table); + + if (pagetable_is_reserved(ptdesc)) + return free_reserved_ptdesc(ptdesc); + + BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0); + if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) { + if (kernel || !folio_test_clear_active(ptdesc_folio(ptdesc))) + pte_free_now(&ptdesc->pt_rcu_head); + else + call_rcu(&ptdesc->pt_rcu_head, pte_free_now); + } +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable) +{ + struct folio *folio; + + folio = virt_to_folio(pgtable); + folio_set_active(folio); + pte_fragment_free((unsigned long *)pgtable, 0); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index edda589795c3..56d7e8960e77 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * This file contains common routines for dealing with free of page tables * Along with common page table handling code @@ -14,25 +15,26 @@ * * Dave Engebretsen <engebret@us.ibm.com> * Rework for PPC64 port. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/kernel.h> #include <linux/gfp.h> #include <linux/mm.h> -#include <linux/init.h> #include <linux/percpu.h> #include <linux/hardirq.h> #include <linux/hugetlb.h> -#include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm/tlb.h> +#include <asm/hugetlb.h> +#include <asm/pte-walk.h> + +#ifdef CONFIG_PPC64 +#define PGD_ALIGN (sizeof(pgd_t) * MAX_PTRS_PER_PGD) +#else +#define PGD_ALIGN PAGE_SIZE +#endif -#include "mmu_decl.h" +pgd_t swapper_pg_dir[MAX_PTRS_PER_PGD] __section(".bss..page_aligned") __aligned(PGD_ALIGN); static inline int is_exec_fault(void) { @@ -41,17 +43,22 @@ static inline int is_exec_fault(void) /* We only try to do i/d cache coherency on stuff that looks like * reasonably "normal" PTEs. We currently require a PTE to be present - * and we avoid _PAGE_SPECIAL and _PAGE_NO_CACHE. We also only do that + * and we avoid _PAGE_SPECIAL and cache inhibited pte. 
We also only do that * on userspace PTEs */ -static inline int pte_looks_normal(pte_t pte) +static inline int pte_looks_normal(pte_t pte, unsigned long addr) { - return (pte_val(pte) & - (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE | _PAGE_USER)) == - (_PAGE_PRESENT | _PAGE_USER); + + if (pte_present(pte) && !pte_special(pte)) { + if (pte_ci(pte)) + return 0; + if (!is_kernel_addr(addr)) + return 1; + } + return 0; } -struct page * maybe_pte_to_page(pte_t pte) +static struct folio *maybe_pte_to_folio(pte_t pte) { unsigned long pfn = pte_pfn(pte); struct page *page; @@ -61,10 +68,10 @@ struct page * maybe_pte_to_page(pte_t pte) page = pfn_to_page(pfn); if (PageReserved(page)) return NULL; - return page; + return page_folio(page); } -#if defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0 +#ifdef CONFIG_PPC_BOOK3S /* Server-style MMU handles coherency when hashing if HW exec permission * is supposed per page (currently 64-bit only). If not, then, we always @@ -72,84 +79,85 @@ struct page * maybe_pte_to_page(pte_t pte) * support falls into the same category. */ -static pte_t set_pte_filter(pte_t pte, unsigned long addr) +static pte_t set_pte_filter_hash(pte_t pte, unsigned long addr) { pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); - if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) || - cpu_has_feature(CPU_FTR_NOEXECUTE))) { - struct page *pg = maybe_pte_to_page(pte); - if (!pg) + if (pte_looks_normal(pte, addr) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) || + cpu_has_feature(CPU_FTR_NOEXECUTE))) { + struct folio *folio = maybe_pte_to_folio(pte); + if (!folio) return pte; - if (!test_bit(PG_arch_1, &pg->flags)) { -#ifdef CONFIG_8xx - /* On 8xx, cache control instructions (particularly - * "dcbst" from flush_dcache_icache) fault as write - * operation if there is an unpopulated TLB entry - * for the address in question. To workaround that, - * we invalidate the TLB here, thus avoiding dcbst - * misbehaviour. - */ - /* 8xx doesn't care about PID, size or ind args */ - _tlbil_va(addr, 0, 0, 0); -#endif /* CONFIG_8xx */ - flush_dcache_icache_page(pg); - set_bit(PG_arch_1, &pg->flags); + if (!test_bit(PG_dcache_clean, &folio->flags.f)) { + flush_dcache_icache_folio(folio); + set_bit(PG_dcache_clean, &folio->flags.f); } } return pte; } -static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, - int dirty) -{ - return pte; -} +#else /* CONFIG_PPC_BOOK3S */ -#else /* defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0 */ +static pte_t set_pte_filter_hash(pte_t pte, unsigned long addr) { return pte; } + +#endif /* CONFIG_PPC_BOOK3S */ /* Embedded type MMU with HW exec support. This is a bit more complicated * as we don't have two bits to spare for _PAGE_EXEC and _PAGE_HWEXEC so * instead we "filter out" the exec permission for non clean pages. + * + * This is also called once for the folio. So only work with folio->flags here. 
*/ -static pte_t set_pte_filter(pte_t pte, unsigned long addr) +static inline pte_t set_pte_filter(pte_t pte, unsigned long addr) { - struct page *pg; + struct folio *folio; + + if (radix_enabled()) + return pte; + + if (mmu_has_feature(MMU_FTR_HPTE_TABLE)) + return set_pte_filter_hash(pte, addr); /* No exec permission in the first place, move on */ - if (!(pte_val(pte) & _PAGE_EXEC) || !pte_looks_normal(pte)) + if (!pte_exec(pte) || !pte_looks_normal(pte, addr)) return pte; /* If you set _PAGE_EXEC on weird pages you're on your own */ - pg = maybe_pte_to_page(pte); - if (unlikely(!pg)) + folio = maybe_pte_to_folio(pte); + if (unlikely(!folio)) return pte; /* If the page clean, we move on */ - if (test_bit(PG_arch_1, &pg->flags)) + if (test_bit(PG_dcache_clean, &folio->flags.f)) return pte; /* If it's an exec fault, we flush the cache and make it clean */ if (is_exec_fault()) { - flush_dcache_icache_page(pg); - set_bit(PG_arch_1, &pg->flags); + flush_dcache_icache_folio(folio); + set_bit(PG_dcache_clean, &folio->flags.f); return pte; } /* Else, we filter out _PAGE_EXEC */ - return __pte(pte_val(pte) & ~_PAGE_EXEC); + return pte_exprotect(pte); } static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, int dirty) { - struct page *pg; + struct folio *folio; + + if (IS_ENABLED(CONFIG_PPC_BOOK3S_64)) + return pte; + + if (mmu_has_feature(MMU_FTR_HPTE_TABLE)) + return pte; /* So here, we only care about exec faults, as we use them * to recover lost _PAGE_EXEC and perform I$/D$ coherency * if necessary. Also if _PAGE_EXEC is already set, same deal, * we just bail out */ - if (dirty || (pte_val(pte) & _PAGE_EXEC) || !is_exec_fault()) + if (dirty || pte_exec(pte) || !is_exec_fault()) return pte; #ifdef CONFIG_DEBUG_VM @@ -162,41 +170,67 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, #endif /* CONFIG_DEBUG_VM */ /* If you set _PAGE_EXEC on weird pages you're on your own */ - pg = maybe_pte_to_page(pte); - if (unlikely(!pg)) + folio = maybe_pte_to_folio(pte); + if (unlikely(!folio)) goto bail; /* If the page is already clean, we move on */ - if (test_bit(PG_arch_1, &pg->flags)) + if (test_bit(PG_dcache_clean, &folio->flags.f)) goto bail; - /* Clean the page and set PG_arch_1 */ - flush_dcache_icache_page(pg); - set_bit(PG_arch_1, &pg->flags); + /* Clean the page and set PG_dcache_clean */ + flush_dcache_icache_folio(folio); + set_bit(PG_dcache_clean, &folio->flags.f); bail: - return __pte(pte_val(pte) | _PAGE_EXEC); + return pte_mkexec(pte); } -#endif /* !(defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0) */ - /* * set_pte stores a linux PTE into the linux page table. */ -void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, - pte_t pte) +void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t pte, unsigned int nr) { -#ifdef CONFIG_DEBUG_VM - WARN_ON(pte_present(*ptep)); -#endif + /* Note: mm->context.id might not yet have been assigned as * this context might not have been activated yet when this - * is called. + * is called. Filter the pte value and use the filtered value + * to setup all the ptes in the range. */ pte = set_pte_filter(pte, addr); - /* Perform the setting of the PTE */ - __set_pte_at(mm, addr, ptep, pte, 0); + /* + * We don't need to call arch_enter/leave_lazy_mmu_mode() + * because we expect set_ptes to be only be used on not present + * and not hw_valid ptes. Hence there is no translation cache flush + * involved that need to be batched. 
+ */ + for (;;) { + + /* + * Make sure hardware valid bit is not set. We don't do + * tlb flush for this update. + */ + VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep)); + + /* Perform the setting of the PTE */ + __set_pte_at(mm, addr, ptep, pte, 0); + if (--nr == 0) + break; + ptep++; + addr += PAGE_SIZE; + pte = pte_next_pfn(pte); + } +} + +void unmap_kernel_page(unsigned long va) +{ + pmd_t *pmdp = pmd_off_k(va); + pte_t *ptep = pte_offset_kernel(pmdp, va); + + pte_clear(&init_mm, va, ptep); + flush_tlb_kernel_range(va, va + PAGE_SIZE); } /* @@ -213,38 +247,309 @@ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, entry = set_access_flags_filter(entry, vma, dirty); changed = !pte_same(*(ptep), entry); if (changed) { - if (!is_vm_hugetlb_page(vma)) - assert_pte_locked(vma->vm_mm, address); - __ptep_set_access_flags(ptep, entry); - flush_tlb_page_nohash(vma, address); + assert_pte_locked(vma->vm_mm, address); + __ptep_set_access_flags(vma, ptep, entry, + address, mmu_virtual_psize); + } + return changed; +} + +#ifdef CONFIG_HUGETLB_PAGE +int huge_ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t pte, int dirty) +{ +#ifdef HUGETLB_NEED_PRELOAD + /* + * The "return 1" forces a call of update_mmu_cache, which will write a + * TLB entry. Without this, platforms that don't do a write of the TLB + * entry in the TLB miss handler asm will fault ad infinitum. + */ + ptep_set_access_flags(vma, addr, ptep, pte, dirty); + return 1; +#else + int changed, psize; + + pte = set_access_flags_filter(pte, vma, dirty); + changed = !pte_same(*(ptep), pte); + if (changed) { + +#ifdef CONFIG_PPC_BOOK3S_64 + struct hstate *h = hstate_vma(vma); + + psize = hstate_get_psize(h); +#ifdef CONFIG_DEBUG_VM + assert_spin_locked(huge_pte_lockptr(h, vma->vm_mm, ptep)); +#endif + +#else + /* + * Not used on non book3s64 platforms. + * 8xx compares it with mmu_virtual_psize to + * know if it is a huge page or not. + */ + psize = MMU_PAGE_COUNT; +#endif + __ptep_set_access_flags(vma, ptep, pte, addr, psize); } return changed; +#endif +} + +#if defined(CONFIG_PPC_8xx) + +#if defined(CONFIG_SPLIT_PTE_PTLOCKS) || defined(CONFIG_SPLIT_PMD_PTLOCKS) +/* We need the same lock to protect the PMD table and the two PTE tables. */ +#error "8M hugetlb folios are incompatible with split page table locks" +#endif + +static void __set_huge_pte_at(pmd_t *pmd, pte_t *ptep, pte_basic_t val) +{ + pte_basic_t *entry = (pte_basic_t *)ptep; + int num, i; + + /* + * Make sure hardware valid bit is not set. We don't do + * tlb flush for this update. 
+ */ + VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep)); + + num = number_of_cells_per_pte(pmd, val, 1); + + for (i = 0; i < num; i++, entry++, val += SZ_4K) + *entry = val; +} + +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t pte, unsigned long sz) +{ + pmd_t *pmdp = pmd_off(mm, addr); + + pte = set_pte_filter(pte, addr); + + if (sz == SZ_8M) { /* Flag both PMD entries as 8M and fill both page tables */ + *pmdp = __pmd(pmd_val(*pmdp) | _PMD_PAGE_8M); + *(pmdp + 1) = __pmd(pmd_val(*(pmdp + 1)) | _PMD_PAGE_8M); + + __set_huge_pte_at(pmdp, pte_offset_kernel(pmdp, 0), pte_val(pte)); + __set_huge_pte_at(pmdp, pte_offset_kernel(pmdp + 1, 0), pte_val(pte) + SZ_4M); + } else { + __set_huge_pte_at(pmdp, ptep, pte_val(pte)); + } +} +#else +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t pte, unsigned long sz) +{ + unsigned long pdsize; + int i; + + pte = set_pte_filter(pte, addr); + + /* + * Make sure hardware valid bit is not set. We don't do + * tlb flush for this update. + */ + VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep)); + + if (sz < PMD_SIZE) + pdsize = PAGE_SIZE; + else if (sz < PUD_SIZE) + pdsize = PMD_SIZE; + else if (sz < P4D_SIZE) + pdsize = PUD_SIZE; + else if (sz < PGDIR_SIZE) + pdsize = P4D_SIZE; + else + pdsize = PGDIR_SIZE; + + for (i = 0; i < sz / pdsize; i++, ptep++, addr += pdsize) { + __set_pte_at(mm, addr, ptep, pte, 0); + pte = __pte(pte_val(pte) + ((unsigned long long)pdsize / PAGE_SIZE << PFN_PTE_SHIFT)); + } } +#endif +#endif /* CONFIG_HUGETLB_PAGE */ #ifdef CONFIG_DEBUG_VM void assert_pte_locked(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; + pte_t *pte; + spinlock_t *ptl; if (mm == &init_mm) return; pgd = mm->pgd + pgd_index(addr); BUG_ON(pgd_none(*pgd)); - pud = pud_offset(pgd, addr); + p4d = p4d_offset(pgd, addr); + BUG_ON(p4d_none(*p4d)); + pud = pud_offset(p4d, addr); BUG_ON(pud_none(*pud)); pmd = pmd_offset(pud, addr); /* * khugepaged to collapse normal pages to hugepage, first set - * pmd to none to force page fault/gup to take mmap_sem. After + * pmd to none to force page fault/gup to take mmap_lock. After * pmd is set to none, we do a pte_clear which does this assertion * so if we find pmd none, return. */ if (pmd_none(*pmd)) return; - BUG_ON(!pmd_present(*pmd)); - assert_spin_locked(pte_lockptr(mm, pmd)); + pte = pte_offset_map_ro_nolock(mm, pmd, addr, &ptl); + BUG_ON(!pte); + assert_spin_locked(ptl); + pte_unmap(pte); } #endif /* CONFIG_DEBUG_VM */ +unsigned long vmalloc_to_phys(void *va) +{ + unsigned long pfn = vmalloc_to_pfn(va); + + BUG_ON(!pfn); + return __pa(pfn_to_kaddr(pfn)) + offset_in_page(va); +} +EXPORT_SYMBOL_GPL(vmalloc_to_phys); + +/* + * We have 3 cases for pgds and pmds: + * (1) invalid (all zeroes) + * (2) pointer to next table, as normal; bottom 6 bits == 0 + * (3) leaf pte for huge page _PAGE_PTE set + * + * So long as we atomically load page table pointers we are safe against teardown, + * we can follow the address down to the page and take a ref on it. + * This function need to be called with interrupts disabled. 
We use this variant + * when we have MSR[EE] = 0 but the paca->irq_soft_mask = IRQS_ENABLED + */ +pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, + bool *is_thp, unsigned *hpage_shift) +{ + pgd_t *pgdp; +#ifdef CONFIG_PPC64 + p4d_t p4d, *p4dp; + pud_t pud, *pudp; +#endif + pmd_t pmd, *pmdp; + pte_t *ret_pte; + unsigned pdshift; + + if (hpage_shift) + *hpage_shift = 0; + + if (is_thp) + *is_thp = false; + + /* + * Always operate on the local stack value. This make sure the + * value don't get updated by a parallel THP split/collapse, + * page fault or a page unmap. The return pte_t * is still not + * stable. So should be checked there for above conditions. + * Top level is an exception because it is folded into p4d. + * + * On PPC32, P4D/PUD/PMD are folded into PGD so go straight to + * PMD level. + */ + pgdp = pgdir + pgd_index(ea); +#ifdef CONFIG_PPC64 + p4dp = p4d_offset(pgdp, ea); + p4d = READ_ONCE(*p4dp); + pdshift = P4D_SHIFT; + + if (p4d_none(p4d)) + return NULL; + + if (p4d_leaf(p4d)) { + ret_pte = (pte_t *)p4dp; + goto out; + } + + /* + * Even if we end up with an unmap, the pgtable will not + * be freed, because we do an rcu free and here we are + * irq disabled + */ + pdshift = PUD_SHIFT; + pudp = pud_offset(&p4d, ea); + pud = READ_ONCE(*pudp); + + if (pud_none(pud)) + return NULL; + + if (pud_leaf(pud)) { + ret_pte = (pte_t *)pudp; + goto out; + } + + pmdp = pmd_offset(&pud, ea); +#else + pmdp = pmd_offset(pud_offset(p4d_offset(pgdp, ea), ea), ea); +#endif + pdshift = PMD_SHIFT; + pmd = READ_ONCE(*pmdp); + + /* + * A hugepage collapse is captured by this condition, see + * pmdp_collapse_flush. + */ + if (pmd_none(pmd)) + return NULL; + +#ifdef CONFIG_PPC_BOOK3S_64 + /* + * A hugepage split is captured by this condition, see + * pmdp_invalidate. + * + * Huge page modification can be caught here too. + */ + if (pmd_is_serializing(pmd)) + return NULL; +#endif + + if (pmd_trans_huge(pmd)) { + if (is_thp) + *is_thp = true; + ret_pte = (pte_t *)pmdp; + goto out; + } + + if (pmd_leaf(pmd)) { + ret_pte = (pte_t *)pmdp; + goto out; + } + + return pte_offset_kernel(&pmd, ea); + +out: + if (hpage_shift) + *hpage_shift = pdshift; + return ret_pte; +} +EXPORT_SYMBOL_GPL(__find_linux_pte); + +/* Note due to the way vm flags are laid out, the bits are XWR */ +const pgprot_t protection_map[16] = { + [VM_NONE] = PAGE_NONE, + [VM_READ] = PAGE_READONLY, + [VM_WRITE] = PAGE_COPY, + [VM_WRITE | VM_READ] = PAGE_COPY, + [VM_EXEC] = PAGE_EXECONLY_X, + [VM_EXEC | VM_READ] = PAGE_READONLY_X, + [VM_EXEC | VM_WRITE] = PAGE_COPY_X, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_COPY_X, + [VM_SHARED] = PAGE_NONE, + [VM_SHARED | VM_READ] = PAGE_READONLY, + [VM_SHARED | VM_WRITE] = PAGE_SHARED, + [VM_SHARED | VM_WRITE | VM_READ] = PAGE_SHARED, + [VM_SHARED | VM_EXEC] = PAGE_EXECONLY_X, + [VM_SHARED | VM_EXEC | VM_READ] = PAGE_READONLY_X, + [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_SHARED_X, + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED_X +}; + +#ifndef CONFIG_PPC_BOOK3S_64 +DECLARE_VM_GET_PAGE_PROT +#endif diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 6c856fb8c15b..0c9ef705803e 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * This file contains the routines setting up the linux page tables. 
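__find_linux_pte(), added to pgtable.c above, returns a raw pointer into a live page table, so a caller must keep interrupts disabled across the walk and any dereference. A minimal caller sketch, assuming a hypothetical demo_read_pte() helper that snapshots the PTE for an address in a given mm; it is illustrative only:

#include <linux/irqflags.h>
#include <linux/mm.h>
#include <asm/pte-walk.h>

/*
 * Hypothetical caller: snapshot the PTE backing 'ea' in 'mm'. Interrupts
 * stay disabled for the whole walk and for the dereference of the returned
 * pointer; the value read is only a snapshot and may become stale at once.
 */
static pte_t demo_read_pte(struct mm_struct *mm, unsigned long ea)
{
	unsigned long flags;
	unsigned int shift;
	bool is_thp;
	pte_t *ptep;
	pte_t pte = __pte(0);

	local_irq_save(flags);
	ptep = __find_linux_pte(mm->pgd, ea, &is_thp, &shift);
	if (ptep)
		pte = READ_ONCE(*ptep);
	local_irq_restore(flags);

	return pte;
}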
* -- paulus @@ -11,12 +12,6 @@ * * Derived from "arch/i386/mm/init.c" * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * */ #include <linux/kernel.h> @@ -28,297 +23,88 @@ #include <linux/highmem.h> #include <linux/memblock.h> #include <linux/slab.h> +#include <linux/set_memory.h> -#include <asm/pgtable.h> #include <asm/pgalloc.h> #include <asm/fixmap.h> -#include <asm/io.h> #include <asm/setup.h> +#include <asm/sections.h> +#include <asm/early_ioremap.h> -#include "mmu_decl.h" - -unsigned long ioremap_base; -unsigned long ioremap_bot; -EXPORT_SYMBOL(ioremap_bot); /* aka VMALLOC_END */ - -#if defined(CONFIG_6xx) || defined(CONFIG_POWER3) -#define HAVE_BATS 1 -#endif - -#if defined(CONFIG_FSL_BOOKE) -#define HAVE_TLBCAM 1 -#endif - -extern char etext[], _stext[]; - -#ifdef HAVE_BATS -extern phys_addr_t v_mapped_by_bats(unsigned long va); -extern unsigned long p_mapped_by_bats(phys_addr_t pa); -void setbat(int index, unsigned long virt, phys_addr_t phys, - unsigned int size, int flags); - -#else /* !HAVE_BATS */ -#define v_mapped_by_bats(x) (0UL) -#define p_mapped_by_bats(x) (0UL) -#endif /* HAVE_BATS */ - -#ifdef HAVE_TLBCAM -extern unsigned int tlbcam_index; -extern phys_addr_t v_mapped_by_tlbcam(unsigned long va); -extern unsigned long p_mapped_by_tlbcam(phys_addr_t pa); -#else /* !HAVE_TLBCAM */ -#define v_mapped_by_tlbcam(x) (0UL) -#define p_mapped_by_tlbcam(x) (0UL) -#endif /* HAVE_TLBCAM */ - -#define PGDIR_ORDER (32 + PGD_T_LOG2 - PGDIR_SHIFT) +#include <mm/mmu_decl.h> -pgd_t *pgd_alloc(struct mm_struct *mm) -{ - pgd_t *ret; - - /* pgdir take page or two with 4K pages and a page fraction otherwise */ -#ifndef CONFIG_PPC_4K_PAGES - ret = kzalloc(1 << PGDIR_ORDER, GFP_KERNEL); -#else - ret = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, - PGDIR_ORDER - PAGE_SHIFT); -#endif - return ret; -} - -void pgd_free(struct mm_struct *mm, pgd_t *pgd) -{ -#ifndef CONFIG_PPC_4K_PAGES - kfree((void *)pgd); -#else - free_pages((unsigned long)pgd, PGDIR_ORDER - PAGE_SHIFT); -#endif -} - -__init_refok pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) -{ - pte_t *pte; - extern int mem_init_done; - extern void *early_get_page(void); - - if (mem_init_done) { - pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); - } else { - pte = (pte_t *)early_get_page(); - if (pte) - clear_page(pte); - } - return pte; -} +static u8 early_fixmap_pagetable[FIXMAP_PTE_SIZE] __page_aligned_data; -pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) +notrace void __init early_ioremap_init(void) { - struct page *ptepage; - - gfp_t flags = GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO; - - ptepage = alloc_pages(flags, 0); - if (!ptepage) - return NULL; - pgtable_page_ctor(ptepage); - return ptepage; -} + unsigned long addr = ALIGN_DOWN(FIXADDR_START, PGDIR_SIZE); + pte_t *ptep = (pte_t *)early_fixmap_pagetable; + pmd_t *pmdp = pmd_off_k(addr); -void __iomem * -ioremap(phys_addr_t addr, unsigned long size) -{ - return __ioremap_caller(addr, size, _PAGE_NO_CACHE | _PAGE_GUARDED, - __builtin_return_address(0)); -} -EXPORT_SYMBOL(ioremap); + for (; (s32)(FIXADDR_TOP - addr) > 0; + addr += PGDIR_SIZE, ptep += PTRS_PER_PTE, pmdp++) + pmd_populate_kernel(&init_mm, pmdp, ptep); -void __iomem * -ioremap_wc(phys_addr_t addr, 
unsigned long size) -{ - return __ioremap_caller(addr, size, _PAGE_NO_CACHE, - __builtin_return_address(0)); + early_ioremap_setup(); } -EXPORT_SYMBOL(ioremap_wc); -void __iomem * -ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags) +void __init *early_alloc_pgtable(unsigned long size) { - /* writeable implies dirty for kernel addresses */ - if (flags & _PAGE_RW) - flags |= _PAGE_DIRTY | _PAGE_HWWRITE; - - /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */ - flags &= ~(_PAGE_USER | _PAGE_EXEC); - -#ifdef _PAGE_BAP_SR - /* _PAGE_USER contains _PAGE_BAP_SR on BookE using the new PTE format - * which means that we just cleared supervisor access... oops ;-) This - * restores it - */ - flags |= _PAGE_BAP_SR; -#endif - - return __ioremap_caller(addr, size, flags, __builtin_return_address(0)); -} -EXPORT_SYMBOL(ioremap_prot); + return memblock_alloc_or_panic(size, size); -void __iomem * -__ioremap(phys_addr_t addr, unsigned long size, unsigned long flags) -{ - return __ioremap_caller(addr, size, flags, __builtin_return_address(0)); } -void __iomem * -__ioremap_caller(phys_addr_t addr, unsigned long size, unsigned long flags, - void *caller) +pte_t __init *early_pte_alloc_kernel(pmd_t *pmdp, unsigned long va) { - unsigned long v, i; - phys_addr_t p; - int err; - - /* Make sure we have the base flags */ - if ((flags & _PAGE_PRESENT) == 0) - flags |= PAGE_KERNEL; - - /* Non-cacheable page cannot be coherent */ - if (flags & _PAGE_NO_CACHE) - flags &= ~_PAGE_COHERENT; - - /* - * Choose an address to map it to. - * Once the vmalloc system is running, we use it. - * Before then, we use space going down from ioremap_base - * (ioremap_bot records where we're up to). - */ - p = addr & PAGE_MASK; - size = PAGE_ALIGN(addr + size) - p; - - /* - * If the address lies within the first 16 MB, assume it's in ISA - * memory space - */ - if (p < 16*1024*1024) - p += _ISA_MEM_BASE; + if (pmd_none(*pmdp)) { + pte_t *ptep = early_alloc_pgtable(PTE_FRAG_SIZE); -#ifndef CONFIG_CRASH_DUMP - /* - * Don't allow anybody to remap normal RAM that we're using. - * mem_init() sets high_memory so only do the check after that. - */ - if (mem_init_done && (p < virt_to_phys(high_memory)) && - !(__allow_ioremap_reserved && memblock_is_region_reserved(p, size))) { - printk("__ioremap(): phys addr 0x%llx is RAM lr %pf\n", - (unsigned long long)p, __builtin_return_address(0)); - return NULL; - } -#endif - - if (size == 0) - return NULL; - - /* - * Is it already mapped? Perhaps overlapped by a previous - * BAT mapping. If the whole area is mapped then we're done, - * otherwise remap it since we want to keep the virt addrs for - * each request contiguous. - * - * We make the assumption here that if the bottom and top - * of the range we want are mapped then it's mapped to the - * same virt address (and this is contiguous). 
- * -- Cort - */ - if ((v = p_mapped_by_bats(p)) /*&& p_mapped_by_bats(p+size-1)*/ ) - goto out; - - if ((v = p_mapped_by_tlbcam(p))) - goto out; - - if (mem_init_done) { - struct vm_struct *area; - area = get_vm_area_caller(size, VM_IOREMAP, caller); - if (area == 0) - return NULL; - area->phys_addr = p; - v = (unsigned long) area->addr; - } else { - v = (ioremap_bot -= size); - } - - /* - * Should check if it is a candidate for a BAT mapping - */ - - err = 0; - for (i = 0; i < size && err == 0; i += PAGE_SIZE) - err = map_page(v+i, p+i, flags); - if (err) { - if (mem_init_done) - vunmap((void *)v); - return NULL; + pmd_populate_kernel(&init_mm, pmdp, ptep); } - -out: - return (void __iomem *) (v + ((unsigned long)addr & ~PAGE_MASK)); + return pte_offset_kernel(pmdp, va); } -EXPORT_SYMBOL(__ioremap); - -void iounmap(volatile void __iomem *addr) -{ - /* - * If mapped by BATs then there is nothing to do. - * Calling vfree() generates a benign warning. - */ - if (v_mapped_by_bats((unsigned long)addr)) return; - if (addr > high_memory && (unsigned long) addr < ioremap_bot) - vunmap((void *) (PAGE_MASK & (unsigned long)addr)); -} -EXPORT_SYMBOL(iounmap); -int map_page(unsigned long va, phys_addr_t pa, int flags) +int __ref map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot) { pmd_t *pd; pte_t *pg; int err = -ENOMEM; /* Use upper 10 bits of VA to index the first level map */ - pd = pmd_offset(pud_offset(pgd_offset_k(va), va), va); + pd = pmd_off_k(va); /* Use middle 10 bits of VA to index the second-level map */ - pg = pte_alloc_kernel(pd, va); - if (pg != 0) { + if (likely(slab_is_available())) + pg = pte_alloc_kernel(pd, va); + else + pg = early_pte_alloc_kernel(pd, va); + if (pg) { err = 0; /* The PTE should never be already set nor present in the * hash table */ - BUG_ON((pte_val(*pg) & (_PAGE_PRESENT | _PAGE_HASHPTE)) && - flags); - set_pte_at(&init_mm, va, pg, pfn_pte(pa >> PAGE_SHIFT, - __pgprot(flags))); + BUG_ON((pte_present(*pg) | pte_hashpte(*pg)) && pgprot_val(prot)); + set_pte_at(&init_mm, va, pg, pfn_pte(pa >> PAGE_SHIFT, prot)); } + smp_wmb(); return err; } /* * Map in a chunk of physical memory starting at start. */ -void __init __mapin_ram_chunk(unsigned long offset, unsigned long top) +static void __init __mapin_ram_chunk(unsigned long offset, unsigned long top) { - unsigned long v, s, f; + unsigned long v, s; phys_addr_t p; - int ktext; + bool ktext; s = offset; v = PAGE_OFFSET + s; p = memstart_addr + s; for (; s < top; s += PAGE_SIZE) { - ktext = ((char *) v >= _stext && (char *) v < etext); - f = ktext ? PAGE_KERNEL_TEXT : PAGE_KERNEL; - map_page(v, p, f); -#ifdef CONFIG_PPC_STD_MMU_32 - if (ktext) - hash_preload(&init_mm, v, 0, 0x300); -#endif + ktext = core_kernel_text(v); + map_kernel_page(v, p, ktext ? 
PAGE_KERNEL_X : PAGE_KERNEL); v += PAGE_SIZE; p += PAGE_SIZE; } @@ -326,131 +112,71 @@ void __init __mapin_ram_chunk(unsigned long offset, unsigned long top) void __init mapin_ram(void) { - unsigned long s, top; + phys_addr_t base, end; + u64 i; -#ifndef CONFIG_WII - top = total_lowmem; - s = mmu_mapin_ram(top); - __mapin_ram_chunk(s, top); -#else - if (!wii_hole_size) { - s = mmu_mapin_ram(total_lowmem); - __mapin_ram_chunk(s, total_lowmem); - } else { - top = wii_hole_start; - s = mmu_mapin_ram(top); - __mapin_ram_chunk(s, top); + for_each_mem_range(i, &base, &end) { + phys_addr_t top = min(end, total_lowmem); - top = memblock_end_of_DRAM(); - s = wii_mmu_mapin_mem2(top); - __mapin_ram_chunk(s, top); + if (base >= top) + continue; + base = mmu_mapin_ram(base, top); + __mapin_ram_chunk(base, top); } -#endif } -/* Scan the real Linux page tables and return a PTE pointer for - * a virtual address in a context. - * Returns true (1) if PTE was found, zero otherwise. The pointer to - * the PTE pointer is unmodified if PTE is not found. - */ -int -get_pteptr(struct mm_struct *mm, unsigned long addr, pte_t **ptep, pmd_t **pmdp) +static int __mark_initmem_nx(void) { - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - int retval = 0; - - pgd = pgd_offset(mm, addr & PAGE_MASK); - if (pgd) { - pud = pud_offset(pgd, addr & PAGE_MASK); - if (pud && pud_present(*pud)) { - pmd = pmd_offset(pud, addr & PAGE_MASK); - if (pmd_present(*pmd)) { - pte = pte_offset_map(pmd, addr & PAGE_MASK); - if (pte) { - retval = 1; - *ptep = pte; - if (pmdp) - *pmdp = pmd; - /* XXX caller needs to do pte_unmap, yuck */ - } - } - } - } - return(retval); -} - -#ifdef CONFIG_DEBUG_PAGEALLOC - -static int __change_page_attr(struct page *page, pgprot_t prot) -{ - pte_t *kpte; - pmd_t *kpmd; - unsigned long address; - - BUG_ON(PageHighMem(page)); - address = (unsigned long)page_address(page); - - if (v_mapped_by_bats(address) || v_mapped_by_tlbcam(address)) - return 0; - if (!get_pteptr(&init_mm, address, &kpte, &kpmd)) - return -EINVAL; - __set_pte_at(&init_mm, address, kpte, mk_pte(page, prot), 0); - wmb(); - flush_tlb_page(NULL, address); - pte_unmap(kpte); + unsigned long numpages = PFN_UP((unsigned long)_einittext) - + PFN_DOWN((unsigned long)_sinittext); + int err; - return 0; -} + err = mmu_mark_initmem_nx(); -/* - * Change the page attributes of an page in the linear mapping. - * - * THIS CONFLICTS WITH BAT MAPPINGS, DEBUG USE ONLY - */ -static int change_page_attr(struct page *page, int numpages, pgprot_t prot) -{ - int i, err = 0; - unsigned long flags; - - local_irq_save(flags); - for (i = 0; i < numpages; i++, page++) { - err = __change_page_attr(page, prot); + if (!v_block_mapped((unsigned long)_sinittext)) { + err = set_memory_nx((unsigned long)_sinittext, numpages); if (err) - break; + return err; + err = set_memory_rw((unsigned long)_sinittext, numpages); } - local_irq_restore(flags); return err; } - -void kernel_map_pages(struct page *page, int numpages, int enable) +void mark_initmem_nx(void) { - if (PageHighMem(page)) - return; + int err = __mark_initmem_nx(); - change_page_attr(page, numpages, enable ? 
PAGE_KERNEL : __pgprot(0)); + if (err) + panic("%s() failed, err = %d\n", __func__, err); } -#endif /* CONFIG_DEBUG_PAGEALLOC */ -static int fixmaps; - -void __set_fixmap (enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags) +#ifdef CONFIG_STRICT_KERNEL_RWX +static int __mark_rodata_ro(void) { - unsigned long address = __fix_to_virt(idx); + unsigned long numpages; - if (idx >= __end_of_fixed_addresses) { - BUG(); - return; - } + if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX) && mmu_has_feature(MMU_FTR_HPTE_TABLE)) + pr_warn("This platform has HASH MMU, STRICT_MODULE_RWX won't work\n"); + + if (v_block_mapped((unsigned long)_stext + 1)) + return mmu_mark_rodata_ro(); - map_page(address, phys, pgprot_val(flags)); - fixmaps++; + /* + * mark text and rodata as read only. __end_rodata is set by + * powerpc's linker script and includes tables and data + * requiring relocation which are not put in RO_DATA. + */ + numpages = PFN_UP((unsigned long)__end_rodata) - + PFN_DOWN((unsigned long)_stext); + + return set_memory_ro((unsigned long)_stext, numpages); } -void __this_fixmap_does_not_exist(void) +void mark_rodata_ro(void) { - WARN_ON(1); + int err = __mark_rodata_ro(); + + if (err) + panic("%s() failed, err = %d\n", __func__, err); } +#endif diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 536eec72c0f7..6621cfc3baf8 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -1,5 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * This file contains ioremap and related functions for 64-bit machines. + * This file contains pgtable related functions for 64-bit machines. * * Derived from arch/ppc64/mm/init.c * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) @@ -13,12 +14,6 @@ * * Dave Engebretsen <engebret@us.ibm.com> * Rework for PPC64 port. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- * */ #include <linux/signal.h> @@ -33,17 +28,11 @@ #include <linux/swap.h> #include <linux/stddef.h> #include <linux/vmalloc.h> -#include <linux/init.h> -#include <linux/bootmem.h> -#include <linux/memblock.h> #include <linux/slab.h> +#include <linux/hugetlb.h> -#include <asm/pgalloc.h> #include <asm/page.h> -#include <asm/prom.h> -#include <asm/io.h> #include <asm/mmu_context.h> -#include <asm/pgtable.h> #include <asm/mmu.h> #include <asm/smp.h> #include <asm/machdep.h> @@ -52,820 +41,122 @@ #include <asm/cputable.h> #include <asm/sections.h> #include <asm/firmware.h> +#include <asm/dma.h> -#include "mmu_decl.h" - -/* Some sanity checking */ -#if TASK_SIZE_USER64 > PGTABLE_RANGE -#error TASK_SIZE_USER64 exceeds pagetable range -#endif - -#ifdef CONFIG_PPC_STD_MMU_64 -#if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT)) -#error TASK_SIZE_USER64 exceeds user VSID range -#endif -#endif - -unsigned long ioremap_bot = IOREMAP_BASE; - -#ifdef CONFIG_PPC_MMU_NOHASH -static void *early_alloc_pgtable(unsigned long size) -{ - void *pt; +#include <mm/mmu_decl.h> - if (init_bootmem_done) - pt = __alloc_bootmem(size, size, __pa(MAX_DMA_ADDRESS)); - else - pt = __va(memblock_alloc_base(size, size, - __pa(MAX_DMA_ADDRESS))); - memset(pt, 0, size); - - return pt; -} -#endif /* CONFIG_PPC_MMU_NOHASH */ +#ifdef CONFIG_PPC_BOOK3S_64 /* - * map_kernel_page currently only called by __ioremap - * map_kernel_page adds an entry to the ioremap page table - * and adds an entry to the HPT, possibly bolting it + * partition table and process table for ISA 3.0 */ -int map_kernel_page(unsigned long ea, unsigned long pa, int flags) -{ - pgd_t *pgdp; - pud_t *pudp; - pmd_t *pmdp; - pte_t *ptep; - - if (slab_is_available()) { - pgdp = pgd_offset_k(ea); - pudp = pud_alloc(&init_mm, pgdp, ea); - if (!pudp) - return -ENOMEM; - pmdp = pmd_alloc(&init_mm, pudp, ea); - if (!pmdp) - return -ENOMEM; - ptep = pte_alloc_kernel(pmdp, ea); - if (!ptep) - return -ENOMEM; - set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, - __pgprot(flags))); - } else { -#ifdef CONFIG_PPC_MMU_NOHASH - /* Warning ! This will blow up if bootmem is not initialized - * which our ppc64 code is keen to do that, we'll need to - * fix it and/or be more careful - */ - pgdp = pgd_offset_k(ea); -#ifdef PUD_TABLE_SIZE - if (pgd_none(*pgdp)) { - pudp = early_alloc_pgtable(PUD_TABLE_SIZE); - BUG_ON(pudp == NULL); - pgd_populate(&init_mm, pgdp, pudp); - } -#endif /* PUD_TABLE_SIZE */ - pudp = pud_offset(pgdp, ea); - if (pud_none(*pudp)) { - pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); - BUG_ON(pmdp == NULL); - pud_populate(&init_mm, pudp, pmdp); - } - pmdp = pmd_offset(pudp, ea); - if (!pmd_present(*pmdp)) { - ptep = early_alloc_pgtable(PAGE_SIZE); - BUG_ON(ptep == NULL); - pmd_populate_kernel(&init_mm, pmdp, ptep); - } - ptep = pte_offset_kernel(pmdp, ea); - set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, - __pgprot(flags))); -#else /* CONFIG_PPC_MMU_NOHASH */ - /* - * If the mm subsystem is not fully up, we cannot create a - * linux page table entry for this mapping. Simply bolt an - * entry in the hardware page table. 
- * - */ - if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags, - mmu_io_psize, mmu_kernel_ssize)) { - printk(KERN_ERR "Failed to do bolted mapping IO " - "memory at %016lx !\n", pa); - return -ENOMEM; - } -#endif /* !CONFIG_PPC_MMU_NOHASH */ - } - return 0; -} - - -/** - * __ioremap_at - Low level function to establish the page tables - * for an IO mapping - */ -void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size, - unsigned long flags) -{ - unsigned long i; - - /* Make sure we have the base flags */ - if ((flags & _PAGE_PRESENT) == 0) - flags |= pgprot_val(PAGE_KERNEL); - - /* Non-cacheable page cannot be coherent */ - if (flags & _PAGE_NO_CACHE) - flags &= ~_PAGE_COHERENT; - - /* We don't support the 4K PFN hack with ioremap */ - if (flags & _PAGE_4K_PFN) - return NULL; - - WARN_ON(pa & ~PAGE_MASK); - WARN_ON(((unsigned long)ea) & ~PAGE_MASK); - WARN_ON(size & ~PAGE_MASK); - - for (i = 0; i < size; i += PAGE_SIZE) - if (map_kernel_page((unsigned long)ea+i, pa+i, flags)) - return NULL; - - return (void __iomem *)ea; -} - -/** - * __iounmap_from - Low level function to tear down the page tables - * for an IO mapping. This is used for mappings that - * are manipulated manually, like partial unmapping of - * PCI IOs or ISA space. +struct prtb_entry *process_tb; +struct patb_entry *partition_tb; +/* + * page table size */ -void __iounmap_at(void *ea, unsigned long size) -{ - WARN_ON(((unsigned long)ea) & ~PAGE_MASK); - WARN_ON(size & ~PAGE_MASK); - - unmap_kernel_range((unsigned long)ea, size); -} +unsigned long __pte_index_size; +EXPORT_SYMBOL(__pte_index_size); +unsigned long __pmd_index_size; +EXPORT_SYMBOL(__pmd_index_size); +unsigned long __pud_index_size; +EXPORT_SYMBOL(__pud_index_size); +unsigned long __pgd_index_size; +EXPORT_SYMBOL(__pgd_index_size); +unsigned long __pud_cache_index; +EXPORT_SYMBOL(__pud_cache_index); +unsigned long __pte_table_size; +EXPORT_SYMBOL(__pte_table_size); +unsigned long __pmd_table_size; +EXPORT_SYMBOL(__pmd_table_size); +unsigned long __pud_table_size; +EXPORT_SYMBOL(__pud_table_size); +unsigned long __pgd_table_size; +EXPORT_SYMBOL(__pgd_table_size); +unsigned long __pmd_val_bits; +EXPORT_SYMBOL(__pmd_val_bits); +unsigned long __pud_val_bits; +EXPORT_SYMBOL(__pud_val_bits); +unsigned long __pgd_val_bits; +EXPORT_SYMBOL(__pgd_val_bits); +unsigned long __kernel_virt_start; +EXPORT_SYMBOL(__kernel_virt_start); +unsigned long __vmalloc_start; +EXPORT_SYMBOL(__vmalloc_start); +unsigned long __vmalloc_end; +EXPORT_SYMBOL(__vmalloc_end); +unsigned long __kernel_io_start; +EXPORT_SYMBOL(__kernel_io_start); +unsigned long __kernel_io_end; +struct page *vmemmap; +EXPORT_SYMBOL(vmemmap); +unsigned long __pte_frag_nr; +EXPORT_SYMBOL(__pte_frag_nr); +unsigned long __pte_frag_size_shift; +EXPORT_SYMBOL(__pte_frag_size_shift); +#endif -void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size, - unsigned long flags, void *caller) +#ifndef __PAGETABLE_PUD_FOLDED +/* 4 level page table */ +struct page *p4d_page(p4d_t p4d) { - phys_addr_t paligned; - void __iomem *ret; - - /* - * Choose an address to map it to. - * Once the imalloc system is running, we use it. - * Before that, we map using addresses going - * up from ioremap_bot. 
imalloc will use - * the addresses from ioremap_bot through - * IMALLOC_END - * - */ - paligned = addr & PAGE_MASK; - size = PAGE_ALIGN(addr + size) - paligned; - - if ((size == 0) || (paligned == 0)) - return NULL; - - if (mem_init_done) { - struct vm_struct *area; - - area = __get_vm_area_caller(size, VM_IOREMAP, - ioremap_bot, IOREMAP_END, - caller); - if (area == NULL) - return NULL; - - area->phys_addr = paligned; - ret = __ioremap_at(paligned, area->addr, size, flags); - if (!ret) - vunmap(area->addr); - } else { - ret = __ioremap_at(paligned, (void *)ioremap_bot, size, flags); - if (ret) - ioremap_bot += size; + if (p4d_leaf(p4d)) { + if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP)) + VM_WARN_ON(!p4d_leaf(p4d)); + return pte_page(p4d_pte(p4d)); } - - if (ret) - ret += addr & ~PAGE_MASK; - return ret; -} - -void __iomem * __ioremap(phys_addr_t addr, unsigned long size, - unsigned long flags) -{ - return __ioremap_caller(addr, size, flags, __builtin_return_address(0)); -} - -void __iomem * ioremap(phys_addr_t addr, unsigned long size) -{ - unsigned long flags = _PAGE_NO_CACHE | _PAGE_GUARDED; - void *caller = __builtin_return_address(0); - - if (ppc_md.ioremap) - return ppc_md.ioremap(addr, size, flags, caller); - return __ioremap_caller(addr, size, flags, caller); -} - -void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size) -{ - unsigned long flags = _PAGE_NO_CACHE; - void *caller = __builtin_return_address(0); - - if (ppc_md.ioremap) - return ppc_md.ioremap(addr, size, flags, caller); - return __ioremap_caller(addr, size, flags, caller); + return virt_to_page(p4d_pgtable(p4d)); } - -void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size, - unsigned long flags) -{ - void *caller = __builtin_return_address(0); - - /* writeable implies dirty for kernel addresses */ - if (flags & _PAGE_RW) - flags |= _PAGE_DIRTY; - - /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */ - flags &= ~(_PAGE_USER | _PAGE_EXEC); - -#ifdef _PAGE_BAP_SR - /* _PAGE_USER contains _PAGE_BAP_SR on BookE using the new PTE format - * which means that we just cleared supervisor access... oops ;-) This - * restores it - */ - flags |= _PAGE_BAP_SR; #endif - if (ppc_md.ioremap) - return ppc_md.ioremap(addr, size, flags, caller); - return __ioremap_caller(addr, size, flags, caller); -} - - -/* - * Unmap an IO region and remove it from imalloc'd list. - * Access to IO memory should be serialized by driver. - */ -void __iounmap(volatile void __iomem *token) +struct page *pud_page(pud_t pud) { - void *addr; - - if (!mem_init_done) - return; - - addr = (void *) ((unsigned long __force) - PCI_FIX_ADDR(token) & PAGE_MASK); - if ((unsigned long)addr < ioremap_bot) { - printk(KERN_WARNING "Attempt to iounmap early bolted mapping" - " at 0x%p\n", addr); - return; + if (pud_leaf(pud)) { + if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP)) + VM_WARN_ON(!pud_leaf(pud)); + return pte_page(pud_pte(pud)); } - vunmap(addr); + return virt_to_page(pud_pgtable(pud)); } -void iounmap(volatile void __iomem *token) -{ - if (ppc_md.iounmap) - ppc_md.iounmap(token); - else - __iounmap(token); -} - -EXPORT_SYMBOL(ioremap); -EXPORT_SYMBOL(ioremap_wc); -EXPORT_SYMBOL(ioremap_prot); -EXPORT_SYMBOL(__ioremap); -EXPORT_SYMBOL(__ioremap_at); -EXPORT_SYMBOL(iounmap); -EXPORT_SYMBOL(__iounmap); -EXPORT_SYMBOL(__iounmap_at); - /* * For hugepage we have pfn in the pmd, we use PTE_RPN_SHIFT bits for flags * For PTE page, we have a PTE_FRAG_SIZE (4K) aligned virtual address. 
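[Editorial example] __ioremap_caller() in the removed code maps whole pages even for unaligned requests: it masks the physical address down to a page boundary, rounds the end up, and adds the sub-page offset back into the returned cookie. A standalone sketch of that alignment arithmetic, assuming 4 KiB pages and plain integers rather than __iomem pointers:

	#include <stdio.h>

	#define PAGE_SHIFT 12
	#define PAGE_SIZE  (1UL << PAGE_SHIFT)
	#define PAGE_MASK  (~(PAGE_SIZE - 1))
	#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & PAGE_MASK)

	int main(void)
	{
		unsigned long addr = 0xd0001234UL;	/* unaligned MMIO address */
		unsigned long size = 0x300UL;		/* unaligned length */

		unsigned long paligned = addr & PAGE_MASK;
		unsigned long msize    = PAGE_ALIGN(addr + size) - paligned;

		/* One whole page gets mapped; the caller sees the offset inside it. */
		printf("map %#lx + %#lx, returned cookie offset %#lx\n",
		       paligned, msize, addr & ~PAGE_MASK);
		return 0;
	}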
*/ struct page *pmd_page(pmd_t pmd) { -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (pmd_trans_huge(pmd)) - return pfn_to_page(pmd_pfn(pmd)); -#endif - return virt_to_page(pmd_page_vaddr(pmd)); -} - -#ifdef CONFIG_PPC_64K_PAGES -static pte_t *get_from_cache(struct mm_struct *mm) -{ - void *pte_frag, *ret; - - spin_lock(&mm->page_table_lock); - ret = mm->context.pte_frag; - if (ret) { - pte_frag = ret + PTE_FRAG_SIZE; - /* - * If we have taken up all the fragments mark PTE page NULL - */ - if (((unsigned long)pte_frag & ~PAGE_MASK) == 0) - pte_frag = NULL; - mm->context.pte_frag = pte_frag; - } - spin_unlock(&mm->page_table_lock); - return (pte_t *)ret; -} - -static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel) -{ - void *ret = NULL; - struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | - __GFP_REPEAT | __GFP_ZERO); - if (!page) - return NULL; - - ret = page_address(page); - spin_lock(&mm->page_table_lock); - /* - * If we find pgtable_page set, we return - * the allocated page with single fragement - * count. - */ - if (likely(!mm->context.pte_frag)) { - atomic_set(&page->_count, PTE_FRAG_NR); - mm->context.pte_frag = ret + PTE_FRAG_SIZE; - } - spin_unlock(&mm->page_table_lock); - - if (!kernel) - pgtable_page_ctor(page); - - return (pte_t *)ret; -} - -pte_t *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel) -{ - pte_t *pte; - - pte = get_from_cache(mm); - if (pte) - return pte; - - return __alloc_for_cache(mm, kernel); -} - -void page_table_free(struct mm_struct *mm, unsigned long *table, int kernel) -{ - struct page *page = virt_to_page(table); - if (put_page_testzero(page)) { - if (!kernel) - pgtable_page_dtor(page); - free_hot_cold_page(page, 0); - } -} - -#ifdef CONFIG_SMP -static void page_table_free_rcu(void *table) -{ - struct page *page = virt_to_page(table); - if (put_page_testzero(page)) { - pgtable_page_dtor(page); - free_hot_cold_page(page, 0); - } -} - -void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift) -{ - unsigned long pgf = (unsigned long)table; - - BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); - pgf |= shift; - tlb_remove_table(tlb, (void *)pgf); -} - -void __tlb_remove_table(void *_table) -{ - void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE); - unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE; - - if (!shift) - /* PTE page needs special handling */ - page_table_free_rcu(table); - else { - BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); - kmem_cache_free(PGT_CACHE(shift), table); - } -} -#else -void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift) -{ - if (!shift) { - /* PTE page needs special handling */ - struct page *page = virt_to_page(table); - if (put_page_testzero(page)) { - pgtable_page_dtor(page); - free_hot_cold_page(page, 0); - } - } else { - BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); - kmem_cache_free(PGT_CACHE(shift), table); - } -} -#endif -#endif /* CONFIG_PPC_64K_PAGES */ - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - -/* - * This is called when relaxing access to a hugepage. It's also called in the page - * fault path when we don't hit any of the major fault cases, ie, a minor - * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... 
The generic code will have - * handled those two for us, we additionally deal with missing execute - * permission here on some processors - */ -int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp, pmd_t entry, int dirty) -{ - int changed; -#ifdef CONFIG_DEBUG_VM - WARN_ON(!pmd_trans_huge(*pmdp)); - assert_spin_locked(&vma->vm_mm->page_table_lock); -#endif - changed = !pmd_same(*(pmdp), entry); - if (changed) { - __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry)); - /* - * Since we are not supporting SW TLB systems, we don't - * have any thing similar to flush_tlb_page_nohash() - */ - } - return changed; -} - -unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, unsigned long clr) -{ - - unsigned long old, tmp; - -#ifdef CONFIG_DEBUG_VM - WARN_ON(!pmd_trans_huge(*pmdp)); - assert_spin_locked(&mm->page_table_lock); -#endif - -#ifdef PTE_ATOMIC_UPDATES - __asm__ __volatile__( - "1: ldarx %0,0,%3\n\ - andi. %1,%0,%6\n\ - bne- 1b \n\ - andc %1,%0,%4 \n\ - stdcx. %1,0,%3 \n\ - bne- 1b" - : "=&r" (old), "=&r" (tmp), "=m" (*pmdp) - : "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY) - : "cc" ); -#else - old = pmd_val(*pmdp); - *pmdp = __pmd(old & ~clr); -#endif - if (old & _PAGE_HASHPTE) - hpte_do_hugepage_flush(mm, addr, pmdp); - return old; -} - -pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp) -{ - pmd_t pmd; - - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - if (pmd_trans_huge(*pmdp)) { - pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp); - } else { - /* - * khugepaged calls this for normal pmd - */ - pmd = *pmdp; - pmd_clear(pmdp); + if (pmd_leaf(pmd)) { /* - * Wait for all pending hash_page to finish. This is needed - * in case of subpage collapse. When we collapse normal pages - * to hugepage, we first clear the pmd, then invalidate all - * the PTE entries. The assumption here is that any low level - * page fault will see a none pmd and take the slow path that - * will wait on mmap_sem. But we could very well be in a - * hash_page with local ptep pointer value. Such a hash page - * can result in adding new HPTE entries for normal subpages. - * That means we could be modifying the page content as we - * copy them to a huge page. So wait for parallel hash_page - * to finish before invalidating HPTE entries. We can do this - * by sending an IPI to all the cpus and executing a dummy - * function there. + * vmalloc_to_page may be called on any vmap address (not only + * vmalloc), and it uses pmd_page() etc., when huge vmap is + * enabled so these checks can't be used. */ - kick_all_cpus_sync(); - /* - * Now invalidate the hpte entries in the range - * covered by pmd. This make sure we take a - * fault and will find the pmd as none, which will - * result in a major fault which takes mmap_sem and - * hence wait for collapse to complete. Without this - * the __collapse_huge_page_copy can result in copying - * the old content. - */ - flush_tlb_pmd_range(vma->vm_mm, &pmd, address); + if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP)) + VM_WARN_ON(!pmd_leaf(pmd)); + return pte_page(pmd_pte(pmd)); } - return pmd; -} - -int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) -{ - return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); -} - -/* - * We currently remove entries from the hashtable regardless of whether - * the entry was young or dirty. The generic routines only flush if the - * entry was young or dirty which is not good enough. 
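[Editorial example] The ldarx/stdcx. loop in pmd_hugepage_update() atomically clears bits in the pmd while respecting the _PAGE_BUSY protocol. A portable sketch of the same clear-bits-atomically idea using C11 atomics; this is illustrative only and does not reproduce the _PAGE_BUSY wait or the PowerPC reservation semantics:

	#include <stdatomic.h>
	#include <stdio.h>

	/* Atomically clear 'clr' bits in *pmd and return the old value. */
	static unsigned long pmd_clear_bits(_Atomic unsigned long *pmd,
					    unsigned long clr)
	{
		unsigned long old = atomic_load(pmd);

		/* Retry until no other CPU raced with us, like the stdcx. loop. */
		while (!atomic_compare_exchange_weak(pmd, &old, old & ~clr))
			;
		return old;
	}

	int main(void)
	{
		_Atomic unsigned long pmd = 0x385UL;	   /* made-up PTE bits */
		unsigned long old = pmd_clear_bits(&pmd, 0x1UL);

		printf("old %#lx new %#lx\n", old, atomic_load(&pmd));
		return 0;
	}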
- * - * We should be more intelligent about this but for the moment we override - * these functions and force a tlb flush unconditionally - */ -int pmdp_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) -{ - return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); -} - -/* - * We mark the pmd splitting and invalidate all the hpte - * entries for this hugepage. - */ -void pmdp_splitting_flush(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) -{ - unsigned long old, tmp; - - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - -#ifdef CONFIG_DEBUG_VM - WARN_ON(!pmd_trans_huge(*pmdp)); - assert_spin_locked(&vma->vm_mm->page_table_lock); -#endif - -#ifdef PTE_ATOMIC_UPDATES - - __asm__ __volatile__( - "1: ldarx %0,0,%3\n\ - andi. %1,%0,%6\n\ - bne- 1b \n\ - ori %1,%0,%4 \n\ - stdcx. %1,0,%3 \n\ - bne- 1b" - : "=&r" (old), "=&r" (tmp), "=m" (*pmdp) - : "r" (pmdp), "i" (_PAGE_SPLITTING), "m" (*pmdp), "i" (_PAGE_BUSY) - : "cc" ); -#else - old = pmd_val(*pmdp); - *pmdp = __pmd(old | _PAGE_SPLITTING); -#endif - /* - * If we didn't had the splitting flag set, go and flush the - * HPTE entries. - */ - if (!(old & _PAGE_SPLITTING)) { - /* We need to flush the hpte */ - if (old & _PAGE_HASHPTE) - hpte_do_hugepage_flush(vma->vm_mm, address, pmdp); - } -} - -/* - * We want to put the pgtable in pmd and use pgtable for tracking - * the base page size hptes - */ -void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, - pgtable_t pgtable) -{ - pgtable_t *pgtable_slot; - assert_spin_locked(&mm->page_table_lock); - /* - * we store the pgtable in the second half of PMD - */ - pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; - *pgtable_slot = pgtable; - /* - * expose the deposited pgtable to other cpus. - * before we set the hugepage PTE at pmd level - * hash fault code looks at the deposted pgtable - * to store hash index values. - */ - smp_wmb(); -} - -pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) -{ - pgtable_t pgtable; - pgtable_t *pgtable_slot; - - assert_spin_locked(&mm->page_table_lock); - pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; - pgtable = *pgtable_slot; - /* - * Once we withdraw, mark the entry NULL. - */ - *pgtable_slot = NULL; - /* - * We store HPTE information in the deposited PTE fragment. - * zero out the content on withdraw. - */ - memset(pgtable, 0, PTE_FRAG_SIZE); - return pgtable; -} - -/* - * set a new huge pmd. We should not be called for updating - * an existing pmd entry. That should go via pmd_hugepage_update. - */ -void set_pmd_at(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, pmd_t pmd) -{ -#ifdef CONFIG_DEBUG_VM - WARN_ON(!pmd_none(*pmdp)); - assert_spin_locked(&mm->page_table_lock); - WARN_ON(!pmd_trans_huge(pmd)); -#endif - return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); + return virt_to_page(pmd_page_vaddr(pmd)); } -void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp) +#ifdef CONFIG_STRICT_KERNEL_RWX +void mark_rodata_ro(void) { - pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT); -} - -/* - * A linux hugepage PMD was changed and the corresponding hash table entries - * neesd to be flushed. 
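[Editorial example] pgtable_trans_huge_deposit() keeps the deposited page table in the second half of the PMD page: the slot sits PTRS_PER_PMD entries past the pmd entry itself. A small sketch of that mirrored-slot layout, with an assumed entry count of 256 (not the real PTRS_PER_PMD) and a local variable standing in for the deposited pte page:

	#include <stdio.h>

	#define PTRS_PER_PMD 256	/* assumed entry count, illustration only */

	int main(void)
	{
		/* One page: PTRS_PER_PMD pmd entries followed by PTRS_PER_PMD slots. */
		void *pmd_page[2 * PTRS_PER_PMD] = { 0 };
		int idx = 42;			/* some pmd entry in that page */
		int deposited;			/* stands in for the pte page */

		void **pmdp = &pmd_page[idx];
		void **pgtable_slot = pmdp + PTRS_PER_PMD;	/* mirrored slot */

		*pgtable_slot = &deposited;			/* deposit */
		printf("pmd entry %d uses shadow slot %td\n",
		       idx, pgtable_slot - pmd_page);
		return 0;
	}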
- */ -void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp) -{ - int ssize, i; - unsigned long s_addr; - int max_hpte_count; - unsigned int psize, valid; - unsigned char *hpte_slot_array; - unsigned long hidx, vpn, vsid, hash, shift, slot; - - /* - * Flush all the hptes mapping this hugepage - */ - s_addr = addr & HPAGE_PMD_MASK; - hpte_slot_array = get_hpte_slot_array(pmdp); - /* - * IF we try to do a HUGE PTE update after a withdraw is done. - * we will find the below NULL. This happens when we do - * split_huge_page_pmd - */ - if (!hpte_slot_array) + if (!mmu_has_feature(MMU_FTR_KERNEL_RO)) { + pr_warn("Warning: Unable to mark rodata read only on this CPU.\n"); return; - - /* get the base page size */ - psize = get_slice_psize(mm, s_addr); - - if (ppc_md.hugepage_invalidate) - return ppc_md.hugepage_invalidate(mm, hpte_slot_array, - s_addr, psize); - /* - * No bluk hpte removal support, invalidate each entry - */ - shift = mmu_psize_defs[psize].shift; - max_hpte_count = HPAGE_PMD_SIZE >> shift; - for (i = 0; i < max_hpte_count; i++) { - /* - * 8 bits per each hpte entries - * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit] - */ - valid = hpte_valid(hpte_slot_array, i); - if (!valid) - continue; - hidx = hpte_hash_index(hpte_slot_array, i); - - /* get the vpn */ - addr = s_addr + (i * (1ul << shift)); - if (!is_kernel_addr(addr)) { - ssize = user_segment_size(addr); - vsid = get_vsid(mm->context.id, addr, ssize); - WARN_ON(vsid == 0); - } else { - vsid = get_kernel_vsid(addr, mmu_kernel_ssize); - ssize = mmu_kernel_ssize; - } - - vpn = hpt_vpn(addr, vsid, ssize); - hash = hpt_hash(vpn, shift, ssize); - if (hidx & _PTEIDX_SECONDARY) - hash = ~hash; - - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += hidx & _PTEIDX_GROUP_IX; - ppc_md.hpte_invalidate(slot, vpn, psize, - MMU_PAGE_16M, ssize, 0); } -} - -static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) -{ - pmd_val(pmd) |= pgprot_val(pgprot); - return pmd; -} - -pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot) -{ - pmd_t pmd; - /* - * For a valid pte, we would have _PAGE_PRESENT or _PAGE_FILE always - * set. We use this to check THP page at pmd level. - * leaf pte for huge page, bottom two bits != 00 - */ - pmd_val(pmd) = pfn << PTE_RPN_SHIFT; - pmd_val(pmd) |= _PAGE_THP_HUGE; - pmd = pmd_set_protbits(pmd, pgprot); - return pmd; -} - -pmd_t mk_pmd(struct page *page, pgprot_t pgprot) -{ - return pfn_pmd(page_to_pfn(page), pgprot); -} - -pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) -{ - pmd_val(pmd) &= _HPAGE_CHG_MASK; - pmd = pmd_set_protbits(pmd, newprot); - return pmd; -} - -/* - * This is called at the end of handling a user page fault, when the - * fault has been handled by updating a HUGE PMD entry in the linux page tables. - * We use it to preload an HPTE into the hash table corresponding to - * the updated linux HUGE PMD entry. - */ -void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd) -{ - return; -} - -pmd_t pmdp_get_and_clear(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp) -{ - pmd_t old_pmd; - pgtable_t pgtable; - unsigned long old; - pgtable_t *pgtable_slot; - - old = pmd_hugepage_update(mm, addr, pmdp, ~0UL); - old_pmd = __pmd(old); - /* - * We have pmd == none and we are holding page_table_lock. - * So we can safely go and clear the pgtable hash - * index info. 
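[Editorial example] hpte_do_hugepage_flush() walks the per-subpage slot array whose byte layout the comment describes: bit 0 is the valid bit, bits 1-3 the hash index, bit 4 the secondary-group flag. A sketch of decoding one such byte under that layout; the helper names here are made up, the kernel uses hpte_valid()/hpte_hash_index():

	#include <stdio.h>

	/* Layout per the comment: 000 | secondary (1) | hidx (3) | valid (1). */
	static int slot_valid(unsigned char b)     { return b & 0x1; }
	static int slot_hidx(unsigned char b)      { return (b >> 1) & 0x7; }
	static int slot_secondary(unsigned char b) { return (b >> 4) & 0x1; }

	int main(void)
	{
		unsigned char hpte_slot_array[4] = { 0x00, 0x0b, 0x1f, 0x01 };

		for (int i = 0; i < 4; i++) {
			unsigned char b = hpte_slot_array[i];

			if (!slot_valid(b))
				continue;
			printf("subpage %d: hidx=%d %s\n", i, slot_hidx(b),
			       slot_secondary(b) ? "secondary" : "primary");
		}
		return 0;
	}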
- */ - pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; - pgtable = *pgtable_slot; - /* - * Let's zero out old valid and hash index details - * hash fault look at them. - */ - memset(pgtable, 0, PTE_FRAG_SIZE); - return old_pmd; + if (radix_enabled()) + radix__mark_rodata_ro(); + else + hash__mark_rodata_ro(); } -int has_transparent_hugepage(void) +void mark_initmem_nx(void) { - if (!mmu_has_feature(MMU_FTR_16M_PAGE)) - return 0; - /* - * We support THP only if PMD_SIZE is 16MB. - */ - if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT) - return 0; - /* - * We need to make sure that we support 16MB hugepage in a segement - * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE - * of 64K. - */ - /* - * If we have 64K HPTE, we will be using that by default - */ - if (mmu_psize_defs[MMU_PAGE_64K].shift && - (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1)) - return 0; - /* - * Ok we only have 4K HPTE - */ - if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1) - return 0; - - return 1; + if (radix_enabled()) + radix__mark_initmem_nx(); + else + hash__mark_initmem_nx(); } -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c deleted file mode 100644 index 11571e118831..000000000000 --- a/arch/powerpc/mm/ppc_mmu_32.c +++ /dev/null @@ -1,288 +0,0 @@ -/* - * This file contains the routines for handling the MMU on those - * PowerPC implementations where the MMU substantially follows the - * architecture specification. This includes the 6xx, 7xx, 7xxx, - * 8260, and POWER3 implementations but excludes the 8xx and 4xx. - * -- paulus - * - * Derived from arch/ppc/mm/init.c: - * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) - * - * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) - * and Cort Dougan (PReP) (cort@cs.nmt.edu) - * Copyright (C) 1996 Paul Mackerras - * - * Derived from "arch/i386/mm/init.c" - * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- * - */ - -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/init.h> -#include <linux/highmem.h> -#include <linux/memblock.h> - -#include <asm/prom.h> -#include <asm/mmu.h> -#include <asm/machdep.h> - -#include "mmu_decl.h" - -struct hash_pte *Hash, *Hash_end; -unsigned long Hash_size, Hash_mask; -unsigned long _SDR1; - -struct ppc_bat BATS[8][2]; /* 8 pairs of IBAT, DBAT */ - -struct batrange { /* stores address ranges mapped by BATs */ - unsigned long start; - unsigned long limit; - phys_addr_t phys; -} bat_addrs[8]; - -/* - * Return PA for this VA if it is mapped by a BAT, or 0 - */ -phys_addr_t v_mapped_by_bats(unsigned long va) -{ - int b; - for (b = 0; b < 4; ++b) - if (va >= bat_addrs[b].start && va < bat_addrs[b].limit) - return bat_addrs[b].phys + (va - bat_addrs[b].start); - return 0; -} - -/* - * Return VA for a given PA or 0 if not mapped - */ -unsigned long p_mapped_by_bats(phys_addr_t pa) -{ - int b; - for (b = 0; b < 4; ++b) - if (pa >= bat_addrs[b].phys - && pa < (bat_addrs[b].limit-bat_addrs[b].start) - +bat_addrs[b].phys) - return bat_addrs[b].start+(pa-bat_addrs[b].phys); - return 0; -} - -unsigned long __init mmu_mapin_ram(unsigned long top) -{ - unsigned long tot, bl, done; - unsigned long max_size = (256<<20); - - if (__map_without_bats) { - printk(KERN_DEBUG "RAM mapped without BATs\n"); - return 0; - } - - /* Set up BAT2 and if necessary BAT3 to cover RAM. */ - - /* Make sure we don't map a block larger than the - smallest alignment of the physical address. */ - tot = top; - for (bl = 128<<10; bl < max_size; bl <<= 1) { - if (bl * 2 > tot) - break; - } - - setbat(2, PAGE_OFFSET, 0, bl, PAGE_KERNEL_X); - done = (unsigned long)bat_addrs[2].limit - PAGE_OFFSET + 1; - if ((done < tot) && !bat_addrs[3].limit) { - /* use BAT3 to cover a bit more */ - tot -= done; - for (bl = 128<<10; bl < max_size; bl <<= 1) - if (bl * 2 > tot) - break; - setbat(3, PAGE_OFFSET+done, done, bl, PAGE_KERNEL_X); - done = (unsigned long)bat_addrs[3].limit - PAGE_OFFSET + 1; - } - - return done; -} - -/* - * Set up one of the I/D BAT (block address translation) register pairs. - * The parameters are not checked; in particular size must be a power - * of 2 between 128k and 256M. - */ -void __init setbat(int index, unsigned long virt, phys_addr_t phys, - unsigned int size, int flags) -{ - unsigned int bl; - int wimgxpp; - struct ppc_bat *bat = BATS[index]; - - if ((flags & _PAGE_NO_CACHE) || - (cpu_has_feature(CPU_FTR_NEED_COHERENT) == 0)) - flags &= ~_PAGE_COHERENT; - - bl = (size >> 17) - 1; - if (PVR_VER(mfspr(SPRN_PVR)) != 1) { - /* 603, 604, etc. */ - /* Do DBAT first */ - wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE - | _PAGE_COHERENT | _PAGE_GUARDED); - wimgxpp |= (flags & _PAGE_RW)? BPP_RW: BPP_RX; - bat[1].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */ - bat[1].batl = BAT_PHYS_ADDR(phys) | wimgxpp; - if (flags & _PAGE_USER) - bat[1].batu |= 1; /* Vp = 1 */ - if (flags & _PAGE_GUARDED) { - /* G bit must be zero in IBATs */ - bat[0].batu = bat[0].batl = 0; - } else { - /* make IBAT same as DBAT */ - bat[0] = bat[1]; - } - } else { - /* 601 cpu */ - if (bl > BL_8M) - bl = BL_8M; - wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE - | _PAGE_COHERENT); - wimgxpp |= (flags & _PAGE_RW)? - ((flags & _PAGE_USER)? 
PP_RWRW: PP_RWXX): PP_RXRX; - bat->batu = virt | wimgxpp | 4; /* Ks=0, Ku=1 */ - bat->batl = phys | bl | 0x40; /* V=1 */ - } - - bat_addrs[index].start = virt; - bat_addrs[index].limit = virt + ((bl + 1) << 17) - 1; - bat_addrs[index].phys = phys; -} - -/* - * Preload a translation in the hash table - */ -void hash_preload(struct mm_struct *mm, unsigned long ea, - unsigned long access, unsigned long trap) -{ - pmd_t *pmd; - - if (Hash == 0) - return; - pmd = pmd_offset(pud_offset(pgd_offset(mm, ea), ea), ea); - if (!pmd_none(*pmd)) - add_hash_page(mm->context.id, ea, pmd_val(*pmd)); -} - -/* - * Initialize the hash table and patch the instructions in hashtable.S. - */ -void __init MMU_init_hw(void) -{ - unsigned int hmask, mb, mb2; - unsigned int n_hpteg, lg_n_hpteg; - - extern unsigned int hash_page_patch_A[]; - extern unsigned int hash_page_patch_B[], hash_page_patch_C[]; - extern unsigned int hash_page[]; - extern unsigned int flush_hash_patch_A[], flush_hash_patch_B[]; - - if (!mmu_has_feature(MMU_FTR_HPTE_TABLE)) { - /* - * Put a blr (procedure return) instruction at the - * start of hash_page, since we can still get DSI - * exceptions on a 603. - */ - hash_page[0] = 0x4e800020; - flush_icache_range((unsigned long) &hash_page[0], - (unsigned long) &hash_page[1]); - return; - } - - if ( ppc_md.progress ) ppc_md.progress("hash:enter", 0x105); - -#define LG_HPTEG_SIZE 6 /* 64 bytes per HPTEG */ -#define SDR1_LOW_BITS ((n_hpteg - 1) >> 10) -#define MIN_N_HPTEG 1024 /* min 64kB hash table */ - - /* - * Allow 1 HPTE (1/8 HPTEG) for each page of memory. - * This is less than the recommended amount, but then - * Linux ain't AIX. - */ - n_hpteg = total_memory / (PAGE_SIZE * 8); - if (n_hpteg < MIN_N_HPTEG) - n_hpteg = MIN_N_HPTEG; - lg_n_hpteg = __ilog2(n_hpteg); - if (n_hpteg & (n_hpteg - 1)) { - ++lg_n_hpteg; /* round up if not power of 2 */ - n_hpteg = 1 << lg_n_hpteg; - } - Hash_size = n_hpteg << LG_HPTEG_SIZE; - - /* - * Find some memory for the hash table. - */ - if ( ppc_md.progress ) ppc_md.progress("hash:find piece", 0x322); - Hash = __va(memblock_alloc(Hash_size, Hash_size)); - cacheable_memzero(Hash, Hash_size); - _SDR1 = __pa(Hash) | SDR1_LOW_BITS; - - Hash_end = (struct hash_pte *) ((unsigned long)Hash + Hash_size); - - printk("Total memory = %lldMB; using %ldkB for hash table (at %p)\n", - (unsigned long long)(total_memory >> 20), Hash_size >> 10, Hash); - - - /* - * Patch up the instructions in hashtable.S:create_hpte - */ - if ( ppc_md.progress ) ppc_md.progress("hash:patch", 0x345); - Hash_mask = n_hpteg - 1; - hmask = Hash_mask >> (16 - LG_HPTEG_SIZE); - mb2 = mb = 32 - LG_HPTEG_SIZE - lg_n_hpteg; - if (lg_n_hpteg > 16) - mb2 = 16 - LG_HPTEG_SIZE; - - hash_page_patch_A[0] = (hash_page_patch_A[0] & ~0xffff) - | ((unsigned int)(Hash) >> 16); - hash_page_patch_A[1] = (hash_page_patch_A[1] & ~0x7c0) | (mb << 6); - hash_page_patch_A[2] = (hash_page_patch_A[2] & ~0x7c0) | (mb2 << 6); - hash_page_patch_B[0] = (hash_page_patch_B[0] & ~0xffff) | hmask; - hash_page_patch_C[0] = (hash_page_patch_C[0] & ~0xffff) | hmask; - - /* - * Ensure that the locations we've patched have been written - * out from the data cache and invalidated in the instruction - * cache, on those machines with split caches. 
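[Editorial example] MMU_init_hw() in the removed ppc_mmu_32.c sizes the hash table as one HPTE (one eighth of an HPTEG) per page of memory, enforces a floor of 1024 groups, rounds the group count up to a power of two, and multiplies by 64 bytes per HPTEG. A worked sketch of that calculation for an assumed 256 MB of RAM (the value is illustrative):

	#include <stdio.h>

	#define LG_HPTEG_SIZE 6			/* 64 bytes per HPTEG */
	#define MIN_N_HPTEG   1024		/* min 64kB hash table */
	#define PAGE_SIZE     4096UL

	int main(void)
	{
		unsigned long long total_memory = 256ULL << 20;	/* assume 256 MB */

		/* One HPTE (1/8 HPTEG) per page of memory. */
		unsigned long n_hpteg = total_memory / (PAGE_SIZE * 8);
		if (n_hpteg < MIN_N_HPTEG)
			n_hpteg = MIN_N_HPTEG;

		/* Round up to a power of two, as the removed code does. */
		unsigned long lg = 0;
		while ((1UL << lg) < n_hpteg)
			lg++;
		n_hpteg = 1UL << lg;

		unsigned long hash_size = n_hpteg << LG_HPTEG_SIZE;
		printf("%llu MB RAM -> %lu HPTEGs, %lu kB hash table\n",
		       total_memory >> 20, n_hpteg, hash_size >> 10);
		return 0;
	}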
- */ - flush_icache_range((unsigned long) &hash_page_patch_A[0], - (unsigned long) &hash_page_patch_C[1]); - - /* - * Patch up the instructions in hashtable.S:flush_hash_page - */ - flush_hash_patch_A[0] = (flush_hash_patch_A[0] & ~0xffff) - | ((unsigned int)(Hash) >> 16); - flush_hash_patch_A[1] = (flush_hash_patch_A[1] & ~0x7c0) | (mb << 6); - flush_hash_patch_A[2] = (flush_hash_patch_A[2] & ~0x7c0) | (mb2 << 6); - flush_hash_patch_B[0] = (flush_hash_patch_B[0] & ~0xffff) | hmask; - flush_icache_range((unsigned long) &flush_hash_patch_A[0], - (unsigned long) &flush_hash_patch_B[1]); - - if ( ppc_md.progress ) ppc_md.progress("hash:done", 0x205); -} - -void setup_initial_memory_limit(phys_addr_t first_memblock_base, - phys_addr_t first_memblock_size) -{ - /* We don't currently support the first MEMBLOCK not mapping 0 - * physical on those processors - */ - BUG_ON(first_memblock_base != 0); - - /* 601 can only access 16MB at the moment */ - if (PVR_VER(mfspr(SPRN_PVR)) == 1) - memblock_set_current_limit(min_t(u64, first_memblock_size, 0x01000000)); - else /* Anything else has 256M mapped */ - memblock_set_current_limit(min_t(u64, first_memblock_size, 0x10000000)); -} diff --git a/arch/powerpc/mm/ptdump/8xx.c b/arch/powerpc/mm/ptdump/8xx.c new file mode 100644 index 000000000000..ff845f251724 --- /dev/null +++ b/arch/powerpc/mm/ptdump/8xx.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * From split of dump_linuxpagetables.c + * Copyright 2016, Rashmica Gupta, IBM Corp. + * + */ +#include <linux/kernel.h> +#include <linux/pgtable.h> + +#include "ptdump.h" + +static const struct flag_info flag_array[] = { + { +#ifdef CONFIG_PPC_16K_PAGES + .mask = _PAGE_HUGE, + .val = _PAGE_HUGE, +#else + .mask = _PAGE_SPS, + .val = _PAGE_SPS, +#endif + .set = "huge", + .clear = " ", + }, { + .mask = _PAGE_RO | _PAGE_NA, + .val = 0, + .set = "rw", + }, { + .mask = _PAGE_RO | _PAGE_NA, + .val = _PAGE_RO, + .set = "r ", + }, { + .mask = _PAGE_RO | _PAGE_NA, + .val = _PAGE_NA, + .set = " ", + }, { + .mask = _PAGE_EXEC, + .val = _PAGE_EXEC, + .set = " X ", + .clear = " ", + }, { + .mask = _PAGE_PRESENT, + .val = _PAGE_PRESENT, + .set = "present", + .clear = " ", + }, { + .mask = _PAGE_GUARDED, + .val = _PAGE_GUARDED, + .set = "guarded", + .clear = " ", + }, { + .mask = _PAGE_DIRTY, + .val = _PAGE_DIRTY, + .set = "dirty", + .clear = " ", + }, { + .mask = _PAGE_ACCESSED, + .val = _PAGE_ACCESSED, + .set = "accessed", + .clear = " ", + }, { + .mask = _PAGE_NO_CACHE, + .val = _PAGE_NO_CACHE, + .set = "no cache", + .clear = " ", + }, { + .mask = _PAGE_SPECIAL, + .val = _PAGE_SPECIAL, + .set = "special", + } +}; + +struct ptdump_pg_level pg_level[5] = { + { /* pgd */ + .name = "PGD", + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* p4d */ + .name = "P4D", + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pud */ + .name = "PUD", + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pmd */ + .name = "PMD", + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pte */ + .name = "PTE", + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, +}; diff --git a/arch/powerpc/mm/ptdump/Makefile b/arch/powerpc/mm/ptdump/Makefile new file mode 100644 index 000000000000..0f7a050f327e --- /dev/null +++ b/arch/powerpc/mm/ptdump/Makefile @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-y += ptdump.o + +obj-$(CONFIG_44x) += shared.o +obj-$(CONFIG_PPC_8xx) += 8xx.o +obj-$(CONFIG_PPC_E500) += shared.o +obj-$(CONFIG_PPC_BOOK3S_32) += shared.o 
+obj-$(CONFIG_PPC_BOOK3S_64) += book3s64.o + +ifdef CONFIG_PTDUMP_DEBUGFS +obj-$(CONFIG_PPC_BOOK3S_32) += bats.o segment_regs.o +obj-$(CONFIG_PPC_64S_HASH_MMU) += hashpagetable.o +endif diff --git a/arch/powerpc/mm/ptdump/bats.c b/arch/powerpc/mm/ptdump/bats.c new file mode 100644 index 000000000000..820c119013e4 --- /dev/null +++ b/arch/powerpc/mm/ptdump/bats.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright 2018, Christophe Leroy CS S.I. + * <christophe.leroy@c-s.fr> + * + * This dumps the content of BATS + */ + +#include <linux/pgtable.h> +#include <linux/debugfs.h> +#include <asm/cpu_has_feature.h> + +#include "ptdump.h" + +static void bat_show_603(struct seq_file *m, int idx, u32 lower, u32 upper, bool is_d) +{ + u32 bepi = upper & 0xfffe0000; + u32 bl = (upper >> 2) & 0x7ff; + u32 k = upper & 3; + phys_addr_t brpn = PHYS_BAT_ADDR(lower); + u32 size = (bl + 1) << 17; + + seq_printf(m, "%d: ", idx); + if (k == 0) { + seq_puts(m, " -\n"); + return; + } + + seq_printf(m, "0x%08x-0x%08x ", bepi, bepi + size - 1); +#ifdef CONFIG_PHYS_64BIT + seq_printf(m, "0x%016llx ", brpn); +#else + seq_printf(m, "0x%08x ", brpn); +#endif + pt_dump_size(m, size); + + if (k == 1) + seq_puts(m, "User "); + else if (k == 2) + seq_puts(m, "Kernel "); + else + seq_puts(m, "Kernel/User "); + + if (lower & BPP_RX) + seq_puts(m, is_d ? "r " : " x "); + else if (lower & BPP_RW) + seq_puts(m, is_d ? "rw " : " x "); + else + seq_puts(m, is_d ? " " : " "); + + seq_puts(m, lower & _PAGE_WRITETHRU ? "w " : " "); + seq_puts(m, lower & _PAGE_NO_CACHE ? "i " : " "); + seq_puts(m, lower & _PAGE_COHERENT ? "m " : " "); + seq_puts(m, lower & _PAGE_GUARDED ? "g " : " "); + seq_puts(m, "\n"); +} + +#define BAT_SHOW_603(_m, _n, _l, _u, _d) bat_show_603(_m, _n, mfspr(_l), mfspr(_u), _d) + +static int bats_show(struct seq_file *m, void *v) +{ + seq_puts(m, "---[ Instruction Block Address Translation ]---\n"); + + BAT_SHOW_603(m, 0, SPRN_IBAT0L, SPRN_IBAT0U, false); + BAT_SHOW_603(m, 1, SPRN_IBAT1L, SPRN_IBAT1U, false); + BAT_SHOW_603(m, 2, SPRN_IBAT2L, SPRN_IBAT2U, false); + BAT_SHOW_603(m, 3, SPRN_IBAT3L, SPRN_IBAT3U, false); + if (mmu_has_feature(MMU_FTR_USE_HIGH_BATS)) { + BAT_SHOW_603(m, 4, SPRN_IBAT4L, SPRN_IBAT4U, false); + BAT_SHOW_603(m, 5, SPRN_IBAT5L, SPRN_IBAT5U, false); + BAT_SHOW_603(m, 6, SPRN_IBAT6L, SPRN_IBAT6U, false); + BAT_SHOW_603(m, 7, SPRN_IBAT7L, SPRN_IBAT7U, false); + } + + seq_puts(m, "\n---[ Data Block Address Translation ]---\n"); + + BAT_SHOW_603(m, 0, SPRN_DBAT0L, SPRN_DBAT0U, true); + BAT_SHOW_603(m, 1, SPRN_DBAT1L, SPRN_DBAT1U, true); + BAT_SHOW_603(m, 2, SPRN_DBAT2L, SPRN_DBAT2U, true); + BAT_SHOW_603(m, 3, SPRN_DBAT3L, SPRN_DBAT3U, true); + if (mmu_has_feature(MMU_FTR_USE_HIGH_BATS)) { + BAT_SHOW_603(m, 4, SPRN_DBAT4L, SPRN_DBAT4U, true); + BAT_SHOW_603(m, 5, SPRN_DBAT5L, SPRN_DBAT5U, true); + BAT_SHOW_603(m, 6, SPRN_DBAT6L, SPRN_DBAT6U, true); + BAT_SHOW_603(m, 7, SPRN_DBAT7L, SPRN_DBAT7U, true); + } + + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(bats); + +static int __init bats_init(void) +{ + debugfs_create_file("block_address_translation", 0400, + arch_debugfs_dir, NULL, &bats_fops); + return 0; +} +device_initcall(bats_init); diff --git a/arch/powerpc/mm/ptdump/book3s64.c b/arch/powerpc/mm/ptdump/book3s64.c new file mode 100644 index 000000000000..e8a21c6dc32e --- /dev/null +++ b/arch/powerpc/mm/ptdump/book3s64.c @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * From split of dump_linuxpagetables.c + * Copyright 2016, Rashmica Gupta, IBM Corp. 
+ * + */ +#include <linux/kernel.h> +#include <linux/pgtable.h> + +#include "ptdump.h" + +static const struct flag_info flag_array[] = { + { + .mask = _PAGE_PRIVILEGED, + .val = 0, + .set = "user", + .clear = " ", + }, { + .mask = _PAGE_READ, + .val = _PAGE_READ, + .set = "r", + .clear = " ", + }, { + .mask = _PAGE_WRITE, + .val = _PAGE_WRITE, + .set = "w", + .clear = " ", + }, { + .mask = _PAGE_EXEC, + .val = _PAGE_EXEC, + .set = " X ", + .clear = " ", + }, { + .mask = _PAGE_PTE, + .val = _PAGE_PTE, + .set = "pte", + .clear = " ", + }, { + .mask = _PAGE_PRESENT, + .val = _PAGE_PRESENT, + .set = "valid", + .clear = " ", + }, { + .mask = _PAGE_PRESENT | _PAGE_INVALID, + .val = 0, + .set = " ", + .clear = "present", + }, { + .mask = H_PAGE_HASHPTE, + .val = H_PAGE_HASHPTE, + .set = "hpte", + .clear = " ", + }, { + .mask = _PAGE_DIRTY, + .val = _PAGE_DIRTY, + .set = "dirty", + .clear = " ", + }, { + .mask = _PAGE_ACCESSED, + .val = _PAGE_ACCESSED, + .set = "accessed", + .clear = " ", + }, { + .mask = _PAGE_NON_IDEMPOTENT, + .val = _PAGE_NON_IDEMPOTENT, + .set = "non-idempotent", + .clear = " ", + }, { + .mask = _PAGE_TOLERANT, + .val = _PAGE_TOLERANT, + .set = "tolerant", + .clear = " ", + }, { + .mask = H_PAGE_BUSY, + .val = H_PAGE_BUSY, + .set = "busy", + }, { +#ifdef CONFIG_PPC_64K_PAGES + .mask = H_PAGE_COMBO, + .val = H_PAGE_COMBO, + .set = "combo", + }, { + .mask = H_PAGE_4K_PFN, + .val = H_PAGE_4K_PFN, + .set = "4K_pfn", + }, { +#else /* CONFIG_PPC_64K_PAGES */ + .mask = H_PAGE_F_GIX, + .val = H_PAGE_F_GIX, + .set = "f_gix", + .is_val = true, + .shift = H_PAGE_F_GIX_SHIFT, + }, { + .mask = H_PAGE_F_SECOND, + .val = H_PAGE_F_SECOND, + .set = "f_second", + }, { +#endif /* CONFIG_PPC_64K_PAGES */ + .mask = _PAGE_SPECIAL, + .val = _PAGE_SPECIAL, + .set = "special", + } +}; + +struct ptdump_pg_level pg_level[5] = { + { /* pgd */ + .name = "PGD", + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* p4d */ + .name = "P4D", + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pud */ + .name = "PUD", + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pmd */ + .name = "PMD", + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pte */ + .name = "PTE", + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, +}; diff --git a/arch/powerpc/mm/ptdump/hashpagetable.c b/arch/powerpc/mm/ptdump/hashpagetable.c new file mode 100644 index 000000000000..671d0dc00c6d --- /dev/null +++ b/arch/powerpc/mm/ptdump/hashpagetable.c @@ -0,0 +1,549 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2016, Rashmica Gupta, IBM Corp. + * + * This traverses the kernel virtual memory and dumps the pages that are in + * the hash pagetable, along with their flags to + * /sys/kernel/debug/kernel_hash_pagetable. + * + * If radix is enabled then there is no hash page table and so no debugfs file + * is generated. 
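[Editorial example] The flag_array tables added for each platform drive the dumpers the same way: each entry compares (pte & mask) against val and prints the set or clear string, unless is_val marks it as a numeric field to extract. A minimal sketch of that matching loop against a made-up PTE value; the struct and flag values below are simplified stand-ins for the real tables:

	#include <stdio.h>
	#include <stdint.h>

	struct flag_info {
		uint64_t mask, val;
		const char *set, *clear;
	};

	static const struct flag_info flags[] = {
		{ 0x2, 0x2, "rw",    "r " },
		{ 0x4, 0x4, "dirty", "     " },
		{ 0x1, 0x1, "valid", "     " },
	};

	int main(void)
	{
		uint64_t pte = 0x3;	/* made-up value: valid + writable, clean */

		for (unsigned int i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
			const struct flag_info *f = &flags[i];
			const char *s = ((pte & f->mask) == f->val) ? f->set : f->clear;

			if (s)
				printf(" %s", s);
		}
		printf("\n");
		return 0;
	}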
+ */ +#include <linux/debugfs.h> +#include <linux/fs.h> +#include <linux/io.h> +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <linux/const.h> +#include <asm/page.h> +#include <asm/plpar_wrappers.h> +#include <linux/memblock.h> +#include <asm/firmware.h> +#include <asm/pgalloc.h> + +struct pg_state { + struct seq_file *seq; + const struct addr_marker *marker; + unsigned long start_address; + unsigned int level; + u64 current_flags; +}; + +struct addr_marker { + unsigned long start_address; + const char *name; +}; + +static struct addr_marker address_markers[] = { + { 0, "Start of kernel VM" }, + { 0, "vmalloc() Area" }, + { 0, "vmalloc() End" }, + { 0, "isa I/O start" }, + { 0, "isa I/O end" }, + { 0, "phb I/O start" }, + { 0, "phb I/O end" }, + { 0, "I/O remap start" }, + { 0, "I/O remap end" }, + { 0, "vmemmap start" }, + { -1, NULL }, +}; + +struct flag_info { + u64 mask; + u64 val; + const char *set; + const char *clear; + bool is_val; + int shift; +}; + +static const struct flag_info v_flag_array[] = { + { + .mask = SLB_VSID_B, + .val = SLB_VSID_B_256M, + .set = "ssize: 256M", + .clear = "ssize: 1T ", + }, { + .mask = HPTE_V_SECONDARY, + .val = HPTE_V_SECONDARY, + .set = "secondary", + .clear = "primary ", + }, { + .mask = HPTE_V_VALID, + .val = HPTE_V_VALID, + .set = "valid ", + .clear = "invalid", + }, { + .mask = HPTE_V_BOLTED, + .val = HPTE_V_BOLTED, + .set = "bolted", + .clear = "", + } +}; + +static const struct flag_info r_flag_array[] = { + { + .mask = HPTE_R_PP0 | HPTE_R_PP, + .val = PP_RWXX, + .set = "prot:RW--", + }, { + .mask = HPTE_R_PP0 | HPTE_R_PP, + .val = PP_RWRX, + .set = "prot:RWR-", + }, { + .mask = HPTE_R_PP0 | HPTE_R_PP, + .val = PP_RWRW, + .set = "prot:RWRW", + }, { + .mask = HPTE_R_PP0 | HPTE_R_PP, + .val = PP_RXRX, + .set = "prot:R-R-", + }, { + .mask = HPTE_R_PP0 | HPTE_R_PP, + .val = PP_RXXX, + .set = "prot:R---", + }, { + .mask = HPTE_R_KEY_HI | HPTE_R_KEY_LO, + .val = HPTE_R_KEY_HI | HPTE_R_KEY_LO, + .set = "key", + .clear = "", + .is_val = true, + }, { + .mask = HPTE_R_R, + .val = HPTE_R_R, + .set = "ref", + .clear = " ", + }, { + .mask = HPTE_R_C, + .val = HPTE_R_C, + .set = "changed", + .clear = " ", + }, { + .mask = HPTE_R_N, + .val = HPTE_R_N, + .set = "no execute", + }, { + .mask = HPTE_R_WIMG, + .val = HPTE_R_W, + .set = "writethru", + }, { + .mask = HPTE_R_WIMG, + .val = HPTE_R_I, + .set = "no cache", + }, { + .mask = HPTE_R_WIMG, + .val = HPTE_R_G, + .set = "guarded", + } +}; + +static int calculate_pagesize(struct pg_state *st, int ps, char s[]) +{ + static const char units[] = "BKMGTPE"; + const char *unit = units; + + while (ps > 9 && unit[1]) { + ps -= 10; + unit++; + } + seq_printf(st->seq, " %s_ps: %i%c\t", s, 1<<ps, *unit); + return ps; +} + +static void dump_flag_info(struct pg_state *st, const struct flag_info + *flag, u64 pte, int num) +{ + unsigned int i; + + for (i = 0; i < num; i++, flag++) { + const char *s = NULL; + u64 val; + + /* flag not defined so don't check it */ + if (flag->mask == 0) + continue; + /* Some 'flags' are actually values */ + if (flag->is_val) { + val = pte & flag->val; + if (flag->shift) + val = val >> flag->shift; + seq_printf(st->seq, " %s:%llx", flag->set, val); + } else { + if ((pte & flag->mask) == flag->val) + s = flag->set; + else + s = flag->clear; + if (s) + seq_printf(st->seq, " %s", s); + } + } +} + +static void dump_hpte_info(struct pg_state *st, unsigned long ea, u64 v, u64 r, + unsigned long rpn, int bps, int aps, unsigned long lp) +{ + int aps_index; + + 
while (ea >= st->marker[1].start_address) { + st->marker++; + seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); + } + seq_printf(st->seq, "0x%lx:\t", ea); + seq_printf(st->seq, "AVPN:%llx\t", HPTE_V_AVPN_VAL(v)); + dump_flag_info(st, v_flag_array, v, ARRAY_SIZE(v_flag_array)); + seq_printf(st->seq, " rpn: %lx\t", rpn); + dump_flag_info(st, r_flag_array, r, ARRAY_SIZE(r_flag_array)); + + calculate_pagesize(st, bps, "base"); + aps_index = calculate_pagesize(st, aps, "actual"); + if (aps_index != 2) + seq_printf(st->seq, "LP enc: %lx", lp); + seq_putc(st->seq, '\n'); +} + + +static int native_find(unsigned long ea, int psize, bool primary, u64 *v, u64 + *r) +{ + struct hash_pte *hptep; + unsigned long hash, vsid, vpn, hpte_group, want_v, hpte_v; + int i, ssize = mmu_kernel_ssize; + unsigned long shift = mmu_psize_defs[psize].shift; + + /* calculate hash */ + vsid = get_kernel_vsid(ea, ssize); + vpn = hpt_vpn(ea, vsid, ssize); + hash = hpt_hash(vpn, shift, ssize); + want_v = hpte_encode_avpn(vpn, psize, ssize); + if (cpu_has_feature(CPU_FTR_ARCH_300)) + want_v = hpte_old_to_new_v(want_v); + + /* to check in the secondary hash table, we invert the hash */ + if (!primary) + hash = ~hash; + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; + for (i = 0; i < HPTES_PER_GROUP; i++) { + hptep = htab_address + hpte_group; + hpte_v = be64_to_cpu(hptep->v); + + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { + /* HPTE matches */ + *v = be64_to_cpu(hptep->v); + *r = be64_to_cpu(hptep->r); + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + *v = hpte_new_to_old_v(*v, *r); + *r = hpte_new_to_old_r(*r); + } + return 0; + } + ++hpte_group; + } + return -1; +} + +static int pseries_find(unsigned long ea, int psize, bool primary, u64 *v, u64 *r) +{ + struct { + unsigned long v; + unsigned long r; + } ptes[4]; + unsigned long vsid, vpn, hash, hpte_group, want_v; + int i, j, ssize = mmu_kernel_ssize; + long lpar_rc = 0; + unsigned long shift = mmu_psize_defs[psize].shift; + + /* calculate hash */ + vsid = get_kernel_vsid(ea, ssize); + vpn = hpt_vpn(ea, vsid, ssize); + hash = hpt_hash(vpn, shift, ssize); + want_v = hpte_encode_avpn(vpn, psize, ssize); + + /* to check in the secondary hash table, we invert the hash */ + if (!primary) + hash = ~hash; + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; + /* see if we can find an entry in the hpte with this hash */ + for (i = 0; i < HPTES_PER_GROUP; i += 4, hpte_group += 4) { + lpar_rc = plpar_pte_read_4(0, hpte_group, (void *)ptes); + + if (lpar_rc) + continue; + for (j = 0; j < 4; j++) { + if (HPTE_V_COMPARE(ptes[j].v, want_v) && + (ptes[j].v & HPTE_V_VALID)) { + /* HPTE matches */ + *v = ptes[j].v; + *r = ptes[j].r; + return 0; + } + } + } + return -1; +} + +static void decode_r(int bps, unsigned long r, unsigned long *rpn, int *aps, + unsigned long *lp_bits) +{ + struct mmu_psize_def entry; + unsigned long arpn, mask, lp; + int penc = -2, idx = 0, shift; + + /*. + * The LP field has 8 bits. Depending on the actual page size, some of + * these bits are concatenated with the APRN to get the RPN. The rest + * of the bits in the LP field is the LP value and is an encoding for + * the base page size and the actual page size. + * + * - find the mmu entry for our base page size + * - go through all page encodings and use the associated mask to + * find an encoding that matches our encoding in the LP field. 
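[Editorial example] decode_r() recovers the actual page size from the low bits of the ARPN: for each candidate size it masks off shift - HPTE_R_RPN_SHIFT low bits and compares them with that size's penc encoding, and the remaining high bits form the RPN. A simplified sketch with assumed penc values; the real encodings live in mmu_psize_defs and are filled in at boot:

	#include <stdio.h>

	#define HPTE_R_RPN_SHIFT 12

	struct psize { int shift; int penc; };

	/* Assumed encodings, for illustration only. */
	static const struct psize sizes[] = {
		{ 16, 0x01 },	/* 64K actual page, hypothetical penc */
		{ 24, 0x38 },	/* 16M actual page, hypothetical penc */
	};

	int main(void)
	{
		unsigned long arpn = 0x1234038UL;	/* made-up ARPN bits */
		unsigned long lp = arpn & 0xff;

		for (unsigned int i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
			int shift = sizes[i].shift - HPTE_R_RPN_SHIFT;
			unsigned long mask = (1UL << shift) - 1;

			if ((lp & mask) == (unsigned long)sizes[i].penc) {
				printf("actual page 2^%d, rpn %#lx, lp bits %#lx\n",
				       sizes[i].shift, arpn >> shift, lp & mask);
				return 0;
			}
		}
		printf("no encoding matched\n");
		return 0;
	}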
+ */ + arpn = (r & HPTE_R_RPN) >> HPTE_R_RPN_SHIFT; + lp = arpn & 0xff; + + entry = mmu_psize_defs[bps]; + while (idx < MMU_PAGE_COUNT) { + penc = entry.penc[idx]; + if ((penc != -1) && (mmu_psize_defs[idx].shift)) { + shift = mmu_psize_defs[idx].shift - HPTE_R_RPN_SHIFT; + mask = (0x1 << (shift)) - 1; + if ((lp & mask) == penc) { + *aps = mmu_psize_to_shift(idx); + *lp_bits = lp & mask; + *rpn = arpn >> shift; + return; + } + } + idx++; + } +} + +static int base_hpte_find(unsigned long ea, int psize, bool primary, u64 *v, + u64 *r) +{ + if (IS_ENABLED(CONFIG_PPC_PSERIES) && firmware_has_feature(FW_FEATURE_LPAR)) + return pseries_find(ea, psize, primary, v, r); + + return native_find(ea, psize, primary, v, r); +} + +static unsigned long hpte_find(struct pg_state *st, unsigned long ea, int psize) +{ + unsigned long slot; + u64 v = 0, r = 0; + unsigned long rpn, lp_bits; + int base_psize = 0, actual_psize = 0; + + if (ea < PAGE_OFFSET) + return -1; + + /* Look in primary table */ + slot = base_hpte_find(ea, psize, true, &v, &r); + + /* Look in secondary table */ + if (slot == -1) + slot = base_hpte_find(ea, psize, false, &v, &r); + + /* No entry found */ + if (slot == -1) + return -1; + + /* + * We found an entry in the hash page table: + * - check that this has the same base page + * - find the actual page size + * - find the RPN + */ + base_psize = mmu_psize_to_shift(psize); + + if ((v & HPTE_V_LARGE) == HPTE_V_LARGE) { + decode_r(psize, r, &rpn, &actual_psize, &lp_bits); + } else { + /* 4K actual page size */ + actual_psize = 12; + rpn = (r & HPTE_R_RPN) >> HPTE_R_RPN_SHIFT; + /* In this case there are no LP bits */ + lp_bits = -1; + } + /* + * We didn't find a matching encoding, so the PTE we found isn't for + * this address. + */ + if (actual_psize == -1) + return -1; + + dump_hpte_info(st, ea, v, r, rpn, base_psize, actual_psize, lp_bits); + return 0; +} + +static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start) +{ + pte_t *pte = pte_offset_kernel(pmd, 0); + unsigned long addr, pteval, psize; + int i, status; + + for (i = 0; i < PTRS_PER_PTE; i++, pte++) { + addr = start + i * PAGE_SIZE; + pteval = pte_val(*pte); + + if (addr < VMALLOC_END) + psize = mmu_vmalloc_psize; + else + psize = mmu_io_psize; + + /* check for secret 4K mappings */ + if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && + ((pteval & H_PAGE_COMBO) == H_PAGE_COMBO || + (pteval & H_PAGE_4K_PFN) == H_PAGE_4K_PFN)) + psize = mmu_io_psize; + + /* check for hashpte */ + status = hpte_find(st, addr, psize); + + if (((pteval & H_PAGE_HASHPTE) != H_PAGE_HASHPTE) + && (status != -1)) { + /* found a hpte that is not in the linux page tables */ + seq_printf(st->seq, "page probably bolted before linux" + " pagetables were set: addr:%lx, pteval:%lx\n", + addr, pteval); + } + } +} + +static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start) +{ + pmd_t *pmd = pmd_offset(pud, 0); + unsigned long addr; + unsigned int i; + + for (i = 0; i < PTRS_PER_PMD; i++, pmd++) { + addr = start + i * PMD_SIZE; + if (!pmd_none(*pmd)) + /* pmd exists */ + walk_pte(st, pmd, addr); + } +} + +static void walk_pud(struct pg_state *st, p4d_t *p4d, unsigned long start) +{ + pud_t *pud = pud_offset(p4d, 0); + unsigned long addr; + unsigned int i; + + for (i = 0; i < PTRS_PER_PUD; i++, pud++) { + addr = start + i * PUD_SIZE; + if (!pud_none(*pud)) + /* pud exists */ + walk_pmd(st, pud, addr); + } +} + +static void walk_p4d(struct pg_state *st, pgd_t *pgd, unsigned long start) +{ + p4d_t *p4d = p4d_offset(pgd, 0); + unsigned long 
addr; + unsigned int i; + + for (i = 0; i < PTRS_PER_P4D; i++, p4d++) { + addr = start + i * P4D_SIZE; + if (!p4d_none(*p4d)) + /* p4d exists */ + walk_pud(st, p4d, addr); + } +} + +static void walk_pagetables(struct pg_state *st) +{ + pgd_t *pgd = pgd_offset_k(0UL); + unsigned int i; + unsigned long addr; + + /* + * Traverse the linux pagetable structure and dump pages that are in + * the hash pagetable. + */ + for (i = 0; i < PTRS_PER_PGD; i++, pgd++) { + addr = KERN_VIRT_START + i * PGDIR_SIZE; + if (!pgd_none(*pgd)) + /* pgd exists */ + walk_p4d(st, pgd, addr); + } +} + + +static void walk_linearmapping(struct pg_state *st) +{ + unsigned long addr; + + /* + * Traverse the linear mapping section of virtual memory and dump pages + * that are in the hash pagetable. + */ + unsigned long psize = 1 << mmu_psize_defs[mmu_linear_psize].shift; + + for (addr = PAGE_OFFSET; addr < PAGE_OFFSET + + memblock_end_of_DRAM(); addr += psize) + hpte_find(st, addr, mmu_linear_psize); +} + +static void walk_vmemmap(struct pg_state *st) +{ + struct vmemmap_backing *ptr = vmemmap_list; + + if (!IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) + return; + /* + * Traverse the vmemmaped memory and dump pages that are in the hash + * pagetable. + */ + while (ptr) { + hpte_find(st, ptr->virt_addr, mmu_vmemmap_psize); + ptr = ptr->list; + } + seq_puts(st->seq, "---[ vmemmap end ]---\n"); +} + +static void populate_markers(void) +{ + address_markers[0].start_address = PAGE_OFFSET; + address_markers[1].start_address = VMALLOC_START; + address_markers[2].start_address = VMALLOC_END; + address_markers[3].start_address = ISA_IO_BASE; + address_markers[4].start_address = ISA_IO_END; + address_markers[5].start_address = PHB_IO_BASE; + address_markers[6].start_address = PHB_IO_END; + address_markers[7].start_address = IOREMAP_BASE; + address_markers[8].start_address = IOREMAP_END; + address_markers[9].start_address = H_VMEMMAP_START; +} + +static int ptdump_show(struct seq_file *m, void *v) +{ + struct pg_state st = { + .seq = m, + .start_address = PAGE_OFFSET, + .marker = address_markers, + }; + /* + * Traverse the 0xc, 0xd and 0xf areas of the kernel virtual memory and + * dump pages that are in the hash pagetable. + */ + walk_linearmapping(&st); + walk_pagetables(&st); + walk_vmemmap(&st); + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(ptdump); + +static int ptdump_init(void) +{ + if (!radix_enabled()) { + populate_markers(); + debugfs_create_file("kernel_hash_pagetable", 0400, NULL, NULL, + &ptdump_fops); + } + return 0; +} +device_initcall(ptdump_init); diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c new file mode 100644 index 000000000000..0d499aebee72 --- /dev/null +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -0,0 +1,425 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2016, Rashmica Gupta, IBM Corp. + * + * This traverses the kernel pagetables and dumps the + * information about the used sections of memory to + * /sys/kernel/debug/kernel_pagetables. + * + * Derived from the arm64 implementation: + * Copyright (c) 2014, The Linux Foundation, Laura Abbott. + * (C) Copyright 2008 Intel Corporation, Arjan van de Ven. 
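[Editorial example] walk_pagetables() and its helpers descend the levels by index arithmetic: each loop adds i times that level's span to the start address and only recurses into present entries. A generic sketch of that descent for a toy two-level table of plain arrays; the kernel version walks pgd/p4d/pud/pmd/pte with the real accessors:

	#include <stdio.h>

	#define ENTRIES   4
	#define LVL1_SIZE 0x1000UL			/* span of one leaf entry */
	#define LVL0_SIZE (ENTRIES * LVL1_SIZE)		/* span of one leaf table */

	static void walk_leaf(const unsigned long *tbl, unsigned long start)
	{
		for (int i = 0; i < ENTRIES; i++)
			if (tbl[i])			/* "pte present" */
				printf("  leaf %#lx -> %#lx\n",
				       start + i * LVL1_SIZE, tbl[i]);
	}

	int main(void)
	{
		unsigned long leaf_a[ENTRIES] = { 0x1000, 0, 0x3000, 0 };
		unsigned long *top[ENTRIES]   = { leaf_a, NULL, NULL, NULL };

		for (int i = 0; i < ENTRIES; i++) {
			unsigned long addr = 0xc0000000UL + i * LVL0_SIZE;

			if (top[i])			/* "pgd present" */
				walk_leaf(top[i], addr);
		}
		return 0;
	}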
+ */ +#include <linux/debugfs.h> +#include <linux/fs.h> +#include <linux/hugetlb.h> +#include <linux/io.h> +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/ptdump.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <asm/fixmap.h> +#include <linux/const.h> +#include <linux/kasan.h> +#include <asm/page.h> +#include <asm/hugetlb.h> + +#include <mm/mmu_decl.h> + +#include "ptdump.h" + +/* + * To visualise what is happening, + * + * - PTRS_PER_P** = how many entries there are in the corresponding P** + * - P**_SHIFT = how many bits of the address we use to index into the + * corresponding P** + * - P**_SIZE is how much memory we can access through the table - not the + * size of the table itself. + * P**={PGD, PUD, PMD, PTE} + * + * + * Each entry of the PGD points to a PUD. Each entry of a PUD points to a + * PMD. Each entry of a PMD points to a PTE. And every PTE entry points to + * a page. + * + * In the case where there are only 3 levels, the PUD is folded into the + * PGD: every PUD has only one entry which points to the PMD. + * + * The page dumper groups page table entries of the same type into a single + * description. It uses pg_state to track the range information while + * iterating over the PTE entries. When the continuity is broken it then + * dumps out a description of the range - ie PTEs that are virtually contiguous + * with the same PTE flags are chunked together. This is to make it clear how + * different areas of the kernel virtual memory are used. + * + */ +struct pg_state { + struct ptdump_state ptdump; + struct seq_file *seq; + const struct addr_marker *marker; + unsigned long start_address; + unsigned long start_pa; + int level; + u64 current_flags; + bool check_wx; + unsigned long wx_pages; +}; + +struct addr_marker { + unsigned long start_address; + const char *name; +}; + +static struct addr_marker address_markers[] = { + { 0, "Start of kernel VM" }, +#ifdef MODULES_VADDR + { 0, "modules start" }, + { 0, "modules end" }, +#endif + { 0, "vmalloc() Area" }, + { 0, "vmalloc() End" }, +#ifdef CONFIG_PPC64 + { 0, "isa I/O start" }, + { 0, "isa I/O end" }, + { 0, "phb I/O start" }, + { 0, "phb I/O end" }, + { 0, "I/O remap start" }, + { 0, "I/O remap end" }, + { 0, "vmemmap start" }, +#else + { 0, "Early I/O remap start" }, + { 0, "Early I/O remap end" }, +#ifdef CONFIG_HIGHMEM + { 0, "Highmem PTEs start" }, + { 0, "Highmem PTEs end" }, +#endif + { 0, "Fixmap start" }, + { 0, "Fixmap end" }, +#endif +#ifdef CONFIG_KASAN + { 0, "kasan shadow mem start" }, + { 0, "kasan shadow mem end" }, +#endif + { -1, NULL }, +}; + +static struct ptdump_range ptdump_range[] __ro_after_init = { + {TASK_SIZE_MAX, ~0UL}, + {0, 0} +}; + +#define pt_dump_seq_printf(m, fmt, args...) 
\ +({ \ + if (m) \ + seq_printf(m, fmt, ##args); \ +}) + +#define pt_dump_seq_putc(m, c) \ +({ \ + if (m) \ + seq_putc(m, c); \ +}) + +void pt_dump_size(struct seq_file *m, unsigned long size) +{ + static const char units[] = " KMGTPE"; + const char *unit = units; + + /* Work out what appropriate unit to use */ + while (!(size & 1023) && unit[1]) { + size >>= 10; + unit++; + } + pt_dump_seq_printf(m, "%9lu%c ", size, *unit); +} + +static void dump_flag_info(struct pg_state *st, const struct flag_info + *flag, u64 pte, int num) +{ + unsigned int i; + + for (i = 0; i < num; i++, flag++) { + const char *s = NULL; + u64 val; + + /* flag not defined so don't check it */ + if (flag->mask == 0) + continue; + /* Some 'flags' are actually values */ + if (flag->is_val) { + val = pte & flag->val; + if (flag->shift) + val = val >> flag->shift; + pt_dump_seq_printf(st->seq, " %s:%llx", flag->set, val); + } else { + if ((pte & flag->mask) == flag->val) + s = flag->set; + else + s = flag->clear; + if (s) + pt_dump_seq_printf(st->seq, " %s", s); + } + st->current_flags &= ~flag->mask; + } + if (st->current_flags != 0) + pt_dump_seq_printf(st->seq, " unknown flags:%llx", st->current_flags); +} + +static void dump_addr(struct pg_state *st, unsigned long addr) +{ +#ifdef CONFIG_PPC64 +#define REG "0x%016lx" +#else +#define REG "0x%08lx" +#endif + + pt_dump_seq_printf(st->seq, REG "-" REG " ", st->start_address, addr - 1); + pt_dump_seq_printf(st->seq, " " REG " ", st->start_pa); + pt_dump_size(st->seq, addr - st->start_address); + pt_dump_seq_printf(st->seq, "%s ", pg_level[st->level].name); +} + +static void note_prot_wx(struct pg_state *st, unsigned long addr) +{ + pte_t pte = __pte(st->current_flags); + + if (!st->check_wx) + return; + + if (!pte_write(pte) || !pte_exec(pte)) + return; + + WARN_ONCE(IS_ENABLED(CONFIG_DEBUG_WX), + "powerpc/mm: Found insecure W+X mapping at address %p/%pS\n", + (void *)st->start_address, (void *)st->start_address); + + st->wx_pages += (addr - st->start_address) / PAGE_SIZE; +} + +static void note_page_update_state(struct pg_state *st, unsigned long addr, int level, u64 val) +{ + u64 flag = level >= 0 ? val & pg_level[level].mask : 0; + u64 pa = val & PTE_RPN_MASK; + + st->level = level; + st->current_flags = flag; + st->start_address = addr; + st->start_pa = pa; + + while (addr >= st->marker[1].start_address) { + st->marker++; + pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); + } +} + +static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, u64 val) +{ + u64 flag = level >= 0 ? val & pg_level[level].mask : 0; + struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); + + /* At first no level is set */ + if (st->level == -1) { + pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); + note_page_update_state(st, addr, level, val); + /* + * Dump the section of virtual memory when: + * - the PTE flags from one entry to the next differs. + * - we change levels in the tree. + * - the address is in a different section of memory and is thus + * used for a different purpose, regardless of the flags. 
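[Editorial example] note_page() emits one line per maximal run of entries whose flags and level match, flushing the pending range only when the flags change, the level changes, or an address marker is crossed. A compact sketch of that coalescing over a flat array of per-page flag words; state handling is simplified and physical addresses and markers are omitted:

	#include <stdio.h>

	int main(void)
	{
		/* Made-up flag words for 8 consecutive pages. */
		unsigned long flags[8] = { 3, 3, 3, 5, 5, 0, 0, 3 };
		unsigned long page_size = 0x1000UL, base = 0xc0000000UL;

		unsigned long cur = flags[0], start = base;
		for (int i = 1; i <= 8; i++) {
			/* Sentinel value forces the final range to flush. */
			unsigned long f = (i < 8) ? flags[i] : ~0UL;

			if (f != cur) {
				printf("%#lx-%#lx flags %#lx\n",
				       start, base + i * page_size - 1, cur);
				start = base + i * page_size;
				cur = f;
			}
		}
		return 0;
	}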
+ */ + } else if (flag != st->current_flags || level != st->level || + addr >= st->marker[1].start_address) { + + /* Check the PTE flags */ + if (st->current_flags) { + note_prot_wx(st, addr); + dump_addr(st, addr); + + /* Dump all the flags */ + if (pg_level[st->level].flag) + dump_flag_info(st, pg_level[st->level].flag, + st->current_flags, + pg_level[st->level].num); + + pt_dump_seq_putc(st->seq, '\n'); + } + + /* + * Address indicates we have passed the end of the + * current section of virtual memory + */ + note_page_update_state(st, addr, level, val); + } +} + +static void populate_markers(void) +{ + int i = 0; + +#ifdef CONFIG_PPC64 + address_markers[i++].start_address = PAGE_OFFSET; +#else + address_markers[i++].start_address = TASK_SIZE; +#endif +#ifdef MODULES_VADDR + address_markers[i++].start_address = MODULES_VADDR; + address_markers[i++].start_address = MODULES_END; +#endif + address_markers[i++].start_address = VMALLOC_START; + address_markers[i++].start_address = VMALLOC_END; +#ifdef CONFIG_PPC64 + address_markers[i++].start_address = ISA_IO_BASE; + address_markers[i++].start_address = ISA_IO_END; + address_markers[i++].start_address = PHB_IO_BASE; + address_markers[i++].start_address = PHB_IO_END; + address_markers[i++].start_address = IOREMAP_BASE; + address_markers[i++].start_address = IOREMAP_END; + /* What is the ifdef about? */ +#ifdef CONFIG_PPC_BOOK3S_64 + address_markers[i++].start_address = H_VMEMMAP_START; +#else + address_markers[i++].start_address = VMEMMAP_BASE; +#endif +#else /* !CONFIG_PPC64 */ + address_markers[i++].start_address = ioremap_bot; + address_markers[i++].start_address = IOREMAP_TOP; +#ifdef CONFIG_HIGHMEM + address_markers[i++].start_address = PKMAP_BASE; + address_markers[i++].start_address = PKMAP_ADDR(LAST_PKMAP); +#endif + address_markers[i++].start_address = FIXADDR_START; + address_markers[i++].start_address = FIXADDR_TOP; +#endif /* CONFIG_PPC64 */ +#ifdef CONFIG_KASAN + address_markers[i++].start_address = KASAN_SHADOW_START; + address_markers[i++].start_address = KASAN_SHADOW_END; +#endif +} + +static void note_page_pte(struct ptdump_state *pt_st, unsigned long addr, pte_t pte) +{ + note_page(pt_st, addr, 4, pte_val(pte)); +} + +static void note_page_pmd(struct ptdump_state *pt_st, unsigned long addr, pmd_t pmd) +{ + note_page(pt_st, addr, 3, pmd_val(pmd)); +} + +static void note_page_pud(struct ptdump_state *pt_st, unsigned long addr, pud_t pud) +{ + note_page(pt_st, addr, 2, pud_val(pud)); +} + +static void note_page_p4d(struct ptdump_state *pt_st, unsigned long addr, p4d_t p4d) +{ + note_page(pt_st, addr, 1, p4d_val(p4d)); +} + +static void note_page_pgd(struct ptdump_state *pt_st, unsigned long addr, pgd_t pgd) +{ + note_page(pt_st, addr, 0, pgd_val(pgd)); +} + +static void note_page_flush(struct ptdump_state *pt_st) +{ + pte_t pte_zero = {0}; + + note_page(pt_st, 0, -1, pte_val(pte_zero)); +} + +static int ptdump_show(struct seq_file *m, void *v) +{ + struct pg_state st = { + .seq = m, + .marker = address_markers, + .level = -1, + .ptdump = { + .note_page_pte = note_page_pte, + .note_page_pmd = note_page_pmd, + .note_page_pud = note_page_pud, + .note_page_p4d = note_page_p4d, + .note_page_pgd = note_page_pgd, + .note_page_flush = note_page_flush, + .range = ptdump_range, + } + }; + + /* Traverse kernel page tables */ + ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(ptdump); + +static void __init build_pgtable_complete_mask(void) +{ + unsigned int i, j; + + for (i = 0; i < ARRAY_SIZE(pg_level); 
i++) + if (pg_level[i].flag) + for (j = 0; j < pg_level[i].num; j++) + pg_level[i].mask |= pg_level[i].flag[j].mask; +} + +bool ptdump_check_wx(void) +{ + struct pg_state st = { + .seq = NULL, + .marker = (struct addr_marker[]) { + { 0, NULL}, + { -1, NULL}, + }, + .level = -1, + .check_wx = true, + .ptdump = { + .note_page_pte = note_page_pte, + .note_page_pmd = note_page_pmd, + .note_page_pud = note_page_pud, + .note_page_p4d = note_page_p4d, + .note_page_pgd = note_page_pgd, + .note_page_flush = note_page_flush, + .range = ptdump_range, + } + }; + + if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !mmu_has_feature(MMU_FTR_KERNEL_RO)) + return true; + + ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); + + if (st.wx_pages) { + pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n", + st.wx_pages); + + return false; + } else { + pr_info("Checked W+X mappings: passed, no W+X pages found\n"); + + return true; + } +} + +static int __init ptdump_init(void) +{ +#ifdef CONFIG_PPC64 + if (!radix_enabled()) + ptdump_range[0].start = KERN_VIRT_START; + else + ptdump_range[0].start = PAGE_OFFSET; + + ptdump_range[0].end = PAGE_OFFSET + (PGDIR_SIZE * PTRS_PER_PGD); +#endif + + populate_markers(); + build_pgtable_complete_mask(); + + if (IS_ENABLED(CONFIG_PTDUMP_DEBUGFS)) + debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, &ptdump_fops); + + return 0; +} +device_initcall(ptdump_init); diff --git a/arch/powerpc/mm/ptdump/ptdump.h b/arch/powerpc/mm/ptdump/ptdump.h new file mode 100644 index 000000000000..12aa9eca8b0c --- /dev/null +++ b/arch/powerpc/mm/ptdump/ptdump.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/types.h> +#include <linux/seq_file.h> + +struct flag_info { + u64 mask; + u64 val; + const char *set; + const char *clear; + bool is_val; + int shift; +}; + +struct ptdump_pg_level { + const struct flag_info *flag; + char name[4]; + size_t num; + u64 mask; +}; + +extern struct ptdump_pg_level pg_level[5]; + +void pt_dump_size(struct seq_file *m, unsigned long delta); diff --git a/arch/powerpc/mm/ptdump/segment_regs.c b/arch/powerpc/mm/ptdump/segment_regs.c new file mode 100644 index 000000000000..9df3af8d481f --- /dev/null +++ b/arch/powerpc/mm/ptdump/segment_regs.c @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright 2018, Christophe Leroy CS S.I. 
+ * <christophe.leroy@c-s.fr> + * + * This dumps the content of Segment Registers + */ + +#include <linux/debugfs.h> + +static void seg_show(struct seq_file *m, int i) +{ + u32 val = mfsr(i << 28); + + seq_printf(m, "0x%01x0000000-0x%01xfffffff ", i, i); + seq_printf(m, "Kern key %d ", (val >> 30) & 1); + seq_printf(m, "User key %d ", (val >> 29) & 1); + if (val & 0x80000000) { + seq_printf(m, "Device 0x%03x", (val >> 20) & 0x1ff); + seq_printf(m, "-0x%05x", val & 0xfffff); + } else { + if (val & 0x10000000) + seq_puts(m, "No Exec "); + seq_printf(m, "VSID 0x%06x", val & 0xffffff); + } + seq_puts(m, "\n"); +} + +static int sr_show(struct seq_file *m, void *v) +{ + int i; + + seq_puts(m, "---[ User Segments ]---\n"); + for (i = 0; i < TASK_SIZE >> 28; i++) + seg_show(m, i); + + seq_puts(m, "\n---[ Kernel Segments ]---\n"); + for (; i < 16; i++) + seg_show(m, i); + + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(sr); + +static int __init sr_init(void) +{ + debugfs_create_file("segment_registers", 0400, arch_debugfs_dir, + NULL, &sr_fops); + return 0; +} +device_initcall(sr_init); diff --git a/arch/powerpc/mm/ptdump/shared.c b/arch/powerpc/mm/ptdump/shared.c new file mode 100644 index 000000000000..edc69da19b85 --- /dev/null +++ b/arch/powerpc/mm/ptdump/shared.c @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * From split of dump_linuxpagetables.c + * Copyright 2016, Rashmica Gupta, IBM Corp. + * + */ +#include <linux/kernel.h> +#include <linux/pgtable.h> + +#include "ptdump.h" + +static const struct flag_info flag_array[] = { + { + .mask = _PAGE_READ, + .val = 0, + .set = " ", + .clear = "r", + }, { + .mask = _PAGE_WRITE, + .val = 0, + .set = " ", + .clear = "w", + }, { + .mask = _PAGE_EXEC, + .val = _PAGE_EXEC, + .set = " X ", + .clear = " ", + }, { + .mask = _PAGE_PRESENT, + .val = _PAGE_PRESENT, + .set = "present", + .clear = " ", + }, { + .mask = _PAGE_COHERENT, + .val = _PAGE_COHERENT, + .set = "coherent", + .clear = " ", + }, { + .mask = _PAGE_GUARDED, + .val = _PAGE_GUARDED, + .set = "guarded", + .clear = " ", + }, { + .mask = _PAGE_DIRTY, + .val = _PAGE_DIRTY, + .set = "dirty", + .clear = " ", + }, { + .mask = _PAGE_ACCESSED, + .val = _PAGE_ACCESSED, + .set = "accessed", + .clear = " ", + }, { + .mask = _PAGE_WRITETHRU, + .val = _PAGE_WRITETHRU, + .set = "write through", + .clear = " ", + }, { + .mask = _PAGE_NO_CACHE, + .val = _PAGE_NO_CACHE, + .set = "no cache", + .clear = " ", + }, { + .mask = _PAGE_SPECIAL, + .val = _PAGE_SPECIAL, + .set = "special", + } +}; + +struct ptdump_pg_level pg_level[5] = { + { /* pgd */ + .name = "PGD", + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* p4d */ + .name = "P4D", + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pud */ + .name = "PUD", + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pmd */ + .name = "PMD", + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pte */ + .name = "PTE", + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, +}; diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c deleted file mode 100644 index a538c80db2df..000000000000 --- a/arch/powerpc/mm/slb.c +++ /dev/null @@ -1,332 +0,0 @@ -/* - * PowerPC64 SLB support. 
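The seg_show() routine in segment_regs.c above decodes one 32-bit segment register. A standalone sketch of the same field layout (T in bit 31, Ks in bit 30, Kp in bit 29, N in bit 28, VSID in the low 24 bits) follows; decode_sr() and both sample values are made up for illustration.

#include <stdio.h>
#include <stdint.h>

static void decode_sr(unsigned int i, uint32_t val)
{
	printf("0x%01x0000000-0x%01xfffffff ", i, i);
	printf("Kern key %u ", (val >> 30) & 1);
	printf("User key %u ", (val >> 29) & 1);
	if (val & 0x80000000) {		/* T = 1: direct-store segment */
		printf("Device 0x%03x", (val >> 20) & 0x1ff);
		printf("-0x%05x", val & 0xfffff);
	} else {
		if (val & 0x10000000)	/* N bit: no-execute */
			printf("No Exec ");
		printf("VSID 0x%06x", val & 0xffffff);
	}
	printf("\n");
}

int main(void)
{
	decode_sr(0, 0x10000123);	/* ordinary segment, no-exec, VSID 0x123 */
	decode_sr(12, 0x60000456);	/* both protection keys set, VSID 0x456 */
	return 0;
}
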
- * - * Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM - * Based on earlier code written by: - * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com - * Copyright (c) 2001 Dave Engebretsen - * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM - * - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <asm/pgtable.h> -#include <asm/mmu.h> -#include <asm/mmu_context.h> -#include <asm/paca.h> -#include <asm/cputable.h> -#include <asm/cacheflush.h> -#include <asm/smp.h> -#include <linux/compiler.h> -#include <asm/udbg.h> -#include <asm/code-patching.h> - - -extern void slb_allocate_realmode(unsigned long ea); -extern void slb_allocate_user(unsigned long ea); - -static void slb_allocate(unsigned long ea) -{ - /* Currently, we do real mode for all SLBs including user, but - * that will change if we bring back dynamic VSIDs - */ - slb_allocate_realmode(ea); -} - -#define slb_esid_mask(ssize) \ - (((ssize) == MMU_SEGSIZE_256M)? ESID_MASK: ESID_MASK_1T) - -static inline unsigned long mk_esid_data(unsigned long ea, int ssize, - unsigned long slot) -{ - return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | slot; -} - -#define slb_vsid_shift(ssize) \ - ((ssize) == MMU_SEGSIZE_256M? SLB_VSID_SHIFT: SLB_VSID_SHIFT_1T) - -static inline unsigned long mk_vsid_data(unsigned long ea, int ssize, - unsigned long flags) -{ - return (get_kernel_vsid(ea, ssize) << slb_vsid_shift(ssize)) | flags | - ((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT); -} - -static inline void slb_shadow_update(unsigned long ea, int ssize, - unsigned long flags, - unsigned long entry) -{ - /* - * Clear the ESID first so the entry is not valid while we are - * updating it. No write barriers are needed here, provided - * we only update the current CPU's SLB shadow buffer. - */ - get_slb_shadow()->save_area[entry].esid = 0; - get_slb_shadow()->save_area[entry].vsid = mk_vsid_data(ea, ssize, flags); - get_slb_shadow()->save_area[entry].esid = mk_esid_data(ea, ssize, entry); -} - -static inline void slb_shadow_clear(unsigned long entry) -{ - get_slb_shadow()->save_area[entry].esid = 0; -} - -static inline void create_shadowed_slbe(unsigned long ea, int ssize, - unsigned long flags, - unsigned long entry) -{ - /* - * Updating the shadow buffer before writing the SLB ensures - * we don't get a stale entry here if we get preempted by PHYP - * between these two statements. - */ - slb_shadow_update(ea, ssize, flags, entry); - - asm volatile("slbmte %0,%1" : - : "r" (mk_vsid_data(ea, ssize, flags)), - "r" (mk_esid_data(ea, ssize, entry)) - : "memory" ); -} - -static void __slb_flush_and_rebolt(void) -{ - /* If you change this make sure you change SLB_NUM_BOLTED - * appropriately too. 
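mk_esid_data() above builds the ESID half of an SLB entry by masking the effective address, setting the valid bit and or-ing in the slot number. A userspace sketch for the 256M-segment case, with ESID_MASK and SLB_ESID_V written out by hand from the headers of this era and a made-up sample address:

#include <stdio.h>

#define ESID_MASK	0xfffffffff0000000UL	/* 256M segments, SID_SHIFT = 28 */
#define SLB_ESID_V	0x0000000008000000UL	/* entry valid bit */

static unsigned long mk_esid_data(unsigned long ea, unsigned long slot)
{
	return (ea & ESID_MASK) | SLB_ESID_V | slot;
}

int main(void)
{
	/* Hypothetical kernel stack address in the linear mapping. */
	unsigned long ea = 0xc000000012345678UL;

	printf("esid_data = 0x%016lx\n", mk_esid_data(ea, 2));
	/* -> 0xc000000018000002: ESID | valid bit | bolted slot 2 */
	return 0;
}
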
*/ - unsigned long linear_llp, vmalloc_llp, lflags, vflags; - unsigned long ksp_esid_data, ksp_vsid_data; - - linear_llp = mmu_psize_defs[mmu_linear_psize].sllp; - vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp; - lflags = SLB_VSID_KERNEL | linear_llp; - vflags = SLB_VSID_KERNEL | vmalloc_llp; - - ksp_esid_data = mk_esid_data(get_paca()->kstack, mmu_kernel_ssize, 2); - if ((ksp_esid_data & ~0xfffffffUL) <= PAGE_OFFSET) { - ksp_esid_data &= ~SLB_ESID_V; - ksp_vsid_data = 0; - slb_shadow_clear(2); - } else { - /* Update stack entry; others don't change */ - slb_shadow_update(get_paca()->kstack, mmu_kernel_ssize, lflags, 2); - ksp_vsid_data = get_slb_shadow()->save_area[2].vsid; - } - - /* We need to do this all in asm, so we're sure we don't touch - * the stack between the slbia and rebolting it. */ - asm volatile("isync\n" - "slbia\n" - /* Slot 1 - first VMALLOC segment */ - "slbmte %0,%1\n" - /* Slot 2 - kernel stack */ - "slbmte %2,%3\n" - "isync" - :: "r"(mk_vsid_data(VMALLOC_START, mmu_kernel_ssize, vflags)), - "r"(mk_esid_data(VMALLOC_START, mmu_kernel_ssize, 1)), - "r"(ksp_vsid_data), - "r"(ksp_esid_data) - : "memory"); -} - -void slb_flush_and_rebolt(void) -{ - - WARN_ON(!irqs_disabled()); - - /* - * We can't take a PMU exception in the following code, so hard - * disable interrupts. - */ - hard_irq_disable(); - - __slb_flush_and_rebolt(); - get_paca()->slb_cache_ptr = 0; -} - -void slb_vmalloc_update(void) -{ - unsigned long vflags; - - vflags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmalloc_psize].sllp; - slb_shadow_update(VMALLOC_START, mmu_kernel_ssize, vflags, 1); - slb_flush_and_rebolt(); -} - -/* Helper function to compare esids. There are four cases to handle. - * 1. The system is not 1T segment size capable. Use the GET_ESID compare. - * 2. The system is 1T capable, both addresses are < 1T, use the GET_ESID compare. - * 3. The system is 1T capable, only one of the two addresses is > 1T. This is not a match. - * 4. The system is 1T capable, both addresses are > 1T, use the GET_ESID_1T macro to compare. - */ -static inline int esids_match(unsigned long addr1, unsigned long addr2) -{ - int esid_1t_count; - - /* System is not 1T segment size capable. */ - if (!mmu_has_feature(MMU_FTR_1T_SEGMENT)) - return (GET_ESID(addr1) == GET_ESID(addr2)); - - esid_1t_count = (((addr1 >> SID_SHIFT_1T) != 0) + - ((addr2 >> SID_SHIFT_1T) != 0)); - - /* both addresses are < 1T */ - if (esid_1t_count == 0) - return (GET_ESID(addr1) == GET_ESID(addr2)); - - /* One address < 1T, the other > 1T. Not a match */ - if (esid_1t_count == 1) - return 0; - - /* Both addresses are > 1T. */ - return (GET_ESID_1T(addr1) == GET_ESID_1T(addr2)); -} - -/* Flush all user entries from the segment table of the current processor. */ -void switch_slb(struct task_struct *tsk, struct mm_struct *mm) -{ - unsigned long offset; - unsigned long slbie_data = 0; - unsigned long pc = KSTK_EIP(tsk); - unsigned long stack = KSTK_ESP(tsk); - unsigned long exec_base; - - /* - * We need interrupts hard-disabled here, not just soft-disabled, - * so that a PMU interrupt can't occur, which might try to access - * user memory (to get a stack trace) and possible cause an SLB miss - * which would update the slb_cache/slb_cache_ptr fields in the PACA. 
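esids_match() above handles exactly the four cases listed in its comment. A compact userspace rendering of the 1T-capable path (SID_SHIFT = 28, SID_SHIFT_1T = 40, 64-bit build assumed; the function name and test addresses are made up):

#include <stdio.h>

#define SID_SHIFT	28
#define SID_SHIFT_1T	40

static int esids_match_1t_capable(unsigned long a, unsigned long b)
{
	int above_1t = ((a >> SID_SHIFT_1T) != 0) + ((b >> SID_SHIFT_1T) != 0);

	if (above_1t == 0)	/* both below 1T: compare 256M ESIDs */
		return (a >> SID_SHIFT) == (b >> SID_SHIFT);
	if (above_1t == 1)	/* one below, one above: never a match */
		return 0;
	return (a >> SID_SHIFT_1T) == (b >> SID_SHIFT_1T);
}

int main(void)
{
	printf("%d\n", esids_match_1t_capable(0x10000000UL, 0x1f000000UL)); /* 1: same 256M segment */
	printf("%d\n", esids_match_1t_capable(0x10000000UL, 0x20000000UL)); /* 0: different segments */
	printf("%d\n", esids_match_1t_capable(0x10000000UL, 1UL << 40));    /* 0: straddles 1T boundary */
	return 0;
}
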
- */ - hard_irq_disable(); - offset = get_paca()->slb_cache_ptr; - if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) && - offset <= SLB_CACHE_ENTRIES) { - int i; - asm volatile("isync" : : : "memory"); - for (i = 0; i < offset; i++) { - slbie_data = (unsigned long)get_paca()->slb_cache[i] - << SID_SHIFT; /* EA */ - slbie_data |= user_segment_size(slbie_data) - << SLBIE_SSIZE_SHIFT; - slbie_data |= SLBIE_C; /* C set for user addresses */ - asm volatile("slbie %0" : : "r" (slbie_data)); - } - asm volatile("isync" : : : "memory"); - } else { - __slb_flush_and_rebolt(); - } - - /* Workaround POWER5 < DD2.1 issue */ - if (offset == 1 || offset > SLB_CACHE_ENTRIES) - asm volatile("slbie %0" : : "r" (slbie_data)); - - get_paca()->slb_cache_ptr = 0; - get_paca()->context = mm->context; - - /* - * preload some userspace segments into the SLB. - * Almost all 32 and 64bit PowerPC executables are linked at - * 0x10000000 so it makes sense to preload this segment. - */ - exec_base = 0x10000000; - - if (is_kernel_addr(pc) || is_kernel_addr(stack) || - is_kernel_addr(exec_base)) - return; - - slb_allocate(pc); - - if (!esids_match(pc, stack)) - slb_allocate(stack); - - if (!esids_match(pc, exec_base) && - !esids_match(stack, exec_base)) - slb_allocate(exec_base); -} - -static inline void patch_slb_encoding(unsigned int *insn_addr, - unsigned int immed) -{ - int insn = (*insn_addr & 0xffff0000) | immed; - patch_instruction(insn_addr, insn); -} - -void slb_set_size(u16 size) -{ - extern unsigned int *slb_compare_rr_to_size; - - if (mmu_slb_size == size) - return; - - mmu_slb_size = size; - patch_slb_encoding(slb_compare_rr_to_size, mmu_slb_size); -} - -void slb_initialize(void) -{ - unsigned long linear_llp, vmalloc_llp, io_llp; - unsigned long lflags, vflags; - static int slb_encoding_inited; - extern unsigned int *slb_miss_kernel_load_linear; - extern unsigned int *slb_miss_kernel_load_io; - extern unsigned int *slb_compare_rr_to_size; -#ifdef CONFIG_SPARSEMEM_VMEMMAP - extern unsigned int *slb_miss_kernel_load_vmemmap; - unsigned long vmemmap_llp; -#endif - - /* Prepare our SLB miss handler based on our page size */ - linear_llp = mmu_psize_defs[mmu_linear_psize].sllp; - io_llp = mmu_psize_defs[mmu_io_psize].sllp; - vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp; - get_paca()->vmalloc_sllp = SLB_VSID_KERNEL | vmalloc_llp; -#ifdef CONFIG_SPARSEMEM_VMEMMAP - vmemmap_llp = mmu_psize_defs[mmu_vmemmap_psize].sllp; -#endif - if (!slb_encoding_inited) { - slb_encoding_inited = 1; - patch_slb_encoding(slb_miss_kernel_load_linear, - SLB_VSID_KERNEL | linear_llp); - patch_slb_encoding(slb_miss_kernel_load_io, - SLB_VSID_KERNEL | io_llp); - patch_slb_encoding(slb_compare_rr_to_size, - mmu_slb_size); - - pr_devel("SLB: linear LLP = %04lx\n", linear_llp); - pr_devel("SLB: io LLP = %04lx\n", io_llp); - -#ifdef CONFIG_SPARSEMEM_VMEMMAP - patch_slb_encoding(slb_miss_kernel_load_vmemmap, - SLB_VSID_KERNEL | vmemmap_llp); - pr_devel("SLB: vmemmap LLP = %04lx\n", vmemmap_llp); -#endif - } - - get_paca()->stab_rr = SLB_NUM_BOLTED; - - lflags = SLB_VSID_KERNEL | linear_llp; - vflags = SLB_VSID_KERNEL | vmalloc_llp; - - /* Invalidate the entire SLB (even slot 0) & all the ERATS */ - asm volatile("isync":::"memory"); - asm volatile("slbmte %0,%0"::"r" (0) : "memory"); - asm volatile("isync; slbia; isync":::"memory"); - create_shadowed_slbe(PAGE_OFFSET, mmu_kernel_ssize, lflags, 0); - - create_shadowed_slbe(VMALLOC_START, mmu_kernel_ssize, vflags, 1); - - /* For the boot cpu, we're running on the stack in init_thread_union, - * 
which is in the first segment of the linear mapping, and also - * get_paca()->kstack hasn't been initialized yet. - * For secondary cpus, we need to bolt the kernel stack entry now. - */ - slb_shadow_clear(2); - if (raw_smp_processor_id() != boot_cpuid && - (get_paca()->kstack & slb_esid_mask(mmu_kernel_ssize)) > PAGE_OFFSET) - create_shadowed_slbe(get_paca()->kstack, - mmu_kernel_ssize, lflags, 2); - - asm volatile("isync":::"memory"); -} diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S deleted file mode 100644 index 17aa6dfceb34..000000000000 --- a/arch/powerpc/mm/slb_low.S +++ /dev/null @@ -1,317 +0,0 @@ -/* - * Low-level SLB routines - * - * Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM - * - * Based on earlier C version: - * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com - * Copyright (c) 2001 Dave Engebretsen - * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <asm/processor.h> -#include <asm/ppc_asm.h> -#include <asm/asm-offsets.h> -#include <asm/cputable.h> -#include <asm/page.h> -#include <asm/mmu.h> -#include <asm/pgtable.h> -#include <asm/firmware.h> - -/* void slb_allocate_realmode(unsigned long ea); - * - * Create an SLB entry for the given EA (user or kernel). - * r3 = faulting address, r13 = PACA - * r9, r10, r11 are clobbered by this function - * No other registers are examined or changed. - */ -_GLOBAL(slb_allocate_realmode) - /* - * check for bad kernel/user address - * (ea & ~REGION_MASK) >= PGTABLE_RANGE - */ - rldicr. r9,r3,4,(63 - 46 - 4) - bne- 8f - - srdi r9,r3,60 /* get region */ - srdi r10,r3,SID_SHIFT /* get esid */ - cmpldi cr7,r9,0xc /* cmp PAGE_OFFSET for later use */ - - /* r3 = address, r10 = esid, cr7 = <> PAGE_OFFSET */ - blt cr7,0f /* user or kernel? */ - - /* kernel address: proto-VSID = ESID */ - /* WARNING - MAGIC: we don't use the VSID 0xfffffffff, but - * this code will generate the protoVSID 0xfffffffff for the - * top segment. That's ok, the scramble below will translate - * it to VSID 0, which is reserved as a bad VSID - one which - * will never have any pages in it. */ - - /* Check if hitting the linear mapping or some other kernel space - */ - bne cr7,1f - - /* Linear mapping encoding bits, the "li" instruction below will - * be patched by the kernel at boot - */ -_GLOBAL(slb_miss_kernel_load_linear) - li r11,0 - /* - * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1 - * r9 = region id. - */ - addis r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@ha - addi r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@l - - -BEGIN_FTR_SECTION - b slb_finish_load -END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT) - b slb_finish_load_1T - -1: -#ifdef CONFIG_SPARSEMEM_VMEMMAP - /* Check virtual memmap region. To be patches at kernel boot */ - cmpldi cr0,r9,0xf - bne 1f -_GLOBAL(slb_miss_kernel_load_vmemmap) - li r11,0 - b 6f -1: -#endif /* CONFIG_SPARSEMEM_VMEMMAP */ - - /* vmalloc mapping gets the encoding from the PACA as the mapping - * can be demoted from 64K -> 4K dynamically on some machines - */ - clrldi r11,r10,48 - cmpldi r11,(VMALLOC_SIZE >> 28) - 1 - bgt 5f - lhz r11,PACAVMALLOCSLLP(r13) - b 6f -5: - /* IO mapping */ - _GLOBAL(slb_miss_kernel_load_io) - li r11,0 -6: - /* - * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1 - * r9 = region id. 
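The context formula in the comment above maps the kernel region id (ea >> 60) onto context numbers just past the last user context, so the linear map (region 0xc) gets the first kernel context, vmalloc (0xd) the second, and so on. A sketch of the arithmetic; MAX_USER_CONTEXT is set to an assumed illustrative value here, since the real one depends on CONTEXT_BITS for the given kernel:

#include <stdio.h>

#define MAX_USER_CONTEXT	((1UL << 19) - 2)	/* assumed value */

static unsigned long kernel_context(unsigned long ea)
{
	unsigned long region = ea >> 60;

	return MAX_USER_CONTEXT + (region - 0xc) + 1;
}

int main(void)
{
	printf("%lu\n", kernel_context(0xc000000000000000UL) - MAX_USER_CONTEXT); /* 1 */
	printf("%lu\n", kernel_context(0xd000000000000000UL) - MAX_USER_CONTEXT); /* 2 */
	printf("%lu\n", kernel_context(0xf000000000000000UL) - MAX_USER_CONTEXT); /* 4 */
	return 0;
}
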
- */ - addis r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@ha - addi r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@l - -BEGIN_FTR_SECTION - b slb_finish_load -END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT) - b slb_finish_load_1T - -0: - /* when using slices, we extract the psize off the slice bitmaps - * and then we need to get the sllp encoding off the mmu_psize_defs - * array. - * - * XXX This is a bit inefficient especially for the normal case, - * so we should try to implement a fast path for the standard page - * size using the old sllp value so we avoid the array. We cannot - * really do dynamic patching unfortunately as processes might flip - * between 4k and 64k standard page size - */ -#ifdef CONFIG_PPC_MM_SLICES - /* r10 have esid */ - cmpldi r10,16 - /* below SLICE_LOW_TOP */ - blt 5f - /* - * Handle hpsizes, - * r9 is get_paca()->context.high_slices_psize[index], r11 is mask_index - */ - srdi r11,r10,(SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT + 1) /* index */ - addi r9,r11,PACAHIGHSLICEPSIZE - lbzx r9,r13,r9 /* r9 is hpsizes[r11] */ - /* r11 = (r10 >> (SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT)) & 0x1 */ - rldicl r11,r10,(64 - (SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT)),63 - b 6f - -5: - /* - * Handle lpsizes - * r9 is get_paca()->context.low_slices_psize, r11 is index - */ - ld r9,PACALOWSLICESPSIZE(r13) - mr r11,r10 -6: - sldi r11,r11,2 /* index * 4 */ - /* Extract the psize and multiply to get an array offset */ - srd r9,r9,r11 - andi. r9,r9,0xf - mulli r9,r9,MMUPSIZEDEFSIZE - - /* Now get to the array and obtain the sllp - */ - ld r11,PACATOC(r13) - ld r11,mmu_psize_defs@got(r11) - add r11,r11,r9 - ld r11,MMUPSIZESLLP(r11) - ori r11,r11,SLB_VSID_USER -#else - /* paca context sllp already contains the SLB_VSID_USER bits */ - lhz r11,PACACONTEXTSLLP(r13) -#endif /* CONFIG_PPC_MM_SLICES */ - - ld r9,PACACONTEXTID(r13) -BEGIN_FTR_SECTION - cmpldi r10,0x1000 - bge slb_finish_load_1T -END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) - b slb_finish_load - -8: /* invalid EA */ - li r10,0 /* BAD_VSID */ - li r9,0 /* BAD_VSID */ - li r11,SLB_VSID_USER /* flags don't much matter */ - b slb_finish_load - -#ifdef __DISABLED__ - -/* void slb_allocate_user(unsigned long ea); - * - * Create an SLB entry for the given EA (user or kernel). - * r3 = faulting address, r13 = PACA - * r9, r10, r11 are clobbered by this function - * No other registers are examined or changed. - * - * It is called with translation enabled in order to be able to walk the - * page tables. This is not currently used. - */ -_GLOBAL(slb_allocate_user) - /* r3 = faulting address */ - srdi r10,r3,28 /* get esid */ - - crset 4*cr7+lt /* set "user" flag for later */ - - /* check if we fit in the range covered by the pagetables*/ - srdi. r9,r3,PGTABLE_EADDR_SIZE - crnot 4*cr0+eq,4*cr0+eq - beqlr - - /* now we need to get to the page tables in order to get the page - * size encoding from the PMD. In the future, we'll be able to deal - * with 1T segments too by getting the encoding from the PGD instead - */ - ld r9,PACAPGDIR(r13) - cmpldi cr0,r9,0 - beqlr - rlwinm r11,r10,8,25,28 - ldx r9,r9,r11 /* get pgd_t */ - cmpldi cr0,r9,0 - beqlr - rlwinm r11,r10,3,17,28 - ldx r9,r9,r11 /* get pmd_t */ - cmpldi cr0,r9,0 - beqlr - - /* build vsid flags */ - andi. 
r11,r9,SLB_VSID_LLP - ori r11,r11,SLB_VSID_USER - - /* get context to calculate proto-VSID */ - ld r9,PACACONTEXTID(r13) - /* fall through slb_finish_load */ - -#endif /* __DISABLED__ */ - - -/* - * Finish loading of an SLB entry and return - * - * r3 = EA, r9 = context, r10 = ESID, r11 = flags, clobbers r9, cr7 = <> PAGE_OFFSET - */ -slb_finish_load: - rldimi r10,r9,ESID_BITS,0 - ASM_VSID_SCRAMBLE(r10,r9,256M) - /* - * bits above VSID_BITS_256M need to be ignored from r10 - * also combine VSID and flags - */ - rldimi r11,r10,SLB_VSID_SHIFT,(64 - (SLB_VSID_SHIFT + VSID_BITS_256M)) - - /* r3 = EA, r11 = VSID data */ - /* - * Find a slot, round robin. Previously we tried to find a - * free slot first but that took too long. Unfortunately we - * dont have any LRU information to help us choose a slot. - */ - -7: ld r10,PACASTABRR(r13) - addi r10,r10,1 - /* This gets soft patched on boot. */ -_GLOBAL(slb_compare_rr_to_size) - cmpldi r10,0 - - blt+ 4f - li r10,SLB_NUM_BOLTED - -4: - std r10,PACASTABRR(r13) - -3: - rldimi r3,r10,0,36 /* r3= EA[0:35] | entry */ - oris r10,r3,SLB_ESID_V@h /* r3 |= SLB_ESID_V */ - - /* r3 = ESID data, r11 = VSID data */ - - /* - * No need for an isync before or after this slbmte. The exception - * we enter with and the rfid we exit with are context synchronizing. - */ - slbmte r11,r10 - - /* we're done for kernel addresses */ - crclr 4*cr0+eq /* set result to "success" */ - bgelr cr7 - - /* Update the slb cache */ - lhz r3,PACASLBCACHEPTR(r13) /* offset = paca->slb_cache_ptr */ - cmpldi r3,SLB_CACHE_ENTRIES - bge 1f - - /* still room in the slb cache */ - sldi r11,r3,2 /* r11 = offset * sizeof(u32) */ - srdi r10,r10,28 /* get the 36 bits of the ESID */ - add r11,r11,r13 /* r11 = (u32 *)paca + offset */ - stw r10,PACASLBCACHE(r11) /* paca->slb_cache[offset] = esid */ - addi r3,r3,1 /* offset++ */ - b 2f -1: /* offset >= SLB_CACHE_ENTRIES */ - li r3,SLB_CACHE_ENTRIES+1 -2: - sth r3,PACASLBCACHEPTR(r13) /* paca->slb_cache_ptr = offset */ - crclr 4*cr0+eq /* set result to "success" */ - blr - -/* - * Finish loading of a 1T SLB entry (for the kernel linear mapping) and return. - * - * r3 = EA, r9 = context, r10 = ESID(256MB), r11 = flags, clobbers r9 - */ -slb_finish_load_1T: - srdi r10,r10,(SID_SHIFT_1T - SID_SHIFT) /* get 1T ESID */ - rldimi r10,r9,ESID_BITS_1T,0 - ASM_VSID_SCRAMBLE(r10,r9,1T) - /* - * bits above VSID_BITS_1T need to be ignored from r10 - * also combine VSID and flags - */ - rldimi r11,r10,SLB_VSID_SHIFT_1T,(64 - (SLB_VSID_SHIFT_1T + VSID_BITS_1T)) - li r10,MMU_SEGSIZE_1T - rldimi r11,r10,SLB_VSID_SSIZE_SHIFT,0 /* insert segment size */ - - /* r3 = EA, r11 = VSID data */ - clrrdi r3,r3,SID_SHIFT_1T /* clear out non-ESID bits */ - b 7b - diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c deleted file mode 100644 index 3e99c149271a..000000000000 --- a/arch/powerpc/mm/slice.c +++ /dev/null @@ -1,731 +0,0 @@ -/* - * address space "slices" (meta-segments) support - * - * Copyright (C) 2007 Benjamin Herrenschmidt, IBM Corporation. - * - * Based on hugetlb implementation - * - * Copyright (C) 2003 David Gibson, IBM Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
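The round-robin slot choice in slb_finish_load above (load PACASTABRR, increment, wrap back to the first non-bolted slot once the patched SLB size is reached) can be rendered in C roughly as follows. SLB_NUM_BOLTED = 3 and a 64-entry SLB are assumptions matching this era, and next_slb_slot() is a made-up name:

#include <stdio.h>

#define SLB_NUM_BOLTED	3
#define MMU_SLB_SIZE	64	/* patched at boot on real hardware */

static unsigned int stab_rr = SLB_NUM_BOLTED;	/* paca->stab_rr */

static unsigned int next_slb_slot(void)
{
	unsigned int slot = stab_rr + 1;

	if (slot >= MMU_SLB_SIZE)	/* wrap past the end of the SLB */
		slot = SLB_NUM_BOLTED;
	stab_rr = slot;
	return slot;
}

int main(void)
{
	/* Walks 4, 5, ..., 63, then wraps back to 3: bolted
	 * slots 0..2 are never chosen as victims. */
	for (int i = 0; i < 62; i++)
		printf("%u ", next_slb_slot());
	printf("\n");
	return 0;
}
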
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#undef DEBUG - -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/pagemap.h> -#include <linux/err.h> -#include <linux/spinlock.h> -#include <linux/export.h> -#include <asm/mman.h> -#include <asm/mmu.h> -#include <asm/spu.h> - -/* some sanity checks */ -#if (PGTABLE_RANGE >> 43) > SLICE_MASK_SIZE -#error PGTABLE_RANGE exceeds slice_mask high_slices size -#endif - -static DEFINE_SPINLOCK(slice_convert_lock); - - -#ifdef DEBUG -int _slice_debug = 1; - -static void slice_print_mask(const char *label, struct slice_mask mask) -{ - char *p, buf[16 + 3 + 64 + 1]; - int i; - - if (!_slice_debug) - return; - p = buf; - for (i = 0; i < SLICE_NUM_LOW; i++) - *(p++) = (mask.low_slices & (1 << i)) ? '1' : '0'; - *(p++) = ' '; - *(p++) = '-'; - *(p++) = ' '; - for (i = 0; i < SLICE_NUM_HIGH; i++) - *(p++) = (mask.high_slices & (1ul << i)) ? '1' : '0'; - *(p++) = 0; - - printk(KERN_DEBUG "%s:%s\n", label, buf); -} - -#define slice_dbg(fmt...) do { if (_slice_debug) pr_debug(fmt); } while(0) - -#else - -static void slice_print_mask(const char *label, struct slice_mask mask) {} -#define slice_dbg(fmt...) - -#endif - -static struct slice_mask slice_range_to_mask(unsigned long start, - unsigned long len) -{ - unsigned long end = start + len - 1; - struct slice_mask ret = { 0, 0 }; - - if (start < SLICE_LOW_TOP) { - unsigned long mend = min(end, SLICE_LOW_TOP); - unsigned long mstart = min(start, SLICE_LOW_TOP); - - ret.low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1)) - - (1u << GET_LOW_SLICE_INDEX(mstart)); - } - - if ((start + len) > SLICE_LOW_TOP) - ret.high_slices = (1ul << (GET_HIGH_SLICE_INDEX(end) + 1)) - - (1ul << GET_HIGH_SLICE_INDEX(start)); - - return ret; -} - -static int slice_area_is_free(struct mm_struct *mm, unsigned long addr, - unsigned long len) -{ - struct vm_area_struct *vma; - - if ((mm->task_size - len) < addr) - return 0; - vma = find_vma(mm, addr); - return (!vma || (addr + len) <= vma->vm_start); -} - -static int slice_low_has_vma(struct mm_struct *mm, unsigned long slice) -{ - return !slice_area_is_free(mm, slice << SLICE_LOW_SHIFT, - 1ul << SLICE_LOW_SHIFT); -} - -static int slice_high_has_vma(struct mm_struct *mm, unsigned long slice) -{ - unsigned long start = slice << SLICE_HIGH_SHIFT; - unsigned long end = start + (1ul << SLICE_HIGH_SHIFT); - - /* Hack, so that each addresses is controlled by exactly one - * of the high or low area bitmaps, the first high area starts - * at 4GB, not 0 */ - if (start == 0) - start = SLICE_LOW_TOP; - - return !slice_area_is_free(mm, start, end - start); -} - -static struct slice_mask slice_mask_for_free(struct mm_struct *mm) -{ - struct slice_mask ret = { 0, 0 }; - unsigned long i; - - for (i = 0; i < SLICE_NUM_LOW; i++) - if (!slice_low_has_vma(mm, i)) - ret.low_slices |= 1u << i; - - if (mm->task_size <= SLICE_LOW_TOP) - return ret; - - for (i = 0; i < SLICE_NUM_HIGH; i++) - if (!slice_high_has_vma(mm, i)) - ret.high_slices |= 1ul << i; - - return ret; -} - -static struct slice_mask slice_mask_for_size(struct mm_struct *mm, int psize) -{ - unsigned char 
*hpsizes; - int index, mask_index; - struct slice_mask ret = { 0, 0 }; - unsigned long i; - u64 lpsizes; - - lpsizes = mm->context.low_slices_psize; - for (i = 0; i < SLICE_NUM_LOW; i++) - if (((lpsizes >> (i * 4)) & 0xf) == psize) - ret.low_slices |= 1u << i; - - hpsizes = mm->context.high_slices_psize; - for (i = 0; i < SLICE_NUM_HIGH; i++) { - mask_index = i & 0x1; - index = i >> 1; - if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == psize) - ret.high_slices |= 1ul << i; - } - - return ret; -} - -static int slice_check_fit(struct slice_mask mask, struct slice_mask available) -{ - return (mask.low_slices & available.low_slices) == mask.low_slices && - (mask.high_slices & available.high_slices) == mask.high_slices; -} - -static void slice_flush_segments(void *parm) -{ - struct mm_struct *mm = parm; - unsigned long flags; - - if (mm != current->active_mm) - return; - - /* update the paca copy of the context struct */ - get_paca()->context = current->active_mm->context; - - local_irq_save(flags); - slb_flush_and_rebolt(); - local_irq_restore(flags); -} - -static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psize) -{ - int index, mask_index; - /* Write the new slice psize bits */ - unsigned char *hpsizes; - u64 lpsizes; - unsigned long i, flags; - - slice_dbg("slice_convert(mm=%p, psize=%d)\n", mm, psize); - slice_print_mask(" mask", mask); - - /* We need to use a spinlock here to protect against - * concurrent 64k -> 4k demotion ... - */ - spin_lock_irqsave(&slice_convert_lock, flags); - - lpsizes = mm->context.low_slices_psize; - for (i = 0; i < SLICE_NUM_LOW; i++) - if (mask.low_slices & (1u << i)) - lpsizes = (lpsizes & ~(0xful << (i * 4))) | - (((unsigned long)psize) << (i * 4)); - - /* Assign the value back */ - mm->context.low_slices_psize = lpsizes; - - hpsizes = mm->context.high_slices_psize; - for (i = 0; i < SLICE_NUM_HIGH; i++) { - mask_index = i & 0x1; - index = i >> 1; - if (mask.high_slices & (1ul << i)) - hpsizes[index] = (hpsizes[index] & - ~(0xf << (mask_index * 4))) | - (((unsigned long)psize) << (mask_index * 4)); - } - - slice_dbg(" lsps=%lx, hsps=%lx\n", - mm->context.low_slices_psize, - mm->context.high_slices_psize); - - spin_unlock_irqrestore(&slice_convert_lock, flags); - -#ifdef CONFIG_SPU_BASE - spu_flush_all_slbs(mm); -#endif -} - -/* - * Compute which slice addr is part of; - * set *boundary_addr to the start or end boundary of that slice - * (depending on 'end' parameter); - * return boolean indicating if the slice is marked as available in the - * 'available' slice_mark. - */ -static bool slice_scan_available(unsigned long addr, - struct slice_mask available, - int end, - unsigned long *boundary_addr) -{ - unsigned long slice; - if (addr < SLICE_LOW_TOP) { - slice = GET_LOW_SLICE_INDEX(addr); - *boundary_addr = (slice + end) << SLICE_LOW_SHIFT; - return !!(available.low_slices & (1u << slice)); - } else { - slice = GET_HIGH_SLICE_INDEX(addr); - *boundary_addr = (slice + end) ? 
- ((slice + end) << SLICE_HIGH_SHIFT) : SLICE_LOW_TOP; - return !!(available.high_slices & (1u << slice)); - } -} - -static unsigned long slice_find_area_bottomup(struct mm_struct *mm, - unsigned long len, - struct slice_mask available, - int psize) -{ - int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); - unsigned long addr, found, next_end; - struct vm_unmapped_area_info info; - - info.flags = 0; - info.length = len; - info.align_mask = PAGE_MASK & ((1ul << pshift) - 1); - info.align_offset = 0; - - addr = TASK_UNMAPPED_BASE; - while (addr < TASK_SIZE) { - info.low_limit = addr; - if (!slice_scan_available(addr, available, 1, &addr)) - continue; - - next_slice: - /* - * At this point [info.low_limit; addr) covers - * available slices only and ends at a slice boundary. - * Check if we need to reduce the range, or if we can - * extend it to cover the next available slice. - */ - if (addr >= TASK_SIZE) - addr = TASK_SIZE; - else if (slice_scan_available(addr, available, 1, &next_end)) { - addr = next_end; - goto next_slice; - } - info.high_limit = addr; - - found = vm_unmapped_area(&info); - if (!(found & ~PAGE_MASK)) - return found; - } - - return -ENOMEM; -} - -static unsigned long slice_find_area_topdown(struct mm_struct *mm, - unsigned long len, - struct slice_mask available, - int psize) -{ - int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); - unsigned long addr, found, prev; - struct vm_unmapped_area_info info; - - info.flags = VM_UNMAPPED_AREA_TOPDOWN; - info.length = len; - info.align_mask = PAGE_MASK & ((1ul << pshift) - 1); - info.align_offset = 0; - - addr = mm->mmap_base; - while (addr > PAGE_SIZE) { - info.high_limit = addr; - if (!slice_scan_available(addr - 1, available, 0, &addr)) - continue; - - prev_slice: - /* - * At this point [addr; info.high_limit) covers - * available slices only and starts at a slice boundary. - * Check if we need to reduce the range, or if we can - * extend it to cover the previous available slice. - */ - if (addr < PAGE_SIZE) - addr = PAGE_SIZE; - else if (slice_scan_available(addr - 1, available, 0, &prev)) { - addr = prev; - goto prev_slice; - } - info.low_limit = addr; - - found = vm_unmapped_area(&info); - if (!(found & ~PAGE_MASK)) - return found; - } - - /* - * A failed mmap() very likely causes application failure, - * so fall back to the bottom-up function here. This scenario - * can happen with large stack limits and large mmap() - * allocations. 
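slice_range_to_mask() earlier in this file turns an address range into per-slice bitmasks with the (1 << (last + 1)) - (1 << first) trick. A userspace check for the low (256M) slices only, assuming SLICE_LOW_SHIFT = 28 and SLICE_LOW_TOP = 4GB (so 16 low slices) as in this era; the function name is made up:

#include <stdio.h>

#define SLICE_LOW_SHIFT	28

static unsigned int low_slices_for_range(unsigned long start, unsigned long len)
{
	unsigned long end = start + len - 1;
	unsigned int first = start >> SLICE_LOW_SHIFT;
	unsigned int last = end >> SLICE_LOW_SHIFT;

	/* Sets bits first..last inclusive. */
	return (1u << (last + 1)) - (1u << first);
}

int main(void)
{
	/* A 512M range starting at 256M covers low slices 1 and 2. */
	printf("0x%x\n", low_slices_for_range(0x10000000UL, 0x20000000UL)); /* 0x6 */
	return 0;
}
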
- */ - return slice_find_area_bottomup(mm, len, available, psize); -} - - -static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len, - struct slice_mask mask, int psize, - int topdown) -{ - if (topdown) - return slice_find_area_topdown(mm, len, mask, psize); - else - return slice_find_area_bottomup(mm, len, mask, psize); -} - -#define or_mask(dst, src) do { \ - (dst).low_slices |= (src).low_slices; \ - (dst).high_slices |= (src).high_slices; \ -} while (0) - -#define andnot_mask(dst, src) do { \ - (dst).low_slices &= ~(src).low_slices; \ - (dst).high_slices &= ~(src).high_slices; \ -} while (0) - -#ifdef CONFIG_PPC_64K_PAGES -#define MMU_PAGE_BASE MMU_PAGE_64K -#else -#define MMU_PAGE_BASE MMU_PAGE_4K -#endif - -unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, - unsigned long flags, unsigned int psize, - int topdown) -{ - struct slice_mask mask = {0, 0}; - struct slice_mask good_mask; - struct slice_mask potential_mask = {0,0} /* silence stupid warning */; - struct slice_mask compat_mask = {0, 0}; - int fixed = (flags & MAP_FIXED); - int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); - struct mm_struct *mm = current->mm; - unsigned long newaddr; - - /* Sanity checks */ - BUG_ON(mm->task_size == 0); - - slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize); - slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d\n", - addr, len, flags, topdown); - - if (len > mm->task_size) - return -ENOMEM; - if (len & ((1ul << pshift) - 1)) - return -EINVAL; - if (fixed && (addr & ((1ul << pshift) - 1))) - return -EINVAL; - if (fixed && addr > (mm->task_size - len)) - return -EINVAL; - - /* If hint, make sure it matches our alignment restrictions */ - if (!fixed && addr) { - addr = _ALIGN_UP(addr, 1ul << pshift); - slice_dbg(" aligned addr=%lx\n", addr); - /* Ignore hint if it's too large or overlaps a VMA */ - if (addr > mm->task_size - len || - !slice_area_is_free(mm, addr, len)) - addr = 0; - } - - /* First make up a "good" mask of slices that have the right size - * already - */ - good_mask = slice_mask_for_size(mm, psize); - slice_print_mask(" good_mask", good_mask); - - /* - * Here "good" means slices that are already the right page size, - * "compat" means slices that have a compatible page size (i.e. - * 4k in a 64k pagesize kernel), and "free" means slices without - * any VMAs. - * - * If MAP_FIXED: - * check if fits in good | compat => OK - * check if fits in good | compat | free => convert free - * else bad - * If have hint: - * check if hint fits in good => OK - * check if hint fits in good | free => convert free - * Otherwise: - * search in good, found => OK - * search in good | free, found => convert free - * search in good | compat | free, found => convert free. - */ - -#ifdef CONFIG_PPC_64K_PAGES - /* If we support combo pages, we can allow 64k pages in 4k slices */ - if (psize == MMU_PAGE_64K) { - compat_mask = slice_mask_for_size(mm, MMU_PAGE_4K); - if (fixed) - or_mask(good_mask, compat_mask); - } -#endif - - /* First check hint if it's valid or if we have MAP_FIXED */ - if (addr != 0 || fixed) { - /* Build a mask for the requested range */ - mask = slice_range_to_mask(addr, len); - slice_print_mask(" mask", mask); - - /* Check if we fit in the good mask. 
If we do, we just return, - * nothing else to do - */ - if (slice_check_fit(mask, good_mask)) { - slice_dbg(" fits good !\n"); - return addr; - } - } else { - /* Now let's see if we can find something in the existing - * slices for that size - */ - newaddr = slice_find_area(mm, len, good_mask, psize, topdown); - if (newaddr != -ENOMEM) { - /* Found within the good mask, we don't have to setup, - * we thus return directly - */ - slice_dbg(" found area at 0x%lx\n", newaddr); - return newaddr; - } - } - - /* We don't fit in the good mask, check what other slices are - * empty and thus can be converted - */ - potential_mask = slice_mask_for_free(mm); - or_mask(potential_mask, good_mask); - slice_print_mask(" potential", potential_mask); - - if ((addr != 0 || fixed) && slice_check_fit(mask, potential_mask)) { - slice_dbg(" fits potential !\n"); - goto convert; - } - - /* If we have MAP_FIXED and failed the above steps, then error out */ - if (fixed) - return -EBUSY; - - slice_dbg(" search...\n"); - - /* If we had a hint that didn't work out, see if we can fit - * anywhere in the good area. - */ - if (addr) { - addr = slice_find_area(mm, len, good_mask, psize, topdown); - if (addr != -ENOMEM) { - slice_dbg(" found area at 0x%lx\n", addr); - return addr; - } - } - - /* Now let's see if we can find something in the existing slices - * for that size plus free slices - */ - addr = slice_find_area(mm, len, potential_mask, psize, topdown); - -#ifdef CONFIG_PPC_64K_PAGES - if (addr == -ENOMEM && psize == MMU_PAGE_64K) { - /* retry the search with 4k-page slices included */ - or_mask(potential_mask, compat_mask); - addr = slice_find_area(mm, len, potential_mask, psize, - topdown); - } -#endif - - if (addr == -ENOMEM) - return -ENOMEM; - - mask = slice_range_to_mask(addr, len); - slice_dbg(" found potential area at 0x%lx\n", addr); - slice_print_mask(" mask", mask); - - convert: - andnot_mask(mask, good_mask); - andnot_mask(mask, compat_mask); - if (mask.low_slices || mask.high_slices) { - slice_convert(mm, mask, psize); - if (psize > MMU_PAGE_BASE) - on_each_cpu(slice_flush_segments, mm, 1); - } - return addr; - -} -EXPORT_SYMBOL_GPL(slice_get_unmapped_area); - -unsigned long arch_get_unmapped_area(struct file *filp, - unsigned long addr, - unsigned long len, - unsigned long pgoff, - unsigned long flags) -{ - return slice_get_unmapped_area(addr, len, flags, - current->mm->context.user_psize, 0); -} - -unsigned long arch_get_unmapped_area_topdown(struct file *filp, - const unsigned long addr0, - const unsigned long len, - const unsigned long pgoff, - const unsigned long flags) -{ - return slice_get_unmapped_area(addr0, len, flags, - current->mm->context.user_psize, 1); -} - -unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr) -{ - unsigned char *hpsizes; - int index, mask_index; - - if (addr < SLICE_LOW_TOP) { - u64 lpsizes; - lpsizes = mm->context.low_slices_psize; - index = GET_LOW_SLICE_INDEX(addr); - return (lpsizes >> (index * 4)) & 0xf; - } - hpsizes = mm->context.high_slices_psize; - index = GET_HIGH_SLICE_INDEX(addr); - mask_index = index & 0x1; - return (hpsizes[index >> 1] >> (mask_index * 4)) & 0xf; -} -EXPORT_SYMBOL_GPL(get_slice_psize); - -/* - * This is called by hash_page when it needs to do a lazy conversion of - * an address space from real 64K pages to combo 4K pages (typically - * when hitting a non cacheable mapping on a processor or hypervisor - * that won't allow them for 64K pages). 
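get_slice_psize() above unpacks one 4-bit page-size code per 256M slice from the packed low_slices_psize word. A self-contained sketch of that packing; the helper names and the psize code 2 are made up, since the real MMU_PAGE_* values are config-dependent:

#include <stdio.h>
#include <stdint.h>

/* One 4-bit page-size code per 256M slice, 16 slices in a u64. */
static unsigned int low_psize(uint64_t lpsizes, unsigned int index)
{
	return (lpsizes >> (index * 4)) & 0xf;
}

static uint64_t set_low_psize(uint64_t lpsizes, unsigned int index,
			      unsigned int psize)
{
	lpsizes &= ~(0xfULL << (index * 4));	/* clear the old nibble */
	return lpsizes | ((uint64_t)psize << (index * 4));
}

int main(void)
{
	uint64_t lpsizes = 0;

	lpsizes = set_low_psize(lpsizes, 0, 2);	/* made-up psize code */
	lpsizes = set_low_psize(lpsizes, 3, 2);
	printf("slice 0 -> %u, slice 1 -> %u, slice 3 -> %u\n",
	       low_psize(lpsizes, 0), low_psize(lpsizes, 1),
	       low_psize(lpsizes, 3));	/* prints: 2, 0, 2 */
	return 0;
}
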
- * - * This is also called in init_new_context() to change back the user - * psize from whatever the parent context had it set to - * N.B. This may be called before mm->context.id has been set. - * - * This function will only change the content of the {low,high)_slice_psize - * masks, it will not flush SLBs as this shall be handled lazily by the - * caller. - */ -void slice_set_user_psize(struct mm_struct *mm, unsigned int psize) -{ - int index, mask_index; - unsigned char *hpsizes; - unsigned long flags, lpsizes; - unsigned int old_psize; - int i; - - slice_dbg("slice_set_user_psize(mm=%p, psize=%d)\n", mm, psize); - - spin_lock_irqsave(&slice_convert_lock, flags); - - old_psize = mm->context.user_psize; - slice_dbg(" old_psize=%d\n", old_psize); - if (old_psize == psize) - goto bail; - - mm->context.user_psize = psize; - wmb(); - - lpsizes = mm->context.low_slices_psize; - for (i = 0; i < SLICE_NUM_LOW; i++) - if (((lpsizes >> (i * 4)) & 0xf) == old_psize) - lpsizes = (lpsizes & ~(0xful << (i * 4))) | - (((unsigned long)psize) << (i * 4)); - /* Assign the value back */ - mm->context.low_slices_psize = lpsizes; - - hpsizes = mm->context.high_slices_psize; - for (i = 0; i < SLICE_NUM_HIGH; i++) { - mask_index = i & 0x1; - index = i >> 1; - if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == old_psize) - hpsizes[index] = (hpsizes[index] & - ~(0xf << (mask_index * 4))) | - (((unsigned long)psize) << (mask_index * 4)); - } - - - - - slice_dbg(" lsps=%lx, hsps=%lx\n", - mm->context.low_slices_psize, - mm->context.high_slices_psize); - - bail: - spin_unlock_irqrestore(&slice_convert_lock, flags); -} - -void slice_set_psize(struct mm_struct *mm, unsigned long address, - unsigned int psize) -{ - unsigned char *hpsizes; - unsigned long i, flags; - u64 *lpsizes; - - spin_lock_irqsave(&slice_convert_lock, flags); - if (address < SLICE_LOW_TOP) { - i = GET_LOW_SLICE_INDEX(address); - lpsizes = &mm->context.low_slices_psize; - *lpsizes = (*lpsizes & ~(0xful << (i * 4))) | - ((unsigned long) psize << (i * 4)); - } else { - int index, mask_index; - i = GET_HIGH_SLICE_INDEX(address); - hpsizes = mm->context.high_slices_psize; - mask_index = i & 0x1; - index = i >> 1; - hpsizes[index] = (hpsizes[index] & - ~(0xf << (mask_index * 4))) | - (((unsigned long)psize) << (mask_index * 4)); - } - - spin_unlock_irqrestore(&slice_convert_lock, flags); - -#ifdef CONFIG_SPU_BASE - spu_flush_all_slbs(mm); -#endif -} - -void slice_set_range_psize(struct mm_struct *mm, unsigned long start, - unsigned long len, unsigned int psize) -{ - struct slice_mask mask = slice_range_to_mask(start, len); - - slice_convert(mm, mask, psize); -} - -/* - * is_hugepage_only_range() is used by generic code to verify whether - * a normal mmap mapping (non hugetlbfs) is valid on a given area. - * - * until the generic code provides a more generic hook and/or starts - * calling arch get_unmapped_area for MAP_FIXED (which our implementation - * here knows how to deal with), we hijack it to keep standard mappings - * away from us. - * - * because of that generic code limitation, MAP_FIXED mapping cannot - * "convert" back a slice with no VMAs to the standard page size, only - * get_unmapped_area() can. It would be possible to fix it here but I - * prefer working on fixing the generic code instead. - * - * WARNING: This will not work if hugetlbfs isn't enabled since the - * generic code will redefine that function as 0 in that. This is ok - * for now as we only use slices with hugetlbfs enabled. 
This should - * be fixed as the generic code gets fixed. - */ -int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, - unsigned long len) -{ - struct slice_mask mask, available; - unsigned int psize = mm->context.user_psize; - - mask = slice_range_to_mask(addr, len); - available = slice_mask_for_size(mm, psize); -#ifdef CONFIG_PPC_64K_PAGES - /* We need to account for 4k slices too */ - if (psize == MMU_PAGE_64K) { - struct slice_mask compat_mask; - compat_mask = slice_mask_for_size(mm, MMU_PAGE_4K); - or_mask(available, compat_mask); - } -#endif - -#if 0 /* too verbose */ - slice_dbg("is_hugepage_only_range(mm=%p, addr=%lx, len=%lx)\n", - mm, addr, len); - slice_print_mask(" mask", mask); - slice_print_mask(" available", available); -#endif - return !slice_check_fit(mask, available); -} - diff --git a/arch/powerpc/mm/stab.c b/arch/powerpc/mm/stab.c deleted file mode 100644 index 3f8efa6f2997..000000000000 --- a/arch/powerpc/mm/stab.c +++ /dev/null @@ -1,286 +0,0 @@ -/* - * PowerPC64 Segment Translation Support. - * - * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com - * Copyright (c) 2001 Dave Engebretsen - * - * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <linux/memblock.h> - -#include <asm/pgtable.h> -#include <asm/mmu.h> -#include <asm/mmu_context.h> -#include <asm/paca.h> -#include <asm/cputable.h> -#include <asm/prom.h> - -struct stab_entry { - unsigned long esid_data; - unsigned long vsid_data; -}; - -#define NR_STAB_CACHE_ENTRIES 8 -static DEFINE_PER_CPU(long, stab_cache_ptr); -static DEFINE_PER_CPU(long [NR_STAB_CACHE_ENTRIES], stab_cache); - -/* - * Create a segment table entry for the given esid/vsid pair. - */ -static int make_ste(unsigned long stab, unsigned long esid, unsigned long vsid) -{ - unsigned long esid_data, vsid_data; - unsigned long entry, group, old_esid, castout_entry, i; - unsigned int global_entry; - struct stab_entry *ste, *castout_ste; - unsigned long kernel_segment = (esid << SID_SHIFT) >= PAGE_OFFSET; - - vsid_data = vsid << STE_VSID_SHIFT; - esid_data = esid << SID_SHIFT | STE_ESID_KP | STE_ESID_V; - if (! kernel_segment) - esid_data |= STE_ESID_KS; - - /* Search the primary group first. */ - global_entry = (esid & 0x1f) << 3; - ste = (struct stab_entry *)(stab | ((esid & 0x1f) << 7)); - - /* Find an empty entry, if one exists. */ - for (group = 0; group < 2; group++) { - for (entry = 0; entry < 8; entry++, ste++) { - if (!(ste->esid_data & STE_ESID_V)) { - ste->vsid_data = vsid_data; - eieio(); - ste->esid_data = esid_data; - return (global_entry | entry); - } - } - /* Now search the secondary group. */ - global_entry = ((~esid) & 0x1f) << 3; - ste = (struct stab_entry *)(stab | (((~esid) & 0x1f) << 7)); - } - - /* - * Could not find empty entry, pick one with a round robin selection. - * Search all entries in the two groups. 
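make_ste() above hashes an ESID into one of 32 primary groups of eight 16-byte entries, falling back to the group selected by the complemented ESID. A userspace sketch of just the offset arithmetic (function names are made up; struct stab_entry is two unsigned longs, hence the << 7 for a 128-byte group):

#include <stdio.h>

static unsigned long primary_group_offset(unsigned long esid)
{
	return (esid & 0x1f) << 7;	/* 32 groups x 8 entries x 16 bytes */
}

static unsigned long secondary_group_offset(unsigned long esid)
{
	return (~esid & 0x1f) << 7;
}

int main(void)
{
	unsigned long esid = 0xc000000000000000UL >> 28;	/* ESID of PAGE_OFFSET */

	printf("primary at stab + 0x%lx, secondary at stab + 0x%lx\n",
	       primary_group_offset(esid), secondary_group_offset(esid));
	/* low 5 ESID bits are 0: primary group 0 (offset 0x0),
	 * secondary group 31 (offset 0xf80) */
	return 0;
}
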
- */ - castout_entry = get_paca()->stab_rr; - for (i = 0; i < 16; i++) { - if (castout_entry < 8) { - global_entry = (esid & 0x1f) << 3; - ste = (struct stab_entry *)(stab | ((esid & 0x1f) << 7)); - castout_ste = ste + castout_entry; - } else { - global_entry = ((~esid) & 0x1f) << 3; - ste = (struct stab_entry *)(stab | (((~esid) & 0x1f) << 7)); - castout_ste = ste + (castout_entry - 8); - } - - /* Dont cast out the first kernel segment */ - if ((castout_ste->esid_data & ESID_MASK) != PAGE_OFFSET) - break; - - castout_entry = (castout_entry + 1) & 0xf; - } - - get_paca()->stab_rr = (castout_entry + 1) & 0xf; - - /* Modify the old entry to the new value. */ - - /* Force previous translations to complete. DRENG */ - asm volatile("isync" : : : "memory"); - - old_esid = castout_ste->esid_data >> SID_SHIFT; - castout_ste->esid_data = 0; /* Invalidate old entry */ - - asm volatile("sync" : : : "memory"); /* Order update */ - - castout_ste->vsid_data = vsid_data; - eieio(); /* Order update */ - castout_ste->esid_data = esid_data; - - asm volatile("slbie %0" : : "r" (old_esid << SID_SHIFT)); - /* Ensure completion of slbie */ - asm volatile("sync" : : : "memory"); - - return (global_entry | (castout_entry & 0x7)); -} - -/* - * Allocate a segment table entry for the given ea and mm - */ -static int __ste_allocate(unsigned long ea, struct mm_struct *mm) -{ - unsigned long vsid; - unsigned char stab_entry; - unsigned long offset; - - /* Kernel or user address? */ - if (is_kernel_addr(ea)) { - vsid = get_kernel_vsid(ea, MMU_SEGSIZE_256M); - } else { - if ((ea >= TASK_SIZE_USER64) || (! mm)) - return 1; - - vsid = get_vsid(mm->context.id, ea, MMU_SEGSIZE_256M); - } - - stab_entry = make_ste(get_paca()->stab_addr, GET_ESID(ea), vsid); - - if (!is_kernel_addr(ea)) { - offset = __get_cpu_var(stab_cache_ptr); - if (offset < NR_STAB_CACHE_ENTRIES) - __get_cpu_var(stab_cache[offset++]) = stab_entry; - else - offset = NR_STAB_CACHE_ENTRIES+1; - __get_cpu_var(stab_cache_ptr) = offset; - - /* Order update */ - asm volatile("sync":::"memory"); - } - - return 0; -} - -int ste_allocate(unsigned long ea) -{ - return __ste_allocate(ea, current->mm); -} - -/* - * Do the segment table work for a context switch: flush all user - * entries from the table, then preload some probably useful entries - * for the new task - */ -void switch_stab(struct task_struct *tsk, struct mm_struct *mm) -{ - struct stab_entry *stab = (struct stab_entry *) get_paca()->stab_addr; - struct stab_entry *ste; - unsigned long offset; - unsigned long pc = KSTK_EIP(tsk); - unsigned long stack = KSTK_ESP(tsk); - unsigned long unmapped_base; - - /* Force previous translations to complete. DRENG */ - asm volatile("isync" : : : "memory"); - - /* - * We need interrupts hard-disabled here, not just soft-disabled, - * so that a PMU interrupt can't occur, which might try to access - * user memory (to get a stack trace) and possible cause an STAB miss - * which would update the stab_cache/stab_cache_ptr per-cpu variables. - */ - hard_irq_disable(); - - offset = __get_cpu_var(stab_cache_ptr); - if (offset <= NR_STAB_CACHE_ENTRIES) { - int i; - - for (i = 0; i < offset; i++) { - ste = stab + __get_cpu_var(stab_cache[i]); - ste->esid_data = 0; /* invalidate entry */ - } - } else { - unsigned long entry; - - /* Invalidate all entries. */ - ste = stab; - - /* Never flush the first entry. 
*/ - ste += 1; - for (entry = 1; - entry < (HW_PAGE_SIZE / sizeof(struct stab_entry)); - entry++, ste++) { - unsigned long ea; - ea = ste->esid_data & ESID_MASK; - if (!is_kernel_addr(ea)) { - ste->esid_data = 0; - } - } - } - - asm volatile("sync; slbia; sync":::"memory"); - - __get_cpu_var(stab_cache_ptr) = 0; - - /* Now preload some entries for the new task */ - if (test_tsk_thread_flag(tsk, TIF_32BIT)) - unmapped_base = TASK_UNMAPPED_BASE_USER32; - else - unmapped_base = TASK_UNMAPPED_BASE_USER64; - - __ste_allocate(pc, mm); - - if (GET_ESID(pc) == GET_ESID(stack)) - return; - - __ste_allocate(stack, mm); - - if ((GET_ESID(pc) == GET_ESID(unmapped_base)) - || (GET_ESID(stack) == GET_ESID(unmapped_base))) - return; - - __ste_allocate(unmapped_base, mm); - - /* Order update */ - asm volatile("sync" : : : "memory"); -} - -/* - * Allocate segment tables for secondary CPUs. These must all go in - * the first (bolted) segment, so that do_stab_bolted won't get a - * recursive segment miss on the segment table itself. - */ -void __init stabs_alloc(void) -{ - int cpu; - - if (mmu_has_feature(MMU_FTR_SLB)) - return; - - for_each_possible_cpu(cpu) { - unsigned long newstab; - - if (cpu == 0) - continue; /* stab for CPU 0 is statically allocated */ - - newstab = memblock_alloc_base(HW_PAGE_SIZE, HW_PAGE_SIZE, - 1<<SID_SHIFT); - newstab = (unsigned long)__va(newstab); - - memset((void *)newstab, 0, HW_PAGE_SIZE); - - paca[cpu].stab_addr = newstab; - paca[cpu].stab_real = __pa(newstab); - printk(KERN_INFO "Segment table for CPU %d at 0x%llx " - "virtual, 0x%llx absolute\n", - cpu, paca[cpu].stab_addr, paca[cpu].stab_real); - } -} - -/* - * Build an entry for the base kernel segment and put it into - * the segment table or SLB. All other segment table or SLB - * entries are faulted in. - */ -void stab_initialize(unsigned long stab) -{ - unsigned long vsid = get_kernel_vsid(PAGE_OFFSET, MMU_SEGSIZE_256M); - unsigned long stabreal; - - asm volatile("isync; slbia; isync":::"memory"); - make_ste(stab, GET_ESID(PAGE_OFFSET), vsid); - - /* Order update */ - asm volatile("sync":::"memory"); - - /* Set ASR */ - stabreal = get_paca()->stab_real | 0x1ul; - - mtspr(SPRN_ASR, stabreal); -} diff --git a/arch/powerpc/mm/tlb_hash32.c b/arch/powerpc/mm/tlb_hash32.c deleted file mode 100644 index 558e30cce33e..000000000000 --- a/arch/powerpc/mm/tlb_hash32.c +++ /dev/null @@ -1,184 +0,0 @@ -/* - * This file contains the routines for TLB flushing. - * On machines where the MMU uses a hash table to store virtual to - * physical translations, these routines flush entries from the - * hash table also. - * -- paulus - * - * Derived from arch/ppc/mm/init.c: - * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) - * - * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) - * and Cort Dougan (PReP) (cort@cs.nmt.edu) - * Copyright (C) 1996 Paul Mackerras - * - * Derived from "arch/i386/mm/init.c" - * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- * - */ - -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/init.h> -#include <linux/highmem.h> -#include <linux/pagemap.h> -#include <linux/export.h> - -#include <asm/tlbflush.h> -#include <asm/tlb.h> - -#include "mmu_decl.h" - -/* - * Called when unmapping pages to flush entries from the TLB/hash table. - */ -void flush_hash_entry(struct mm_struct *mm, pte_t *ptep, unsigned long addr) -{ - unsigned long ptephys; - - if (Hash != 0) { - ptephys = __pa(ptep) & PAGE_MASK; - flush_hash_pages(mm->context.id, addr, ptephys, 1); - } -} -EXPORT_SYMBOL(flush_hash_entry); - -/* - * Called by ptep_set_access_flags, must flush on CPUs for which the - * DSI handler can't just "fixup" the TLB on a write fault - */ -void flush_tlb_page_nohash(struct vm_area_struct *vma, unsigned long addr) -{ - if (Hash != 0) - return; - _tlbie(addr); -} - -/* - * Called at the end of a mmu_gather operation to make sure the - * TLB flush is completely done. - */ -void tlb_flush(struct mmu_gather *tlb) -{ - if (Hash == 0) { - /* - * 603 needs to flush the whole TLB here since - * it doesn't use a hash table. - */ - _tlbia(); - } -} - -/* - * TLB flushing: - * - * - flush_tlb_mm(mm) flushes the specified mm context TLB's - * - flush_tlb_page(vma, vmaddr) flushes one page - * - flush_tlb_range(vma, start, end) flushes a range of pages - * - flush_tlb_kernel_range(start, end) flushes kernel pages - * - * since the hardware hash table functions as an extension of the - * tlb as far as the linux tables are concerned, flush it too. - * -- Cort - */ - -static void flush_range(struct mm_struct *mm, unsigned long start, - unsigned long end) -{ - pmd_t *pmd; - unsigned long pmd_end; - int count; - unsigned int ctx = mm->context.id; - - if (Hash == 0) { - _tlbia(); - return; - } - start &= PAGE_MASK; - if (start >= end) - return; - end = (end - 1) | ~PAGE_MASK; - pmd = pmd_offset(pud_offset(pgd_offset(mm, start), start), start); - for (;;) { - pmd_end = ((start + PGDIR_SIZE) & PGDIR_MASK) - 1; - if (pmd_end > end) - pmd_end = end; - if (!pmd_none(*pmd)) { - count = ((pmd_end - start) >> PAGE_SHIFT) + 1; - flush_hash_pages(ctx, start, pmd_val(*pmd), count); - } - if (pmd_end == end) - break; - start = pmd_end + 1; - ++pmd; - } -} - -/* - * Flush kernel TLB entries in the given range - */ -void flush_tlb_kernel_range(unsigned long start, unsigned long end) -{ - flush_range(&init_mm, start, end); -} -EXPORT_SYMBOL(flush_tlb_kernel_range); - -/* - * Flush all the (user) entries for the address space described by mm. - */ -void flush_tlb_mm(struct mm_struct *mm) -{ - struct vm_area_struct *mp; - - if (Hash == 0) { - _tlbia(); - return; - } - - /* - * It is safe to go down the mm's list of vmas when called - * from dup_mmap, holding mmap_sem. It would also be safe from - * unmap_region or exit_mmap, but not from vmtruncate on SMP - - * but it seems dup_mmap is the only SMP case which gets here. - */ - for (mp = mm->mmap; mp != NULL; mp = mp->vm_next) - flush_range(mp->vm_mm, mp->vm_start, mp->vm_end); -} -EXPORT_SYMBOL(flush_tlb_mm); - -void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) -{ - struct mm_struct *mm; - pmd_t *pmd; - - if (Hash == 0) { - _tlbie(vmaddr); - return; - } - mm = (vmaddr < TASK_SIZE)? 
vma->vm_mm: &init_mm; - pmd = pmd_offset(pud_offset(pgd_offset(mm, vmaddr), vmaddr), vmaddr); - if (!pmd_none(*pmd)) - flush_hash_pages(mm->context.id, vmaddr, pmd_val(*pmd), 1); -} -EXPORT_SYMBOL(flush_tlb_page); - -/* - * For each address in the range, find the pte for the address - * and check _PAGE_HASHPTE bit; if it is set, find and destroy - * the corresponding HPTE. - */ -void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end) -{ - flush_range(vma->vm_mm, start, end); -} -EXPORT_SYMBOL(flush_tlb_range); - -void __init early_init_mmu(void) -{ -} diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S deleted file mode 100644 index b4113bf86353..000000000000 --- a/arch/powerpc/mm/tlb_low_64e.S +++ /dev/null @@ -1,980 +0,0 @@ -/* - * Low level TLB miss handlers for Book3E - * - * Copyright (C) 2008-2009 - * Ben. Herrenschmidt (benh@kernel.crashing.org), IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <asm/processor.h> -#include <asm/reg.h> -#include <asm/page.h> -#include <asm/mmu.h> -#include <asm/ppc_asm.h> -#include <asm/asm-offsets.h> -#include <asm/cputable.h> -#include <asm/pgtable.h> -#include <asm/exception-64e.h> -#include <asm/ppc-opcode.h> -#include <asm/kvm_asm.h> -#include <asm/kvm_booke_hv_asm.h> - -#ifdef CONFIG_PPC_64K_PAGES -#define VPTE_PMD_SHIFT (PTE_INDEX_SIZE+1) -#else -#define VPTE_PMD_SHIFT (PTE_INDEX_SIZE) -#endif -#define VPTE_PUD_SHIFT (VPTE_PMD_SHIFT + PMD_INDEX_SIZE) -#define VPTE_PGD_SHIFT (VPTE_PUD_SHIFT + PUD_INDEX_SIZE) -#define VPTE_INDEX_SIZE (VPTE_PGD_SHIFT + PGD_INDEX_SIZE) - -/********************************************************************** - * * - * TLB miss handling for Book3E with a bolted linear mapping * - * No virtual page table, no nested TLB misses * - * * - **********************************************************************/ - -.macro tlb_prolog_bolted intnum addr - mtspr SPRN_SPRG_GEN_SCRATCH,r13 - mfspr r13,SPRN_SPRG_PACA - std r10,PACA_EXTLB+EX_TLB_R10(r13) - mfcr r10 - std r11,PACA_EXTLB+EX_TLB_R11(r13) -#ifdef CONFIG_KVM_BOOKE_HV -BEGIN_FTR_SECTION - mfspr r11, SPRN_SRR1 -END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) -#endif - DO_KVM \intnum, SPRN_SRR1 - std r16,PACA_EXTLB+EX_TLB_R16(r13) - mfspr r16,\addr /* get faulting address */ - std r14,PACA_EXTLB+EX_TLB_R14(r13) - ld r14,PACAPGD(r13) - std r15,PACA_EXTLB+EX_TLB_R15(r13) - std r10,PACA_EXTLB+EX_TLB_CR(r13) - TLB_MISS_PROLOG_STATS_BOLTED -.endm - -.macro tlb_epilog_bolted - ld r14,PACA_EXTLB+EX_TLB_CR(r13) - ld r10,PACA_EXTLB+EX_TLB_R10(r13) - ld r11,PACA_EXTLB+EX_TLB_R11(r13) - mtcr r14 - ld r14,PACA_EXTLB+EX_TLB_R14(r13) - ld r15,PACA_EXTLB+EX_TLB_R15(r13) - TLB_MISS_RESTORE_STATS_BOLTED - ld r16,PACA_EXTLB+EX_TLB_R16(r13) - mfspr r13,SPRN_SPRG_GEN_SCRATCH -.endm - -/* Data TLB miss */ - START_EXCEPTION(data_tlb_miss_bolted) - tlb_prolog_bolted BOOKE_INTERRUPT_DTLB_MISS SPRN_DEAR - - /* We need _PAGE_PRESENT and _PAGE_ACCESSED set */ - - /* We do the user/kernel test for the PID here along with the RW test - */ - /* We pre-test some combination of permissions to avoid double - * faults: - * - * We move the ESR:ST bit into the position of _PAGE_BAP_SW in the PTE - * ESR_ST is 0x00800000 - * _PAGE_BAP_SW is 0x00000010 - * So the shift is >> 19. This tests for supervisor writeability. 
- * If the page happens to be supervisor writeable and not user - * writeable, we will take a new fault later, but that should be - * a rare enough case. - * - * We also move ESR_ST in _PAGE_DIRTY position - * _PAGE_DIRTY is 0x00001000 so the shift is >> 11 - * - * MAS1 is preset for all we need except for TID that needs to - * be cleared for kernel translations - */ - - mfspr r11,SPRN_ESR - - srdi r15,r16,60 /* get region */ - rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4 - bne- dtlb_miss_fault_bolted /* Bail if fault addr is invalid */ - - rlwinm r10,r11,32-19,27,27 - rlwimi r10,r11,32-16,19,19 - cmpwi r15,0 /* user vs kernel check */ - ori r10,r10,_PAGE_PRESENT - oris r11,r10,_PAGE_ACCESSED@h - - TLB_MISS_STATS_SAVE_INFO_BOLTED - bne tlb_miss_kernel_bolted - -tlb_miss_common_bolted: -/* - * This is the guts of the TLB miss handler for bolted-linear. - * We are entered with: - * - * r16 = faulting address - * r15 = crap (free to use) - * r14 = page table base - * r13 = PACA - * r11 = PTE permission mask - * r10 = crap (free to use) - */ - rldicl r15,r16,64-PGDIR_SHIFT+3,64-PGD_INDEX_SIZE-3 - cmpldi cr0,r14,0 - clrrdi r15,r15,3 - beq tlb_miss_fault_bolted /* No PGDIR, bail */ - -BEGIN_MMU_FTR_SECTION - /* Set the TLB reservation and search for existing entry. Then load - * the entry. - */ - PPC_TLBSRX_DOT(0,R16) - ldx r14,r14,r15 /* grab pgd entry */ - beq normal_tlb_miss_done /* tlb exists already, bail */ -MMU_FTR_SECTION_ELSE - ldx r14,r14,r15 /* grab pgd entry */ -ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV) - -#ifndef CONFIG_PPC_64K_PAGES - rldicl r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3 - clrrdi r15,r15,3 - cmpdi cr0,r14,0 - bge tlb_miss_fault_bolted /* Bad pgd entry or hugepage; bail */ - ldx r14,r14,r15 /* grab pud entry */ -#endif /* CONFIG_PPC_64K_PAGES */ - - rldicl r15,r16,64-PMD_SHIFT+3,64-PMD_INDEX_SIZE-3 - clrrdi r15,r15,3 - cmpdi cr0,r14,0 - bge tlb_miss_fault_bolted - ldx r14,r14,r15 /* Grab pmd entry */ - - rldicl r15,r16,64-PAGE_SHIFT+3,64-PTE_INDEX_SIZE-3 - clrrdi r15,r15,3 - cmpdi cr0,r14,0 - bge tlb_miss_fault_bolted - ldx r14,r14,r15 /* Grab PTE, normal (!huge) page */ - - /* Check if required permissions are met */ - andc. r15,r11,r14 - rldicr r15,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT - bne- tlb_miss_fault_bolted - - /* Now we build the MAS: - * - * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG - * MAS 1 : Almost fully setup - * - PID already updated by caller if necessary - * - TSIZE need change if !base page size, not - * yet implemented for now - * MAS 2 : Defaults not useful, need to be redone - * MAS 3+7 : Needs to be done - */ - clrrdi r11,r16,12 /* Clear low crap in EA */ - clrldi r15,r15,12 /* Clear crap at the top */ - rlwimi r11,r14,32-19,27,31 /* Insert WIMGE */ - rlwimi r15,r14,32-8,22,25 /* Move in U bits */ - mtspr SPRN_MAS2,r11 - andi. r11,r14,_PAGE_DIRTY - rlwimi r15,r14,32-2,26,31 /* Move in BAP bits */ - - /* Mask out SW and UW if !DIRTY (XXX optimize this !) 
*/ - bne 1f - li r11,MAS3_SW|MAS3_UW - andc r15,r15,r11 -1: - mtspr SPRN_MAS7_MAS3,r15 - tlbwe - - TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK) - tlb_epilog_bolted - rfi - -itlb_miss_kernel_bolted: - li r11,_PAGE_PRESENT|_PAGE_BAP_SX /* Base perm */ - oris r11,r11,_PAGE_ACCESSED@h -tlb_miss_kernel_bolted: - mfspr r10,SPRN_MAS1 - ld r14,PACA_KERNELPGD(r13) - cmpldi cr0,r15,8 /* Check for vmalloc region */ - rlwinm r10,r10,0,16,1 /* Clear TID */ - mtspr SPRN_MAS1,r10 - beq+ tlb_miss_common_bolted - -tlb_miss_fault_bolted: - /* We need to check if it was an instruction miss */ - andi. r10,r11,_PAGE_EXEC|_PAGE_BAP_SX - bne itlb_miss_fault_bolted -dtlb_miss_fault_bolted: - TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) - tlb_epilog_bolted - b exc_data_storage_book3e -itlb_miss_fault_bolted: - TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) - tlb_epilog_bolted - b exc_instruction_storage_book3e - -/* Instruction TLB miss */ - START_EXCEPTION(instruction_tlb_miss_bolted) - tlb_prolog_bolted BOOKE_INTERRUPT_ITLB_MISS SPRN_SRR0 - - rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4 - srdi r15,r16,60 /* get region */ - TLB_MISS_STATS_SAVE_INFO_BOLTED - bne- itlb_miss_fault_bolted - - li r11,_PAGE_PRESENT|_PAGE_EXEC /* Base perm */ - - /* We do the user/kernel test for the PID here along with the RW test - */ - - cmpldi cr0,r15,0 /* Check for user region */ - oris r11,r11,_PAGE_ACCESSED@h - beq tlb_miss_common_bolted - b itlb_miss_kernel_bolted - -/********************************************************************** - * * - * TLB miss handling for Book3E with TLB reservation and HES support * - * * - **********************************************************************/ - - -/* Data TLB miss */ - START_EXCEPTION(data_tlb_miss) - TLB_MISS_PROLOG - - /* Now we handle the fault proper. We only save DEAR in normal - * fault case since that's the only interesting values here. - * We could probably also optimize by not saving SRR0/1 in the - * linear mapping case but I'll leave that for later - */ - mfspr r14,SPRN_ESR - mfspr r16,SPRN_DEAR /* get faulting address */ - srdi r15,r16,60 /* get region */ - cmpldi cr0,r15,0xc /* linear mapping ? */ - TLB_MISS_STATS_SAVE_INFO - beq tlb_load_linear /* yes -> go to linear map load */ - - /* The page tables are mapped virtually linear. At this point, though, - * we don't know whether we are trying to fault in a first level - * virtual address or a virtual page table address. We can get that - * from bit 0x1 of the region ID which we have set for a page table - */ - andi. r10,r15,0x1 - bne- virt_page_table_tlb_miss - - std r14,EX_TLB_ESR(r12); /* save ESR */ - std r16,EX_TLB_DEAR(r12); /* save DEAR */ - - /* We need _PAGE_PRESENT and _PAGE_ACCESSED set */ - li r11,_PAGE_PRESENT - oris r11,r11,_PAGE_ACCESSED@h - - /* We do the user/kernel test for the PID here along with the RW test - */ - cmpldi cr0,r15,0 /* Check for user region */ - - /* We pre-test some combination of permissions to avoid double - * faults: - * - * We move the ESR:ST bit into the position of _PAGE_BAP_SW in the PTE - * ESR_ST is 0x00800000 - * _PAGE_BAP_SW is 0x00000010 - * So the shift is >> 19. This tests for supervisor writeability. - * If the page happens to be supervisor writeable and not user - * writeable, we will take a new fault later, but that should be - * a rare enough case. 
- * - * We also move ESR_ST in _PAGE_DIRTY position - * _PAGE_DIRTY is 0x00001000 so the shift is >> 11 - * - * MAS1 is preset for all we need except for TID that needs to - * be cleared for kernel translations - */ - rlwimi r11,r14,32-19,27,27 - rlwimi r11,r14,32-16,19,19 - beq normal_tlb_miss - /* XXX replace the RMW cycles with immediate loads + writes */ -1: mfspr r10,SPRN_MAS1 - cmpldi cr0,r15,8 /* Check for vmalloc region */ - rlwinm r10,r10,0,16,1 /* Clear TID */ - mtspr SPRN_MAS1,r10 - beq+ normal_tlb_miss - - /* We got a crappy address, just fault with whatever DEAR and ESR - * are here - */ - TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) - TLB_MISS_EPILOG_ERROR - b exc_data_storage_book3e - -/* Instruction TLB miss */ - START_EXCEPTION(instruction_tlb_miss) - TLB_MISS_PROLOG - - /* If we take a recursive fault, the second level handler may need - * to know whether we are handling a data or instruction fault in - * order to get to the right store fault handler. We provide that - * info by writing a crazy value in ESR in our exception frame - */ - li r14,-1 /* store to exception frame is done later */ - - /* Now we handle the fault proper. We only save DEAR in the non - * linear mapping case since we know the linear mapping case will - * not re-enter. We could indeed optimize and also not save SRR0/1 - * in the linear mapping case but I'll leave that for later - * - * Faulting address is SRR0 which is already in r16 - */ - srdi r15,r16,60 /* get region */ - cmpldi cr0,r15,0xc /* linear mapping ? */ - TLB_MISS_STATS_SAVE_INFO - beq tlb_load_linear /* yes -> go to linear map load */ - - /* We do the user/kernel test for the PID here along with the RW test - */ - li r11,_PAGE_PRESENT|_PAGE_EXEC /* Base perm */ - oris r11,r11,_PAGE_ACCESSED@h - - cmpldi cr0,r15,0 /* Check for user region */ - std r14,EX_TLB_ESR(r12) /* write crazy -1 to frame */ - beq normal_tlb_miss - - li r11,_PAGE_PRESENT|_PAGE_BAP_SX /* Base perm */ - oris r11,r11,_PAGE_ACCESSED@h - /* XXX replace the RMW cycles with immediate loads + writes */ - mfspr r10,SPRN_MAS1 - cmpldi cr0,r15,8 /* Check for vmalloc region */ - rlwinm r10,r10,0,16,1 /* Clear TID */ - mtspr SPRN_MAS1,r10 - beq+ normal_tlb_miss - - /* We got a crappy address, just fault */ - TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) - TLB_MISS_EPILOG_ERROR - b exc_instruction_storage_book3e - -/* - * This is the guts of the first-level TLB miss handler for direct - * misses. We are entered with: - * - * r16 = faulting address - * r15 = region ID - * r14 = crap (free to use) - * r13 = PACA - * r12 = TLB exception frame in PACA - * r11 = PTE permission mask - * r10 = crap (free to use) - */ -normal_tlb_miss: - /* So we first construct the page table address. We do that by - * shifting the bottom of the address (not the region ID) by - * PAGE_SHIFT-3, clearing the bottom 3 bits (get a PTE ptr) and - * or'ing the fourth high bit. 
- * - * NOTE: For 64K pages, we do things slightly differently in - * order to handle the weird page table format used by linux - */ - ori r10,r15,0x1 -#ifdef CONFIG_PPC_64K_PAGES - /* For the top bits, 16 bytes per PTE */ - rldicl r14,r16,64-(PAGE_SHIFT-4),PAGE_SHIFT-4+4 - /* Now create the bottom bits as 0 in position 0x8000 and - * the rest calculated for 8 bytes per PTE - */ - rldicl r15,r16,64-(PAGE_SHIFT-3),64-15 - /* Insert the bottom bits in */ - rlwimi r14,r15,0,16,31 -#else - rldicl r14,r16,64-(PAGE_SHIFT-3),PAGE_SHIFT-3+4 -#endif - sldi r15,r10,60 - clrrdi r14,r14,3 - or r10,r15,r14 - -BEGIN_MMU_FTR_SECTION - /* Set the TLB reservation and search for existing entry. Then load - * the entry. - */ - PPC_TLBSRX_DOT(0,R16) - ld r14,0(r10) - beq normal_tlb_miss_done -MMU_FTR_SECTION_ELSE - ld r14,0(r10) -ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV) - -finish_normal_tlb_miss: - /* Check if required permissions are met */ - andc. r15,r11,r14 - bne- normal_tlb_miss_access_fault - - /* Now we build the MAS: - * - * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG - * MAS 1 : Almost fully setup - * - PID already updated by caller if necessary - * - TSIZE need change if !base page size, not - * yet implemented for now - * MAS 2 : Defaults not useful, need to be redone - * MAS 3+7 : Needs to be done - * - * TODO: mix up code below for better scheduling - */ - clrrdi r11,r16,12 /* Clear low crap in EA */ - rlwimi r11,r14,32-19,27,31 /* Insert WIMGE */ - mtspr SPRN_MAS2,r11 - - /* Check page size, if not standard, update MAS1 */ - rldicl r11,r14,64-8,64-8 -#ifdef CONFIG_PPC_64K_PAGES - cmpldi cr0,r11,BOOK3E_PAGESZ_64K -#else - cmpldi cr0,r11,BOOK3E_PAGESZ_4K -#endif - beq- 1f - mfspr r11,SPRN_MAS1 - rlwimi r11,r14,31,21,24 - rlwinm r11,r11,0,21,19 - mtspr SPRN_MAS1,r11 -1: - /* Move RPN in position */ - rldicr r11,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT - clrldi r15,r11,12 /* Clear crap at the top */ - rlwimi r15,r14,32-8,22,25 /* Move in U bits */ - rlwimi r15,r14,32-2,26,31 /* Move in BAP bits */ - - /* Mask out SW and UW if !DIRTY (XXX optimize this !) */ - andi. r11,r14,_PAGE_DIRTY - bne 1f - li r11,MAS3_SW|MAS3_UW - andc r15,r15,r11 -1: -BEGIN_MMU_FTR_SECTION - srdi r16,r15,32 - mtspr SPRN_MAS3,r15 - mtspr SPRN_MAS7,r16 -MMU_FTR_SECTION_ELSE - mtspr SPRN_MAS7_MAS3,r15 -ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) - - tlbwe - -normal_tlb_miss_done: - /* We don't bother with restoring DEAR or ESR since we know we are - * level 0 and just going back to userland. They are only needed - * if you are going to take an access fault - */ - TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK) - TLB_MISS_EPILOG_SUCCESS - rfi - -normal_tlb_miss_access_fault: - /* We need to check if it was an instruction miss */ - andi. r10,r11,_PAGE_EXEC - bne 1f - ld r14,EX_TLB_DEAR(r12) - ld r15,EX_TLB_ESR(r12) - mtspr SPRN_DEAR,r14 - mtspr SPRN_ESR,r15 - TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) - TLB_MISS_EPILOG_ERROR - b exc_data_storage_book3e -1: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) - TLB_MISS_EPILOG_ERROR - b exc_instruction_storage_book3e - - -/* - * This is the guts of the second-level TLB miss handler for direct - * misses. 
We are entered with: - * - * r16 = virtual page table faulting address - * r15 = region (top 4 bits of address) - * r14 = crap (free to use) - * r13 = PACA - * r12 = TLB exception frame in PACA - * r11 = crap (free to use) - * r10 = crap (free to use) - * - * Note that this should only ever be called as a second level handler - * with the current scheme when using SW load. - * That means we can always get the original fault DEAR at - * EX_TLB_DEAR-EX_TLB_SIZE(r12) - * - * It can be re-entered by the linear mapping miss handler. However, to - * avoid too much complication, it will restart the whole fault at level - * 0 so we don't care too much about clobbers - * - * XXX That code was written back when we couldn't clobber r14. We can now, - * so we could probably optimize things a bit - */ -virt_page_table_tlb_miss: - /* Are we hitting a kernel page table ? */ - andi. r10,r15,0x8 - - /* The cool thing now is that r10 contains 0 for user and 8 for kernel, - * and we happen to have the swapper_pg_dir at offset 8 from the user - * pgdir in the PACA :-). - */ - add r11,r10,r13 - - /* If kernel, we need to clear MAS1 TID */ - beq 1f - /* XXX replace the RMW cycles with immediate loads + writes */ - mfspr r10,SPRN_MAS1 - rlwinm r10,r10,0,16,1 /* Clear TID */ - mtspr SPRN_MAS1,r10 -1: -BEGIN_MMU_FTR_SECTION - /* Search if we already have a TLB entry for that virtual address, and - * if we do, bail out. - */ - PPC_TLBSRX_DOT(0,R16) - beq virt_page_table_tlb_miss_done -END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV) - - /* Now, we need to walk the page tables. First check if we are in - * range. - */ - rldicl. r10,r16,64-(VPTE_INDEX_SIZE+3),VPTE_INDEX_SIZE+3+4 - bne- virt_page_table_tlb_miss_fault - - /* Get the PGD pointer */ - ld r15,PACAPGD(r11) - cmpldi cr0,r15,0 - beq- virt_page_table_tlb_miss_fault - - /* Get to PGD entry */ - rldicl r11,r16,64-VPTE_PGD_SHIFT,64-PGD_INDEX_SIZE-3 - clrrdi r10,r11,3 - ldx r15,r10,r15 - cmpdi cr0,r15,0 - bge virt_page_table_tlb_miss_fault - -#ifndef CONFIG_PPC_64K_PAGES - /* Get to PUD entry */ - rldicl r11,r16,64-VPTE_PUD_SHIFT,64-PUD_INDEX_SIZE-3 - clrrdi r10,r11,3 - ldx r15,r10,r15 - cmpdi cr0,r15,0 - bge virt_page_table_tlb_miss_fault -#endif /* CONFIG_PPC_64K_PAGES */ - - /* Get to PMD entry */ - rldicl r11,r16,64-VPTE_PMD_SHIFT,64-PMD_INDEX_SIZE-3 - clrrdi r10,r11,3 - ldx r15,r10,r15 - cmpdi cr0,r15,0 - bge virt_page_table_tlb_miss_fault - - /* Ok, we're all right, we can now create a kernel translation for - * a 4K or 64K page from r16 -> r15. - */ - /* Now we build the MAS: - * - * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG - * MAS 1 : Almost fully setup - * - PID already updated by caller if necessary - * - TSIZE for now is base page size always - * MAS 2 : Use defaults - * MAS 3+7 : Needs to be done - * - * So we only do MAS 2 and 3 for now... - */ - clrldi r11,r15,4 /* remove region ID from RPN */ - ori r10,r11,1 /* Or-in SR */ - -BEGIN_MMU_FTR_SECTION - srdi r16,r10,32 - mtspr SPRN_MAS3,r10 - mtspr SPRN_MAS7,r16 -MMU_FTR_SECTION_ELSE - mtspr SPRN_MAS7_MAS3,r10 -ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) - - tlbwe - -BEGIN_MMU_FTR_SECTION -virt_page_table_tlb_miss_done: - - /* We have overriden MAS2:EPN but currently our primary TLB miss - * handler will always restore it so that should not be an issue, - * if we ever optimize the primary handler to not write MAS2 on - * some cases, we'll have to restore MAS2:EPN here based on the - * original fault's DEAR. 
If we do that we have to modify the - * ITLB miss handler to also store SRR0 in the exception frame - * as DEAR. - * - * However, one nasty thing we did is we cleared the reservation - * (well, potentially we did). We do a trick here thus if we - * are not a level 0 exception (we interrupted the TLB miss) we - * offset the return address by -4 in order to replay the tlbsrx - * instruction there - */ - subf r10,r13,r12 - cmpldi cr0,r10,PACA_EXTLB+EX_TLB_SIZE - bne- 1f - ld r11,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13) - addi r10,r11,-4 - std r10,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13) -1: -END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV) - /* Return to caller, normal case */ - TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_OK); - TLB_MISS_EPILOG_SUCCESS - rfi - -virt_page_table_tlb_miss_fault: - /* If we fault here, things are a little bit tricky. We need to call - * either data or instruction store fault, and we need to retrieve - * the original fault address and ESR (for data). - * - * The thing is, we know that in normal circumstances, this is - * always called as a second level tlb miss for SW load or as a first - * level TLB miss for HW load, so we should be able to peek at the - * relevant information in the first exception frame in the PACA. - * - * However, we do need to double check that, because we may just hit - * a stray kernel pointer or a userland attack trying to hit those - * areas. If that is the case, we do a data fault. (We can't get here - * from an instruction tlb miss anyway). - * - * Note also that when going to a fault, we must unwind the previous - * level as well. Since we are doing that, we don't need to clear or - * restore the TLB reservation neither. - */ - subf r10,r13,r12 - cmpldi cr0,r10,PACA_EXTLB+EX_TLB_SIZE - bne- virt_page_table_tlb_miss_whacko_fault - - /* We dig the original DEAR and ESR from slot 0 */ - ld r15,EX_TLB_DEAR+PACA_EXTLB(r13) - ld r16,EX_TLB_ESR+PACA_EXTLB(r13) - - /* We check for the "special" ESR value for instruction faults */ - cmpdi cr0,r16,-1 - beq 1f - mtspr SPRN_DEAR,r15 - mtspr SPRN_ESR,r16 - TLB_MISS_STATS_D(MMSTAT_TLB_MISS_PT_FAULT); - TLB_MISS_EPILOG_ERROR - b exc_data_storage_book3e -1: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_PT_FAULT); - TLB_MISS_EPILOG_ERROR - b exc_instruction_storage_book3e - -virt_page_table_tlb_miss_whacko_fault: - /* The linear fault will restart everything so ESR and DEAR will - * not have been clobbered, let's just fault with what we have - */ - TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_FAULT); - TLB_MISS_EPILOG_ERROR - b exc_data_storage_book3e - - -/************************************************************** - * * - * TLB miss handling for Book3E with hw page table support * - * * - **************************************************************/ - - -/* Data TLB miss */ - START_EXCEPTION(data_tlb_miss_htw) - TLB_MISS_PROLOG - - /* Now we handle the fault proper. We only save DEAR in normal - * fault case since that's the only interesting values here. - * We could probably also optimize by not saving SRR0/1 in the - * linear mapping case but I'll leave that for later - */ - mfspr r14,SPRN_ESR - mfspr r16,SPRN_DEAR /* get faulting address */ - srdi r11,r16,60 /* get region */ - cmpldi cr0,r11,0xc /* linear mapping ? 
*/ - TLB_MISS_STATS_SAVE_INFO - beq tlb_load_linear /* yes -> go to linear map load */ - - /* We do the user/kernel test for the PID here along with the RW test - */ - cmpldi cr0,r11,0 /* Check for user region */ - ld r15,PACAPGD(r13) /* Load user pgdir */ - beq htw_tlb_miss - - /* XXX replace the RMW cycles with immediate loads + writes */ -1: mfspr r10,SPRN_MAS1 - cmpldi cr0,r11,8 /* Check for vmalloc region */ - rlwinm r10,r10,0,16,1 /* Clear TID */ - mtspr SPRN_MAS1,r10 - ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */ - beq+ htw_tlb_miss - - /* We got a crappy address, just fault with whatever DEAR and ESR - * are here - */ - TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) - TLB_MISS_EPILOG_ERROR - b exc_data_storage_book3e - -/* Instruction TLB miss */ - START_EXCEPTION(instruction_tlb_miss_htw) - TLB_MISS_PROLOG - - /* If we take a recursive fault, the second level handler may need - * to know whether we are handling a data or instruction fault in - * order to get to the right store fault handler. We provide that - * info by keeping a crazy value for ESR in r14 - */ - li r14,-1 /* store to exception frame is done later */ - - /* Now we handle the fault proper. We only save DEAR in the non - * linear mapping case since we know the linear mapping case will - * not re-enter. We could indeed optimize and also not save SRR0/1 - * in the linear mapping case but I'll leave that for later - * - * Faulting address is SRR0 which is already in r16 - */ - srdi r11,r16,60 /* get region */ - cmpldi cr0,r11,0xc /* linear mapping ? */ - TLB_MISS_STATS_SAVE_INFO - beq tlb_load_linear /* yes -> go to linear map load */ - - /* We do the user/kernel test for the PID here along with the RW test - */ - cmpldi cr0,r11,0 /* Check for user region */ - ld r15,PACAPGD(r13) /* Load user pgdir */ - beq htw_tlb_miss - - /* XXX replace the RMW cycles with immediate loads + writes */ -1: mfspr r10,SPRN_MAS1 - cmpldi cr0,r11,8 /* Check for vmalloc region */ - rlwinm r10,r10,0,16,1 /* Clear TID */ - mtspr SPRN_MAS1,r10 - ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */ - beq+ htw_tlb_miss - - /* We got a crappy address, just fault */ - TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) - TLB_MISS_EPILOG_ERROR - b exc_instruction_storage_book3e - - -/* - * This is the guts of the second-level TLB miss handler for direct - * misses. We are entered with: - * - * r16 = virtual page table faulting address - * r15 = PGD pointer - * r14 = ESR - * r13 = PACA - * r12 = TLB exception frame in PACA - * r11 = crap (free to use) - * r10 = crap (free to use) - * - * It can be re-entered by the linear mapping miss handler. However, to - * avoid too much complication, it will save/restore things for us - */ -htw_tlb_miss: - /* Search if we already have a TLB entry for that virtual address, and - * if we do, bail out. - * - * MAS1:IND should be already set based on MAS4 - */ - PPC_TLBSRX_DOT(0,R16) - beq htw_tlb_miss_done - - /* Now, we need to walk the page tables. First check if we are in - * range. - */ - rldicl. 
r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4 - bne- htw_tlb_miss_fault - - /* Get the PGD pointer */ - cmpldi cr0,r15,0 - beq- htw_tlb_miss_fault - - /* Get to PGD entry */ - rldicl r11,r16,64-(PGDIR_SHIFT-3),64-PGD_INDEX_SIZE-3 - clrrdi r10,r11,3 - ldx r15,r10,r15 - cmpdi cr0,r15,0 - bge htw_tlb_miss_fault - -#ifndef CONFIG_PPC_64K_PAGES - /* Get to PUD entry */ - rldicl r11,r16,64-(PUD_SHIFT-3),64-PUD_INDEX_SIZE-3 - clrrdi r10,r11,3 - ldx r15,r10,r15 - cmpdi cr0,r15,0 - bge htw_tlb_miss_fault -#endif /* CONFIG_PPC_64K_PAGES */ - - /* Get to PMD entry */ - rldicl r11,r16,64-(PMD_SHIFT-3),64-PMD_INDEX_SIZE-3 - clrrdi r10,r11,3 - ldx r15,r10,r15 - cmpdi cr0,r15,0 - bge htw_tlb_miss_fault - - /* Ok, we're all right, we can now create an indirect entry for - * a 1M or 256M page. - * - * The last trick is now that because we use "half" pages for - * the HTW (1M IND is 2K and 256M IND is 32K) we need to account - * for an added LSB bit to the RPN. For 64K pages, there is no - * problem as we already use 32K arrays (half PTE pages), but for - * 4K page we need to extract a bit from the virtual address and - * insert it into the "PA52" bit of the RPN. - */ -#ifndef CONFIG_PPC_64K_PAGES - rlwimi r15,r16,32-9,20,20 -#endif - /* Now we build the MAS: - * - * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG - * MAS 1 : Almost fully setup - * - PID already updated by caller if necessary - * - TSIZE for now is base ind page size always - * MAS 2 : Use defaults - * MAS 3+7 : Needs to be done - */ -#ifdef CONFIG_PPC_64K_PAGES - ori r10,r15,(BOOK3E_PAGESZ_64K << MAS3_SPSIZE_SHIFT) -#else - ori r10,r15,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT) -#endif - -BEGIN_MMU_FTR_SECTION - srdi r16,r10,32 - mtspr SPRN_MAS3,r10 - mtspr SPRN_MAS7,r16 -MMU_FTR_SECTION_ELSE - mtspr SPRN_MAS7_MAS3,r10 -ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) - - tlbwe - -htw_tlb_miss_done: - /* We don't bother with restoring DEAR or ESR since we know we are - * level 0 and just going back to userland. They are only needed - * if you are going to take an access fault - */ - TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_OK) - TLB_MISS_EPILOG_SUCCESS - rfi - -htw_tlb_miss_fault: - /* We need to check if it was an instruction miss. We know this - * though because r14 would contain -1 - */ - cmpdi cr0,r14,-1 - beq 1f - mtspr SPRN_DEAR,r16 - mtspr SPRN_ESR,r14 - TLB_MISS_STATS_D(MMSTAT_TLB_MISS_PT_FAULT) - TLB_MISS_EPILOG_ERROR - b exc_data_storage_book3e -1: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_PT_FAULT) - TLB_MISS_EPILOG_ERROR - b exc_instruction_storage_book3e - -/* - * This is the guts of "any" level TLB miss handler for kernel linear - * mapping misses. We are entered with: - * - * - * r16 = faulting address - * r15 = crap (free to use) - * r14 = ESR (data) or -1 (instruction) - * r13 = PACA - * r12 = TLB exception frame in PACA - * r11 = crap (free to use) - * r10 = crap (free to use) - * - * In addition we know that we will not re-enter, so in theory, we could - * use a simpler epilog not restoring SRR0/1 etc.. but we'll do that later. - * - * We also need to be careful about MAS registers here & TLB reservation, - * as we know we'll have clobbered them if we interrupt the main TLB miss - * handlers in which case we probably want to do a full restart at level - * 0 rather than saving / restoring the MAS. - * - * Note: If we care about performance of that core, we can easily shuffle - * a few things around - */ -tlb_load_linear: - /* For now, we assume the linear mapping is contiguous and stops at - * linear_map_top. 
We also assume the size is a multiple of 1G, thus - * we only use 1G pages for now. That might have to be changed in a - * final implementation, especially when dealing with hypervisors - */ - ld r11,PACATOC(r13) - ld r11,linear_map_top@got(r11) - ld r10,0(r11) - cmpld cr0,r10,r16 - bge tlb_load_linear_fault - - /* MAS1 need whole new setup. */ - li r15,(BOOK3E_PAGESZ_1GB<<MAS1_TSIZE_SHIFT) - oris r15,r15,MAS1_VALID@h /* MAS1 needs V and TSIZE */ - mtspr SPRN_MAS1,r15 - - /* Already somebody there ? */ - PPC_TLBSRX_DOT(0,R16) - beq tlb_load_linear_done - - /* Now we build the remaining MAS. MAS0 and 2 should be fine - * with their defaults, which leaves us with MAS 3 and 7. The - * mapping is linear, so we just take the address, clear the - * region bits, and or in the permission bits which are currently - * hard wired - */ - clrrdi r10,r16,30 /* 1G page index */ - clrldi r10,r10,4 /* clear region bits */ - ori r10,r10,MAS3_SR|MAS3_SW|MAS3_SX - -BEGIN_MMU_FTR_SECTION - srdi r16,r10,32 - mtspr SPRN_MAS3,r10 - mtspr SPRN_MAS7,r16 -MMU_FTR_SECTION_ELSE - mtspr SPRN_MAS7_MAS3,r10 -ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) - - tlbwe - -tlb_load_linear_done: - /* We use the "error" epilog for success as we do want to - * restore to the initial faulting context, whatever it was. - * We do that because we can't resume a fault within a TLB - * miss handler, due to MAS and TLB reservation being clobbered. - */ - TLB_MISS_STATS_X(MMSTAT_TLB_MISS_LINEAR) - TLB_MISS_EPILOG_ERROR - rfi - -tlb_load_linear_fault: - /* We keep the DEAR and ESR around, this shouldn't have happened */ - cmpdi cr0,r14,-1 - beq 1f - TLB_MISS_EPILOG_ERROR_SPECIAL - b exc_data_storage_book3e -1: TLB_MISS_EPILOG_ERROR_SPECIAL - b exc_instruction_storage_book3e - - -#ifdef CONFIG_BOOK3E_MMU_TLB_STATS -.tlb_stat_inc: -1: ldarx r8,0,r9 - addi r8,r8,1 - stdcx. r8,0,r9 - bne- 1b - blr -#endif diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c deleted file mode 100644 index 41cd68dee681..000000000000 --- a/arch/powerpc/mm/tlb_nohash.c +++ /dev/null @@ -1,692 +0,0 @@ -/* - * This file contains the routines for TLB flushing. - * On machines where the MMU does not use a hash table to store virtual to - * physical translations (ie, SW loaded TLBs or Book3E compilant processors, - * this does -not- include 603 however which shares the implementation with - * hash based processors) - * - * -- BenH - * - * Copyright 2008,2009 Ben Herrenschmidt <benh@kernel.crashing.org> - * IBM Corp. - * - * Derived from arch/ppc/mm/init.c: - * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) - * - * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) - * and Cort Dougan (PReP) (cort@cs.nmt.edu) - * Copyright (C) 1996 Paul Mackerras - * - * Derived from "arch/i386/mm/init.c" - * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- * - */ - -#include <linux/kernel.h> -#include <linux/export.h> -#include <linux/mm.h> -#include <linux/init.h> -#include <linux/highmem.h> -#include <linux/pagemap.h> -#include <linux/preempt.h> -#include <linux/spinlock.h> -#include <linux/memblock.h> -#include <linux/of_fdt.h> -#include <linux/hugetlb.h> - -#include <asm/tlbflush.h> -#include <asm/tlb.h> -#include <asm/code-patching.h> -#include <asm/hugetlb.h> - -#include "mmu_decl.h" - -/* - * This struct lists the sw-supported page sizes. The hardawre MMU may support - * other sizes not listed here. The .ind field is only used on MMUs that have - * indirect page table entries. - */ -#ifdef CONFIG_PPC_BOOK3E_MMU -#ifdef CONFIG_PPC_FSL_BOOK3E -struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { - [MMU_PAGE_4K] = { - .shift = 12, - .enc = BOOK3E_PAGESZ_4K, - }, - [MMU_PAGE_4M] = { - .shift = 22, - .enc = BOOK3E_PAGESZ_4M, - }, - [MMU_PAGE_16M] = { - .shift = 24, - .enc = BOOK3E_PAGESZ_16M, - }, - [MMU_PAGE_64M] = { - .shift = 26, - .enc = BOOK3E_PAGESZ_64M, - }, - [MMU_PAGE_256M] = { - .shift = 28, - .enc = BOOK3E_PAGESZ_256M, - }, - [MMU_PAGE_1G] = { - .shift = 30, - .enc = BOOK3E_PAGESZ_1GB, - }, -}; -#else -struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { - [MMU_PAGE_4K] = { - .shift = 12, - .ind = 20, - .enc = BOOK3E_PAGESZ_4K, - }, - [MMU_PAGE_16K] = { - .shift = 14, - .enc = BOOK3E_PAGESZ_16K, - }, - [MMU_PAGE_64K] = { - .shift = 16, - .ind = 28, - .enc = BOOK3E_PAGESZ_64K, - }, - [MMU_PAGE_1M] = { - .shift = 20, - .enc = BOOK3E_PAGESZ_1M, - }, - [MMU_PAGE_16M] = { - .shift = 24, - .ind = 36, - .enc = BOOK3E_PAGESZ_16M, - }, - [MMU_PAGE_256M] = { - .shift = 28, - .enc = BOOK3E_PAGESZ_256M, - }, - [MMU_PAGE_1G] = { - .shift = 30, - .enc = BOOK3E_PAGESZ_1GB, - }, -}; -#endif /* CONFIG_FSL_BOOKE */ - -static inline int mmu_get_tsize(int psize) -{ - return mmu_psize_defs[psize].enc; -} -#else -static inline int mmu_get_tsize(int psize) -{ - /* This isn't used on !Book3E for now */ - return 0; -} -#endif /* CONFIG_PPC_BOOK3E_MMU */ - -/* The variables below are currently only used on 64-bit Book3E - * though this will probably be made common with other nohash - * implementations at some point - */ -#ifdef CONFIG_PPC64 - -int mmu_linear_psize; /* Page size used for the linear mapping */ -int mmu_pte_psize; /* Page size used for PTE pages */ -int mmu_vmemmap_psize; /* Page size used for the virtual mem map */ -int book3e_htw_enabled; /* Is HW tablewalk enabled ? 
*/ -unsigned long linear_map_top; /* Top of linear mapping */ - -#endif /* CONFIG_PPC64 */ - -#ifdef CONFIG_PPC_FSL_BOOK3E -/* next_tlbcam_idx is used to round-robin tlbcam entry assignment */ -DEFINE_PER_CPU(int, next_tlbcam_idx); -EXPORT_PER_CPU_SYMBOL(next_tlbcam_idx); -#endif - -/* - * Base TLB flushing operations: - * - * - flush_tlb_mm(mm) flushes the specified mm context TLB's - * - flush_tlb_page(vma, vmaddr) flushes one page - * - flush_tlb_range(vma, start, end) flushes a range of pages - * - flush_tlb_kernel_range(start, end) flushes kernel pages - * - * - local_* variants of page and mm only apply to the current - * processor - */ - -/* - * These are the base non-SMP variants of page and mm flushing - */ -void local_flush_tlb_mm(struct mm_struct *mm) -{ - unsigned int pid; - - preempt_disable(); - pid = mm->context.id; - if (pid != MMU_NO_CONTEXT) - _tlbil_pid(pid); - preempt_enable(); -} -EXPORT_SYMBOL(local_flush_tlb_mm); - -void __local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, - int tsize, int ind) -{ - unsigned int pid; - - preempt_disable(); - pid = mm ? mm->context.id : 0; - if (pid != MMU_NO_CONTEXT) - _tlbil_va(vmaddr, pid, tsize, ind); - preempt_enable(); -} - -void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) -{ - __local_flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, - mmu_get_tsize(mmu_virtual_psize), 0); -} -EXPORT_SYMBOL(local_flush_tlb_page); - -/* - * And here are the SMP non-local implementations - */ -#ifdef CONFIG_SMP - -static DEFINE_RAW_SPINLOCK(tlbivax_lock); - -static int mm_is_core_local(struct mm_struct *mm) -{ - return cpumask_subset(mm_cpumask(mm), - topology_thread_cpumask(smp_processor_id())); -} - -struct tlb_flush_param { - unsigned long addr; - unsigned int pid; - unsigned int tsize; - unsigned int ind; -}; - -static void do_flush_tlb_mm_ipi(void *param) -{ - struct tlb_flush_param *p = param; - - _tlbil_pid(p ? p->pid : 0); -} - -static void do_flush_tlb_page_ipi(void *param) -{ - struct tlb_flush_param *p = param; - - _tlbil_va(p->addr, p->pid, p->tsize, p->ind); -} - - -/* Note on invalidations and PID: - * - * We snapshot the PID with preempt disabled. At this point, it can still - * change either because: - * - our context is being stolen (PID -> NO_CONTEXT) on another CPU - * - we are invaliating some target that isn't currently running here - * and is concurrently acquiring a new PID on another CPU - * - some other CPU is re-acquiring a lost PID for this mm - * etc... - * - * However, this shouldn't be a problem as we only guarantee - * invalidation of TLB entries present prior to this call, so we - * don't care about the PID changing, and invalidating a stale PID - * is generally harmless. - */ - -void flush_tlb_mm(struct mm_struct *mm) -{ - unsigned int pid; - - preempt_disable(); - pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) - goto no_context; - if (!mm_is_core_local(mm)) { - struct tlb_flush_param p = { .pid = pid }; - /* Ignores smp_processor_id() even if set. */ - smp_call_function_many(mm_cpumask(mm), - do_flush_tlb_mm_ipi, &p, 1); - } - _tlbil_pid(pid); - no_context: - preempt_enable(); -} -EXPORT_SYMBOL(flush_tlb_mm); - -void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, - int tsize, int ind) -{ - struct cpumask *cpu_mask; - unsigned int pid; - - preempt_disable(); - pid = mm ? 
mm->context.id : 0; - if (unlikely(pid == MMU_NO_CONTEXT)) - goto bail; - cpu_mask = mm_cpumask(mm); - if (!mm_is_core_local(mm)) { - /* If broadcast tlbivax is supported, use it */ - if (mmu_has_feature(MMU_FTR_USE_TLBIVAX_BCAST)) { - int lock = mmu_has_feature(MMU_FTR_LOCK_BCAST_INVAL); - if (lock) - raw_spin_lock(&tlbivax_lock); - _tlbivax_bcast(vmaddr, pid, tsize, ind); - if (lock) - raw_spin_unlock(&tlbivax_lock); - goto bail; - } else { - struct tlb_flush_param p = { - .pid = pid, - .addr = vmaddr, - .tsize = tsize, - .ind = ind, - }; - /* Ignores smp_processor_id() even if set in cpu_mask */ - smp_call_function_many(cpu_mask, - do_flush_tlb_page_ipi, &p, 1); - } - } - _tlbil_va(vmaddr, pid, tsize, ind); - bail: - preempt_enable(); -} - -void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) -{ -#ifdef CONFIG_HUGETLB_PAGE - if (is_vm_hugetlb_page(vma)) - flush_hugetlb_page(vma, vmaddr); -#endif - - __flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, - mmu_get_tsize(mmu_virtual_psize), 0); -} -EXPORT_SYMBOL(flush_tlb_page); - -#endif /* CONFIG_SMP */ - -#ifdef CONFIG_PPC_47x -void __init early_init_mmu_47x(void) -{ -#ifdef CONFIG_SMP - unsigned long root = of_get_flat_dt_root(); - if (of_get_flat_dt_prop(root, "cooperative-partition", NULL)) - mmu_clear_feature(MMU_FTR_USE_TLBIVAX_BCAST); -#endif /* CONFIG_SMP */ -} -#endif /* CONFIG_PPC_47x */ - -/* - * Flush kernel TLB entries in the given range - */ -void flush_tlb_kernel_range(unsigned long start, unsigned long end) -{ -#ifdef CONFIG_SMP - preempt_disable(); - smp_call_function(do_flush_tlb_mm_ipi, NULL, 1); - _tlbil_pid(0); - preempt_enable(); -#else - _tlbil_pid(0); -#endif -} -EXPORT_SYMBOL(flush_tlb_kernel_range); - -/* - * Currently, for range flushing, we just do a full mm flush. 
This should - * be optimized based on a threshold on the size of the range, since - * some implementation can stack multiple tlbivax before a tlbsync but - * for now, we keep it that way - */ -void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end) - -{ - flush_tlb_mm(vma->vm_mm); -} -EXPORT_SYMBOL(flush_tlb_range); - -void tlb_flush(struct mmu_gather *tlb) -{ - flush_tlb_mm(tlb->mm); -} - -/* - * Below are functions specific to the 64-bit variant of Book3E though that - * may change in the future - */ - -#ifdef CONFIG_PPC64 - -/* - * Handling of virtual linear page tables or indirect TLB entries - * flushing when PTE pages are freed - */ -void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address) -{ - int tsize = mmu_psize_defs[mmu_pte_psize].enc; - - if (book3e_htw_enabled) { - unsigned long start = address & PMD_MASK; - unsigned long end = address + PMD_SIZE; - unsigned long size = 1UL << mmu_psize_defs[mmu_pte_psize].shift; - - /* This isn't the most optimal, ideally we would factor out the - * while preempt & CPU mask mucking around, or even the IPI but - * it will do for now - */ - while (start < end) { - __flush_tlb_page(tlb->mm, start, tsize, 1); - start += size; - } - } else { - unsigned long rmask = 0xf000000000000000ul; - unsigned long rid = (address & rmask) | 0x1000000000000000ul; - unsigned long vpte = address & ~rmask; - -#ifdef CONFIG_PPC_64K_PAGES - vpte = (vpte >> (PAGE_SHIFT - 4)) & ~0xfffful; -#else - vpte = (vpte >> (PAGE_SHIFT - 3)) & ~0xffful; -#endif - vpte |= rid; - __flush_tlb_page(tlb->mm, vpte, tsize, 0); - } -} - -static void setup_page_sizes(void) -{ - unsigned int tlb0cfg; - unsigned int tlb0ps; - unsigned int eptcfg; - int i, psize; - -#ifdef CONFIG_PPC_FSL_BOOK3E - unsigned int mmucfg = mfspr(SPRN_MMUCFG); - int fsl_mmu = mmu_has_feature(MMU_FTR_TYPE_FSL_E); - - if (fsl_mmu && (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V1) { - unsigned int tlb1cfg = mfspr(SPRN_TLB1CFG); - unsigned int min_pg, max_pg; - - min_pg = (tlb1cfg & TLBnCFG_MINSIZE) >> TLBnCFG_MINSIZE_SHIFT; - max_pg = (tlb1cfg & TLBnCFG_MAXSIZE) >> TLBnCFG_MAXSIZE_SHIFT; - - for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { - struct mmu_psize_def *def; - unsigned int shift; - - def = &mmu_psize_defs[psize]; - shift = def->shift; - - if (shift == 0) - continue; - - /* adjust to be in terms of 4^shift Kb */ - shift = (shift - 10) >> 1; - - if ((shift >= min_pg) && (shift <= max_pg)) - def->flags |= MMU_PAGE_SIZE_DIRECT; - } - - goto no_indirect; - } - - if (fsl_mmu && (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2) { - u32 tlb1ps = mfspr(SPRN_TLB1PS); - - for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { - struct mmu_psize_def *def = &mmu_psize_defs[psize]; - - if (tlb1ps & (1U << (def->shift - 10))) { - def->flags |= MMU_PAGE_SIZE_DIRECT; - } - } - - goto no_indirect; - } -#endif - - tlb0cfg = mfspr(SPRN_TLB0CFG); - tlb0ps = mfspr(SPRN_TLB0PS); - eptcfg = mfspr(SPRN_EPTCFG); - - /* Look for supported direct sizes */ - for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { - struct mmu_psize_def *def = &mmu_psize_defs[psize]; - - if (tlb0ps & (1U << (def->shift - 10))) - def->flags |= MMU_PAGE_SIZE_DIRECT; - } - - /* Indirect page sizes supported ? */ - if ((tlb0cfg & TLBnCFG_IND) == 0) - goto no_indirect; - - /* Now, we only deal with one IND page size for each - * direct size. Hopefully all implementations today are - * unambiguous, but we might want to be careful in the - * future. 
- */ - for (i = 0; i < 3; i++) { - unsigned int ps, sps; - - sps = eptcfg & 0x1f; - eptcfg >>= 5; - ps = eptcfg & 0x1f; - eptcfg >>= 5; - if (!ps || !sps) - continue; - for (psize = 0; psize < MMU_PAGE_COUNT; psize++) { - struct mmu_psize_def *def = &mmu_psize_defs[psize]; - - if (ps == (def->shift - 10)) - def->flags |= MMU_PAGE_SIZE_INDIRECT; - if (sps == (def->shift - 10)) - def->ind = ps + 10; - } - } - no_indirect: - - /* Cleanup array and print summary */ - pr_info("MMU: Supported page sizes\n"); - for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { - struct mmu_psize_def *def = &mmu_psize_defs[psize]; - const char *__page_type_names[] = { - "unsupported", - "direct", - "indirect", - "direct & indirect" - }; - if (def->flags == 0) { - def->shift = 0; - continue; - } - pr_info(" %8ld KB as %s\n", 1ul << (def->shift - 10), - __page_type_names[def->flags & 0x3]); - } -} - -static void __patch_exception(int exc, unsigned long addr) -{ - extern unsigned int interrupt_base_book3e; - unsigned int *ibase = &interrupt_base_book3e; - - /* Our exceptions vectors start with a NOP and -then- a branch - * to deal with single stepping from userspace which stops on - * the second instruction. Thus we need to patch the second - * instruction of the exception, not the first one - */ - - patch_branch(ibase + (exc / 4) + 1, addr, 0); -} - -#define patch_exception(exc, name) do { \ - extern unsigned int name; \ - __patch_exception((exc), (unsigned long)&name); \ -} while (0) - -static void setup_mmu_htw(void) -{ - /* Check if HW tablewalk is present, and if yes, enable it by: - * - * - patching the TLB miss handlers to branch to the - * one dedicates to it - * - * - setting the global book3e_htw_enabled - */ - unsigned int tlb0cfg = mfspr(SPRN_TLB0CFG); - - if ((tlb0cfg & TLBnCFG_IND) && - (tlb0cfg & TLBnCFG_PT)) { - patch_exception(0x1c0, exc_data_tlb_miss_htw_book3e); - patch_exception(0x1e0, exc_instruction_tlb_miss_htw_book3e); - book3e_htw_enabled = 1; - } - pr_info("MMU: Book3E HW tablewalk %s\n", - book3e_htw_enabled ? "enabled" : "not supported"); -} - -/* - * Early initialization of the MMU TLB code - */ -static void __early_init_mmu(int boot_cpu) -{ - unsigned int mas4; - - /* XXX This will have to be decided at runtime, but right - * now our boot and TLB miss code hard wires it. Ideally - * we should find out a suitable page size and patch the - * TLB miss code (either that or use the PACA to store - * the value we want) - */ - mmu_linear_psize = MMU_PAGE_1G; - - /* XXX This should be decided at runtime based on supported - * page sizes in the TLB, but for now let's assume 16M is - * always there and a good fit (which it probably is) - */ - mmu_vmemmap_psize = MMU_PAGE_16M; - - /* XXX This code only checks for TLB 0 capabilities and doesn't - * check what page size combos are supported by the HW. It - * also doesn't handle the case where a separate array holds - * the IND entries from the array loaded by the PT. 
- */ - if (boot_cpu) { - /* Look for supported page sizes */ - setup_page_sizes(); - - /* Look for HW tablewalk support */ - setup_mmu_htw(); - } - - /* Set MAS4 based on page table setting */ - - mas4 = 0x4 << MAS4_WIMGED_SHIFT; - if (book3e_htw_enabled) { - mas4 |= mas4 | MAS4_INDD; -#ifdef CONFIG_PPC_64K_PAGES - mas4 |= BOOK3E_PAGESZ_256M << MAS4_TSIZED_SHIFT; - mmu_pte_psize = MMU_PAGE_256M; -#else - mas4 |= BOOK3E_PAGESZ_1M << MAS4_TSIZED_SHIFT; - mmu_pte_psize = MMU_PAGE_1M; -#endif - } else { -#ifdef CONFIG_PPC_64K_PAGES - mas4 |= BOOK3E_PAGESZ_64K << MAS4_TSIZED_SHIFT; -#else - mas4 |= BOOK3E_PAGESZ_4K << MAS4_TSIZED_SHIFT; -#endif - mmu_pte_psize = mmu_virtual_psize; - } - mtspr(SPRN_MAS4, mas4); - - /* Set the global containing the top of the linear mapping - * for use by the TLB miss code - */ - linear_map_top = memblock_end_of_DRAM(); - -#ifdef CONFIG_PPC_FSL_BOOK3E - if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { - unsigned int num_cams; - - /* use a quarter of the TLBCAM for bolted linear map */ - num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4; - linear_map_top = map_mem_in_cams(linear_map_top, num_cams); - - /* limit memory so we dont have linear faults */ - memblock_enforce_memory_limit(linear_map_top); - - patch_exception(0x1c0, exc_data_tlb_miss_bolted_book3e); - patch_exception(0x1e0, exc_instruction_tlb_miss_bolted_book3e); - } -#endif - - /* A sync won't hurt us after mucking around with - * the MMU configuration - */ - mb(); - - memblock_set_current_limit(linear_map_top); -} - -void __init early_init_mmu(void) -{ - __early_init_mmu(1); -} - -void early_init_mmu_secondary(void) -{ - __early_init_mmu(0); -} - -void setup_initial_memory_limit(phys_addr_t first_memblock_base, - phys_addr_t first_memblock_size) -{ - /* On non-FSL Embedded 64-bit, we adjust the RMA size to match - * the bolted TLB entry. We know for now that only 1G - * entries are supported though that may eventually - * change. - * - * on FSL Embedded 64-bit, we adjust the RMA size to match the - * first bolted TLB entry size. We still limit max to 1G even if - * the TLB could cover more. This is due to what the early init - * code is setup to do. - * - * We crop it to the size of the first MEMBLOCK to - * avoid going over total available memory just in case... - */ -#ifdef CONFIG_PPC_FSL_BOOK3E - if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { - unsigned long linear_sz; - linear_sz = calc_cam_sz(first_memblock_size, PAGE_OFFSET, - first_memblock_base); - ppc64_rma_size = min_t(u64, linear_sz, 0x40000000); - } else -#endif - ppc64_rma_size = min_t(u64, first_memblock_size, 0x40000000); - - /* Finally limit subsequent allocations */ - memblock_set_current_limit(first_memblock_base + ppc64_rma_size); -} -#else /* ! CONFIG_PPC64 */ -void __init early_init_mmu(void) -{ -#ifdef CONFIG_PPC_47x - early_init_mmu_47x(); -#endif -} -#endif /* CONFIG_PPC64 */ |
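
The page-size probing that disappears with tlb_nohash.c reads a TLBnPS-style bitmap in which bit n means "a direct page size of 1 KB << n is supported", so a size whose shift is s is present exactly when bit (s - 10) is set; the MAV2 branch applies the same test to SPRN_TLB1PS. The sketch below is a host-side illustration of just that decoding step, assuming a made-up register value and a reduced size table rather than anything read from real hardware.

#include <stdio.h>

struct psize_def {
	const char *name;
	unsigned int shift;		/* log2 of the page size in bytes */
};

static const struct psize_def defs[] = {
	{ "4K", 12 }, { "16K", 14 }, { "64K", 16 },
	{ "1M", 20 }, { "16M", 24 }, { "256M", 28 }, { "1G", 30 },
};

int main(void)
{
	/* Hypothetical TLB0PS value: bits 2, 4, 6 and 10 => 4K, 16K, 64K, 1M */
	unsigned int tlb0ps = 0x454;
	size_t i;

	for (i = 0; i < sizeof(defs) / sizeof(defs[0]); i++) {
		unsigned int bit = defs[i].shift - 10;	/* TLBnPS counts in KB */

		if (tlb0ps & (1U << bit))
			printf("direct %s pages supported\n", defs[i].name);
	}
	return 0;
}

With 0x454 this prints the 4K, 16K, 64K and 1M lines; feeding in a real TLB0PS/TLB1PS value would reproduce the "Supported page sizes" summary the removed setup_page_sizes() printed at boot.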
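
The Book3E TLB-miss handlers removed above (tlb_low_64e.S) pre-compute PTE permission bits from ESR by pure bit repositioning: the comments state that ESR_ST (0x00800000) shifted right by 19 lands on _PAGE_BAP_SW (0x00000010) and shifted right by 11 lands on _PAGE_DIRTY (0x00001000). The small standalone check below only verifies that arithmetic; the constants are copied from those comments and nothing beyond them is assumed.

#include <assert.h>

/* Values quoted in the removed tlb_low_64e.S comments */
#define ESR_ST		0x00800000u	/* ESR store bit */
#define PAGE_BAP_SW	0x00000010u	/* _PAGE_BAP_SW  */
#define PAGE_DIRTY	0x00001000u	/* _PAGE_DIRTY   */

int main(void)
{
	/* 2^23 >> 19 == 2^4 and 2^23 >> 11 == 2^12, as the comments claim */
	assert((ESR_ST >> 19) == PAGE_BAP_SW);
	assert((ESR_ST >> 11) == PAGE_DIRTY);
	return 0;
}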
