summaryrefslogtreecommitdiff
path: root/arch/powerpc/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r--arch/powerpc/mm/40x_mmu.c159
-rw-r--r--arch/powerpc/mm/Makefile47
-rw-r--r--arch/powerpc/mm/book3s32/Makefile12
-rw-r--r--arch/powerpc/mm/book3s32/hash_low.S (renamed from arch/powerpc/mm/hash_low_32.S)358
-rw-r--r--arch/powerpc/mm/book3s32/kuap.c22
-rw-r--r--arch/powerpc/mm/book3s32/mmu.c446
-rw-r--r--arch/powerpc/mm/book3s32/mmu_context.c (renamed from arch/powerpc/mm/mmu_context_hash32.c)57
-rw-r--r--arch/powerpc/mm/book3s32/nohash_low.S80
-rw-r--r--arch/powerpc/mm/book3s32/tlb.c116
-rw-r--r--arch/powerpc/mm/book3s64/Makefile33
-rw-r--r--arch/powerpc/mm/book3s64/hash_4k.c129
-rw-r--r--arch/powerpc/mm/book3s64/hash_64k.c343
-rw-r--r--arch/powerpc/mm/book3s64/hash_hugepage.c (renamed from arch/powerpc/mm/hugepage-hash64.c)131
-rw-r--r--arch/powerpc/mm/book3s64/hash_native.c (renamed from arch/powerpc/mm/hash_native_64.c)551
-rw-r--r--arch/powerpc/mm/book3s64/hash_pgtable.c563
-rw-r--r--arch/powerpc/mm/book3s64/hash_tlb.c (renamed from arch/powerpc/mm/tlb_hash64.c)85
-rw-r--r--arch/powerpc/mm/book3s64/hash_utils.c2486
-rw-r--r--arch/powerpc/mm/book3s64/hugetlbpage.c177
-rw-r--r--arch/powerpc/mm/book3s64/internal.h36
-rw-r--r--arch/powerpc/mm/book3s64/iommu_api.c402
-rw-r--r--arch/powerpc/mm/book3s64/mmu_context.c347
-rw-r--r--arch/powerpc/mm/book3s64/pgtable.c665
-rw-r--r--arch/powerpc/mm/book3s64/pkeys.c471
-rw-r--r--arch/powerpc/mm/book3s64/radix_hugetlbpage.c63
-rw-r--r--arch/powerpc/mm/book3s64/radix_pgtable.c1694
-rw-r--r--arch/powerpc/mm/book3s64/radix_tlb.c1587
-rw-r--r--arch/powerpc/mm/book3s64/slb.c795
-rw-r--r--arch/powerpc/mm/book3s64/slice.c819
-rw-r--r--arch/powerpc/mm/book3s64/subpage_prot.c (renamed from arch/powerpc/mm/subpage-prot.c)128
-rw-r--r--arch/powerpc/mm/book3s64/trace.c7
-rw-r--r--arch/powerpc/mm/cacheflush.c221
-rw-r--r--arch/powerpc/mm/copro_fault.c136
-rw-r--r--arch/powerpc/mm/dma-noncoherent.c351
-rw-r--r--arch/powerpc/mm/drmem.c514
-rw-r--r--arch/powerpc/mm/fault.c932
-rw-r--r--arch/powerpc/mm/gup.c216
-rw-r--r--arch/powerpc/mm/hash_low_64.S981
-rw-r--r--arch/powerpc/mm/hash_utils_64.c1436
-rw-r--r--arch/powerpc/mm/highmem.c86
-rw-r--r--arch/powerpc/mm/hugetlbpage-book3e.c122
-rw-r--r--arch/powerpc/mm/hugetlbpage-hash64.c125
-rw-r--r--arch/powerpc/mm/hugetlbpage.c1048
-rw-r--r--arch/powerpc/mm/icswx.c292
-rw-r--r--arch/powerpc/mm/icswx.h68
-rw-r--r--arch/powerpc/mm/icswx_pid.c87
-rw-r--r--arch/powerpc/mm/init-common.c167
-rw-r--r--arch/powerpc/mm/init_32.c113
-rw-r--r--arch/powerpc/mm/init_64.c708
-rw-r--r--arch/powerpc/mm/ioremap.c63
-rw-r--r--arch/powerpc/mm/ioremap_32.c92
-rw-r--r--arch/powerpc/mm/ioremap_64.c57
-rw-r--r--arch/powerpc/mm/kasan/8xx.c78
-rw-r--r--arch/powerpc/mm/kasan/Makefile10
-rw-r--r--arch/powerpc/mm/kasan/book3s_32.c60
-rw-r--r--arch/powerpc/mm/kasan/init_32.c192
-rw-r--r--arch/powerpc/mm/kasan/init_book3e_64.c133
-rw-r--r--arch/powerpc/mm/kasan/init_book3s_64.c100
-rw-r--r--arch/powerpc/mm/maccess.c13
-rw-r--r--arch/powerpc/mm/mem.c682
-rw-r--r--arch/powerpc/mm/mmap.c99
-rw-r--r--arch/powerpc/mm/mmu_context.c117
-rw-r--r--arch/powerpc/mm/mmu_context_hash64.c146
-rw-r--r--arch/powerpc/mm/mmu_decl.h143
-rw-r--r--arch/powerpc/mm/nohash/44x.c (renamed from arch/powerpc/mm/44x_mmu.c)32
-rw-r--r--arch/powerpc/mm/nohash/8xx.c224
-rw-r--r--arch/powerpc/mm/nohash/Makefile16
-rw-r--r--arch/powerpc/mm/nohash/book3e_pgtable.c132
-rw-r--r--arch/powerpc/mm/nohash/e500.c (renamed from arch/powerpc/mm/fsl_booke_mmu.c)215
-rw-r--r--arch/powerpc/mm/nohash/e500_hugetlbpage.c193
-rw-r--r--arch/powerpc/mm/nohash/kaslr_booke.c395
-rw-r--r--arch/powerpc/mm/nohash/kup.c27
-rw-r--r--arch/powerpc/mm/nohash/mmu_context.c (renamed from arch/powerpc/mm/mmu_context_nohash.c)345
-rw-r--r--arch/powerpc/mm/nohash/tlb.c341
-rw-r--r--arch/powerpc/mm/nohash/tlb_64e.c314
-rw-r--r--arch/powerpc/mm/nohash/tlb_low.S (renamed from arch/powerpc/mm/tlb_nohash_low.S)175
-rw-r--r--arch/powerpc/mm/nohash/tlb_low_64e.S743
-rw-r--r--arch/powerpc/mm/numa.c1766
-rw-r--r--arch/powerpc/mm/pageattr.c127
-rw-r--r--arch/powerpc/mm/pgtable-frag.c141
-rw-r--r--arch/powerpc/mm/pgtable.c459
-rw-r--r--arch/powerpc/mm/pgtable_32.c434
-rw-r--r--arch/powerpc/mm/pgtable_64.c883
-rw-r--r--arch/powerpc/mm/ppc_mmu_32.c288
-rw-r--r--arch/powerpc/mm/ptdump/8xx.c94
-rw-r--r--arch/powerpc/mm/ptdump/Makefile14
-rw-r--r--arch/powerpc/mm/ptdump/bats.c99
-rw-r--r--arch/powerpc/mm/ptdump/book3s64.c127
-rw-r--r--arch/powerpc/mm/ptdump/hashpagetable.c549
-rw-r--r--arch/powerpc/mm/ptdump/ptdump.c425
-rw-r--r--arch/powerpc/mm/ptdump/ptdump.h23
-rw-r--r--arch/powerpc/mm/ptdump/segment_regs.c52
-rw-r--r--arch/powerpc/mm/ptdump/shared.c92
-rw-r--r--arch/powerpc/mm/slb.c332
-rw-r--r--arch/powerpc/mm/slb_low.S317
-rw-r--r--arch/powerpc/mm/slice.c731
-rw-r--r--arch/powerpc/mm/stab.c286
-rw-r--r--arch/powerpc/mm/tlb_hash32.c184
-rw-r--r--arch/powerpc/mm/tlb_low_64e.S980
-rw-r--r--arch/powerpc/mm/tlb_nohash.c692
99 files changed, 21442 insertions, 13202 deletions
diff --git a/arch/powerpc/mm/40x_mmu.c b/arch/powerpc/mm/40x_mmu.c
deleted file mode 100644
index 5810967511d4..000000000000
--- a/arch/powerpc/mm/40x_mmu.c
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- * This file contains the routines for initializing the MMU
- * on the 4xx series of chips.
- * -- paulus
- *
- * Derived from arch/ppc/mm/init.c:
- * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
- *
- * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
- * and Cort Dougan (PReP) (cort@cs.nmt.edu)
- * Copyright (C) 1996 Paul Mackerras
- *
- * Derived from "arch/i386/mm/init.c"
- * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- */
-
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/ptrace.h>
-#include <linux/mman.h>
-#include <linux/mm.h>
-#include <linux/swap.h>
-#include <linux/stddef.h>
-#include <linux/vmalloc.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/highmem.h>
-#include <linux/memblock.h>
-
-#include <asm/pgalloc.h>
-#include <asm/prom.h>
-#include <asm/io.h>
-#include <asm/mmu_context.h>
-#include <asm/pgtable.h>
-#include <asm/mmu.h>
-#include <asm/uaccess.h>
-#include <asm/smp.h>
-#include <asm/bootx.h>
-#include <asm/machdep.h>
-#include <asm/setup.h>
-
-#include "mmu_decl.h"
-
-extern int __map_without_ltlbs;
-/*
- * MMU_init_hw does the chip-specific initialization of the MMU hardware.
- */
-void __init MMU_init_hw(void)
-{
- /*
- * The Zone Protection Register (ZPR) defines how protection will
- * be applied to every page which is a member of a given zone. At
- * present, we utilize only two of the 4xx's zones.
- * The zone index bits (of ZSEL) in the PTE are used for software
- * indicators, except the LSB. For user access, zone 1 is used,
- * for kernel access, zone 0 is used. We set all but zone 1
- * to zero, allowing only kernel access as indicated in the PTE.
- * For zone 1, we set a 01 binary (a value of 10 will not work)
- * to allow user access as indicated in the PTE. This also allows
- * kernel access as indicated in the PTE.
- */
-
- mtspr(SPRN_ZPR, 0x10000000);
-
- flush_instruction_cache();
-
- /*
- * Set up the real-mode cache parameters for the exception vector
- * handlers (which are run in real-mode).
- */
-
- mtspr(SPRN_DCWR, 0x00000000); /* All caching is write-back */
-
- /*
- * Cache instruction and data space where the exception
- * vectors and the kernel live in real-mode.
- */
-
- mtspr(SPRN_DCCR, 0xFFFF0000); /* 2GByte of data space at 0x0. */
- mtspr(SPRN_ICCR, 0xFFFF0000); /* 2GByte of instr. space at 0x0. */
-}
-
-#define LARGE_PAGE_SIZE_16M (1<<24)
-#define LARGE_PAGE_SIZE_4M (1<<22)
-
-unsigned long __init mmu_mapin_ram(unsigned long top)
-{
- unsigned long v, s, mapped;
- phys_addr_t p;
-
- v = KERNELBASE;
- p = 0;
- s = total_lowmem;
-
- if (__map_without_ltlbs)
- return 0;
-
- while (s >= LARGE_PAGE_SIZE_16M) {
- pmd_t *pmdp;
- unsigned long val = p | _PMD_SIZE_16M | _PAGE_EXEC | _PAGE_HWWRITE;
-
- pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v);
- pmd_val(*pmdp++) = val;
- pmd_val(*pmdp++) = val;
- pmd_val(*pmdp++) = val;
- pmd_val(*pmdp++) = val;
-
- v += LARGE_PAGE_SIZE_16M;
- p += LARGE_PAGE_SIZE_16M;
- s -= LARGE_PAGE_SIZE_16M;
- }
-
- while (s >= LARGE_PAGE_SIZE_4M) {
- pmd_t *pmdp;
- unsigned long val = p | _PMD_SIZE_4M | _PAGE_EXEC | _PAGE_HWWRITE;
-
- pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v);
- pmd_val(*pmdp) = val;
-
- v += LARGE_PAGE_SIZE_4M;
- p += LARGE_PAGE_SIZE_4M;
- s -= LARGE_PAGE_SIZE_4M;
- }
-
- mapped = total_lowmem - s;
-
- /* If the size of RAM is not an exact power of two, we may not
- * have covered RAM in its entirety with 16 and 4 MiB
- * pages. Consequently, restrict the top end of RAM currently
- * allocable so that calls to the MEMBLOCK to allocate PTEs for "tail"
- * coverage with normal-sized pages (or other reasons) do not
- * attempt to allocate outside the allowed range.
- */
- memblock_set_current_limit(mapped);
-
- return mapped;
-}
-
-void setup_initial_memory_limit(phys_addr_t first_memblock_base,
- phys_addr_t first_memblock_size)
-{
- /* We don't currently support the first MEMBLOCK not mapping 0
- * physical on those processors
- */
- BUG_ON(first_memblock_base != 0);
-
- /* 40x can only access 16MB at the moment (see head_40x.S) */
- memblock_set_current_limit(min_t(u64, first_memblock_size, 0x00800000));
-}
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 51230ee6a407..8c1582b2987d 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -1,38 +1,19 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the linux ppc-specific parts of the memory manager.
#
-subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
-
-ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC)
-
-obj-y := fault.o mem.o pgtable.o gup.o mmap.o \
- init_$(CONFIG_WORD_SIZE).o \
- pgtable_$(CONFIG_WORD_SIZE).o
-obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \
- tlb_nohash_low.o
-obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(CONFIG_WORD_SIZE)e.o
-hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o
-obj-$(CONFIG_PPC_STD_MMU_64) += hash_utils_64.o \
- slb_low.o slb.o stab.o \
- $(hash64-y)
-obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o
-obj-$(CONFIG_PPC_STD_MMU) += hash_low_$(CONFIG_WORD_SIZE).o \
- tlb_hash$(CONFIG_WORD_SIZE).o \
- mmu_context_hash$(CONFIG_WORD_SIZE).o
-obj-$(CONFIG_PPC_ICSWX) += icswx.o
-obj-$(CONFIG_PPC_ICSWX_PID) += icswx_pid.o
-obj-$(CONFIG_40x) += 40x_mmu.o
-obj-$(CONFIG_44x) += 44x_mmu.o
-obj-$(CONFIG_PPC_FSL_BOOK3E) += fsl_booke_mmu.o
-obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
-obj-$(CONFIG_PPC_MM_SLICES) += slice.o
-obj-y += hugetlbpage.o
-ifeq ($(CONFIG_HUGETLB_PAGE),y)
-obj-$(CONFIG_PPC_STD_MMU_64) += hugetlbpage-hash64.o
-obj-$(CONFIG_PPC_BOOK3E_MMU) += hugetlbpage-book3e.o
-endif
-obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hugepage-hash64.o
-obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o
+obj-y := fault.o mem.o pgtable.o maccess.o pageattr.o \
+ init_$(BITS).o pgtable_$(BITS).o \
+ pgtable-frag.o ioremap.o ioremap_$(BITS).o \
+ init-common.o mmu_context.o drmem.o \
+ cacheflush.o
+obj-$(CONFIG_PPC_MMU_NOHASH) += nohash/
+obj-$(CONFIG_PPC_BOOK3S_32) += book3s32/
+obj-$(CONFIG_PPC_BOOK3S_64) += book3s64/
+obj-$(CONFIG_NUMA) += numa.o
+obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
-obj-$(CONFIG_HIGHMEM) += highmem.o
+obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o
+obj-$(CONFIG_PTDUMP) += ptdump/
+obj-$(CONFIG_KASAN) += kasan/
diff --git a/arch/powerpc/mm/book3s32/Makefile b/arch/powerpc/mm/book3s32/Makefile
new file mode 100644
index 000000000000..50dd8f6bdf46
--- /dev/null
+++ b/arch/powerpc/mm/book3s32/Makefile
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0
+
+KASAN_SANITIZE_mmu.o := n
+
+ifdef CONFIG_KASAN
+CFLAGS_mmu.o += -DDISABLE_BRANCH_PROFILING
+endif
+
+obj-y += mmu.o mmu_context.o
+obj-$(CONFIG_PPC_BOOK3S_603) += nohash_low.o
+obj-$(CONFIG_PPC_BOOK3S_604) += hash_low.o tlb.o
+obj-$(CONFIG_PPC_KUAP) += kuap.o
diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/book3s32/hash_low.S
index 115347f74ce5..4ed0efd03db5 100644
--- a/arch/powerpc/mm/hash_low_32.S
+++ b/arch/powerpc/mm/book3s32/hash_low.S
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* PowerPC version
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
@@ -11,48 +12,46 @@
* This file contains low-level assembler routines for managing
* the PowerPC MMU hash table. (PPC 8xx processors don't use a
* hash table, so this file is not used on them.)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
+#include <linux/export.h>
+#include <linux/pgtable.h>
+#include <linux/init.h>
#include <asm/reg.h>
#include <asm/page.h>
-#include <asm/pgtable.h>
#include <asm/cputable.h>
#include <asm/ppc_asm.h>
#include <asm/thread_info.h>
#include <asm/asm-offsets.h>
+#include <asm/feature-fixups.h>
+#include <asm/code-patching-asm.h>
-#ifdef CONFIG_SMP
- .section .bss
- .align 2
- .globl mmu_hash_lock
-mmu_hash_lock:
- .space 4
-#endif /* CONFIG_SMP */
+#ifdef CONFIG_PTE_64BIT
+#define PTE_T_SIZE 8
+#define PTE_FLAGS_OFFSET 4 /* offset of PTE flags, in bytes */
+#else
+#define PTE_T_SIZE 4
+#define PTE_FLAGS_OFFSET 0
+#endif
/*
* Load a PTE into the hash table, if possible.
- * The address is in r4, and r3 contains an access flag:
- * _PAGE_RW (0x400) if a write.
+ * The address is in r4, and r3 contains required access flags:
+ * - For ISI: _PAGE_PRESENT | _PAGE_EXEC
+ * - For DSI: _PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE if a write.
* r9 contains the SRR1 value, from which we use the MSR_PR bit.
* SPRG_THREAD contains the physical address of the current task's thread.
*
* Returns to the caller if the access is illegal or there is no
* mapping for the address. Otherwise it places an appropriate PTE
* in the hash table and returns from the exception.
- * Uses r0, r3 - r8, r10, ctr, lr.
+ * Uses r0, r3 - r6, r8, r10, ctr, lr.
*/
.text
_GLOBAL(hash_page)
- tophys(r7,0) /* gets -KERNELBASE into r7 */
#ifdef CONFIG_SMP
- addis r8,r7,mmu_hash_lock@h
- ori r8,r8,mmu_hash_lock@l
+ lis r8, (mmu_hash_lock - PAGE_OFFSET)@h
+ ori r8, r8, (mmu_hash_lock - PAGE_OFFSET)@l
lis r0,0x0fff
b 10f
11: lwz r6,0(r8)
@@ -66,16 +65,20 @@ _GLOBAL(hash_page)
isync
#endif
/* Get PTE (linux-style) and check access */
- lis r0,KERNELBASE@h /* check if kernel address */
+ lis r0, TASK_SIZE@h /* check if kernel address */
cmplw 0,r4,r0
mfspr r8,SPRN_SPRG_THREAD /* current task's THREAD (phys) */
- ori r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */
lwz r5,PGDIR(r8) /* virt page-table root */
blt+ 112f /* assume user more likely */
lis r5,swapper_pg_dir@ha /* if kernel address, use */
+ andi. r0,r9,MSR_PR /* Check usermode */
addi r5,r5,swapper_pg_dir@l /* kernel page table */
- rlwimi r3,r9,32-12,29,29 /* MSR_PR -> _PAGE_USER */
-112: add r5,r5,r7 /* convert to phys addr */
+#ifdef CONFIG_SMP
+ bne- .Lhash_page_out /* return if usermode */
+#else
+ bnelr-
+#endif
+112: tophys(r5, r5)
#ifndef CONFIG_PTE_64BIT
rlwimi r5,r4,12,20,29 /* insert top 10 bits of address */
lwz r8,0(r5) /* get pmd entry */
@@ -86,7 +89,7 @@ _GLOBAL(hash_page)
rlwinm. r8,r8,0,0,20 /* extract pt base address */
#endif
#ifdef CONFIG_SMP
- beq- hash_page_out /* return if no mapping */
+ beq- .Lhash_page_out /* return if no mapping */
#else
/* XXX it seems like the 601 will give a machine fault on the
rfi if its alignment is wrong (bottom 4 bits of address are
@@ -98,27 +101,35 @@ _GLOBAL(hash_page)
rlwimi r8,r4,22,20,29 /* insert next 10 bits of address */
#else
rlwimi r8,r4,23,20,28 /* compute pte address */
+ /*
+ * If PTE_64BIT is set, the low word is the flags word; use that
+ * word for locking since it contains all the interesting bits.
+ */
+ addi r8,r8,PTE_FLAGS_OFFSET
#endif
- rlwinm r0,r3,32-3,24,24 /* _PAGE_RW access -> _PAGE_DIRTY */
- ori r0,r0,_PAGE_ACCESSED|_PAGE_HASHPTE
/*
* Update the linux PTE atomically. We do the lwarx up-front
* because almost always, there won't be a permission violation
* and there won't already be an HPTE, and thus we will have
* to update the PTE to set _PAGE_HASHPTE. -- paulus.
- *
- * If PTE_64BIT is set, the low word is the flags word; use that
- * word for locking since it contains all the interesting bits.
*/
-#if (PTE_FLAGS_OFFSET != 0)
- addi r8,r8,PTE_FLAGS_OFFSET
-#endif
-retry:
+.Lretry:
lwarx r6,0,r8 /* get linux-style pte, flag word */
+#ifdef CONFIG_PPC_KUAP
+ mfsrin r5,r4
+ rlwinm r0,r9,28,_PAGE_WRITE /* MSR[PR] => _PAGE_WRITE */
+ rlwinm r5,r5,12,_PAGE_WRITE /* Ks => _PAGE_WRITE */
+ andc r5,r5,r0 /* Ks & ~MSR[PR] */
+ andc r5,r6,r5 /* Clear _PAGE_WRITE when Ks = 1 && MSR[PR] = 0 */
+ andc. r5,r3,r5 /* check access & ~permission */
+#else
andc. r5,r3,r6 /* check access & ~permission */
+#endif
+ rlwinm r0,r3,32-3,24,24 /* _PAGE_WRITE access -> _PAGE_DIRTY */
+ ori r0,r0,_PAGE_ACCESSED|_PAGE_HASHPTE
#ifdef CONFIG_SMP
- bne- hash_page_out /* return if access not permitted */
+ bne- .Lhash_page_out /* return if access not permitted */
#else
bnelr-
#endif
@@ -133,36 +144,28 @@ retry:
#endif /* CONFIG_SMP */
#endif /* CONFIG_PTE_64BIT */
stwcx. r5,0,r8 /* attempt to update PTE */
- bne- retry /* retry if someone got there first */
+ bne- .Lretry /* retry if someone got there first */
mfsrin r3,r4 /* get segment reg for segment */
- mfctr r0
- stw r0,_CTR(r11)
bl create_hpte /* add the hash table entry */
#ifdef CONFIG_SMP
eieio
- addis r8,r7,mmu_hash_lock@ha
+ lis r8, (mmu_hash_lock - PAGE_OFFSET)@ha
li r0,0
- stw r0,mmu_hash_lock@l(r8)
+ stw r0, (mmu_hash_lock - PAGE_OFFSET)@l(r8)
#endif
-
- /* Return from the exception */
- lwz r5,_CTR(r11)
- mtctr r5
- lwz r0,GPR0(r11)
- lwz r7,GPR7(r11)
- lwz r8,GPR8(r11)
- b fast_exception_return
+ b fast_hash_page_return
#ifdef CONFIG_SMP
-hash_page_out:
+.Lhash_page_out:
eieio
- addis r8,r7,mmu_hash_lock@ha
+ lis r8, (mmu_hash_lock - PAGE_OFFSET)@ha
li r0,0
- stw r0,mmu_hash_lock@l(r8)
+ stw r0, (mmu_hash_lock - PAGE_OFFSET)@l(r8)
blr
#endif /* CONFIG_SMP */
+_ASM_NOKPROBE_SYMBOL(hash_page)
/*
* Add an entry for a particular page to the hash table.
@@ -177,15 +180,8 @@ _GLOBAL(add_hash_page)
mflr r0
stw r0,4(r1)
- /* Convert context and va to VSID */
- mulli r3,r3,897*16 /* multiply context by context skew */
- rlwinm r0,r4,4,28,31 /* get ESID (top 4 bits of va) */
- mulli r0,r0,0x111 /* multiply by ESID skew */
- add r3,r3,r0 /* note create_hpte trims to 24 bits */
-
#ifdef CONFIG_SMP
- CURRENT_THREAD_INFO(r8, r1) /* use cpu number to make tag */
- lwz r8,TI_CPU(r8) /* to go in mmu_hash_lock */
+ lwz r8,TASK_CPU(r2) /* to go in mmu_hash_lock */
oris r8,r8,12
#endif /* CONFIG_SMP */
@@ -199,25 +195,21 @@ _GLOBAL(add_hash_page)
* covered by a BAT). -- paulus
*/
mfmsr r9
- SYNC
rlwinm r0,r9,0,17,15 /* clear bit 16 (MSR_EE) */
rlwinm r0,r0,0,28,26 /* clear MSR_DR */
mtmsr r0
- SYNC_601
isync
- tophys(r7,0)
-
#ifdef CONFIG_SMP
- addis r6,r7,mmu_hash_lock@ha
- addi r6,r6,mmu_hash_lock@l
+ lis r6, (mmu_hash_lock - PAGE_OFFSET)@ha
+ addi r6, r6, (mmu_hash_lock - PAGE_OFFSET)@l
10: lwarx r0,0,r6 /* take the mmu_hash_lock */
- cmpi 0,r0,0
+ cmpwi 0,r0,0
bne- 11f
stwcx. r8,0,r6
beq+ 12f
11: lwz r0,0(r6)
- cmpi 0,r0,0
+ cmpwi 0,r0,0
beq 10b
b 11b
12: isync
@@ -251,12 +243,18 @@ _GLOBAL(add_hash_page)
stwcx. r5,0,r8
bne- 1b
+ /* Convert context and va to VSID */
+ mulli r3,r3,897*16 /* multiply context by context skew */
+ rlwinm r0,r4,4,28,31 /* get ESID (top 4 bits of va) */
+ mulli r0,r0,0x111 /* multiply by ESID skew */
+ add r3,r3,r0 /* note create_hpte trims to 24 bits */
+
bl create_hpte
9:
#ifdef CONFIG_SMP
- addis r6,r7,mmu_hash_lock@ha
- addi r6,r6,mmu_hash_lock@l
+ lis r6, (mmu_hash_lock - PAGE_OFFSET)@ha
+ addi r6, r6, (mmu_hash_lock - PAGE_OFFSET)@l
eieio
li r0,0
stw r0,0(r6) /* clear mmu_hash_lock */
@@ -264,22 +262,20 @@ _GLOBAL(add_hash_page)
/* reenable interrupts and DR */
mtmsr r9
- SYNC_601
isync
lwz r0,4(r1)
mtlr r0
blr
+_ASM_NOKPROBE_SYMBOL(add_hash_page)
/*
* This routine adds a hardware PTE to the hash table.
* It is designed to be called with the MMU either on or off.
* r3 contains the VSID, r4 contains the virtual address,
* r5 contains the linux PTE, r6 contains the old value of the
- * linux PTE (before setting _PAGE_HASHPTE) and r7 contains the
- * offset to be added to addresses (0 if the MMU is on,
- * -KERNELBASE if it is off). r10 contains the upper half of
- * the PTE if CONFIG_PTE_64BIT.
+ * linux PTE (before setting _PAGE_HASHPTE). r10 contains the
+ * upper half of the PTE if CONFIG_PTE_64BIT.
* On SMP, the caller should have the mmu_hash_lock held.
* We assume that the caller has (or will) set the _PAGE_HASHPTE
* bit in the linux PTE in memory. The value passed in r6 should
@@ -290,9 +286,9 @@ _GLOBAL(add_hash_page)
*
* For speed, 4 of the instructions get patched once the size and
* physical address of the hash table are known. These definitions
- * of Hash_base and Hash_bits below are just an example.
+ * of Hash_base and Hash_bits below are for the early hash table.
*/
-Hash_base = 0xc0180000
+Hash_base = early_hash
Hash_bits = 12 /* e.g. 256kB hash table */
Hash_msk = (((1 << Hash_bits) - 1) * 64)
@@ -313,15 +309,19 @@ Hash_msk = (((1 << Hash_bits) - 1) * 64)
#define HASH_LEFT 31-(LG_PTEG_SIZE+Hash_bits-1)
#define HASH_RIGHT 31-LG_PTEG_SIZE
+__REF
_GLOBAL(create_hpte)
/* Convert linux-style PTE (r5) to low word of PPC-style PTE (r8) */
- rlwinm r8,r5,32-10,31,31 /* _PAGE_RW -> PP lsb */
- rlwinm r0,r5,32-7,31,31 /* _PAGE_DIRTY -> PP lsb */
+ lis r0, TASK_SIZE@h
+ rlwinm r5,r5,0,~3 /* Clear PP bits */
+ cmplw r4,r0
+ rlwinm r8,r5,32-9,30,30 /* _PAGE_WRITE -> PP msb */
+ rlwinm r0,r5,32-6,30,30 /* _PAGE_DIRTY -> PP msb */
and r8,r8,r0 /* writable if _RW & _DIRTY */
- rlwimi r5,r5,32-1,30,30 /* _PAGE_USER -> PP msb */
- rlwimi r5,r5,32-2,31,31 /* _PAGE_USER -> PP lsb */
- ori r8,r8,0xe04 /* clear out reserved bits */
- andc r8,r5,r8 /* PP = user? (rw&dirty? 2: 3): 0 */
+ bge- 1f /* Kernelspace ? Skip */
+ ori r5,r5,3 /* Userspace ? PP = 3 */
+1: ori r8,r8,0xe04 /* clear out reserved bits */
+ andc r8,r5,r8 /* PP = user? (rw&dirty? 1: 3): 0 */
BEGIN_FTR_SECTION
rlwinm r8,r8,0,~_PAGE_COHERENT /* clear M (coherence not required) */
END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT)
@@ -336,11 +336,13 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT)
rlwimi r5,r4,10,26,31 /* put in API (abbrev page index) */
SET_V(r5) /* set V (valid) bit */
+ patch_site 0f, patch__hash_page_A0
+ patch_site 1f, patch__hash_page_A1
+ patch_site 2f, patch__hash_page_A2
/* Get the address of the primary PTE group in the hash table (r3) */
-_GLOBAL(hash_page_patch_A)
- addis r0,r7,Hash_base@h /* base address of hash table */
- rlwimi r0,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */
- rlwinm r3,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */
+0: lis r0, (Hash_base - PAGE_OFFSET)@h /* base address of hash table */
+1: rlwimi r0,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */
+2: rlwinm r3,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */
xor r3,r3,r0 /* make primary hash */
li r0,8 /* PTEs/group */
@@ -352,30 +354,25 @@ _GLOBAL(hash_page_patch_A)
beq+ 10f /* no PTE: go look for an empty slot */
tlbie r4
- addis r4,r7,htab_hash_searches@ha
- lwz r6,htab_hash_searches@l(r4)
- addi r6,r6,1 /* count how many searches we do */
- stw r6,htab_hash_searches@l(r4)
-
/* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */
mtctr r0
addi r4,r3,-HPTE_SIZE
1: LDPTEu r6,HPTE_SIZE(r4) /* get next PTE */
CMPPTE 0,r6,r5
bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */
- beq+ found_slot
+ beq+ .Lfound_slot
+ patch_site 0f, patch__hash_page_B
/* Search the secondary PTEG for a matching PTE */
ori r5,r5,PTE_H /* set H (secondary hash) bit */
-_GLOBAL(hash_page_patch_B)
- xoris r4,r3,Hash_msk>>16 /* compute secondary hash */
+0: xoris r4,r3,Hash_msk>>16 /* compute secondary hash */
xori r4,r4,(-PTEG_SIZE & 0xffff)
addi r4,r4,-HPTE_SIZE
mtctr r0
2: LDPTEu r6,HPTE_SIZE(r4)
CMPPTE 0,r6,r5
bdnzf 2,2b
- beq+ found_slot
+ beq+ .Lfound_slot
xori r5,r5,PTE_H /* clear H bit again */
/* Search the primary PTEG for an empty slot */
@@ -384,25 +381,19 @@ _GLOBAL(hash_page_patch_B)
1: LDPTEu r6,HPTE_SIZE(r4) /* get next PTE */
TST_V(r6) /* test valid bit */
bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */
- beq+ found_empty
-
- /* update counter of times that the primary PTEG is full */
- addis r4,r7,primary_pteg_full@ha
- lwz r6,primary_pteg_full@l(r4)
- addi r6,r6,1
- stw r6,primary_pteg_full@l(r4)
+ beq+ .Lfound_empty
+ patch_site 0f, patch__hash_page_C
/* Search the secondary PTEG for an empty slot */
ori r5,r5,PTE_H /* set H (secondary hash) bit */
-_GLOBAL(hash_page_patch_C)
- xoris r4,r3,Hash_msk>>16 /* compute secondary hash */
+0: xoris r4,r3,Hash_msk>>16 /* compute secondary hash */
xori r4,r4,(-PTEG_SIZE & 0xffff)
addi r4,r4,-HPTE_SIZE
mtctr r0
2: LDPTEu r6,HPTE_SIZE(r4)
TST_V(r6)
bdnzf 2,2b
- beq+ found_empty
+ beq+ .Lfound_empty
xori r5,r5,PTE_H /* clear H bit again */
/*
@@ -413,36 +404,20 @@ _GLOBAL(hash_page_patch_C)
* and we know there is a definite (although small) speed
* advantage to putting the PTE in the primary PTEG, we always
* put the PTE in the primary PTEG.
- *
- * In addition, we skip any slot that is mapping kernel text in
- * order to avoid a deadlock when not using BAT mappings if
- * trying to hash in the kernel hash code itself after it has
- * already taken the hash table lock. This works in conjunction
- * with pre-faulting of the kernel text.
- *
- * If the hash table bucket is full of kernel text entries, we'll
- * lockup here but that shouldn't happen
*/
-1: addis r4,r7,next_slot@ha /* get next evict slot */
- lwz r6,next_slot@l(r4)
+ lis r4, (next_slot - PAGE_OFFSET)@ha /* get next evict slot */
+ lwz r6, (next_slot - PAGE_OFFSET)@l(r4)
addi r6,r6,HPTE_SIZE /* search for candidate */
andi. r6,r6,7*HPTE_SIZE
stw r6,next_slot@l(r4)
add r4,r3,r6
- LDPTE r0,HPTE_SIZE/2(r4) /* get PTE second word */
- clrrwi r0,r0,12
- lis r6,etext@h
- ori r6,r6,etext@l /* get etext */
- tophys(r6,r6)
- cmpl cr0,r0,r6 /* compare and try again */
- blt 1b
#ifndef CONFIG_SMP
/* Store PTE in PTEG */
-found_empty:
+.Lfound_empty:
STPTE r5,0(r4)
-found_slot:
+.Lfound_slot:
STPTE r8,HPTE_SIZE/2(r4)
#else /* CONFIG_SMP */
@@ -463,8 +438,8 @@ found_slot:
* We do however have to make sure that the PTE is never in an invalid
* state with the V bit set.
*/
-found_empty:
-found_slot:
+.Lfound_empty:
+.Lfound_slot:
CLR_V(r5,r0) /* clear V (valid) bit in PTE */
STPTE r5,0(r4)
sync
@@ -477,15 +452,13 @@ found_slot:
sync /* make sure pte updates get to memory */
blr
+ .previous
+_ASM_NOKPROBE_SYMBOL(create_hpte)
.section .bss
.align 2
next_slot:
.space 4
-primary_pteg_full:
- .space 4
-htab_hash_searches:
- .space 4
.previous
/*
@@ -496,9 +469,8 @@ htab_hash_searches:
*
* We assume that there is a hash table in use (Hash != 0).
*/
+__REF
_GLOBAL(flush_hash_pages)
- tophys(r7,0)
-
/*
* We disable interrupts here, even on UP, because we want
* the _PAGE_HASHPTE bit to be a reliable indication of
@@ -508,11 +480,9 @@ _GLOBAL(flush_hash_pages)
* covered by a BAT). -- paulus
*/
mfmsr r10
- SYNC
rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */
rlwinm r0,r0,0,28,26 /* clear MSR_DR */
mtmsr r0
- SYNC_601
isync
/* First find a PTE in the range that has _PAGE_HASHPTE set */
@@ -520,14 +490,15 @@ _GLOBAL(flush_hash_pages)
rlwimi r5,r4,22,20,29
#else
rlwimi r5,r4,23,20,28
+ addi r5,r5,PTE_FLAGS_OFFSET
#endif
-1: lwz r0,PTE_FLAGS_OFFSET(r5)
+1: lwz r0,0(r5)
cmpwi cr1,r6,1
andi. r0,r0,_PAGE_HASHPTE
bne 2f
ble cr1,19f
addi r4,r4,0x1000
- addi r5,r5,PTE_SIZE
+ addi r5,r5,PTE_T_SIZE
addi r6,r6,-1
b 1b
@@ -543,19 +514,18 @@ _GLOBAL(flush_hash_pages)
SET_V(r11) /* set V (valid) bit */
#ifdef CONFIG_SMP
- addis r9,r7,mmu_hash_lock@ha
- addi r9,r9,mmu_hash_lock@l
- CURRENT_THREAD_INFO(r8, r1)
- add r8,r8,r7
- lwz r8,TI_CPU(r8)
+ lis r9, (mmu_hash_lock - PAGE_OFFSET)@ha
+ addi r9, r9, (mmu_hash_lock - PAGE_OFFSET)@l
+ tophys (r8, r2)
+ lwz r8, TASK_CPU(r8)
oris r8,r8,9
10: lwarx r0,0,r9
- cmpi 0,r0,0
+ cmpwi 0,r0,0
bne- 11f
stwcx. r8,0,r9
beq+ 12f
11: lwz r0,0(r9)
- cmpi 0,r0,0
+ cmpwi 0,r0,0
beq 10b
b 11b
12: isync
@@ -566,9 +536,6 @@ _GLOBAL(flush_hash_pages)
* already clear, we're done (for this pte). If not,
* clear it (atomically) and proceed. -- paulus.
*/
-#if (PTE_FLAGS_OFFSET != 0)
- addi r5,r5,PTE_FLAGS_OFFSET
-#endif
33: lwarx r8,0,r5 /* fetch the pte flags word */
andi. r0,r8,_PAGE_HASHPTE
beq 8f /* done if HASHPTE is already clear */
@@ -576,11 +543,13 @@ _GLOBAL(flush_hash_pages)
stwcx. r8,0,r5 /* update the pte */
bne- 33b
+ patch_site 0f, patch__flush_hash_A0
+ patch_site 1f, patch__flush_hash_A1
+ patch_site 2f, patch__flush_hash_A2
/* Get the address of the primary PTE group in the hash table (r3) */
-_GLOBAL(flush_hash_patch_A)
- addis r8,r7,Hash_base@h /* base address of hash table */
- rlwimi r8,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */
- rlwinm r0,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */
+0: lis r8, (Hash_base - PAGE_OFFSET)@h /* base address of hash table */
+1: rlwimi r8,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */
+2: rlwinm r0,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */
xor r8,r0,r8 /* make primary hash */
/* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */
@@ -592,11 +561,11 @@ _GLOBAL(flush_hash_patch_A)
bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */
beq+ 3f
+ patch_site 0f, patch__flush_hash_B
/* Search the secondary PTEG for a matching PTE */
ori r11,r11,PTE_H /* set H (secondary hash) bit */
li r0,8 /* PTEs/group */
-_GLOBAL(flush_hash_patch_B)
- xoris r12,r8,Hash_msk>>16 /* compute secondary hash */
+0: xoris r12,r8,Hash_msk>>16 /* compute secondary hash */
xori r12,r12,(-PTEG_SIZE & 0xffff)
addi r12,r12,-HPTE_SIZE
mtctr r0
@@ -614,7 +583,7 @@ _GLOBAL(flush_hash_patch_B)
8: ble cr1,9f /* if all ptes checked */
81: addi r6,r6,-1
- addi r5,r5,PTE_SIZE
+ addi r5,r5,PTE_T_SIZE
addi r4,r4,0x1000
lwz r0,0(r5) /* check next pte */
cmpwi cr1,r6,1
@@ -630,83 +599,8 @@ _GLOBAL(flush_hash_patch_B)
#endif
19: mtmsr r10
- SYNC_601
- isync
- blr
-
-/*
- * Flush an entry from the TLB
- */
-_GLOBAL(_tlbie)
-#ifdef CONFIG_SMP
- CURRENT_THREAD_INFO(r8, r1)
- lwz r8,TI_CPU(r8)
- oris r8,r8,11
- mfmsr r10
- SYNC
- rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */
- rlwinm r0,r0,0,28,26 /* clear DR */
- mtmsr r0
- SYNC_601
- isync
- lis r9,mmu_hash_lock@h
- ori r9,r9,mmu_hash_lock@l
- tophys(r9,r9)
-10: lwarx r7,0,r9
- cmpwi 0,r7,0
- bne- 10b
- stwcx. r8,0,r9
- bne- 10b
- eieio
- tlbie r3
- sync
- TLBSYNC
- li r0,0
- stw r0,0(r9) /* clear mmu_hash_lock */
- mtmsr r10
- SYNC_601
- isync
-#else /* CONFIG_SMP */
- tlbie r3
- sync
-#endif /* CONFIG_SMP */
- blr
-
-/*
- * Flush the entire TLB. 603/603e only
- */
-_GLOBAL(_tlbia)
-#if defined(CONFIG_SMP)
- CURRENT_THREAD_INFO(r8, r1)
- lwz r8,TI_CPU(r8)
- oris r8,r8,10
- mfmsr r10
- SYNC
- rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */
- rlwinm r0,r0,0,28,26 /* clear DR */
- mtmsr r0
- SYNC_601
isync
- lis r9,mmu_hash_lock@h
- ori r9,r9,mmu_hash_lock@l
- tophys(r9,r9)
-10: lwarx r7,0,r9
- cmpwi 0,r7,0
- bne- 10b
- stwcx. r8,0,r9
- bne- 10b
- sync
- tlbia
- sync
- TLBSYNC
- li r0,0
- stw r0,0(r9) /* clear mmu_hash_lock */
- mtmsr r10
- SYNC_601
- isync
-#else /* CONFIG_SMP */
- sync
- tlbia
- sync
-#endif /* CONFIG_SMP */
blr
+ .previous
+EXPORT_SYMBOL(flush_hash_pages)
+_ASM_NOKPROBE_SYMBOL(flush_hash_pages)
diff --git a/arch/powerpc/mm/book3s32/kuap.c b/arch/powerpc/mm/book3s32/kuap.c
new file mode 100644
index 000000000000..3a8815555a48
--- /dev/null
+++ b/arch/powerpc/mm/book3s32/kuap.c
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <asm/kup.h>
+#include <asm/smp.h>
+
+void setup_kuap(bool disabled)
+{
+ if (!disabled) {
+ update_user_segments(mfsr(0) | SR_KS);
+ isync(); /* Context sync required after mtsr() */
+ init_mm.context.sr0 |= SR_KS;
+ current->thread.sr0 |= SR_KS;
+ }
+
+ if (smp_processor_id() != boot_cpuid)
+ return;
+
+ if (disabled)
+ cur_cpu_spec->mmu_features &= ~MMU_FTR_KUAP;
+ else
+ pr_info("Activating Kernel Userspace Access Protection\n");
+}
diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
new file mode 100644
index 000000000000..c42ecdf94e48
--- /dev/null
+++ b/arch/powerpc/mm/book3s32/mmu.c
@@ -0,0 +1,446 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * This file contains the routines for handling the MMU on those
+ * PowerPC implementations where the MMU substantially follows the
+ * architecture specification. This includes the 6xx, 7xx, 7xxx,
+ * and 8260 implementations but excludes the 8xx and 4xx.
+ * -- paulus
+ *
+ * Derived from arch/ppc/mm/init.c:
+ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ * and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ * Copyright (C) 1996 Paul Mackerras
+ *
+ * Derived from "arch/i386/mm/init.c"
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <linux/memblock.h>
+
+#include <asm/mmu.h>
+#include <asm/machdep.h>
+#include <asm/text-patching.h>
+#include <asm/sections.h>
+
+#include <mm/mmu_decl.h>
+
+u8 __initdata early_hash[SZ_256K] __aligned(SZ_256K) = {0};
+
+static struct hash_pte __initdata *Hash = (struct hash_pte *)early_hash;
+static unsigned long __initdata Hash_size, Hash_mask;
+static unsigned int __initdata hash_mb, hash_mb2;
+unsigned long __initdata _SDR1;
+
+struct ppc_bat BATS[8][2]; /* 8 pairs of IBAT, DBAT */
+
+static struct batrange { /* stores address ranges mapped by BATs */
+ unsigned long start;
+ unsigned long limit;
+ phys_addr_t phys;
+} bat_addrs[8];
+
+#ifdef CONFIG_SMP
+unsigned long mmu_hash_lock;
+#endif
+
+/*
+ * Return PA for this VA if it is mapped by a BAT, or 0
+ */
+phys_addr_t v_block_mapped(unsigned long va)
+{
+ int b;
+ for (b = 0; b < ARRAY_SIZE(bat_addrs); ++b)
+ if (va >= bat_addrs[b].start && va < bat_addrs[b].limit)
+ return bat_addrs[b].phys + (va - bat_addrs[b].start);
+ return 0;
+}
+
+/*
+ * Return VA for a given PA or 0 if not mapped
+ */
+unsigned long p_block_mapped(phys_addr_t pa)
+{
+ int b;
+ for (b = 0; b < ARRAY_SIZE(bat_addrs); ++b)
+ if (pa >= bat_addrs[b].phys
+ && pa < (bat_addrs[b].limit-bat_addrs[b].start)
+ +bat_addrs[b].phys)
+ return bat_addrs[b].start+(pa-bat_addrs[b].phys);
+ return 0;
+}
+
+int __init find_free_bat(void)
+{
+ int b;
+ int n = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4;
+
+ for (b = 0; b < n; b++) {
+ struct ppc_bat *bat = BATS[b];
+
+ if (!(bat[1].batu & 3))
+ return b;
+ }
+ return -1;
+}
+
+/*
+ * This function calculates the size of the larger block usable to map the
+ * beginning of an area based on the start address and size of that area:
+ * - max block size is 256 on 6xx.
+ * - base address must be aligned to the block size. So the maximum block size
+ * is identified by the lowest bit set to 1 in the base address (for instance
+ * if base is 0x16000000, max size is 0x02000000).
+ * - block size has to be a power of two. This is calculated by finding the
+ * highest bit set to 1.
+ */
+unsigned int bat_block_size(unsigned long base, unsigned long top)
+{
+ unsigned int max_size = SZ_256M;
+ unsigned int base_shift = (ffs(base) - 1) & 31;
+ unsigned int block_shift = (fls(top - base) - 1) & 31;
+
+ return min3(max_size, 1U << base_shift, 1U << block_shift);
+}
+
+/*
+ * Set up one of the IBAT (block address translation) register pairs.
+ * The parameters are not checked; in particular size must be a power
+ * of 2 between 128k and 256M.
+ */
+static void setibat(int index, unsigned long virt, phys_addr_t phys,
+ unsigned int size, pgprot_t prot)
+{
+ unsigned int bl = (size >> 17) - 1;
+ int wimgxpp;
+ struct ppc_bat *bat = BATS[index];
+ unsigned long flags = pgprot_val(prot);
+
+ if (!cpu_has_feature(CPU_FTR_NEED_COHERENT))
+ flags &= ~_PAGE_COHERENT;
+
+ wimgxpp = (flags & _PAGE_COHERENT) | (_PAGE_EXEC ? BPP_RX : BPP_XX);
+ bat[0].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */
+ bat[0].batl = BAT_PHYS_ADDR(phys) | wimgxpp;
+ if (!is_kernel_addr(virt))
+ bat[0].batu |= 1; /* Vp = 1 */
+}
+
+static void clearibat(int index)
+{
+ struct ppc_bat *bat = BATS[index];
+
+ bat[0].batu = 0;
+ bat[0].batl = 0;
+}
+
+static unsigned long __init __mmu_mapin_ram(unsigned long base, unsigned long top)
+{
+ int idx;
+
+ while ((idx = find_free_bat()) != -1 && base != top) {
+ unsigned int size = bat_block_size(base, top);
+
+ if (size < 128 << 10)
+ break;
+ setbat(idx, PAGE_OFFSET + base, base, size, PAGE_KERNEL_X);
+ base += size;
+ }
+
+ return base;
+}
+
+unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top)
+{
+ unsigned long done;
+ unsigned long border = (unsigned long)__srwx_boundary - PAGE_OFFSET;
+ unsigned long size;
+
+ size = roundup_pow_of_two((unsigned long)_einittext - PAGE_OFFSET);
+ setibat(0, PAGE_OFFSET, 0, size, PAGE_KERNEL_X);
+
+ if (debug_pagealloc_enabled_or_kfence()) {
+ pr_debug_once("Read-Write memory mapped without BATs\n");
+ if (base >= border)
+ return base;
+ if (top >= border)
+ top = border;
+ }
+
+ if (!strict_kernel_rwx_enabled() || base >= border || top <= border)
+ return __mmu_mapin_ram(base, top);
+
+ done = __mmu_mapin_ram(base, border);
+ if (done != border)
+ return done;
+
+ return __mmu_mapin_ram(border, top);
+}
+
+static bool is_module_segment(unsigned long addr)
+{
+ if (!IS_ENABLED(CONFIG_EXECMEM))
+ return false;
+ if (addr < ALIGN_DOWN(MODULES_VADDR, SZ_256M))
+ return false;
+ if (addr > ALIGN(MODULES_END, SZ_256M) - 1)
+ return false;
+ return true;
+}
+
+int mmu_mark_initmem_nx(void)
+{
+ int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4;
+ int i;
+ unsigned long base = (unsigned long)_stext - PAGE_OFFSET;
+ unsigned long top = ALIGN((unsigned long)_etext - PAGE_OFFSET, SZ_128K);
+ unsigned long border = (unsigned long)__init_begin - PAGE_OFFSET;
+ unsigned long size;
+
+ for (i = 0; i < nb - 1 && base < top;) {
+ size = bat_block_size(base, top);
+ setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_X);
+ base += size;
+ }
+ if (base < top) {
+ size = bat_block_size(base, top);
+ if ((top - base) > size) {
+ size <<= 1;
+ if (strict_kernel_rwx_enabled() && base + size > border)
+ pr_warn("Some RW data is getting mapped X. "
+ "Adjust CONFIG_DATA_SHIFT to avoid that.\n");
+ }
+ setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_X);
+ base += size;
+ }
+ for (; i < nb; i++)
+ clearibat(i);
+
+ update_bats();
+
+ BUILD_BUG_ON(ALIGN_DOWN(MODULES_VADDR, SZ_256M) < TASK_SIZE);
+
+ for (i = TASK_SIZE >> 28; i < 16; i++) {
+ /* Do not set NX on VM space for modules */
+ if (is_module_segment(i << 28))
+ continue;
+
+ mtsr(mfsr(i << 28) | 0x10000000, i << 28);
+ }
+ return 0;
+}
+
+int mmu_mark_rodata_ro(void)
+{
+ int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4;
+ int i;
+
+ for (i = 0; i < nb; i++) {
+ struct ppc_bat *bat = BATS[i];
+
+ if (bat_addrs[i].start < (unsigned long)__end_rodata)
+ bat[1].batl = (bat[1].batl & ~BPP_RW) | BPP_RX;
+ }
+
+ update_bats();
+
+ return 0;
+}
+
+/*
+ * Set up one of the D BAT (block address translation) register pairs.
+ * The parameters are not checked; in particular size must be a power
+ * of 2 between 128k and 256M.
+ */
+void __init setbat(int index, unsigned long virt, phys_addr_t phys,
+ unsigned int size, pgprot_t prot)
+{
+ unsigned int bl;
+ int wimgxpp;
+ struct ppc_bat *bat;
+ unsigned long flags = pgprot_val(prot);
+
+ if (index == -1)
+ index = find_free_bat();
+ if (index == -1) {
+ pr_err("%s: no BAT available for mapping 0x%llx\n", __func__,
+ (unsigned long long)phys);
+ return;
+ }
+ bat = BATS[index];
+
+ if ((flags & _PAGE_NO_CACHE) ||
+ (cpu_has_feature(CPU_FTR_NEED_COHERENT) == 0))
+ flags &= ~_PAGE_COHERENT;
+
+ bl = (size >> 17) - 1;
+ /* Do DBAT first */
+ wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE
+ | _PAGE_COHERENT | _PAGE_GUARDED);
+ wimgxpp |= (flags & _PAGE_WRITE) ? BPP_RW : BPP_RX;
+ bat[1].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */
+ bat[1].batl = BAT_PHYS_ADDR(phys) | wimgxpp;
+ if (!is_kernel_addr(virt))
+ bat[1].batu |= 1; /* Vp = 1 */
+ if (flags & _PAGE_GUARDED) {
+ /* G bit must be zero in IBATs */
+ flags &= ~_PAGE_EXEC;
+ }
+
+ bat_addrs[index].start = virt;
+ bat_addrs[index].limit = virt + ((bl + 1) << 17) - 1;
+ bat_addrs[index].phys = phys;
+}
+
+/*
+ * Preload a translation in the hash table
+ */
+static void hash_preload(struct mm_struct *mm, unsigned long ea)
+{
+ pmd_t *pmd;
+
+ if (!mmu_has_feature(MMU_FTR_HPTE_TABLE))
+ return;
+ pmd = pmd_off(mm, ea);
+ if (!pmd_none(*pmd))
+ add_hash_page(mm->context.id, ea, pmd_val(*pmd));
+}
+
+/*
+ * This is called at the end of handling a user page fault, when the
+ * fault has been handled by updating a PTE in the linux page tables.
+ * We use it to preload an HPTE into the hash table corresponding to
+ * the updated linux PTE.
+ *
+ * This must always be called with the pte lock held.
+ */
+void __update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
+ pte_t *ptep)
+{
+ /*
+ * We don't need to worry about _PAGE_PRESENT here because we are
+ * called with either mm->page_table_lock held or ptl lock held
+ */
+
+ /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
+ if (!pte_young(*ptep) || address >= TASK_SIZE)
+ return;
+
+ /* We have to test for regs NULL since init will get here first thing at boot */
+ if (!current->thread.regs)
+ return;
+
+ /* We also avoid filling the hash if not coming from a fault */
+ if (TRAP(current->thread.regs) != 0x300 && TRAP(current->thread.regs) != 0x400)
+ return;
+
+ hash_preload(vma->vm_mm, address);
+}
+
+/*
+ * Initialize the hash table and patch the instructions in hashtable.S.
+ */
+void __init MMU_init_hw(void)
+{
+ unsigned int n_hpteg, lg_n_hpteg;
+
+ if (!mmu_has_feature(MMU_FTR_HPTE_TABLE))
+ return;
+
+ if ( ppc_md.progress ) ppc_md.progress("hash:enter", 0x105);
+
+#define LG_HPTEG_SIZE 6 /* 64 bytes per HPTEG */
+#define SDR1_LOW_BITS ((n_hpteg - 1) >> 10)
+#define MIN_N_HPTEG 1024 /* min 64kB hash table */
+
+ /*
+ * Allow 1 HPTE (1/8 HPTEG) for each page of memory.
+ * This is less than the recommended amount, but then
+ * Linux ain't AIX.
+ */
+ n_hpteg = total_memory / (PAGE_SIZE * 8);
+ if (n_hpteg < MIN_N_HPTEG)
+ n_hpteg = MIN_N_HPTEG;
+ lg_n_hpteg = __ilog2(n_hpteg);
+ if (n_hpteg & (n_hpteg - 1)) {
+ ++lg_n_hpteg; /* round up if not power of 2 */
+ n_hpteg = 1 << lg_n_hpteg;
+ }
+ Hash_size = n_hpteg << LG_HPTEG_SIZE;
+
+ /*
+ * Find some memory for the hash table.
+ */
+ if ( ppc_md.progress ) ppc_md.progress("hash:find piece", 0x322);
+ Hash = memblock_alloc_or_panic(Hash_size, Hash_size);
+ _SDR1 = __pa(Hash) | SDR1_LOW_BITS;
+
+ pr_info("Total memory = %lldMB; using %ldkB for hash table\n",
+ (unsigned long long)(total_memory >> 20), Hash_size >> 10);
+
+
+ Hash_mask = n_hpteg - 1;
+ hash_mb2 = hash_mb = 32 - LG_HPTEG_SIZE - lg_n_hpteg;
+ if (lg_n_hpteg > 16)
+ hash_mb2 = 16 - LG_HPTEG_SIZE;
+}
+
+void __init MMU_init_hw_patch(void)
+{
+ unsigned int hmask = Hash_mask >> (16 - LG_HPTEG_SIZE);
+ unsigned int hash = (unsigned int)Hash - PAGE_OFFSET;
+
+ if (!mmu_has_feature(MMU_FTR_HPTE_TABLE))
+ return;
+
+ if (ppc_md.progress)
+ ppc_md.progress("hash:patch", 0x345);
+ if (ppc_md.progress)
+ ppc_md.progress("hash:done", 0x205);
+
+ /* WARNING: Make sure nothing can trigger a KASAN check past this point */
+
+ /*
+ * Patch up the instructions in hashtable.S:create_hpte
+ */
+ modify_instruction_site(&patch__hash_page_A0, 0xffff, hash >> 16);
+ modify_instruction_site(&patch__hash_page_A1, 0x7c0, hash_mb << 6);
+ modify_instruction_site(&patch__hash_page_A2, 0x7c0, hash_mb2 << 6);
+ modify_instruction_site(&patch__hash_page_B, 0xffff, hmask);
+ modify_instruction_site(&patch__hash_page_C, 0xffff, hmask);
+
+ /*
+ * Patch up the instructions in hashtable.S:flush_hash_page
+ */
+ modify_instruction_site(&patch__flush_hash_A0, 0xffff, hash >> 16);
+ modify_instruction_site(&patch__flush_hash_A1, 0x7c0, hash_mb << 6);
+ modify_instruction_site(&patch__flush_hash_A2, 0x7c0, hash_mb2 << 6);
+ modify_instruction_site(&patch__flush_hash_B, 0xffff, hmask);
+}
+
+void setup_initial_memory_limit(phys_addr_t first_memblock_base,
+ phys_addr_t first_memblock_size)
+{
+ /* We don't currently support the first MEMBLOCK not mapping 0
+ * physical on those processors
+ */
+ BUG_ON(first_memblock_base != 0);
+
+ memblock_set_current_limit(min_t(u64, first_memblock_size, SZ_256M));
+}
+
+void __init print_system_hash_info(void)
+{
+ pr_info("Hash_size = 0x%lx\n", Hash_size);
+ if (Hash_mask)
+ pr_info("Hash_mask = 0x%lx\n", Hash_mask);
+}
+
+void __init early_init_mmu(void)
+{
+}
diff --git a/arch/powerpc/mm/mmu_context_hash32.c b/arch/powerpc/mm/book3s32/mmu_context.c
index 78fef6726e10..1922f9a6b058 100644
--- a/arch/powerpc/mm/mmu_context_hash32.c
+++ b/arch/powerpc/mm/book3s32/mmu_context.c
@@ -1,8 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* This file contains the routines for handling the MMU on those
* PowerPC implementations where the MMU substantially follows the
* architecture specification. This includes the 6xx, 7xx, 7xxx,
- * 8260, and POWER3 implementations but excludes the 8xx and 4xx.
+ * and 8260 implementations but excludes the 8xx and 4xx.
* -- paulus
*
* Derived from arch/ppc/mm/init.c:
@@ -14,12 +15,6 @@
*
* Derived from "arch/i386/mm/init.c"
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#include <linux/mm.h>
@@ -27,7 +22,12 @@
#include <linux/export.h>
#include <asm/mmu_context.h>
-#include <asm/tlbflush.h>
+
+/*
+ * Room for two PTE pointers, usually the kernel and current user pointers
+ * to their respective root page table.
+ */
+void *abatron_pteptrs[2];
/*
* On 32-bit PowerPC 6xx/7xx/7xxx CPUs, we use a set of 16 VSIDs
@@ -45,19 +45,6 @@
#define LAST_CONTEXT 32767
#define FIRST_CONTEXT 1
-/*
- * This function defines the mapping from contexts to VSIDs (virtual
- * segment IDs). We use a skew on both the context and the high 4 bits
- * of the 32-bit virtual address (the "effective segment ID") in order
- * to spread out the entries in the MMU hash table. Note, if this
- * function is changed then arch/ppc/mm/hashtable.S will have to be
- * changed to correspond.
- *
- *
- * CTX_TO_VSID(ctx, va) (((ctx) * (897 * 16) + ((va) >> 28) * 0x111) \
- * & 0xffffff)
- */
-
static unsigned long next_mmu_context;
static unsigned long context_map[LAST_CONTEXT / BITS_PER_LONG + 1];
@@ -82,6 +69,12 @@ EXPORT_SYMBOL_GPL(__init_new_context);
int init_new_context(struct task_struct *t, struct mm_struct *mm)
{
mm->context.id = __init_new_context();
+ mm->context.sr0 = CTX_TO_VSID(mm->context.id, 0);
+
+ if (IS_ENABLED(CONFIG_PPC_KUEP))
+ mm->context.sr0 |= SR_NX;
+ if (!kuap_is_disabled())
+ mm->context.sr0 |= SR_KS;
return 0;
}
@@ -117,3 +110,25 @@ void __init mmu_context_init(void)
context_map[0] = (1 << FIRST_CONTEXT) - 1;
next_mmu_context = FIRST_CONTEXT;
}
+
+void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk)
+{
+ long id = next->context.id;
+
+ if (id < 0)
+ panic("mm_struct %p has no context ID", next);
+
+ isync();
+
+ update_user_segments(next->context.sr0);
+
+ if (IS_ENABLED(CONFIG_BDI_SWITCH))
+ abatron_pteptrs[1] = next->pgd;
+
+ if (!mmu_has_feature(MMU_FTR_HPTE_TABLE))
+ mtspr(SPRN_SDR1, rol32(__pa(next->pgd), 4) & 0xffff01ff);
+
+ mb(); /* sync */
+ isync();
+}
+EXPORT_SYMBOL(switch_mmu_context);
diff --git a/arch/powerpc/mm/book3s32/nohash_low.S b/arch/powerpc/mm/book3s32/nohash_low.S
new file mode 100644
index 000000000000..19f418b0ed2d
--- /dev/null
+++ b/arch/powerpc/mm/book3s32/nohash_low.S
@@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * This file contains low-level assembler routines for managing
+ * the PowerPC 603 tlb invalidation.
+ */
+
+#include <asm/page.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+
+/*
+ * Flush an entry from the TLB
+ */
+#ifdef CONFIG_SMP
+_GLOBAL(_tlbie)
+ lwz r8,TASK_CPU(r2)
+ oris r8,r8,11
+ mfmsr r10
+ rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */
+ rlwinm r0,r0,0,28,26 /* clear DR */
+ mtmsr r0
+ isync
+ lis r9,mmu_hash_lock@h
+ ori r9,r9,mmu_hash_lock@l
+ tophys(r9,r9)
+10: lwarx r7,0,r9
+ cmpwi 0,r7,0
+ bne- 10b
+ stwcx. r8,0,r9
+ bne- 10b
+ eieio
+ tlbie r3
+ sync
+ TLBSYNC
+ li r0,0
+ stw r0,0(r9) /* clear mmu_hash_lock */
+ mtmsr r10
+ isync
+ blr
+_ASM_NOKPROBE_SYMBOL(_tlbie)
+#endif /* CONFIG_SMP */
+
+/*
+ * Flush the entire TLB. 603/603e only
+ */
+_GLOBAL(_tlbia)
+#if defined(CONFIG_SMP)
+ lwz r8,TASK_CPU(r2)
+ oris r8,r8,10
+ mfmsr r10
+ rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */
+ rlwinm r0,r0,0,28,26 /* clear DR */
+ mtmsr r0
+ isync
+ lis r9,mmu_hash_lock@h
+ ori r9,r9,mmu_hash_lock@l
+ tophys(r9,r9)
+10: lwarx r7,0,r9
+ cmpwi 0,r7,0
+ bne- 10b
+ stwcx. r8,0,r9
+ bne- 10b
+#endif /* CONFIG_SMP */
+ li r5, 32
+ lis r4, KERNELBASE@h
+ mtctr r5
+ sync
+0: tlbie r4
+ addi r4, r4, 0x1000
+ bdnz 0b
+ sync
+#ifdef CONFIG_SMP
+ TLBSYNC
+ li r0,0
+ stw r0,0(r9) /* clear mmu_hash_lock */
+ mtmsr r10
+ isync
+#endif /* CONFIG_SMP */
+ blr
+_ASM_NOKPROBE_SYMBOL(_tlbia)
diff --git a/arch/powerpc/mm/book3s32/tlb.c b/arch/powerpc/mm/book3s32/tlb.c
new file mode 100644
index 000000000000..e54a7b011232
--- /dev/null
+++ b/arch/powerpc/mm/book3s32/tlb.c
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * This file contains the routines for TLB flushing.
+ * On machines where the MMU uses a hash table to store virtual to
+ * physical translations, these routines flush entries from the
+ * hash table also.
+ * -- paulus
+ *
+ * Derived from arch/ppc/mm/init.c:
+ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ * and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ * Copyright (C) 1996 Paul Mackerras
+ *
+ * Derived from "arch/i386/mm/init.c"
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/export.h>
+
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+
+#include <mm/mmu_decl.h>
+
+/*
+ * TLB flushing:
+ *
+ * - flush_tlb_mm(mm) flushes the specified mm context TLB's
+ * - flush_tlb_page(vma, vmaddr) flushes one page
+ * - flush_tlb_range(vma, start, end) flushes a range of pages
+ * - flush_tlb_kernel_range(start, end) flushes kernel pages
+ *
+ * since the hardware hash table functions as an extension of the
+ * tlb as far as the linux tables are concerned, flush it too.
+ * -- Cort
+ */
+
+/*
+ * For each address in the range, find the pte for the address
+ * and check _PAGE_HASHPTE bit; if it is set, find and destroy
+ * the corresponding HPTE.
+ */
+void hash__flush_range(struct mm_struct *mm, unsigned long start, unsigned long end)
+{
+ pmd_t *pmd;
+ unsigned long pmd_end;
+ int count;
+ unsigned int ctx = mm->context.id;
+
+ start &= PAGE_MASK;
+ if (start >= end)
+ return;
+ end = (end - 1) | ~PAGE_MASK;
+ pmd = pmd_off(mm, start);
+ for (;;) {
+ pmd_end = ((start + PGDIR_SIZE) & PGDIR_MASK) - 1;
+ if (pmd_end > end)
+ pmd_end = end;
+ if (!pmd_none(*pmd)) {
+ count = ((pmd_end - start) >> PAGE_SHIFT) + 1;
+ flush_hash_pages(ctx, start, pmd_val(*pmd), count);
+ }
+ if (pmd_end == end)
+ break;
+ start = pmd_end + 1;
+ ++pmd;
+ }
+}
+EXPORT_SYMBOL(hash__flush_range);
+
+/*
+ * Flush all the (user) entries for the address space described by mm.
+ */
+void hash__flush_tlb_mm(struct mm_struct *mm)
+{
+ struct vm_area_struct *mp;
+ VMA_ITERATOR(vmi, mm, 0);
+
+ /*
+ * It is safe to iterate the vmas when called from dup_mmap,
+ * holding mmap_lock. It would also be safe from unmap_region
+ * or exit_mmap, but not from vmtruncate on SMP - but it seems
+ * dup_mmap is the only SMP case which gets here.
+ */
+ for_each_vma(vmi, mp)
+ hash__flush_range(mp->vm_mm, mp->vm_start, mp->vm_end);
+}
+EXPORT_SYMBOL(hash__flush_tlb_mm);
+
+void hash__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+ struct mm_struct *mm;
+ pmd_t *pmd;
+
+ mm = (vmaddr < TASK_SIZE)? vma->vm_mm: &init_mm;
+ pmd = pmd_off(mm, vmaddr);
+ if (!pmd_none(*pmd))
+ flush_hash_pages(mm->context.id, vmaddr, pmd_val(*pmd), 1);
+}
+EXPORT_SYMBOL(hash__flush_tlb_page);
+
+void hash__flush_gather(struct mmu_gather *tlb)
+{
+ if (tlb->fullmm || tlb->need_flush_all)
+ hash__flush_tlb_mm(tlb->mm);
+ else
+ hash__flush_range(tlb->mm, tlb->start, tlb->end);
+}
+EXPORT_SYMBOL(hash__flush_gather);
diff --git a/arch/powerpc/mm/book3s64/Makefile b/arch/powerpc/mm/book3s64/Makefile
new file mode 100644
index 000000000000..33af5795856a
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/Makefile
@@ -0,0 +1,33 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-y += mmu_context.o pgtable.o trace.o
+ifdef CONFIG_PPC_64S_HASH_MMU
+CFLAGS_REMOVE_slb.o = $(CC_FLAGS_FTRACE)
+obj-y += hash_pgtable.o hash_utils.o hash_tlb.o slb.o slice.o
+obj-$(CONFIG_PPC_HASH_MMU_NATIVE) += hash_native.o
+obj-$(CONFIG_PPC_4K_PAGES) += hash_4k.o
+obj-$(CONFIG_PPC_64K_PAGES) += hash_64k.o
+obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hash_hugepage.o
+obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage_prot.o
+endif
+
+obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
+
+obj-$(CONFIG_PPC_RADIX_MMU) += radix_pgtable.o radix_tlb.o
+ifdef CONFIG_HUGETLB_PAGE
+obj-$(CONFIG_PPC_RADIX_MMU) += radix_hugetlbpage.o
+endif
+obj-$(CONFIG_SPAPR_TCE_IOMMU) += iommu_api.o
+obj-$(CONFIG_PPC_PKEY) += pkeys.o
+
+# Instrumenting the SLB fault path can lead to duplicate SLB entries
+KCOV_INSTRUMENT_slb.o := n
+
+# Parts of these can run in real mode and therefore are
+# not safe with the current outline KASAN implementation
+KASAN_SANITIZE_mmu_context.o := n
+KASAN_SANITIZE_pgtable.o := n
+KASAN_SANITIZE_radix_pgtable.o := n
+KASAN_SANITIZE_radix_tlb.o := n
+KASAN_SANITIZE_slb.o := n
+KASAN_SANITIZE_pkeys.o := n
diff --git a/arch/powerpc/mm/book3s64/hash_4k.c b/arch/powerpc/mm/book3s64/hash_4k.c
new file mode 100644
index 000000000000..02acbfd05b46
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/hash_4k.c
@@ -0,0 +1,129 @@
+/*
+ * Copyright IBM Corporation, 2015
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+#include <linux/mm.h>
+#include <asm/machdep.h>
+#include <asm/mmu.h>
+
+#include "internal.h"
+
+int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
+ pte_t *ptep, unsigned long trap, unsigned long flags,
+ int ssize, int subpg_prot)
+{
+ real_pte_t rpte;
+ unsigned long hpte_group;
+ unsigned long rflags, pa;
+ unsigned long old_pte, new_pte;
+ unsigned long vpn, hash, slot;
+ unsigned long shift = mmu_psize_defs[MMU_PAGE_4K].shift;
+
+ /*
+ * atomically mark the linux large page PTE busy and dirty
+ */
+ do {
+ pte_t pte = READ_ONCE(*ptep);
+
+ old_pte = pte_val(pte);
+ /* If PTE busy, retry the access */
+ if (unlikely(old_pte & H_PAGE_BUSY))
+ return 0;
+ /* If PTE permissions don't match, take page fault */
+ if (unlikely(!check_pte_access(access, old_pte)))
+ return 1;
+ /*
+ * Try to lock the PTE, add ACCESSED and DIRTY if it was
+ * a write access. Since this is 4K insert of 64K page size
+ * also add H_PAGE_COMBO
+ */
+ new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED;
+ if (access & _PAGE_WRITE)
+ new_pte |= _PAGE_DIRTY;
+ } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
+
+ /*
+ * PP bits. _PAGE_USER is already PP bit 0x2, so we only
+ * need to add in 0x1 if it's a read-only user page
+ */
+ rflags = htab_convert_pte_flags(new_pte, flags);
+ rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE);
+
+ if (cpu_has_feature(CPU_FTR_NOEXECUTE) &&
+ !cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
+ rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
+
+ vpn = hpt_vpn(ea, vsid, ssize);
+ if (unlikely(old_pte & H_PAGE_HASHPTE)) {
+ /*
+ * There MIGHT be an HPTE for this pte
+ */
+ unsigned long gslot = pte_get_hash_gslot(vpn, shift, ssize,
+ rpte, 0);
+
+ if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, MMU_PAGE_4K,
+ MMU_PAGE_4K, ssize, flags) == -1)
+ old_pte &= ~_PAGE_HPTEFLAGS;
+ }
+
+ if (likely(!(old_pte & H_PAGE_HASHPTE))) {
+
+ pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
+ hash = hpt_hash(vpn, shift, ssize);
+
+repeat:
+ hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+
+ /* Insert into the hash table, primary slot */
+ slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0,
+ MMU_PAGE_4K, MMU_PAGE_4K, ssize);
+ /*
+ * Primary is full, try the secondary
+ */
+ if (unlikely(slot == -1)) {
+ hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa,
+ rflags,
+ HPTE_V_SECONDARY,
+ MMU_PAGE_4K,
+ MMU_PAGE_4K, ssize);
+ if (slot == -1) {
+ if (mftb() & 0x1)
+ hpte_group = (hash & htab_hash_mask) *
+ HPTES_PER_GROUP;
+ mmu_hash_ops.hpte_remove(hpte_group);
+ /*
+ * FIXME!! Should be try the group from which we removed ?
+ */
+ goto repeat;
+ }
+ }
+ /*
+ * Hypervisor failure. Restore old pte and return -1
+ * similar to __hash_page_*
+ */
+ if (unlikely(slot == -2)) {
+ *ptep = __pte(old_pte);
+ hash_failure_debug(ea, access, vsid, trap, ssize,
+ MMU_PAGE_4K, MMU_PAGE_4K, old_pte);
+ return -1;
+ }
+ new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
+ new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE);
+
+ if (stress_hpt())
+ hpt_do_stress(ea, hpte_group);
+ }
+ *ptep = __pte(new_pte & ~H_PAGE_BUSY);
+ return 0;
+}
diff --git a/arch/powerpc/mm/book3s64/hash_64k.c b/arch/powerpc/mm/book3s64/hash_64k.c
new file mode 100644
index 000000000000..954af420f358
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/hash_64k.c
@@ -0,0 +1,343 @@
+/*
+ * Copyright IBM Corporation, 2015
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+#include <linux/mm.h>
+#include <asm/machdep.h>
+#include <asm/mmu.h>
+
+#include "internal.h"
+
+/*
+ * Return true, if the entry has a slot value which
+ * the software considers as invalid.
+ */
+static inline bool hpte_soft_invalid(unsigned long hidx)
+{
+ return ((hidx & 0xfUL) == 0xfUL);
+}
+
+/*
+ * index from 0 - 15
+ */
+bool __rpte_sub_valid(real_pte_t rpte, unsigned long index)
+{
+ return !(hpte_soft_invalid(__rpte_to_hidx(rpte, index)));
+}
+
+int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
+ pte_t *ptep, unsigned long trap, unsigned long flags,
+ int ssize, int subpg_prot)
+{
+ real_pte_t rpte;
+ unsigned long hpte_group;
+ unsigned int subpg_index;
+ unsigned long rflags, pa;
+ unsigned long old_pte, new_pte, subpg_pte;
+ unsigned long vpn, hash, slot, gslot;
+ unsigned long shift = mmu_psize_defs[MMU_PAGE_4K].shift;
+
+ /*
+ * atomically mark the linux large page PTE busy and dirty
+ */
+ do {
+ pte_t pte = READ_ONCE(*ptep);
+
+ old_pte = pte_val(pte);
+ /* If PTE busy, retry the access */
+ if (unlikely(old_pte & H_PAGE_BUSY))
+ return 0;
+ /* If PTE permissions don't match, take page fault */
+ if (unlikely(!check_pte_access(access, old_pte)))
+ return 1;
+ /*
+ * Try to lock the PTE, add ACCESSED and DIRTY if it was
+ * a write access. Since this is 4K insert of 64K page size
+ * also add H_PAGE_COMBO
+ */
+ new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED | H_PAGE_COMBO;
+ if (access & _PAGE_WRITE)
+ new_pte |= _PAGE_DIRTY;
+ } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
+
+ /*
+ * Handle the subpage protection bits
+ */
+ subpg_pte = new_pte & ~subpg_prot;
+ rflags = htab_convert_pte_flags(subpg_pte, flags);
+
+ if (cpu_has_feature(CPU_FTR_NOEXECUTE) &&
+ !cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) {
+
+ /*
+ * No CPU has hugepages but lacks no execute, so we
+ * don't need to worry about that case
+ */
+ rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
+ }
+
+ subpg_index = (ea & (PAGE_SIZE - 1)) >> shift;
+ vpn = hpt_vpn(ea, vsid, ssize);
+ rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE);
+ /*
+ *None of the sub 4k page is hashed
+ */
+ if (!(old_pte & H_PAGE_HASHPTE))
+ goto htab_insert_hpte;
+ /*
+ * Check if the pte was already inserted into the hash table
+ * as a 64k HW page, and invalidate the 64k HPTE if so.
+ */
+ if (!(old_pte & H_PAGE_COMBO)) {
+ flush_hash_page(vpn, rpte, MMU_PAGE_64K, ssize, flags);
+ /*
+ * clear the old slot details from the old and new pte.
+ * On hash insert failure we use old pte value and we don't
+ * want slot information there if we have a insert failure.
+ */
+ old_pte &= ~H_PAGE_HASHPTE;
+ new_pte &= ~H_PAGE_HASHPTE;
+ goto htab_insert_hpte;
+ }
+ /*
+ * Check for sub page valid and update
+ */
+ if (__rpte_sub_valid(rpte, subpg_index)) {
+ int ret;
+
+ gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte,
+ subpg_index);
+ ret = mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn,
+ MMU_PAGE_4K, MMU_PAGE_4K,
+ ssize, flags);
+
+ /*
+ * If we failed because typically the HPTE wasn't really here
+ * we try an insertion.
+ */
+ if (ret == -1)
+ goto htab_insert_hpte;
+
+ *ptep = __pte(new_pte & ~H_PAGE_BUSY);
+ return 0;
+ }
+
+htab_insert_hpte:
+
+ /*
+ * Initialize all hidx entries to invalid value, the first time
+ * the PTE is about to allocate a 4K HPTE.
+ */
+ if (!(old_pte & H_PAGE_COMBO))
+ rpte.hidx = INVALID_RPTE_HIDX;
+
+ /*
+ * handle H_PAGE_4K_PFN case
+ */
+ if (old_pte & H_PAGE_4K_PFN) {
+ /*
+ * All the sub 4k page have the same
+ * physical address.
+ */
+ pa = pte_pfn(__pte(old_pte)) << HW_PAGE_SHIFT;
+ } else {
+ pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
+ pa += (subpg_index << shift);
+ }
+ hash = hpt_hash(vpn, shift, ssize);
+repeat:
+ hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+
+ /* Insert into the hash table, primary slot */
+ slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0,
+ MMU_PAGE_4K, MMU_PAGE_4K, ssize);
+ /*
+ * Primary is full, try the secondary
+ */
+ if (unlikely(slot == -1)) {
+ bool soft_invalid;
+
+ hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa,
+ rflags, HPTE_V_SECONDARY,
+ MMU_PAGE_4K, MMU_PAGE_4K,
+ ssize);
+
+ soft_invalid = hpte_soft_invalid(slot);
+ if (unlikely(soft_invalid)) {
+ /*
+ * We got a valid slot from a hardware point of view.
+ * but we cannot use it, because we use this special
+ * value; as defined by hpte_soft_invalid(), to track
+ * invalid slots. We cannot use it. So invalidate it.
+ */
+ gslot = slot & _PTEIDX_GROUP_IX;
+ mmu_hash_ops.hpte_invalidate(hpte_group + gslot, vpn,
+ MMU_PAGE_4K, MMU_PAGE_4K,
+ ssize, 0);
+ }
+
+ if (unlikely(slot == -1 || soft_invalid)) {
+ /*
+ * For soft invalid slot, let's ensure that we release a
+ * slot from the primary, with the hope that we will
+ * acquire that slot next time we try. This will ensure
+ * that we do not get the same soft-invalid slot.
+ */
+ if (soft_invalid || (mftb() & 0x1))
+ hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+
+ mmu_hash_ops.hpte_remove(hpte_group);
+ /*
+ * FIXME!! Should be try the group from which we removed ?
+ */
+ goto repeat;
+ }
+ }
+ /*
+ * Hypervisor failure. Restore old pte and return -1
+ * similar to __hash_page_*
+ */
+ if (unlikely(slot == -2)) {
+ *ptep = __pte(old_pte);
+ hash_failure_debug(ea, access, vsid, trap, ssize,
+ MMU_PAGE_4K, MMU_PAGE_4K, old_pte);
+ return -1;
+ }
+
+ new_pte |= pte_set_hidx(ptep, rpte, subpg_index, slot, PTRS_PER_PTE);
+ new_pte |= H_PAGE_HASHPTE;
+
+ if (stress_hpt())
+ hpt_do_stress(ea, hpte_group);
+
+ *ptep = __pte(new_pte & ~H_PAGE_BUSY);
+ return 0;
+}
+
+int __hash_page_64K(unsigned long ea, unsigned long access,
+ unsigned long vsid, pte_t *ptep, unsigned long trap,
+ unsigned long flags, int ssize)
+{
+ real_pte_t rpte;
+ unsigned long hpte_group;
+ unsigned long rflags, pa;
+ unsigned long old_pte, new_pte;
+ unsigned long vpn, hash, slot;
+ unsigned long shift = mmu_psize_defs[MMU_PAGE_64K].shift;
+
+ /*
+ * atomically mark the linux large page PTE busy and dirty
+ */
+ do {
+ pte_t pte = READ_ONCE(*ptep);
+
+ old_pte = pte_val(pte);
+ /* If PTE busy, retry the access */
+ if (unlikely(old_pte & H_PAGE_BUSY))
+ return 0;
+ /* If PTE permissions don't match, take page fault */
+ if (unlikely(!check_pte_access(access, old_pte)))
+ return 1;
+ /*
+ * Check if PTE has the cache-inhibit bit set
+ * If so, bail out and refault as a 4k page
+ */
+ if (!mmu_has_feature(MMU_FTR_CI_LARGE_PAGE) &&
+ unlikely(pte_ci(pte)))
+ return 0;
+ /*
+ * Try to lock the PTE, add ACCESSED and DIRTY if it was
+ * a write access.
+ */
+ new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED;
+ if (access & _PAGE_WRITE)
+ new_pte |= _PAGE_DIRTY;
+ } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
+
+ rflags = htab_convert_pte_flags(new_pte, flags);
+ rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE);
+
+ if (cpu_has_feature(CPU_FTR_NOEXECUTE) &&
+ !cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
+ rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
+
+ vpn = hpt_vpn(ea, vsid, ssize);
+ if (unlikely(old_pte & H_PAGE_HASHPTE)) {
+ unsigned long gslot;
+
+ /*
+ * There MIGHT be an HPTE for this pte
+ */
+ gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, 0);
+ if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, MMU_PAGE_64K,
+ MMU_PAGE_64K, ssize,
+ flags) == -1)
+ old_pte &= ~_PAGE_HPTEFLAGS;
+ }
+
+ if (likely(!(old_pte & H_PAGE_HASHPTE))) {
+
+ pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
+ hash = hpt_hash(vpn, shift, ssize);
+
+repeat:
+ hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+
+ /* Insert into the hash table, primary slot */
+ slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0,
+ MMU_PAGE_64K, MMU_PAGE_64K,
+ ssize);
+ /*
+ * Primary is full, try the secondary
+ */
+ if (unlikely(slot == -1)) {
+ hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa,
+ rflags,
+ HPTE_V_SECONDARY,
+ MMU_PAGE_64K,
+ MMU_PAGE_64K, ssize);
+ if (slot == -1) {
+ if (mftb() & 0x1)
+ hpte_group = (hash & htab_hash_mask) *
+ HPTES_PER_GROUP;
+ mmu_hash_ops.hpte_remove(hpte_group);
+ /*
+ * FIXME!! Should be try the group from which we removed ?
+ */
+ goto repeat;
+ }
+ }
+ /*
+ * Hypervisor failure. Restore old pte and return -1
+ * similar to __hash_page_*
+ */
+ if (unlikely(slot == -2)) {
+ *ptep = __pte(old_pte);
+ hash_failure_debug(ea, access, vsid, trap, ssize,
+ MMU_PAGE_64K, MMU_PAGE_64K, old_pte);
+ return -1;
+ }
+
+ new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
+ new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE);
+
+ if (stress_hpt())
+ hpt_do_stress(ea, hpte_group);
+ }
+
+ *ptep = __pte(new_pte & ~H_PAGE_BUSY);
+
+ return 0;
+}
diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/book3s64/hash_hugepage.c
index 34de9e0cdc34..cdfd4fe75edb 100644
--- a/arch/powerpc/mm/hugepage-hash64.c
+++ b/arch/powerpc/mm/book3s64/hash_hugepage.c
@@ -1,6 +1,6 @@
/*
* Copyright IBM Corporation, 2013
- * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of version 2.1 of the GNU Lesser General Public License
@@ -19,8 +19,8 @@
#include <asm/machdep.h>
int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
- pmd_t *pmdp, unsigned long trap, int local, int ssize,
- unsigned int psize)
+ pmd_t *pmdp, unsigned long trap, unsigned long flags,
+ int ssize, unsigned int psize)
{
unsigned int index, valid;
unsigned char *hpte_slot_array;
@@ -33,70 +33,81 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
* atomically mark the linux large page PMD busy and dirty
*/
do {
- old_pmd = pmd_val(*pmdp);
+ pmd_t pmd = READ_ONCE(*pmdp);
+
+ old_pmd = pmd_val(pmd);
/* If PMD busy, retry the access */
- if (unlikely(old_pmd & _PAGE_BUSY))
- return 0;
- /* If PMD is trans splitting retry the access */
- if (unlikely(old_pmd & _PAGE_SPLITTING))
+ if (unlikely(old_pmd & H_PAGE_BUSY))
return 0;
/* If PMD permissions don't match, take page fault */
- if (unlikely(access & ~old_pmd))
+ if (unlikely(!check_pte_access(access, old_pmd)))
return 1;
/*
* Try to lock the PTE, add ACCESSED and DIRTY if it was
* a write access
*/
- new_pmd = old_pmd | _PAGE_BUSY | _PAGE_ACCESSED;
- if (access & _PAGE_RW)
+ new_pmd = old_pmd | H_PAGE_BUSY | _PAGE_ACCESSED;
+ if (access & _PAGE_WRITE)
new_pmd |= _PAGE_DIRTY;
- } while (old_pmd != __cmpxchg_u64((unsigned long *)pmdp,
- old_pmd, new_pmd));
+ } while (!pmd_xchg(pmdp, __pmd(old_pmd), __pmd(new_pmd)));
+
/*
- * PP bits. _PAGE_USER is already PP bit 0x2, so we only
- * need to add in 0x1 if it's a read-only user page
+ * Make sure this is thp or devmap entry
*/
- rflags = new_pmd & _PAGE_USER;
- if ((new_pmd & _PAGE_USER) && !((new_pmd & _PAGE_RW) &&
- (new_pmd & _PAGE_DIRTY)))
- rflags |= 0x1;
+ if (!(old_pmd & H_PAGE_THP_HUGE))
+ return 0;
+
+ rflags = htab_convert_pte_flags(new_pmd, flags);
+
/*
- * _PAGE_EXEC -> HW_NO_EXEC since it's inverted
+ * THPs are only supported on platforms that can do mixed page size
+ * segments (MPSS) and all such platforms have coherent icache. Hence we
+ * don't need to do lazy icache flush (hash_page_do_lazy_icache()) on
+ * noexecute fault.
*/
- rflags |= ((new_pmd & _PAGE_EXEC) ? 0 : HPTE_R_N);
-
-#if 0
- if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) {
- /*
- * No CPU has hugepages but lacks no execute, so we
- * don't need to worry about that case
- */
- rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
- }
-#endif
/*
* Find the slot index details for this ea, using base page size.
*/
shift = mmu_psize_defs[psize].shift;
index = (ea & ~HPAGE_PMD_MASK) >> shift;
- BUG_ON(index >= 4096);
+ BUG_ON(index >= PTE_FRAG_SIZE);
vpn = hpt_vpn(ea, vsid, ssize);
- hash = hpt_hash(vpn, shift, ssize);
hpte_slot_array = get_hpte_slot_array(pmdp);
+ if (psize == MMU_PAGE_4K) {
+ /*
+ * invalidate the old hpte entry if we have that mapped via 64K
+ * base page size. This is because demote_segment won't flush
+ * hash page table entries.
+ */
+ if ((old_pmd & H_PAGE_HASHPTE) && !(old_pmd & H_PAGE_COMBO)) {
+ flush_hash_hugepage(vsid, ea, pmdp, MMU_PAGE_64K,
+ ssize, flags);
+ /*
+ * With THP, we also clear the slot information with
+ * respect to all the 64K hash pte mapping the 16MB
+ * page. They are all invalid now. This make sure we
+ * don't find the slot valid when we fault with 4k
+ * base page size.
+ *
+ */
+ memset(hpte_slot_array, 0, PTE_FRAG_SIZE);
+ }
+ }
valid = hpte_valid(hpte_slot_array, index);
if (valid) {
/* update the hpte bits */
+ hash = hpt_hash(vpn, shift, ssize);
hidx = hpte_hash_index(hpte_slot_array, index);
if (hidx & _PTEIDX_SECONDARY)
hash = ~hash;
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
slot += hidx & _PTEIDX_GROUP_IX;
- ret = ppc_md.hpte_updatepp(slot, rflags, vpn,
- psize, lpsize, ssize, local);
+ ret = mmu_hash_ops.hpte_updatepp(slot, rflags, vpn,
+ psize, lpsize, ssize, flags);
/*
* We failed to update, try to insert a new entry.
*/
@@ -107,46 +118,39 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
* safely update this here.
*/
valid = 0;
- new_pmd &= ~_PAGE_HPTEFLAGS;
hpte_slot_array[index] = 0;
- } else
- /* clear the busy bits and set the hash pte bits */
- new_pmd = (new_pmd & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
+ }
}
if (!valid) {
unsigned long hpte_group;
+ hash = hpt_hash(vpn, shift, ssize);
/* insert new entry */
pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT;
-repeat:
- hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
-
- /* clear the busy bits and set the hash pte bits */
- new_pmd = (new_pmd & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
+ new_pmd |= H_PAGE_HASHPTE;
- /* Add in WIMG bits */
- rflags |= (new_pmd & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
- _PAGE_COHERENT | _PAGE_GUARDED));
+repeat:
+ hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
/* Insert into the hash table, primary slot */
- slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0,
- psize, lpsize, ssize);
+ slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0,
+ psize, lpsize, ssize);
/*
* Primary is full, try the secondary
*/
if (unlikely(slot == -1)) {
- hpte_group = ((~hash & htab_hash_mask) *
- HPTES_PER_GROUP) & ~0x7UL;
- slot = ppc_md.hpte_insert(hpte_group, vpn, pa,
- rflags, HPTE_V_SECONDARY,
- psize, lpsize, ssize);
+ hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa,
+ rflags,
+ HPTE_V_SECONDARY,
+ psize, lpsize, ssize);
if (slot == -1) {
if (mftb() & 0x1)
- hpte_group = ((hash & htab_hash_mask) *
- HPTES_PER_GROUP) & ~0x7UL;
+ hpte_group = (hash & htab_hash_mask) *
+ HPTES_PER_GROUP;
- ppc_md.hpte_remove(hpte_group);
+ mmu_hash_ops.hpte_remove(hpte_group);
goto repeat;
}
}
@@ -168,8 +172,17 @@ repeat:
mark_hpte_slot_valid(hpte_slot_array, index, slot);
}
/*
- * No need to use ldarx/stdcx here
+ * Mark the pte with H_PAGE_COMBO, if we are trying to hash it with
+ * base page size 4k.
+ */
+ if (psize == MMU_PAGE_4K)
+ new_pmd |= H_PAGE_COMBO;
+ /*
+ * The hpte valid is stored in the pgtable whose address is in the
+ * second half of the PMD. Order this against clearing of the busy bit in
+ * huge pmd.
*/
- *pmdp = __pmd(new_pmd & ~_PAGE_BUSY);
+ smp_wmb();
+ *pmdp = __pmd(new_pmd & ~H_PAGE_BUSY);
return 0;
}
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/book3s64/hash_native.c
index 3f0c30ae4791..e9e2dd70c060 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/book3s64/hash_native.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* native hashtable management.
*
* SMP scalability work:
* Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#undef DEBUG_LOW
@@ -15,19 +11,21 @@
#include <linux/spinlock.h>
#include <linux/bitops.h>
#include <linux/of.h>
+#include <linux/processor.h>
#include <linux/threads.h>
#include <linux/smp.h>
+#include <linux/pgtable.h>
#include <asm/machdep.h>
#include <asm/mmu.h>
#include <asm/mmu_context.h>
-#include <asm/pgtable.h>
-#include <asm/tlbflush.h>
+#include <asm/trace.h>
#include <asm/tlb.h>
#include <asm/cputable.h>
#include <asm/udbg.h>
#include <asm/kexec.h>
#include <asm/ppc-opcode.h>
+#include <asm/feature-fixups.h>
#ifdef DEBUG_LOW
#define DBG_LOW(fmt...) udbg_printf(fmt)
@@ -35,20 +33,49 @@
#define DBG_LOW(fmt...)
#endif
+#ifdef __BIG_ENDIAN__
#define HPTE_LOCK_BIT 3
+#else
+#define HPTE_LOCK_BIT (56+3)
+#endif
-DEFINE_RAW_SPINLOCK(native_tlbie_lock);
+static DEFINE_RAW_SPINLOCK(native_tlbie_lock);
-static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize)
+#ifdef CONFIG_LOCKDEP
+static struct lockdep_map hpte_lock_map =
+ STATIC_LOCKDEP_MAP_INIT("hpte_lock", &hpte_lock_map);
+
+static void acquire_hpte_lock(void)
+{
+ lock_map_acquire(&hpte_lock_map);
+}
+
+static void release_hpte_lock(void)
+{
+ lock_map_release(&hpte_lock_map);
+}
+#else
+static void acquire_hpte_lock(void)
+{
+}
+
+static void release_hpte_lock(void)
+{
+}
+#endif
+
+static inline unsigned long ___tlbie(unsigned long vpn, int psize,
+ int apsize, int ssize)
{
unsigned long va;
unsigned int penc;
+ unsigned long sllp;
/*
* We need 14 to 65 bits of va for a tlibe of 4K page
* With vpn we ignore the lower VPN_SHIFT bits already.
* And top two bits are already ignored because we can
- * only accomadate 76 bits in a 64 bit vpn with a VPN_SHIFT
+ * only accomodate 76 bits in a 64 bit vpn with a VPN_SHIFT
* of 12.
*/
va = vpn << VPN_SHIFT;
@@ -57,14 +84,16 @@ static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize)
* Older versions of the architecture (2.02 and earler) require the
* masking of the top 16 bits.
*/
- va &= ~(0xffffULL << 48);
+ if (mmu_has_feature(MMU_FTR_TLBIE_CROP_VA))
+ va &= ~(0xffffULL << 48);
switch (psize) {
case MMU_PAGE_4K:
/* clear out bits after (52) [0....52.....63] */
va &= ~((1ul << (64 - 52)) - 1);
va |= ssize << 8;
- va |= mmu_psize_defs[apsize].sllp << 6;
+ sllp = get_sllp_encoding(apsize);
+ va |= sllp << 5;
asm volatile(ASM_FTR_IFCLR("tlbie %0,0", PPC_TLBIE(%1,%0), %2)
: : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206)
: "memory");
@@ -75,29 +104,68 @@ static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize)
va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1);
va |= penc << 12;
va |= ssize << 8;
- /* Add AVAL part */
- if (psize != apsize) {
- /*
- * MPSS, 64K base page size and 16MB parge page size
- * We don't need all the bits, but rest of the bits
- * must be ignored by the processor.
- * vpn cover upto 65 bits of va. (0...65) and we need
- * 58..64 bits of va.
- */
- va |= (vpn & 0xfe);
- }
+ /*
+ * AVAL bits:
+ * We don't need all the bits, but rest of the bits
+ * must be ignored by the processor.
+ * vpn cover upto 65 bits of va. (0...65) and we need
+ * 58..64 bits of va.
+ */
+ va |= (vpn & 0xfe); /* AVAL */
va |= 1; /* L */
asm volatile(ASM_FTR_IFCLR("tlbie %0,1", PPC_TLBIE(%1,%0), %2)
: : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206)
: "memory");
break;
}
+ return va;
+}
+
+static inline void fixup_tlbie_vpn(unsigned long vpn, int psize,
+ int apsize, int ssize)
+{
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+ /* Radix flush for a hash guest */
+
+ unsigned long rb,rs,prs,r,ric;
+
+ rb = PPC_BIT(52); /* IS = 2 */
+ rs = 0; /* lpid = 0 */
+ prs = 0; /* partition scoped */
+ r = 1; /* radix format */
+ ric = 0; /* RIC_FLSUH_TLB */
+
+ /*
+ * Need the extra ptesync to make sure we don't
+ * re-order the tlbie
+ */
+ asm volatile("ptesync": : :"memory");
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs),
+ "i"(ric), "r"(rs) : "memory");
+ }
+
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+ /* Need the extra ptesync to ensure we don't reorder tlbie*/
+ asm volatile("ptesync": : :"memory");
+ ___tlbie(vpn, psize, apsize, ssize);
+ }
+}
+
+static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize)
+{
+ unsigned long rb;
+
+ rb = ___tlbie(vpn, psize, apsize, ssize);
+ trace_tlbie(0, 0, rb, 0, 0, 0, 0);
}
static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize)
{
unsigned long va;
unsigned int penc;
+ unsigned long sllp;
/* VPN_SHIFT can be atmost 12 */
va = vpn << VPN_SHIFT;
@@ -106,16 +174,19 @@ static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize)
* Older versions of the architecture (2.02 and earler) require the
* masking of the top 16 bits.
*/
- va &= ~(0xffffULL << 48);
+ if (mmu_has_feature(MMU_FTR_TLBIE_CROP_VA))
+ va &= ~(0xffffULL << 48);
switch (psize) {
case MMU_PAGE_4K:
/* clear out bits after(52) [0....52.....63] */
va &= ~((1ul << (64 - 52)) - 1);
va |= ssize << 8;
- va |= mmu_psize_defs[apsize].sllp << 6;
- asm volatile(".long 0x7c000224 | (%0 << 11) | (0 << 21)"
- : : "r"(va) : "memory");
+ sllp = get_sllp_encoding(apsize);
+ va |= sllp << 5;
+ asm volatile(ASM_FTR_IFSET("tlbiel %0", PPC_TLBIEL_v205(%0, 0), %1)
+ : : "r" (va), "i" (CPU_FTR_ARCH_206)
+ : "memory");
break;
default:
/* We need 14 to 14 + i bits of va */
@@ -123,22 +194,21 @@ static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize)
va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1);
va |= penc << 12;
va |= ssize << 8;
- /* Add AVAL part */
- if (psize != apsize) {
- /*
- * MPSS, 64K base page size and 16MB parge page size
- * We don't need all the bits, but rest of the bits
- * must be ignored by the processor.
- * vpn cover upto 65 bits of va. (0...65) and we need
- * 58..64 bits of va.
- */
- va |= (vpn & 0xfe);
- }
+ /*
+ * AVAL bits:
+ * We don't need all the bits, but rest of the bits
+ * must be ignored by the processor.
+ * vpn cover upto 65 bits of va. (0...65) and we need
+ * 58..64 bits of va.
+ */
+ va |= (vpn & 0xfe);
va |= 1; /* L */
- asm volatile(".long 0x7c000224 | (%0 << 11) | (1 << 21)"
- : : "r"(va) : "memory");
+ asm volatile(ASM_FTR_IFSET("tlbiel %0", PPC_TLBIEL_v205(%0, 1), %1)
+ : : "r" (va), "i" (CPU_FTR_ARCH_206)
+ : "memory");
break;
}
+ trace_tlbie(0, 1, va, 0, 0, 0, 0);
}
@@ -155,9 +225,10 @@ static inline void tlbie(unsigned long vpn, int psize, int apsize,
asm volatile("ptesync": : :"memory");
if (use_local) {
__tlbiel(vpn, psize, apsize, ssize);
- asm volatile("ptesync": : :"memory");
+ ppc_after_tlbiel_barrier();
} else {
__tlbie(vpn, psize, apsize, ssize);
+ fixup_tlbie_vpn(vpn, psize, apsize, ssize);
asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
if (lock_tlbie && !use_local)
@@ -166,20 +237,24 @@ static inline void tlbie(unsigned long vpn, int psize, int apsize,
static inline void native_lock_hpte(struct hash_pte *hptep)
{
- unsigned long *word = &hptep->v;
+ unsigned long *word = (unsigned long *)&hptep->v;
+ acquire_hpte_lock();
while (1) {
if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word))
break;
+ spin_begin();
while(test_bit(HPTE_LOCK_BIT, word))
- cpu_relax();
+ spin_cpu_relax();
+ spin_end();
}
}
static inline void native_unlock_hpte(struct hash_pte *hptep)
{
- unsigned long *word = &hptep->v;
+ unsigned long *word = (unsigned long *)&hptep->v;
+ release_hpte_lock();
clear_bit_unlock(HPTE_LOCK_BIT, word);
}
@@ -189,8 +264,11 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
{
struct hash_pte *hptep = htab_address + hpte_group;
unsigned long hpte_v, hpte_r;
+ unsigned long flags;
int i;
+ local_irq_save(flags);
+
if (!(vflags & HPTE_V_BOLTED)) {
DBG_LOW(" insert(group=%lx, vpn=%016lx, pa=%016lx,"
" rflags=%lx, vflags=%lx, psize=%d)\n",
@@ -198,10 +276,10 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
}
for (i = 0; i < HPTES_PER_GROUP; i++) {
- if (! (hptep->v & HPTE_V_VALID)) {
+ if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID)) {
/* retry with lock held */
native_lock_hpte(hptep);
- if (! (hptep->v & HPTE_V_VALID))
+ if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID))
break;
native_unlock_hpte(hptep);
}
@@ -209,8 +287,10 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
hptep++;
}
- if (i == HPTES_PER_GROUP)
+ if (i == HPTES_PER_GROUP) {
+ local_irq_restore(flags);
return -1;
+ }
hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
@@ -220,26 +300,36 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
i, hpte_v, hpte_r);
}
- hptep->r = hpte_r;
+ if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+ hpte_r = hpte_old_to_new_r(hpte_v, hpte_r);
+ hpte_v = hpte_old_to_new_v(hpte_v);
+ }
+
+ hptep->r = cpu_to_be64(hpte_r);
/* Guarantee the second dword is visible before the valid bit */
eieio();
/*
* Now set the first dword including the valid bit
* NOTE: this also unlocks the hpte
*/
- hptep->v = hpte_v;
+ release_hpte_lock();
+ hptep->v = cpu_to_be64(hpte_v);
__asm__ __volatile__ ("ptesync" : : : "memory");
+ local_irq_restore(flags);
+
return i | (!!(vflags & HPTE_V_SECONDARY) << 3);
}
static long native_hpte_remove(unsigned long hpte_group)
{
+ unsigned long hpte_v, flags;
struct hash_pte *hptep;
int i;
int slot_offset;
- unsigned long hpte_v;
+
+ local_irq_save(flags);
DBG_LOW(" remove(group=%lx)\n", hpte_group);
@@ -248,12 +338,12 @@ static long native_hpte_remove(unsigned long hpte_group)
for (i = 0; i < HPTES_PER_GROUP; i++) {
hptep = htab_address + hpte_group + slot_offset;
- hpte_v = hptep->v;
+ hpte_v = be64_to_cpu(hptep->v);
if ((hpte_v & HPTE_V_VALID) && !(hpte_v & HPTE_V_BOLTED)) {
/* retry with lock held */
native_lock_hpte(hptep);
- hpte_v = hptep->v;
+ hpte_v = be64_to_cpu(hptep->v);
if ((hpte_v & HPTE_V_VALID)
&& !(hpte_v & HPTE_V_BOLTED))
break;
@@ -264,31 +354,36 @@ static long native_hpte_remove(unsigned long hpte_group)
slot_offset &= 0x7;
}
- if (i == HPTES_PER_GROUP)
- return -1;
+ if (i == HPTES_PER_GROUP) {
+ i = -1;
+ goto out;
+ }
/* Invalidate the hpte. NOTE: this also unlocks it */
+ release_hpte_lock();
hptep->v = 0;
-
+out:
+ local_irq_restore(flags);
return i;
}
static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
unsigned long vpn, int bpsize,
- int apsize, int ssize, int local)
+ int apsize, int ssize, unsigned long flags)
{
struct hash_pte *hptep = htab_address + slot;
unsigned long hpte_v, want_v;
- int ret = 0;
+ int ret = 0, local = 0;
+ unsigned long irqflags;
+
+ local_irq_save(irqflags);
want_v = hpte_encode_avpn(vpn, bpsize, ssize);
DBG_LOW(" update(vpn=%016lx, avpnv=%016lx, group=%lx, newpp=%lx)",
vpn, want_v & HPTE_V_AVPN, slot, newpp);
- native_lock_hpte(hptep);
-
- hpte_v = hptep->v;
+ hpte_v = hpte_get_old_v(hptep);
/*
* We need to invalidate the TLB always because hpte_remove doesn't do
* a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
@@ -300,36 +395,46 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
DBG_LOW(" -> miss\n");
ret = -1;
} else {
- DBG_LOW(" -> hit\n");
- /* Update the HPTE */
- hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) |
- (newpp & (HPTE_R_PP | HPTE_R_N | HPTE_R_C));
+ native_lock_hpte(hptep);
+ /* recheck with locks held */
+ hpte_v = hpte_get_old_v(hptep);
+ if (unlikely(!HPTE_V_COMPARE(hpte_v, want_v) ||
+ !(hpte_v & HPTE_V_VALID))) {
+ ret = -1;
+ } else {
+ DBG_LOW(" -> hit\n");
+ /* Update the HPTE */
+ hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) &
+ ~(HPTE_R_PPP | HPTE_R_N)) |
+ (newpp & (HPTE_R_PPP | HPTE_R_N |
+ HPTE_R_C)));
+ }
+ native_unlock_hpte(hptep);
}
- native_unlock_hpte(hptep);
- /* Ensure it is out of the tlb too. */
- tlbie(vpn, bpsize, apsize, ssize, local);
+ if (flags & HPTE_LOCAL_UPDATE)
+ local = 1;
+ /*
+ * Ensure it is out of the tlb too if it is not a nohpte fault
+ */
+ if (!(flags & HPTE_NOHPTE_UPDATE))
+ tlbie(vpn, bpsize, apsize, ssize, local);
+
+ local_irq_restore(irqflags);
return ret;
}
-static long native_hpte_find(unsigned long vpn, int psize, int ssize)
+static long __native_hpte_find(unsigned long want_v, unsigned long slot)
{
struct hash_pte *hptep;
- unsigned long hash;
+ unsigned long hpte_v;
unsigned long i;
- long slot;
- unsigned long want_v, hpte_v;
- hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
- want_v = hpte_encode_avpn(vpn, psize, ssize);
-
- /* Bolted mappings are only ever in the primary group */
- slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
for (i = 0; i < HPTES_PER_GROUP; i++) {
- hptep = htab_address + slot;
- hpte_v = hptep->v;
+ hptep = htab_address + slot;
+ hpte_v = hpte_get_old_v(hptep);
if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID))
/* HPTE matches */
return slot;
@@ -339,6 +444,33 @@ static long native_hpte_find(unsigned long vpn, int psize, int ssize)
return -1;
}
+static long native_hpte_find(unsigned long vpn, int psize, int ssize)
+{
+ unsigned long hpte_group;
+ unsigned long want_v;
+ unsigned long hash;
+ long slot;
+
+ hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
+ want_v = hpte_encode_avpn(vpn, psize, ssize);
+
+ /*
+ * We try to keep bolted entries always in primary hash
+ * But in some case we can find them in secondary too.
+ */
+ hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot = __native_hpte_find(want_v, hpte_group);
+ if (slot < 0) {
+ /* Try in secondary */
+ hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot = __native_hpte_find(want_v, hpte_group);
+ if (slot < 0)
+ return -1;
+ }
+
+ return slot;
+}
+
/*
* Update the page protection bits. Intended to be used to create
* guard pages for kernel data structures on pages which are bolted
@@ -353,6 +485,9 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
unsigned long vsid;
long slot;
struct hash_pte *hptep;
+ unsigned long flags;
+
+ local_irq_save(flags);
vsid = get_kernel_vsid(ea, ssize);
vpn = hpt_vpn(ea, vsid, ssize);
@@ -363,15 +498,56 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
hptep = htab_address + slot;
/* Update the HPTE */
- hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) |
- (newpp & (HPTE_R_PP | HPTE_R_N));
+ hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) &
+ ~(HPTE_R_PPP | HPTE_R_N)) |
+ (newpp & (HPTE_R_PPP | HPTE_R_N)));
/*
* Ensure it is out of the tlb too. Bolted entries base and
* actual page size will be same.
*/
tlbie(vpn, psize, psize, ssize, 0);
+
+ local_irq_restore(flags);
+}
+
+/*
+ * Remove a bolted kernel entry. Memory hotplug uses this.
+ *
+ * No need to lock here because we should be the only user.
+ */
+static int native_hpte_removebolted(unsigned long ea, int psize, int ssize)
+{
+ unsigned long vpn;
+ unsigned long vsid;
+ long slot;
+ struct hash_pte *hptep;
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ vsid = get_kernel_vsid(ea, ssize);
+ vpn = hpt_vpn(ea, vsid, ssize);
+
+ slot = native_hpte_find(vpn, psize, ssize);
+ if (slot == -1)
+ return -ENOENT;
+
+ hptep = htab_address + slot;
+
+ VM_WARN_ON(!(be64_to_cpu(hptep->v) & HPTE_V_BOLTED));
+
+ /* Invalidate the hpte */
+ hptep->v = 0;
+
+ /* Invalidate the TLB */
+ tlbie(vpn, psize, psize, ssize, 0);
+
+ local_irq_restore(flags);
+
+ return 0;
}
+
static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
int bpsize, int apsize, int ssize, int local)
{
@@ -385,9 +561,20 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
DBG_LOW(" invalidate(vpn=%016lx, hash: %lx)\n", vpn, slot);
want_v = hpte_encode_avpn(vpn, bpsize, ssize);
- native_lock_hpte(hptep);
- hpte_v = hptep->v;
+ hpte_v = hpte_get_old_v(hptep);
+ if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) {
+ native_lock_hpte(hptep);
+ /* recheck with locks held */
+ hpte_v = hpte_get_old_v(hptep);
+
+ if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) {
+ /* Invalidate the hpte. NOTE: this also unlocks it */
+ release_hpte_lock();
+ hptep->v = 0;
+ } else
+ native_unlock_hpte(hptep);
+ }
/*
* We need to invalidate the TLB always because hpte_remove doesn't do
* a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
@@ -395,30 +582,24 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
* (hpte_remove) because we assume the old translation is still
* technically "valid".
*/
- if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
- native_unlock_hpte(hptep);
- else
- /* Invalidate the hpte. NOTE: this also unlocks it */
- hptep->v = 0;
-
- /* Invalidate the TLB */
tlbie(vpn, bpsize, apsize, ssize, local);
local_irq_restore(flags);
}
-static void native_hugepage_invalidate(struct mm_struct *mm,
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void native_hugepage_invalidate(unsigned long vsid,
+ unsigned long addr,
unsigned char *hpte_slot_array,
- unsigned long addr, int psize)
+ int psize, int ssize, int local)
{
- int ssize = 0, i;
- int lock_tlbie;
+ int i;
struct hash_pte *hptep;
int actual_psize = MMU_PAGE_16M;
unsigned int max_hpte_count, valid;
unsigned long flags, s_addr = addr;
unsigned long hpte_v, want_v, shift;
- unsigned long hidx, vpn = 0, vsid, hash, slot;
+ unsigned long hidx, vpn = 0, hash, slot;
shift = mmu_psize_defs[psize].shift;
max_hpte_count = 1U << (PMD_SHIFT - shift);
@@ -432,15 +613,6 @@ static void native_hugepage_invalidate(struct mm_struct *mm,
/* get the vpn */
addr = s_addr + (i * (1ul << shift));
- if (!is_kernel_addr(addr)) {
- ssize = user_segment_size(addr);
- vsid = get_vsid(mm->context.id, addr, ssize);
- WARN_ON(vsid == 0);
- } else {
- vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
- ssize = mmu_kernel_ssize;
- }
-
vpn = hpt_vpn(addr, vsid, ssize);
hash = hpt_hash(vpn, shift, ssize);
if (hidx & _PTEIDX_SECONDARY)
@@ -451,88 +623,61 @@ static void native_hugepage_invalidate(struct mm_struct *mm,
hptep = htab_address + slot;
want_v = hpte_encode_avpn(vpn, psize, ssize);
- native_lock_hpte(hptep);
- hpte_v = hptep->v;
+ hpte_v = hpte_get_old_v(hptep);
/* Even if we miss, we need to invalidate the TLB */
- if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
- native_unlock_hpte(hptep);
- else
- /* Invalidate the hpte. NOTE: this also unlocks it */
- hptep->v = 0;
- }
- /*
- * Since this is a hugepage, we just need a single tlbie.
- * use the last vpn.
- */
- lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
- if (lock_tlbie)
- raw_spin_lock(&native_tlbie_lock);
-
- asm volatile("ptesync":::"memory");
- __tlbie(vpn, psize, actual_psize, ssize);
- asm volatile("eieio; tlbsync; ptesync":::"memory");
-
- if (lock_tlbie)
- raw_spin_unlock(&native_tlbie_lock);
-
- local_irq_restore(flags);
-}
-
-static inline int __hpte_actual_psize(unsigned int lp, int psize)
-{
- int i, shift;
- unsigned int mask;
-
- /* start from 1 ignoring MMU_PAGE_4K */
- for (i = 1; i < MMU_PAGE_COUNT; i++) {
+ if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) {
+ /* recheck with locks held */
+ native_lock_hpte(hptep);
+ hpte_v = hpte_get_old_v(hptep);
- /* invalid penc */
- if (mmu_psize_defs[psize].penc[i] == -1)
- continue;
+ if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) {
+ /* Invalidate the hpte. NOTE: this also unlocks it */
+ release_hpte_lock();
+ hptep->v = 0;
+ } else
+ native_unlock_hpte(hptep);
+ }
/*
- * encoding bits per actual page size
- * PTE LP actual page size
- * rrrr rrrz >=8KB
- * rrrr rrzz >=16KB
- * rrrr rzzz >=32KB
- * rrrr zzzz >=64KB
- * .......
+ * We need to do tlb invalidate for all the address, tlbie
+ * instruction compares entry_VA in tlb with the VA specified
+ * here
*/
- shift = mmu_psize_defs[i].shift - LP_SHIFT;
- if (shift > LP_BITS)
- shift = LP_BITS;
- mask = (1 << shift) - 1;
- if ((lp & mask) == mmu_psize_defs[psize].penc[i])
- return i;
+ tlbie(vpn, psize, actual_psize, ssize, local);
}
- return -1;
+ local_irq_restore(flags);
+}
+#else
+static void native_hugepage_invalidate(unsigned long vsid,
+ unsigned long addr,
+ unsigned char *hpte_slot_array,
+ int psize, int ssize, int local)
+{
+ WARN(1, "%s called without THP support\n", __func__);
}
+#endif
static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
int *psize, int *apsize, int *ssize, unsigned long *vpn)
{
unsigned long avpn, pteg, vpi;
- unsigned long hpte_v = hpte->v;
+ unsigned long hpte_v = be64_to_cpu(hpte->v);
+ unsigned long hpte_r = be64_to_cpu(hpte->r);
unsigned long vsid, seg_off;
int size, a_size, shift;
/* Look at the 8 bit LP value */
- unsigned int lp = (hpte->r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
+ unsigned int lp = (hpte_r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
+ if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+ hpte_v = hpte_new_to_old_v(hpte_v, hpte_r);
+ hpte_r = hpte_new_to_old_r(hpte_r);
+ }
if (!(hpte_v & HPTE_V_LARGE)) {
size = MMU_PAGE_4K;
a_size = MMU_PAGE_4K;
} else {
- for (size = 0; size < MMU_PAGE_COUNT; size++) {
-
- /* valid entries have a shift value */
- if (!mmu_psize_defs[size].shift)
- continue;
-
- a_size = __hpte_actual_psize(lp, size);
- if (a_size != -1)
- break;
- }
+ size = hpte_page_sizes[lp] & 0xf;
+ a_size = hpte_page_sizes[lp] >> 4;
}
/* This works for all page sizes, and for 256M and 1T segments */
*ssize = hpte_v >> HPTE_V_SSIZE_SHIFT;
@@ -554,6 +699,7 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
seg_off |= vpi << shift;
}
*vpn = vsid << (SID_SHIFT - VPN_SHIFT) | seg_off >> VPN_SHIFT;
+ break;
case MMU_SEGSIZE_1T:
/* We only have 40 - 23 bits of seg_off in avpn */
seg_off = (avpn & 0x1ffff) << 23;
@@ -563,6 +709,7 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
seg_off |= vpi << shift;
}
*vpn = vsid << (SID_SHIFT_1T - VPN_SHIFT) | seg_off >> VPN_SHIFT;
+ break;
default:
*vpn = size = 0;
}
@@ -575,13 +722,21 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
* be when they isi), and we are the only one left. We rely on our kernel
* mapping being 0xC0's and the hardware ignoring those two real bits.
*
+ * This must be called with interrupts disabled.
+ *
+ * Taking the native_tlbie_lock is unsafe here due to the possibility of
+ * lockdep being on. On pre POWER5 hardware, not taking the lock could
+ * cause deadlock. POWER5 and newer not taking the lock is fine. This only
+ * gets called during boot before secondary CPUs have come up and during
+ * crashdump and all bets are off anyway.
+ *
* TODO: add batching support when enabled. remember, no dynamic memory here,
- * athough there is the control page available...
+ * although there is the control page available...
*/
-static void native_hpte_clear(void)
+static notrace void native_hpte_clear(void)
{
unsigned long vpn = 0;
- unsigned long slot, slots, flags;
+ unsigned long slot, slots;
struct hash_pte *hptep = htab_address;
unsigned long hpte_v;
unsigned long pteg_count;
@@ -589,13 +744,6 @@ static void native_hpte_clear(void)
pteg_count = htab_hash_mask + 1;
- local_irq_save(flags);
-
- /* we take the tlbie lock and hold it. Some hardware will
- * deadlock if we try to tlbie from two processors at once.
- */
- raw_spin_lock(&native_tlbie_lock);
-
slots = pteg_count * HPTES_PER_GROUP;
for (slot = 0; slot < slots; slot++, hptep++) {
@@ -604,22 +752,20 @@ static void native_hpte_clear(void)
* running, right? and for crash dump, we probably
* don't want to wait for a maybe bad cpu.
*/
- hpte_v = hptep->v;
+ hpte_v = be64_to_cpu(hptep->v);
/*
- * Call __tlbie() here rather than tlbie() since we
- * already hold the native_tlbie_lock.
+ * Call __tlbie() here rather than tlbie() since we can't take the
+ * native_tlbie_lock.
*/
if (hpte_v & HPTE_V_VALID) {
hpte_decode(hptep, slot, &psize, &apsize, &ssize, &vpn);
hptep->v = 0;
- __tlbie(vpn, psize, apsize, ssize);
+ ___tlbie(vpn, psize, apsize, ssize);
}
}
asm volatile("eieio; tlbsync; ptesync":::"memory");
- raw_spin_unlock(&native_tlbie_lock);
- local_irq_restore(flags);
}
/*
@@ -628,14 +774,14 @@ static void native_hpte_clear(void)
*/
static void native_flush_hash_range(unsigned long number, int local)
{
- unsigned long vpn;
+ unsigned long vpn = 0;
unsigned long hash, index, hidx, shift, slot;
struct hash_pte *hptep;
unsigned long hpte_v;
unsigned long want_v;
unsigned long flags;
real_pte_t pte;
- struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
+ struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch);
unsigned long psize = batch->psize;
int ssize = batch->ssize;
int i;
@@ -655,13 +801,21 @@ static void native_flush_hash_range(unsigned long number, int local)
slot += hidx & _PTEIDX_GROUP_IX;
hptep = htab_address + slot;
want_v = hpte_encode_avpn(vpn, psize, ssize);
+ hpte_v = hpte_get_old_v(hptep);
+
+ if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
+ continue;
+ /* lock and try again */
native_lock_hpte(hptep);
- hpte_v = hptep->v;
- if (!HPTE_V_COMPARE(hpte_v, want_v) ||
- !(hpte_v & HPTE_V_VALID))
+ hpte_v = hpte_get_old_v(hptep);
+
+ if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
native_unlock_hpte(hptep);
- else
+ else {
+ release_hpte_lock();
hptep->v = 0;
+ }
+
} pte_iterate_hashed_end();
}
@@ -677,7 +831,7 @@ static void native_flush_hash_range(unsigned long number, int local)
__tlbiel(vpn, psize, psize, ssize);
} pte_iterate_hashed_end();
}
- asm volatile("ptesync":::"memory");
+ ppc_after_tlbiel_barrier();
} else {
int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
@@ -694,6 +848,10 @@ static void native_flush_hash_range(unsigned long number, int local)
__tlbie(vpn, psize, psize, ssize);
} pte_iterate_hashed_end();
}
+ /*
+ * Just do one more with the last used values.
+ */
+ fixup_tlbie_vpn(vpn, psize, psize, ssize);
asm volatile("eieio; tlbsync; ptesync":::"memory");
if (lock_tlbie)
@@ -705,12 +863,13 @@ static void native_flush_hash_range(unsigned long number, int local)
void __init hpte_init_native(void)
{
- ppc_md.hpte_invalidate = native_hpte_invalidate;
- ppc_md.hpte_updatepp = native_hpte_updatepp;
- ppc_md.hpte_updateboltedpp = native_hpte_updateboltedpp;
- ppc_md.hpte_insert = native_hpte_insert;
- ppc_md.hpte_remove = native_hpte_remove;
- ppc_md.hpte_clear_all = native_hpte_clear;
- ppc_md.flush_hash_range = native_flush_hash_range;
- ppc_md.hugepage_invalidate = native_hugepage_invalidate;
+ mmu_hash_ops.hpte_invalidate = native_hpte_invalidate;
+ mmu_hash_ops.hpte_updatepp = native_hpte_updatepp;
+ mmu_hash_ops.hpte_updateboltedpp = native_hpte_updateboltedpp;
+ mmu_hash_ops.hpte_removebolted = native_hpte_removebolted;
+ mmu_hash_ops.hpte_insert = native_hpte_insert;
+ mmu_hash_ops.hpte_remove = native_hpte_remove;
+ mmu_hash_ops.hpte_clear_all = native_hpte_clear;
+ mmu_hash_ops.flush_hash_range = native_flush_hash_range;
+ mmu_hash_ops.hugepage_invalidate = native_hugepage_invalidate;
}
diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c
new file mode 100644
index 000000000000..82d31177630b
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/hash_pgtable.c
@@ -0,0 +1,563 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2005, Paul Mackerras, IBM Corporation.
+ * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation.
+ * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
+ */
+
+#include <linux/sched.h>
+#include <linux/mm_types.h>
+#include <linux/mm.h>
+#include <linux/stop_machine.h>
+
+#include <asm/sections.h>
+#include <asm/mmu.h>
+#include <asm/tlb.h>
+#include <asm/firmware.h>
+
+#include <mm/mmu_decl.h>
+
+#include <trace/events/thp.h>
+
+#if H_PGTABLE_RANGE > (USER_VSID_RANGE * (TASK_SIZE_USER64 / TASK_CONTEXT_SIZE))
+#warning Limited user VSID range means pagetable space is wasted
+#endif
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+/*
+ * vmemmap is the starting address of the virtual address space where
+ * struct pages are allocated for all possible PFNs present on the system
+ * including holes and bad memory (hence sparse). These virtual struct
+ * pages are stored in sequence in this virtual address space irrespective
+ * of the fact whether the corresponding PFN is valid or not. This achieves
+ * constant relationship between address of struct page and its PFN.
+ *
+ * During boot or memory hotplug operation when a new memory section is
+ * added, physical memory allocation (including hash table bolting) will
+ * be performed for the set of struct pages which are part of the memory
+ * section. This saves memory by not allocating struct pages for PFNs
+ * which are not valid.
+ *
+ * ----------------------------------------------
+ * | PHYSICAL ALLOCATION OF VIRTUAL STRUCT PAGES|
+ * ----------------------------------------------
+ *
+ * f000000000000000 c000000000000000
+ * vmemmap +--------------+ +--------------+
+ * + | page struct | +--------------> | page struct |
+ * | +--------------+ +--------------+
+ * | | page struct | +--------------> | page struct |
+ * | +--------------+ | +--------------+
+ * | | page struct | + +------> | page struct |
+ * | +--------------+ | +--------------+
+ * | | page struct | | +--> | page struct |
+ * | +--------------+ | | +--------------+
+ * | | page struct | | |
+ * | +--------------+ | |
+ * | | page struct | | |
+ * | +--------------+ | |
+ * | | page struct | | |
+ * | +--------------+ | |
+ * | | page struct | | |
+ * | +--------------+ | |
+ * | | page struct | +-------+ |
+ * | +--------------+ |
+ * | | page struct | +-----------+
+ * | +--------------+
+ * | | page struct | No mapping
+ * | +--------------+
+ * | | page struct | No mapping
+ * v +--------------+
+ *
+ * -----------------------------------------
+ * | RELATION BETWEEN STRUCT PAGES AND PFNS|
+ * -----------------------------------------
+ *
+ * vmemmap +--------------+ +---------------+
+ * + | page struct | +-------------> | PFN |
+ * | +--------------+ +---------------+
+ * | | page struct | +-------------> | PFN |
+ * | +--------------+ +---------------+
+ * | | page struct | +-------------> | PFN |
+ * | +--------------+ +---------------+
+ * | | page struct | +-------------> | PFN |
+ * | +--------------+ +---------------+
+ * | | |
+ * | +--------------+
+ * | | |
+ * | +--------------+
+ * | | |
+ * | +--------------+ +---------------+
+ * | | page struct | +-------------> | PFN |
+ * | +--------------+ +---------------+
+ * | | |
+ * | +--------------+
+ * | | |
+ * | +--------------+ +---------------+
+ * | | page struct | +-------------> | PFN |
+ * | +--------------+ +---------------+
+ * | | page struct | +-------------> | PFN |
+ * v +--------------+ +---------------+
+ */
+/*
+ * On hash-based CPUs, the vmemmap is bolted in the hash table.
+ *
+ */
+int __meminit hash__vmemmap_create_mapping(unsigned long start,
+ unsigned long page_size,
+ unsigned long phys)
+{
+ int rc;
+
+ if ((start + page_size) >= H_VMEMMAP_END) {
+ pr_warn("Outside the supported range\n");
+ return -1;
+ }
+
+ rc = htab_bolt_mapping(start, start + page_size, phys,
+ pgprot_val(PAGE_KERNEL),
+ mmu_vmemmap_psize, mmu_kernel_ssize);
+ if (rc < 0) {
+ int rc2 = htab_remove_mapping(start, start + page_size,
+ mmu_vmemmap_psize,
+ mmu_kernel_ssize);
+ BUG_ON(rc2 && (rc2 != -ENOENT));
+ }
+ return rc;
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+void hash__vmemmap_remove_mapping(unsigned long start,
+ unsigned long page_size)
+{
+ int rc = htab_remove_mapping(start, start + page_size,
+ mmu_vmemmap_psize,
+ mmu_kernel_ssize);
+ BUG_ON((rc < 0) && (rc != -ENOENT));
+ WARN_ON(rc == -ENOENT);
+}
+#endif
+#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+
+/*
+ * map_kernel_page currently only called by __ioremap
+ * map_kernel_page adds an entry to the ioremap page table
+ * and adds an entry to the HPT, possibly bolting it
+ */
+int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
+{
+ pgd_t *pgdp;
+ p4d_t *p4dp;
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t *ptep;
+
+ BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE);
+ if (slab_is_available()) {
+ pgdp = pgd_offset_k(ea);
+ p4dp = p4d_offset(pgdp, ea);
+ pudp = pud_alloc(&init_mm, p4dp, ea);
+ if (!pudp)
+ return -ENOMEM;
+ pmdp = pmd_alloc(&init_mm, pudp, ea);
+ if (!pmdp)
+ return -ENOMEM;
+ ptep = pte_alloc_kernel(pmdp, ea);
+ if (!ptep)
+ return -ENOMEM;
+ set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot));
+ } else {
+ /*
+ * If the mm subsystem is not fully up, we cannot create a
+ * linux page table entry for this mapping. Simply bolt an
+ * entry in the hardware page table.
+ *
+ */
+ if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, pgprot_val(prot),
+ mmu_io_psize, mmu_kernel_ssize)) {
+ printk(KERN_ERR "Failed to do bolted mapping IO "
+ "memory at %016lx !\n", pa);
+ return -ENOMEM;
+ }
+ }
+
+ smp_wmb();
+ return 0;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+
+unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, unsigned long clr,
+ unsigned long set)
+{
+ __be64 old_be, tmp;
+ unsigned long old;
+
+#ifdef CONFIG_DEBUG_VM
+ WARN_ON(!hash__pmd_trans_huge(*pmdp));
+ assert_spin_locked(pmd_lockptr(mm, pmdp));
+#endif
+
+ __asm__ __volatile__(
+ "1: ldarx %0,0,%3\n\
+ and. %1,%0,%6\n\
+ bne- 1b \n\
+ andc %1,%0,%4 \n\
+ or %1,%1,%7\n\
+ stdcx. %1,0,%3 \n\
+ bne- 1b"
+ : "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp)
+ : "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp),
+ "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set))
+ : "cc" );
+
+ old = be64_to_cpu(old_be);
+
+ trace_hugepage_update_pmd(addr, old, clr, set);
+ if (old & H_PAGE_HASHPTE)
+ hpte_do_hugepage_flush(mm, addr, pmdp, old);
+ return old;
+}
+
+pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+{
+ pmd_t pmd;
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ VM_BUG_ON(pmd_trans_huge(*pmdp));
+
+ pmd = *pmdp;
+ pmd_clear(pmdp);
+ /*
+ * Wait for all pending hash_page to finish. This is needed
+ * in case of subpage collapse. When we collapse normal pages
+ * to hugepage, we first clear the pmd, then invalidate all
+ * the PTE entries. The assumption here is that any low level
+ * page fault will see a none pmd and take the slow path that
+ * will wait on mmap_lock. But we could very well be in a
+ * hash_page with local ptep pointer value. Such a hash page
+ * can result in adding new HPTE entries for normal subpages.
+ * That means we could be modifying the page content as we
+ * copy them to a huge page. So wait for parallel hash_page
+ * to finish before invalidating HPTE entries. We can do this
+ * by sending an IPI to all the cpus and executing a dummy
+ * function there.
+ */
+ serialize_against_pte_lookup(vma->vm_mm);
+ /*
+ * Now invalidate the hpte entries in the range
+ * covered by pmd. This make sure we take a
+ * fault and will find the pmd as none, which will
+ * result in a major fault which takes mmap_lock and
+ * hence wait for collapse to complete. Without this
+ * the __collapse_huge_page_copy can result in copying
+ * the old content.
+ */
+ flush_hash_table_pmd_range(vma->vm_mm, &pmd, address);
+ return pmd;
+}
+
+/*
+ * We want to put the pgtable in pmd and use pgtable for tracking
+ * the base page size hptes
+ */
+void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+ pgtable_t pgtable)
+{
+ pgtable_t *pgtable_slot;
+
+ assert_spin_locked(pmd_lockptr(mm, pmdp));
+ /*
+ * we store the pgtable in the second half of PMD
+ */
+ pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+ *pgtable_slot = pgtable;
+ /*
+ * expose the deposited pgtable to other cpus.
+ * before we set the hugepage PTE at pmd level
+ * hash fault code looks at the deposted pgtable
+ * to store hash index values.
+ */
+ smp_wmb();
+}
+
+pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
+{
+ pgtable_t pgtable;
+ pgtable_t *pgtable_slot;
+
+ assert_spin_locked(pmd_lockptr(mm, pmdp));
+
+ pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+ pgtable = *pgtable_slot;
+ /*
+ * Once we withdraw, mark the entry NULL.
+ */
+ *pgtable_slot = NULL;
+ /*
+ * We store HPTE information in the deposited PTE fragment.
+ * zero out the content on withdraw.
+ */
+ memset(pgtable, 0, PTE_FRAG_SIZE);
+ return pgtable;
+}
+
+/*
+ * A linux hugepage PMD was changed and the corresponding hash table entries
+ * neesd to be flushed.
+ */
+void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, unsigned long old_pmd)
+{
+ int ssize;
+ unsigned int psize;
+ unsigned long vsid;
+ unsigned long flags = 0;
+
+ /* get the base page size,vsid and segment size */
+#ifdef CONFIG_DEBUG_VM
+ psize = get_slice_psize(mm, addr);
+ BUG_ON(psize == MMU_PAGE_16M);
+#endif
+ if (old_pmd & H_PAGE_COMBO)
+ psize = MMU_PAGE_4K;
+ else
+ psize = MMU_PAGE_64K;
+
+ if (!is_kernel_addr(addr)) {
+ ssize = user_segment_size(addr);
+ vsid = get_user_vsid(&mm->context, addr, ssize);
+ WARN_ON(vsid == 0);
+ } else {
+ vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
+ ssize = mmu_kernel_ssize;
+ }
+
+ if (mm_is_thread_local(mm))
+ flags |= HPTE_LOCAL_UPDATE;
+
+ return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags);
+}
+
+pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp)
+{
+ pmd_t old_pmd;
+ pgtable_t pgtable;
+ unsigned long old;
+ pgtable_t *pgtable_slot;
+
+ old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
+ old_pmd = __pmd(old);
+ /*
+ * We have pmd == none and we are holding page_table_lock.
+ * So we can safely go and clear the pgtable hash
+ * index info.
+ */
+ pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+ pgtable = *pgtable_slot;
+ /*
+ * Let's zero out old valid and hash index details
+ * hash fault look at them.
+ */
+ memset(pgtable, 0, PTE_FRAG_SIZE);
+ return old_pmd;
+}
+
+int hash__has_transparent_hugepage(void)
+{
+
+ if (!mmu_has_feature(MMU_FTR_16M_PAGE))
+ return 0;
+ /*
+ * We support THP only if PMD_SIZE is 16MB.
+ */
+ if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
+ return 0;
+ /*
+ * We need to make sure that we support 16MB hugepage in a segment
+ * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
+ * of 64K.
+ */
+ /*
+ * If we have 64K HPTE, we will be using that by default
+ */
+ if (mmu_psize_defs[MMU_PAGE_64K].shift &&
+ (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
+ return 0;
+ /*
+ * Ok we only have 4K HPTE
+ */
+ if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
+ return 0;
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(hash__has_transparent_hugepage);
+
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+#ifdef CONFIG_STRICT_KERNEL_RWX
+
+struct change_memory_parms {
+ unsigned long start, end, newpp;
+ unsigned int step, nr_cpus;
+ atomic_t master_cpu;
+ atomic_t cpu_counter;
+};
+
+// We'd rather this was on the stack but it has to be in the RMO
+static struct change_memory_parms chmem_parms;
+
+// And therefore we need a lock to protect it from concurrent use
+static DEFINE_MUTEX(chmem_lock);
+
+static void change_memory_range(unsigned long start, unsigned long end,
+ unsigned int step, unsigned long newpp)
+{
+ unsigned long idx;
+
+ pr_debug("Changing page protection on range 0x%lx-0x%lx, to 0x%lx, step 0x%x\n",
+ start, end, newpp, step);
+
+ for (idx = start; idx < end; idx += step)
+ /* Not sure if we can do much with the return value */
+ mmu_hash_ops.hpte_updateboltedpp(newpp, idx, mmu_linear_psize,
+ mmu_kernel_ssize);
+}
+
+static int notrace chmem_secondary_loop(struct change_memory_parms *parms)
+{
+ unsigned long msr, tmp, flags;
+ int *p;
+
+ p = &parms->cpu_counter.counter;
+
+ local_irq_save(flags);
+ hard_irq_disable();
+
+ asm volatile (
+ // Switch to real mode and leave interrupts off
+ "mfmsr %[msr] ;"
+ "li %[tmp], %[MSR_IR_DR] ;"
+ "andc %[tmp], %[msr], %[tmp] ;"
+ "mtmsrd %[tmp] ;"
+
+ // Tell the master we are in real mode
+ "1: "
+ "lwarx %[tmp], 0, %[p] ;"
+ "addic %[tmp], %[tmp], -1 ;"
+ "stwcx. %[tmp], 0, %[p] ;"
+ "bne- 1b ;"
+
+ // Spin until the counter goes to zero
+ "2: ;"
+ "lwz %[tmp], 0(%[p]) ;"
+ "cmpwi %[tmp], 0 ;"
+ "bne- 2b ;"
+
+ // Switch back to virtual mode
+ "mtmsrd %[msr] ;"
+
+ : // outputs
+ [msr] "=&r" (msr), [tmp] "=&b" (tmp), "+m" (*p)
+ : // inputs
+ [p] "b" (p), [MSR_IR_DR] "i" (MSR_IR | MSR_DR)
+ : // clobbers
+ "cc", "xer"
+ );
+
+ local_irq_restore(flags);
+
+ return 0;
+}
+
+static int change_memory_range_fn(void *data)
+{
+ struct change_memory_parms *parms = data;
+
+ // First CPU goes through, all others wait.
+ if (atomic_xchg(&parms->master_cpu, 1) == 1)
+ return chmem_secondary_loop(parms);
+
+ // Wait for all but one CPU (this one) to call-in
+ while (atomic_read(&parms->cpu_counter) > 1)
+ barrier();
+
+ change_memory_range(parms->start, parms->end, parms->step, parms->newpp);
+
+ mb();
+
+ // Signal the other CPUs that we're done
+ atomic_dec(&parms->cpu_counter);
+
+ return 0;
+}
+
+static bool hash__change_memory_range(unsigned long start, unsigned long end,
+ unsigned long newpp)
+{
+ unsigned int step, shift;
+
+ shift = mmu_psize_defs[mmu_linear_psize].shift;
+ step = 1 << shift;
+
+ start = ALIGN_DOWN(start, step);
+ end = ALIGN(end, step); // aligns up
+
+ if (start >= end)
+ return false;
+
+ if (firmware_has_feature(FW_FEATURE_LPAR)) {
+ mutex_lock(&chmem_lock);
+
+ chmem_parms.start = start;
+ chmem_parms.end = end;
+ chmem_parms.step = step;
+ chmem_parms.newpp = newpp;
+ atomic_set(&chmem_parms.master_cpu, 0);
+
+ cpus_read_lock();
+
+ atomic_set(&chmem_parms.cpu_counter, num_online_cpus());
+
+ // Ensure state is consistent before we call the other CPUs
+ mb();
+
+ stop_machine_cpuslocked(change_memory_range_fn, &chmem_parms,
+ cpu_online_mask);
+
+ cpus_read_unlock();
+ mutex_unlock(&chmem_lock);
+ } else
+ change_memory_range(start, end, step, newpp);
+
+ return true;
+}
+
+void hash__mark_rodata_ro(void)
+{
+ unsigned long start, end, pp;
+
+ start = (unsigned long)_stext;
+ end = (unsigned long)__end_rodata;
+
+ pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL_ROX), HPTE_USE_KERNEL_KEY);
+
+ WARN_ON(!hash__change_memory_range(start, end, pp));
+}
+
+void hash__mark_initmem_nx(void)
+{
+ unsigned long start, end, pp;
+
+ start = (unsigned long)__init_begin;
+ end = (unsigned long)__init_end;
+
+ pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL), HPTE_USE_KERNEL_KEY);
+
+ WARN_ON(!hash__change_memory_range(start, end, pp));
+}
+#endif
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/book3s64/hash_tlb.c
index 36e44b4260eb..21fcad97ae80 100644
--- a/arch/powerpc/mm/tlb_hash64.c
+++ b/arch/powerpc/mm/book3s64/hash_tlb.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* This file contains the routines for flushing entries from the
* TLB and MMU hash table.
@@ -14,22 +15,19 @@
*
* Dave Engebretsen <engebret@us.ibm.com>
* Rework for PPC64 port.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
#include <linux/mm.h>
-#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
-#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <asm/bug.h>
+#include <asm/pte-walk.h>
+
+
+#include <trace/events/thp.h>
DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
@@ -48,11 +46,12 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
unsigned int psize;
int ssize;
real_pte_t rpte;
- int i;
+ int i, offset;
i = batch->index;
- /* Get page size (maybe move back to caller).
+ /*
+ * Get page size (maybe move back to caller).
*
* NOTE: when using special 64K mappings in 4K environment like
* for SPEs, we obtain the page size from the slice, which thus
@@ -64,40 +63,45 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
psize = get_slice_psize(mm, addr);
/* Mask the address for the correct page size */
addr &= ~((1UL << mmu_psize_defs[psize].shift) - 1);
+ if (unlikely(psize == MMU_PAGE_16G))
+ offset = PTRS_PER_PUD;
+ else
+ offset = PTRS_PER_PMD;
#else
BUG();
psize = pte_pagesize_index(mm, addr, pte); /* shutup gcc */
#endif
} else {
psize = pte_pagesize_index(mm, addr, pte);
- /* Mask the address for the standard page size. If we
+ /*
+ * Mask the address for the standard page size. If we
* have a 64k page kernel, but the hardware does not
* support 64k pages, this might be different from the
- * hardware page size encoded in the slice table. */
+ * hardware page size encoded in the slice table.
+ */
addr &= PAGE_MASK;
+ offset = PTRS_PER_PTE;
}
/* Build full vaddr */
if (!is_kernel_addr(addr)) {
ssize = user_segment_size(addr);
- vsid = get_vsid(mm->context.id, addr, ssize);
+ vsid = get_user_vsid(&mm->context, addr, ssize);
} else {
vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
ssize = mmu_kernel_ssize;
}
WARN_ON(vsid == 0);
vpn = hpt_vpn(addr, vsid, ssize);
- rpte = __real_pte(__pte(pte), ptep);
+ rpte = __real_pte(__pte(pte), ptep, offset);
/*
* Check if we have an active batch on this CPU. If not, just
- * flush now and return. For now, we don global invalidates
- * in that case, might be worth testing the mm cpu mask though
- * and decide to use local invalidates instead...
+ * flush now and return.
*/
if (!batch->active) {
- flush_hash_page(vpn, rpte, psize, ssize, 0);
+ flush_hash_page(vpn, rpte, psize, ssize, mm_is_thread_local(mm));
put_cpu_var(ppc64_tlb_batch);
return;
}
@@ -139,13 +143,10 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
*/
void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
{
- const struct cpumask *tmp;
- int i, local = 0;
+ int i, local;
i = batch->index;
- tmp = cpumask_of(smp_processor_id());
- if (cpumask_equal(mm_cpumask(batch->mm), tmp))
- local = 1;
+ local = mm_is_thread_local(batch->mm);
if (i == 1)
flush_hash_page(batch->vpn[0], batch->pte[0],
batch->psize, batch->ssize, local);
@@ -154,11 +155,12 @@ void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
batch->index = 0;
}
-void tlb_flush(struct mmu_gather *tlb)
+void hash__tlb_flush(struct mmu_gather *tlb)
{
struct ppc64_tlb_batch *tlbbatch = &get_cpu_var(ppc64_tlb_batch);
- /* If there's a TLB batch pending, then we must flush it because the
+ /*
+ * If there's a TLB batch pending, then we must flush it because the
* pages are going to be freed and we really don't want to have a CPU
* access a freed page because it has a stale TLB
*/
@@ -173,7 +175,6 @@ void tlb_flush(struct mmu_gather *tlb)
* from the hash table (and the TLB). But keeps
* the linux PTEs intact.
*
- * @mm : mm_struct of the target address space (generally init_mm)
* @start : starting address
* @end : ending address (not included in the flush)
*
@@ -186,18 +187,17 @@ void tlb_flush(struct mmu_gather *tlb)
* Because of that usage pattern, it is implemented for small size rather
* than speed.
*/
-void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
- unsigned long end)
+void __flush_hash_table_range(unsigned long start, unsigned long end)
{
int hugepage_shift;
unsigned long flags;
- start = _ALIGN_DOWN(start, PAGE_SIZE);
- end = _ALIGN_UP(end, PAGE_SIZE);
+ start = ALIGN_DOWN(start, PAGE_SIZE);
+ end = ALIGN(end, PAGE_SIZE);
- BUG_ON(!mm->pgd);
- /* Note: Normally, we should only ever use a batch within a
+ /*
+ * Note: Normally, we should only ever use a batch within a
* PTE locked section. This violates the rule, but will work
* since we don't actually modify the PTEs, we just flush the
* hash while leaving the PTEs intact (including their reference
@@ -207,32 +207,29 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
local_irq_save(flags);
arch_enter_lazy_mmu_mode();
for (; start < end; start += PAGE_SIZE) {
- pte_t *ptep = find_linux_pte_or_hugepte(mm->pgd, start,
- &hugepage_shift);
+ pte_t *ptep = find_init_mm_pte(start, &hugepage_shift);
unsigned long pte;
if (ptep == NULL)
continue;
pte = pte_val(*ptep);
- if (!(pte & _PAGE_HASHPTE))
+ if (!(pte & H_PAGE_HASHPTE))
continue;
- if (unlikely(hugepage_shift && pmd_trans_huge(*(pmd_t *)pte)))
- hpte_do_hugepage_flush(mm, start, (pmd_t *)pte);
- else
- hpte_need_flush(mm, start, ptep, pte, 0);
+ hpte_need_flush(&init_mm, start, ptep, pte, hugepage_shift);
}
arch_leave_lazy_mmu_mode();
local_irq_restore(flags);
}
-void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
+void flush_hash_table_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
{
pte_t *pte;
pte_t *start_pte;
unsigned long flags;
- addr = _ALIGN_DOWN(addr, PMD_SIZE);
- /* Note: Normally, we should only ever use a batch within a
+ addr = ALIGN_DOWN(addr, PMD_SIZE);
+ /*
+ * Note: Normally, we should only ever use a batch within a
* PTE locked section. This violates the rule, but will work
* since we don't actually modify the PTEs, we just flush the
* hash while leaving the PTEs intact (including their reference
@@ -242,12 +239,16 @@ void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
local_irq_save(flags);
arch_enter_lazy_mmu_mode();
start_pte = pte_offset_map(pmd, addr);
+ if (!start_pte)
+ goto out;
for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) {
unsigned long pteval = pte_val(*pte);
- if (pteval & _PAGE_HASHPTE)
+ if (pteval & H_PAGE_HASHPTE)
hpte_need_flush(mm, addr, pte, pteval, 0);
addr += PAGE_SIZE;
}
+ pte_unmap(start_pte);
+out:
arch_leave_lazy_mmu_mode();
local_irq_restore(flags);
}
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
new file mode 100644
index 000000000000..9dc5889d6ecb
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -0,0 +1,2486 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * PowerPC64 port by Mike Corrigan and Dave Engebretsen
+ * {mikejc|engebret}@us.ibm.com
+ *
+ * Copyright (c) 2000 Mike Corrigan <mikejc@us.ibm.com>
+ *
+ * SMP scalability work:
+ * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * Module name: htab.c
+ *
+ * Description:
+ * PowerPC Hashed Page Table functions
+ */
+
+#undef DEBUG
+#undef DEBUG_LOW
+
+#define pr_fmt(fmt) "hash-mmu: " fmt
+#include <linux/spinlock.h>
+#include <linux/errno.h>
+#include <linux/sched/mm.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/sysctl.h>
+#include <linux/export.h>
+#include <linux/ctype.h>
+#include <linux/cache.h>
+#include <linux/init.h>
+#include <linux/signal.h>
+#include <linux/memblock.h>
+#include <linux/context_tracking.h>
+#include <linux/libfdt.h>
+#include <linux/pkeys.h>
+#include <linux/hugetlb.h>
+#include <linux/cpu.h>
+#include <linux/pgtable.h>
+#include <linux/debugfs.h>
+#include <linux/random.h>
+#include <linux/elf-randomize.h>
+#include <linux/of_fdt.h>
+#include <linux/kfence.h>
+
+#include <asm/interrupt.h>
+#include <asm/processor.h>
+#include <asm/mmu.h>
+#include <asm/mmu_context.h>
+#include <asm/page.h>
+#include <asm/pgalloc.h>
+#include <asm/types.h>
+#include <linux/uaccess.h>
+#include <asm/machdep.h>
+#include <asm/io.h>
+#include <asm/eeh.h>
+#include <asm/tlb.h>
+#include <asm/cacheflush.h>
+#include <asm/cputable.h>
+#include <asm/sections.h>
+#include <asm/spu.h>
+#include <asm/udbg.h>
+#include <asm/text-patching.h>
+#include <asm/fadump.h>
+#include <asm/firmware.h>
+#include <asm/tm.h>
+#include <asm/trace.h>
+#include <asm/ps3.h>
+#include <asm/pte-walk.h>
+#include <asm/asm-prototypes.h>
+#include <asm/ultravisor.h>
+#include <asm/kfence.h>
+
+#include <mm/mmu_decl.h>
+
+#include "internal.h"
+
+
+#ifdef DEBUG
+#define DBG(fmt...) udbg_printf(fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+#ifdef DEBUG_LOW
+#define DBG_LOW(fmt...) udbg_printf(fmt)
+#else
+#define DBG_LOW(fmt...)
+#endif
+
+#define KB (1024)
+#define MB (1024*KB)
+#define GB (1024L*MB)
+
+/*
+ * Note: pte --> Linux PTE
+ * HPTE --> PowerPC Hashed Page Table Entry
+ *
+ * Execution context:
+ * htab_initialize is called with the MMU off (of course), but
+ * the kernel has been copied down to zero so it can directly
+ * reference global data. At this point it is very difficult
+ * to print debug info.
+ *
+ */
+
+static unsigned long _SDR1;
+
+u8 hpte_page_sizes[1 << LP_BITS];
+EXPORT_SYMBOL_GPL(hpte_page_sizes);
+
+struct hash_pte *htab_address;
+unsigned long htab_size_bytes;
+unsigned long htab_hash_mask;
+EXPORT_SYMBOL_GPL(htab_hash_mask);
+int mmu_linear_psize = MMU_PAGE_4K;
+EXPORT_SYMBOL_GPL(mmu_linear_psize);
+int mmu_virtual_psize = MMU_PAGE_4K;
+int mmu_vmalloc_psize = MMU_PAGE_4K;
+EXPORT_SYMBOL_GPL(mmu_vmalloc_psize);
+int mmu_io_psize = MMU_PAGE_4K;
+int mmu_kernel_ssize = MMU_SEGSIZE_256M;
+EXPORT_SYMBOL_GPL(mmu_kernel_ssize);
+int mmu_highuser_ssize = MMU_SEGSIZE_256M;
+u16 mmu_slb_size = 64;
+EXPORT_SYMBOL_GPL(mmu_slb_size);
+#ifdef CONFIG_PPC_64K_PAGES
+int mmu_ci_restrictions;
+#endif
+struct mmu_hash_ops mmu_hash_ops __ro_after_init;
+EXPORT_SYMBOL(mmu_hash_ops);
+
+/*
+ * These are definitions of page sizes arrays to be used when none
+ * is provided by the firmware.
+ */
+
+/*
+ * Fallback (4k pages only)
+ */
+static struct mmu_psize_def mmu_psize_defaults[] = {
+ [MMU_PAGE_4K] = {
+ .shift = 12,
+ .sllp = 0,
+ .penc = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1},
+ .avpnm = 0,
+ .tlbiel = 0,
+ },
+};
+
+/*
+ * POWER4, GPUL, POWER5
+ *
+ * Support for 16Mb large pages
+ */
+static struct mmu_psize_def mmu_psize_defaults_gp[] = {
+ [MMU_PAGE_4K] = {
+ .shift = 12,
+ .sllp = 0,
+ .penc = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1},
+ .avpnm = 0,
+ .tlbiel = 1,
+ },
+ [MMU_PAGE_16M] = {
+ .shift = 24,
+ .sllp = SLB_VSID_L,
+ .penc = {[0 ... MMU_PAGE_16M - 1] = -1, [MMU_PAGE_16M] = 0,
+ [MMU_PAGE_16M + 1 ... MMU_PAGE_COUNT - 1] = -1 },
+ .avpnm = 0x1UL,
+ .tlbiel = 0,
+ },
+};
+
+static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is)
+{
+ unsigned long rb;
+
+ rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
+
+ asm volatile("tlbiel %0" : : "r" (rb));
+}
+
+/*
+ * tlbiel instruction for hash, set invalidation
+ * i.e., r=1 and is=01 or is=10 or is=11
+ */
+static __always_inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is,
+ unsigned int pid,
+ unsigned int ric, unsigned int prs)
+{
+ unsigned long rb;
+ unsigned long rs;
+ unsigned int r = 0; /* hash format */
+
+ rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
+ rs = ((unsigned long)pid << PPC_BITLSHIFT(31));
+
+ asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4)
+ : : "r"(rb), "r"(rs), "i"(ric), "i"(prs), "i"(r)
+ : "memory");
+}
+
+
+static void tlbiel_all_isa206(unsigned int num_sets, unsigned int is)
+{
+ unsigned int set;
+
+ asm volatile("ptesync": : :"memory");
+
+ for (set = 0; set < num_sets; set++)
+ tlbiel_hash_set_isa206(set, is);
+
+ ppc_after_tlbiel_barrier();
+}
+
+static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is)
+{
+ unsigned int set;
+
+ asm volatile("ptesync": : :"memory");
+
+ /*
+ * Flush the partition table cache if this is HV mode.
+ */
+ if (early_cpu_has_feature(CPU_FTR_HVMODE))
+ tlbiel_hash_set_isa300(0, is, 0, 2, 0);
+
+ /*
+ * Now invalidate the process table cache. UPRT=0 HPT modes (what
+ * current hardware implements) do not use the process table, but
+ * add the flushes anyway.
+ *
+ * From ISA v3.0B p. 1078:
+ * The following forms are invalid.
+ * * PRS=1, R=0, and RIC!=2 (The only process-scoped
+ * HPT caching is of the Process Table.)
+ */
+ tlbiel_hash_set_isa300(0, is, 0, 2, 1);
+
+ /*
+ * Then flush the sets of the TLB proper. Hash mode uses
+ * partition scoped TLB translations, which may be flushed
+ * in !HV mode.
+ */
+ for (set = 0; set < num_sets; set++)
+ tlbiel_hash_set_isa300(set, is, 0, 0, 0);
+
+ ppc_after_tlbiel_barrier();
+
+ asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT "; isync" : : :"memory");
+}
+
+void hash__tlbiel_all(unsigned int action)
+{
+ unsigned int is;
+
+ switch (action) {
+ case TLB_INVAL_SCOPE_GLOBAL:
+ is = 3;
+ break;
+ case TLB_INVAL_SCOPE_LPID:
+ is = 2;
+ break;
+ default:
+ BUG();
+ }
+
+ if (early_cpu_has_feature(CPU_FTR_ARCH_300))
+ tlbiel_all_isa300(POWER9_TLB_SETS_HASH, is);
+ else if (early_cpu_has_feature(CPU_FTR_ARCH_207S))
+ tlbiel_all_isa206(POWER8_TLB_SETS, is);
+ else if (early_cpu_has_feature(CPU_FTR_ARCH_206))
+ tlbiel_all_isa206(POWER7_TLB_SETS, is);
+ else
+ WARN(1, "%s called on pre-POWER7 CPU\n", __func__);
+}
+
+#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE)
+static void kernel_map_linear_page(unsigned long vaddr, unsigned long idx,
+ u8 *slots, raw_spinlock_t *lock)
+{
+ unsigned long hash;
+ unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
+ unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
+ unsigned long mode = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL), HPTE_USE_KERNEL_KEY);
+ long ret;
+
+ hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
+
+ /* Don't create HPTE entries for bad address */
+ if (!vsid)
+ return;
+
+ if (slots[idx] & 0x80)
+ return;
+
+ ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode,
+ HPTE_V_BOLTED,
+ mmu_linear_psize, mmu_kernel_ssize);
+
+ BUG_ON (ret < 0);
+ raw_spin_lock(lock);
+ BUG_ON(slots[idx] & 0x80);
+ slots[idx] = ret | 0x80;
+ raw_spin_unlock(lock);
+}
+
+static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long idx,
+ u8 *slots, raw_spinlock_t *lock)
+{
+ unsigned long hash, hslot, slot;
+ unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
+ unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
+
+ hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
+ raw_spin_lock(lock);
+ if (!(slots[idx] & 0x80)) {
+ raw_spin_unlock(lock);
+ return;
+ }
+ hslot = slots[idx] & 0x7f;
+ slots[idx] = 0;
+ raw_spin_unlock(lock);
+ if (hslot & _PTEIDX_SECONDARY)
+ hash = ~hash;
+ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot += hslot & _PTEIDX_GROUP_IX;
+ mmu_hash_ops.hpte_invalidate(slot, vpn, mmu_linear_psize,
+ mmu_linear_psize,
+ mmu_kernel_ssize, 0);
+}
+#endif
+
+static inline bool hash_supports_debug_pagealloc(void)
+{
+ unsigned long max_hash_count = ppc64_rma_size / 4;
+ unsigned long linear_map_count = memblock_end_of_DRAM() >> PAGE_SHIFT;
+
+ if (!debug_pagealloc_enabled() || linear_map_count > max_hash_count)
+ return false;
+ return true;
+}
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+static u8 *linear_map_hash_slots;
+static unsigned long linear_map_hash_count;
+static DEFINE_RAW_SPINLOCK(linear_map_hash_lock);
+static __init void hash_debug_pagealloc_alloc_slots(void)
+{
+ if (!hash_supports_debug_pagealloc())
+ return;
+
+ linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT;
+ linear_map_hash_slots = memblock_alloc_try_nid(
+ linear_map_hash_count, 1, MEMBLOCK_LOW_LIMIT,
+ ppc64_rma_size, NUMA_NO_NODE);
+ if (!linear_map_hash_slots)
+ panic("%s: Failed to allocate %lu bytes max_addr=%pa\n",
+ __func__, linear_map_hash_count, &ppc64_rma_size);
+}
+
+static inline void hash_debug_pagealloc_add_slot(phys_addr_t paddr,
+ int slot)
+{
+ if (!debug_pagealloc_enabled() || !linear_map_hash_count)
+ return;
+ if ((paddr >> PAGE_SHIFT) < linear_map_hash_count)
+ linear_map_hash_slots[paddr >> PAGE_SHIFT] = slot | 0x80;
+}
+
+static int hash_debug_pagealloc_map_pages(struct page *page, int numpages,
+ int enable)
+{
+ unsigned long flags, vaddr, lmi;
+ int i;
+
+ if (!debug_pagealloc_enabled() || !linear_map_hash_count)
+ return 0;
+
+ local_irq_save(flags);
+ for (i = 0; i < numpages; i++, page++) {
+ vaddr = (unsigned long)page_address(page);
+ lmi = __pa(vaddr) >> PAGE_SHIFT;
+ if (lmi >= linear_map_hash_count)
+ continue;
+ if (enable)
+ kernel_map_linear_page(vaddr, lmi,
+ linear_map_hash_slots, &linear_map_hash_lock);
+ else
+ kernel_unmap_linear_page(vaddr, lmi,
+ linear_map_hash_slots, &linear_map_hash_lock);
+ }
+ local_irq_restore(flags);
+ return 0;
+}
+
+#else /* CONFIG_DEBUG_PAGEALLOC */
+static inline void hash_debug_pagealloc_alloc_slots(void) {}
+static inline void hash_debug_pagealloc_add_slot(phys_addr_t paddr, int slot) {}
+static int __maybe_unused
+hash_debug_pagealloc_map_pages(struct page *page, int numpages, int enable)
+{
+ return 0;
+}
+#endif /* CONFIG_DEBUG_PAGEALLOC */
+
+#ifdef CONFIG_KFENCE
+static u8 *linear_map_kf_hash_slots;
+static unsigned long linear_map_kf_hash_count;
+static DEFINE_RAW_SPINLOCK(linear_map_kf_hash_lock);
+
+static phys_addr_t kfence_pool;
+
+static __init void hash_kfence_alloc_pool(void)
+{
+ if (!kfence_early_init_enabled())
+ goto err;
+
+ /* allocate linear map for kfence within RMA region */
+ linear_map_kf_hash_count = KFENCE_POOL_SIZE >> PAGE_SHIFT;
+ linear_map_kf_hash_slots = memblock_alloc_try_nid(
+ linear_map_kf_hash_count, 1,
+ MEMBLOCK_LOW_LIMIT, ppc64_rma_size,
+ NUMA_NO_NODE);
+ if (!linear_map_kf_hash_slots) {
+ pr_err("%s: memblock for linear map (%lu) failed\n", __func__,
+ linear_map_kf_hash_count);
+ goto err;
+ }
+
+ /* allocate kfence pool early */
+ kfence_pool = memblock_phys_alloc_range(KFENCE_POOL_SIZE, PAGE_SIZE,
+ MEMBLOCK_LOW_LIMIT, MEMBLOCK_ALLOC_ANYWHERE);
+ if (!kfence_pool) {
+ pr_err("%s: memblock for kfence pool (%lu) failed\n", __func__,
+ KFENCE_POOL_SIZE);
+ memblock_free(linear_map_kf_hash_slots,
+ linear_map_kf_hash_count);
+ linear_map_kf_hash_count = 0;
+ goto err;
+ }
+ memblock_mark_nomap(kfence_pool, KFENCE_POOL_SIZE);
+
+ return;
+err:
+ pr_info("Disabling kfence\n");
+ disable_kfence();
+}
+
+static __init void hash_kfence_map_pool(void)
+{
+ unsigned long kfence_pool_start, kfence_pool_end;
+ unsigned long prot = pgprot_val(PAGE_KERNEL);
+ unsigned int pshift = mmu_psize_defs[mmu_linear_psize].shift;
+
+ if (!kfence_pool)
+ return;
+
+ kfence_pool_start = (unsigned long) __va(kfence_pool);
+ kfence_pool_end = kfence_pool_start + KFENCE_POOL_SIZE;
+ __kfence_pool = (char *) kfence_pool_start;
+ BUG_ON(htab_bolt_mapping(kfence_pool_start, kfence_pool_end,
+ kfence_pool, prot, mmu_linear_psize,
+ mmu_kernel_ssize));
+ update_page_count(mmu_linear_psize, KFENCE_POOL_SIZE >> pshift);
+ memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE);
+}
+
+static inline void hash_kfence_add_slot(phys_addr_t paddr, int slot)
+{
+ unsigned long vaddr = (unsigned long) __va(paddr);
+ unsigned long lmi = (vaddr - (unsigned long)__kfence_pool)
+ >> PAGE_SHIFT;
+
+ if (!kfence_pool)
+ return;
+ BUG_ON(!is_kfence_address((void *)vaddr));
+ BUG_ON(lmi >= linear_map_kf_hash_count);
+ linear_map_kf_hash_slots[lmi] = slot | 0x80;
+}
+
+static int hash_kfence_map_pages(struct page *page, int numpages, int enable)
+{
+ unsigned long flags, vaddr, lmi;
+ int i;
+
+ WARN_ON_ONCE(!linear_map_kf_hash_count);
+ local_irq_save(flags);
+ for (i = 0; i < numpages; i++, page++) {
+ vaddr = (unsigned long)page_address(page);
+ lmi = (vaddr - (unsigned long)__kfence_pool) >> PAGE_SHIFT;
+
+ /* Ideally this should never happen */
+ if (lmi >= linear_map_kf_hash_count) {
+ WARN_ON_ONCE(1);
+ continue;
+ }
+
+ if (enable)
+ kernel_map_linear_page(vaddr, lmi,
+ linear_map_kf_hash_slots,
+ &linear_map_kf_hash_lock);
+ else
+ kernel_unmap_linear_page(vaddr, lmi,
+ linear_map_kf_hash_slots,
+ &linear_map_kf_hash_lock);
+ }
+ local_irq_restore(flags);
+ return 0;
+}
+#else
+static inline void hash_kfence_alloc_pool(void) {}
+static inline void hash_kfence_map_pool(void) {}
+static inline void hash_kfence_add_slot(phys_addr_t paddr, int slot) {}
+static int __maybe_unused
+hash_kfence_map_pages(struct page *page, int numpages, int enable)
+{
+ return 0;
+}
+#endif
+
+#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE)
+int hash__kernel_map_pages(struct page *page, int numpages, int enable)
+{
+ void *vaddr = page_address(page);
+
+ if (is_kfence_address(vaddr))
+ return hash_kfence_map_pages(page, numpages, enable);
+ else
+ return hash_debug_pagealloc_map_pages(page, numpages, enable);
+}
+
+static void hash_linear_map_add_slot(phys_addr_t paddr, int slot)
+{
+ if (is_kfence_address(__va(paddr)))
+ hash_kfence_add_slot(paddr, slot);
+ else
+ hash_debug_pagealloc_add_slot(paddr, slot);
+}
+#else
+static void hash_linear_map_add_slot(phys_addr_t paddr, int slot) {}
+#endif
+
+/*
+ * 'R' and 'C' update notes:
+ * - Under pHyp or KVM, the updatepp path will not set C, thus it *will*
+ * create writeable HPTEs without C set, because the hcall H_PROTECT
+ * that we use in that case will not update C
+ * - The above is however not a problem, because we also don't do that
+ * fancy "no flush" variant of eviction and we use H_REMOVE which will
+ * do the right thing and thus we don't have the race I described earlier
+ *
+ * - Under bare metal, we do have the race, so we need R and C set
+ * - We make sure R is always set and never lost
+ * - C is _PAGE_DIRTY, and *should* always be set for a writeable mapping
+ */
+unsigned long htab_convert_pte_flags(unsigned long pteflags, unsigned long flags)
+{
+ unsigned long rflags = 0;
+
+ /* _PAGE_EXEC -> NOEXEC */
+ if ((pteflags & _PAGE_EXEC) == 0)
+ rflags |= HPTE_R_N;
+ /*
+ * PPP bits:
+ * Linux uses slb key 0 for kernel and 1 for user.
+ * kernel RW areas are mapped with PPP=0b000
+ * User area is mapped with PPP=0b010 for read/write
+ * or PPP=0b011 for read-only (including writeable but clean pages).
+ */
+ if (pteflags & _PAGE_PRIVILEGED) {
+ /*
+ * Kernel read only mapped with ppp bits 0b110
+ */
+ if (!(pteflags & _PAGE_WRITE)) {
+ if (mmu_has_feature(MMU_FTR_KERNEL_RO))
+ rflags |= (HPTE_R_PP0 | 0x2);
+ else
+ rflags |= 0x3;
+ }
+ VM_WARN_ONCE(!(pteflags & _PAGE_RWX), "no-access mapping request");
+ } else {
+ if (pteflags & _PAGE_RWX)
+ rflags |= 0x2;
+ /*
+ * We should never hit this in normal fault handling because
+ * a permission check (check_pte_access()) will bubble this
+ * to higher level linux handler even for PAGE_NONE.
+ */
+ VM_WARN_ONCE(!(pteflags & _PAGE_RWX), "no-access mapping request");
+ if (!((pteflags & _PAGE_WRITE) && (pteflags & _PAGE_DIRTY)))
+ rflags |= 0x1;
+ }
+ /*
+ * We can't allow hardware to update hpte bits. Hence always
+ * set 'R' bit and set 'C' if it is a write fault
+ */
+ rflags |= HPTE_R_R;
+
+ if (pteflags & _PAGE_DIRTY)
+ rflags |= HPTE_R_C;
+ /*
+ * Add in WIG bits
+ */
+
+ if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_TOLERANT)
+ rflags |= HPTE_R_I;
+ else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_NON_IDEMPOTENT)
+ rflags |= (HPTE_R_I | HPTE_R_G);
+ else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_SAO)
+ rflags |= (HPTE_R_W | HPTE_R_I | HPTE_R_M);
+ else
+ /*
+ * Add memory coherence if cache inhibited is not set
+ */
+ rflags |= HPTE_R_M;
+
+ rflags |= pte_to_hpte_pkey_bits(pteflags, flags);
+ return rflags;
+}
+
+int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
+ unsigned long pstart, unsigned long prot,
+ int psize, int ssize)
+{
+ unsigned long vaddr, paddr;
+ unsigned int step, shift;
+ int ret = 0;
+
+ shift = mmu_psize_defs[psize].shift;
+ step = 1 << shift;
+
+ prot = htab_convert_pte_flags(prot, HPTE_USE_KERNEL_KEY);
+
+ DBG("htab_bolt_mapping(%lx..%lx -> %lx (%lx,%d,%d)\n",
+ vstart, vend, pstart, prot, psize, ssize);
+
+ /* Carefully map only the possible range */
+ vaddr = ALIGN(vstart, step);
+ paddr = ALIGN(pstart, step);
+ vend = ALIGN_DOWN(vend, step);
+
+ for (; vaddr < vend; vaddr += step, paddr += step) {
+ unsigned long hash, hpteg;
+ unsigned long vsid = get_kernel_vsid(vaddr, ssize);
+ unsigned long vpn = hpt_vpn(vaddr, vsid, ssize);
+ unsigned long tprot = prot;
+ bool secondary_hash = false;
+
+ /*
+ * If we hit a bad address return error.
+ */
+ if (!vsid)
+ return -1;
+ /* Make kernel text executable */
+ if (overlaps_kernel_text(vaddr, vaddr + step))
+ tprot &= ~HPTE_R_N;
+
+ /*
+ * If relocatable, check if it overlaps interrupt vectors that
+ * are copied down to real 0. For relocatable kernel
+ * (e.g. kdump case) we copy interrupt vectors down to real
+ * address 0. Mark that region as executable. This is
+ * because on p8 system with relocation on exception feature
+ * enabled, exceptions are raised with MMU (IR=DR=1) ON. Hence
+ * in order to execute the interrupt handlers in virtual
+ * mode the vector region need to be marked as executable.
+ */
+ if ((PHYSICAL_START > MEMORY_START) &&
+ overlaps_interrupt_vector_text(vaddr, vaddr + step))
+ tprot &= ~HPTE_R_N;
+
+ hash = hpt_hash(vpn, shift, ssize);
+ hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
+
+ BUG_ON(!mmu_hash_ops.hpte_insert);
+repeat:
+ ret = mmu_hash_ops.hpte_insert(hpteg, vpn, paddr, tprot,
+ HPTE_V_BOLTED, psize, psize,
+ ssize);
+ if (ret == -1) {
+ /*
+ * Try to keep bolted entries in primary.
+ * Remove non bolted entries and try insert again
+ */
+ ret = mmu_hash_ops.hpte_remove(hpteg);
+ if (ret != -1)
+ ret = mmu_hash_ops.hpte_insert(hpteg, vpn, paddr, tprot,
+ HPTE_V_BOLTED, psize, psize,
+ ssize);
+ if (ret == -1 && !secondary_hash) {
+ secondary_hash = true;
+ hpteg = ((~hash & htab_hash_mask) * HPTES_PER_GROUP);
+ goto repeat;
+ }
+ }
+
+ if (ret < 0)
+ break;
+
+ cond_resched();
+ /* add slot info in debug_pagealloc / kfence linear map */
+ hash_linear_map_add_slot(paddr, ret);
+ }
+ return ret < 0 ? ret : 0;
+}
+
+int htab_remove_mapping(unsigned long vstart, unsigned long vend,
+ int psize, int ssize)
+{
+ unsigned long vaddr, time_limit;
+ unsigned int step, shift;
+ int rc;
+ int ret = 0;
+
+ shift = mmu_psize_defs[psize].shift;
+ step = 1 << shift;
+
+ if (!mmu_hash_ops.hpte_removebolted)
+ return -ENODEV;
+
+ /* Unmap the full range specificied */
+ vaddr = ALIGN_DOWN(vstart, step);
+ time_limit = jiffies + HZ;
+
+ for (;vaddr < vend; vaddr += step) {
+ rc = mmu_hash_ops.hpte_removebolted(vaddr, psize, ssize);
+
+ /*
+ * For large number of mappings introduce a cond_resched()
+ * to prevent softlockup warnings.
+ */
+ if (time_after(jiffies, time_limit)) {
+ cond_resched();
+ time_limit = jiffies + HZ;
+ }
+ if (rc == -ENOENT) {
+ ret = -ENOENT;
+ continue;
+ }
+ if (rc < 0)
+ return rc;
+ }
+
+ return ret;
+}
+
+static bool disable_1tb_segments __ro_after_init;
+
+static int __init parse_disable_1tb_segments(char *p)
+{
+ disable_1tb_segments = true;
+ return 0;
+}
+early_param("disable_1tb_segments", parse_disable_1tb_segments);
+
+bool stress_hpt_enabled __initdata;
+
+static int __init parse_stress_hpt(char *p)
+{
+ stress_hpt_enabled = true;
+ return 0;
+}
+early_param("stress_hpt", parse_stress_hpt);
+
+__ro_after_init DEFINE_STATIC_KEY_FALSE(stress_hpt_key);
+
+/*
+ * per-CPU array allocated if we enable stress_hpt.
+ */
+#define STRESS_MAX_GROUPS 16
+struct stress_hpt_struct {
+ unsigned long last_group[STRESS_MAX_GROUPS];
+};
+
+static inline int stress_nr_groups(void)
+{
+ /*
+ * LPAR H_REMOVE flushes TLB, so need some number > 1 of entries
+ * to allow practical forward progress. Bare metal returns 1, which
+ * seems to help uncover more bugs.
+ */
+ if (firmware_has_feature(FW_FEATURE_LPAR))
+ return STRESS_MAX_GROUPS;
+ else
+ return 1;
+}
+
+static struct stress_hpt_struct *stress_hpt_struct;
+
+static int __init htab_dt_scan_seg_sizes(unsigned long node,
+ const char *uname, int depth,
+ void *data)
+{
+ const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+ const __be32 *prop;
+ int size = 0;
+
+ /* We are scanning "cpu" nodes only */
+ if (type == NULL || strcmp(type, "cpu") != 0)
+ return 0;
+
+ prop = of_get_flat_dt_prop(node, "ibm,processor-segment-sizes", &size);
+ if (prop == NULL)
+ return 0;
+ for (; size >= 4; size -= 4, ++prop) {
+ if (be32_to_cpu(prop[0]) == 40) {
+ DBG("1T segment support detected\n");
+
+ if (disable_1tb_segments) {
+ DBG("1T segments disabled by command line\n");
+ break;
+ }
+
+ cur_cpu_spec->mmu_features |= MMU_FTR_1T_SEGMENT;
+ return 1;
+ }
+ }
+ cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
+ return 0;
+}
+
+static int __init get_idx_from_shift(unsigned int shift)
+{
+ int idx = -1;
+
+ switch (shift) {
+ case 0xc:
+ idx = MMU_PAGE_4K;
+ break;
+ case 0x10:
+ idx = MMU_PAGE_64K;
+ break;
+ case 0x14:
+ idx = MMU_PAGE_1M;
+ break;
+ case 0x18:
+ idx = MMU_PAGE_16M;
+ break;
+ case 0x22:
+ idx = MMU_PAGE_16G;
+ break;
+ }
+ return idx;
+}
+
+static int __init htab_dt_scan_page_sizes(unsigned long node,
+ const char *uname, int depth,
+ void *data)
+{
+ const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+ const __be32 *prop;
+ int size = 0;
+
+ /* We are scanning "cpu" nodes only */
+ if (type == NULL || strcmp(type, "cpu") != 0)
+ return 0;
+
+ prop = of_get_flat_dt_prop(node, "ibm,segment-page-sizes", &size);
+ if (!prop)
+ return 0;
+
+ pr_info("Page sizes from device-tree:\n");
+ size /= 4;
+ cur_cpu_spec->mmu_features &= ~(MMU_FTR_16M_PAGE);
+ while(size > 0) {
+ unsigned int base_shift = be32_to_cpu(prop[0]);
+ unsigned int slbenc = be32_to_cpu(prop[1]);
+ unsigned int lpnum = be32_to_cpu(prop[2]);
+ struct mmu_psize_def *def;
+ int idx, base_idx;
+
+ size -= 3; prop += 3;
+ base_idx = get_idx_from_shift(base_shift);
+ if (base_idx < 0) {
+ /* skip the pte encoding also */
+ prop += lpnum * 2; size -= lpnum * 2;
+ continue;
+ }
+ def = &mmu_psize_defs[base_idx];
+ if (base_idx == MMU_PAGE_16M)
+ cur_cpu_spec->mmu_features |= MMU_FTR_16M_PAGE;
+
+ def->shift = base_shift;
+ if (base_shift <= 23)
+ def->avpnm = 0;
+ else
+ def->avpnm = (1 << (base_shift - 23)) - 1;
+ def->sllp = slbenc;
+ /*
+ * We don't know for sure what's up with tlbiel, so
+ * for now we only set it for 4K and 64K pages
+ */
+ if (base_idx == MMU_PAGE_4K || base_idx == MMU_PAGE_64K)
+ def->tlbiel = 1;
+ else
+ def->tlbiel = 0;
+
+ while (size > 0 && lpnum) {
+ unsigned int shift = be32_to_cpu(prop[0]);
+ int penc = be32_to_cpu(prop[1]);
+
+ prop += 2; size -= 2;
+ lpnum--;
+
+ idx = get_idx_from_shift(shift);
+ if (idx < 0)
+ continue;
+
+ if (penc == -1)
+ pr_err("Invalid penc for base_shift=%d "
+ "shift=%d\n", base_shift, shift);
+
+ def->penc[idx] = penc;
+ pr_info("base_shift=%d: shift=%d, sllp=0x%04lx,"
+ " avpnm=0x%08lx, tlbiel=%d, penc=%d\n",
+ base_shift, shift, def->sllp,
+ def->avpnm, def->tlbiel, def->penc[idx]);
+ }
+ }
+
+ return 1;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * Scan for 16G memory blocks that have been set aside for huge pages
+ * and reserve those blocks for 16G huge pages.
+ */
+static int __init htab_dt_scan_hugepage_blocks(unsigned long node,
+ const char *uname, int depth,
+ void *data) {
+ const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+ const __be64 *addr_prop;
+ const __be32 *page_count_prop;
+ unsigned int expected_pages;
+ long unsigned int phys_addr;
+ long unsigned int block_size;
+
+ /* We are scanning "memory" nodes only */
+ if (type == NULL || strcmp(type, "memory") != 0)
+ return 0;
+
+ /*
+ * This property is the log base 2 of the number of virtual pages that
+ * will represent this memory block.
+ */
+ page_count_prop = of_get_flat_dt_prop(node, "ibm,expected#pages", NULL);
+ if (page_count_prop == NULL)
+ return 0;
+ expected_pages = (1 << be32_to_cpu(page_count_prop[0]));
+ addr_prop = of_get_flat_dt_prop(node, "reg", NULL);
+ if (addr_prop == NULL)
+ return 0;
+ phys_addr = be64_to_cpu(addr_prop[0]);
+ block_size = be64_to_cpu(addr_prop[1]);
+ if (block_size != (16 * GB))
+ return 0;
+ pr_info("Huge page(16GB) memory: "
+ "addr = 0x%lX size = 0x%lX pages = %d\n",
+ phys_addr, block_size, expected_pages);
+ if (phys_addr + block_size * expected_pages <= memblock_end_of_DRAM()) {
+ memblock_reserve(phys_addr, block_size * expected_pages);
+ pseries_add_gpage(phys_addr, block_size, expected_pages);
+ }
+ return 0;
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
+static void __init mmu_psize_set_default_penc(void)
+{
+ int bpsize, apsize;
+ for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++)
+ for (apsize = 0; apsize < MMU_PAGE_COUNT; apsize++)
+ mmu_psize_defs[bpsize].penc[apsize] = -1;
+}
+
+#ifdef CONFIG_PPC_64K_PAGES
+
+static bool __init might_have_hea(void)
+{
+ /*
+ * The HEA ethernet adapter requires awareness of the
+ * GX bus. Without that awareness we can easily assume
+ * we will never see an HEA ethernet device.
+ */
+#ifdef CONFIG_IBMEBUS
+ return !cpu_has_feature(CPU_FTR_ARCH_207S) &&
+ firmware_has_feature(FW_FEATURE_SPLPAR);
+#else
+ return false;
+#endif
+}
+
+#endif /* #ifdef CONFIG_PPC_64K_PAGES */
+
+static void __init htab_scan_page_sizes(void)
+{
+ int rc;
+
+ /* se the invalid penc to -1 */
+ mmu_psize_set_default_penc();
+
+ /* Default to 4K pages only */
+ memcpy(mmu_psize_defs, mmu_psize_defaults,
+ sizeof(mmu_psize_defaults));
+
+ /*
+ * Try to find the available page sizes in the device-tree
+ */
+ rc = of_scan_flat_dt(htab_dt_scan_page_sizes, NULL);
+ if (rc == 0 && early_mmu_has_feature(MMU_FTR_16M_PAGE)) {
+ /*
+ * Nothing in the device-tree, but the CPU supports 16M pages,
+ * so let's fallback on a known size list for 16M capable CPUs.
+ */
+ memcpy(mmu_psize_defs, mmu_psize_defaults_gp,
+ sizeof(mmu_psize_defaults_gp));
+ }
+
+#ifdef CONFIG_HUGETLB_PAGE
+ if (!hugetlb_disabled && !early_radix_enabled() ) {
+ /* Reserve 16G huge page memory sections for huge pages */
+ of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL);
+ }
+#endif /* CONFIG_HUGETLB_PAGE */
+}
+
+/*
+ * Fill in the hpte_page_sizes[] array.
+ * We go through the mmu_psize_defs[] array looking for all the
+ * supported base/actual page size combinations. Each combination
+ * has a unique pagesize encoding (penc) value in the low bits of
+ * the LP field of the HPTE. For actual page sizes less than 1MB,
+ * some of the upper LP bits are used for RPN bits, meaning that
+ * we need to fill in several entries in hpte_page_sizes[].
+ *
+ * In diagrammatic form, with r = RPN bits and z = page size bits:
+ * PTE LP actual page size
+ * rrrr rrrz >=8KB
+ * rrrr rrzz >=16KB
+ * rrrr rzzz >=32KB
+ * rrrr zzzz >=64KB
+ * ...
+ *
+ * The zzzz bits are implementation-specific but are chosen so that
+ * no encoding for a larger page size uses the same value in its
+ * low-order N bits as the encoding for the 2^(12+N) byte page size
+ * (if it exists).
+ */
+static void __init init_hpte_page_sizes(void)
+{
+ long int ap, bp;
+ long int shift, penc;
+
+ for (bp = 0; bp < MMU_PAGE_COUNT; ++bp) {
+ if (!mmu_psize_defs[bp].shift)
+ continue; /* not a supported page size */
+ for (ap = bp; ap < MMU_PAGE_COUNT; ++ap) {
+ penc = mmu_psize_defs[bp].penc[ap];
+ if (penc == -1 || !mmu_psize_defs[ap].shift)
+ continue;
+ shift = mmu_psize_defs[ap].shift - LP_SHIFT;
+ if (shift <= 0)
+ continue; /* should never happen */
+ /*
+ * For page sizes less than 1MB, this loop
+ * replicates the entry for all possible values
+ * of the rrrr bits.
+ */
+ while (penc < (1 << LP_BITS)) {
+ hpte_page_sizes[penc] = (ap << 4) | bp;
+ penc += 1 << shift;
+ }
+ }
+ }
+}
+
+static void __init htab_init_page_sizes(void)
+{
+ bool aligned = true;
+ init_hpte_page_sizes();
+
+ if (!hash_supports_debug_pagealloc() && !kfence_early_init_enabled()) {
+ /*
+ * Pick a size for the linear mapping. Currently, we only
+ * support 16M, 1M and 4K which is the default
+ */
+ if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) &&
+ (unsigned long)_stext % 0x1000000) {
+ if (mmu_psize_defs[MMU_PAGE_16M].shift)
+ pr_warn("Kernel not 16M aligned, disabling 16M linear map alignment\n");
+ aligned = false;
+ }
+
+ if (mmu_psize_defs[MMU_PAGE_16M].shift && aligned)
+ mmu_linear_psize = MMU_PAGE_16M;
+ else if (mmu_psize_defs[MMU_PAGE_1M].shift)
+ mmu_linear_psize = MMU_PAGE_1M;
+ }
+
+#ifdef CONFIG_PPC_64K_PAGES
+ /*
+ * Pick a size for the ordinary pages. Default is 4K, we support
+ * 64K for user mappings and vmalloc if supported by the processor.
+ * We only use 64k for ioremap if the processor
+ * (and firmware) support cache-inhibited large pages.
+ * If not, we use 4k and set mmu_ci_restrictions so that
+ * hash_page knows to switch processes that use cache-inhibited
+ * mappings to 4k pages.
+ */
+ if (mmu_psize_defs[MMU_PAGE_64K].shift) {
+ mmu_virtual_psize = MMU_PAGE_64K;
+ mmu_vmalloc_psize = MMU_PAGE_64K;
+ if (mmu_linear_psize == MMU_PAGE_4K)
+ mmu_linear_psize = MMU_PAGE_64K;
+ if (mmu_has_feature(MMU_FTR_CI_LARGE_PAGE)) {
+ /*
+ * When running on pSeries using 64k pages for ioremap
+ * would stop us accessing the HEA ethernet. So if we
+ * have the chance of ever seeing one, stay at 4k.
+ */
+ if (!might_have_hea())
+ mmu_io_psize = MMU_PAGE_64K;
+ } else
+ mmu_ci_restrictions = 1;
+ }
+#endif /* CONFIG_PPC_64K_PAGES */
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ /*
+ * We try to use 16M pages for vmemmap if that is supported
+ * and we have at least 1G of RAM at boot
+ */
+ if (mmu_psize_defs[MMU_PAGE_16M].shift &&
+ memblock_phys_mem_size() >= 0x40000000)
+ mmu_vmemmap_psize = MMU_PAGE_16M;
+ else
+ mmu_vmemmap_psize = mmu_virtual_psize;
+#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+
+ pr_info("Page orders: linear mapping = %d, "
+ "virtual = %d, io = %d"
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ ", vmemmap = %d"
+#endif
+ "\n",
+ mmu_psize_defs[mmu_linear_psize].shift,
+ mmu_psize_defs[mmu_virtual_psize].shift,
+ mmu_psize_defs[mmu_io_psize].shift
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ ,mmu_psize_defs[mmu_vmemmap_psize].shift
+#endif
+ );
+}
+
+static int __init htab_dt_scan_pftsize(unsigned long node,
+ const char *uname, int depth,
+ void *data)
+{
+ const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+ const __be32 *prop;
+
+ /* We are scanning "cpu" nodes only */
+ if (type == NULL || strcmp(type, "cpu") != 0)
+ return 0;
+
+ prop = of_get_flat_dt_prop(node, "ibm,pft-size", NULL);
+ if (prop != NULL) {
+ /* pft_size[0] is the NUMA CEC cookie */
+ ppc64_pft_size = be32_to_cpu(prop[1]);
+ return 1;
+ }
+ return 0;
+}
+
+unsigned htab_shift_for_mem_size(unsigned long mem_size)
+{
+ unsigned memshift = __ilog2(mem_size);
+ unsigned pshift = mmu_psize_defs[mmu_virtual_psize].shift;
+ unsigned pteg_shift;
+
+ /* round mem_size up to next power of 2 */
+ if ((1UL << memshift) < mem_size)
+ memshift += 1;
+
+ /* aim for 2 pages / pteg */
+ pteg_shift = memshift - (pshift + 1);
+
+ /*
+ * 2^11 PTEGS of 128 bytes each, ie. 2^18 bytes is the minimum htab
+ * size permitted by the architecture.
+ */
+ return max(pteg_shift + 7, 18U);
+}
+
+static unsigned long __init htab_get_table_size(void)
+{
+ /*
+ * If hash size isn't already provided by the platform, we try to
+ * retrieve it from the device-tree. If it's not there neither, we
+ * calculate it now based on the total RAM size
+ */
+ if (ppc64_pft_size == 0)
+ of_scan_flat_dt(htab_dt_scan_pftsize, NULL);
+ if (ppc64_pft_size)
+ return 1UL << ppc64_pft_size;
+
+ return 1UL << htab_shift_for_mem_size(memblock_phys_mem_size());
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static int resize_hpt_for_hotplug(unsigned long new_mem_size)
+{
+ unsigned target_hpt_shift;
+
+ if (!mmu_hash_ops.resize_hpt)
+ return 0;
+
+ target_hpt_shift = htab_shift_for_mem_size(new_mem_size);
+
+ /*
+ * To avoid lots of HPT resizes if memory size is fluctuating
+ * across a boundary, we deliberately have some hysterisis
+ * here: we immediately increase the HPT size if the target
+ * shift exceeds the current shift, but we won't attempt to
+ * reduce unless the target shift is at least 2 below the
+ * current shift
+ */
+ if (target_hpt_shift > ppc64_pft_size ||
+ target_hpt_shift < ppc64_pft_size - 1)
+ return mmu_hash_ops.resize_hpt(target_hpt_shift);
+
+ return 0;
+}
+
+int hash__create_section_mapping(unsigned long start, unsigned long end,
+ int nid, pgprot_t prot)
+{
+ int rc;
+ unsigned int pshift = mmu_psize_defs[mmu_linear_psize].shift;
+
+ if (end >= H_VMALLOC_START) {
+ pr_warn("Outside the supported range\n");
+ return -1;
+ }
+
+ resize_hpt_for_hotplug(memblock_phys_mem_size());
+
+ rc = htab_bolt_mapping(start, end, __pa(start),
+ pgprot_val(prot), mmu_linear_psize,
+ mmu_kernel_ssize);
+
+ if (rc < 0) {
+ int rc2 = htab_remove_mapping(start, end, mmu_linear_psize,
+ mmu_kernel_ssize);
+ BUG_ON(rc2 && (rc2 != -ENOENT));
+ }
+ update_page_count(mmu_linear_psize, (end - start) >> pshift);
+ return rc;
+}
+
+int hash__remove_section_mapping(unsigned long start, unsigned long end)
+{
+ unsigned int pshift = mmu_psize_defs[mmu_linear_psize].shift;
+
+ int rc = htab_remove_mapping(start, end, mmu_linear_psize,
+ mmu_kernel_ssize);
+
+ if (resize_hpt_for_hotplug(memblock_phys_mem_size()) == -ENOSPC)
+ pr_warn("Hash collision while resizing HPT\n");
+
+ if (!rc)
+ update_page_count(mmu_linear_psize, -((end - start) >> pshift));
+ return rc;
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+static void __init hash_init_partition_table(phys_addr_t hash_table,
+ unsigned long htab_size)
+{
+ mmu_partition_table_init();
+
+ /*
+ * PS field (VRMA page size) is not used for LPID 0, hence set to 0.
+ * For now, UPRT is 0 and we have no segment table.
+ */
+ htab_size = __ilog2(htab_size) - 18;
+ mmu_partition_table_set_entry(0, hash_table | htab_size, 0, false);
+ pr_info("Partition table %p\n", partition_tb);
+}
+
+void hpt_clear_stress(void);
+static struct timer_list stress_hpt_timer;
+static void stress_hpt_timer_fn(struct timer_list *timer)
+{
+ int next_cpu;
+
+ hpt_clear_stress();
+ if (!firmware_has_feature(FW_FEATURE_LPAR))
+ tlbiel_all();
+
+ next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
+ if (next_cpu >= nr_cpu_ids)
+ next_cpu = cpumask_first(cpu_online_mask);
+ stress_hpt_timer.expires = jiffies + msecs_to_jiffies(10);
+ add_timer_on(&stress_hpt_timer, next_cpu);
+}
+
+static void __init htab_initialize(void)
+{
+ unsigned long table;
+ unsigned long pteg_count;
+ unsigned long prot;
+ phys_addr_t base = 0, size = 0, end, limit = MEMBLOCK_ALLOC_ANYWHERE;
+ u64 i;
+ unsigned int pshift = mmu_psize_defs[mmu_linear_psize].shift;
+
+ DBG(" -> htab_initialize()\n");
+
+ if (firmware_has_feature(FW_FEATURE_LPAR))
+ limit = ppc64_rma_size;
+
+ if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) {
+ mmu_kernel_ssize = MMU_SEGSIZE_1T;
+ mmu_highuser_ssize = MMU_SEGSIZE_1T;
+ pr_info("Using 1TB segments\n");
+ }
+
+ if (stress_slb_enabled)
+ static_branch_enable(&stress_slb_key);
+
+ if (no_slb_preload)
+ static_branch_enable(&no_slb_preload_key);
+
+ if (stress_hpt_enabled) {
+ unsigned long tmp;
+ static_branch_enable(&stress_hpt_key);
+ // Too early to use nr_cpu_ids, so use NR_CPUS
+ tmp = memblock_phys_alloc_range(sizeof(struct stress_hpt_struct) * NR_CPUS,
+ __alignof__(struct stress_hpt_struct),
+ MEMBLOCK_LOW_LIMIT, limit);
+ memset((void *)tmp, 0xff, sizeof(struct stress_hpt_struct) * NR_CPUS);
+ stress_hpt_struct = __va(tmp);
+
+ timer_setup(&stress_hpt_timer, stress_hpt_timer_fn, 0);
+ stress_hpt_timer.expires = jiffies + msecs_to_jiffies(10);
+ add_timer(&stress_hpt_timer);
+ }
+
+ /*
+ * Calculate the required size of the htab. We want the number of
+ * PTEGs to equal one half the number of real pages.
+ */
+ htab_size_bytes = htab_get_table_size();
+ pteg_count = htab_size_bytes >> 7;
+
+ htab_hash_mask = pteg_count - 1;
+
+ if (firmware_has_feature(FW_FEATURE_LPAR) ||
+ firmware_has_feature(FW_FEATURE_PS3_LV1)) {
+ /* Using a hypervisor which owns the htab */
+ htab_address = NULL;
+ _SDR1 = 0;
+#ifdef CONFIG_FA_DUMP
+ /*
+ * If firmware assisted dump is active firmware preserves
+ * the contents of htab along with entire partition memory.
+ * Clear the htab if firmware assisted dump is active so
+ * that we dont end up using old mappings.
+ */
+ if (is_fadump_active() && mmu_hash_ops.hpte_clear_all)
+ mmu_hash_ops.hpte_clear_all();
+#endif
+ } else {
+
+ table = memblock_phys_alloc_range(htab_size_bytes,
+ htab_size_bytes,
+ MEMBLOCK_LOW_LIMIT, limit);
+ if (!table)
+ panic("ERROR: Failed to allocate %pa bytes below %pa\n",
+ &htab_size_bytes, &limit);
+
+ DBG("Hash table allocated at %lx, size: %lx\n", table,
+ htab_size_bytes);
+
+ htab_address = __va(table);
+
+ /* htab absolute addr + encoded htabsize */
+ _SDR1 = table + __ilog2(htab_size_bytes) - 18;
+
+ /* Initialize the HPT with no entries */
+ memset((void *)table, 0, htab_size_bytes);
+
+ if (!cpu_has_feature(CPU_FTR_ARCH_300))
+ /* Set SDR1 */
+ mtspr(SPRN_SDR1, _SDR1);
+ else
+ hash_init_partition_table(table, htab_size_bytes);
+ }
+
+ prot = pgprot_val(PAGE_KERNEL);
+
+ hash_debug_pagealloc_alloc_slots();
+ hash_kfence_alloc_pool();
+ /* create bolted the linear mapping in the hash table */
+ for_each_mem_range(i, &base, &end) {
+ size = end - base;
+ base = (unsigned long)__va(base);
+
+ pr_debug("creating mapping for region: 0x%pa..0x%pa (prot: %lx)\n",
+ &base, &size, prot);
+
+ if ((base + size) >= H_VMALLOC_START) {
+ pr_warn("Outside the supported range\n");
+ continue;
+ }
+
+ BUG_ON(htab_bolt_mapping(base, base + size, __pa(base),
+ prot, mmu_linear_psize, mmu_kernel_ssize));
+
+ update_page_count(mmu_linear_psize, size >> pshift);
+ }
+ hash_kfence_map_pool();
+ memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
+
+ /*
+ * If we have a memory_limit and we've allocated TCEs then we need to
+ * explicitly map the TCE area at the top of RAM. We also cope with the
+ * case that the TCEs start below memory_limit.
+ * tce_alloc_start/end are 16MB aligned so the mapping should work
+ * for either 4K or 16MB pages.
+ */
+ if (tce_alloc_start) {
+ tce_alloc_start = (unsigned long)__va(tce_alloc_start);
+ tce_alloc_end = (unsigned long)__va(tce_alloc_end);
+
+ if (base + size >= tce_alloc_start)
+ tce_alloc_start = base + size + 1;
+
+ BUG_ON(htab_bolt_mapping(tce_alloc_start, tce_alloc_end,
+ __pa(tce_alloc_start), prot,
+ mmu_linear_psize, mmu_kernel_ssize));
+ update_page_count(mmu_linear_psize,
+ (tce_alloc_end - tce_alloc_start) >> pshift);
+ }
+
+
+ DBG(" <- htab_initialize()\n");
+}
+#undef KB
+#undef MB
+
+void __init hash__early_init_devtree(void)
+{
+ /* Initialize segment sizes */
+ of_scan_flat_dt(htab_dt_scan_seg_sizes, NULL);
+
+ /* Initialize page sizes */
+ htab_scan_page_sizes();
+}
+
+static struct hash_mm_context init_hash_mm_context;
+void __init hash__early_init_mmu(void)
+{
+#ifndef CONFIG_PPC_64K_PAGES
+ /*
+ * We have code in __hash_page_4K() and elsewhere, which assumes it can
+ * do the following:
+ * new_pte |= (slot << H_PAGE_F_GIX_SHIFT) & (H_PAGE_F_SECOND | H_PAGE_F_GIX);
+ *
+ * Where the slot number is between 0-15, and values of 8-15 indicate
+ * the secondary bucket. For that code to work H_PAGE_F_SECOND and
+ * H_PAGE_F_GIX must occupy four contiguous bits in the PTE, and
+ * H_PAGE_F_SECOND must be placed above H_PAGE_F_GIX. Assert that here
+ * with a BUILD_BUG_ON().
+ */
+ BUILD_BUG_ON(H_PAGE_F_SECOND != (1ul << (H_PAGE_F_GIX_SHIFT + 3)));
+#endif /* CONFIG_PPC_64K_PAGES */
+
+ htab_init_page_sizes();
+
+ /*
+ * initialize page table size
+ */
+ __pte_frag_nr = H_PTE_FRAG_NR;
+ __pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT;
+ __pmd_frag_nr = H_PMD_FRAG_NR;
+ __pmd_frag_size_shift = H_PMD_FRAG_SIZE_SHIFT;
+
+ __pte_index_size = H_PTE_INDEX_SIZE;
+ __pmd_index_size = H_PMD_INDEX_SIZE;
+ __pud_index_size = H_PUD_INDEX_SIZE;
+ __pgd_index_size = H_PGD_INDEX_SIZE;
+ __pud_cache_index = H_PUD_CACHE_INDEX;
+ __pte_table_size = H_PTE_TABLE_SIZE;
+ __pmd_table_size = H_PMD_TABLE_SIZE;
+ __pud_table_size = H_PUD_TABLE_SIZE;
+ __pgd_table_size = H_PGD_TABLE_SIZE;
+ __pmd_val_bits = HASH_PMD_VAL_BITS;
+ __pud_val_bits = HASH_PUD_VAL_BITS;
+ __pgd_val_bits = HASH_PGD_VAL_BITS;
+
+ __kernel_virt_start = H_KERN_VIRT_START;
+ __vmalloc_start = H_VMALLOC_START;
+ __vmalloc_end = H_VMALLOC_END;
+ __kernel_io_start = H_KERN_IO_START;
+ __kernel_io_end = H_KERN_IO_END;
+ vmemmap = (struct page *)H_VMEMMAP_START;
+ ioremap_bot = IOREMAP_BASE;
+
+#ifdef CONFIG_PCI
+ pci_io_base = ISA_IO_BASE;
+#endif
+
+ /* Select appropriate backend */
+ if (firmware_has_feature(FW_FEATURE_PS3_LV1))
+ ps3_early_mm_init();
+ else if (firmware_has_feature(FW_FEATURE_LPAR))
+ hpte_init_pseries();
+ else if (IS_ENABLED(CONFIG_PPC_HASH_MMU_NATIVE))
+ hpte_init_native();
+
+ if (!mmu_hash_ops.hpte_insert)
+ panic("hash__early_init_mmu: No MMU hash ops defined!\n");
+
+ /*
+ * Initialize the MMU Hash table and create the linear mapping
+ * of memory. Has to be done before SLB initialization as this is
+ * currently where the page size encoding is obtained.
+ */
+ htab_initialize();
+
+ init_mm.context.hash_context = &init_hash_mm_context;
+ mm_ctx_set_slb_addr_limit(&init_mm.context, SLB_ADDR_LIMIT_DEFAULT);
+
+ pr_info("Initializing hash mmu with SLB\n");
+ /* Initialize SLB management */
+ slb_initialize();
+
+ if (cpu_has_feature(CPU_FTR_ARCH_206)
+ && cpu_has_feature(CPU_FTR_HVMODE))
+ tlbiel_all();
+}
+
+#ifdef CONFIG_SMP
+void hash__early_init_mmu_secondary(void)
+{
+ /* Initialize hash table for that CPU */
+ if (!firmware_has_feature(FW_FEATURE_LPAR)) {
+
+ if (!cpu_has_feature(CPU_FTR_ARCH_300))
+ mtspr(SPRN_SDR1, _SDR1);
+ else
+ set_ptcr_when_no_uv(__pa(partition_tb) |
+ (PATB_SIZE_SHIFT - 12));
+ }
+ /* Initialize SLB */
+ slb_initialize();
+
+ if (cpu_has_feature(CPU_FTR_ARCH_206)
+ && cpu_has_feature(CPU_FTR_HVMODE))
+ tlbiel_all();
+
+#ifdef CONFIG_PPC_MEM_KEYS
+ if (mmu_has_feature(MMU_FTR_PKEY))
+ mtspr(SPRN_UAMOR, default_uamor);
+#endif
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * Called by asm hashtable.S for doing lazy icache flush
+ */
+unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
+{
+ struct folio *folio;
+
+ if (!pfn_valid(pte_pfn(pte)))
+ return pp;
+
+ folio = page_folio(pte_page(pte));
+
+ /* page is dirty */
+ if (!test_bit(PG_dcache_clean, &folio->flags.f) &&
+ !folio_test_reserved(folio)) {
+ if (trap == INTERRUPT_INST_STORAGE) {
+ flush_dcache_icache_folio(folio);
+ set_bit(PG_dcache_clean, &folio->flags.f);
+ } else
+ pp |= HPTE_R_N;
+ }
+ return pp;
+}
+
+static unsigned int get_paca_psize(unsigned long addr)
+{
+ unsigned char *psizes;
+ unsigned long index, mask_index;
+
+ if (addr < SLICE_LOW_TOP) {
+ psizes = get_paca()->mm_ctx_low_slices_psize;
+ index = GET_LOW_SLICE_INDEX(addr);
+ } else {
+ psizes = get_paca()->mm_ctx_high_slices_psize;
+ index = GET_HIGH_SLICE_INDEX(addr);
+ }
+ mask_index = index & 0x1;
+ return (psizes[index >> 1] >> (mask_index * 4)) & 0xF;
+}
+
+
+/*
+ * Demote a segment to using 4k pages.
+ * For now this makes the whole process use 4k pages.
+ */
+#ifdef CONFIG_PPC_64K_PAGES
+void demote_segment_4k(struct mm_struct *mm, unsigned long addr)
+{
+ if (get_slice_psize(mm, addr) == MMU_PAGE_4K)
+ return;
+ slice_set_range_psize(mm, addr, 1, MMU_PAGE_4K);
+#ifdef CONFIG_SPU_BASE
+ spu_flush_all_slbs(mm);
+#endif
+ if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) {
+
+ copy_mm_to_paca(mm);
+ slb_flush_and_restore_bolted();
+ }
+}
+#endif /* CONFIG_PPC_64K_PAGES */
+
+#ifdef CONFIG_PPC_SUBPAGE_PROT
+/*
+ * This looks up a 2-bit protection code for a 4k subpage of a 64k page.
+ * Userspace sets the subpage permissions using the subpage_prot system call.
+ *
+ * Result is 0: full permissions, _PAGE_RW: read-only,
+ * _PAGE_RWX: no access.
+ */
+static int subpage_protection(struct mm_struct *mm, unsigned long ea)
+{
+ struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context);
+ u32 spp = 0;
+ u32 **sbpm, *sbpp;
+
+ if (!spt)
+ return 0;
+
+ if (ea >= spt->maxaddr)
+ return 0;
+ if (ea < 0x100000000UL) {
+ /* addresses below 4GB use spt->low_prot */
+ sbpm = spt->low_prot;
+ } else {
+ sbpm = spt->protptrs[ea >> SBP_L3_SHIFT];
+ if (!sbpm)
+ return 0;
+ }
+ sbpp = sbpm[(ea >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)];
+ if (!sbpp)
+ return 0;
+ spp = sbpp[(ea >> PAGE_SHIFT) & (SBP_L1_COUNT - 1)];
+
+ /* extract 2-bit bitfield for this 4k subpage */
+ spp >>= 30 - 2 * ((ea >> 12) & 0xf);
+
+ /*
+ * 0 -> full permission
+ * 1 -> Read only
+ * 2 -> no access.
+ * We return the flag that need to be cleared.
+ */
+ spp = ((spp & 2) ? _PAGE_RWX : 0) | ((spp & 1) ? _PAGE_WRITE : 0);
+ return spp;
+}
+
+#else /* CONFIG_PPC_SUBPAGE_PROT */
+static inline int subpage_protection(struct mm_struct *mm, unsigned long ea)
+{
+ return 0;
+}
+#endif
+
+void hash_failure_debug(unsigned long ea, unsigned long access,
+ unsigned long vsid, unsigned long trap,
+ int ssize, int psize, int lpsize, unsigned long pte)
+{
+ if (!printk_ratelimit())
+ return;
+ pr_info("mm: Hashing failure ! EA=0x%lx access=0x%lx current=%s\n",
+ ea, access, current->comm);
+ pr_info(" trap=0x%lx vsid=0x%lx ssize=%d base psize=%d psize %d pte=0x%lx\n",
+ trap, vsid, ssize, psize, lpsize, pte);
+}
+
+static void check_paca_psize(unsigned long ea, struct mm_struct *mm,
+ int psize, bool user_region)
+{
+ if (user_region) {
+ if (psize != get_paca_psize(ea)) {
+ copy_mm_to_paca(mm);
+ slb_flush_and_restore_bolted();
+ }
+ } else if (get_paca()->vmalloc_sllp !=
+ mmu_psize_defs[mmu_vmalloc_psize].sllp) {
+ get_paca()->vmalloc_sllp =
+ mmu_psize_defs[mmu_vmalloc_psize].sllp;
+ slb_vmalloc_update();
+ }
+}
+
+/*
+ * Result code is:
+ * 0 - handled
+ * 1 - normal page fault
+ * -1 - critical hash insertion error
+ * -2 - access not permitted by subpage protection mechanism
+ */
+int hash_page_mm(struct mm_struct *mm, unsigned long ea,
+ unsigned long access, unsigned long trap,
+ unsigned long flags)
+{
+ bool is_thp;
+ pgd_t *pgdir;
+ unsigned long vsid;
+ pte_t *ptep;
+ unsigned hugeshift;
+ int rc, user_region = 0;
+ int psize, ssize;
+
+ DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n",
+ ea, access, trap);
+ trace_hash_fault(ea, access, trap);
+
+ /* Get region & vsid */
+ switch (get_region_id(ea)) {
+ case USER_REGION_ID:
+ user_region = 1;
+ if (! mm) {
+ DBG_LOW(" user region with no mm !\n");
+ rc = 1;
+ goto bail;
+ }
+ psize = get_slice_psize(mm, ea);
+ ssize = user_segment_size(ea);
+ vsid = get_user_vsid(&mm->context, ea, ssize);
+ break;
+ case VMALLOC_REGION_ID:
+ vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
+ psize = mmu_vmalloc_psize;
+ ssize = mmu_kernel_ssize;
+ flags |= HPTE_USE_KERNEL_KEY;
+ break;
+
+ case IO_REGION_ID:
+ vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
+ psize = mmu_io_psize;
+ ssize = mmu_kernel_ssize;
+ flags |= HPTE_USE_KERNEL_KEY;
+ break;
+ default:
+ /*
+ * Not a valid range
+ * Send the problem up to do_page_fault()
+ */
+ rc = 1;
+ goto bail;
+ }
+ DBG_LOW(" mm=%p, mm->pgdir=%p, vsid=%016lx\n", mm, mm->pgd, vsid);
+
+ /* Bad address. */
+ if (!vsid) {
+ DBG_LOW("Bad address!\n");
+ rc = 1;
+ goto bail;
+ }
+ /* Get pgdir */
+ pgdir = mm->pgd;
+ if (pgdir == NULL) {
+ rc = 1;
+ goto bail;
+ }
+
+ /* Check CPU locality */
+ if (user_region && mm_is_thread_local(mm))
+ flags |= HPTE_LOCAL_UPDATE;
+
+#ifndef CONFIG_PPC_64K_PAGES
+ /*
+ * If we use 4K pages and our psize is not 4K, then we might
+ * be hitting a special driver mapping, and need to align the
+ * address before we fetch the PTE.
+ *
+ * It could also be a hugepage mapping, in which case this is
+ * not necessary, but it's not harmful, either.
+ */
+ if (psize != MMU_PAGE_4K)
+ ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
+#endif /* CONFIG_PPC_64K_PAGES */
+
+ /* Get PTE and page size from page tables */
+ ptep = find_linux_pte(pgdir, ea, &is_thp, &hugeshift);
+ if (ptep == NULL || !pte_present(*ptep)) {
+ DBG_LOW(" no PTE !\n");
+ rc = 1;
+ goto bail;
+ }
+
+ if (IS_ENABLED(CONFIG_PPC_4K_PAGES) && !radix_enabled()) {
+ if (hugeshift == PMD_SHIFT && psize == MMU_PAGE_16M)
+ hugeshift = mmu_psize_defs[MMU_PAGE_16M].shift;
+ if (hugeshift == PUD_SHIFT && psize == MMU_PAGE_16G)
+ hugeshift = mmu_psize_defs[MMU_PAGE_16G].shift;
+ }
+
+ /*
+ * Add _PAGE_PRESENT to the required access perm. If there are parallel
+ * updates to the pte that can possibly clear _PAGE_PTE, catch that too.
+ *
+ * We can safely use the return pte address in rest of the function
+ * because we do set H_PAGE_BUSY which prevents further updates to pte
+ * from generic code.
+ */
+ access |= _PAGE_PRESENT | _PAGE_PTE;
+
+ /*
+ * Pre-check access permissions (will be re-checked atomically
+ * in __hash_page_XX but this pre-check is a fast path
+ */
+ if (!check_pte_access(access, pte_val(*ptep))) {
+ DBG_LOW(" no access !\n");
+ rc = 1;
+ goto bail;
+ }
+
+ if (hugeshift) {
+ if (is_thp)
+ rc = __hash_page_thp(ea, access, vsid, (pmd_t *)ptep,
+ trap, flags, ssize, psize);
+#ifdef CONFIG_HUGETLB_PAGE
+ else
+ rc = __hash_page_huge(ea, access, vsid, ptep, trap,
+ flags, ssize, hugeshift, psize);
+#else
+ else {
+ /*
+ * if we have hugeshift, and is not transhuge with
+ * hugetlb disabled, something is really wrong.
+ */
+ rc = 1;
+ WARN_ON(1);
+ }
+#endif
+ if (current->mm == mm)
+ check_paca_psize(ea, mm, psize, user_region);
+
+ goto bail;
+ }
+
+#ifndef CONFIG_PPC_64K_PAGES
+ DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep));
+#else
+ DBG_LOW(" i-pte: %016lx %016lx\n", pte_val(*ptep),
+ pte_val(*(ptep + PTRS_PER_PTE)));
+#endif
+ /* Do actual hashing */
+#ifdef CONFIG_PPC_64K_PAGES
+ /* If H_PAGE_4K_PFN is set, make sure this is a 4k segment */
+ if ((pte_val(*ptep) & H_PAGE_4K_PFN) && psize == MMU_PAGE_64K) {
+ demote_segment_4k(mm, ea);
+ psize = MMU_PAGE_4K;
+ }
+
+ /*
+ * If this PTE is non-cacheable and we have restrictions on
+ * using non cacheable large pages, then we switch to 4k
+ */
+ if (mmu_ci_restrictions && psize == MMU_PAGE_64K && pte_ci(*ptep)) {
+ if (user_region) {
+ demote_segment_4k(mm, ea);
+ psize = MMU_PAGE_4K;
+ } else if (ea < VMALLOC_END) {
+ /*
+ * some driver did a non-cacheable mapping
+ * in vmalloc space, so switch vmalloc
+ * to 4k pages
+ */
+ pr_alert("Reducing vmalloc segment "
+ "to 4kB pages because of "
+ "non-cacheable mapping\n");
+ psize = mmu_vmalloc_psize = MMU_PAGE_4K;
+#ifdef CONFIG_SPU_BASE
+ spu_flush_all_slbs(mm);
+#endif
+ }
+ }
+
+#endif /* CONFIG_PPC_64K_PAGES */
+
+ if (current->mm == mm)
+ check_paca_psize(ea, mm, psize, user_region);
+
+#ifdef CONFIG_PPC_64K_PAGES
+ if (psize == MMU_PAGE_64K)
+ rc = __hash_page_64K(ea, access, vsid, ptep, trap,
+ flags, ssize);
+ else
+#endif /* CONFIG_PPC_64K_PAGES */
+ {
+ int spp = subpage_protection(mm, ea);
+ if (access & spp)
+ rc = -2;
+ else
+ rc = __hash_page_4K(ea, access, vsid, ptep, trap,
+ flags, ssize, spp);
+ }
+
+ /*
+ * Dump some info in case of hash insertion failure, they should
+ * never happen so it is really useful to know if/when they do
+ */
+ if (rc == -1)
+ hash_failure_debug(ea, access, vsid, trap, ssize, psize,
+ psize, pte_val(*ptep));
+#ifndef CONFIG_PPC_64K_PAGES
+ DBG_LOW(" o-pte: %016lx\n", pte_val(*ptep));
+#else
+ DBG_LOW(" o-pte: %016lx %016lx\n", pte_val(*ptep),
+ pte_val(*(ptep + PTRS_PER_PTE)));
+#endif
+ DBG_LOW(" -> rc=%d\n", rc);
+
+bail:
+ return rc;
+}
+EXPORT_SYMBOL_GPL(hash_page_mm);
+
+int hash_page(unsigned long ea, unsigned long access, unsigned long trap,
+ unsigned long dsisr)
+{
+ unsigned long flags = 0;
+ struct mm_struct *mm = current->mm;
+
+ if ((get_region_id(ea) == VMALLOC_REGION_ID) ||
+ (get_region_id(ea) == IO_REGION_ID))
+ mm = &init_mm;
+
+ if (dsisr & DSISR_NOHPTE)
+ flags |= HPTE_NOHPTE_UPDATE;
+
+ return hash_page_mm(mm, ea, access, trap, flags);
+}
+EXPORT_SYMBOL_GPL(hash_page);
+
+DEFINE_INTERRUPT_HANDLER(do_hash_fault)
+{
+ unsigned long ea = regs->dar;
+ unsigned long dsisr = regs->dsisr;
+ unsigned long access = _PAGE_PRESENT | _PAGE_READ;
+ unsigned long flags = 0;
+ struct mm_struct *mm;
+ unsigned int region_id;
+ long err;
+
+ if (unlikely(dsisr & (DSISR_BAD_FAULT_64S | DSISR_KEYFAULT))) {
+ hash__do_page_fault(regs);
+ return;
+ }
+
+ region_id = get_region_id(ea);
+ if ((region_id == VMALLOC_REGION_ID) || (region_id == IO_REGION_ID))
+ mm = &init_mm;
+ else
+ mm = current->mm;
+
+ if (dsisr & DSISR_NOHPTE)
+ flags |= HPTE_NOHPTE_UPDATE;
+
+ if (dsisr & DSISR_ISSTORE)
+ access |= _PAGE_WRITE;
+ /*
+ * We set _PAGE_PRIVILEGED only when
+ * kernel mode access kernel space.
+ *
+ * _PAGE_PRIVILEGED is NOT set
+ * 1) when kernel mode access user space
+ * 2) user space access kernel space.
+ */
+ access |= _PAGE_PRIVILEGED;
+ if (user_mode(regs) || (region_id == USER_REGION_ID))
+ access &= ~_PAGE_PRIVILEGED;
+
+ if (TRAP(regs) == INTERRUPT_INST_STORAGE)
+ access |= _PAGE_EXEC;
+
+ err = hash_page_mm(mm, ea, access, TRAP(regs), flags);
+ if (unlikely(err < 0)) {
+ // failed to insert a hash PTE due to an hypervisor error
+ if (user_mode(regs)) {
+ if (IS_ENABLED(CONFIG_PPC_SUBPAGE_PROT) && err == -2)
+ _exception(SIGSEGV, regs, SEGV_ACCERR, ea);
+ else
+ _exception(SIGBUS, regs, BUS_ADRERR, ea);
+ } else {
+ bad_page_fault(regs, SIGBUS);
+ }
+ err = 0;
+
+ } else if (err) {
+ hash__do_page_fault(regs);
+ }
+}
+
+static bool should_hash_preload(struct mm_struct *mm, unsigned long ea)
+{
+ int psize = get_slice_psize(mm, ea);
+
+ /* We only prefault standard pages for now */
+ if (unlikely(psize != mm_ctx_user_psize(&mm->context)))
+ return false;
+
+ /*
+ * Don't prefault if subpage protection is enabled for the EA.
+ */
+ if (unlikely((psize == MMU_PAGE_4K) && subpage_protection(mm, ea)))
+ return false;
+
+ return true;
+}
+
+static void hash_preload(struct mm_struct *mm, pte_t *ptep, unsigned long ea,
+ bool is_exec, unsigned long trap)
+{
+ unsigned long vsid;
+ pgd_t *pgdir;
+ int rc, ssize, update_flags = 0;
+ unsigned long access = _PAGE_PRESENT | _PAGE_READ | (is_exec ? _PAGE_EXEC : 0);
+ unsigned long flags;
+
+ BUG_ON(get_region_id(ea) != USER_REGION_ID);
+
+ if (!should_hash_preload(mm, ea))
+ return;
+
+ DBG_LOW("hash_preload(mm=%p, mm->pgdir=%p, ea=%016lx, access=%lx,"
+ " trap=%lx\n", mm, mm->pgd, ea, access, trap);
+
+ /* Get Linux PTE if available */
+ pgdir = mm->pgd;
+ if (pgdir == NULL)
+ return;
+
+ /* Get VSID */
+ ssize = user_segment_size(ea);
+ vsid = get_user_vsid(&mm->context, ea, ssize);
+ if (!vsid)
+ return;
+
+#ifdef CONFIG_PPC_64K_PAGES
+ /* If either H_PAGE_4K_PFN or cache inhibited is set (and we are on
+ * a 64K kernel), then we don't preload, hash_page() will take
+ * care of it once we actually try to access the page.
+ * That way we don't have to duplicate all of the logic for segment
+ * page size demotion here
+ * Called with PTL held, hence can be sure the value won't change in
+ * between.
+ */
+ if ((pte_val(*ptep) & H_PAGE_4K_PFN) || pte_ci(*ptep))
+ return;
+#endif /* CONFIG_PPC_64K_PAGES */
+
+ /*
+ * __hash_page_* must run with interrupts off, including PMI interrupts
+ * off, as it sets the H_PAGE_BUSY bit.
+ *
+ * It's otherwise possible for perf interrupts to hit at any time and
+ * may take a hash fault reading the user stack, which could take a
+ * hash miss and deadlock on the same H_PAGE_BUSY bit.
+ *
+ * Interrupts must also be off for the duration of the
+ * mm_is_thread_local test and update, to prevent preempt running the
+ * mm on another CPU (XXX: this may be racy vs kthread_use_mm).
+ */
+ powerpc_local_irq_pmu_save(flags);
+
+ /* Is that local to this CPU ? */
+ if (mm_is_thread_local(mm))
+ update_flags |= HPTE_LOCAL_UPDATE;
+
+ /* Hash it in */
+#ifdef CONFIG_PPC_64K_PAGES
+ if (mm_ctx_user_psize(&mm->context) == MMU_PAGE_64K)
+ rc = __hash_page_64K(ea, access, vsid, ptep, trap,
+ update_flags, ssize);
+ else
+#endif /* CONFIG_PPC_64K_PAGES */
+ rc = __hash_page_4K(ea, access, vsid, ptep, trap, update_flags,
+ ssize, subpage_protection(mm, ea));
+
+ /* Dump some info in case of hash insertion failure, they should
+ * never happen so it is really useful to know if/when they do
+ */
+ if (rc == -1)
+ hash_failure_debug(ea, access, vsid, trap, ssize,
+ mm_ctx_user_psize(&mm->context),
+ mm_ctx_user_psize(&mm->context),
+ pte_val(*ptep));
+
+ powerpc_local_irq_pmu_restore(flags);
+}
+
+/*
+ * This is called at the end of handling a user page fault, when the
+ * fault has been handled by updating a PTE in the linux page tables.
+ * We use it to preload an HPTE into the hash table corresponding to
+ * the updated linux PTE.
+ *
+ * This must always be called with the pte lock held.
+ */
+void __update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
+ pte_t *ptep)
+{
+ /*
+ * We don't need to worry about _PAGE_PRESENT here because we are
+ * called with either mm->page_table_lock held or ptl lock held
+ */
+ unsigned long trap;
+ bool is_exec;
+
+ /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
+ if (!pte_young(*ptep) || address >= TASK_SIZE)
+ return;
+
+ /*
+ * We try to figure out if we are coming from an instruction
+ * access fault and pass that down to __hash_page so we avoid
+ * double-faulting on execution of fresh text. We have to test
+ * for regs NULL since init will get here first thing at boot.
+ *
+ * We also avoid filling the hash if not coming from a fault.
+ */
+
+ trap = current->thread.regs ? TRAP(current->thread.regs) : 0UL;
+ switch (trap) {
+ case 0x300:
+ is_exec = false;
+ break;
+ case 0x400:
+ is_exec = true;
+ break;
+ default:
+ return;
+ }
+
+ hash_preload(vma->vm_mm, ptep, address, is_exec, trap);
+}
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+static inline void tm_flush_hash_page(int local)
+{
+ /*
+ * Transactions are not aborted by tlbiel, only tlbie. Without, syncing a
+ * page back to a block device w/PIO could pick up transactional data
+ * (bad!) so we force an abort here. Before the sync the page will be
+ * made read-only, which will flush_hash_page. BIG ISSUE here: if the
+ * kernel uses a page from userspace without unmapping it first, it may
+ * see the speculated version.
+ */
+ if (local && cpu_has_feature(CPU_FTR_TM) && current->thread.regs &&
+ MSR_TM_ACTIVE(current->thread.regs->msr)) {
+ tm_enable();
+ tm_abort(TM_CAUSE_TLBI);
+ }
+}
+#else
+static inline void tm_flush_hash_page(int local)
+{
+}
+#endif
+
+/*
+ * Return the global hash slot, corresponding to the given PTE, which contains
+ * the HPTE.
+ */
+unsigned long pte_get_hash_gslot(unsigned long vpn, unsigned long shift,
+ int ssize, real_pte_t rpte, unsigned int subpg_index)
+{
+ unsigned long hash, gslot, hidx;
+
+ hash = hpt_hash(vpn, shift, ssize);
+ hidx = __rpte_to_hidx(rpte, subpg_index);
+ if (hidx & _PTEIDX_SECONDARY)
+ hash = ~hash;
+ gslot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ gslot += hidx & _PTEIDX_GROUP_IX;
+ return gslot;
+}
+
+void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize,
+ unsigned long flags)
+{
+ unsigned long index, shift, gslot;
+ int local = flags & HPTE_LOCAL_UPDATE;
+
+ DBG_LOW("flush_hash_page(vpn=%016lx)\n", vpn);
+ pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
+ gslot = pte_get_hash_gslot(vpn, shift, ssize, pte, index);
+ DBG_LOW(" sub %ld: gslot=%lx\n", index, gslot);
+ /*
+ * We use same base page size and actual psize, because we don't
+ * use these functions for hugepage
+ */
+ mmu_hash_ops.hpte_invalidate(gslot, vpn, psize, psize,
+ ssize, local);
+ } pte_iterate_hashed_end();
+
+ tm_flush_hash_page(local);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+void flush_hash_hugepage(unsigned long vsid, unsigned long addr,
+ pmd_t *pmdp, unsigned int psize, int ssize,
+ unsigned long flags)
+{
+ int i, max_hpte_count, valid;
+ unsigned long s_addr;
+ unsigned char *hpte_slot_array;
+ unsigned long hidx, shift, vpn, hash, slot;
+ int local = flags & HPTE_LOCAL_UPDATE;
+
+ s_addr = addr & HPAGE_PMD_MASK;
+ hpte_slot_array = get_hpte_slot_array(pmdp);
+ /*
+ * IF we try to do a HUGE PTE update after a withdraw is done.
+ * we will find the below NULL. This happens when we do
+ * split_huge_pmd
+ */
+ if (!hpte_slot_array)
+ return;
+
+ if (mmu_hash_ops.hugepage_invalidate) {
+ mmu_hash_ops.hugepage_invalidate(vsid, s_addr, hpte_slot_array,
+ psize, ssize, local);
+ goto tm_abort;
+ }
+ /*
+ * No bluk hpte removal support, invalidate each entry
+ */
+ shift = mmu_psize_defs[psize].shift;
+ max_hpte_count = HPAGE_PMD_SIZE >> shift;
+ for (i = 0; i < max_hpte_count; i++) {
+ /*
+ * 8 bits per each hpte entries
+ * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
+ */
+ valid = hpte_valid(hpte_slot_array, i);
+ if (!valid)
+ continue;
+ hidx = hpte_hash_index(hpte_slot_array, i);
+
+ /* get the vpn */
+ addr = s_addr + (i * (1ul << shift));
+ vpn = hpt_vpn(addr, vsid, ssize);
+ hash = hpt_hash(vpn, shift, ssize);
+ if (hidx & _PTEIDX_SECONDARY)
+ hash = ~hash;
+
+ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot += hidx & _PTEIDX_GROUP_IX;
+ mmu_hash_ops.hpte_invalidate(slot, vpn, psize,
+ MMU_PAGE_16M, ssize, local);
+ }
+tm_abort:
+ tm_flush_hash_page(local);
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+void flush_hash_range(unsigned long number, int local)
+{
+ if (mmu_hash_ops.flush_hash_range)
+ mmu_hash_ops.flush_hash_range(number, local);
+ else {
+ int i;
+ struct ppc64_tlb_batch *batch =
+ this_cpu_ptr(&ppc64_tlb_batch);
+
+ for (i = 0; i < number; i++)
+ flush_hash_page(batch->vpn[i], batch->pte[i],
+ batch->psize, batch->ssize, local);
+ }
+}
+
+long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
+ unsigned long pa, unsigned long rflags,
+ unsigned long vflags, int psize, int ssize)
+{
+ unsigned long hpte_group;
+ long slot;
+
+repeat:
+ hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+
+ /* Insert into the hash table, primary slot */
+ slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, vflags,
+ psize, psize, ssize);
+
+ /* Primary is full, try the secondary */
+ if (unlikely(slot == -1)) {
+ hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags,
+ vflags | HPTE_V_SECONDARY,
+ psize, psize, ssize);
+ if (slot == -1) {
+ if (mftb() & 0x1)
+ hpte_group = (hash & htab_hash_mask) *
+ HPTES_PER_GROUP;
+
+ mmu_hash_ops.hpte_remove(hpte_group);
+ goto repeat;
+ }
+ }
+
+ return slot;
+}
+
+void hpt_clear_stress(void)
+{
+ int cpu = raw_smp_processor_id();
+ int g;
+
+ for (g = 0; g < stress_nr_groups(); g++) {
+ unsigned long last_group;
+ last_group = stress_hpt_struct[cpu].last_group[g];
+
+ if (last_group != -1UL) {
+ int i;
+ for (i = 0; i < HPTES_PER_GROUP; i++) {
+ if (mmu_hash_ops.hpte_remove(last_group) == -1)
+ break;
+ }
+ stress_hpt_struct[cpu].last_group[g] = -1;
+ }
+ }
+}
+
+void hpt_do_stress(unsigned long ea, unsigned long hpte_group)
+{
+ unsigned long last_group;
+ int cpu = raw_smp_processor_id();
+
+ last_group = stress_hpt_struct[cpu].last_group[stress_nr_groups() - 1];
+ if (hpte_group == last_group)
+ return;
+
+ if (last_group != -1UL) {
+ int i;
+ /*
+ * Concurrent CPUs might be inserting into this group, so
+ * give up after a number of iterations, to prevent a live
+ * lock.
+ */
+ for (i = 0; i < HPTES_PER_GROUP; i++) {
+ if (mmu_hash_ops.hpte_remove(last_group) == -1)
+ break;
+ }
+ stress_hpt_struct[cpu].last_group[stress_nr_groups() - 1] = -1;
+ }
+
+ if (ea >= PAGE_OFFSET) {
+ /*
+ * We would really like to prefetch to get the TLB loaded, then
+ * remove the PTE before returning from fault interrupt, to
+ * increase the hash fault rate.
+ *
+ * Unfortunately QEMU TCG does not model the TLB in a way that
+ * makes this possible, and systemsim (mambo) emulator does not
+ * bring in TLBs with prefetches (although loads/stores do
+ * work for non-CI PTEs).
+ *
+ * So remember this PTE and clear it on the next hash fault.
+ */
+ memmove(&stress_hpt_struct[cpu].last_group[1],
+ &stress_hpt_struct[cpu].last_group[0],
+ (stress_nr_groups() - 1) * sizeof(unsigned long));
+ stress_hpt_struct[cpu].last_group[0] = hpte_group;
+ }
+}
+
+void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base,
+ phys_addr_t first_memblock_size)
+{
+ /*
+ * We don't currently support the first MEMBLOCK not mapping 0
+ * physical on those processors
+ */
+ BUG_ON(first_memblock_base != 0);
+
+ /*
+ * On virtualized systems the first entry is our RMA region aka VRMA,
+ * non-virtualized 64-bit hash MMU systems don't have a limitation
+ * on real mode access.
+ *
+ * For guests on platforms before POWER9, we clamp the it limit to 1G
+ * to avoid some funky things such as RTAS bugs etc...
+ *
+ * On POWER9 we limit to 1TB in case the host erroneously told us that
+ * the RMA was >1TB. Effective address bits 0:23 are treated as zero
+ * (meaning the access is aliased to zero i.e. addr = addr % 1TB)
+ * for virtual real mode addressing and so it doesn't make sense to
+ * have an area larger than 1TB as it can't be addressed.
+ */
+ if (!early_cpu_has_feature(CPU_FTR_HVMODE)) {
+ ppc64_rma_size = first_memblock_size;
+ if (!early_cpu_has_feature(CPU_FTR_ARCH_300))
+ ppc64_rma_size = min_t(u64, ppc64_rma_size, 0x40000000);
+ else
+ ppc64_rma_size = min_t(u64, ppc64_rma_size,
+ 1UL << SID_SHIFT_1T);
+
+ /* Finally limit subsequent allocations */
+ memblock_set_current_limit(ppc64_rma_size);
+ } else {
+ ppc64_rma_size = ULONG_MAX;
+ }
+}
+
+#ifdef CONFIG_DEBUG_FS
+
+static int hpt_order_get(void *data, u64 *val)
+{
+ *val = ppc64_pft_size;
+ return 0;
+}
+
+static int hpt_order_set(void *data, u64 val)
+{
+ int ret;
+
+ if (!mmu_hash_ops.resize_hpt)
+ return -ENODEV;
+
+ cpus_read_lock();
+ ret = mmu_hash_ops.resize_hpt(val);
+ cpus_read_unlock();
+
+ return ret;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n");
+
+static int __init hash64_debugfs(void)
+{
+ if (radix_enabled())
+ return 0;
+ debugfs_create_file("hpt_order", 0600, arch_debugfs_dir, NULL,
+ &fops_hpt_order);
+ return 0;
+}
+machine_device_initcall(pseries, hash64_debugfs);
+#endif /* CONFIG_DEBUG_FS */
+
+void __init print_system_hash_info(void)
+{
+ pr_info("ppc64_pft_size = 0x%llx\n", ppc64_pft_size);
+
+ if (htab_hash_mask)
+ pr_info("htab_hash_mask = 0x%lx\n", htab_hash_mask);
+}
+
+unsigned long arch_randomize_brk(struct mm_struct *mm)
+{
+ /*
+ * If we are using 1TB segments and we are allowed to randomise
+ * the heap, we can put it above 1TB so it is backed by a 1TB
+ * segment. Otherwise the heap will be in the bottom 1TB
+ * which always uses 256MB segments and this may result in a
+ * performance penalty.
+ */
+ if (is_32bit_task())
+ return randomize_page(mm->brk, SZ_32M);
+ else if (!radix_enabled() && mmu_highuser_ssize == MMU_SEGSIZE_1T)
+ return randomize_page(max_t(unsigned long, mm->brk, SZ_1T), SZ_1G);
+ else
+ return randomize_page(mm->brk, SZ_1G);
+}
diff --git a/arch/powerpc/mm/book3s64/hugetlbpage.c b/arch/powerpc/mm/book3s64/hugetlbpage.c
new file mode 100644
index 000000000000..2bcbbf9d85ac
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/hugetlbpage.c
@@ -0,0 +1,177 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * PPC64 Huge TLB Page Support for hash based MMUs (POWER4 and later)
+ *
+ * Copyright (C) 2003 David Gibson, IBM Corporation.
+ *
+ * Based on the IA-32 version:
+ * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
+ */
+
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <asm/cacheflush.h>
+#include <asm/machdep.h>
+
+unsigned int hpage_shift;
+EXPORT_SYMBOL(hpage_shift);
+
+#ifdef CONFIG_PPC_64S_HASH_MMU
+int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
+ pte_t *ptep, unsigned long trap, unsigned long flags,
+ int ssize, unsigned int shift, unsigned int mmu_psize)
+{
+ real_pte_t rpte;
+ unsigned long vpn;
+ unsigned long old_pte, new_pte;
+ unsigned long rflags, pa;
+ long slot, offset;
+
+ BUG_ON(shift != mmu_psize_defs[mmu_psize].shift);
+
+ /* Search the Linux page table for a match with va */
+ vpn = hpt_vpn(ea, vsid, ssize);
+
+ /*
+ * At this point, we have a pte (old_pte) which can be used to build
+ * or update an HPTE. There are 2 cases:
+ *
+ * 1. There is a valid (present) pte with no associated HPTE (this is
+ * the most common case)
+ * 2. There is a valid (present) pte with an associated HPTE. The
+ * current values of the pp bits in the HPTE prevent access
+ * because we are doing software DIRTY bit management and the
+ * page is currently not DIRTY.
+ */
+
+
+ do {
+ old_pte = pte_val(*ptep);
+ /* If PTE busy, retry the access */
+ if (unlikely(old_pte & H_PAGE_BUSY))
+ return 0;
+ /* If PTE permissions don't match, take page fault */
+ if (unlikely(!check_pte_access(access, old_pte)))
+ return 1;
+ /*
+ * If hash-4k, hugepages use seeral contiguous PxD entries
+ * so bail out and let mm make the page young or dirty
+ */
+ if (IS_ENABLED(CONFIG_PPC_4K_PAGES)) {
+ if (!(old_pte & _PAGE_ACCESSED))
+ return 1;
+ if ((access & _PAGE_WRITE) && !(old_pte & _PAGE_DIRTY))
+ return 1;
+ }
+
+ /*
+ * Try to lock the PTE, add ACCESSED and DIRTY if it was
+ * a write access
+ */
+ new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED;
+ if (access & _PAGE_WRITE)
+ new_pte |= _PAGE_DIRTY;
+ } while(!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
+
+ /* Make sure this is a hugetlb entry */
+ if (old_pte & H_PAGE_THP_HUGE)
+ return 0;
+
+ rflags = htab_convert_pte_flags(new_pte, flags);
+ if (unlikely(mmu_psize == MMU_PAGE_16G))
+ offset = PTRS_PER_PUD;
+ else
+ offset = PTRS_PER_PMD;
+ rpte = __real_pte(__pte(old_pte), ptep, offset);
+
+ if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
+ /*
+ * No CPU has hugepages but lacks no execute, so we
+ * don't need to worry about that case
+ */
+ rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
+
+ /* Check if pte already has an hpte (case 2) */
+ if (unlikely(old_pte & H_PAGE_HASHPTE)) {
+ /* There MIGHT be an HPTE for this pte */
+ unsigned long gslot;
+
+ gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, 0);
+ if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, mmu_psize,
+ mmu_psize, ssize, flags) == -1)
+ old_pte &= ~_PAGE_HPTEFLAGS;
+ }
+
+ if (likely(!(old_pte & H_PAGE_HASHPTE))) {
+ unsigned long hash = hpt_hash(vpn, shift, ssize);
+
+ pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
+
+ /* clear HPTE slot informations in new PTE */
+ new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
+
+ slot = hpte_insert_repeating(hash, vpn, pa, rflags, 0,
+ mmu_psize, ssize);
+
+ /*
+ * Hypervisor failure. Restore old pte and return -1
+ * similar to __hash_page_*
+ */
+ if (unlikely(slot == -2)) {
+ *ptep = __pte(old_pte);
+ hash_failure_debug(ea, access, vsid, trap, ssize,
+ mmu_psize, mmu_psize, old_pte);
+ return -1;
+ }
+
+ new_pte |= pte_set_hidx(ptep, rpte, 0, slot, offset);
+ }
+
+ /*
+ * No need to use ldarx/stdcx here
+ */
+ *ptep = __pte(new_pte & ~H_PAGE_BUSY);
+ return 0;
+}
+#endif
+
+pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+ unsigned long pte_val;
+ /*
+ * Clear the _PAGE_PRESENT so that no hardware parallel update is
+ * possible. Also keep the pte_present true so that we don't take
+ * wrong fault.
+ */
+ pte_val = pte_update(vma->vm_mm, addr, ptep,
+ _PAGE_PRESENT, _PAGE_INVALID, 1);
+
+ return __pte(pte_val);
+}
+
+void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, pte_t old_pte, pte_t pte)
+{
+ unsigned long psize;
+
+ if (radix_enabled())
+ return radix__huge_ptep_modify_prot_commit(vma, addr, ptep,
+ old_pte, pte);
+
+ psize = huge_page_size(hstate_vma(vma));
+ set_huge_pte_at(vma->vm_mm, addr, ptep, pte, psize);
+}
+
+void __init hugetlbpage_init_defaultsize(void)
+{
+ /* Set default large page size. Currently, we pick 16M or 1M
+ * depending on what is available
+ */
+ if (mmu_psize_defs[MMU_PAGE_16M].shift)
+ hpage_shift = mmu_psize_defs[MMU_PAGE_16M].shift;
+ else if (mmu_psize_defs[MMU_PAGE_1M].shift)
+ hpage_shift = mmu_psize_defs[MMU_PAGE_1M].shift;
+ else if (mmu_psize_defs[MMU_PAGE_2M].shift)
+ hpage_shift = mmu_psize_defs[MMU_PAGE_2M].shift;
+}
diff --git a/arch/powerpc/mm/book3s64/internal.h b/arch/powerpc/mm/book3s64/internal.h
new file mode 100644
index 000000000000..cad08d83369c
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/internal.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef ARCH_POWERPC_MM_BOOK3S64_INTERNAL_H
+#define ARCH_POWERPC_MM_BOOK3S64_INTERNAL_H
+
+#include <linux/jump_label.h>
+
+extern bool stress_slb_enabled;
+
+DECLARE_STATIC_KEY_FALSE(stress_slb_key);
+
+static inline bool stress_slb(void)
+{
+ return static_branch_unlikely(&stress_slb_key);
+}
+
+extern bool stress_hpt_enabled;
+
+DECLARE_STATIC_KEY_FALSE(stress_hpt_key);
+
+static inline bool stress_hpt(void)
+{
+ return static_branch_unlikely(&stress_hpt_key);
+}
+
+extern bool no_slb_preload;
+DECLARE_STATIC_KEY_FALSE(no_slb_preload_key);
+static inline bool slb_preload_disabled(void)
+{
+ return static_branch_unlikely(&no_slb_preload_key);
+}
+
+void hpt_do_stress(unsigned long ea, unsigned long hpte_group);
+
+void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush);
+
+#endif /* ARCH_POWERPC_MM_BOOK3S64_INTERNAL_H */
diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c
new file mode 100644
index 000000000000..c0e8d597e4cb
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/iommu_api.c
@@ -0,0 +1,402 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * IOMMU helpers in MMU context.
+ *
+ * Copyright (C) 2015 IBM Corp. <aik@ozlabs.ru>
+ */
+
+#include <linux/sched/signal.h>
+#include <linux/slab.h>
+#include <linux/rculist.h>
+#include <linux/vmalloc.h>
+#include <linux/mutex.h>
+#include <linux/migrate.h>
+#include <linux/hugetlb.h>
+#include <linux/swap.h>
+#include <linux/sizes.h>
+#include <linux/mm.h>
+#include <asm/mmu_context.h>
+#include <asm/pte-walk.h>
+#include <linux/mm_inline.h>
+
+static DEFINE_MUTEX(mem_list_mutex);
+
+#define MM_IOMMU_TABLE_GROUP_PAGE_DIRTY 0x1
+#define MM_IOMMU_TABLE_GROUP_PAGE_MASK ~(SZ_4K - 1)
+
+struct mm_iommu_table_group_mem_t {
+ struct list_head next;
+ struct rcu_head rcu;
+ unsigned long used;
+ atomic64_t mapped;
+ unsigned int pageshift;
+ u64 ua; /* userspace address */
+ u64 entries; /* number of entries in hpas/hpages[] */
+ /*
+ * in mm_iommu_get we temporarily use this to store
+ * struct page address.
+ *
+ * We need to convert ua to hpa in real mode. Make it
+ * simpler by storing physical address.
+ */
+ union {
+ struct page **hpages; /* vmalloc'ed */
+ phys_addr_t *hpas;
+ };
+#define MM_IOMMU_TABLE_INVALID_HPA ((uint64_t)-1)
+ u64 dev_hpa; /* Device memory base address */
+};
+
+bool mm_iommu_preregistered(struct mm_struct *mm)
+{
+ return !list_empty(&mm->context.iommu_group_mem_list);
+}
+EXPORT_SYMBOL_GPL(mm_iommu_preregistered);
+
+static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
+ unsigned long entries, unsigned long dev_hpa,
+ struct mm_iommu_table_group_mem_t **pmem)
+{
+ struct mm_iommu_table_group_mem_t *mem, *mem2;
+ long i, ret, locked_entries = 0, pinned = 0;
+ unsigned int pageshift;
+ unsigned long entry, chunk;
+
+ if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) {
+ ret = account_locked_vm(mm, entries, true);
+ if (ret)
+ return ret;
+
+ locked_entries = entries;
+ }
+
+ mem = kzalloc(sizeof(*mem), GFP_KERNEL);
+ if (!mem) {
+ ret = -ENOMEM;
+ goto unlock_exit;
+ }
+
+ if (dev_hpa != MM_IOMMU_TABLE_INVALID_HPA) {
+ mem->pageshift = __ffs(dev_hpa | (entries << PAGE_SHIFT));
+ mem->dev_hpa = dev_hpa;
+ goto good_exit;
+ }
+ mem->dev_hpa = MM_IOMMU_TABLE_INVALID_HPA;
+
+ /*
+ * For a starting point for a maximum page size calculation
+ * we use @ua and @entries natural alignment to allow IOMMU pages
+ * smaller than huge pages but still bigger than PAGE_SIZE.
+ */
+ mem->pageshift = __ffs(ua | (entries << PAGE_SHIFT));
+ mem->hpas = vzalloc(array_size(entries, sizeof(mem->hpas[0])));
+ if (!mem->hpas) {
+ kfree(mem);
+ ret = -ENOMEM;
+ goto unlock_exit;
+ }
+
+ mmap_read_lock(mm);
+ chunk = (1UL << (PAGE_SHIFT + MAX_PAGE_ORDER)) /
+ sizeof(struct vm_area_struct *);
+ chunk = min(chunk, entries);
+ for (entry = 0; entry < entries; entry += chunk) {
+ unsigned long n = min(entries - entry, chunk);
+
+ ret = pin_user_pages(ua + (entry << PAGE_SHIFT), n,
+ FOLL_WRITE | FOLL_LONGTERM,
+ mem->hpages + entry);
+ if (ret == n) {
+ pinned += n;
+ continue;
+ }
+ if (ret > 0)
+ pinned += ret;
+ break;
+ }
+ mmap_read_unlock(mm);
+ if (pinned != entries) {
+ if (!ret)
+ ret = -EFAULT;
+ goto free_exit;
+ }
+
+good_exit:
+ atomic64_set(&mem->mapped, 1);
+ mem->used = 1;
+ mem->ua = ua;
+ mem->entries = entries;
+
+ mutex_lock(&mem_list_mutex);
+
+ list_for_each_entry_rcu(mem2, &mm->context.iommu_group_mem_list, next,
+ lockdep_is_held(&mem_list_mutex)) {
+ /* Overlap? */
+ if ((mem2->ua < (ua + (entries << PAGE_SHIFT))) &&
+ (ua < (mem2->ua +
+ (mem2->entries << PAGE_SHIFT)))) {
+ ret = -EINVAL;
+ mutex_unlock(&mem_list_mutex);
+ goto free_exit;
+ }
+ }
+
+ if (mem->dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) {
+ /*
+ * Allow to use larger than 64k IOMMU pages. Only do that
+ * if we are backed by hugetlb. Skip device memory as it is not
+ * backed with page structs.
+ */
+ pageshift = PAGE_SHIFT;
+ for (i = 0; i < entries; ++i) {
+ struct page *page = mem->hpages[i];
+
+ if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page))
+ pageshift = page_shift(compound_head(page));
+ mem->pageshift = min(mem->pageshift, pageshift);
+ /*
+ * We don't need struct page reference any more, switch
+ * to physical address.
+ */
+ mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
+ }
+ }
+
+ list_add_rcu(&mem->next, &mm->context.iommu_group_mem_list);
+
+ mutex_unlock(&mem_list_mutex);
+
+ *pmem = mem;
+
+ return 0;
+
+free_exit:
+ /* free the references taken */
+ unpin_user_pages(mem->hpages, pinned);
+
+ vfree(mem->hpas);
+ kfree(mem);
+
+unlock_exit:
+ account_locked_vm(mm, locked_entries, false);
+
+ return ret;
+}
+
+long mm_iommu_new(struct mm_struct *mm, unsigned long ua, unsigned long entries,
+ struct mm_iommu_table_group_mem_t **pmem)
+{
+ return mm_iommu_do_alloc(mm, ua, entries, MM_IOMMU_TABLE_INVALID_HPA,
+ pmem);
+}
+EXPORT_SYMBOL_GPL(mm_iommu_new);
+
+long mm_iommu_newdev(struct mm_struct *mm, unsigned long ua,
+ unsigned long entries, unsigned long dev_hpa,
+ struct mm_iommu_table_group_mem_t **pmem)
+{
+ return mm_iommu_do_alloc(mm, ua, entries, dev_hpa, pmem);
+}
+EXPORT_SYMBOL_GPL(mm_iommu_newdev);
+
+static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem)
+{
+ long i;
+ struct page *page = NULL;
+
+ if (!mem->hpas)
+ return;
+
+ for (i = 0; i < mem->entries; ++i) {
+ if (!mem->hpas[i])
+ continue;
+
+ page = pfn_to_page(mem->hpas[i] >> PAGE_SHIFT);
+ if (!page)
+ continue;
+
+ if (mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY)
+ SetPageDirty(page);
+
+ unpin_user_page(page);
+
+ mem->hpas[i] = 0;
+ }
+}
+
+static void mm_iommu_do_free(struct mm_iommu_table_group_mem_t *mem)
+{
+
+ mm_iommu_unpin(mem);
+ vfree(mem->hpas);
+ kfree(mem);
+}
+
+static void mm_iommu_free(struct rcu_head *head)
+{
+ struct mm_iommu_table_group_mem_t *mem = container_of(head,
+ struct mm_iommu_table_group_mem_t, rcu);
+
+ mm_iommu_do_free(mem);
+}
+
+static void mm_iommu_release(struct mm_iommu_table_group_mem_t *mem)
+{
+ list_del_rcu(&mem->next);
+ call_rcu(&mem->rcu, mm_iommu_free);
+}
+
+long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem)
+{
+ long ret = 0;
+ unsigned long unlock_entries = 0;
+
+ mutex_lock(&mem_list_mutex);
+
+ if (mem->used == 0) {
+ ret = -ENOENT;
+ goto unlock_exit;
+ }
+
+ --mem->used;
+ /* There are still users, exit */
+ if (mem->used)
+ goto unlock_exit;
+
+ /* Are there still mappings? */
+ if (atomic64_cmpxchg(&mem->mapped, 1, 0) != 1) {
+ ++mem->used;
+ ret = -EBUSY;
+ goto unlock_exit;
+ }
+
+ if (mem->dev_hpa == MM_IOMMU_TABLE_INVALID_HPA)
+ unlock_entries = mem->entries;
+
+ /* @mapped became 0 so now mappings are disabled, release the region */
+ mm_iommu_release(mem);
+
+unlock_exit:
+ mutex_unlock(&mem_list_mutex);
+
+ account_locked_vm(mm, unlock_entries, false);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_put);
+
+struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm,
+ unsigned long ua, unsigned long size)
+{
+ struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) {
+ if ((mem->ua <= ua) &&
+ (ua + size <= mem->ua +
+ (mem->entries << PAGE_SHIFT))) {
+ ret = mem;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_lookup);
+
+struct mm_iommu_table_group_mem_t *mm_iommu_get(struct mm_struct *mm,
+ unsigned long ua, unsigned long entries)
+{
+ struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
+
+ mutex_lock(&mem_list_mutex);
+
+ list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next,
+ lockdep_is_held(&mem_list_mutex)) {
+ if ((mem->ua == ua) && (mem->entries == entries)) {
+ ret = mem;
+ ++mem->used;
+ break;
+ }
+ }
+
+ mutex_unlock(&mem_list_mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_get);
+
+long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
+ unsigned long ua, unsigned int pageshift, unsigned long *hpa)
+{
+ const long entry = (ua - mem->ua) >> PAGE_SHIFT;
+ u64 *va;
+
+ if (entry >= mem->entries)
+ return -EFAULT;
+
+ if (pageshift > mem->pageshift)
+ return -EFAULT;
+
+ if (!mem->hpas) {
+ *hpa = mem->dev_hpa + (ua - mem->ua);
+ return 0;
+ }
+
+ va = &mem->hpas[entry];
+ *hpa = (*va & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa);
+
+bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa,
+ unsigned int pageshift, unsigned long *size)
+{
+ struct mm_iommu_table_group_mem_t *mem;
+ unsigned long end;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) {
+ if (mem->dev_hpa == MM_IOMMU_TABLE_INVALID_HPA)
+ continue;
+
+ end = mem->dev_hpa + (mem->entries << PAGE_SHIFT);
+ if ((mem->dev_hpa <= hpa) && (hpa < end)) {
+ /*
+ * Since the IOMMU page size might be bigger than
+ * PAGE_SIZE, the amount of preregistered memory
+ * starting from @hpa might be smaller than 1<<pageshift
+ * and the caller needs to distinguish this situation.
+ */
+ *size = min(1UL << pageshift, end - hpa);
+ return true;
+ }
+ }
+ rcu_read_unlock();
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_is_devmem);
+
+long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem)
+{
+ if (atomic64_inc_not_zero(&mem->mapped))
+ return 0;
+
+ /* Last mm_iommu_put() has been called, no more mappings allowed() */
+ return -ENXIO;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_mapped_inc);
+
+void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem)
+{
+ atomic64_add_unless(&mem->mapped, -1, 1);
+}
+EXPORT_SYMBOL_GPL(mm_iommu_mapped_dec);
+
+void mm_iommu_init(struct mm_struct *mm)
+{
+ INIT_LIST_HEAD_RCU(&mm->context.iommu_group_mem_list);
+}
diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c
new file mode 100644
index 000000000000..fb9dcf9ca599
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/mmu_context.c
@@ -0,0 +1,347 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * MMU context allocation for 64-bit kernels.
+ *
+ * Copyright (C) 2004 Anton Blanchard, IBM Corp. <anton@samba.org>
+ */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/pkeys.h>
+#include <linux/spinlock.h>
+#include <linux/idr.h>
+#include <linux/export.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/cpu.h>
+
+#include <asm/mmu_context.h>
+#include <asm/pgalloc.h>
+
+#include "internal.h"
+
+static DEFINE_IDA(mmu_context_ida);
+
+static int alloc_context_id(int min_id, int max_id)
+{
+ return ida_alloc_range(&mmu_context_ida, min_id, max_id, GFP_KERNEL);
+}
+
+#ifdef CONFIG_PPC_64S_HASH_MMU
+void __init hash__reserve_context_id(int id)
+{
+ int result = ida_alloc_range(&mmu_context_ida, id, id, GFP_KERNEL);
+
+ WARN(result != id, "mmu: Failed to reserve context id %d (rc %d)\n", id, result);
+}
+
+int hash__alloc_context_id(void)
+{
+ unsigned long max;
+
+ if (mmu_has_feature(MMU_FTR_68_BIT_VA))
+ max = MAX_USER_CONTEXT;
+ else
+ max = MAX_USER_CONTEXT_65BIT_VA;
+
+ return alloc_context_id(MIN_USER_CONTEXT, max);
+}
+EXPORT_SYMBOL_GPL(hash__alloc_context_id);
+#endif
+
+#ifdef CONFIG_PPC_64S_HASH_MMU
+static int realloc_context_ids(mm_context_t *ctx)
+{
+ int i, id;
+
+ /*
+ * id 0 (aka. ctx->id) is special, we always allocate a new one, even if
+ * there wasn't one allocated previously (which happens in the exec
+ * case where ctx is newly allocated).
+ *
+ * We have to be a bit careful here. We must keep the existing ids in
+ * the array, so that we can test if they're non-zero to decide if we
+ * need to allocate a new one. However in case of error we must free the
+ * ids we've allocated but *not* any of the existing ones (or risk a
+ * UAF). That's why we decrement i at the start of the error handling
+ * loop, to skip the id that we just tested but couldn't reallocate.
+ */
+ for (i = 0; i < ARRAY_SIZE(ctx->extended_id); i++) {
+ if (i == 0 || ctx->extended_id[i]) {
+ id = hash__alloc_context_id();
+ if (id < 0)
+ goto error;
+
+ ctx->extended_id[i] = id;
+ }
+ }
+
+ /* The caller expects us to return id */
+ return ctx->id;
+
+error:
+ for (i--; i >= 0; i--) {
+ if (ctx->extended_id[i])
+ ida_free(&mmu_context_ida, ctx->extended_id[i]);
+ }
+
+ return id;
+}
+
+static int hash__init_new_context(struct mm_struct *mm)
+{
+ int index;
+
+ mm->context.hash_context = kmalloc(sizeof(struct hash_mm_context),
+ GFP_KERNEL);
+ if (!mm->context.hash_context)
+ return -ENOMEM;
+
+ /*
+ * The old code would re-promote on fork, we don't do that when using
+ * slices as it could cause problem promoting slices that have been
+ * forced down to 4K.
+ *
+ * For book3s we have MMU_NO_CONTEXT set to be ~0. Hence check
+ * explicitly against context.id == 0. This ensures that we properly
+ * initialize context slice details for newly allocated mm's (which will
+ * have id == 0) and don't alter context slice inherited via fork (which
+ * will have id != 0).
+ *
+ * We should not be calling init_new_context() on init_mm. Hence a
+ * check against 0 is OK.
+ */
+ if (mm->context.id == 0) {
+ memset(mm->context.hash_context, 0, sizeof(struct hash_mm_context));
+ slice_init_new_context_exec(mm);
+ } else {
+ /* This is fork. Copy hash_context details from current->mm */
+ memcpy(mm->context.hash_context, current->mm->context.hash_context, sizeof(struct hash_mm_context));
+#ifdef CONFIG_PPC_SUBPAGE_PROT
+ /* inherit subpage prot details if we have one. */
+ if (current->mm->context.hash_context->spt) {
+ mm->context.hash_context->spt = kmalloc(sizeof(struct subpage_prot_table),
+ GFP_KERNEL);
+ if (!mm->context.hash_context->spt) {
+ kfree(mm->context.hash_context);
+ return -ENOMEM;
+ }
+ }
+#endif
+ }
+
+ index = realloc_context_ids(&mm->context);
+ if (index < 0) {
+#ifdef CONFIG_PPC_SUBPAGE_PROT
+ kfree(mm->context.hash_context->spt);
+#endif
+ kfree(mm->context.hash_context);
+ return index;
+ }
+
+ pkey_mm_init(mm);
+ return index;
+}
+
+void hash__setup_new_exec(void)
+{
+ slice_setup_new_exec();
+}
+#else
+static inline int hash__init_new_context(struct mm_struct *mm)
+{
+ BUILD_BUG();
+ return 0;
+}
+#endif
+
+static int radix__init_new_context(struct mm_struct *mm)
+{
+ unsigned long rts_field;
+ int index, max_id;
+
+ max_id = (1 << mmu_pid_bits) - 1;
+ index = alloc_context_id(mmu_base_pid, max_id);
+ if (index < 0)
+ return index;
+
+ /*
+ * set the process table entry,
+ */
+ rts_field = radix__get_tree_size();
+ process_tb[index].prtb0 = cpu_to_be64(rts_field | __pa(mm->pgd) | RADIX_PGD_INDEX_SIZE);
+
+ /*
+ * Order the above store with subsequent update of the PID
+ * register (at which point HW can start loading/caching
+ * the entry) and the corresponding load by the MMU from
+ * the L2 cache.
+ */
+ asm volatile("ptesync;isync" : : : "memory");
+
+#ifdef CONFIG_PPC_64S_HASH_MMU
+ mm->context.hash_context = NULL;
+#endif
+
+ return index;
+}
+
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+{
+ int index;
+
+ if (radix_enabled())
+ index = radix__init_new_context(mm);
+ else
+ index = hash__init_new_context(mm);
+
+ if (index < 0)
+ return index;
+
+ mm->context.id = index;
+
+ mm->context.pte_frag = NULL;
+ mm->context.pmd_frag = NULL;
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+ mm_iommu_init(mm);
+#endif
+ atomic_set(&mm->context.active_cpus, 0);
+ atomic_set(&mm->context.copros, 0);
+
+ return 0;
+}
+
+void __destroy_context(int context_id)
+{
+ ida_free(&mmu_context_ida, context_id);
+}
+EXPORT_SYMBOL_GPL(__destroy_context);
+
+static void destroy_contexts(mm_context_t *ctx)
+{
+ if (radix_enabled()) {
+ ida_free(&mmu_context_ida, ctx->id);
+ } else {
+#ifdef CONFIG_PPC_64S_HASH_MMU
+ int index, context_id;
+
+ for (index = 0; index < ARRAY_SIZE(ctx->extended_id); index++) {
+ context_id = ctx->extended_id[index];
+ if (context_id)
+ ida_free(&mmu_context_ida, context_id);
+ }
+ kfree(ctx->hash_context);
+#else
+ BUILD_BUG(); // radix_enabled() should be constant true
+#endif
+ }
+}
+
+static void pmd_frag_destroy(void *pmd_frag)
+{
+ int count;
+ struct ptdesc *ptdesc;
+
+ ptdesc = virt_to_ptdesc(pmd_frag);
+ /* drop all the pending references */
+ count = ((unsigned long)pmd_frag & ~PAGE_MASK) >> PMD_FRAG_SIZE_SHIFT;
+ /* We allow PTE_FRAG_NR fragments from a PTE page */
+ if (atomic_sub_and_test(PMD_FRAG_NR - count, &ptdesc->pt_frag_refcount)) {
+ pagetable_dtor(ptdesc);
+ pagetable_free(ptdesc);
+ }
+}
+
+static void destroy_pagetable_cache(struct mm_struct *mm)
+{
+ void *frag;
+
+ frag = mm->context.pte_frag;
+ if (frag)
+ pte_frag_destroy(frag);
+
+ frag = mm->context.pmd_frag;
+ if (frag)
+ pmd_frag_destroy(frag);
+ return;
+}
+
+void destroy_context(struct mm_struct *mm)
+{
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+ WARN_ON_ONCE(!list_empty(&mm->context.iommu_group_mem_list));
+#endif
+ /*
+ * For tasks which were successfully initialized we end up calling
+ * arch_exit_mmap() which clears the process table entry. And
+ * arch_exit_mmap() is called before the required fullmm TLB flush
+ * which does a RIC=2 flush. Hence for an initialized task, we do clear
+ * any cached process table entries.
+ *
+ * The condition below handles the error case during task init. We have
+ * set the process table entry early and if we fail a task
+ * initialization, we need to ensure the process table entry is zeroed.
+ * We need not worry about process table entry caches because the task
+ * never ran with the PID value.
+ */
+ if (radix_enabled())
+ process_tb[mm->context.id].prtb0 = 0;
+ else
+ subpage_prot_free(mm);
+ destroy_contexts(&mm->context);
+ mm->context.id = MMU_NO_CONTEXT;
+}
+
+void arch_exit_mmap(struct mm_struct *mm)
+{
+ destroy_pagetable_cache(mm);
+
+ if (radix_enabled()) {
+ /*
+ * Radix doesn't have a valid bit in the process table
+ * entries. However we know that at least P9 implementation
+ * will avoid caching an entry with an invalid RTS field,
+ * and 0 is invalid. So this will do.
+ *
+ * This runs before the "fullmm" tlb flush in exit_mmap,
+ * which does a RIC=2 tlbie to clear the process table
+ * entry. See the "fullmm" comments in tlb-radix.c.
+ *
+ * No barrier required here after the store because
+ * this process will do the invalidate, which starts with
+ * ptesync.
+ */
+ process_tb[mm->context.id].prtb0 = 0;
+ }
+}
+
+#ifdef CONFIG_PPC_RADIX_MMU
+void radix__switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
+{
+ mtspr(SPRN_PID, next->context.id);
+ isync();
+}
+#endif
+
+/**
+ * cleanup_cpu_mmu_context - Clean up MMU details for this CPU (newly offlined)
+ *
+ * This clears the CPU from mm_cpumask for all processes, and then flushes the
+ * local TLB to ensure TLB coherency in case the CPU is onlined again.
+ *
+ * KVM guest translations are not necessarily flushed here. If KVM started
+ * using mm_cpumask or the Linux APIs which do, this would have to be resolved.
+ */
+#ifdef CONFIG_HOTPLUG_CPU
+void cleanup_cpu_mmu_context(void)
+{
+ int cpu = smp_processor_id();
+
+ clear_tasks_mm_cpumask(cpu);
+ tlbiel_all();
+}
+#endif
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
new file mode 100644
index 000000000000..e3485db7de02
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -0,0 +1,665 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
+ */
+
+#include <linux/sched.h>
+#include <linux/mm_types.h>
+#include <linux/memblock.h>
+#include <linux/memremap.h>
+#include <linux/pkeys.h>
+#include <linux/debugfs.h>
+#include <linux/proc_fs.h>
+
+#include <asm/pgalloc.h>
+#include <asm/tlb.h>
+#include <asm/trace.h>
+#include <asm/powernv.h>
+#include <asm/firmware.h>
+#include <asm/ultravisor.h>
+#include <asm/kexec.h>
+
+#include <mm/mmu_decl.h>
+#include <trace/events/thp.h>
+
+#include "internal.h"
+
+struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
+EXPORT_SYMBOL_GPL(mmu_psize_defs);
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+int mmu_vmemmap_psize = MMU_PAGE_4K;
+#endif
+
+unsigned long __pmd_frag_nr;
+EXPORT_SYMBOL(__pmd_frag_nr);
+unsigned long __pmd_frag_size_shift;
+EXPORT_SYMBOL(__pmd_frag_size_shift);
+
+#ifdef CONFIG_KFENCE
+extern bool kfence_early_init;
+static int __init parse_kfence_early_init(char *arg)
+{
+ int val;
+
+ if (get_option(&arg, &val))
+ kfence_early_init = !!val;
+ return 0;
+}
+early_param("kfence.sample_interval", parse_kfence_early_init);
+#endif
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * This is called when relaxing access to a hugepage. It's also called in the page
+ * fault path when we don't hit any of the major fault cases, ie, a minor
+ * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have
+ * handled those two for us, we additionally deal with missing execute
+ * permission here on some processors
+ */
+int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp, pmd_t entry, int dirty)
+{
+ int changed;
+#ifdef CONFIG_DEBUG_VM
+ WARN_ON(!pmd_trans_huge(*pmdp));
+ assert_spin_locked(pmd_lockptr(vma->vm_mm, pmdp));
+#endif
+ changed = !pmd_same(*(pmdp), entry);
+ if (changed) {
+ /*
+ * We can use MMU_PAGE_2M here, because only radix
+ * path look at the psize.
+ */
+ __ptep_set_access_flags(vma, pmdp_ptep(pmdp),
+ pmd_pte(entry), address, MMU_PAGE_2M);
+ }
+ return changed;
+}
+
+int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
+ pud_t *pudp, pud_t entry, int dirty)
+{
+ int changed;
+#ifdef CONFIG_DEBUG_VM
+ assert_spin_locked(pud_lockptr(vma->vm_mm, pudp));
+#endif
+ changed = !pud_same(*(pudp), entry);
+ if (changed) {
+ /*
+ * We can use MMU_PAGE_1G here, because only radix
+ * path look at the psize.
+ */
+ __ptep_set_access_flags(vma, pudp_ptep(pudp),
+ pud_pte(entry), address, MMU_PAGE_1G);
+ }
+ return changed;
+}
+
+
+int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
+}
+
+int pudp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long address, pud_t *pudp)
+{
+ return __pudp_test_and_clear_young(vma->vm_mm, address, pudp);
+}
+
+/*
+ * set a new huge pmd. We should not be called for updating
+ * an existing pmd entry. That should go via pmd_hugepage_update.
+ */
+void set_pmd_at(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, pmd_t pmd)
+{
+#ifdef CONFIG_DEBUG_VM
+ /*
+ * Make sure hardware valid bit is not set. We don't do
+ * tlb flush for this update.
+ */
+
+ WARN_ON(pte_hw_valid(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp)));
+ assert_spin_locked(pmd_lockptr(mm, pmdp));
+ WARN_ON(!(pmd_leaf(pmd)));
+#endif
+ trace_hugepage_set_pmd(addr, pmd_val(pmd));
+ return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
+}
+
+void set_pud_at(struct mm_struct *mm, unsigned long addr,
+ pud_t *pudp, pud_t pud)
+{
+#ifdef CONFIG_DEBUG_VM
+ /*
+ * Make sure hardware valid bit is not set. We don't do
+ * tlb flush for this update.
+ */
+
+ WARN_ON(pte_hw_valid(pud_pte(*pudp)));
+ assert_spin_locked(pud_lockptr(mm, pudp));
+ WARN_ON(!(pud_leaf(pud)));
+#endif
+ trace_hugepage_set_pud(addr, pud_val(pud));
+ return set_pte_at(mm, addr, pudp_ptep(pudp), pud_pte(pud));
+}
+
+static void do_serialize(void *arg)
+{
+ /* We've taken the IPI, so try to trim the mask while here */
+ if (radix_enabled()) {
+ struct mm_struct *mm = arg;
+ exit_lazy_flush_tlb(mm, false);
+ }
+}
+
+/*
+ * Serialize against __find_linux_pte() which does lock-less
+ * lookup in page tables with local interrupts disabled. For huge pages
+ * it casts pmd_t to pte_t. Since format of pte_t is different from
+ * pmd_t we want to prevent transit from pmd pointing to page table
+ * to pmd pointing to huge page (and back) while interrupts are disabled.
+ * We clear pmd to possibly replace it with page table pointer in
+ * different code paths. So make sure we wait for the parallel
+ * __find_linux_pte() to finish.
+ */
+void serialize_against_pte_lookup(struct mm_struct *mm)
+{
+ smp_mb();
+ smp_call_function_many(mm_cpumask(mm), do_serialize, mm, 1);
+}
+
+/*
+ * We use this to invalidate a pmdp entry before switching from a
+ * hugepte to regular pmd entry.
+ */
+pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+{
+ unsigned long old_pmd;
+
+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
+ old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID);
+ flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ return __pmd(old_pmd);
+}
+
+pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address,
+ pud_t *pudp)
+{
+ unsigned long old_pud;
+
+ VM_WARN_ON_ONCE(!pud_present(*pudp));
+ old_pud = pud_hugepage_update(vma->vm_mm, address, pudp, _PAGE_PRESENT, _PAGE_INVALID);
+ flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE);
+ return __pud(old_pud);
+}
+
+pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp, int full)
+{
+ pmd_t pmd;
+ VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
+ VM_BUG_ON((pmd_present(*pmdp) && !pmd_trans_huge(*pmdp)) ||
+ !pmd_present(*pmdp));
+ pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp);
+ /*
+ * if it not a fullmm flush, then we can possibly end up converting
+ * this PMD pte entry to a regular level 0 PTE by a parallel page fault.
+ * Make sure we flush the tlb in this case.
+ */
+ if (!full)
+ flush_pmd_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE);
+ return pmd;
+}
+
+pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma,
+ unsigned long addr, pud_t *pudp, int full)
+{
+ pud_t pud;
+
+ VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
+ VM_BUG_ON(!pud_present(*pudp));
+ pud = pudp_huge_get_and_clear(vma->vm_mm, addr, pudp);
+ /*
+ * if it not a fullmm flush, then we can possibly end up converting
+ * this PMD pte entry to a regular level 0 PTE by a parallel page fault.
+ * Make sure we flush the tlb in this case.
+ */
+ if (!full)
+ flush_pud_tlb_range(vma, addr, addr + HPAGE_PUD_SIZE);
+ return pud;
+}
+
+static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
+{
+ return __pmd(pmd_val(pmd) | pgprot_val(pgprot));
+}
+
+static pud_t pud_set_protbits(pud_t pud, pgprot_t pgprot)
+{
+ return __pud(pud_val(pud) | pgprot_val(pgprot));
+}
+
+/*
+ * At some point we should be able to get rid of
+ * pmd_mkhuge() and mk_huge_pmd() when we update all the
+ * other archs to mark the pmd huge in pfn_pmd()
+ */
+pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
+{
+ unsigned long pmdv;
+
+ pmdv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK;
+
+ return __pmd_mkhuge(pmd_set_protbits(__pmd(pmdv), pgprot));
+}
+
+pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot)
+{
+ unsigned long pudv;
+
+ pudv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK;
+
+ return __pud_mkhuge(pud_set_protbits(__pud(pudv), pgprot));
+}
+
+pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
+{
+ unsigned long pmdv;
+
+ pmdv = pmd_val(pmd);
+ pmdv &= _HPAGE_CHG_MASK;
+ return pmd_set_protbits(__pmd(pmdv), newprot);
+}
+
+pud_t pud_modify(pud_t pud, pgprot_t newprot)
+{
+ unsigned long pudv;
+
+ pudv = pud_val(pud);
+ pudv &= _HPAGE_CHG_MASK;
+ return pud_set_protbits(__pud(pudv), newprot);
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+/* For use by kexec, called with MMU off */
+notrace void mmu_cleanup_all(void)
+{
+ if (radix_enabled())
+ radix__mmu_cleanup_all();
+ else if (mmu_hash_ops.hpte_clear_all)
+ mmu_hash_ops.hpte_clear_all();
+
+ reset_sprs();
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int __meminit create_section_mapping(unsigned long start, unsigned long end,
+ int nid, pgprot_t prot)
+{
+ if (radix_enabled())
+ return radix__create_section_mapping(start, end, nid, prot);
+
+ return hash__create_section_mapping(start, end, nid, prot);
+}
+
+int __meminit remove_section_mapping(unsigned long start, unsigned long end)
+{
+ if (radix_enabled())
+ return radix__remove_section_mapping(start, end);
+
+ return hash__remove_section_mapping(start, end);
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+void __init mmu_partition_table_init(void)
+{
+ unsigned long patb_size = 1UL << PATB_SIZE_SHIFT;
+ unsigned long ptcr;
+
+ /* Initialize the Partition Table with no entries */
+ partition_tb = memblock_alloc_or_panic(patb_size, patb_size);
+ ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12);
+ set_ptcr_when_no_uv(ptcr);
+ powernv_set_nmmu_ptcr(ptcr);
+}
+
+static void flush_partition(unsigned int lpid, bool radix)
+{
+ if (radix) {
+ radix__flush_all_lpid(lpid);
+ radix__flush_all_lpid_guest(lpid);
+ } else {
+ asm volatile("ptesync" : : : "memory");
+ asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : :
+ "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
+ /* do we need fixup here ?*/
+ asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+ trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 0);
+ }
+}
+
+void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
+ unsigned long dw1, bool flush)
+{
+ unsigned long old = be64_to_cpu(partition_tb[lpid].patb0);
+
+ /*
+ * When ultravisor is enabled, the partition table is stored in secure
+ * memory and can only be accessed doing an ultravisor call. However, we
+ * maintain a copy of the partition table in normal memory to allow Nest
+ * MMU translations to occur (for normal VMs).
+ *
+ * Therefore, here we always update partition_tb, regardless of whether
+ * we are running under an ultravisor or not.
+ */
+ partition_tb[lpid].patb0 = cpu_to_be64(dw0);
+ partition_tb[lpid].patb1 = cpu_to_be64(dw1);
+
+ /*
+ * If ultravisor is enabled, we do an ultravisor call to register the
+ * partition table entry (PATE), which also do a global flush of TLBs
+ * and partition table caches for the lpid. Otherwise, just do the
+ * flush. The type of flush (hash or radix) depends on what the previous
+ * use of the partition ID was, not the new use.
+ */
+ if (firmware_has_feature(FW_FEATURE_ULTRAVISOR)) {
+ uv_register_pate(lpid, dw0, dw1);
+ pr_info("PATE registered by ultravisor: dw0 = 0x%lx, dw1 = 0x%lx\n",
+ dw0, dw1);
+ } else if (flush) {
+ /*
+ * Boot does not need to flush, because MMU is off and each
+ * CPU does a tlbiel_all() before switching them on, which
+ * flushes everything.
+ */
+ flush_partition(lpid, (old & PATB_HR));
+ }
+}
+EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry);
+
+static pmd_t *get_pmd_from_cache(struct mm_struct *mm)
+{
+ void *pmd_frag, *ret;
+
+ if (PMD_FRAG_NR == 1)
+ return NULL;
+
+ spin_lock(&mm->page_table_lock);
+ ret = mm->context.pmd_frag;
+ if (ret) {
+ pmd_frag = ret + PMD_FRAG_SIZE;
+ /*
+ * If we have taken up all the fragments mark PTE page NULL
+ */
+ if (((unsigned long)pmd_frag & ~PAGE_MASK) == 0)
+ pmd_frag = NULL;
+ mm->context.pmd_frag = pmd_frag;
+ }
+ spin_unlock(&mm->page_table_lock);
+ return (pmd_t *)ret;
+}
+
+static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm)
+{
+ void *ret = NULL;
+ struct ptdesc *ptdesc;
+ gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO;
+
+ if (mm == &init_mm)
+ gfp &= ~__GFP_ACCOUNT;
+ ptdesc = pagetable_alloc(gfp, 0);
+ if (!ptdesc)
+ return NULL;
+ if (!pagetable_pmd_ctor(mm, ptdesc)) {
+ pagetable_free(ptdesc);
+ return NULL;
+ }
+
+ atomic_set(&ptdesc->pt_frag_refcount, 1);
+
+ ret = ptdesc_address(ptdesc);
+ /*
+ * if we support only one fragment just return the
+ * allocated page.
+ */
+ if (PMD_FRAG_NR == 1)
+ return ret;
+
+ spin_lock(&mm->page_table_lock);
+ /*
+ * If we find ptdesc_page set, we return
+ * the allocated page with single fragment
+ * count.
+ */
+ if (likely(!mm->context.pmd_frag)) {
+ atomic_set(&ptdesc->pt_frag_refcount, PMD_FRAG_NR);
+ mm->context.pmd_frag = ret + PMD_FRAG_SIZE;
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ return (pmd_t *)ret;
+}
+
+pmd_t *pmd_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr)
+{
+ pmd_t *pmd;
+
+ pmd = get_pmd_from_cache(mm);
+ if (pmd)
+ return pmd;
+
+ return __alloc_for_pmdcache(mm);
+}
+
+void pmd_fragment_free(unsigned long *pmd)
+{
+ struct ptdesc *ptdesc = virt_to_ptdesc(pmd);
+
+ if (pagetable_is_reserved(ptdesc))
+ return free_reserved_ptdesc(ptdesc);
+
+ BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0);
+ if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) {
+ pagetable_dtor(ptdesc);
+ pagetable_free(ptdesc);
+ }
+}
+
+static inline void pgtable_free(void *table, int index)
+{
+ switch (index) {
+ case PTE_INDEX:
+ pte_fragment_free(table, 0);
+ break;
+ case PMD_INDEX:
+ pmd_fragment_free(table);
+ break;
+ case PUD_INDEX:
+ __pud_free(table);
+ break;
+ /* We don't free pgd table via RCU callback */
+ default:
+ BUG();
+ }
+}
+
+void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int index)
+{
+ unsigned long pgf = (unsigned long)table;
+
+ BUG_ON(index > MAX_PGTABLE_INDEX_SIZE);
+ pgf |= index;
+ tlb_remove_table(tlb, (void *)pgf);
+}
+
+void __tlb_remove_table(void *_table)
+{
+ void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
+ unsigned int index = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;
+
+ return pgtable_free(table, index);
+}
+
+#ifdef CONFIG_PROC_FS
+atomic_long_t direct_pages_count[MMU_PAGE_COUNT];
+
+void arch_report_meminfo(struct seq_file *m)
+{
+ seq_printf(m, "DirectMap4k: %8lu kB\n",
+ atomic_long_read(&direct_pages_count[MMU_PAGE_4K]) << 2);
+ seq_printf(m, "DirectMap64k: %8lu kB\n",
+ atomic_long_read(&direct_pages_count[MMU_PAGE_64K]) << 6);
+ if (radix_enabled()) {
+ seq_printf(m, "DirectMap2M: %8lu kB\n",
+ atomic_long_read(&direct_pages_count[MMU_PAGE_2M]) << 11);
+ seq_printf(m, "DirectMap1G: %8lu kB\n",
+ atomic_long_read(&direct_pages_count[MMU_PAGE_1G]) << 20);
+ } else {
+ seq_printf(m, "DirectMap16M: %8lu kB\n",
+ atomic_long_read(&direct_pages_count[MMU_PAGE_16M]) << 14);
+ seq_printf(m, "DirectMap16G: %8lu kB\n",
+ atomic_long_read(&direct_pages_count[MMU_PAGE_16G]) << 24);
+ }
+}
+#endif /* CONFIG_PROC_FS */
+
+pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep)
+{
+ unsigned long pte_val;
+
+ /*
+ * Clear the _PAGE_PRESENT so that no hardware parallel update is
+ * possible. Also keep the pte_present true so that we don't take
+ * wrong fault.
+ */
+ pte_val = pte_update(vma->vm_mm, addr, ptep, _PAGE_PRESENT, _PAGE_INVALID, 0);
+
+ return __pte(pte_val);
+
+}
+
+void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, pte_t old_pte, pte_t pte)
+{
+ if (radix_enabled())
+ return radix__ptep_modify_prot_commit(vma, addr,
+ ptep, old_pte, pte);
+ set_pte_at(vma->vm_mm, addr, ptep, pte);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * For hash translation mode, we use the deposited table to store hash slot
+ * information and they are stored at PTRS_PER_PMD offset from related pmd
+ * location. Hence a pmd move requires deposit and withdraw.
+ *
+ * For radix translation with split pmd ptl, we store the deposited table in the
+ * pmd page. Hence if we have different pmd page we need to withdraw during pmd
+ * move.
+ *
+ * With hash we use deposited table always irrespective of anon or not.
+ * With radix we use deposited table only for anonymous mapping.
+ */
+int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
+ struct spinlock *old_pmd_ptl,
+ struct vm_area_struct *vma)
+{
+ if (radix_enabled())
+ return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
+
+ return true;
+}
+#endif
+
+/*
+ * Does the CPU support tlbie?
+ */
+bool tlbie_capable __read_mostly = IS_ENABLED(CONFIG_PPC_RADIX_BROADCAST_TLBIE);
+EXPORT_SYMBOL(tlbie_capable);
+
+/*
+ * Should tlbie be used for management of CPU TLBs, for kernel and process
+ * address spaces? tlbie may still be used for nMMU accelerators, and for KVM
+ * guest address spaces.
+ */
+bool tlbie_enabled __read_mostly = IS_ENABLED(CONFIG_PPC_RADIX_BROADCAST_TLBIE);
+
+static int __init setup_disable_tlbie(char *str)
+{
+ if (!radix_enabled()) {
+ pr_err("disable_tlbie: Unable to disable TLBIE with Hash MMU.\n");
+ return 1;
+ }
+
+ tlbie_capable = false;
+ tlbie_enabled = false;
+
+ return 1;
+}
+__setup("disable_tlbie", setup_disable_tlbie);
+
+static int __init pgtable_debugfs_setup(void)
+{
+ if (!tlbie_capable)
+ return 0;
+
+ /*
+ * There is no locking vs tlb flushing when changing this value.
+ * The tlb flushers will see one value or another, and use either
+ * tlbie or tlbiel with IPIs. In both cases the TLBs will be
+ * invalidated as expected.
+ */
+ debugfs_create_bool("tlbie_enabled", 0600,
+ arch_debugfs_dir,
+ &tlbie_enabled);
+
+ return 0;
+}
+arch_initcall(pgtable_debugfs_setup);
+
+#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_HAS_MEMREMAP_COMPAT_ALIGN)
+/*
+ * Override the generic version in mm/memremap.c.
+ *
+ * With hash translation, the direct-map range is mapped with just one
+ * page size selected by htab_init_page_sizes(). Consult
+ * mmu_psize_defs[] to determine the minimum page size alignment.
+*/
+unsigned long memremap_compat_align(void)
+{
+ if (!radix_enabled()) {
+ unsigned int shift = mmu_psize_defs[mmu_linear_psize].shift;
+ return max(SUBSECTION_SIZE, 1UL << shift);
+ }
+
+ return SUBSECTION_SIZE;
+}
+EXPORT_SYMBOL_GPL(memremap_compat_align);
+#endif
+
+pgprot_t vm_get_page_prot(vm_flags_t vm_flags)
+{
+ unsigned long prot;
+
+ /* Radix supports execute-only, but protection_map maps X -> RX */
+ if (!radix_enabled() && ((vm_flags & VM_ACCESS_FLAGS) == VM_EXEC))
+ vm_flags |= VM_READ;
+
+ prot = pgprot_val(protection_map[vm_flags & (VM_ACCESS_FLAGS | VM_SHARED)]);
+
+ if (vm_flags & VM_SAO)
+ prot |= _PAGE_SAO;
+
+#ifdef CONFIG_PPC_MEM_KEYS
+ prot |= vmflag_to_pte_pkey_bits(vm_flags);
+#endif
+
+ return __pgprot(prot);
+}
+EXPORT_SYMBOL(vm_get_page_prot);
diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c
new file mode 100644
index 000000000000..a974baf8f327
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/pkeys.c
@@ -0,0 +1,471 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * PowerPC Memory Protection Keys management
+ *
+ * Copyright 2017, Ram Pai, IBM Corporation.
+ */
+
+#include <asm/mman.h>
+#include <asm/mmu_context.h>
+#include <asm/mmu.h>
+#include <asm/setup.h>
+#include <asm/smp.h>
+#include <asm/firmware.h>
+
+#include <linux/pkeys.h>
+#include <linux/of_fdt.h>
+
+
+int num_pkey; /* Max number of pkeys supported */
+/*
+ * Keys marked in the reservation list cannot be allocated by userspace
+ */
+u32 reserved_allocation_mask __ro_after_init;
+
+/* Bits set for the initially allocated keys */
+static u32 initial_allocation_mask __ro_after_init;
+
+/*
+ * Even if we allocate keys with sys_pkey_alloc(), we need to make sure
+ * other thread still find the access denied using the same keys.
+ */
+u64 default_amr __ro_after_init = ~0x0UL;
+u64 default_iamr __ro_after_init = 0x5555555555555555UL;
+u64 default_uamor __ro_after_init;
+EXPORT_SYMBOL(default_amr);
+/*
+ * Key used to implement PROT_EXEC mmap. Denies READ/WRITE
+ * We pick key 2 because 0 is special key and 1 is reserved as per ISA.
+ */
+static int execute_only_key = 2;
+static bool pkey_execute_disable_supported;
+
+
+#define AMR_BITS_PER_PKEY 2
+#define AMR_RD_BIT 0x1UL
+#define AMR_WR_BIT 0x2UL
+#define IAMR_EX_BIT 0x1UL
+#define PKEY_REG_BITS (sizeof(u64) * 8)
+#define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey+1) * AMR_BITS_PER_PKEY))
+
+static int __init dt_scan_storage_keys(unsigned long node,
+ const char *uname, int depth,
+ void *data)
+{
+ const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+ const __be32 *prop;
+ int *pkeys_total = (int *) data;
+
+ /* We are scanning "cpu" nodes only */
+ if (type == NULL || strcmp(type, "cpu") != 0)
+ return 0;
+
+ prop = of_get_flat_dt_prop(node, "ibm,processor-storage-keys", NULL);
+ if (!prop)
+ return 0;
+ *pkeys_total = be32_to_cpu(prop[0]);
+ return 1;
+}
+
+static int __init scan_pkey_feature(void)
+{
+ int ret;
+ int pkeys_total = 0;
+
+ /*
+ * Pkey is not supported with Radix translation.
+ */
+ if (early_radix_enabled())
+ return 0;
+
+ ret = of_scan_flat_dt(dt_scan_storage_keys, &pkeys_total);
+ if (ret == 0) {
+ /*
+ * Let's assume 32 pkeys on P8/P9 bare metal, if its not defined by device
+ * tree. We make this exception since some version of skiboot forgot to
+ * expose this property on power8/9.
+ */
+ if (!firmware_has_feature(FW_FEATURE_LPAR)) {
+ unsigned long pvr = mfspr(SPRN_PVR);
+
+ if (PVR_VER(pvr) == PVR_POWER8 || PVR_VER(pvr) == PVR_POWER8E ||
+ PVR_VER(pvr) == PVR_POWER8NVL || PVR_VER(pvr) == PVR_POWER9 ||
+ PVR_VER(pvr) == PVR_HX_C2000)
+ pkeys_total = 32;
+ }
+ }
+
+#ifdef CONFIG_PPC_MEM_KEYS
+ /*
+ * Adjust the upper limit, based on the number of bits supported by
+ * arch-neutral code.
+ */
+ pkeys_total = min_t(int, pkeys_total,
+ ((ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT) + 1));
+#endif
+ return pkeys_total;
+}
+
+void __init pkey_early_init_devtree(void)
+{
+ int pkeys_total, i;
+
+#ifdef CONFIG_PPC_MEM_KEYS
+ /*
+ * We define PKEY_DISABLE_EXECUTE in addition to the arch-neutral
+ * generic defines for PKEY_DISABLE_ACCESS and PKEY_DISABLE_WRITE.
+ * Ensure that the bits a distinct.
+ */
+ BUILD_BUG_ON(PKEY_DISABLE_EXECUTE &
+ (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
+
+ /*
+ * pkey_to_vmflag_bits() assumes that the pkey bits are contiguous
+ * in the vmaflag. Make sure that is really the case.
+ */
+ BUILD_BUG_ON(__builtin_clzl(ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT) +
+ __builtin_popcountl(ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT)
+ != (sizeof(u64) * BITS_PER_BYTE));
+#endif
+ /*
+ * Only P7 and above supports SPRN_AMR update with MSR[PR] = 1
+ */
+ if (!early_cpu_has_feature(CPU_FTR_ARCH_206))
+ return;
+
+ /* scan the device tree for pkey feature */
+ pkeys_total = scan_pkey_feature();
+ if (!pkeys_total)
+ goto out;
+
+ /* Allow all keys to be modified by default */
+ default_uamor = ~0x0UL;
+
+ cur_cpu_spec->mmu_features |= MMU_FTR_PKEY;
+
+ /*
+ * The device tree cannot be relied to indicate support for
+ * execute_disable support. Instead we use a PVR check.
+ */
+ if (pvr_version_is(PVR_POWER7) || pvr_version_is(PVR_POWER7p))
+ pkey_execute_disable_supported = false;
+ else
+ pkey_execute_disable_supported = true;
+
+#ifdef CONFIG_PPC_4K_PAGES
+ /*
+ * The OS can manage only 8 pkeys due to its inability to represent them
+ * in the Linux 4K PTE. Mark all other keys reserved.
+ */
+ num_pkey = min(8, pkeys_total);
+#else
+ num_pkey = pkeys_total;
+#endif
+
+ if (unlikely(num_pkey <= execute_only_key) || !pkey_execute_disable_supported) {
+ /*
+ * Insufficient number of keys to support
+ * execute only key. Mark it unavailable.
+ */
+ execute_only_key = -1;
+ } else {
+ /*
+ * Mark the execute_only_pkey as not available for
+ * user allocation via pkey_alloc.
+ */
+ reserved_allocation_mask |= (0x1 << execute_only_key);
+
+ /*
+ * Deny READ/WRITE for execute_only_key.
+ * Allow execute in IAMR.
+ */
+ default_amr |= (0x3ul << pkeyshift(execute_only_key));
+ default_iamr &= ~(0x1ul << pkeyshift(execute_only_key));
+
+ /*
+ * Clear the uamor bits for this key.
+ */
+ default_uamor &= ~(0x3ul << pkeyshift(execute_only_key));
+ }
+
+ if (unlikely(num_pkey <= 3)) {
+ /*
+ * Insufficient number of keys to support
+ * KUAP/KUEP feature.
+ */
+ disable_kuep = true;
+ disable_kuap = true;
+ WARN(1, "Disabling kernel user protection due to low (%d) max supported keys\n", num_pkey);
+ } else {
+ /* handle key which is used by kernel for KAUP */
+ reserved_allocation_mask |= (0x1 << 3);
+ /*
+ * Mark access for kup_key in default amr so that
+ * we continue to operate with that AMR in
+ * copy_to/from_user().
+ */
+ default_amr &= ~(0x3ul << pkeyshift(3));
+ default_iamr &= ~(0x1ul << pkeyshift(3));
+ default_uamor &= ~(0x3ul << pkeyshift(3));
+ }
+
+ /*
+ * Allow access for only key 0. And prevent any other modification.
+ */
+ default_amr &= ~(0x3ul << pkeyshift(0));
+ default_iamr &= ~(0x1ul << pkeyshift(0));
+ default_uamor &= ~(0x3ul << pkeyshift(0));
+ /*
+ * key 0 is special in that we want to consider it an allocated
+ * key which is preallocated. We don't allow changing AMR bits
+ * w.r.t key 0. But one can pkey_free(key0)
+ */
+ initial_allocation_mask |= (0x1 << 0);
+
+ /*
+ * key 1 is recommended not to be used. PowerISA(3.0) page 1015,
+ * programming note.
+ */
+ reserved_allocation_mask |= (0x1 << 1);
+ default_uamor &= ~(0x3ul << pkeyshift(1));
+
+ /*
+ * Prevent the usage of OS reserved keys. Update UAMOR
+ * for those keys. Also mark the rest of the bits in the
+ * 32 bit mask as reserved.
+ */
+ for (i = num_pkey; i < 32 ; i++) {
+ reserved_allocation_mask |= (0x1 << i);
+ default_uamor &= ~(0x3ul << pkeyshift(i));
+ }
+ /*
+ * Prevent the allocation of reserved keys too.
+ */
+ initial_allocation_mask |= reserved_allocation_mask;
+
+ pr_info("Enabling pkeys with max key count %d\n", num_pkey);
+out:
+ /*
+ * Setup uamor on boot cpu
+ */
+ mtspr(SPRN_UAMOR, default_uamor);
+
+ return;
+}
+
+#ifdef CONFIG_PPC_KUEP
+void setup_kuep(bool disabled)
+{
+ if (disabled)
+ return;
+ /*
+ * On hash if PKEY feature is not enabled, disable KUAP too.
+ */
+ if (!early_radix_enabled() && !early_mmu_has_feature(MMU_FTR_PKEY))
+ return;
+
+ if (smp_processor_id() == boot_cpuid) {
+ pr_info("Activating Kernel Userspace Execution Prevention\n");
+ cur_cpu_spec->mmu_features |= MMU_FTR_BOOK3S_KUEP;
+ }
+
+ /*
+ * Radix always uses key0 of the IAMR to determine if an access is
+ * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction
+ * fetch.
+ */
+ mtspr(SPRN_IAMR, AMR_KUEP_BLOCKED);
+ isync();
+}
+#endif
+
+#ifdef CONFIG_PPC_KUAP
+void setup_kuap(bool disabled)
+{
+ if (disabled)
+ return;
+ /*
+ * On hash if PKEY feature is not enabled, disable KUAP too.
+ */
+ if (!early_radix_enabled() && !early_mmu_has_feature(MMU_FTR_PKEY))
+ return;
+
+ if (smp_processor_id() == boot_cpuid) {
+ pr_info("Activating Kernel Userspace Access Prevention\n");
+ cur_cpu_spec->mmu_features |= MMU_FTR_KUAP;
+ }
+
+ /*
+ * Set the default kernel AMR values on all cpus.
+ */
+ mtspr(SPRN_AMR, AMR_KUAP_BLOCKED);
+ isync();
+}
+#endif
+
+#ifdef CONFIG_PPC_MEM_KEYS
+void pkey_mm_init(struct mm_struct *mm)
+{
+ if (!mmu_has_feature(MMU_FTR_PKEY))
+ return;
+ mm_pkey_allocation_map(mm) = initial_allocation_mask;
+ mm->context.execute_only_pkey = execute_only_key;
+}
+
+static inline void init_amr(int pkey, u8 init_bits)
+{
+ u64 new_amr_bits = (((u64)init_bits & 0x3UL) << pkeyshift(pkey));
+ u64 old_amr = current_thread_amr() & ~((u64)(0x3ul) << pkeyshift(pkey));
+
+ current->thread.regs->amr = old_amr | new_amr_bits;
+}
+
+static inline void init_iamr(int pkey, u8 init_bits)
+{
+ u64 new_iamr_bits = (((u64)init_bits & 0x1UL) << pkeyshift(pkey));
+ u64 old_iamr = current_thread_iamr() & ~((u64)(0x1ul) << pkeyshift(pkey));
+
+ if (!likely(pkey_execute_disable_supported))
+ return;
+
+ current->thread.regs->iamr = old_iamr | new_iamr_bits;
+}
+
+/*
+ * Set the access rights in AMR IAMR and UAMOR registers for @pkey to that
+ * specified in @init_val.
+ */
+int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+ unsigned long init_val)
+{
+ u64 new_amr_bits = 0x0ul;
+ u64 new_iamr_bits = 0x0ul;
+ u64 pkey_bits, uamor_pkey_bits;
+
+ /*
+ * Check whether the key is disabled by UAMOR.
+ */
+ pkey_bits = 0x3ul << pkeyshift(pkey);
+ uamor_pkey_bits = (default_uamor & pkey_bits);
+
+ /*
+ * Both the bits in UAMOR corresponding to the key should be set
+ */
+ if (uamor_pkey_bits != pkey_bits)
+ return -EINVAL;
+
+ if (init_val & PKEY_DISABLE_EXECUTE) {
+ if (!pkey_execute_disable_supported)
+ return -EINVAL;
+ new_iamr_bits |= IAMR_EX_BIT;
+ }
+ init_iamr(pkey, new_iamr_bits);
+
+ /* Set the bits we need in AMR: */
+ if (init_val & PKEY_DISABLE_ACCESS)
+ new_amr_bits |= AMR_RD_BIT | AMR_WR_BIT;
+ else if (init_val & PKEY_DISABLE_WRITE)
+ new_amr_bits |= AMR_WR_BIT;
+
+ init_amr(pkey, new_amr_bits);
+ return 0;
+}
+
+int execute_only_pkey(struct mm_struct *mm)
+{
+ return mm->context.execute_only_pkey;
+}
+
+static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma)
+{
+ /* Do this check first since the vm_flags should be hot */
+ if ((vma->vm_flags & VM_ACCESS_FLAGS) != VM_EXEC)
+ return false;
+
+ return (vma_pkey(vma) == vma->vm_mm->context.execute_only_pkey);
+}
+
+/*
+ * This should only be called for *plain* mprotect calls.
+ */
+int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot,
+ int pkey)
+{
+ /*
+ * If the currently associated pkey is execute-only, but the requested
+ * protection is not execute-only, move it back to the default pkey.
+ */
+ if (vma_is_pkey_exec_only(vma) && (prot != PROT_EXEC))
+ return 0;
+
+ /*
+ * The requested protection is execute-only. Hence let's use an
+ * execute-only pkey.
+ */
+ if (prot == PROT_EXEC) {
+ pkey = execute_only_pkey(vma->vm_mm);
+ if (pkey > 0)
+ return pkey;
+ }
+
+ /* Nothing to override. */
+ return vma_pkey(vma);
+}
+
+static bool pkey_access_permitted(int pkey, bool write, bool execute)
+{
+ int pkey_shift;
+ u64 amr;
+
+ pkey_shift = pkeyshift(pkey);
+ if (execute)
+ return !(current_thread_iamr() & (IAMR_EX_BIT << pkey_shift));
+
+ amr = current_thread_amr();
+ if (write)
+ return !(amr & (AMR_WR_BIT << pkey_shift));
+
+ return !(amr & (AMR_RD_BIT << pkey_shift));
+}
+
+bool arch_pte_access_permitted(u64 pte, bool write, bool execute)
+{
+ if (!mmu_has_feature(MMU_FTR_PKEY))
+ return true;
+
+ return pkey_access_permitted(pte_to_pkey_bits(pte), write, execute);
+}
+
+/*
+ * We only want to enforce protection keys on the current thread because we
+ * effectively have no access to AMR/IAMR for other threads or any way to tell
+ * which AMR/IAMR in a threaded process we could use.
+ *
+ * So do not enforce things if the VMA is not from the current mm, or if we are
+ * in a kernel thread.
+ */
+bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write,
+ bool execute, bool foreign)
+{
+ if (!mmu_has_feature(MMU_FTR_PKEY))
+ return true;
+ /*
+ * Do not enforce our key-permissions on a foreign vma.
+ */
+ if (foreign || vma_is_foreign(vma))
+ return true;
+
+ return pkey_access_permitted(vma_pkey(vma), write, execute);
+}
+
+void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+ if (!mmu_has_feature(MMU_FTR_PKEY))
+ return;
+
+ /* Duplicate the oldmm pkey state in mm: */
+ mm_pkey_allocation_map(mm) = mm_pkey_allocation_map(oldmm);
+ mm->context.execute_only_pkey = oldmm->context.execute_only_pkey;
+}
+
+#endif /* CONFIG_PPC_MEM_KEYS */
diff --git a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
new file mode 100644
index 000000000000..35fd2a95be24
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/security.h>
+#include <asm/cacheflush.h>
+#include <asm/machdep.h>
+#include <asm/mman.h>
+#include <asm/tlb.h>
+
+void radix__flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+ int psize;
+ struct hstate *hstate = hstate_file(vma->vm_file);
+
+ psize = hstate_get_psize(hstate);
+ radix__flush_tlb_page_psize(vma->vm_mm, vmaddr, psize);
+}
+
+void radix__local_flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+ int psize;
+ struct hstate *hstate = hstate_file(vma->vm_file);
+
+ psize = hstate_get_psize(hstate);
+ radix__local_flush_tlb_page_psize(vma->vm_mm, vmaddr, psize);
+}
+
+void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end)
+{
+ int psize;
+ struct hstate *hstate = hstate_file(vma->vm_file);
+
+ psize = hstate_get_psize(hstate);
+ /*
+ * Flush PWC even if we get PUD_SIZE hugetlb invalidate to keep this simpler.
+ */
+ if (end - start >= PUD_SIZE)
+ radix__flush_tlb_pwc_range_psize(vma->vm_mm, start, end, psize);
+ else
+ radix__flush_tlb_range_psize(vma->vm_mm, start, end, psize);
+ mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end);
+}
+
+void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t old_pte, pte_t pte)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long psize = huge_page_size(hstate_vma(vma));
+
+ /*
+ * POWER9 NMMU must flush the TLB after clearing the PTE before
+ * installing a PTE with more relaxed access permissions, see
+ * radix__ptep_set_access_flags.
+ */
+ if (!cpu_has_feature(CPU_FTR_ARCH_31) &&
+ is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
+ atomic_read(&mm->context.copros) > 0)
+ radix__flush_hugetlb_page(vma, addr);
+
+ set_huge_pte_at(vma->vm_mm, addr, ptep, pte, psize);
+}
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
new file mode 100644
index 000000000000..73977dbabcf2
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -0,0 +1,1694 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Page table handling routines for radix page table.
+ *
+ * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
+ */
+
+#define pr_fmt(fmt) "radix-mmu: " fmt
+
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/sched/mm.h>
+#include <linux/memblock.h>
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/string_helpers.h>
+#include <linux/memory.h>
+#include <linux/kfence.h>
+
+#include <asm/pgalloc.h>
+#include <asm/mmu_context.h>
+#include <asm/dma.h>
+#include <asm/machdep.h>
+#include <asm/mmu.h>
+#include <asm/firmware.h>
+#include <asm/powernv.h>
+#include <asm/sections.h>
+#include <asm/smp.h>
+#include <asm/trace.h>
+#include <asm/uaccess.h>
+#include <asm/ultravisor.h>
+#include <asm/set_memory.h>
+#include <asm/kfence.h>
+
+#include <trace/events/thp.h>
+
+#include <mm/mmu_decl.h>
+
+unsigned int mmu_base_pid;
+
+static __ref void *early_alloc_pgtable(unsigned long size, int nid,
+ unsigned long region_start, unsigned long region_end)
+{
+ phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT;
+ phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE;
+ void *ptr;
+
+ if (region_start)
+ min_addr = region_start;
+ if (region_end)
+ max_addr = region_end;
+
+ ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid);
+
+ if (!ptr)
+ panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n",
+ __func__, size, size, nid, &min_addr, &max_addr);
+
+ return ptr;
+}
+
+/*
+ * When allocating pud or pmd pointers, we allocate a complete page
+ * of PAGE_SIZE rather than PUD_TABLE_SIZE or PMD_TABLE_SIZE. This
+ * is to ensure that the page obtained from the memblock allocator
+ * can be completely used as page table page and can be freed
+ * correctly when the page table entries are removed.
+ */
+static int early_map_kernel_page(unsigned long ea, unsigned long pa,
+ pgprot_t flags,
+ unsigned int map_page_size,
+ int nid,
+ unsigned long region_start, unsigned long region_end)
+{
+ unsigned long pfn = pa >> PAGE_SHIFT;
+ pgd_t *pgdp;
+ p4d_t *p4dp;
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t *ptep;
+
+ pgdp = pgd_offset_k(ea);
+ p4dp = p4d_offset(pgdp, ea);
+ if (p4d_none(*p4dp)) {
+ pudp = early_alloc_pgtable(PAGE_SIZE, nid,
+ region_start, region_end);
+ p4d_populate(&init_mm, p4dp, pudp);
+ }
+ pudp = pud_offset(p4dp, ea);
+ if (map_page_size == PUD_SIZE) {
+ ptep = (pte_t *)pudp;
+ goto set_the_pte;
+ }
+ if (pud_none(*pudp)) {
+ pmdp = early_alloc_pgtable(PAGE_SIZE, nid, region_start,
+ region_end);
+ pud_populate(&init_mm, pudp, pmdp);
+ }
+ pmdp = pmd_offset(pudp, ea);
+ if (map_page_size == PMD_SIZE) {
+ ptep = pmdp_ptep(pmdp);
+ goto set_the_pte;
+ }
+ if (!pmd_present(*pmdp)) {
+ ptep = early_alloc_pgtable(PAGE_SIZE, nid,
+ region_start, region_end);
+ pmd_populate_kernel(&init_mm, pmdp, ptep);
+ }
+ ptep = pte_offset_kernel(pmdp, ea);
+
+set_the_pte:
+ set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
+ asm volatile("ptesync": : :"memory");
+ return 0;
+}
+
+/*
+ * nid, region_start, and region_end are hints to try to place the page
+ * table memory in the same node or region.
+ */
+static int __map_kernel_page(unsigned long ea, unsigned long pa,
+ pgprot_t flags,
+ unsigned int map_page_size,
+ int nid,
+ unsigned long region_start, unsigned long region_end)
+{
+ unsigned long pfn = pa >> PAGE_SHIFT;
+ pgd_t *pgdp;
+ p4d_t *p4dp;
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t *ptep;
+ /*
+ * Make sure task size is correct as per the max adddr
+ */
+ BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);
+
+#ifdef CONFIG_PPC_64K_PAGES
+ BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT));
+#endif
+
+ if (unlikely(!slab_is_available()))
+ return early_map_kernel_page(ea, pa, flags, map_page_size,
+ nid, region_start, region_end);
+
+ /*
+ * Should make page table allocation functions be able to take a
+ * node, so we can place kernel page tables on the right nodes after
+ * boot.
+ */
+ pgdp = pgd_offset_k(ea);
+ p4dp = p4d_offset(pgdp, ea);
+ pudp = pud_alloc(&init_mm, p4dp, ea);
+ if (!pudp)
+ return -ENOMEM;
+ if (map_page_size == PUD_SIZE) {
+ ptep = (pte_t *)pudp;
+ goto set_the_pte;
+ }
+ pmdp = pmd_alloc(&init_mm, pudp, ea);
+ if (!pmdp)
+ return -ENOMEM;
+ if (map_page_size == PMD_SIZE) {
+ ptep = pmdp_ptep(pmdp);
+ goto set_the_pte;
+ }
+ ptep = pte_alloc_kernel(pmdp, ea);
+ if (!ptep)
+ return -ENOMEM;
+
+set_the_pte:
+ set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
+ asm volatile("ptesync": : :"memory");
+ return 0;
+}
+
+int radix__map_kernel_page(unsigned long ea, unsigned long pa,
+ pgprot_t flags,
+ unsigned int map_page_size)
+{
+ return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0);
+}
+
+#ifdef CONFIG_STRICT_KERNEL_RWX
+static void radix__change_memory_range(unsigned long start, unsigned long end,
+ unsigned long clear)
+{
+ unsigned long idx;
+ pgd_t *pgdp;
+ p4d_t *p4dp;
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t *ptep;
+
+ start = ALIGN_DOWN(start, PAGE_SIZE);
+ end = PAGE_ALIGN(end); // aligns up
+
+ pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n",
+ start, end, clear);
+
+ for (idx = start; idx < end; idx += PAGE_SIZE) {
+ pgdp = pgd_offset_k(idx);
+ p4dp = p4d_offset(pgdp, idx);
+ pudp = pud_alloc(&init_mm, p4dp, idx);
+ if (!pudp)
+ continue;
+ if (pud_leaf(*pudp)) {
+ ptep = (pte_t *)pudp;
+ goto update_the_pte;
+ }
+ pmdp = pmd_alloc(&init_mm, pudp, idx);
+ if (!pmdp)
+ continue;
+ if (pmd_leaf(*pmdp)) {
+ ptep = pmdp_ptep(pmdp);
+ goto update_the_pte;
+ }
+ ptep = pte_alloc_kernel(pmdp, idx);
+ if (!ptep)
+ continue;
+update_the_pte:
+ radix__pte_update(&init_mm, idx, ptep, clear, 0, 0);
+ }
+
+ radix__flush_tlb_kernel_range(start, end);
+}
+
+void radix__mark_rodata_ro(void)
+{
+ unsigned long start, end;
+
+ start = (unsigned long)_stext;
+ end = (unsigned long)__end_rodata;
+
+ radix__change_memory_range(start, end, _PAGE_WRITE);
+
+ for (start = PAGE_OFFSET; start < (unsigned long)_stext; start += PAGE_SIZE) {
+ end = start + PAGE_SIZE;
+ if (overlaps_interrupt_vector_text(start, end))
+ radix__change_memory_range(start, end, _PAGE_WRITE);
+ else
+ break;
+ }
+}
+
+void radix__mark_initmem_nx(void)
+{
+ unsigned long start = (unsigned long)__init_begin;
+ unsigned long end = (unsigned long)__init_end;
+
+ radix__change_memory_range(start, end, _PAGE_EXEC);
+}
+#endif /* CONFIG_STRICT_KERNEL_RWX */
+
+static inline void __meminit
+print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec)
+{
+ char buf[10];
+
+ if (end <= start)
+ return;
+
+ string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf));
+
+ pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf,
+ exec ? " (exec)" : "");
+}
+
+static unsigned long next_boundary(unsigned long addr, unsigned long end)
+{
+#ifdef CONFIG_STRICT_KERNEL_RWX
+ unsigned long stext_phys;
+
+ stext_phys = __pa_symbol(_stext);
+
+ // Relocatable kernel running at non-zero real address
+ if (stext_phys != 0) {
+ // The end of interrupts code at zero is a rodata boundary
+ unsigned long end_intr = __pa_symbol(__end_interrupts) - stext_phys;
+ if (addr < end_intr)
+ return end_intr;
+
+ // Start of relocated kernel text is a rodata boundary
+ if (addr < stext_phys)
+ return stext_phys;
+ }
+
+ if (addr < __pa_symbol(__srwx_boundary))
+ return __pa_symbol(__srwx_boundary);
+#endif
+ return end;
+}
+
+static int __meminit create_physical_mapping(unsigned long start,
+ unsigned long end,
+ int nid, pgprot_t _prot,
+ unsigned long mapping_sz_limit)
+{
+ unsigned long vaddr, addr, mapping_size = 0;
+ bool prev_exec, exec = false;
+ pgprot_t prot;
+ int psize;
+ unsigned long max_mapping_size = memory_block_size;
+
+ if (mapping_sz_limit < max_mapping_size)
+ max_mapping_size = mapping_sz_limit;
+
+ if (debug_pagealloc_enabled())
+ max_mapping_size = PAGE_SIZE;
+
+ start = ALIGN(start, PAGE_SIZE);
+ end = ALIGN_DOWN(end, PAGE_SIZE);
+ for (addr = start; addr < end; addr += mapping_size) {
+ unsigned long gap, previous_size;
+ int rc;
+
+ gap = next_boundary(addr, end) - addr;
+ if (gap > max_mapping_size)
+ gap = max_mapping_size;
+ previous_size = mapping_size;
+ prev_exec = exec;
+
+ if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE &&
+ mmu_psize_defs[MMU_PAGE_1G].shift) {
+ mapping_size = PUD_SIZE;
+ psize = MMU_PAGE_1G;
+ } else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
+ mmu_psize_defs[MMU_PAGE_2M].shift) {
+ mapping_size = PMD_SIZE;
+ psize = MMU_PAGE_2M;
+ } else {
+ mapping_size = PAGE_SIZE;
+ psize = mmu_virtual_psize;
+ }
+
+ vaddr = (unsigned long)__va(addr);
+
+ if (overlaps_kernel_text(vaddr, vaddr + mapping_size) ||
+ overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) {
+ prot = PAGE_KERNEL_X;
+ exec = true;
+ } else {
+ prot = _prot;
+ exec = false;
+ }
+
+ if (mapping_size != previous_size || exec != prev_exec) {
+ print_mapping(start, addr, previous_size, prev_exec);
+ start = addr;
+ }
+
+ rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end);
+ if (rc)
+ return rc;
+
+ update_page_count(psize, 1);
+ }
+
+ print_mapping(start, addr, mapping_size, exec);
+ return 0;
+}
+
+#ifdef CONFIG_KFENCE
+static __init phys_addr_t alloc_kfence_pool(void)
+{
+ phys_addr_t kfence_pool;
+
+ /*
+ * TODO: Support to enable KFENCE after bootup depends on the ability to
+ * split page table mappings. As such support is not currently
+ * implemented for radix pagetables, support enabling KFENCE
+ * only at system startup for now.
+ *
+ * After support for splitting mappings is available on radix,
+ * alloc_kfence_pool() & map_kfence_pool() can be dropped and
+ * mapping for __kfence_pool memory can be
+ * split during arch_kfence_init_pool().
+ */
+ if (!kfence_early_init)
+ goto no_kfence;
+
+ kfence_pool = memblock_phys_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);
+ if (!kfence_pool)
+ goto no_kfence;
+
+ memblock_mark_nomap(kfence_pool, KFENCE_POOL_SIZE);
+ return kfence_pool;
+
+no_kfence:
+ disable_kfence();
+ return 0;
+}
+
+static __init void map_kfence_pool(phys_addr_t kfence_pool)
+{
+ if (!kfence_pool)
+ return;
+
+ if (create_physical_mapping(kfence_pool, kfence_pool + KFENCE_POOL_SIZE,
+ -1, PAGE_KERNEL, PAGE_SIZE))
+ goto err;
+
+ memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE);
+ __kfence_pool = __va(kfence_pool);
+ return;
+
+err:
+ memblock_phys_free(kfence_pool, KFENCE_POOL_SIZE);
+ disable_kfence();
+}
+#else
+static inline phys_addr_t alloc_kfence_pool(void) { return 0; }
+static inline void map_kfence_pool(phys_addr_t kfence_pool) { }
+#endif
+
+static void __init radix_init_pgtable(void)
+{
+ phys_addr_t kfence_pool;
+ unsigned long rts_field;
+ phys_addr_t start, end;
+ u64 i;
+
+ /* We don't support slb for radix */
+ slb_set_size(0);
+
+ kfence_pool = alloc_kfence_pool();
+
+ /*
+ * Create the linear mapping
+ */
+ for_each_mem_range(i, &start, &end) {
+ /*
+ * The memblock allocator is up at this point, so the
+ * page tables will be allocated within the range. No
+ * need or a node (which we don't have yet).
+ */
+
+ if (end >= RADIX_VMALLOC_START) {
+ pr_warn("Outside the supported range\n");
+ continue;
+ }
+
+ WARN_ON(create_physical_mapping(start, end,
+ -1, PAGE_KERNEL, ~0UL));
+ }
+
+ map_kfence_pool(kfence_pool);
+
+ if (!cpu_has_feature(CPU_FTR_HVMODE) &&
+ cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) {
+ /*
+ * Older versions of KVM on these machines prefer if the
+ * guest only uses the low 19 PID bits.
+ */
+ mmu_pid_bits = 19;
+ }
+ mmu_base_pid = 1;
+
+ /*
+ * Allocate Partition table and process table for the
+ * host.
+ */
+ BUG_ON(PRTB_SIZE_SHIFT > 36);
+ process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0);
+ /*
+ * Fill in the process table.
+ */
+ rts_field = radix__get_tree_size();
+ process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);
+
+ /*
+ * The init_mm context is given the first available (non-zero) PID,
+ * which is the "guard PID" and contains no page table. PIDR should
+ * never be set to zero because that duplicates the kernel address
+ * space at the 0x0... offset (quadrant 0)!
+ *
+ * An arbitrary PID that may later be allocated by the PID allocator
+ * for userspace processes must not be used either, because that
+ * would cause stale user mappings for that PID on CPUs outside of
+ * the TLB invalidation scheme (because it won't be in mm_cpumask).
+ *
+ * So permanently carve out one PID for the purpose of a guard PID.
+ */
+ init_mm.context.id = mmu_base_pid;
+ mmu_base_pid++;
+}
+
+static void __init radix_init_partition_table(void)
+{
+ unsigned long rts_field, dw0, dw1;
+
+ mmu_partition_table_init();
+ rts_field = radix__get_tree_size();
+ dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
+ dw1 = __pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR;
+ mmu_partition_table_set_entry(0, dw0, dw1, false);
+
+ pr_info("Initializing Radix MMU\n");
+}
+
+static int __init get_idx_from_shift(unsigned int shift)
+{
+ int idx = -1;
+
+ switch (shift) {
+ case 0xc:
+ idx = MMU_PAGE_4K;
+ break;
+ case 0x10:
+ idx = MMU_PAGE_64K;
+ break;
+ case 0x15:
+ idx = MMU_PAGE_2M;
+ break;
+ case 0x1e:
+ idx = MMU_PAGE_1G;
+ break;
+ }
+ return idx;
+}
+
+static int __init radix_dt_scan_page_sizes(unsigned long node,
+ const char *uname, int depth,
+ void *data)
+{
+ int size = 0;
+ int shift, idx;
+ unsigned int ap;
+ const __be32 *prop;
+ const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+
+ /* We are scanning "cpu" nodes only */
+ if (type == NULL || strcmp(type, "cpu") != 0)
+ return 0;
+
+ /* Grab page size encodings */
+ prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
+ if (!prop)
+ return 0;
+
+ pr_info("Page sizes from device-tree:\n");
+ for (; size >= 4; size -= 4, ++prop) {
+
+ struct mmu_psize_def *def;
+
+ /* top 3 bit is AP encoding */
+ shift = be32_to_cpu(prop[0]) & ~(0xe << 28);
+ ap = be32_to_cpu(prop[0]) >> 29;
+ pr_info("Page size shift = %d AP=0x%x\n", shift, ap);
+
+ idx = get_idx_from_shift(shift);
+ if (idx < 0)
+ continue;
+
+ def = &mmu_psize_defs[idx];
+ def->shift = shift;
+ def->ap = ap;
+ def->h_rpt_pgsize = psize_to_rpti_pgsize(idx);
+ }
+
+ /* needed ? */
+ cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
+ return 1;
+}
+
+void __init radix__early_init_devtree(void)
+{
+ int rc;
+
+ /*
+ * Try to find the available page sizes in the device-tree
+ */
+ rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
+ if (!rc) {
+ /*
+ * No page size details found in device tree.
+ * Let's assume we have page 4k and 64k support
+ */
+ mmu_psize_defs[MMU_PAGE_4K].shift = 12;
+ mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;
+ mmu_psize_defs[MMU_PAGE_4K].h_rpt_pgsize =
+ psize_to_rpti_pgsize(MMU_PAGE_4K);
+
+ mmu_psize_defs[MMU_PAGE_64K].shift = 16;
+ mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
+ mmu_psize_defs[MMU_PAGE_64K].h_rpt_pgsize =
+ psize_to_rpti_pgsize(MMU_PAGE_64K);
+ }
+ return;
+}
+
+void __init radix__early_init_mmu(void)
+{
+ unsigned long lpcr;
+
+#ifdef CONFIG_PPC_64S_HASH_MMU
+#ifdef CONFIG_PPC_64K_PAGES
+ /* PAGE_SIZE mappings */
+ mmu_virtual_psize = MMU_PAGE_64K;
+#else
+ mmu_virtual_psize = MMU_PAGE_4K;
+#endif
+#endif
+ /*
+ * initialize page table size
+ */
+ __pte_index_size = RADIX_PTE_INDEX_SIZE;
+ __pmd_index_size = RADIX_PMD_INDEX_SIZE;
+ __pud_index_size = RADIX_PUD_INDEX_SIZE;
+ __pgd_index_size = RADIX_PGD_INDEX_SIZE;
+ __pud_cache_index = RADIX_PUD_INDEX_SIZE;
+ __pte_table_size = RADIX_PTE_TABLE_SIZE;
+ __pmd_table_size = RADIX_PMD_TABLE_SIZE;
+ __pud_table_size = RADIX_PUD_TABLE_SIZE;
+ __pgd_table_size = RADIX_PGD_TABLE_SIZE;
+
+ __pmd_val_bits = RADIX_PMD_VAL_BITS;
+ __pud_val_bits = RADIX_PUD_VAL_BITS;
+ __pgd_val_bits = RADIX_PGD_VAL_BITS;
+
+ __kernel_virt_start = RADIX_KERN_VIRT_START;
+ __vmalloc_start = RADIX_VMALLOC_START;
+ __vmalloc_end = RADIX_VMALLOC_END;
+ __kernel_io_start = RADIX_KERN_IO_START;
+ __kernel_io_end = RADIX_KERN_IO_END;
+ vmemmap = (struct page *)RADIX_VMEMMAP_START;
+ ioremap_bot = IOREMAP_BASE;
+
+#ifdef CONFIG_PCI
+ pci_io_base = ISA_IO_BASE;
+#endif
+ __pte_frag_nr = RADIX_PTE_FRAG_NR;
+ __pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT;
+ __pmd_frag_nr = RADIX_PMD_FRAG_NR;
+ __pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT;
+
+ radix_init_pgtable();
+
+ if (!firmware_has_feature(FW_FEATURE_LPAR)) {
+ lpcr = mfspr(SPRN_LPCR);
+ mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
+ radix_init_partition_table();
+ } else {
+ radix_init_pseries();
+ }
+
+ memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
+
+ /* Switch to the guard PID before turning on MMU */
+ radix__switch_mmu_context(NULL, &init_mm);
+ tlbiel_all();
+}
+
+void radix__early_init_mmu_secondary(void)
+{
+ unsigned long lpcr;
+ /*
+ * update partition table control register and UPRT
+ */
+ if (!firmware_has_feature(FW_FEATURE_LPAR)) {
+ lpcr = mfspr(SPRN_LPCR);
+ mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
+
+ set_ptcr_when_no_uv(__pa(partition_tb) |
+ (PATB_SIZE_SHIFT - 12));
+ }
+
+ radix__switch_mmu_context(NULL, &init_mm);
+ tlbiel_all();
+
+ /* Make sure userspace can't change the AMR */
+ mtspr(SPRN_UAMOR, 0);
+}
+
+/* Called during kexec sequence with MMU off */
+notrace void radix__mmu_cleanup_all(void)
+{
+ unsigned long lpcr;
+
+ if (!firmware_has_feature(FW_FEATURE_LPAR)) {
+ lpcr = mfspr(SPRN_LPCR);
+ mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
+ set_ptcr_when_no_uv(0);
+ powernv_set_nmmu_ptcr(0);
+ radix__flush_tlb_all();
+ }
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+ pte_t *pte;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ pte = pte_start + i;
+ if (!pte_none(*pte))
+ return;
+ }
+
+ pte_free_kernel(&init_mm, pte_start);
+ pmd_clear(pmd);
+}
+
+static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
+{
+ pmd_t *pmd;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ pmd = pmd_start + i;
+ if (!pmd_none(*pmd))
+ return;
+ }
+
+ pmd_free(&init_mm, pmd_start);
+ pud_clear(pud);
+}
+
+static void free_pud_table(pud_t *pud_start, p4d_t *p4d)
+{
+ pud_t *pud;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PUD; i++) {
+ pud = pud_start + i;
+ if (!pud_none(*pud))
+ return;
+ }
+
+ pud_free(&init_mm, pud_start);
+ p4d_clear(p4d);
+}
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end)
+{
+ unsigned long start = ALIGN_DOWN(addr, PMD_SIZE);
+
+ return !vmemmap_populated(start, PMD_SIZE);
+}
+
+static bool __meminit vmemmap_page_is_unused(unsigned long addr, unsigned long end)
+{
+ unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE);
+
+ return !vmemmap_populated(start, PAGE_SIZE);
+
+}
+#endif
+
+static void __meminit free_vmemmap_pages(struct page *page,
+ struct vmem_altmap *altmap,
+ int order)
+{
+ unsigned int nr_pages = 1 << order;
+
+ if (altmap) {
+ unsigned long alt_start, alt_end;
+ unsigned long base_pfn = page_to_pfn(page);
+
+ /*
+ * with 2M vmemmap mmaping we can have things setup
+ * such that even though atlmap is specified we never
+ * used altmap.
+ */
+ alt_start = altmap->base_pfn;
+ alt_end = altmap->base_pfn + altmap->reserve + altmap->free;
+
+ if (base_pfn >= alt_start && base_pfn < alt_end) {
+ vmem_altmap_free(altmap, nr_pages);
+ return;
+ }
+ }
+
+ if (PageReserved(page)) {
+ /* allocated from memblock */
+ while (nr_pages--)
+ free_reserved_page(page++);
+ } else
+ __free_pages(page, order);
+}
+
+static void __meminit remove_pte_table(pte_t *pte_start, unsigned long addr,
+ unsigned long end, bool direct,
+ struct vmem_altmap *altmap)
+{
+ unsigned long next, pages = 0;
+ pte_t *pte;
+
+ pte = pte_start + pte_index(addr);
+ for (; addr < end; addr = next, pte++) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ if (next > end)
+ next = end;
+
+ if (!pte_present(*pte))
+ continue;
+
+ if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
+ if (!direct)
+ free_vmemmap_pages(pte_page(*pte), altmap, 0);
+ pte_clear(&init_mm, addr, pte);
+ pages++;
+ }
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ else if (!direct && vmemmap_page_is_unused(addr, next)) {
+ free_vmemmap_pages(pte_page(*pte), altmap, 0);
+ pte_clear(&init_mm, addr, pte);
+ }
+#endif
+ }
+ if (direct)
+ update_page_count(mmu_virtual_psize, -pages);
+}
+
+static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
+ unsigned long end, bool direct,
+ struct vmem_altmap *altmap)
+{
+ unsigned long next, pages = 0;
+ pte_t *pte_base;
+ pmd_t *pmd;
+
+ pmd = pmd_start + pmd_index(addr);
+ for (; addr < end; addr = next, pmd++) {
+ next = pmd_addr_end(addr, end);
+
+ if (!pmd_present(*pmd))
+ continue;
+
+ if (pmd_leaf(*pmd)) {
+ if (IS_ALIGNED(addr, PMD_SIZE) &&
+ IS_ALIGNED(next, PMD_SIZE)) {
+ if (!direct)
+ free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
+ pte_clear(&init_mm, addr, (pte_t *)pmd);
+ pages++;
+ }
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ else if (!direct && vmemmap_pmd_is_unused(addr, next)) {
+ free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
+ pte_clear(&init_mm, addr, (pte_t *)pmd);
+ }
+#endif
+ continue;
+ }
+
+ pte_base = (pte_t *)pmd_page_vaddr(*pmd);
+ remove_pte_table(pte_base, addr, next, direct, altmap);
+ free_pte_table(pte_base, pmd);
+ }
+ if (direct)
+ update_page_count(MMU_PAGE_2M, -pages);
+}
+
+static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr,
+ unsigned long end, bool direct,
+ struct vmem_altmap *altmap)
+{
+ unsigned long next, pages = 0;
+ pmd_t *pmd_base;
+ pud_t *pud;
+
+ pud = pud_start + pud_index(addr);
+ for (; addr < end; addr = next, pud++) {
+ next = pud_addr_end(addr, end);
+
+ if (!pud_present(*pud))
+ continue;
+
+ if (pud_leaf(*pud)) {
+ if (!IS_ALIGNED(addr, PUD_SIZE) ||
+ !IS_ALIGNED(next, PUD_SIZE)) {
+ WARN_ONCE(1, "%s: unaligned range\n", __func__);
+ continue;
+ }
+ pte_clear(&init_mm, addr, (pte_t *)pud);
+ pages++;
+ continue;
+ }
+
+ pmd_base = pud_pgtable(*pud);
+ remove_pmd_table(pmd_base, addr, next, direct, altmap);
+ free_pmd_table(pmd_base, pud);
+ }
+ if (direct)
+ update_page_count(MMU_PAGE_1G, -pages);
+}
+
+static void __meminit
+remove_pagetable(unsigned long start, unsigned long end, bool direct,
+ struct vmem_altmap *altmap)
+{
+ unsigned long addr, next;
+ pud_t *pud_base;
+ pgd_t *pgd;
+ p4d_t *p4d;
+
+ spin_lock(&init_mm.page_table_lock);
+
+ for (addr = start; addr < end; addr = next) {
+ next = pgd_addr_end(addr, end);
+
+ pgd = pgd_offset_k(addr);
+ p4d = p4d_offset(pgd, addr);
+ if (!p4d_present(*p4d))
+ continue;
+
+ if (p4d_leaf(*p4d)) {
+ if (!IS_ALIGNED(addr, P4D_SIZE) ||
+ !IS_ALIGNED(next, P4D_SIZE)) {
+ WARN_ONCE(1, "%s: unaligned range\n", __func__);
+ continue;
+ }
+
+ pte_clear(&init_mm, addr, (pte_t *)pgd);
+ continue;
+ }
+
+ pud_base = p4d_pgtable(*p4d);
+ remove_pud_table(pud_base, addr, next, direct, altmap);
+ free_pud_table(pud_base, p4d);
+ }
+
+ spin_unlock(&init_mm.page_table_lock);
+ radix__flush_tlb_kernel_range(start, end);
+}
+
+int __meminit radix__create_section_mapping(unsigned long start,
+ unsigned long end, int nid,
+ pgprot_t prot)
+{
+ if (end >= RADIX_VMALLOC_START) {
+ pr_warn("Outside the supported range\n");
+ return -1;
+ }
+
+ return create_physical_mapping(__pa(start), __pa(end),
+ nid, prot, ~0UL);
+}
+
+int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
+{
+ remove_pagetable(start, end, true, NULL);
+ return 0;
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+static int __map_kernel_page_nid(unsigned long ea, unsigned long pa,
+ pgprot_t flags, unsigned int map_page_size,
+ int nid)
+{
+ return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0);
+}
+
+int __meminit radix__vmemmap_create_mapping(unsigned long start,
+ unsigned long page_size,
+ unsigned long phys)
+{
+ /* Create a PTE encoding */
+ int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
+ int ret;
+
+ if ((start + page_size) >= RADIX_VMEMMAP_END) {
+ pr_warn("Outside the supported range\n");
+ return -1;
+ }
+
+ ret = __map_kernel_page_nid(start, phys, PAGE_KERNEL, page_size, nid);
+ BUG_ON(ret);
+
+ return 0;
+}
+
+#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
+bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
+{
+ if (radix_enabled())
+ return __vmemmap_can_optimize(altmap, pgmap);
+
+ return false;
+}
+#endif
+
+int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
+ unsigned long addr, unsigned long next)
+{
+ int large = pmd_leaf(*pmdp);
+
+ if (large)
+ vmemmap_verify(pmdp_ptep(pmdp), node, addr, next);
+
+ return large;
+}
+
+void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node,
+ unsigned long addr, unsigned long next)
+{
+ pte_t entry;
+ pte_t *ptep = pmdp_ptep(pmdp);
+
+ VM_BUG_ON(!IS_ALIGNED(addr, PMD_SIZE));
+ entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
+ set_pte_at(&init_mm, addr, ptep, entry);
+ asm volatile("ptesync": : :"memory");
+
+ vmemmap_verify(ptep, node, addr, next);
+}
+
+static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long addr,
+ int node,
+ struct vmem_altmap *altmap,
+ struct page *reuse)
+{
+ pte_t *pte = pte_offset_kernel(pmdp, addr);
+
+ if (pte_none(*pte)) {
+ pte_t entry;
+ void *p;
+
+ if (!reuse) {
+ /*
+ * make sure we don't create altmap mappings
+ * covering things outside the device.
+ */
+ if (altmap && altmap_cross_boundary(altmap, addr, PAGE_SIZE))
+ altmap = NULL;
+
+ p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
+ if (!p && altmap)
+ p = vmemmap_alloc_block_buf(PAGE_SIZE, node, NULL);
+ if (!p)
+ return NULL;
+ pr_debug("PAGE_SIZE vmemmap mapping\n");
+ } else {
+ /*
+ * When a PTE/PMD entry is freed from the init_mm
+ * there's a free_pages() call to this page allocated
+ * above. Thus this get_page() is paired with the
+ * put_page_testzero() on the freeing path.
+ * This can only called by certain ZONE_DEVICE path,
+ * and through vmemmap_populate_compound_pages() when
+ * slab is available.
+ */
+ get_page(reuse);
+ p = page_to_virt(reuse);
+ pr_debug("Tail page reuse vmemmap mapping\n");
+ }
+
+ VM_BUG_ON(!PAGE_ALIGNED(addr));
+ entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
+ set_pte_at(&init_mm, addr, pte, entry);
+ asm volatile("ptesync": : :"memory");
+ }
+ return pte;
+}
+
+static inline pud_t *vmemmap_pud_alloc(p4d_t *p4dp, int node,
+ unsigned long address)
+{
+ pud_t *pud;
+
+ /* All early vmemmap mapping to keep simple do it at PAGE_SIZE */
+ if (unlikely(p4d_none(*p4dp))) {
+ if (unlikely(!slab_is_available())) {
+ pud = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
+ p4d_populate(&init_mm, p4dp, pud);
+ /* go to the pud_offset */
+ } else
+ return pud_alloc(&init_mm, p4dp, address);
+ }
+ return pud_offset(p4dp, address);
+}
+
+static inline pmd_t *vmemmap_pmd_alloc(pud_t *pudp, int node,
+ unsigned long address)
+{
+ pmd_t *pmd;
+
+ /* All early vmemmap mapping to keep simple do it at PAGE_SIZE */
+ if (unlikely(pud_none(*pudp))) {
+ if (unlikely(!slab_is_available())) {
+ pmd = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
+ pud_populate(&init_mm, pudp, pmd);
+ } else
+ return pmd_alloc(&init_mm, pudp, address);
+ }
+ return pmd_offset(pudp, address);
+}
+
+static inline pte_t *vmemmap_pte_alloc(pmd_t *pmdp, int node,
+ unsigned long address)
+{
+ pte_t *pte;
+
+ /* All early vmemmap mapping to keep simple do it at PAGE_SIZE */
+ if (unlikely(pmd_none(*pmdp))) {
+ if (unlikely(!slab_is_available())) {
+ pte = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
+ pmd_populate(&init_mm, pmdp, pte);
+ } else
+ return pte_alloc_kernel(pmdp, address);
+ }
+ return pte_offset_kernel(pmdp, address);
+}
+
+
+
+int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, int node,
+ struct vmem_altmap *altmap)
+{
+ unsigned long addr;
+ unsigned long next;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ /*
+ * If altmap is present, Make sure we align the start vmemmap addr
+ * to PAGE_SIZE so that we calculate the correct start_pfn in
+ * altmap boundary check to decide whether we should use altmap or
+ * RAM based backing memory allocation. Also the address need to be
+ * aligned for set_pte operation. If the start addr is already
+ * PMD_SIZE aligned and with in the altmap boundary then we will
+ * try to use a pmd size altmap mapping else we go for page size
+ * mapping.
+ *
+ * If altmap is not present, align the vmemmap addr to PMD_SIZE and
+ * always allocate a PMD size page for vmemmap backing.
+ *
+ */
+
+ if (altmap)
+ start = ALIGN_DOWN(start, PAGE_SIZE);
+ else
+ start = ALIGN_DOWN(start, PMD_SIZE);
+
+ for (addr = start; addr < end; addr = next) {
+ next = pmd_addr_end(addr, end);
+
+ pgd = pgd_offset_k(addr);
+ p4d = p4d_offset(pgd, addr);
+ pud = vmemmap_pud_alloc(p4d, node, addr);
+ if (!pud)
+ return -ENOMEM;
+ pmd = vmemmap_pmd_alloc(pud, node, addr);
+ if (!pmd)
+ return -ENOMEM;
+
+ if (pmd_none(READ_ONCE(*pmd))) {
+ void *p;
+
+ /*
+ * keep it simple by checking addr PMD_SIZE alignment
+ * and verifying the device boundary condition.
+ * For us to use a pmd mapping, both addr and pfn should
+ * be aligned. We skip if addr is not aligned and for
+ * pfn we hope we have extra area in the altmap that
+ * can help to find an aligned block. This can result
+ * in altmap block allocation failures, in which case
+ * we fallback to RAM for vmemmap allocation.
+ */
+ if (altmap && (!IS_ALIGNED(addr, PMD_SIZE) ||
+ altmap_cross_boundary(altmap, addr, PMD_SIZE))) {
+ /*
+ * make sure we don't create altmap mappings
+ * covering things outside the device.
+ */
+ goto base_mapping;
+ }
+
+ p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
+ if (p) {
+ vmemmap_set_pmd(pmd, p, node, addr, next);
+ pr_debug("PMD_SIZE vmemmap mapping\n");
+ continue;
+ } else {
+ /*
+ * A vmemmap block allocation can fail due to
+ * alignment requirements and we trying to align
+ * things aggressively there by running out of
+ * space. Try base mapping on failure.
+ */
+ goto base_mapping;
+ }
+ } else if (vmemmap_check_pmd(pmd, node, addr, next)) {
+ /*
+ * If a huge mapping exist due to early call to
+ * vmemmap_populate, let's try to use that.
+ */
+ continue;
+ }
+base_mapping:
+ /*
+ * Not able allocate higher order memory to back memmap
+ * or we found a pointer to pte page. Allocate base page
+ * size vmemmap
+ */
+ pte = vmemmap_pte_alloc(pmd, node, addr);
+ if (!pte)
+ return -ENOMEM;
+
+ pte = radix__vmemmap_pte_populate(pmd, addr, node, altmap, NULL);
+ if (!pte)
+ return -ENOMEM;
+
+ vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+ next = addr + PAGE_SIZE;
+ }
+ return 0;
+}
+
+static pte_t * __meminit radix__vmemmap_populate_address(unsigned long addr, int node,
+ struct vmem_altmap *altmap,
+ struct page *reuse)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = pgd_offset_k(addr);
+ p4d = p4d_offset(pgd, addr);
+ pud = vmemmap_pud_alloc(p4d, node, addr);
+ if (!pud)
+ return NULL;
+ pmd = vmemmap_pmd_alloc(pud, node, addr);
+ if (!pmd)
+ return NULL;
+ if (pmd_leaf(*pmd))
+ /*
+ * The second page is mapped as a hugepage due to a nearby request.
+ * Force our mapping to page size without deduplication
+ */
+ return NULL;
+ pte = vmemmap_pte_alloc(pmd, node, addr);
+ if (!pte)
+ return NULL;
+ radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
+ vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+
+ return pte;
+}
+
+static pte_t * __meminit vmemmap_compound_tail_page(unsigned long addr,
+ unsigned long pfn_offset, int node)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ unsigned long map_addr;
+
+ /* the second vmemmap page which we use for duplication */
+ map_addr = addr - pfn_offset * sizeof(struct page) + PAGE_SIZE;
+ pgd = pgd_offset_k(map_addr);
+ p4d = p4d_offset(pgd, map_addr);
+ pud = vmemmap_pud_alloc(p4d, node, map_addr);
+ if (!pud)
+ return NULL;
+ pmd = vmemmap_pmd_alloc(pud, node, map_addr);
+ if (!pmd)
+ return NULL;
+ if (pmd_leaf(*pmd))
+ /*
+ * The second page is mapped as a hugepage due to a nearby request.
+ * Force our mapping to page size without deduplication
+ */
+ return NULL;
+ pte = vmemmap_pte_alloc(pmd, node, map_addr);
+ if (!pte)
+ return NULL;
+ /*
+ * Check if there exist a mapping to the left
+ */
+ if (pte_none(*pte)) {
+ /*
+ * Populate the head page vmemmap page.
+ * It can fall in different pmd, hence
+ * vmemmap_populate_address()
+ */
+ pte = radix__vmemmap_populate_address(map_addr - PAGE_SIZE, node, NULL, NULL);
+ if (!pte)
+ return NULL;
+ /*
+ * Populate the tail pages vmemmap page
+ */
+ pte = radix__vmemmap_pte_populate(pmd, map_addr, node, NULL, NULL);
+ if (!pte)
+ return NULL;
+ vmemmap_verify(pte, node, map_addr, map_addr + PAGE_SIZE);
+ return pte;
+ }
+ return pte;
+}
+
+int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
+ unsigned long start,
+ unsigned long end, int node,
+ struct dev_pagemap *pgmap)
+{
+ /*
+ * we want to map things as base page size mapping so that
+ * we can save space in vmemmap. We could have huge mapping
+ * covering out both edges.
+ */
+ unsigned long addr;
+ unsigned long addr_pfn = start_pfn;
+ unsigned long next;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ for (addr = start; addr < end; addr = next) {
+
+ pgd = pgd_offset_k(addr);
+ p4d = p4d_offset(pgd, addr);
+ pud = vmemmap_pud_alloc(p4d, node, addr);
+ if (!pud)
+ return -ENOMEM;
+ pmd = vmemmap_pmd_alloc(pud, node, addr);
+ if (!pmd)
+ return -ENOMEM;
+
+ if (pmd_leaf(READ_ONCE(*pmd))) {
+ /* existing huge mapping. Skip the range */
+ addr_pfn += (PMD_SIZE >> PAGE_SHIFT);
+ next = pmd_addr_end(addr, end);
+ continue;
+ }
+ pte = vmemmap_pte_alloc(pmd, node, addr);
+ if (!pte)
+ return -ENOMEM;
+ if (!pte_none(*pte)) {
+ /*
+ * This could be because we already have a compound
+ * page whose VMEMMAP_RESERVE_NR pages were mapped and
+ * this request fall in those pages.
+ */
+ addr_pfn += 1;
+ next = addr + PAGE_SIZE;
+ continue;
+ } else {
+ unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
+ unsigned long pfn_offset = addr_pfn - ALIGN_DOWN(addr_pfn, nr_pages);
+ pte_t *tail_page_pte;
+
+ /*
+ * if the address is aligned to huge page size it is the
+ * head mapping.
+ */
+ if (pfn_offset == 0) {
+ /* Populate the head page vmemmap page */
+ pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
+ if (!pte)
+ return -ENOMEM;
+ vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+
+ /*
+ * Populate the tail pages vmemmap page
+ * It can fall in different pmd, hence
+ * vmemmap_populate_address()
+ */
+ pte = radix__vmemmap_populate_address(addr + PAGE_SIZE, node, NULL, NULL);
+ if (!pte)
+ return -ENOMEM;
+
+ addr_pfn += 2;
+ next = addr + 2 * PAGE_SIZE;
+ continue;
+ }
+ /*
+ * get the 2nd mapping details
+ * Also create it if that doesn't exist
+ */
+ tail_page_pte = vmemmap_compound_tail_page(addr, pfn_offset, node);
+ if (!tail_page_pte) {
+
+ pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
+ if (!pte)
+ return -ENOMEM;
+ vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+
+ addr_pfn += 1;
+ next = addr + PAGE_SIZE;
+ continue;
+ }
+
+ pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, pte_page(*tail_page_pte));
+ if (!pte)
+ return -ENOMEM;
+ vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+
+ addr_pfn += 1;
+ next = addr + PAGE_SIZE;
+ continue;
+ }
+ }
+ return 0;
+}
+
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
+{
+ remove_pagetable(start, start + page_size, true, NULL);
+}
+
+void __ref radix__vmemmap_free(unsigned long start, unsigned long end,
+ struct vmem_altmap *altmap)
+{
+ remove_pagetable(start, end, false, altmap);
+}
+#endif
+#endif
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+
+unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, unsigned long clr,
+ unsigned long set)
+{
+ unsigned long old;
+
+#ifdef CONFIG_DEBUG_VM
+ WARN_ON(!radix__pmd_trans_huge(*pmdp));
+ assert_spin_locked(pmd_lockptr(mm, pmdp));
+#endif
+
+ old = radix__pte_update(mm, addr, pmdp_ptep(pmdp), clr, set, 1);
+ trace_hugepage_update_pmd(addr, old, clr, set);
+
+ return old;
+}
+
+unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr,
+ pud_t *pudp, unsigned long clr,
+ unsigned long set)
+{
+ unsigned long old;
+
+#ifdef CONFIG_DEBUG_VM
+ WARN_ON(!pud_trans_huge(*pudp));
+ assert_spin_locked(pud_lockptr(mm, pudp));
+#endif
+
+ old = radix__pte_update(mm, addr, pudp_ptep(pudp), clr, set, 1);
+ trace_hugepage_update_pud(addr, old, clr, set);
+
+ return old;
+}
+
+pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+
+{
+ pmd_t pmd;
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
+ /*
+ * khugepaged calls this for normal pmd
+ */
+ pmd = *pmdp;
+ pmd_clear(pmdp);
+
+ radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);
+
+ return pmd;
+}
+
+/*
+ * For us pgtable_t is pte_t *. Inorder to save the deposisted
+ * page table, we consider the allocated page table as a list
+ * head. On withdraw we need to make sure we zero out the used
+ * list_head memory area.
+ */
+void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+ pgtable_t pgtable)
+{
+ struct list_head *lh = (struct list_head *) pgtable;
+
+ assert_spin_locked(pmd_lockptr(mm, pmdp));
+
+ /* FIFO */
+ if (!pmd_huge_pte(mm, pmdp))
+ INIT_LIST_HEAD(lh);
+ else
+ list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
+ pmd_huge_pte(mm, pmdp) = pgtable;
+}
+
+pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
+{
+ pte_t *ptep;
+ pgtable_t pgtable;
+ struct list_head *lh;
+
+ assert_spin_locked(pmd_lockptr(mm, pmdp));
+
+ /* FIFO */
+ pgtable = pmd_huge_pte(mm, pmdp);
+ lh = (struct list_head *) pgtable;
+ if (list_empty(lh))
+ pmd_huge_pte(mm, pmdp) = NULL;
+ else {
+ pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
+ list_del(lh);
+ }
+ ptep = (pte_t *) pgtable;
+ *ptep = __pte(0);
+ ptep++;
+ *ptep = __pte(0);
+ return pgtable;
+}
+
+pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp)
+{
+ pmd_t old_pmd;
+ unsigned long old;
+
+ old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
+ old_pmd = __pmd(old);
+ return old_pmd;
+}
+
+pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pud_t *pudp)
+{
+ pud_t old_pud;
+ unsigned long old;
+
+ old = radix__pud_hugepage_update(mm, addr, pudp, ~0UL, 0);
+ old_pud = __pud(old);
+ return old_pud;
+}
+
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
+ pte_t entry, unsigned long address, int psize)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_SOFT_DIRTY |
+ _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC);
+
+ unsigned long change = pte_val(entry) ^ pte_val(*ptep);
+ /*
+ * On POWER9, the NMMU is not able to relax PTE access permissions
+ * for a translation with a TLB. The PTE must be invalidated, TLB
+ * flushed before the new PTE is installed.
+ *
+ * This only needs to be done for radix, because hash translation does
+ * flush when updating the linux pte (and we don't support NMMU
+ * accelerators on HPT on POWER9 anyway XXX: do we?).
+ *
+ * POWER10 (and P9P) NMMU does behave as per ISA.
+ */
+ if (!cpu_has_feature(CPU_FTR_ARCH_31) && (change & _PAGE_RW) &&
+ atomic_read(&mm->context.copros) > 0) {
+ unsigned long old_pte, new_pte;
+
+ old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
+ new_pte = old_pte | set;
+ radix__flush_tlb_page_psize(mm, address, psize);
+ __radix_pte_update(ptep, _PAGE_INVALID, new_pte);
+ } else {
+ __radix_pte_update(ptep, 0, set);
+ /*
+ * Book3S does not require a TLB flush when relaxing access
+ * restrictions when the address space (modulo the POWER9 nest
+ * MMU issue above) because the MMU will reload the PTE after
+ * taking an access fault, as defined by the architecture. See
+ * "Setting a Reference or Change Bit or Upgrading Access
+ * Authority (PTE Subject to Atomic Hardware Updates)" in
+ * Power ISA Version 3.1B.
+ */
+ }
+ /* See ptesync comment in radix__set_pte_at */
+}
+
+void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t old_pte, pte_t pte)
+{
+ struct mm_struct *mm = vma->vm_mm;
+
+ /*
+ * POWER9 NMMU must flush the TLB after clearing the PTE before
+ * installing a PTE with more relaxed access permissions, see
+ * radix__ptep_set_access_flags.
+ */
+ if (!cpu_has_feature(CPU_FTR_ARCH_31) &&
+ is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
+ (atomic_read(&mm->context.copros) > 0))
+ radix__flush_tlb_page(vma, addr);
+
+ set_pte_at(mm, addr, ptep, pte);
+}
+
+int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
+{
+ pte_t *ptep = (pte_t *)pud;
+ pte_t new_pud = pfn_pte(__phys_to_pfn(addr), prot);
+
+ if (!radix_enabled())
+ return 0;
+
+ set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pud);
+
+ return 1;
+}
+
+int pud_clear_huge(pud_t *pud)
+{
+ if (pud_leaf(*pud)) {
+ pud_clear(pud);
+ return 1;
+ }
+
+ return 0;
+}
+
+int pud_free_pmd_page(pud_t *pud, unsigned long addr)
+{
+ pmd_t *pmd;
+ int i;
+
+ pmd = pud_pgtable(*pud);
+ pud_clear(pud);
+
+ flush_tlb_kernel_range(addr, addr + PUD_SIZE);
+
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ if (!pmd_none(pmd[i])) {
+ pte_t *pte;
+ pte = (pte_t *)pmd_page_vaddr(pmd[i]);
+
+ pte_free_kernel(&init_mm, pte);
+ }
+ }
+
+ pmd_free(&init_mm, pmd);
+
+ return 1;
+}
+
+int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
+{
+ pte_t *ptep = (pte_t *)pmd;
+ pte_t new_pmd = pfn_pte(__phys_to_pfn(addr), prot);
+
+ if (!radix_enabled())
+ return 0;
+
+ set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pmd);
+
+ return 1;
+}
+
+int pmd_clear_huge(pmd_t *pmd)
+{
+ if (pmd_leaf(*pmd)) {
+ pmd_clear(pmd);
+ return 1;
+ }
+
+ return 0;
+}
+
+int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
+{
+ pte_t *pte;
+
+ pte = (pte_t *)pmd_page_vaddr(*pmd);
+ pmd_clear(pmd);
+
+ flush_tlb_kernel_range(addr, addr + PMD_SIZE);
+
+ pte_free_kernel(&init_mm, pte);
+
+ return 1;
+}
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
new file mode 100644
index 000000000000..9e1f6558d026
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -0,0 +1,1587 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * TLB flush routines for radix kernels.
+ *
+ * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
+ */
+
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/memblock.h>
+#include <linux/mmu_context.h>
+#include <linux/sched/mm.h>
+#include <linux/debugfs.h>
+
+#include <asm/ppc-opcode.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+#include <asm/trace.h>
+#include <asm/cputhreads.h>
+#include <asm/plpar_wrappers.h>
+
+#include "internal.h"
+
+/*
+ * tlbiel instruction for radix, set invalidation
+ * i.e., r=1 and is=01 or is=10 or is=11
+ */
+static __always_inline void tlbiel_radix_set_isa300(unsigned int set, unsigned int is,
+ unsigned int pid,
+ unsigned int ric, unsigned int prs)
+{
+ unsigned long rb;
+ unsigned long rs;
+
+ rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
+ rs = ((unsigned long)pid << PPC_BITLSHIFT(31));
+
+ asm volatile(PPC_TLBIEL(%0, %1, %2, %3, 1)
+ : : "r"(rb), "r"(rs), "i"(ric), "i"(prs)
+ : "memory");
+}
+
+static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is)
+{
+ unsigned int set;
+
+ asm volatile("ptesync": : :"memory");
+
+ /*
+ * Flush the first set of the TLB, and the entire Page Walk Cache
+ * and partition table entries. Then flush the remaining sets of the
+ * TLB.
+ */
+
+ if (early_cpu_has_feature(CPU_FTR_HVMODE)) {
+ /* MSR[HV] should flush partition scope translations first. */
+ tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0);
+
+ if (!early_cpu_has_feature(CPU_FTR_ARCH_31)) {
+ for (set = 1; set < num_sets; set++)
+ tlbiel_radix_set_isa300(set, is, 0,
+ RIC_FLUSH_TLB, 0);
+ }
+ }
+
+ /* Flush process scoped entries. */
+ tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1);
+
+ if (!early_cpu_has_feature(CPU_FTR_ARCH_31)) {
+ for (set = 1; set < num_sets; set++)
+ tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1);
+ }
+
+ ppc_after_tlbiel_barrier();
+}
+
+void radix__tlbiel_all(unsigned int action)
+{
+ unsigned int is;
+
+ switch (action) {
+ case TLB_INVAL_SCOPE_GLOBAL:
+ is = 3;
+ break;
+ case TLB_INVAL_SCOPE_LPID:
+ is = 2;
+ break;
+ default:
+ BUG();
+ }
+
+ if (early_cpu_has_feature(CPU_FTR_ARCH_300))
+ tlbiel_all_isa300(POWER9_TLB_SETS_RADIX, is);
+ else
+ WARN(1, "%s called on pre-POWER9 CPU\n", __func__);
+
+ asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT "; isync" : : :"memory");
+}
+
+static __always_inline void __tlbiel_pid(unsigned long pid, int set,
+ unsigned long ric)
+{
+ unsigned long rb,rs,prs,r;
+
+ rb = PPC_BIT(53); /* IS = 1 */
+ rb |= set << PPC_BITLSHIFT(51);
+ rs = ((unsigned long)pid) << PPC_BITLSHIFT(31);
+ prs = 1; /* process scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(0, 1, rb, rs, ric, prs, r);
+}
+
+static __always_inline void __tlbie_pid(unsigned long pid, unsigned long ric)
+{
+ unsigned long rb,rs,prs,r;
+
+ rb = PPC_BIT(53); /* IS = 1 */
+ rs = pid << PPC_BITLSHIFT(31);
+ prs = 1; /* process scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(0, 0, rb, rs, ric, prs, r);
+}
+
+static __always_inline void __tlbie_lpid(unsigned long lpid, unsigned long ric)
+{
+ unsigned long rb,rs,prs,r;
+
+ rb = PPC_BIT(52); /* IS = 2 */
+ rs = lpid;
+ prs = 0; /* partition scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(lpid, 0, rb, rs, ric, prs, r);
+}
+
+static __always_inline void __tlbie_lpid_guest(unsigned long lpid, unsigned long ric)
+{
+ unsigned long rb,rs,prs,r;
+
+ rb = PPC_BIT(52); /* IS = 2 */
+ rs = lpid;
+ prs = 1; /* process scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(lpid, 0, rb, rs, ric, prs, r);
+}
+
+static __always_inline void __tlbiel_va(unsigned long va, unsigned long pid,
+ unsigned long ap, unsigned long ric)
+{
+ unsigned long rb,rs,prs,r;
+
+ rb = va & ~(PPC_BITMASK(52, 63));
+ rb |= ap << PPC_BITLSHIFT(58);
+ rs = pid << PPC_BITLSHIFT(31);
+ prs = 1; /* process scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(0, 1, rb, rs, ric, prs, r);
+}
+
+static __always_inline void __tlbie_va(unsigned long va, unsigned long pid,
+ unsigned long ap, unsigned long ric)
+{
+ unsigned long rb,rs,prs,r;
+
+ rb = va & ~(PPC_BITMASK(52, 63));
+ rb |= ap << PPC_BITLSHIFT(58);
+ rs = pid << PPC_BITLSHIFT(31);
+ prs = 1; /* process scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(0, 0, rb, rs, ric, prs, r);
+}
+
+static __always_inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid,
+ unsigned long ap, unsigned long ric)
+{
+ unsigned long rb,rs,prs,r;
+
+ rb = va & ~(PPC_BITMASK(52, 63));
+ rb |= ap << PPC_BITLSHIFT(58);
+ rs = lpid;
+ prs = 0; /* partition scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(lpid, 0, rb, rs, ric, prs, r);
+}
+
+
+static inline void fixup_tlbie_va(unsigned long va, unsigned long pid,
+ unsigned long ap)
+{
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+ asm volatile("ptesync": : :"memory");
+ __tlbie_va(va, 0, ap, RIC_FLUSH_TLB);
+ }
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+ asm volatile("ptesync": : :"memory");
+ __tlbie_va(va, pid, ap, RIC_FLUSH_TLB);
+ }
+}
+
+static inline void fixup_tlbie_va_range(unsigned long va, unsigned long pid,
+ unsigned long ap)
+{
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+ asm volatile("ptesync": : :"memory");
+ __tlbie_pid(0, RIC_FLUSH_TLB);
+ }
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+ asm volatile("ptesync": : :"memory");
+ __tlbie_va(va, pid, ap, RIC_FLUSH_TLB);
+ }
+}
+
+static inline void fixup_tlbie_pid(unsigned long pid)
+{
+ /*
+ * We can use any address for the invalidation, pick one which is
+ * probably unused as an optimisation.
+ */
+ unsigned long va = ((1UL << 52) - 1);
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+ asm volatile("ptesync": : :"memory");
+ __tlbie_pid(0, RIC_FLUSH_TLB);
+ }
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+ asm volatile("ptesync": : :"memory");
+ __tlbie_va(va, pid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB);
+ }
+}
+
+static inline void fixup_tlbie_lpid_va(unsigned long va, unsigned long lpid,
+ unsigned long ap)
+{
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+ asm volatile("ptesync": : :"memory");
+ __tlbie_lpid_va(va, 0, ap, RIC_FLUSH_TLB);
+ }
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+ asm volatile("ptesync": : :"memory");
+ __tlbie_lpid_va(va, lpid, ap, RIC_FLUSH_TLB);
+ }
+}
+
+static inline void fixup_tlbie_lpid(unsigned long lpid)
+{
+ /*
+ * We can use any address for the invalidation, pick one which is
+ * probably unused as an optimisation.
+ */
+ unsigned long va = ((1UL << 52) - 1);
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+ asm volatile("ptesync": : :"memory");
+ __tlbie_lpid(0, RIC_FLUSH_TLB);
+ }
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+ asm volatile("ptesync": : :"memory");
+ __tlbie_lpid_va(va, lpid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB);
+ }
+}
+
+/*
+ * We use 128 set in radix mode and 256 set in hpt mode.
+ */
+static inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
+{
+ int set;
+
+ asm volatile("ptesync": : :"memory");
+
+ switch (ric) {
+ case RIC_FLUSH_PWC:
+
+ /* For PWC, only one flush is needed */
+ __tlbiel_pid(pid, 0, RIC_FLUSH_PWC);
+ ppc_after_tlbiel_barrier();
+ return;
+ case RIC_FLUSH_TLB:
+ __tlbiel_pid(pid, 0, RIC_FLUSH_TLB);
+ break;
+ case RIC_FLUSH_ALL:
+ default:
+ /*
+ * Flush the first set of the TLB, and if
+ * we're doing a RIC_FLUSH_ALL, also flush
+ * the entire Page Walk Cache.
+ */
+ __tlbiel_pid(pid, 0, RIC_FLUSH_ALL);
+ }
+
+ if (!cpu_has_feature(CPU_FTR_ARCH_31)) {
+ /* For the remaining sets, just flush the TLB */
+ for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
+ __tlbiel_pid(pid, set, RIC_FLUSH_TLB);
+ }
+
+ ppc_after_tlbiel_barrier();
+ asm volatile(PPC_RADIX_INVALIDATE_ERAT_USER "; isync" : : :"memory");
+}
+
+static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
+{
+ asm volatile("ptesync": : :"memory");
+
+ /*
+ * Workaround the fact that the "ric" argument to __tlbie_pid
+ * must be a compile-time constraint to match the "i" constraint
+ * in the asm statement.
+ */
+ switch (ric) {
+ case RIC_FLUSH_TLB:
+ __tlbie_pid(pid, RIC_FLUSH_TLB);
+ fixup_tlbie_pid(pid);
+ break;
+ case RIC_FLUSH_PWC:
+ __tlbie_pid(pid, RIC_FLUSH_PWC);
+ break;
+ case RIC_FLUSH_ALL:
+ default:
+ __tlbie_pid(pid, RIC_FLUSH_ALL);
+ fixup_tlbie_pid(pid);
+ }
+ asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+struct tlbiel_pid {
+ unsigned long pid;
+ unsigned long ric;
+};
+
+static void do_tlbiel_pid(void *info)
+{
+ struct tlbiel_pid *t = info;
+
+ if (t->ric == RIC_FLUSH_TLB)
+ _tlbiel_pid(t->pid, RIC_FLUSH_TLB);
+ else if (t->ric == RIC_FLUSH_PWC)
+ _tlbiel_pid(t->pid, RIC_FLUSH_PWC);
+ else
+ _tlbiel_pid(t->pid, RIC_FLUSH_ALL);
+}
+
+static inline void _tlbiel_pid_multicast(struct mm_struct *mm,
+ unsigned long pid, unsigned long ric)
+{
+ struct cpumask *cpus = mm_cpumask(mm);
+ struct tlbiel_pid t = { .pid = pid, .ric = ric };
+
+ on_each_cpu_mask(cpus, do_tlbiel_pid, &t, 1);
+ /*
+ * Always want the CPU translations to be invalidated with tlbiel in
+ * these paths, so while coprocessors must use tlbie, we can not
+ * optimise away the tlbiel component.
+ */
+ if (atomic_read(&mm->context.copros) > 0)
+ _tlbie_pid(pid, RIC_FLUSH_ALL);
+}
+
+static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric)
+{
+ asm volatile("ptesync": : :"memory");
+
+ /*
+ * Workaround the fact that the "ric" argument to __tlbie_pid
+ * must be a compile-time contraint to match the "i" constraint
+ * in the asm statement.
+ */
+ switch (ric) {
+ case RIC_FLUSH_TLB:
+ __tlbie_lpid(lpid, RIC_FLUSH_TLB);
+ fixup_tlbie_lpid(lpid);
+ break;
+ case RIC_FLUSH_PWC:
+ __tlbie_lpid(lpid, RIC_FLUSH_PWC);
+ break;
+ case RIC_FLUSH_ALL:
+ default:
+ __tlbie_lpid(lpid, RIC_FLUSH_ALL);
+ fixup_tlbie_lpid(lpid);
+ }
+ asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+static __always_inline void _tlbie_lpid_guest(unsigned long lpid, unsigned long ric)
+{
+ /*
+ * Workaround the fact that the "ric" argument to __tlbie_pid
+ * must be a compile-time contraint to match the "i" constraint
+ * in the asm statement.
+ */
+ switch (ric) {
+ case RIC_FLUSH_TLB:
+ __tlbie_lpid_guest(lpid, RIC_FLUSH_TLB);
+ break;
+ case RIC_FLUSH_PWC:
+ __tlbie_lpid_guest(lpid, RIC_FLUSH_PWC);
+ break;
+ case RIC_FLUSH_ALL:
+ default:
+ __tlbie_lpid_guest(lpid, RIC_FLUSH_ALL);
+ }
+ fixup_tlbie_lpid(lpid);
+ asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+static inline void __tlbiel_va_range(unsigned long start, unsigned long end,
+ unsigned long pid, unsigned long page_size,
+ unsigned long psize)
+{
+ unsigned long addr;
+ unsigned long ap = mmu_get_ap(psize);
+
+ for (addr = start; addr < end; addr += page_size)
+ __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
+}
+
+static __always_inline void _tlbiel_va(unsigned long va, unsigned long pid,
+ unsigned long psize, unsigned long ric)
+{
+ unsigned long ap = mmu_get_ap(psize);
+
+ asm volatile("ptesync": : :"memory");
+ __tlbiel_va(va, pid, ap, ric);
+ ppc_after_tlbiel_barrier();
+}
+
+static inline void _tlbiel_va_range(unsigned long start, unsigned long end,
+ unsigned long pid, unsigned long page_size,
+ unsigned long psize, bool also_pwc)
+{
+ asm volatile("ptesync": : :"memory");
+ if (also_pwc)
+ __tlbiel_pid(pid, 0, RIC_FLUSH_PWC);
+ __tlbiel_va_range(start, end, pid, page_size, psize);
+ ppc_after_tlbiel_barrier();
+}
+
+static inline void __tlbie_va_range(unsigned long start, unsigned long end,
+ unsigned long pid, unsigned long page_size,
+ unsigned long psize)
+{
+ unsigned long addr;
+ unsigned long ap = mmu_get_ap(psize);
+
+ for (addr = start; addr < end; addr += page_size)
+ __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
+
+ fixup_tlbie_va_range(addr - page_size, pid, ap);
+}
+
+static __always_inline void _tlbie_va(unsigned long va, unsigned long pid,
+ unsigned long psize, unsigned long ric)
+{
+ unsigned long ap = mmu_get_ap(psize);
+
+ asm volatile("ptesync": : :"memory");
+ __tlbie_va(va, pid, ap, ric);
+ fixup_tlbie_va(va, pid, ap);
+ asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+struct tlbiel_va {
+ unsigned long pid;
+ unsigned long va;
+ unsigned long psize;
+ unsigned long ric;
+};
+
+static void do_tlbiel_va(void *info)
+{
+ struct tlbiel_va *t = info;
+
+ if (t->ric == RIC_FLUSH_TLB)
+ _tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_TLB);
+ else if (t->ric == RIC_FLUSH_PWC)
+ _tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_PWC);
+ else
+ _tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_ALL);
+}
+
+static inline void _tlbiel_va_multicast(struct mm_struct *mm,
+ unsigned long va, unsigned long pid,
+ unsigned long psize, unsigned long ric)
+{
+ struct cpumask *cpus = mm_cpumask(mm);
+ struct tlbiel_va t = { .va = va, .pid = pid, .psize = psize, .ric = ric };
+ on_each_cpu_mask(cpus, do_tlbiel_va, &t, 1);
+ if (atomic_read(&mm->context.copros) > 0)
+ _tlbie_va(va, pid, psize, RIC_FLUSH_TLB);
+}
+
+struct tlbiel_va_range {
+ unsigned long pid;
+ unsigned long start;
+ unsigned long end;
+ unsigned long page_size;
+ unsigned long psize;
+ bool also_pwc;
+};
+
+static void do_tlbiel_va_range(void *info)
+{
+ struct tlbiel_va_range *t = info;
+
+ _tlbiel_va_range(t->start, t->end, t->pid, t->page_size,
+ t->psize, t->also_pwc);
+}
+
+static __always_inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid,
+ unsigned long psize, unsigned long ric)
+{
+ unsigned long ap = mmu_get_ap(psize);
+
+ asm volatile("ptesync": : :"memory");
+ __tlbie_lpid_va(va, lpid, ap, ric);
+ fixup_tlbie_lpid_va(va, lpid, ap);
+ asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+static inline void _tlbie_va_range(unsigned long start, unsigned long end,
+ unsigned long pid, unsigned long page_size,
+ unsigned long psize, bool also_pwc)
+{
+ asm volatile("ptesync": : :"memory");
+ if (also_pwc)
+ __tlbie_pid(pid, RIC_FLUSH_PWC);
+ __tlbie_va_range(start, end, pid, page_size, psize);
+ asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+static inline void _tlbiel_va_range_multicast(struct mm_struct *mm,
+ unsigned long start, unsigned long end,
+ unsigned long pid, unsigned long page_size,
+ unsigned long psize, bool also_pwc)
+{
+ struct cpumask *cpus = mm_cpumask(mm);
+ struct tlbiel_va_range t = { .start = start, .end = end,
+ .pid = pid, .page_size = page_size,
+ .psize = psize, .also_pwc = also_pwc };
+
+ on_each_cpu_mask(cpus, do_tlbiel_va_range, &t, 1);
+ if (atomic_read(&mm->context.copros) > 0)
+ _tlbie_va_range(start, end, pid, page_size, psize, also_pwc);
+}
+
+/*
+ * Base TLB flushing operations:
+ *
+ * - flush_tlb_mm(mm) flushes the specified mm context TLB's
+ * - flush_tlb_page(vma, vmaddr) flushes one page
+ * - flush_tlb_range(vma, start, end) flushes a range of pages
+ * - flush_tlb_kernel_range(start, end) flushes kernel pages
+ *
+ * - local_* variants of page and mm only apply to the current
+ * processor
+ */
+void radix__local_flush_tlb_mm(struct mm_struct *mm)
+{
+ unsigned long pid = mm->context.id;
+
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
+ return;
+
+ preempt_disable();
+ _tlbiel_pid(pid, RIC_FLUSH_TLB);
+ preempt_enable();
+}
+EXPORT_SYMBOL(radix__local_flush_tlb_mm);
+
+#ifndef CONFIG_SMP
+void radix__local_flush_all_mm(struct mm_struct *mm)
+{
+ unsigned long pid = mm->context.id;
+
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
+ return;
+
+ preempt_disable();
+ _tlbiel_pid(pid, RIC_FLUSH_ALL);
+ preempt_enable();
+}
+EXPORT_SYMBOL(radix__local_flush_all_mm);
+
+static void __flush_all_mm(struct mm_struct *mm, bool fullmm)
+{
+ radix__local_flush_all_mm(mm);
+}
+#endif /* CONFIG_SMP */
+
+void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
+ int psize)
+{
+ unsigned long pid = mm->context.id;
+
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
+ return;
+
+ preempt_disable();
+ _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
+ preempt_enable();
+}
+
+void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+#ifdef CONFIG_HUGETLB_PAGE
+ /* need the return fix for nohash.c */
+ if (is_vm_hugetlb_page(vma))
+ return radix__local_flush_hugetlb_page(vma, vmaddr);
+#endif
+ radix__local_flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize);
+}
+EXPORT_SYMBOL(radix__local_flush_tlb_page);
+
+static bool mm_needs_flush_escalation(struct mm_struct *mm)
+{
+ /*
+ * The P9 nest MMU has issues with the page walk cache caching PTEs
+ * and not flushing them when RIC = 0 for a PID/LPID invalidate.
+ *
+ * This may have been fixed in shipping firmware (by disabling PWC
+ * or preventing it from caching PTEs), but until that is confirmed,
+ * this workaround is required - escalate all RIC=0 IS=1/2/3 flushes
+ * to RIC=2.
+ *
+ * POWER10 (and P9P) does not have this problem.
+ */
+ if (cpu_has_feature(CPU_FTR_ARCH_31))
+ return false;
+ if (atomic_read(&mm->context.copros) > 0)
+ return true;
+ return false;
+}
+
+/*
+ * If always_flush is true, then flush even if this CPU can't be removed
+ * from mm_cpumask.
+ */
+void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush)
+{
+ unsigned long pid = mm->context.id;
+ int cpu = smp_processor_id();
+
+ /*
+ * A kthread could have done a mmget_not_zero() after the flushing CPU
+ * checked mm_cpumask, and be in the process of kthread_use_mm when
+ * interrupted here. In that case, current->mm will be set to mm,
+ * because kthread_use_mm() setting ->mm and switching to the mm is
+ * done with interrupts off.
+ */
+ if (current->mm == mm)
+ goto out;
+
+ if (current->active_mm == mm) {
+ unsigned long flags;
+
+ WARN_ON_ONCE(current->mm != NULL);
+ /*
+ * It is a kernel thread and is using mm as the lazy tlb, so
+ * switch it to init_mm. This is not always called from IPI
+ * (e.g., flush_type_needed), so must disable irqs.
+ */
+ local_irq_save(flags);
+ mmgrab_lazy_tlb(&init_mm);
+ current->active_mm = &init_mm;
+ switch_mm_irqs_off(mm, &init_mm, current);
+ mmdrop_lazy_tlb(mm);
+ local_irq_restore(flags);
+ }
+
+ /*
+ * This IPI may be initiated from any source including those not
+ * running the mm, so there may be a racing IPI that comes after
+ * this one which finds the cpumask already clear. Check and avoid
+ * underflowing the active_cpus count in that case. The race should
+ * not otherwise be a problem, but the TLB must be flushed because
+ * that's what the caller expects.
+ */
+ if (cpumask_test_cpu(cpu, mm_cpumask(mm))) {
+ dec_mm_active_cpus(mm);
+ cpumask_clear_cpu(cpu, mm_cpumask(mm));
+ always_flush = true;
+ }
+
+out:
+ if (always_flush)
+ _tlbiel_pid(pid, RIC_FLUSH_ALL);
+}
+
+#ifdef CONFIG_SMP
+static void do_exit_flush_lazy_tlb(void *arg)
+{
+ struct mm_struct *mm = arg;
+ exit_lazy_flush_tlb(mm, true);
+}
+
+static void exit_flush_lazy_tlbs(struct mm_struct *mm)
+{
+ /*
+ * Would be nice if this was async so it could be run in
+ * parallel with our local flush, but generic code does not
+ * give a good API for it. Could extend the generic code or
+ * make a special powerpc IPI for flushing TLBs.
+ * For now it's not too performance critical.
+ */
+ smp_call_function_many(mm_cpumask(mm), do_exit_flush_lazy_tlb,
+ (void *)mm, 1);
+}
+
+#else /* CONFIG_SMP */
+static inline void exit_flush_lazy_tlbs(struct mm_struct *mm) { }
+#endif /* CONFIG_SMP */
+
+static DEFINE_PER_CPU(unsigned int, mm_cpumask_trim_clock);
+
+/*
+ * Interval between flushes at which we send out IPIs to check whether the
+ * mm_cpumask can be trimmed for the case where it's not a single-threaded
+ * process flushing its own mm. The intent is to reduce the cost of later
+ * flushes. Don't want this to be so low that it adds noticable cost to TLB
+ * flushing, or so high that it doesn't help reduce global TLBIEs.
+ */
+static unsigned long tlb_mm_cpumask_trim_timer = 1073;
+
+static bool tick_and_test_trim_clock(void)
+{
+ if (__this_cpu_inc_return(mm_cpumask_trim_clock) ==
+ tlb_mm_cpumask_trim_timer) {
+ __this_cpu_write(mm_cpumask_trim_clock, 0);
+ return true;
+ }
+ return false;
+}
+
+enum tlb_flush_type {
+ FLUSH_TYPE_NONE,
+ FLUSH_TYPE_LOCAL,
+ FLUSH_TYPE_GLOBAL,
+};
+
+static enum tlb_flush_type flush_type_needed(struct mm_struct *mm, bool fullmm)
+{
+ int active_cpus = atomic_read(&mm->context.active_cpus);
+ int cpu = smp_processor_id();
+
+ if (active_cpus == 0)
+ return FLUSH_TYPE_NONE;
+ if (active_cpus == 1 && cpumask_test_cpu(cpu, mm_cpumask(mm))) {
+ if (current->mm != mm) {
+ /*
+ * Asynchronous flush sources may trim down to nothing
+ * if the process is not running, so occasionally try
+ * to trim.
+ */
+ if (tick_and_test_trim_clock()) {
+ exit_lazy_flush_tlb(mm, true);
+ return FLUSH_TYPE_NONE;
+ }
+ }
+ return FLUSH_TYPE_LOCAL;
+ }
+
+ /* Coprocessors require TLBIE to invalidate nMMU. */
+ if (atomic_read(&mm->context.copros) > 0)
+ return FLUSH_TYPE_GLOBAL;
+
+ /*
+ * In the fullmm case there's no point doing the exit_flush_lazy_tlbs
+ * because the mm is being taken down anyway, and a TLBIE tends to
+ * be faster than an IPI+TLBIEL.
+ */
+ if (fullmm)
+ return FLUSH_TYPE_GLOBAL;
+
+ /*
+ * If we are running the only thread of a single-threaded process,
+ * then we should almost always be able to trim off the rest of the
+ * CPU mask (except in the case of use_mm() races), so always try
+ * trimming the mask.
+ */
+ if (atomic_read(&mm->mm_users) <= 1 && current->mm == mm) {
+ exit_flush_lazy_tlbs(mm);
+ /*
+ * use_mm() race could prevent IPIs from being able to clear
+ * the cpumask here, however those users are established
+ * after our first check (and so after the PTEs are removed),
+ * and the TLB still gets flushed by the IPI, so this CPU
+ * will only require a local flush.
+ */
+ return FLUSH_TYPE_LOCAL;
+ }
+
+ /*
+ * Occasionally try to trim down the cpumask. It's possible this can
+ * bring the mask to zero, which results in no flush.
+ */
+ if (tick_and_test_trim_clock()) {
+ exit_flush_lazy_tlbs(mm);
+ if (current->mm == mm)
+ return FLUSH_TYPE_LOCAL;
+ if (cpumask_test_cpu(cpu, mm_cpumask(mm)))
+ exit_lazy_flush_tlb(mm, true);
+ return FLUSH_TYPE_NONE;
+ }
+
+ return FLUSH_TYPE_GLOBAL;
+}
+
+#ifdef CONFIG_SMP
+void radix__flush_tlb_mm(struct mm_struct *mm)
+{
+ unsigned long pid;
+ enum tlb_flush_type type;
+
+ pid = mm->context.id;
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
+ return;
+
+ preempt_disable();
+ /*
+ * Order loads of mm_cpumask (in flush_type_needed) vs previous
+ * stores to clear ptes before the invalidate. See barrier in
+ * switch_mm_irqs_off
+ */
+ smp_mb();
+ type = flush_type_needed(mm, false);
+ if (type == FLUSH_TYPE_LOCAL) {
+ _tlbiel_pid(pid, RIC_FLUSH_TLB);
+ } else if (type == FLUSH_TYPE_GLOBAL) {
+ if (!mmu_has_feature(MMU_FTR_GTSE)) {
+ unsigned long tgt = H_RPTI_TARGET_CMMU;
+
+ if (atomic_read(&mm->context.copros) > 0)
+ tgt |= H_RPTI_TARGET_NMMU;
+ pseries_rpt_invalidate(pid, tgt, H_RPTI_TYPE_TLB,
+ H_RPTI_PAGE_ALL, 0, -1UL);
+ } else if (cputlb_use_tlbie()) {
+ if (mm_needs_flush_escalation(mm))
+ _tlbie_pid(pid, RIC_FLUSH_ALL);
+ else
+ _tlbie_pid(pid, RIC_FLUSH_TLB);
+ } else {
+ _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_TLB);
+ }
+ }
+ preempt_enable();
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
+}
+EXPORT_SYMBOL(radix__flush_tlb_mm);
+
+static void __flush_all_mm(struct mm_struct *mm, bool fullmm)
+{
+ unsigned long pid;
+ enum tlb_flush_type type;
+
+ pid = mm->context.id;
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
+ return;
+
+ preempt_disable();
+ smp_mb(); /* see radix__flush_tlb_mm */
+ type = flush_type_needed(mm, fullmm);
+ if (type == FLUSH_TYPE_LOCAL) {
+ _tlbiel_pid(pid, RIC_FLUSH_ALL);
+ } else if (type == FLUSH_TYPE_GLOBAL) {
+ if (!mmu_has_feature(MMU_FTR_GTSE)) {
+ unsigned long tgt = H_RPTI_TARGET_CMMU;
+ unsigned long type = H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC |
+ H_RPTI_TYPE_PRT;
+
+ if (atomic_read(&mm->context.copros) > 0)
+ tgt |= H_RPTI_TARGET_NMMU;
+ pseries_rpt_invalidate(pid, tgt, type,
+ H_RPTI_PAGE_ALL, 0, -1UL);
+ } else if (cputlb_use_tlbie())
+ _tlbie_pid(pid, RIC_FLUSH_ALL);
+ else
+ _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL);
+ }
+ preempt_enable();
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
+}
+
+void radix__flush_all_mm(struct mm_struct *mm)
+{
+ __flush_all_mm(mm, false);
+}
+EXPORT_SYMBOL(radix__flush_all_mm);
+
+void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
+ int psize)
+{
+ unsigned long pid;
+ enum tlb_flush_type type;
+
+ pid = mm->context.id;
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
+ return;
+
+ preempt_disable();
+ smp_mb(); /* see radix__flush_tlb_mm */
+ type = flush_type_needed(mm, false);
+ if (type == FLUSH_TYPE_LOCAL) {
+ _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
+ } else if (type == FLUSH_TYPE_GLOBAL) {
+ if (!mmu_has_feature(MMU_FTR_GTSE)) {
+ unsigned long tgt, pg_sizes, size;
+
+ tgt = H_RPTI_TARGET_CMMU;
+ pg_sizes = psize_to_rpti_pgsize(psize);
+ size = 1UL << mmu_psize_to_shift(psize);
+
+ if (atomic_read(&mm->context.copros) > 0)
+ tgt |= H_RPTI_TARGET_NMMU;
+ pseries_rpt_invalidate(pid, tgt, H_RPTI_TYPE_TLB,
+ pg_sizes, vmaddr,
+ vmaddr + size);
+ } else if (cputlb_use_tlbie())
+ _tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
+ else
+ _tlbiel_va_multicast(mm, vmaddr, pid, psize, RIC_FLUSH_TLB);
+ }
+ preempt_enable();
+}
+
+void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+#ifdef CONFIG_HUGETLB_PAGE
+ if (is_vm_hugetlb_page(vma))
+ return radix__flush_hugetlb_page(vma, vmaddr);
+#endif
+ radix__flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize);
+}
+EXPORT_SYMBOL(radix__flush_tlb_page);
+
+#endif /* CONFIG_SMP */
+
+static void do_tlbiel_kernel(void *info)
+{
+ _tlbiel_pid(0, RIC_FLUSH_ALL);
+}
+
+static inline void _tlbiel_kernel_broadcast(void)
+{
+ on_each_cpu(do_tlbiel_kernel, NULL, 1);
+ if (tlbie_capable) {
+ /*
+ * Coherent accelerators don't refcount kernel memory mappings,
+ * so have to always issue a tlbie for them. This is quite a
+ * slow path anyway.
+ */
+ _tlbie_pid(0, RIC_FLUSH_ALL);
+ }
+}
+
+/*
+ * If kernel TLBIs ever become local rather than global, then
+ * drivers/misc/ocxl/link.c:ocxl_link_add_pe will need some work, as it
+ * assumes kernel TLBIs are global.
+ */
+void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+ if (!mmu_has_feature(MMU_FTR_GTSE)) {
+ unsigned long tgt = H_RPTI_TARGET_CMMU | H_RPTI_TARGET_NMMU;
+ unsigned long type = H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC |
+ H_RPTI_TYPE_PRT;
+
+ pseries_rpt_invalidate(0, tgt, type, H_RPTI_PAGE_ALL,
+ start, end);
+ } else if (cputlb_use_tlbie())
+ _tlbie_pid(0, RIC_FLUSH_ALL);
+ else
+ _tlbiel_kernel_broadcast();
+}
+EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
+
+/*
+ * Doesn't appear to be used anywhere. Remove.
+ */
+#define TLB_FLUSH_ALL -1UL
+
+/*
+ * Number of pages above which we invalidate the entire PID rather than
+ * flush individual pages, for local and global flushes respectively.
+ *
+ * tlbie goes out to the interconnect and individual ops are more costly.
+ * It also does not iterate over sets like the local tlbiel variant when
+ * invalidating a full PID, so it has a far lower threshold to change from
+ * individual page flushes to full-pid flushes.
+ */
+static u32 tlb_single_page_flush_ceiling __read_mostly = 33;
+static u32 tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2;
+
+static inline void __radix__flush_tlb_range(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ unsigned long pid;
+ unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift;
+ unsigned long page_size = 1UL << page_shift;
+ unsigned long nr_pages = (end - start) >> page_shift;
+ bool flush_pid, flush_pwc = false;
+ enum tlb_flush_type type;
+
+ pid = mm->context.id;
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
+ return;
+
+ WARN_ON_ONCE(end == TLB_FLUSH_ALL);
+
+ preempt_disable();
+ smp_mb(); /* see radix__flush_tlb_mm */
+ type = flush_type_needed(mm, false);
+ if (type == FLUSH_TYPE_NONE)
+ goto out;
+
+ if (type == FLUSH_TYPE_GLOBAL)
+ flush_pid = nr_pages > tlb_single_page_flush_ceiling;
+ else
+ flush_pid = nr_pages > tlb_local_single_page_flush_ceiling;
+ /*
+ * full pid flush already does the PWC flush. if it is not full pid
+ * flush check the range is more than PMD and force a pwc flush
+ * mremap() depends on this behaviour.
+ */
+ if (!flush_pid && (end - start) >= PMD_SIZE)
+ flush_pwc = true;
+
+ if (!mmu_has_feature(MMU_FTR_GTSE) && type == FLUSH_TYPE_GLOBAL) {
+ unsigned long type = H_RPTI_TYPE_TLB;
+ unsigned long tgt = H_RPTI_TARGET_CMMU;
+ unsigned long pg_sizes = psize_to_rpti_pgsize(mmu_virtual_psize);
+
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+ pg_sizes |= psize_to_rpti_pgsize(MMU_PAGE_2M);
+ if (atomic_read(&mm->context.copros) > 0)
+ tgt |= H_RPTI_TARGET_NMMU;
+ if (flush_pwc)
+ type |= H_RPTI_TYPE_PWC;
+ pseries_rpt_invalidate(pid, tgt, type, pg_sizes, start, end);
+ } else if (flush_pid) {
+ /*
+ * We are now flushing a range larger than PMD size force a RIC_FLUSH_ALL
+ */
+ if (type == FLUSH_TYPE_LOCAL) {
+ _tlbiel_pid(pid, RIC_FLUSH_ALL);
+ } else {
+ if (cputlb_use_tlbie()) {
+ _tlbie_pid(pid, RIC_FLUSH_ALL);
+ } else {
+ _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL);
+ }
+ }
+ } else {
+ bool hflush;
+ unsigned long hstart, hend;
+
+ hstart = (start + PMD_SIZE - 1) & PMD_MASK;
+ hend = end & PMD_MASK;
+ hflush = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hstart < hend;
+
+ if (type == FLUSH_TYPE_LOCAL) {
+ asm volatile("ptesync": : :"memory");
+ if (flush_pwc)
+ /* For PWC, only one flush is needed */
+ __tlbiel_pid(pid, 0, RIC_FLUSH_PWC);
+ __tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize);
+ if (hflush)
+ __tlbiel_va_range(hstart, hend, pid,
+ PMD_SIZE, MMU_PAGE_2M);
+ ppc_after_tlbiel_barrier();
+ } else if (cputlb_use_tlbie()) {
+ asm volatile("ptesync": : :"memory");
+ if (flush_pwc)
+ __tlbie_pid(pid, RIC_FLUSH_PWC);
+ __tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize);
+ if (hflush)
+ __tlbie_va_range(hstart, hend, pid,
+ PMD_SIZE, MMU_PAGE_2M);
+ asm volatile("eieio; tlbsync; ptesync": : :"memory");
+ } else {
+ _tlbiel_va_range_multicast(mm,
+ start, end, pid, page_size, mmu_virtual_psize, flush_pwc);
+ if (hflush)
+ _tlbiel_va_range_multicast(mm,
+ hstart, hend, pid, PMD_SIZE, MMU_PAGE_2M, flush_pwc);
+ }
+ }
+out:
+ preempt_enable();
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
+}
+
+void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end)
+
+{
+#ifdef CONFIG_HUGETLB_PAGE
+ if (is_vm_hugetlb_page(vma))
+ return radix__flush_hugetlb_tlb_range(vma, start, end);
+#endif
+
+ __radix__flush_tlb_range(vma->vm_mm, start, end);
+}
+EXPORT_SYMBOL(radix__flush_tlb_range);
+
+static int radix_get_mmu_psize(int page_size)
+{
+ int psize;
+
+ if (page_size == (1UL << mmu_psize_defs[mmu_virtual_psize].shift))
+ psize = mmu_virtual_psize;
+ else if (page_size == (1UL << mmu_psize_defs[MMU_PAGE_2M].shift))
+ psize = MMU_PAGE_2M;
+ else if (page_size == (1UL << mmu_psize_defs[MMU_PAGE_1G].shift))
+ psize = MMU_PAGE_1G;
+ else
+ return -1;
+ return psize;
+}
+
+/*
+ * Flush partition scoped LPID address translation for all CPUs.
+ */
+void radix__flush_tlb_lpid_page(unsigned int lpid,
+ unsigned long addr,
+ unsigned long page_size)
+{
+ int psize = radix_get_mmu_psize(page_size);
+
+ _tlbie_lpid_va(addr, lpid, psize, RIC_FLUSH_TLB);
+}
+EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid_page);
+
+/*
+ * Flush partition scoped PWC from LPID for all CPUs.
+ */
+void radix__flush_pwc_lpid(unsigned int lpid)
+{
+ _tlbie_lpid(lpid, RIC_FLUSH_PWC);
+}
+EXPORT_SYMBOL_GPL(radix__flush_pwc_lpid);
+
+/*
+ * Flush partition scoped translations from LPID (=LPIDR)
+ */
+void radix__flush_all_lpid(unsigned int lpid)
+{
+ _tlbie_lpid(lpid, RIC_FLUSH_ALL);
+}
+EXPORT_SYMBOL_GPL(radix__flush_all_lpid);
+
+/*
+ * Flush process scoped translations from LPID (=LPIDR)
+ */
+void radix__flush_all_lpid_guest(unsigned int lpid)
+{
+ _tlbie_lpid_guest(lpid, RIC_FLUSH_ALL);
+}
+
+void radix__tlb_flush(struct mmu_gather *tlb)
+{
+ int psize = 0;
+ struct mm_struct *mm = tlb->mm;
+ int page_size = tlb->page_size;
+ unsigned long start = tlb->start;
+ unsigned long end = tlb->end;
+
+ /*
+ * if page size is not something we understand, do a full mm flush
+ *
+ * A "fullmm" flush must always do a flush_all_mm (RIC=2) flush
+ * that flushes the process table entry cache upon process teardown.
+ * See the comment for radix in arch_exit_mmap().
+ */
+ if (tlb->fullmm) {
+ if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
+ /*
+ * Shootdown based lazy tlb mm refcounting means we
+ * have to IPI everyone in the mm_cpumask anyway soon
+ * when the mm goes away, so might as well do it as
+ * part of the final flush now.
+ *
+ * If lazy shootdown was improved to reduce IPIs (e.g.,
+ * by batching), then it may end up being better to use
+ * tlbies here instead.
+ */
+ preempt_disable();
+
+ smp_mb(); /* see radix__flush_tlb_mm */
+ exit_flush_lazy_tlbs(mm);
+ __flush_all_mm(mm, true);
+
+ preempt_enable();
+ } else {
+ __flush_all_mm(mm, true);
+ }
+
+ } else if ( (psize = radix_get_mmu_psize(page_size)) == -1) {
+ if (!tlb->freed_tables)
+ radix__flush_tlb_mm(mm);
+ else
+ radix__flush_all_mm(mm);
+ } else {
+ if (!tlb->freed_tables)
+ radix__flush_tlb_range_psize(mm, start, end, psize);
+ else
+ radix__flush_tlb_pwc_range_psize(mm, start, end, psize);
+ }
+}
+
+static void __radix__flush_tlb_range_psize(struct mm_struct *mm,
+ unsigned long start, unsigned long end,
+ int psize, bool also_pwc)
+{
+ unsigned long pid;
+ unsigned int page_shift = mmu_psize_defs[psize].shift;
+ unsigned long page_size = 1UL << page_shift;
+ unsigned long nr_pages = (end - start) >> page_shift;
+ bool flush_pid;
+ enum tlb_flush_type type;
+
+ pid = mm->context.id;
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
+ return;
+
+ WARN_ON_ONCE(end == TLB_FLUSH_ALL);
+
+ preempt_disable();
+ smp_mb(); /* see radix__flush_tlb_mm */
+ type = flush_type_needed(mm, false);
+ if (type == FLUSH_TYPE_NONE)
+ goto out;
+
+ if (type == FLUSH_TYPE_GLOBAL)
+ flush_pid = nr_pages > tlb_single_page_flush_ceiling;
+ else
+ flush_pid = nr_pages > tlb_local_single_page_flush_ceiling;
+
+ if (!mmu_has_feature(MMU_FTR_GTSE) && type == FLUSH_TYPE_GLOBAL) {
+ unsigned long tgt = H_RPTI_TARGET_CMMU;
+ unsigned long type = H_RPTI_TYPE_TLB;
+ unsigned long pg_sizes = psize_to_rpti_pgsize(psize);
+
+ if (also_pwc)
+ type |= H_RPTI_TYPE_PWC;
+ if (atomic_read(&mm->context.copros) > 0)
+ tgt |= H_RPTI_TARGET_NMMU;
+ pseries_rpt_invalidate(pid, tgt, type, pg_sizes, start, end);
+ } else if (flush_pid) {
+ if (type == FLUSH_TYPE_LOCAL) {
+ _tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
+ } else {
+ if (cputlb_use_tlbie()) {
+ if (mm_needs_flush_escalation(mm))
+ also_pwc = true;
+
+ _tlbie_pid(pid,
+ also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
+ } else {
+ _tlbiel_pid_multicast(mm, pid,
+ also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
+ }
+
+ }
+ } else {
+ if (type == FLUSH_TYPE_LOCAL)
+ _tlbiel_va_range(start, end, pid, page_size, psize, also_pwc);
+ else if (cputlb_use_tlbie())
+ _tlbie_va_range(start, end, pid, page_size, psize, also_pwc);
+ else
+ _tlbiel_va_range_multicast(mm,
+ start, end, pid, page_size, psize, also_pwc);
+ }
+out:
+ preempt_enable();
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
+}
+
+void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
+ unsigned long end, int psize)
+{
+ return __radix__flush_tlb_range_psize(mm, start, end, psize, false);
+}
+
+void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
+ unsigned long end, int psize)
+{
+ __radix__flush_tlb_range_psize(mm, start, end, psize, true);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
+{
+ unsigned long pid, end;
+ enum tlb_flush_type type;
+
+ pid = mm->context.id;
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
+ return;
+
+ /* 4k page size, just blow the world */
+ if (PAGE_SIZE == 0x1000) {
+ radix__flush_all_mm(mm);
+ return;
+ }
+
+ end = addr + HPAGE_PMD_SIZE;
+
+ /* Otherwise first do the PWC, then iterate the pages. */
+ preempt_disable();
+ smp_mb(); /* see radix__flush_tlb_mm */
+ type = flush_type_needed(mm, false);
+ if (type == FLUSH_TYPE_LOCAL) {
+ _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
+ } else if (type == FLUSH_TYPE_GLOBAL) {
+ if (!mmu_has_feature(MMU_FTR_GTSE)) {
+ unsigned long tgt, type, pg_sizes;
+
+ tgt = H_RPTI_TARGET_CMMU;
+ type = H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC |
+ H_RPTI_TYPE_PRT;
+ pg_sizes = psize_to_rpti_pgsize(mmu_virtual_psize);
+
+ if (atomic_read(&mm->context.copros) > 0)
+ tgt |= H_RPTI_TARGET_NMMU;
+ pseries_rpt_invalidate(pid, tgt, type, pg_sizes,
+ addr, end);
+ } else if (cputlb_use_tlbie())
+ _tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
+ else
+ _tlbiel_va_range_multicast(mm,
+ addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
+ }
+
+ preempt_enable();
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+void radix__flush_pmd_tlb_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_2M);
+}
+EXPORT_SYMBOL(radix__flush_pmd_tlb_range);
+
+void radix__flush_pud_tlb_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_1G);
+}
+EXPORT_SYMBOL(radix__flush_pud_tlb_range);
+
+void radix__flush_tlb_all(void)
+{
+ unsigned long rb,prs,r,rs;
+ unsigned long ric = RIC_FLUSH_ALL;
+
+ rb = 0x3 << PPC_BITLSHIFT(53); /* IS = 3 */
+ prs = 0; /* partition scoped */
+ r = 1; /* radix format */
+ rs = 1 & ((1UL << 32) - 1); /* any LPID value to flush guest mappings */
+
+ asm volatile("ptesync": : :"memory");
+ /*
+ * now flush guest entries by passing PRS = 1 and LPID != 0
+ */
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(1), "i"(ric), "r"(rs) : "memory");
+ /*
+ * now flush host entires by passing PRS = 0 and LPID == 0
+ */
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(0) : "memory");
+ asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+static __always_inline void __tlbie_pid_lpid(unsigned long pid,
+ unsigned long lpid,
+ unsigned long ric)
+{
+ unsigned long rb, rs, prs, r;
+
+ rb = PPC_BIT(53); /* IS = 1 */
+ rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31)));
+ prs = 1; /* process scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(0, 0, rb, rs, ric, prs, r);
+}
+
+static __always_inline void __tlbie_va_lpid(unsigned long va, unsigned long pid,
+ unsigned long lpid,
+ unsigned long ap, unsigned long ric)
+{
+ unsigned long rb, rs, prs, r;
+
+ rb = va & ~(PPC_BITMASK(52, 63));
+ rb |= ap << PPC_BITLSHIFT(58);
+ rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31)));
+ prs = 1; /* process scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(0, 0, rb, rs, ric, prs, r);
+}
+
+static inline void fixup_tlbie_pid_lpid(unsigned long pid, unsigned long lpid)
+{
+ /*
+ * We can use any address for the invalidation, pick one which is
+ * probably unused as an optimisation.
+ */
+ unsigned long va = ((1UL << 52) - 1);
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+ asm volatile("ptesync" : : : "memory");
+ __tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB);
+ }
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+ asm volatile("ptesync" : : : "memory");
+ __tlbie_va_lpid(va, pid, lpid, mmu_get_ap(MMU_PAGE_64K),
+ RIC_FLUSH_TLB);
+ }
+}
+
+static inline void _tlbie_pid_lpid(unsigned long pid, unsigned long lpid,
+ unsigned long ric)
+{
+ asm volatile("ptesync" : : : "memory");
+
+ /*
+ * Workaround the fact that the "ric" argument to __tlbie_pid
+ * must be a compile-time contraint to match the "i" constraint
+ * in the asm statement.
+ */
+ switch (ric) {
+ case RIC_FLUSH_TLB:
+ __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB);
+ fixup_tlbie_pid_lpid(pid, lpid);
+ break;
+ case RIC_FLUSH_PWC:
+ __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
+ break;
+ case RIC_FLUSH_ALL:
+ default:
+ __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_ALL);
+ fixup_tlbie_pid_lpid(pid, lpid);
+ }
+ asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+}
+
+static inline void fixup_tlbie_va_range_lpid(unsigned long va,
+ unsigned long pid,
+ unsigned long lpid,
+ unsigned long ap)
+{
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+ asm volatile("ptesync" : : : "memory");
+ __tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB);
+ }
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+ asm volatile("ptesync" : : : "memory");
+ __tlbie_va_lpid(va, pid, lpid, ap, RIC_FLUSH_TLB);
+ }
+}
+
+static inline void __tlbie_va_range_lpid(unsigned long start, unsigned long end,
+ unsigned long pid, unsigned long lpid,
+ unsigned long page_size,
+ unsigned long psize)
+{
+ unsigned long addr;
+ unsigned long ap = mmu_get_ap(psize);
+
+ for (addr = start; addr < end; addr += page_size)
+ __tlbie_va_lpid(addr, pid, lpid, ap, RIC_FLUSH_TLB);
+
+ fixup_tlbie_va_range_lpid(addr - page_size, pid, lpid, ap);
+}
+
+static inline void _tlbie_va_range_lpid(unsigned long start, unsigned long end,
+ unsigned long pid, unsigned long lpid,
+ unsigned long page_size,
+ unsigned long psize, bool also_pwc)
+{
+ asm volatile("ptesync" : : : "memory");
+ if (also_pwc)
+ __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
+ __tlbie_va_range_lpid(start, end, pid, lpid, page_size, psize);
+ asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+}
+
+/*
+ * Performs process-scoped invalidations for a given LPID
+ * as part of H_RPT_INVALIDATE hcall.
+ */
+void do_h_rpt_invalidate_prt(unsigned long pid, unsigned long lpid,
+ unsigned long type, unsigned long pg_sizes,
+ unsigned long start, unsigned long end)
+{
+ unsigned long psize, nr_pages;
+ struct mmu_psize_def *def;
+ bool flush_pid;
+
+ /*
+ * A H_RPTI_TYPE_ALL request implies RIC=3, hence
+ * do a single IS=1 based flush.
+ */
+ if ((type & H_RPTI_TYPE_ALL) == H_RPTI_TYPE_ALL) {
+ _tlbie_pid_lpid(pid, lpid, RIC_FLUSH_ALL);
+ return;
+ }
+
+ if (type & H_RPTI_TYPE_PWC)
+ _tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
+
+ /* Full PID flush */
+ if (start == 0 && end == -1)
+ return _tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB);
+
+ /* Do range invalidation for all the valid page sizes */
+ for (psize = 0; psize < MMU_PAGE_COUNT; psize++) {
+ def = &mmu_psize_defs[psize];
+ if (!(pg_sizes & def->h_rpt_pgsize))
+ continue;
+
+ nr_pages = (end - start) >> def->shift;
+ flush_pid = nr_pages > tlb_single_page_flush_ceiling;
+
+ /*
+ * If the number of pages spanning the range is above
+ * the ceiling, convert the request into a full PID flush.
+ * And since PID flush takes out all the page sizes, there
+ * is no need to consider remaining page sizes.
+ */
+ if (flush_pid) {
+ _tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB);
+ return;
+ }
+ _tlbie_va_range_lpid(start, end, pid, lpid,
+ (1UL << def->shift), psize, false);
+ }
+}
+EXPORT_SYMBOL_GPL(do_h_rpt_invalidate_prt);
+
+#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
+
+static int __init create_tlb_single_page_flush_ceiling(void)
+{
+ debugfs_create_u32("tlb_single_page_flush_ceiling", 0600,
+ arch_debugfs_dir, &tlb_single_page_flush_ceiling);
+ debugfs_create_u32("tlb_local_single_page_flush_ceiling", 0600,
+ arch_debugfs_dir, &tlb_local_single_page_flush_ceiling);
+ return 0;
+}
+late_initcall(create_tlb_single_page_flush_ceiling);
+
diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c
new file mode 100644
index 000000000000..15f73abd1506
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/slb.c
@@ -0,0 +1,795 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * PowerPC64 SLB support.
+ *
+ * Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM
+ * Based on earlier code written by:
+ * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
+ * Copyright (c) 2001 Dave Engebretsen
+ * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
+ */
+
+#include <asm/interrupt.h>
+#include <asm/mmu.h>
+#include <asm/mmu_context.h>
+#include <asm/paca.h>
+#include <asm/lppaca.h>
+#include <asm/ppc-opcode.h>
+#include <asm/cputable.h>
+#include <asm/cacheflush.h>
+#include <asm/smp.h>
+#include <linux/compiler.h>
+#include <linux/context_tracking.h>
+#include <linux/mm_types.h>
+#include <linux/pgtable.h>
+
+#include <asm/udbg.h>
+#include <asm/text-patching.h>
+
+#include "internal.h"
+
+
+static long slb_allocate_user(struct mm_struct *mm, unsigned long ea);
+
+bool stress_slb_enabled __initdata;
+
+static int __init parse_stress_slb(char *p)
+{
+ stress_slb_enabled = true;
+ return 0;
+}
+early_param("stress_slb", parse_stress_slb);
+
+__ro_after_init DEFINE_STATIC_KEY_FALSE(stress_slb_key);
+
+bool no_slb_preload __initdata;
+static int __init parse_no_slb_preload(char *p)
+{
+ no_slb_preload = true;
+ return 0;
+}
+early_param("no_slb_preload", parse_no_slb_preload);
+__ro_after_init DEFINE_STATIC_KEY_FALSE(no_slb_preload_key);
+
+static void assert_slb_presence(bool present, unsigned long ea)
+{
+#ifdef CONFIG_DEBUG_VM
+ unsigned long tmp;
+
+ WARN_ON_ONCE(mfmsr() & MSR_EE);
+
+ if (!cpu_has_feature(CPU_FTR_ARCH_206))
+ return;
+
+ /*
+ * slbfee. requires bit 24 (PPC bit 39) be clear in RB. Hardware
+ * ignores all other bits from 0-27, so just clear them all.
+ */
+ ea &= ~((1UL << SID_SHIFT) - 1);
+ asm volatile(__PPC_SLBFEE_DOT(%0, %1) : "=r"(tmp) : "r"(ea) : "cr0");
+
+ WARN_ON(present == (tmp == 0));
+#endif
+}
+
+static inline void slb_shadow_update(unsigned long ea, int ssize,
+ unsigned long flags,
+ enum slb_index index)
+{
+ struct slb_shadow *p = get_slb_shadow();
+
+ /*
+ * Clear the ESID first so the entry is not valid while we are
+ * updating it. No write barriers are needed here, provided
+ * we only update the current CPU's SLB shadow buffer.
+ */
+ WRITE_ONCE(p->save_area[index].esid, 0);
+ WRITE_ONCE(p->save_area[index].vsid, cpu_to_be64(mk_vsid_data(ea, ssize, flags)));
+ WRITE_ONCE(p->save_area[index].esid, cpu_to_be64(mk_esid_data(ea, ssize, index)));
+}
+
+static inline void slb_shadow_clear(enum slb_index index)
+{
+ WRITE_ONCE(get_slb_shadow()->save_area[index].esid, cpu_to_be64(index));
+}
+
+static inline void create_shadowed_slbe(unsigned long ea, int ssize,
+ unsigned long flags,
+ enum slb_index index)
+{
+ /*
+ * Updating the shadow buffer before writing the SLB ensures
+ * we don't get a stale entry here if we get preempted by PHYP
+ * between these two statements.
+ */
+ slb_shadow_update(ea, ssize, flags, index);
+
+ assert_slb_presence(false, ea);
+ asm volatile("slbmte %0,%1" :
+ : "r" (mk_vsid_data(ea, ssize, flags)),
+ "r" (mk_esid_data(ea, ssize, index))
+ : "memory" );
+}
+
+/*
+ * Insert bolted entries into SLB (which may not be empty, so don't clear
+ * slb_cache_ptr).
+ */
+void __slb_restore_bolted_realmode(void)
+{
+ struct slb_shadow *p = get_slb_shadow();
+ enum slb_index index;
+
+ /* No isync needed because realmode. */
+ for (index = 0; index < SLB_NUM_BOLTED; index++) {
+ asm volatile("slbmte %0,%1" :
+ : "r" (be64_to_cpu(p->save_area[index].vsid)),
+ "r" (be64_to_cpu(p->save_area[index].esid)));
+ }
+
+ assert_slb_presence(true, local_paca->kstack);
+}
+
+/*
+ * Insert the bolted entries into an empty SLB.
+ */
+void slb_restore_bolted_realmode(void)
+{
+ __slb_restore_bolted_realmode();
+ get_paca()->slb_cache_ptr = 0;
+
+ get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
+ get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
+}
+
+/*
+ * This flushes all SLB entries including 0, so it must be realmode.
+ */
+void slb_flush_all_realmode(void)
+{
+ asm volatile("slbmte %0,%0; slbia" : : "r" (0));
+}
+
+static __always_inline void __slb_flush_and_restore_bolted(bool preserve_kernel_lookaside)
+{
+ struct slb_shadow *p = get_slb_shadow();
+ unsigned long ksp_esid_data, ksp_vsid_data;
+ u32 ih;
+
+ /*
+ * SLBIA IH=1 on ISA v2.05 and newer processors may preserve lookaside
+ * information created with Class=0 entries, which we use for kernel
+ * SLB entries (the SLB entries themselves are still invalidated).
+ *
+ * Older processors will ignore this optimisation. Over-invalidation
+ * is fine because we never rely on lookaside information existing.
+ */
+ if (preserve_kernel_lookaside)
+ ih = 1;
+ else
+ ih = 0;
+
+ ksp_esid_data = be64_to_cpu(p->save_area[KSTACK_INDEX].esid);
+ ksp_vsid_data = be64_to_cpu(p->save_area[KSTACK_INDEX].vsid);
+
+ asm volatile(PPC_SLBIA(%0)" \n"
+ "slbmte %1, %2 \n"
+ :: "i" (ih),
+ "r" (ksp_vsid_data),
+ "r" (ksp_esid_data)
+ : "memory");
+}
+
+/*
+ * This flushes non-bolted entries, it can be run in virtual mode. Must
+ * be called with interrupts disabled.
+ */
+void slb_flush_and_restore_bolted(void)
+{
+ BUILD_BUG_ON(SLB_NUM_BOLTED != 2);
+
+ WARN_ON(!irqs_disabled());
+
+ /*
+ * We can't take a PMU exception in the following code, so hard
+ * disable interrupts.
+ */
+ hard_irq_disable();
+
+ isync();
+ __slb_flush_and_restore_bolted(false);
+ isync();
+
+ assert_slb_presence(true, get_paca()->kstack);
+
+ get_paca()->slb_cache_ptr = 0;
+
+ get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
+ get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
+}
+
+void slb_save_contents(struct slb_entry *slb_ptr)
+{
+ int i;
+ unsigned long e, v;
+
+ /* Save slb_cache_ptr value. */
+ get_paca()->slb_save_cache_ptr = get_paca()->slb_cache_ptr;
+
+ if (!slb_ptr)
+ return;
+
+ for (i = 0; i < mmu_slb_size; i++) {
+ asm volatile("slbmfee %0,%1" : "=r" (e) : "r" (i));
+ asm volatile("slbmfev %0,%1" : "=r" (v) : "r" (i));
+ slb_ptr->esid = e;
+ slb_ptr->vsid = v;
+ slb_ptr++;
+ }
+}
+
+void slb_dump_contents(struct slb_entry *slb_ptr)
+{
+ int i, n;
+ unsigned long e, v;
+ unsigned long llp;
+
+ if (!slb_ptr)
+ return;
+
+ pr_err("SLB contents of cpu 0x%x\n", smp_processor_id());
+
+ for (i = 0; i < mmu_slb_size; i++) {
+ e = slb_ptr->esid;
+ v = slb_ptr->vsid;
+ slb_ptr++;
+
+ if (!e && !v)
+ continue;
+
+ pr_err("%02d %016lx %016lx %s\n", i, e, v,
+ (e & SLB_ESID_V) ? "VALID" : "NOT VALID");
+
+ if (!(e & SLB_ESID_V))
+ continue;
+
+ llp = v & SLB_VSID_LLP;
+ if (v & SLB_VSID_B_1T) {
+ pr_err(" 1T ESID=%9lx VSID=%13lx LLP:%3lx\n",
+ GET_ESID_1T(e),
+ (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T, llp);
+ } else {
+ pr_err(" 256M ESID=%9lx VSID=%13lx LLP:%3lx\n",
+ GET_ESID(e),
+ (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT, llp);
+ }
+ }
+
+ if (!early_cpu_has_feature(CPU_FTR_ARCH_300)) {
+ /* RR is not so useful as it's often not used for allocation */
+ pr_err("SLB RR allocator index %d\n", get_paca()->stab_rr);
+
+ /* Dump slb cache entires as well. */
+ pr_err("SLB cache ptr value = %d\n", get_paca()->slb_save_cache_ptr);
+ pr_err("Valid SLB cache entries:\n");
+ n = min_t(int, get_paca()->slb_save_cache_ptr, SLB_CACHE_ENTRIES);
+ for (i = 0; i < n; i++)
+ pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]);
+ pr_err("Rest of SLB cache entries:\n");
+ for (i = n; i < SLB_CACHE_ENTRIES; i++)
+ pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]);
+ }
+}
+
+void slb_vmalloc_update(void)
+{
+ /*
+ * vmalloc is not bolted, so just have to flush non-bolted.
+ */
+ slb_flush_and_restore_bolted();
+}
+
+static bool preload_hit(struct thread_info *ti, unsigned long esid)
+{
+ unsigned char i;
+
+ for (i = 0; i < ti->slb_preload_nr; i++) {
+ unsigned char idx;
+
+ idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
+ if (esid == ti->slb_preload_esid[idx])
+ return true;
+ }
+ return false;
+}
+
+static void preload_add(struct thread_info *ti, unsigned long ea)
+{
+ unsigned char idx;
+ unsigned long esid;
+
+ if (slb_preload_disabled())
+ return;
+
+ if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) {
+ /* EAs are stored >> 28 so 256MB segments don't need clearing */
+ if (ea & ESID_MASK_1T)
+ ea &= ESID_MASK_1T;
+ }
+
+ esid = ea >> SID_SHIFT;
+
+ if (preload_hit(ti, esid))
+ return;
+
+ idx = (ti->slb_preload_tail + ti->slb_preload_nr) % SLB_PRELOAD_NR;
+ ti->slb_preload_esid[idx] = esid;
+ if (ti->slb_preload_nr == SLB_PRELOAD_NR)
+ ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR;
+ else
+ ti->slb_preload_nr++;
+}
+
+static void preload_age(struct thread_info *ti)
+{
+ if (!ti->slb_preload_nr)
+ return;
+ ti->slb_preload_nr--;
+ ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR;
+}
+
+static void slb_cache_slbie_kernel(unsigned int index)
+{
+ unsigned long slbie_data = get_paca()->slb_cache[index];
+ unsigned long ksp = get_paca()->kstack;
+
+ slbie_data <<= SID_SHIFT;
+ slbie_data |= 0xc000000000000000ULL;
+ if ((ksp & slb_esid_mask(mmu_kernel_ssize)) == slbie_data)
+ return;
+ slbie_data |= mmu_kernel_ssize << SLBIE_SSIZE_SHIFT;
+
+ asm volatile("slbie %0" : : "r" (slbie_data));
+}
+
+static void slb_cache_slbie_user(unsigned int index)
+{
+ unsigned long slbie_data = get_paca()->slb_cache[index];
+
+ slbie_data <<= SID_SHIFT;
+ slbie_data |= user_segment_size(slbie_data) << SLBIE_SSIZE_SHIFT;
+ slbie_data |= SLBIE_C; /* user slbs have C=1 */
+
+ asm volatile("slbie %0" : : "r" (slbie_data));
+}
+
+/* Flush all user entries from the segment table of the current processor. */
+void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
+{
+ struct thread_info *ti = task_thread_info(tsk);
+ unsigned char i;
+
+ /*
+ * We need interrupts hard-disabled here, not just soft-disabled,
+ * so that a PMU interrupt can't occur, which might try to access
+ * user memory (to get a stack trace) and possible cause an SLB miss
+ * which would update the slb_cache/slb_cache_ptr fields in the PACA.
+ */
+ hard_irq_disable();
+ isync();
+ if (stress_slb()) {
+ __slb_flush_and_restore_bolted(false);
+ isync();
+ get_paca()->slb_cache_ptr = 0;
+ get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
+
+ } else if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+ /*
+ * SLBIA IH=3 invalidates all Class=1 SLBEs and their
+ * associated lookaside structures, which matches what
+ * switch_slb wants. So ARCH_300 does not use the slb
+ * cache.
+ */
+ asm volatile(PPC_SLBIA(3));
+
+ } else {
+ unsigned long offset = get_paca()->slb_cache_ptr;
+
+ if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) &&
+ offset <= SLB_CACHE_ENTRIES) {
+ /*
+ * Could assert_slb_presence(true) here, but
+ * hypervisor or machine check could have come
+ * in and removed the entry at this point.
+ */
+
+ for (i = 0; i < offset; i++)
+ slb_cache_slbie_user(i);
+
+ /* Workaround POWER5 < DD2.1 issue */
+ if (!cpu_has_feature(CPU_FTR_ARCH_207S) && offset == 1)
+ slb_cache_slbie_user(0);
+
+ } else {
+ /* Flush but retain kernel lookaside information */
+ __slb_flush_and_restore_bolted(true);
+ isync();
+
+ get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
+ }
+
+ get_paca()->slb_cache_ptr = 0;
+ }
+ get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
+
+ copy_mm_to_paca(mm);
+
+ if (slb_preload_disabled())
+ return;
+
+ /*
+ * We gradually age out SLBs after a number of context switches to
+ * reduce reload overhead of unused entries (like we do with FP/VEC
+ * reload). Each time we wrap 256 switches, take an entry out of the
+ * SLB preload cache.
+ */
+ tsk->thread.load_slb++;
+ if (!tsk->thread.load_slb) {
+ unsigned long pc = KSTK_EIP(tsk);
+
+ preload_age(ti);
+ preload_add(ti, pc);
+ }
+
+ for (i = 0; i < ti->slb_preload_nr; i++) {
+ unsigned char idx;
+ unsigned long ea;
+
+ idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
+ ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT;
+
+ slb_allocate_user(mm, ea);
+ }
+
+ /*
+ * Synchronize slbmte preloads with possible subsequent user memory
+ * address accesses by the kernel (user mode won't happen until
+ * rfid, which is safe).
+ */
+ isync();
+}
+
+void slb_set_size(u16 size)
+{
+ mmu_slb_size = size;
+}
+
+void slb_initialize(void)
+{
+ unsigned long linear_llp, vmalloc_llp, io_llp;
+ unsigned long lflags;
+ static int slb_encoding_inited;
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ unsigned long vmemmap_llp;
+#endif
+
+ /* Prepare our SLB miss handler based on our page size */
+ linear_llp = mmu_psize_defs[mmu_linear_psize].sllp;
+ io_llp = mmu_psize_defs[mmu_io_psize].sllp;
+ vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp;
+ get_paca()->vmalloc_sllp = SLB_VSID_KERNEL | vmalloc_llp;
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ vmemmap_llp = mmu_psize_defs[mmu_vmemmap_psize].sllp;
+#endif
+ if (!slb_encoding_inited) {
+ slb_encoding_inited = 1;
+ pr_devel("SLB: linear LLP = %04lx\n", linear_llp);
+ pr_devel("SLB: io LLP = %04lx\n", io_llp);
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ pr_devel("SLB: vmemmap LLP = %04lx\n", vmemmap_llp);
+#endif
+ }
+
+ get_paca()->stab_rr = SLB_NUM_BOLTED - 1;
+ get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
+ get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
+
+ lflags = SLB_VSID_KERNEL | linear_llp;
+
+ /* Invalidate the entire SLB (even entry 0) & all the ERATS */
+ asm volatile("isync":::"memory");
+ asm volatile("slbmte %0,%0"::"r" (0) : "memory");
+ asm volatile("isync; slbia; isync":::"memory");
+ create_shadowed_slbe(PAGE_OFFSET, mmu_kernel_ssize, lflags, LINEAR_INDEX);
+
+ /*
+ * For the boot cpu, we're running on the stack in init_thread_union,
+ * which is in the first segment of the linear mapping, and also
+ * get_paca()->kstack hasn't been initialized yet.
+ * For secondary cpus, we need to bolt the kernel stack entry now.
+ */
+ slb_shadow_clear(KSTACK_INDEX);
+ if (raw_smp_processor_id() != boot_cpuid &&
+ (get_paca()->kstack & slb_esid_mask(mmu_kernel_ssize)) > PAGE_OFFSET)
+ create_shadowed_slbe(get_paca()->kstack,
+ mmu_kernel_ssize, lflags, KSTACK_INDEX);
+
+ asm volatile("isync":::"memory");
+}
+
+static void slb_cache_update(unsigned long esid_data)
+{
+ int slb_cache_index;
+
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ return; /* ISAv3.0B and later does not use slb_cache */
+
+ if (stress_slb())
+ return;
+
+ /*
+ * Now update slb cache entries
+ */
+ slb_cache_index = local_paca->slb_cache_ptr;
+ if (slb_cache_index < SLB_CACHE_ENTRIES) {
+ /*
+ * We have space in slb cache for optimized switch_slb().
+ * Top 36 bits from esid_data as per ISA
+ */
+ local_paca->slb_cache[slb_cache_index++] = esid_data >> SID_SHIFT;
+ local_paca->slb_cache_ptr++;
+ } else {
+ /*
+ * Our cache is full and the current cache content strictly
+ * doesn't indicate the active SLB contents. Bump the ptr
+ * so that switch_slb() will ignore the cache.
+ */
+ local_paca->slb_cache_ptr = SLB_CACHE_ENTRIES + 1;
+ }
+}
+
+static enum slb_index alloc_slb_index(bool kernel)
+{
+ enum slb_index index;
+
+ /*
+ * The allocation bitmaps can become out of synch with the SLB
+ * when the _switch code does slbie when bolting a new stack
+ * segment and it must not be anywhere else in the SLB. This leaves
+ * a kernel allocated entry that is unused in the SLB. With very
+ * large systems or small segment sizes, the bitmaps could slowly
+ * fill with these entries. They will eventually be cleared out
+ * by the round robin allocator in that case, so it's probably not
+ * worth accounting for.
+ */
+
+ /*
+ * SLBs beyond 32 entries are allocated with stab_rr only
+ * POWER7/8/9 have 32 SLB entries, this could be expanded if a
+ * future CPU has more.
+ */
+ if (local_paca->slb_used_bitmap != U32_MAX) {
+ index = ffz(local_paca->slb_used_bitmap);
+ local_paca->slb_used_bitmap |= 1U << index;
+ if (kernel)
+ local_paca->slb_kern_bitmap |= 1U << index;
+ } else {
+ /* round-robin replacement of slb starting at SLB_NUM_BOLTED. */
+ index = local_paca->stab_rr;
+ if (index < (mmu_slb_size - 1))
+ index++;
+ else
+ index = SLB_NUM_BOLTED;
+ local_paca->stab_rr = index;
+ if (index < 32) {
+ if (kernel)
+ local_paca->slb_kern_bitmap |= 1U << index;
+ else
+ local_paca->slb_kern_bitmap &= ~(1U << index);
+ }
+ }
+ BUG_ON(index < SLB_NUM_BOLTED);
+
+ return index;
+}
+
+static long slb_insert_entry(unsigned long ea, unsigned long context,
+ unsigned long flags, int ssize, bool kernel)
+{
+ unsigned long vsid;
+ unsigned long vsid_data, esid_data;
+ enum slb_index index;
+
+ vsid = get_vsid(context, ea, ssize);
+ if (!vsid)
+ return -EFAULT;
+
+ /*
+ * There must not be a kernel SLB fault in alloc_slb_index or before
+ * slbmte here or the allocation bitmaps could get out of whack with
+ * the SLB.
+ *
+ * User SLB faults or preloads take this path which might get inlined
+ * into the caller, so add compiler barriers here to ensure unsafe
+ * memory accesses do not come between.
+ */
+ barrier();
+
+ index = alloc_slb_index(kernel);
+
+ vsid_data = __mk_vsid_data(vsid, ssize, flags);
+ esid_data = mk_esid_data(ea, ssize, index);
+
+ /*
+ * No need for an isync before or after this slbmte. The exception
+ * we enter with and the rfid we exit with are context synchronizing.
+ * User preloads should add isync afterwards in case the kernel
+ * accesses user memory before it returns to userspace with rfid.
+ */
+ assert_slb_presence(false, ea);
+ if (stress_slb()) {
+ int slb_cache_index = local_paca->slb_cache_ptr;
+
+ /*
+ * stress_slb() does not use slb cache, repurpose as a
+ * cache of inserted (non-bolted) kernel SLB entries. All
+ * non-bolted kernel entries are flushed on any user fault,
+ * or if there are already 3 non-boled kernel entries.
+ */
+ BUILD_BUG_ON(SLB_CACHE_ENTRIES < 3);
+ if (!kernel || slb_cache_index == 3) {
+ int i;
+
+ for (i = 0; i < slb_cache_index; i++)
+ slb_cache_slbie_kernel(i);
+ slb_cache_index = 0;
+ }
+
+ if (kernel)
+ local_paca->slb_cache[slb_cache_index++] = esid_data >> SID_SHIFT;
+ local_paca->slb_cache_ptr = slb_cache_index;
+ }
+ asm volatile("slbmte %0, %1" : : "r" (vsid_data), "r" (esid_data));
+
+ barrier();
+
+ if (!kernel)
+ slb_cache_update(esid_data);
+
+ return 0;
+}
+
+static long slb_allocate_kernel(unsigned long ea, unsigned long id)
+{
+ unsigned long context;
+ unsigned long flags;
+ int ssize;
+
+ if (id == LINEAR_MAP_REGION_ID) {
+
+ /* We only support upto H_MAX_PHYSMEM_BITS */
+ if ((ea & EA_MASK) > (1UL << H_MAX_PHYSMEM_BITS))
+ return -EFAULT;
+
+ flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_linear_psize].sllp;
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ } else if (id == VMEMMAP_REGION_ID) {
+
+ if (ea >= H_VMEMMAP_END)
+ return -EFAULT;
+
+ flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmemmap_psize].sllp;
+#endif
+ } else if (id == VMALLOC_REGION_ID) {
+
+ if (ea >= H_VMALLOC_END)
+ return -EFAULT;
+
+ flags = local_paca->vmalloc_sllp;
+
+ } else if (id == IO_REGION_ID) {
+
+ if (ea >= H_KERN_IO_END)
+ return -EFAULT;
+
+ flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_io_psize].sllp;
+
+ } else {
+ return -EFAULT;
+ }
+
+ ssize = MMU_SEGSIZE_1T;
+ if (!mmu_has_feature(MMU_FTR_1T_SEGMENT))
+ ssize = MMU_SEGSIZE_256M;
+
+ context = get_kernel_context(ea);
+
+ return slb_insert_entry(ea, context, flags, ssize, true);
+}
+
+static long slb_allocate_user(struct mm_struct *mm, unsigned long ea)
+{
+ unsigned long context;
+ unsigned long flags;
+ int bpsize;
+ int ssize;
+
+ /*
+ * consider this as bad access if we take a SLB miss
+ * on an address above addr limit.
+ */
+ if (ea >= mm_ctx_slb_addr_limit(&mm->context))
+ return -EFAULT;
+
+ context = get_user_context(&mm->context, ea);
+ if (!context)
+ return -EFAULT;
+
+ if (unlikely(ea >= H_PGTABLE_RANGE)) {
+ WARN_ON(1);
+ return -EFAULT;
+ }
+
+ ssize = user_segment_size(ea);
+
+ bpsize = get_slice_psize(mm, ea);
+ flags = SLB_VSID_USER | mmu_psize_defs[bpsize].sllp;
+
+ return slb_insert_entry(ea, context, flags, ssize, false);
+}
+
+DEFINE_INTERRUPT_HANDLER_RAW(do_slb_fault)
+{
+ unsigned long ea = regs->dar;
+ unsigned long id = get_region_id(ea);
+
+ /* IRQs are not reconciled here, so can't check irqs_disabled */
+ VM_WARN_ON(mfmsr() & MSR_EE);
+
+ if (regs_is_unrecoverable(regs))
+ return -EINVAL;
+
+ /*
+ * SLB kernel faults must be very careful not to touch anything that is
+ * not bolted. E.g., PACA and global variables are okay, mm->context
+ * stuff is not. SLB user faults may access all of memory (and induce
+ * one recursive SLB kernel fault), so the kernel fault must not
+ * trample on the user fault state at those points.
+ */
+
+ /*
+ * This is a raw interrupt handler, for performance, so that
+ * fast_interrupt_return can be used. The handler must not touch local
+ * irq state, or schedule. We could test for usermode and upgrade to a
+ * normal process context (synchronous) interrupt for those, which
+ * would make them first-class kernel code and able to be traced and
+ * instrumented, although performance would suffer a bit, it would
+ * probably be a good tradeoff.
+ */
+ if (id >= LINEAR_MAP_REGION_ID) {
+ long err;
+#ifdef CONFIG_DEBUG_VM
+ /* Catch recursive kernel SLB faults. */
+ BUG_ON(local_paca->in_kernel_slb_handler);
+ local_paca->in_kernel_slb_handler = 1;
+#endif
+ err = slb_allocate_kernel(ea, id);
+#ifdef CONFIG_DEBUG_VM
+ local_paca->in_kernel_slb_handler = 0;
+#endif
+ return err;
+ } else {
+ struct mm_struct *mm = current->mm;
+ long err;
+
+ if (unlikely(!mm))
+ return -EFAULT;
+
+ err = slb_allocate_user(mm, ea);
+ if (!err)
+ preload_add(current_thread_info(), ea);
+
+ return err;
+ }
+}
diff --git a/arch/powerpc/mm/book3s64/slice.c b/arch/powerpc/mm/book3s64/slice.c
new file mode 100644
index 000000000000..28bec5bc7879
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/slice.c
@@ -0,0 +1,819 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * address space "slices" (meta-segments) support
+ *
+ * Copyright (C) 2007 Benjamin Herrenschmidt, IBM Corporation.
+ *
+ * Based on hugetlb implementation
+ *
+ * Copyright (C) 2003 David Gibson, IBM Corporation.
+ */
+
+#undef DEBUG
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/err.h>
+#include <linux/spinlock.h>
+#include <linux/export.h>
+#include <linux/hugetlb.h>
+#include <linux/sched/mm.h>
+#include <linux/security.h>
+#include <asm/mman.h>
+#include <asm/mmu.h>
+#include <asm/spu.h>
+#include <asm/hugetlb.h>
+#include <asm/mmu_context.h>
+
+static DEFINE_SPINLOCK(slice_convert_lock);
+
+#ifdef DEBUG
+int _slice_debug = 1;
+
+static void slice_print_mask(const char *label, const struct slice_mask *mask)
+{
+ if (!_slice_debug)
+ return;
+ pr_devel("%s low_slice: %*pbl\n", label,
+ (int)SLICE_NUM_LOW, &mask->low_slices);
+ pr_devel("%s high_slice: %*pbl\n", label,
+ (int)SLICE_NUM_HIGH, mask->high_slices);
+}
+
+#define slice_dbg(fmt...) do { if (_slice_debug) pr_devel(fmt); } while (0)
+
+#else
+
+static void slice_print_mask(const char *label, const struct slice_mask *mask) {}
+#define slice_dbg(fmt...)
+
+#endif
+
+static inline notrace bool slice_addr_is_low(unsigned long addr)
+{
+ u64 tmp = (u64)addr;
+
+ return tmp < SLICE_LOW_TOP;
+}
+
+static void slice_range_to_mask(unsigned long start, unsigned long len,
+ struct slice_mask *ret)
+{
+ unsigned long end = start + len - 1;
+
+ ret->low_slices = 0;
+ if (SLICE_NUM_HIGH)
+ bitmap_zero(ret->high_slices, SLICE_NUM_HIGH);
+
+ if (slice_addr_is_low(start)) {
+ unsigned long mend = min(end,
+ (unsigned long)(SLICE_LOW_TOP - 1));
+
+ ret->low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1))
+ - (1u << GET_LOW_SLICE_INDEX(start));
+ }
+
+ if (SLICE_NUM_HIGH && !slice_addr_is_low(end)) {
+ unsigned long start_index = GET_HIGH_SLICE_INDEX(start);
+ unsigned long align_end = ALIGN(end, (1UL << SLICE_HIGH_SHIFT));
+ unsigned long count = GET_HIGH_SLICE_INDEX(align_end) - start_index;
+
+ bitmap_set(ret->high_slices, start_index, count);
+ }
+}
+
+static int slice_area_is_free(struct mm_struct *mm, unsigned long addr,
+ unsigned long len)
+{
+ struct vm_area_struct *vma;
+
+ if ((mm_ctx_slb_addr_limit(&mm->context) - len) < addr)
+ return 0;
+ vma = find_vma(mm, addr);
+ return (!vma || (addr + len) <= vm_start_gap(vma));
+}
+
+static int slice_low_has_vma(struct mm_struct *mm, unsigned long slice)
+{
+ return !slice_area_is_free(mm, slice << SLICE_LOW_SHIFT,
+ 1ul << SLICE_LOW_SHIFT);
+}
+
+static int slice_high_has_vma(struct mm_struct *mm, unsigned long slice)
+{
+ unsigned long start = slice << SLICE_HIGH_SHIFT;
+ unsigned long end = start + (1ul << SLICE_HIGH_SHIFT);
+
+ /* Hack, so that each addresses is controlled by exactly one
+ * of the high or low area bitmaps, the first high area starts
+ * at 4GB, not 0 */
+ if (start == 0)
+ start = (unsigned long)SLICE_LOW_TOP;
+
+ return !slice_area_is_free(mm, start, end - start);
+}
+
+static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret,
+ unsigned long high_limit)
+{
+ unsigned long i;
+
+ ret->low_slices = 0;
+ if (SLICE_NUM_HIGH)
+ bitmap_zero(ret->high_slices, SLICE_NUM_HIGH);
+
+ for (i = 0; i < SLICE_NUM_LOW; i++)
+ if (!slice_low_has_vma(mm, i))
+ ret->low_slices |= 1u << i;
+
+ if (slice_addr_is_low(high_limit - 1))
+ return;
+
+ for (i = 0; i < GET_HIGH_SLICE_INDEX(high_limit); i++)
+ if (!slice_high_has_vma(mm, i))
+ __set_bit(i, ret->high_slices);
+}
+
+static bool slice_check_range_fits(struct mm_struct *mm,
+ const struct slice_mask *available,
+ unsigned long start, unsigned long len)
+{
+ unsigned long end = start + len - 1;
+ u64 low_slices = 0;
+
+ if (slice_addr_is_low(start)) {
+ unsigned long mend = min(end,
+ (unsigned long)(SLICE_LOW_TOP - 1));
+
+ low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1))
+ - (1u << GET_LOW_SLICE_INDEX(start));
+ }
+ if ((low_slices & available->low_slices) != low_slices)
+ return false;
+
+ if (SLICE_NUM_HIGH && !slice_addr_is_low(end)) {
+ unsigned long start_index = GET_HIGH_SLICE_INDEX(start);
+ unsigned long align_end = ALIGN(end, (1UL << SLICE_HIGH_SHIFT));
+ unsigned long count = GET_HIGH_SLICE_INDEX(align_end) - start_index;
+ unsigned long i;
+
+ for (i = start_index; i < start_index + count; i++) {
+ if (!test_bit(i, available->high_slices))
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static void slice_flush_segments(void *parm)
+{
+#ifdef CONFIG_PPC64
+ struct mm_struct *mm = parm;
+ unsigned long flags;
+
+ if (mm != current->active_mm)
+ return;
+
+ copy_mm_to_paca(current->active_mm);
+
+ local_irq_save(flags);
+ slb_flush_and_restore_bolted();
+ local_irq_restore(flags);
+#endif
+}
+
+static void slice_convert(struct mm_struct *mm,
+ const struct slice_mask *mask, int psize)
+{
+ int index, mask_index;
+ /* Write the new slice psize bits */
+ unsigned char *hpsizes, *lpsizes;
+ struct slice_mask *psize_mask, *old_mask;
+ unsigned long i, flags;
+ int old_psize;
+
+ slice_dbg("slice_convert(mm=%p, psize=%d)\n", mm, psize);
+ slice_print_mask(" mask", mask);
+
+ psize_mask = slice_mask_for_size(&mm->context, psize);
+
+ /* We need to use a spinlock here to protect against
+ * concurrent 64k -> 4k demotion ...
+ */
+ spin_lock_irqsave(&slice_convert_lock, flags);
+
+ lpsizes = mm_ctx_low_slices(&mm->context);
+ for (i = 0; i < SLICE_NUM_LOW; i++) {
+ if (!(mask->low_slices & (1u << i)))
+ continue;
+
+ mask_index = i & 0x1;
+ index = i >> 1;
+
+ /* Update the slice_mask */
+ old_psize = (lpsizes[index] >> (mask_index * 4)) & 0xf;
+ old_mask = slice_mask_for_size(&mm->context, old_psize);
+ old_mask->low_slices &= ~(1u << i);
+ psize_mask->low_slices |= 1u << i;
+
+ /* Update the sizes array */
+ lpsizes[index] = (lpsizes[index] & ~(0xf << (mask_index * 4))) |
+ (((unsigned long)psize) << (mask_index * 4));
+ }
+
+ hpsizes = mm_ctx_high_slices(&mm->context);
+ for (i = 0; i < GET_HIGH_SLICE_INDEX(mm_ctx_slb_addr_limit(&mm->context)); i++) {
+ if (!test_bit(i, mask->high_slices))
+ continue;
+
+ mask_index = i & 0x1;
+ index = i >> 1;
+
+ /* Update the slice_mask */
+ old_psize = (hpsizes[index] >> (mask_index * 4)) & 0xf;
+ old_mask = slice_mask_for_size(&mm->context, old_psize);
+ __clear_bit(i, old_mask->high_slices);
+ __set_bit(i, psize_mask->high_slices);
+
+ /* Update the sizes array */
+ hpsizes[index] = (hpsizes[index] & ~(0xf << (mask_index * 4))) |
+ (((unsigned long)psize) << (mask_index * 4));
+ }
+
+ slice_dbg(" lsps=%lx, hsps=%lx\n",
+ (unsigned long)mm_ctx_low_slices(&mm->context),
+ (unsigned long)mm_ctx_high_slices(&mm->context));
+
+ spin_unlock_irqrestore(&slice_convert_lock, flags);
+
+#ifdef CONFIG_SPU_BASE
+ spu_flush_all_slbs(mm);
+#endif
+}
+
+/*
+ * Compute which slice addr is part of;
+ * set *boundary_addr to the start or end boundary of that slice
+ * (depending on 'end' parameter);
+ * return boolean indicating if the slice is marked as available in the
+ * 'available' slice_mark.
+ */
+static bool slice_scan_available(unsigned long addr,
+ const struct slice_mask *available,
+ int end, unsigned long *boundary_addr)
+{
+ unsigned long slice;
+ if (slice_addr_is_low(addr)) {
+ slice = GET_LOW_SLICE_INDEX(addr);
+ *boundary_addr = (slice + end) << SLICE_LOW_SHIFT;
+ return !!(available->low_slices & (1u << slice));
+ } else {
+ slice = GET_HIGH_SLICE_INDEX(addr);
+ *boundary_addr = (slice + end) ?
+ ((slice + end) << SLICE_HIGH_SHIFT) : SLICE_LOW_TOP;
+ return !!test_bit(slice, available->high_slices);
+ }
+}
+
+static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
+ unsigned long addr, unsigned long len,
+ const struct slice_mask *available,
+ int psize, unsigned long high_limit)
+{
+ int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
+ unsigned long found, next_end;
+ struct vm_unmapped_area_info info = {
+ .length = len,
+ .align_mask = PAGE_MASK & ((1ul << pshift) - 1),
+ };
+ /*
+ * Check till the allow max value for this mmap request
+ */
+ while (addr < high_limit) {
+ info.low_limit = addr;
+ if (!slice_scan_available(addr, available, 1, &addr))
+ continue;
+
+ next_slice:
+ /*
+ * At this point [info.low_limit; addr) covers
+ * available slices only and ends at a slice boundary.
+ * Check if we need to reduce the range, or if we can
+ * extend it to cover the next available slice.
+ */
+ if (addr >= high_limit)
+ addr = high_limit;
+ else if (slice_scan_available(addr, available, 1, &next_end)) {
+ addr = next_end;
+ goto next_slice;
+ }
+ info.high_limit = addr;
+
+ found = vm_unmapped_area(&info);
+ if (!(found & ~PAGE_MASK))
+ return found;
+ }
+
+ return -ENOMEM;
+}
+
+static unsigned long slice_find_area_topdown(struct mm_struct *mm,
+ unsigned long addr, unsigned long len,
+ const struct slice_mask *available,
+ int psize, unsigned long high_limit)
+{
+ int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
+ unsigned long found, prev;
+ struct vm_unmapped_area_info info = {
+ .flags = VM_UNMAPPED_AREA_TOPDOWN,
+ .length = len,
+ .align_mask = PAGE_MASK & ((1ul << pshift) - 1),
+ };
+ unsigned long min_addr = max(PAGE_SIZE, mmap_min_addr);
+
+ /*
+ * If we are trying to allocate above DEFAULT_MAP_WINDOW
+ * Add the different to the mmap_base.
+ * Only for that request for which high_limit is above
+ * DEFAULT_MAP_WINDOW we should apply this.
+ */
+ if (high_limit > DEFAULT_MAP_WINDOW)
+ addr += mm_ctx_slb_addr_limit(&mm->context) - DEFAULT_MAP_WINDOW;
+
+ while (addr > min_addr) {
+ info.high_limit = addr;
+ if (!slice_scan_available(addr - 1, available, 0, &addr))
+ continue;
+
+ prev_slice:
+ /*
+ * At this point [addr; info.high_limit) covers
+ * available slices only and starts at a slice boundary.
+ * Check if we need to reduce the range, or if we can
+ * extend it to cover the previous available slice.
+ */
+ if (addr < min_addr)
+ addr = min_addr;
+ else if (slice_scan_available(addr - 1, available, 0, &prev)) {
+ addr = prev;
+ goto prev_slice;
+ }
+ info.low_limit = addr;
+
+ found = vm_unmapped_area(&info);
+ if (!(found & ~PAGE_MASK))
+ return found;
+ }
+
+ /*
+ * A failed mmap() very likely causes application failure,
+ * so fall back to the bottom-up function here. This scenario
+ * can happen with large stack limits and large mmap()
+ * allocations.
+ */
+ return slice_find_area_bottomup(mm, TASK_UNMAPPED_BASE, len, available, psize, high_limit);
+}
+
+
+static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len,
+ const struct slice_mask *mask, int psize,
+ int topdown, unsigned long high_limit)
+{
+ if (topdown)
+ return slice_find_area_topdown(mm, mm->mmap_base, len, mask, psize, high_limit);
+ else
+ return slice_find_area_bottomup(mm, mm->mmap_base, len, mask, psize, high_limit);
+}
+
+static inline void slice_copy_mask(struct slice_mask *dst,
+ const struct slice_mask *src)
+{
+ dst->low_slices = src->low_slices;
+ if (!SLICE_NUM_HIGH)
+ return;
+ bitmap_copy(dst->high_slices, src->high_slices, SLICE_NUM_HIGH);
+}
+
+static inline void slice_or_mask(struct slice_mask *dst,
+ const struct slice_mask *src1,
+ const struct slice_mask *src2)
+{
+ dst->low_slices = src1->low_slices | src2->low_slices;
+ if (!SLICE_NUM_HIGH)
+ return;
+ bitmap_or(dst->high_slices, src1->high_slices, src2->high_slices, SLICE_NUM_HIGH);
+}
+
+static inline void slice_andnot_mask(struct slice_mask *dst,
+ const struct slice_mask *src1,
+ const struct slice_mask *src2)
+{
+ dst->low_slices = src1->low_slices & ~src2->low_slices;
+ if (!SLICE_NUM_HIGH)
+ return;
+ bitmap_andnot(dst->high_slices, src1->high_slices, src2->high_slices, SLICE_NUM_HIGH);
+}
+
+#ifdef CONFIG_PPC_64K_PAGES
+#define MMU_PAGE_BASE MMU_PAGE_64K
+#else
+#define MMU_PAGE_BASE MMU_PAGE_4K
+#endif
+
+unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
+ unsigned long flags, unsigned int psize,
+ int topdown)
+{
+ struct slice_mask good_mask;
+ struct slice_mask potential_mask;
+ const struct slice_mask *maskp;
+ const struct slice_mask *compat_maskp = NULL;
+ int fixed = (flags & MAP_FIXED);
+ int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
+ unsigned long page_size = 1UL << pshift;
+ struct mm_struct *mm = current->mm;
+ unsigned long newaddr;
+ unsigned long high_limit;
+
+ high_limit = DEFAULT_MAP_WINDOW;
+ if (addr >= high_limit || (fixed && (addr + len > high_limit)))
+ high_limit = TASK_SIZE;
+
+ if (len > high_limit)
+ return -ENOMEM;
+ if (len & (page_size - 1))
+ return -EINVAL;
+ if (fixed) {
+ if (addr & (page_size - 1))
+ return -EINVAL;
+ if (addr > high_limit - len)
+ return -ENOMEM;
+ }
+
+ if (high_limit > mm_ctx_slb_addr_limit(&mm->context)) {
+ /*
+ * Increasing the slb_addr_limit does not require
+ * slice mask cache to be recalculated because it should
+ * be already initialised beyond the old address limit.
+ */
+ mm_ctx_set_slb_addr_limit(&mm->context, high_limit);
+
+ on_each_cpu(slice_flush_segments, mm, 1);
+ }
+
+ /* Sanity checks */
+ BUG_ON(mm->task_size == 0);
+ BUG_ON(mm_ctx_slb_addr_limit(&mm->context) == 0);
+ VM_BUG_ON(radix_enabled());
+
+ slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize);
+ slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d\n",
+ addr, len, flags, topdown);
+
+ /* If hint, make sure it matches our alignment restrictions */
+ if (!fixed && addr) {
+ addr = ALIGN(addr, page_size);
+ slice_dbg(" aligned addr=%lx\n", addr);
+ /* Ignore hint if it's too large or overlaps a VMA */
+ if (addr > high_limit - len || addr < mmap_min_addr ||
+ !slice_area_is_free(mm, addr, len))
+ addr = 0;
+ }
+
+ /* First make up a "good" mask of slices that have the right size
+ * already
+ */
+ maskp = slice_mask_for_size(&mm->context, psize);
+
+ /*
+ * Here "good" means slices that are already the right page size,
+ * "compat" means slices that have a compatible page size (i.e.
+ * 4k in a 64k pagesize kernel), and "free" means slices without
+ * any VMAs.
+ *
+ * If MAP_FIXED:
+ * check if fits in good | compat => OK
+ * check if fits in good | compat | free => convert free
+ * else bad
+ * If have hint:
+ * check if hint fits in good => OK
+ * check if hint fits in good | free => convert free
+ * Otherwise:
+ * search in good, found => OK
+ * search in good | free, found => convert free
+ * search in good | compat | free, found => convert free.
+ */
+
+ /*
+ * If we support combo pages, we can allow 64k pages in 4k slices
+ * The mask copies could be avoided in most cases here if we had
+ * a pointer to good mask for the next code to use.
+ */
+ if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && psize == MMU_PAGE_64K) {
+ compat_maskp = slice_mask_for_size(&mm->context, MMU_PAGE_4K);
+ if (fixed)
+ slice_or_mask(&good_mask, maskp, compat_maskp);
+ else
+ slice_copy_mask(&good_mask, maskp);
+ } else {
+ slice_copy_mask(&good_mask, maskp);
+ }
+
+ slice_print_mask(" good_mask", &good_mask);
+ if (compat_maskp)
+ slice_print_mask(" compat_mask", compat_maskp);
+
+ /* First check hint if it's valid or if we have MAP_FIXED */
+ if (addr != 0 || fixed) {
+ /* Check if we fit in the good mask. If we do, we just return,
+ * nothing else to do
+ */
+ if (slice_check_range_fits(mm, &good_mask, addr, len)) {
+ slice_dbg(" fits good !\n");
+ newaddr = addr;
+ goto return_addr;
+ }
+ } else {
+ /* Now let's see if we can find something in the existing
+ * slices for that size
+ */
+ newaddr = slice_find_area(mm, len, &good_mask,
+ psize, topdown, high_limit);
+ if (newaddr != -ENOMEM) {
+ /* Found within the good mask, we don't have to setup,
+ * we thus return directly
+ */
+ slice_dbg(" found area at 0x%lx\n", newaddr);
+ goto return_addr;
+ }
+ }
+ /*
+ * We don't fit in the good mask, check what other slices are
+ * empty and thus can be converted
+ */
+ slice_mask_for_free(mm, &potential_mask, high_limit);
+ slice_or_mask(&potential_mask, &potential_mask, &good_mask);
+ slice_print_mask(" potential", &potential_mask);
+
+ if (addr != 0 || fixed) {
+ if (slice_check_range_fits(mm, &potential_mask, addr, len)) {
+ slice_dbg(" fits potential !\n");
+ newaddr = addr;
+ goto convert;
+ }
+ }
+
+ /* If we have MAP_FIXED and failed the above steps, then error out */
+ if (fixed)
+ return -EBUSY;
+
+ slice_dbg(" search...\n");
+
+ /* If we had a hint that didn't work out, see if we can fit
+ * anywhere in the good area.
+ */
+ if (addr) {
+ newaddr = slice_find_area(mm, len, &good_mask,
+ psize, topdown, high_limit);
+ if (newaddr != -ENOMEM) {
+ slice_dbg(" found area at 0x%lx\n", newaddr);
+ goto return_addr;
+ }
+ }
+
+ /* Now let's see if we can find something in the existing slices
+ * for that size plus free slices
+ */
+ newaddr = slice_find_area(mm, len, &potential_mask,
+ psize, topdown, high_limit);
+
+ if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && newaddr == -ENOMEM &&
+ psize == MMU_PAGE_64K) {
+ /* retry the search with 4k-page slices included */
+ slice_or_mask(&potential_mask, &potential_mask, compat_maskp);
+ newaddr = slice_find_area(mm, len, &potential_mask,
+ psize, topdown, high_limit);
+ }
+
+ if (newaddr == -ENOMEM)
+ return -ENOMEM;
+
+ slice_range_to_mask(newaddr, len, &potential_mask);
+ slice_dbg(" found potential area at 0x%lx\n", newaddr);
+ slice_print_mask(" mask", &potential_mask);
+
+ convert:
+ /*
+ * Try to allocate the context before we do slice convert
+ * so that we handle the context allocation failure gracefully.
+ */
+ if (need_extra_context(mm, newaddr)) {
+ if (alloc_extended_context(mm, newaddr) < 0)
+ return -ENOMEM;
+ }
+
+ slice_andnot_mask(&potential_mask, &potential_mask, &good_mask);
+ if (compat_maskp && !fixed)
+ slice_andnot_mask(&potential_mask, &potential_mask, compat_maskp);
+ if (potential_mask.low_slices ||
+ (SLICE_NUM_HIGH &&
+ !bitmap_empty(potential_mask.high_slices, SLICE_NUM_HIGH))) {
+ slice_convert(mm, &potential_mask, psize);
+ if (psize > MMU_PAGE_BASE)
+ on_each_cpu(slice_flush_segments, mm, 1);
+ }
+ return newaddr;
+
+return_addr:
+ if (need_extra_context(mm, newaddr)) {
+ if (alloc_extended_context(mm, newaddr) < 0)
+ return -ENOMEM;
+ }
+ return newaddr;
+}
+EXPORT_SYMBOL_GPL(slice_get_unmapped_area);
+
+#ifdef CONFIG_HUGETLB_PAGE
+static int file_to_psize(struct file *file)
+{
+ struct hstate *hstate = hstate_file(file);
+
+ return shift_to_mmu_psize(huge_page_shift(hstate));
+}
+#else
+static int file_to_psize(struct file *file)
+{
+ return 0;
+}
+#endif
+
+unsigned long arch_get_unmapped_area(struct file *filp,
+ unsigned long addr,
+ unsigned long len,
+ unsigned long pgoff,
+ unsigned long flags,
+ vm_flags_t vm_flags)
+{
+ unsigned int psize;
+
+ if (radix_enabled())
+ return generic_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags);
+
+ if (filp && is_file_hugepages(filp))
+ psize = file_to_psize(filp);
+ else
+ psize = mm_ctx_user_psize(&current->mm->context);
+
+ return slice_get_unmapped_area(addr, len, flags, psize, 0);
+}
+
+unsigned long arch_get_unmapped_area_topdown(struct file *filp,
+ const unsigned long addr0,
+ const unsigned long len,
+ const unsigned long pgoff,
+ const unsigned long flags,
+ vm_flags_t vm_flags)
+{
+ unsigned int psize;
+
+ if (radix_enabled())
+ return generic_get_unmapped_area_topdown(filp, addr0, len, pgoff, flags, vm_flags);
+
+ if (filp && is_file_hugepages(filp))
+ psize = file_to_psize(filp);
+ else
+ psize = mm_ctx_user_psize(&current->mm->context);
+
+ return slice_get_unmapped_area(addr0, len, flags, psize, 1);
+}
+
+unsigned int notrace get_slice_psize(struct mm_struct *mm, unsigned long addr)
+{
+ unsigned char *psizes;
+ int index, mask_index;
+
+ VM_BUG_ON(radix_enabled());
+
+ if (slice_addr_is_low(addr)) {
+ psizes = mm_ctx_low_slices(&mm->context);
+ index = GET_LOW_SLICE_INDEX(addr);
+ } else {
+ psizes = mm_ctx_high_slices(&mm->context);
+ index = GET_HIGH_SLICE_INDEX(addr);
+ }
+ mask_index = index & 0x1;
+ return (psizes[index >> 1] >> (mask_index * 4)) & 0xf;
+}
+EXPORT_SYMBOL_GPL(get_slice_psize);
+
+void slice_init_new_context_exec(struct mm_struct *mm)
+{
+ unsigned char *hpsizes, *lpsizes;
+ struct slice_mask *mask;
+ unsigned int psize = mmu_virtual_psize;
+
+ slice_dbg("slice_init_new_context_exec(mm=%p)\n", mm);
+
+ /*
+ * In the case of exec, use the default limit. In the
+ * case of fork it is just inherited from the mm being
+ * duplicated.
+ */
+ mm_ctx_set_slb_addr_limit(&mm->context, SLB_ADDR_LIMIT_DEFAULT);
+ mm_ctx_set_user_psize(&mm->context, psize);
+
+ /*
+ * Set all slice psizes to the default.
+ */
+ lpsizes = mm_ctx_low_slices(&mm->context);
+ memset(lpsizes, (psize << 4) | psize, SLICE_NUM_LOW >> 1);
+
+ hpsizes = mm_ctx_high_slices(&mm->context);
+ memset(hpsizes, (psize << 4) | psize, SLICE_NUM_HIGH >> 1);
+
+ /*
+ * Slice mask cache starts zeroed, fill the default size cache.
+ */
+ mask = slice_mask_for_size(&mm->context, psize);
+ mask->low_slices = ~0UL;
+ if (SLICE_NUM_HIGH)
+ bitmap_fill(mask->high_slices, SLICE_NUM_HIGH);
+}
+
+void slice_setup_new_exec(void)
+{
+ struct mm_struct *mm = current->mm;
+
+ slice_dbg("slice_setup_new_exec(mm=%p)\n", mm);
+
+ if (!is_32bit_task())
+ return;
+
+ mm_ctx_set_slb_addr_limit(&mm->context, DEFAULT_MAP_WINDOW);
+}
+
+void slice_set_range_psize(struct mm_struct *mm, unsigned long start,
+ unsigned long len, unsigned int psize)
+{
+ struct slice_mask mask;
+
+ VM_BUG_ON(radix_enabled());
+
+ slice_range_to_mask(start, len, &mask);
+ slice_convert(mm, &mask, psize);
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * is_hugepage_only_range() is used by generic code to verify whether
+ * a normal mmap mapping (non hugetlbfs) is valid on a given area.
+ *
+ * until the generic code provides a more generic hook and/or starts
+ * calling arch get_unmapped_area for MAP_FIXED (which our implementation
+ * here knows how to deal with), we hijack it to keep standard mappings
+ * away from us.
+ *
+ * because of that generic code limitation, MAP_FIXED mapping cannot
+ * "convert" back a slice with no VMAs to the standard page size, only
+ * get_unmapped_area() can. It would be possible to fix it here but I
+ * prefer working on fixing the generic code instead.
+ *
+ * WARNING: This will not work if hugetlbfs isn't enabled since the
+ * generic code will redefine that function as 0 in that. This is ok
+ * for now as we only use slices with hugetlbfs enabled. This should
+ * be fixed as the generic code gets fixed.
+ */
+int slice_is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
+ unsigned long len)
+{
+ const struct slice_mask *maskp;
+ unsigned int psize = mm_ctx_user_psize(&mm->context);
+
+ VM_BUG_ON(radix_enabled());
+
+ maskp = slice_mask_for_size(&mm->context, psize);
+
+ /* We need to account for 4k slices too */
+ if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && psize == MMU_PAGE_64K) {
+ const struct slice_mask *compat_maskp;
+ struct slice_mask available;
+
+ compat_maskp = slice_mask_for_size(&mm->context, MMU_PAGE_4K);
+ slice_or_mask(&available, maskp, compat_maskp);
+ return !slice_check_range_fits(mm, &available, addr, len);
+ }
+
+ return !slice_check_range_fits(mm, maskp, addr, len);
+}
+
+unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
+{
+ /* With radix we don't use slice, so derive it from vma*/
+ if (radix_enabled())
+ return vma_kernel_pagesize(vma);
+
+ return 1UL << mmu_psize_to_shift(get_slice_psize(vma->vm_mm, vma->vm_start));
+}
+#endif
diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c
index aa74acb0fdfc..ec98e526167e 100644
--- a/arch/powerpc/mm/subpage-prot.c
+++ b/arch/powerpc/mm/book3s64/subpage_prot.c
@@ -1,22 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright 2007-2008 Paul Mackerras, IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/gfp.h>
#include <linux/types.h>
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
#include <linux/hugetlb.h>
+#include <linux/syscalls.h>
-#include <asm/pgtable.h>
-#include <asm/uaccess.h>
-#include <asm/tlbflush.h>
+#include <linux/pgtable.h>
+#include <linux/uaccess.h>
/*
* Free all pages allocated for subpage protection maps and pointers.
@@ -25,10 +21,13 @@
*/
void subpage_prot_free(struct mm_struct *mm)
{
- struct subpage_prot_table *spt = &mm->context.spt;
+ struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context);
unsigned long i, j, addr;
u32 **p;
+ if (!spt)
+ return;
+
for (i = 0; i < 4; ++i) {
if (spt->low_prot[i]) {
free_page((unsigned long)spt->low_prot[i]);
@@ -36,7 +35,7 @@ void subpage_prot_free(struct mm_struct *mm)
}
}
addr = 0;
- for (i = 0; i < 2; ++i) {
+ for (i = 0; i < (TASK_SIZE_USER64 >> 43); ++i) {
p = spt->protptrs[i];
if (!p)
continue;
@@ -48,37 +47,35 @@ void subpage_prot_free(struct mm_struct *mm)
free_page((unsigned long)p);
}
spt->maxaddr = 0;
-}
-
-void subpage_prot_init_new_context(struct mm_struct *mm)
-{
- struct subpage_prot_table *spt = &mm->context.spt;
-
- memset(spt, 0, sizeof(*spt));
+ kfree(spt);
}
static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
int npages)
{
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
spinlock_t *ptl;
pgd = pgd_offset(mm, addr);
- if (pgd_none(*pgd))
+ p4d = p4d_offset(pgd, addr);
+ if (p4d_none(*p4d))
return;
- pud = pud_offset(pgd, addr);
+ pud = pud_offset(p4d, addr);
if (pud_none(*pud))
return;
pmd = pmd_offset(pud, addr);
if (pmd_none(*pmd))
return;
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (!pte)
+ return;
arch_enter_lazy_mmu_mode();
for (; npages > 0; --npages) {
- pte_update(mm, addr, pte, 0, 0);
+ pte_update(mm, addr, pte, 0, 0, 0);
addr += PAGE_SIZE;
++pte;
}
@@ -93,19 +90,24 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
static void subpage_prot_clear(unsigned long addr, unsigned long len)
{
struct mm_struct *mm = current->mm;
- struct subpage_prot_table *spt = &mm->context.spt;
+ struct subpage_prot_table *spt;
u32 **spm, *spp;
unsigned long i;
size_t nw;
unsigned long next, limit;
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
+
+ spt = mm_ctx_subpage_prot(&mm->context);
+ if (!spt)
+ goto err_out;
+
limit = addr + len;
if (limit > spt->maxaddr)
limit = spt->maxaddr;
for (; addr < limit; addr = next) {
next = pmd_addr_end(addr, limit);
- if (addr < 0x100000000) {
+ if (addr < 0x100000000UL) {
spm = spt->low_prot;
} else {
spm = spt->protptrs[addr >> SBP_L3_SHIFT];
@@ -127,46 +129,38 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len)
/* now flush any existing HPTEs for the range */
hpte_flush_range(mm, addr, nw);
}
- up_write(&mm->mmap_sem);
+
+err_out:
+ mmap_write_unlock(mm);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
- struct vm_area_struct *vma = walk->private;
- split_huge_page_pmd(vma, addr, pmd);
+ struct vm_area_struct *vma = walk->vma;
+ split_huge_pmd(vma, pmd, addr);
return 0;
}
+static const struct mm_walk_ops subpage_walk_ops = {
+ .pmd_entry = subpage_walk_pmd_entry,
+ .walk_lock = PGWALK_WRLOCK_VERIFY,
+};
+
static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
unsigned long len)
{
struct vm_area_struct *vma;
- struct mm_walk subpage_proto_walk = {
- .mm = mm,
- .pmd_entry = subpage_walk_pmd_entry,
- };
+ VMA_ITERATOR(vmi, mm, addr);
/*
* We don't try too hard, we just mark all the vma in that range
* VM_NOHUGEPAGE and split them.
*/
- vma = find_vma(mm, addr);
- /*
- * If the range is in unmapped range, just return
- */
- if (vma && ((addr + len) <= vma->vm_start))
- return;
-
- while (vma) {
- if (vma->vm_start >= (addr + len))
- break;
- vma->vm_flags |= VM_NOHUGEPAGE;
- subpage_proto_walk.private = vma;
- walk_page_range(vma->vm_start, vma->vm_end,
- &subpage_proto_walk);
- vma = vma->vm_next;
+ for_each_vma_range(vmi, vma, addr + len) {
+ vm_flags_set(vma, VM_NOHUGEPAGE);
+ walk_page_vma(vma, &subpage_walk_ops, NULL);
}
}
#else
@@ -187,19 +181,24 @@ static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
* in a 2-bit field won't allow writes to a page that is otherwise
* write-protected.
*/
-long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map)
+SYSCALL_DEFINE3(subpage_prot, unsigned long, addr,
+ unsigned long, len, u32 __user *, map)
{
struct mm_struct *mm = current->mm;
- struct subpage_prot_table *spt = &mm->context.spt;
+ struct subpage_prot_table *spt;
u32 **spm, *spp;
unsigned long i;
size_t nw;
unsigned long next, limit;
int err;
+ if (radix_enabled())
+ return -ENOENT;
+
/* Check parameters */
if ((addr & ~PAGE_MASK) || (len & ~PAGE_MASK) ||
- addr >= TASK_SIZE || len >= TASK_SIZE || addr + len > TASK_SIZE)
+ addr >= mm->task_size || len >= mm->task_size ||
+ addr + len > mm->task_size)
return -EINVAL;
if (is_hugepage_only_range(mm, addr, len))
@@ -211,15 +210,30 @@ long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map)
return 0;
}
- if (!access_ok(VERIFY_READ, map, (len >> PAGE_SHIFT) * sizeof(u32)))
+ if (!access_ok(map, (len >> PAGE_SHIFT) * sizeof(u32)))
return -EFAULT;
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
+
+ spt = mm_ctx_subpage_prot(&mm->context);
+ if (!spt) {
+ /*
+ * Allocate subpage prot table if not already done.
+ * Do this with mmap_lock held
+ */
+ spt = kzalloc(sizeof(struct subpage_prot_table), GFP_KERNEL);
+ if (!spt) {
+ err = -ENOMEM;
+ goto out;
+ }
+ mm->context.hash_context->spt = spt;
+ }
+
subpage_mark_vma_nohuge(mm, addr, len);
for (limit = addr + len; addr < limit; addr = next) {
next = pmd_addr_end(addr, limit);
err = -ENOMEM;
- if (addr < 0x100000000) {
+ if (addr < 0x100000000UL) {
spm = spt->low_prot;
} else {
spm = spt->protptrs[addr >> SBP_L3_SHIFT];
@@ -249,12 +263,11 @@ long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map)
if (addr + (nw << PAGE_SHIFT) > next)
nw = (next - addr) >> PAGE_SHIFT;
- up_write(&mm->mmap_sem);
- err = -EFAULT;
+ mmap_write_unlock(mm);
if (__copy_from_user(spp, map, nw * sizeof(u32)))
- goto out2;
+ return -EFAULT;
map += nw;
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
/* now flush any existing HPTEs for the range */
hpte_flush_range(mm, addr, nw);
@@ -263,7 +276,6 @@ long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map)
spt->maxaddr = limit;
err = 0;
out:
- up_write(&mm->mmap_sem);
- out2:
+ mmap_write_unlock(mm);
return err;
}
diff --git a/arch/powerpc/mm/book3s64/trace.c b/arch/powerpc/mm/book3s64/trace.c
new file mode 100644
index 000000000000..ccd64b5e6cac
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/trace.c
@@ -0,0 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * This file is for defining trace points and trace related helpers.
+ */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#include <trace/events/thp.h>
+#endif
diff --git a/arch/powerpc/mm/cacheflush.c b/arch/powerpc/mm/cacheflush.c
new file mode 100644
index 000000000000..7186516eca52
--- /dev/null
+++ b/arch/powerpc/mm/cacheflush.c
@@ -0,0 +1,221 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/highmem.h>
+#include <linux/kprobes.h>
+
+/**
+ * flush_coherent_icache() - if a CPU has a coherent icache, flush it
+ * Return true if the cache was flushed, false otherwise
+ */
+static inline bool flush_coherent_icache(void)
+{
+ /*
+ * For a snooping icache, we still need a dummy icbi to purge all the
+ * prefetched instructions from the ifetch buffers. We also need a sync
+ * before the icbi to order the actual stores to memory that might
+ * have modified instructions with the icbi.
+ */
+ if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) {
+ mb(); /* sync */
+ icbi((void *)PAGE_OFFSET);
+ mb(); /* sync */
+ isync();
+ return true;
+ }
+
+ return false;
+}
+
+/**
+ * invalidate_icache_range() - Flush the icache by issuing icbi across an address range
+ * @start: the start address
+ * @stop: the stop address (exclusive)
+ */
+static void invalidate_icache_range(unsigned long start, unsigned long stop)
+{
+ unsigned long shift = l1_icache_shift();
+ unsigned long bytes = l1_icache_bytes();
+ char *addr = (char *)(start & ~(bytes - 1));
+ unsigned long size = stop - (unsigned long)addr + (bytes - 1);
+ unsigned long i;
+
+ for (i = 0; i < size >> shift; i++, addr += bytes)
+ icbi(addr);
+
+ mb(); /* sync */
+ isync();
+}
+
+/**
+ * flush_icache_range: Write any modified data cache blocks out to memory
+ * and invalidate the corresponding blocks in the instruction cache
+ *
+ * Generic code will call this after writing memory, before executing from it.
+ *
+ * @start: the start address
+ * @stop: the stop address (exclusive)
+ */
+void flush_icache_range(unsigned long start, unsigned long stop)
+{
+ if (flush_coherent_icache())
+ return;
+
+ clean_dcache_range(start, stop);
+
+ if (IS_ENABLED(CONFIG_44x)) {
+ /*
+ * Flash invalidate on 44x because we are passed kmapped
+ * addresses and this doesn't work for userspace pages due to
+ * the virtually tagged icache.
+ */
+ iccci((void *)start);
+ mb(); /* sync */
+ isync();
+ } else
+ invalidate_icache_range(start, stop);
+}
+EXPORT_SYMBOL(flush_icache_range);
+
+#ifdef CONFIG_HIGHMEM
+/**
+ * flush_dcache_icache_phys() - Flush a page by its physical address
+ * @physaddr: the physical address of the page
+ */
+static void flush_dcache_icache_phys(unsigned long physaddr)
+{
+ unsigned long bytes = l1_dcache_bytes();
+ unsigned long nb = PAGE_SIZE / bytes;
+ unsigned long addr = physaddr & PAGE_MASK;
+ unsigned long msr, msr0;
+ unsigned long loop1 = addr, loop2 = addr;
+
+ msr0 = mfmsr();
+ msr = msr0 & ~MSR_DR;
+ /*
+ * This must remain as ASM to prevent potential memory accesses
+ * while the data MMU is disabled
+ */
+ asm volatile(
+ " mtctr %2;\n"
+ " mtmsr %3;\n"
+ " isync;\n"
+ "0: dcbst 0, %0;\n"
+ " addi %0, %0, %4;\n"
+ " bdnz 0b;\n"
+ " sync;\n"
+ " mtctr %2;\n"
+ "1: icbi 0, %1;\n"
+ " addi %1, %1, %4;\n"
+ " bdnz 1b;\n"
+ " sync;\n"
+ " mtmsr %5;\n"
+ " isync;\n"
+ : "+&r" (loop1), "+&r" (loop2)
+ : "r" (nb), "r" (msr), "i" (bytes), "r" (msr0)
+ : "ctr", "memory");
+}
+NOKPROBE_SYMBOL(flush_dcache_icache_phys)
+#else
+static void flush_dcache_icache_phys(unsigned long physaddr)
+{
+}
+#endif
+
+/**
+ * __flush_dcache_icache(): Flush a particular page from the data cache to RAM.
+ * Note: this is necessary because the instruction cache does *not*
+ * snoop from the data cache.
+ *
+ * @p: the address of the page to flush
+ */
+static void __flush_dcache_icache(void *p)
+{
+ unsigned long addr = (unsigned long)p & PAGE_MASK;
+
+ clean_dcache_range(addr, addr + PAGE_SIZE);
+
+ /*
+ * We don't flush the icache on 44x. Those have a virtual icache and we
+ * don't have access to the virtual address here (it's not the page
+ * vaddr but where it's mapped in user space). The flushing of the
+ * icache on these is handled elsewhere, when a change in the address
+ * space occurs, before returning to user space.
+ */
+
+ if (mmu_has_feature(MMU_FTR_TYPE_44x))
+ return;
+
+ invalidate_icache_range(addr, addr + PAGE_SIZE);
+}
+
+void flush_dcache_icache_folio(struct folio *folio)
+{
+ unsigned int i, nr = folio_nr_pages(folio);
+
+ if (flush_coherent_icache())
+ return;
+
+ if (!folio_test_highmem(folio)) {
+ void *addr = folio_address(folio);
+ for (i = 0; i < nr; i++)
+ __flush_dcache_icache(addr + i * PAGE_SIZE);
+ } else if (IS_ENABLED(CONFIG_BOOKE) || sizeof(phys_addr_t) > sizeof(void *)) {
+ for (i = 0; i < nr; i++) {
+ void *start = kmap_local_folio(folio, i * PAGE_SIZE);
+
+ __flush_dcache_icache(start);
+ kunmap_local(start);
+ }
+ } else {
+ unsigned long pfn = folio_pfn(folio);
+ for (i = 0; i < nr; i++)
+ flush_dcache_icache_phys((pfn + i) * PAGE_SIZE);
+ }
+}
+EXPORT_SYMBOL(flush_dcache_icache_folio);
+
+void clear_user_page(void *page, unsigned long vaddr, struct page *pg)
+{
+ clear_page(page);
+
+ /*
+ * We shouldn't have to do this, but some versions of glibc
+ * require it (ld.so assumes zero filled pages are icache clean)
+ * - Anton
+ */
+ flush_dcache_page(pg);
+}
+EXPORT_SYMBOL(clear_user_page);
+
+void copy_user_page(void *vto, void *vfrom, unsigned long vaddr,
+ struct page *pg)
+{
+ copy_page(vto, vfrom);
+
+ /*
+ * We should be able to use the following optimisation, however
+ * there are two problems.
+ * Firstly a bug in some versions of binutils meant PLT sections
+ * were not marked executable.
+ * Secondly the first word in the GOT section is blrl, used
+ * to establish the GOT address. Until recently the GOT was
+ * not marked executable.
+ * - Anton
+ */
+#if 0
+ if (!vma->vm_file && ((vma->vm_flags & VM_EXEC) == 0))
+ return;
+#endif
+
+ flush_dcache_page(pg);
+}
+
+void flush_icache_user_page(struct vm_area_struct *vma, struct page *page,
+ unsigned long addr, int len)
+{
+ void *maddr;
+
+ maddr = kmap_local_page(page) + (addr & ~PAGE_MASK);
+ flush_icache_range((unsigned long)maddr, (unsigned long)maddr + len);
+ kunmap_local(maddr);
+}
diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c
new file mode 100644
index 000000000000..f5f8692e2c69
--- /dev/null
+++ b/arch/powerpc/mm/copro_fault.c
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * CoProcessor (SPU/AFU) mm fault handler
+ *
+ * (C) Copyright IBM Deutschland Entwicklung GmbH 2007
+ *
+ * Author: Arnd Bergmann <arndb@de.ibm.com>
+ * Author: Jeremy Kerr <jk@ozlabs.org>
+ */
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/export.h>
+#include <asm/reg.h>
+#include <asm/copro.h>
+
+/*
+ * This ought to be kept in sync with the powerpc specific do_page_fault
+ * function. Currently, there are a few corner cases that we haven't had
+ * to handle fortunately.
+ */
+int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea,
+ unsigned long dsisr, vm_fault_t *flt)
+{
+ struct vm_area_struct *vma;
+ unsigned long is_write;
+ int ret;
+
+ if (mm == NULL)
+ return -EFAULT;
+
+ if (mm->pgd == NULL)
+ return -EFAULT;
+
+ vma = lock_mm_and_find_vma(mm, ea, NULL);
+ if (!vma)
+ return -EFAULT;
+
+ ret = -EFAULT;
+ is_write = dsisr & DSISR_ISSTORE;
+ if (is_write) {
+ if (!(vma->vm_flags & VM_WRITE))
+ goto out_unlock;
+ } else {
+ if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
+ goto out_unlock;
+ /*
+ * PROT_NONE is covered by the VMA check above.
+ * and hash should get a NOHPTE fault instead of
+ * a PROTFAULT in case fixup is needed for things
+ * like autonuma.
+ */
+ if (!radix_enabled())
+ WARN_ON_ONCE(dsisr & DSISR_PROTFAULT);
+ }
+
+ ret = 0;
+ *flt = handle_mm_fault(vma, ea, is_write ? FAULT_FLAG_WRITE : 0, NULL);
+
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (*flt & VM_FAULT_COMPLETED)
+ return 0;
+
+ if (unlikely(*flt & VM_FAULT_ERROR)) {
+ if (*flt & VM_FAULT_OOM) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ } else if (*flt & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) {
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+ BUG();
+ }
+
+out_unlock:
+ mmap_read_unlock(mm);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(copro_handle_mm_fault);
+
+#ifdef CONFIG_PPC_64S_HASH_MMU
+int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb)
+{
+ u64 vsid, vsidkey;
+ int psize, ssize;
+
+ switch (get_region_id(ea)) {
+ case USER_REGION_ID:
+ pr_devel("%s: 0x%llx -- USER_REGION_ID\n", __func__, ea);
+ if (mm == NULL)
+ return 1;
+ psize = get_slice_psize(mm, ea);
+ ssize = user_segment_size(ea);
+ vsid = get_user_vsid(&mm->context, ea, ssize);
+ vsidkey = SLB_VSID_USER;
+ break;
+ case VMALLOC_REGION_ID:
+ pr_devel("%s: 0x%llx -- VMALLOC_REGION_ID\n", __func__, ea);
+ psize = mmu_vmalloc_psize;
+ ssize = mmu_kernel_ssize;
+ vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
+ vsidkey = SLB_VSID_KERNEL;
+ break;
+ case IO_REGION_ID:
+ pr_devel("%s: 0x%llx -- IO_REGION_ID\n", __func__, ea);
+ psize = mmu_io_psize;
+ ssize = mmu_kernel_ssize;
+ vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
+ vsidkey = SLB_VSID_KERNEL;
+ break;
+ case LINEAR_MAP_REGION_ID:
+ pr_devel("%s: 0x%llx -- LINEAR_MAP_REGION_ID\n", __func__, ea);
+ psize = mmu_linear_psize;
+ ssize = mmu_kernel_ssize;
+ vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
+ vsidkey = SLB_VSID_KERNEL;
+ break;
+ default:
+ pr_debug("%s: invalid region access at %016llx\n", __func__, ea);
+ return 1;
+ }
+ /* Bad address */
+ if (!vsid)
+ return 1;
+
+ vsid = (vsid << slb_vsid_shift(ssize)) | vsidkey;
+
+ vsid |= mmu_psize_defs[psize].sllp |
+ ((ssize == MMU_SEGSIZE_1T) ? SLB_VSID_B_1T : 0);
+
+ slb->esid = (ea & (ssize == MMU_SEGSIZE_1T ? ESID_MASK_1T : ESID_MASK)) | SLB_ESID_V;
+ slb->vsid = vsid;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(copro_calculate_slb);
+#endif
diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c
index 6747eece84af..30260b5d146d 100644
--- a/arch/powerpc/mm/dma-noncoherent.c
+++ b/arch/powerpc/mm/dma-noncoherent.c
@@ -1,321 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* PowerPC version derived from arch/arm/mm/consistent.c
* Copyright (C) 2001 Dan Malek (dmalek@jlc.net)
*
* Copyright (C) 2000 Russell King
- *
- * Consistent memory allocators. Used for DMA devices that want to
- * share uncached memory with the processor core. The function return
- * is the virtual address and 'dma_handle' is the physical address.
- * Mostly stolen from the ARM port, with some changes for PowerPC.
- * -- Dan
- *
- * Reorganized to get rid of the arch-specific consistent_* functions
- * and provide non-coherent implementations for the DMA API. -Matt
- *
- * Added in_interrupt() safe dma_alloc_coherent()/dma_free_coherent()
- * implementation. This is pulled straight from ARM and barely
- * modified. -Matt
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
-#include <linux/sched.h>
-#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/errno.h>
-#include <linux/string.h>
#include <linux/types.h>
#include <linux/highmem.h>
-#include <linux/dma-mapping.h>
-#include <linux/export.h>
+#include <linux/dma-direct.h>
+#include <linux/dma-map-ops.h>
#include <asm/tlbflush.h>
-
-#include "mmu_decl.h"
-
-/*
- * This address range defaults to a value that is safe for all
- * platforms which currently set CONFIG_NOT_COHERENT_CACHE. It
- * can be further configured for specific applications under
- * the "Advanced Setup" menu. -Matt
- */
-#define CONSISTENT_BASE (IOREMAP_TOP)
-#define CONSISTENT_END (CONSISTENT_BASE + CONFIG_CONSISTENT_SIZE)
-#define CONSISTENT_OFFSET(x) (((unsigned long)(x) - CONSISTENT_BASE) >> PAGE_SHIFT)
-
-/*
- * This is the page table (2MB) covering uncached, DMA consistent allocations
- */
-static DEFINE_SPINLOCK(consistent_lock);
-
-/*
- * VM region handling support.
- *
- * This should become something generic, handling VM region allocations for
- * vmalloc and similar (ioremap, module space, etc).
- *
- * I envisage vmalloc()'s supporting vm_struct becoming:
- *
- * struct vm_struct {
- * struct vm_region region;
- * unsigned long flags;
- * struct page **pages;
- * unsigned int nr_pages;
- * unsigned long phys_addr;
- * };
- *
- * get_vm_area() would then call vm_region_alloc with an appropriate
- * struct vm_region head (eg):
- *
- * struct vm_region vmalloc_head = {
- * .vm_list = LIST_HEAD_INIT(vmalloc_head.vm_list),
- * .vm_start = VMALLOC_START,
- * .vm_end = VMALLOC_END,
- * };
- *
- * However, vmalloc_head.vm_start is variable (typically, it is dependent on
- * the amount of RAM found at boot time.) I would imagine that get_vm_area()
- * would have to initialise this each time prior to calling vm_region_alloc().
- */
-struct ppc_vm_region {
- struct list_head vm_list;
- unsigned long vm_start;
- unsigned long vm_end;
-};
-
-static struct ppc_vm_region consistent_head = {
- .vm_list = LIST_HEAD_INIT(consistent_head.vm_list),
- .vm_start = CONSISTENT_BASE,
- .vm_end = CONSISTENT_END,
-};
-
-static struct ppc_vm_region *
-ppc_vm_region_alloc(struct ppc_vm_region *head, size_t size, gfp_t gfp)
-{
- unsigned long addr = head->vm_start, end = head->vm_end - size;
- unsigned long flags;
- struct ppc_vm_region *c, *new;
-
- new = kmalloc(sizeof(struct ppc_vm_region), gfp);
- if (!new)
- goto out;
-
- spin_lock_irqsave(&consistent_lock, flags);
-
- list_for_each_entry(c, &head->vm_list, vm_list) {
- if ((addr + size) < addr)
- goto nospc;
- if ((addr + size) <= c->vm_start)
- goto found;
- addr = c->vm_end;
- if (addr > end)
- goto nospc;
- }
-
- found:
- /*
- * Insert this entry _before_ the one we found.
- */
- list_add_tail(&new->vm_list, &c->vm_list);
- new->vm_start = addr;
- new->vm_end = addr + size;
-
- spin_unlock_irqrestore(&consistent_lock, flags);
- return new;
-
- nospc:
- spin_unlock_irqrestore(&consistent_lock, flags);
- kfree(new);
- out:
- return NULL;
-}
-
-static struct ppc_vm_region *ppc_vm_region_find(struct ppc_vm_region *head, unsigned long addr)
-{
- struct ppc_vm_region *c;
-
- list_for_each_entry(c, &head->vm_list, vm_list) {
- if (c->vm_start == addr)
- goto out;
- }
- c = NULL;
- out:
- return c;
-}
-
-/*
- * Allocate DMA-coherent memory space and return both the kernel remapped
- * virtual and bus address for that space.
- */
-void *
-__dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp)
-{
- struct page *page;
- struct ppc_vm_region *c;
- unsigned long order;
- u64 mask = ISA_DMA_THRESHOLD, limit;
-
- if (dev) {
- mask = dev->coherent_dma_mask;
-
- /*
- * Sanity check the DMA mask - it must be non-zero, and
- * must be able to be satisfied by a DMA allocation.
- */
- if (mask == 0) {
- dev_warn(dev, "coherent DMA mask is unset\n");
- goto no_page;
- }
-
- if ((~mask) & ISA_DMA_THRESHOLD) {
- dev_warn(dev, "coherent DMA mask %#llx is smaller "
- "than system GFP_DMA mask %#llx\n",
- mask, (unsigned long long)ISA_DMA_THRESHOLD);
- goto no_page;
- }
- }
-
-
- size = PAGE_ALIGN(size);
- limit = (mask + 1) & ~mask;
- if ((limit && size >= limit) ||
- size >= (CONSISTENT_END - CONSISTENT_BASE)) {
- printk(KERN_WARNING "coherent allocation too big (requested %#x mask %#Lx)\n",
- size, mask);
- return NULL;
- }
-
- order = get_order(size);
-
- /* Might be useful if we ever have a real legacy DMA zone... */
- if (mask != 0xffffffff)
- gfp |= GFP_DMA;
-
- page = alloc_pages(gfp, order);
- if (!page)
- goto no_page;
-
- /*
- * Invalidate any data that might be lurking in the
- * kernel direct-mapped region for device DMA.
- */
- {
- unsigned long kaddr = (unsigned long)page_address(page);
- memset(page_address(page), 0, size);
- flush_dcache_range(kaddr, kaddr + size);
- }
-
- /*
- * Allocate a virtual address in the consistent mapping region.
- */
- c = ppc_vm_region_alloc(&consistent_head, size,
- gfp & ~(__GFP_DMA | __GFP_HIGHMEM));
- if (c) {
- unsigned long vaddr = c->vm_start;
- struct page *end = page + (1 << order);
-
- split_page(page, order);
-
- /*
- * Set the "dma handle"
- */
- *handle = page_to_phys(page);
-
- do {
- SetPageReserved(page);
- map_page(vaddr, page_to_phys(page),
- pgprot_noncached(PAGE_KERNEL));
- page++;
- vaddr += PAGE_SIZE;
- } while (size -= PAGE_SIZE);
-
- /*
- * Free the otherwise unused pages.
- */
- while (page < end) {
- __free_page(page);
- page++;
- }
-
- return (void *)c->vm_start;
- }
-
- if (page)
- __free_pages(page, order);
- no_page:
- return NULL;
-}
-EXPORT_SYMBOL(__dma_alloc_coherent);
-
-/*
- * free a page as defined by the above mapping.
- */
-void __dma_free_coherent(size_t size, void *vaddr)
-{
- struct ppc_vm_region *c;
- unsigned long flags, addr;
-
- size = PAGE_ALIGN(size);
-
- spin_lock_irqsave(&consistent_lock, flags);
-
- c = ppc_vm_region_find(&consistent_head, (unsigned long)vaddr);
- if (!c)
- goto no_area;
-
- if ((c->vm_end - c->vm_start) != size) {
- printk(KERN_ERR "%s: freeing wrong coherent size (%ld != %d)\n",
- __func__, c->vm_end - c->vm_start, size);
- dump_stack();
- size = c->vm_end - c->vm_start;
- }
-
- addr = c->vm_start;
- do {
- pte_t *ptep;
- unsigned long pfn;
-
- ptep = pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(addr),
- addr),
- addr),
- addr);
- if (!pte_none(*ptep) && pte_present(*ptep)) {
- pfn = pte_pfn(*ptep);
- pte_clear(&init_mm, addr, ptep);
- if (pfn_valid(pfn)) {
- struct page *page = pfn_to_page(pfn);
-
- ClearPageReserved(page);
- __free_page(page);
- }
- }
- addr += PAGE_SIZE;
- } while (size -= PAGE_SIZE);
-
- flush_tlb_kernel_range(c->vm_start, c->vm_end);
-
- list_del(&c->vm_list);
-
- spin_unlock_irqrestore(&consistent_lock, flags);
-
- kfree(c);
- return;
-
- no_area:
- spin_unlock_irqrestore(&consistent_lock, flags);
- printk(KERN_ERR "%s: trying to free invalid coherent area: %p\n",
- __func__, vaddr);
- dump_stack();
-}
-EXPORT_SYMBOL(__dma_free_coherent);
+#include <asm/dma.h>
/*
* make an area consistent.
*/
-void __dma_sync(void *vaddr, size_t size, int direction)
+static void __dma_sync(void *vaddr, size_t size, int direction)
{
unsigned long start = (unsigned long)vaddr;
unsigned long end = start + size;
@@ -328,7 +32,7 @@ void __dma_sync(void *vaddr, size_t size, int direction)
* invalidate only when cache-line aligned otherwise there is
* the potential for discarding uncommitted data from the cache
*/
- if ((start & (L1_CACHE_BYTES - 1)) || (size & (L1_CACHE_BYTES - 1)))
+ if ((start | end) & (L1_CACHE_BYTES - 1))
flush_dcache_range(start, end);
else
invalidate_dcache_range(start, end);
@@ -341,7 +45,6 @@ void __dma_sync(void *vaddr, size_t size, int direction)
break;
}
}
-EXPORT_SYMBOL(__dma_sync);
#ifdef CONFIG_HIGHMEM
/*
@@ -388,34 +91,34 @@ static inline void __dma_sync_page_highmem(struct page *page,
* __dma_sync_page makes memory consistent. identical to __dma_sync, but
* takes a struct page instead of a virtual address
*/
-void __dma_sync_page(struct page *page, unsigned long offset,
- size_t size, int direction)
+static void __dma_sync_page(phys_addr_t paddr, size_t size, int dir)
{
+ struct page *page = pfn_to_page(paddr >> PAGE_SHIFT);
+ unsigned offset = paddr & ~PAGE_MASK;
+
#ifdef CONFIG_HIGHMEM
- __dma_sync_page_highmem(page, offset, size, direction);
+ __dma_sync_page_highmem(page, offset, size, dir);
#else
unsigned long start = (unsigned long)page_address(page) + offset;
- __dma_sync((void *)start, size, direction);
+ __dma_sync((void *)start, size, dir);
#endif
}
-EXPORT_SYMBOL(__dma_sync_page);
-/*
- * Return the PFN for a given cpu virtual address returned by
- * __dma_alloc_coherent. This is used by dma_mmap_coherent()
- */
-unsigned long __dma_get_coherent_pfn(unsigned long cpu_addr)
+void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
+ enum dma_data_direction dir)
+{
+ __dma_sync_page(paddr, size, dir);
+}
+
+void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
+ enum dma_data_direction dir)
+{
+ __dma_sync_page(paddr, size, dir);
+}
+
+void arch_dma_prep_coherent(struct page *page, size_t size)
{
- /* This should always be populated, so we don't test every
- * level. If that fails, we'll have a nice crash which
- * will be as good as a BUG_ON()
- */
- pgd_t *pgd = pgd_offset_k(cpu_addr);
- pud_t *pud = pud_offset(pgd, cpu_addr);
- pmd_t *pmd = pmd_offset(pud, cpu_addr);
- pte_t *ptep = pte_offset_kernel(pmd, cpu_addr);
+ unsigned long kaddr = (unsigned long)page_address(page);
- if (pte_none(*ptep) || !pte_present(*ptep))
- return 0;
- return pte_pfn(*ptep);
+ flush_dcache_range(kaddr, kaddr + size);
}
diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c
new file mode 100644
index 000000000000..8dd7b340d51f
--- /dev/null
+++ b/arch/powerpc/mm/drmem.c
@@ -0,0 +1,514 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Dynamic reconfiguration memory support
+ *
+ * Copyright 2017 IBM Corporation
+ */
+
+#define pr_fmt(fmt) "drmem: " fmt
+
+#include <linux/kernel.h>
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <linux/memblock.h>
+#include <linux/slab.h>
+#include <asm/drmem.h>
+
+static int n_root_addr_cells, n_root_size_cells;
+
+static struct drmem_lmb_info __drmem_info;
+struct drmem_lmb_info *drmem_info = &__drmem_info;
+static bool in_drmem_update;
+
+u64 drmem_lmb_memory_max(void)
+{
+ struct drmem_lmb *last_lmb;
+
+ last_lmb = &drmem_info->lmbs[drmem_info->n_lmbs - 1];
+ return last_lmb->base_addr + drmem_lmb_size();
+}
+
+static u32 drmem_lmb_flags(struct drmem_lmb *lmb)
+{
+ /*
+ * Return the value of the lmb flags field minus the reserved
+ * bit used internally for hotplug processing.
+ */
+ return lmb->flags & ~DRMEM_LMB_RESERVED;
+}
+
+static struct property *clone_property(struct property *prop, u32 prop_sz)
+{
+ struct property *new_prop;
+
+ new_prop = kzalloc(sizeof(*new_prop), GFP_KERNEL);
+ if (!new_prop)
+ return NULL;
+
+ new_prop->name = kstrdup(prop->name, GFP_KERNEL);
+ new_prop->value = kzalloc(prop_sz, GFP_KERNEL);
+ if (!new_prop->name || !new_prop->value) {
+ kfree(new_prop->name);
+ kfree(new_prop->value);
+ kfree(new_prop);
+ return NULL;
+ }
+
+ new_prop->length = prop_sz;
+#if defined(CONFIG_OF_DYNAMIC)
+ of_property_set_flag(new_prop, OF_DYNAMIC);
+#endif
+ return new_prop;
+}
+
+static int drmem_update_dt_v1(struct device_node *memory,
+ struct property *prop)
+{
+ struct property *new_prop;
+ struct of_drconf_cell_v1 *dr_cell;
+ struct drmem_lmb *lmb;
+ __be32 *p;
+
+ new_prop = clone_property(prop, prop->length);
+ if (!new_prop)
+ return -1;
+
+ p = new_prop->value;
+ *p++ = cpu_to_be32(drmem_info->n_lmbs);
+
+ dr_cell = (struct of_drconf_cell_v1 *)p;
+
+ for_each_drmem_lmb(lmb) {
+ dr_cell->base_addr = cpu_to_be64(lmb->base_addr);
+ dr_cell->drc_index = cpu_to_be32(lmb->drc_index);
+ dr_cell->aa_index = cpu_to_be32(lmb->aa_index);
+ dr_cell->flags = cpu_to_be32(drmem_lmb_flags(lmb));
+
+ dr_cell++;
+ }
+
+ of_update_property(memory, new_prop);
+ return 0;
+}
+
+static void init_drconf_v2_cell(struct of_drconf_cell_v2 *dr_cell,
+ struct drmem_lmb *lmb)
+{
+ dr_cell->base_addr = cpu_to_be64(lmb->base_addr);
+ dr_cell->drc_index = cpu_to_be32(lmb->drc_index);
+ dr_cell->aa_index = cpu_to_be32(lmb->aa_index);
+ dr_cell->flags = cpu_to_be32(drmem_lmb_flags(lmb));
+}
+
+static int drmem_update_dt_v2(struct device_node *memory,
+ struct property *prop)
+{
+ struct property *new_prop;
+ struct of_drconf_cell_v2 *dr_cell;
+ struct drmem_lmb *lmb, *prev_lmb;
+ u32 lmb_sets, prop_sz, seq_lmbs;
+ u32 *p;
+
+ /* First pass, determine how many LMB sets are needed. */
+ lmb_sets = 0;
+ prev_lmb = NULL;
+ for_each_drmem_lmb(lmb) {
+ if (!prev_lmb) {
+ prev_lmb = lmb;
+ lmb_sets++;
+ continue;
+ }
+
+ if (prev_lmb->aa_index != lmb->aa_index ||
+ drmem_lmb_flags(prev_lmb) != drmem_lmb_flags(lmb))
+ lmb_sets++;
+
+ prev_lmb = lmb;
+ }
+
+ prop_sz = lmb_sets * sizeof(*dr_cell) + sizeof(__be32);
+ new_prop = clone_property(prop, prop_sz);
+ if (!new_prop)
+ return -1;
+
+ p = new_prop->value;
+ *p++ = cpu_to_be32(lmb_sets);
+
+ dr_cell = (struct of_drconf_cell_v2 *)p;
+
+ /* Second pass, populate the LMB set data */
+ prev_lmb = NULL;
+ seq_lmbs = 0;
+ for_each_drmem_lmb(lmb) {
+ if (prev_lmb == NULL) {
+ /* Start of first LMB set */
+ prev_lmb = lmb;
+ init_drconf_v2_cell(dr_cell, lmb);
+ seq_lmbs++;
+ continue;
+ }
+
+ if (prev_lmb->aa_index != lmb->aa_index ||
+ drmem_lmb_flags(prev_lmb) != drmem_lmb_flags(lmb)) {
+ /* end of one set, start of another */
+ dr_cell->seq_lmbs = cpu_to_be32(seq_lmbs);
+ dr_cell++;
+
+ init_drconf_v2_cell(dr_cell, lmb);
+ seq_lmbs = 1;
+ } else {
+ seq_lmbs++;
+ }
+
+ prev_lmb = lmb;
+ }
+
+ /* close out last LMB set */
+ dr_cell->seq_lmbs = cpu_to_be32(seq_lmbs);
+ of_update_property(memory, new_prop);
+ return 0;
+}
+
+int drmem_update_dt(void)
+{
+ struct device_node *memory;
+ struct property *prop;
+ int rc = -1;
+
+ memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
+ if (!memory)
+ return -1;
+
+ /*
+ * Set in_drmem_update to prevent the notifier callback to process the
+ * DT property back since the change is coming from the LMB tree.
+ */
+ in_drmem_update = true;
+ prop = of_find_property(memory, "ibm,dynamic-memory", NULL);
+ if (prop) {
+ rc = drmem_update_dt_v1(memory, prop);
+ } else {
+ prop = of_find_property(memory, "ibm,dynamic-memory-v2", NULL);
+ if (prop)
+ rc = drmem_update_dt_v2(memory, prop);
+ }
+ in_drmem_update = false;
+
+ of_node_put(memory);
+ return rc;
+}
+
+static void read_drconf_v1_cell(struct drmem_lmb *lmb,
+ const __be32 **prop)
+{
+ const __be32 *p = *prop;
+
+ lmb->base_addr = of_read_number(p, n_root_addr_cells);
+ p += n_root_addr_cells;
+ lmb->drc_index = of_read_number(p++, 1);
+
+ p++; /* skip reserved field */
+
+ lmb->aa_index = of_read_number(p++, 1);
+ lmb->flags = of_read_number(p++, 1);
+
+ *prop = p;
+}
+
+static int
+__walk_drmem_v1_lmbs(const __be32 *prop, const __be32 *usm, void *data,
+ int (*func)(struct drmem_lmb *, const __be32 **, void *))
+{
+ struct drmem_lmb lmb;
+ u32 i, n_lmbs;
+ int ret = 0;
+
+ n_lmbs = of_read_number(prop++, 1);
+ for (i = 0; i < n_lmbs; i++) {
+ read_drconf_v1_cell(&lmb, &prop);
+ ret = func(&lmb, &usm, data);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static void read_drconf_v2_cell(struct of_drconf_cell_v2 *dr_cell,
+ const __be32 **prop)
+{
+ const __be32 *p = *prop;
+
+ dr_cell->seq_lmbs = of_read_number(p++, 1);
+ dr_cell->base_addr = of_read_number(p, n_root_addr_cells);
+ p += n_root_addr_cells;
+ dr_cell->drc_index = of_read_number(p++, 1);
+ dr_cell->aa_index = of_read_number(p++, 1);
+ dr_cell->flags = of_read_number(p++, 1);
+
+ *prop = p;
+}
+
+static int
+__walk_drmem_v2_lmbs(const __be32 *prop, const __be32 *usm, void *data,
+ int (*func)(struct drmem_lmb *, const __be32 **, void *))
+{
+ struct of_drconf_cell_v2 dr_cell;
+ struct drmem_lmb lmb;
+ u32 i, j, lmb_sets;
+ int ret = 0;
+
+ lmb_sets = of_read_number(prop++, 1);
+ for (i = 0; i < lmb_sets; i++) {
+ read_drconf_v2_cell(&dr_cell, &prop);
+
+ for (j = 0; j < dr_cell.seq_lmbs; j++) {
+ lmb.base_addr = dr_cell.base_addr;
+ dr_cell.base_addr += drmem_lmb_size();
+
+ lmb.drc_index = dr_cell.drc_index;
+ dr_cell.drc_index++;
+
+ lmb.aa_index = dr_cell.aa_index;
+ lmb.flags = dr_cell.flags;
+
+ ret = func(&lmb, &usm, data);
+ if (ret)
+ break;
+ }
+ }
+
+ return ret;
+}
+
+#ifdef CONFIG_PPC_PSERIES
+int __init walk_drmem_lmbs_early(unsigned long node, void *data,
+ int (*func)(struct drmem_lmb *, const __be32 **, void *))
+{
+ const __be32 *prop, *usm;
+ int len, ret = -ENODEV;
+
+ prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &len);
+ if (!prop || len < dt_root_size_cells * sizeof(__be32))
+ return ret;
+
+ /* Get the address & size cells */
+ n_root_addr_cells = dt_root_addr_cells;
+ n_root_size_cells = dt_root_size_cells;
+
+ drmem_info->lmb_size = dt_mem_next_cell(dt_root_size_cells, &prop);
+
+ usm = of_get_flat_dt_prop(node, "linux,drconf-usable-memory", &len);
+
+ prop = of_get_flat_dt_prop(node, "ibm,dynamic-memory", &len);
+ if (prop) {
+ ret = __walk_drmem_v1_lmbs(prop, usm, data, func);
+ } else {
+ prop = of_get_flat_dt_prop(node, "ibm,dynamic-memory-v2",
+ &len);
+ if (prop)
+ ret = __walk_drmem_v2_lmbs(prop, usm, data, func);
+ }
+
+ memblock_dump_all();
+ return ret;
+}
+
+/*
+ * Update the LMB associativity index.
+ */
+static int update_lmb(struct drmem_lmb *updated_lmb,
+ __maybe_unused const __be32 **usm,
+ __maybe_unused void *data)
+{
+ struct drmem_lmb *lmb;
+
+ for_each_drmem_lmb(lmb) {
+ if (lmb->drc_index != updated_lmb->drc_index)
+ continue;
+
+ lmb->aa_index = updated_lmb->aa_index;
+ break;
+ }
+ return 0;
+}
+
+/*
+ * Update the LMB associativity index.
+ *
+ * This needs to be called when the hypervisor is updating the
+ * dynamic-reconfiguration-memory node property.
+ */
+void drmem_update_lmbs(struct property *prop)
+{
+ /*
+ * Don't update the LMBs if triggered by the update done in
+ * drmem_update_dt(), the LMB values have been used to the update the DT
+ * property in that case.
+ */
+ if (in_drmem_update)
+ return;
+ if (!strcmp(prop->name, "ibm,dynamic-memory"))
+ __walk_drmem_v1_lmbs(prop->value, NULL, NULL, update_lmb);
+ else if (!strcmp(prop->name, "ibm,dynamic-memory-v2"))
+ __walk_drmem_v2_lmbs(prop->value, NULL, NULL, update_lmb);
+}
+#endif
+
+static int init_drmem_lmb_size(struct device_node *dn)
+{
+ const __be32 *prop;
+ int len;
+
+ if (drmem_info->lmb_size)
+ return 0;
+
+ prop = of_get_property(dn, "ibm,lmb-size", &len);
+ if (!prop || len < n_root_size_cells * sizeof(__be32)) {
+ pr_info("Could not determine LMB size\n");
+ return -1;
+ }
+
+ drmem_info->lmb_size = of_read_number(prop, n_root_size_cells);
+ return 0;
+}
+
+/*
+ * Returns the property linux,drconf-usable-memory if
+ * it exists (the property exists only in kexec/kdump kernels,
+ * added by kexec-tools)
+ */
+static const __be32 *of_get_usable_memory(struct device_node *dn)
+{
+ const __be32 *prop;
+ u32 len;
+
+ prop = of_get_property(dn, "linux,drconf-usable-memory", &len);
+ if (!prop || len < sizeof(unsigned int))
+ return NULL;
+
+ return prop;
+}
+
+int walk_drmem_lmbs(struct device_node *dn, void *data,
+ int (*func)(struct drmem_lmb *, const __be32 **, void *))
+{
+ struct device_node *root = of_find_node_by_path("/");
+ const __be32 *prop, *usm;
+ int ret = -ENODEV;
+
+ if (!root)
+ return ret;
+
+ /* Get the address & size cells */
+ n_root_addr_cells = of_n_addr_cells(root);
+ n_root_size_cells = of_n_size_cells(root);
+ of_node_put(root);
+
+ if (init_drmem_lmb_size(dn))
+ return ret;
+
+ usm = of_get_usable_memory(dn);
+
+ prop = of_get_property(dn, "ibm,dynamic-memory", NULL);
+ if (prop) {
+ ret = __walk_drmem_v1_lmbs(prop, usm, data, func);
+ } else {
+ prop = of_get_property(dn, "ibm,dynamic-memory-v2", NULL);
+ if (prop)
+ ret = __walk_drmem_v2_lmbs(prop, usm, data, func);
+ }
+
+ return ret;
+}
+
+static void __init init_drmem_v1_lmbs(const __be32 *prop)
+{
+ struct drmem_lmb *lmb;
+
+ drmem_info->n_lmbs = of_read_number(prop++, 1);
+ if (drmem_info->n_lmbs == 0)
+ return;
+
+ drmem_info->lmbs = kcalloc(drmem_info->n_lmbs, sizeof(*lmb),
+ GFP_KERNEL);
+ if (!drmem_info->lmbs)
+ return;
+
+ for_each_drmem_lmb(lmb)
+ read_drconf_v1_cell(lmb, &prop);
+}
+
+static void __init init_drmem_v2_lmbs(const __be32 *prop)
+{
+ struct drmem_lmb *lmb;
+ struct of_drconf_cell_v2 dr_cell;
+ const __be32 *p;
+ u32 i, j, lmb_sets;
+ int lmb_index;
+
+ lmb_sets = of_read_number(prop++, 1);
+ if (lmb_sets == 0)
+ return;
+
+ /* first pass, calculate the number of LMBs */
+ p = prop;
+ for (i = 0; i < lmb_sets; i++) {
+ read_drconf_v2_cell(&dr_cell, &p);
+ drmem_info->n_lmbs += dr_cell.seq_lmbs;
+ }
+
+ drmem_info->lmbs = kcalloc(drmem_info->n_lmbs, sizeof(*lmb),
+ GFP_KERNEL);
+ if (!drmem_info->lmbs)
+ return;
+
+ /* second pass, read in the LMB information */
+ lmb_index = 0;
+ p = prop;
+
+ for (i = 0; i < lmb_sets; i++) {
+ read_drconf_v2_cell(&dr_cell, &p);
+
+ for (j = 0; j < dr_cell.seq_lmbs; j++) {
+ lmb = &drmem_info->lmbs[lmb_index++];
+
+ lmb->base_addr = dr_cell.base_addr;
+ dr_cell.base_addr += drmem_info->lmb_size;
+
+ lmb->drc_index = dr_cell.drc_index;
+ dr_cell.drc_index++;
+
+ lmb->aa_index = dr_cell.aa_index;
+ lmb->flags = dr_cell.flags;
+ }
+ }
+}
+
+static int __init drmem_init(void)
+{
+ struct device_node *dn;
+ const __be32 *prop;
+
+ dn = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
+ if (!dn)
+ return 0;
+
+ if (init_drmem_lmb_size(dn)) {
+ of_node_put(dn);
+ return 0;
+ }
+
+ prop = of_get_property(dn, "ibm,dynamic-memory", NULL);
+ if (prop) {
+ init_drmem_v1_lmbs(prop);
+ } else {
+ prop = of_get_property(dn, "ibm,dynamic-memory-v2", NULL);
+ if (prop)
+ init_drmem_v2_lmbs(prop);
+ }
+
+ of_node_put(dn);
+ return 0;
+}
+late_initcall(drmem_init);
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 8726779e1409..806c74e0d5ab 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* PowerPC version
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
@@ -8,526 +9,689 @@
* Modified by Cort Dougan and Paul Mackerras.
*
* Modified for PPC64 by Dave Engebretsen (engebret@ibm.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/signal.h>
#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
+#include <linux/string_choices.h>
#include <linux/types.h>
+#include <linux/pagemap.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/highmem.h>
-#include <linux/module.h>
+#include <linux/extable.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/perf_event.h>
-#include <linux/magic.h>
#include <linux/ratelimit.h>
#include <linux/context_tracking.h>
+#include <linux/hugetlb.h>
+#include <linux/uaccess.h>
+#include <linux/kfence.h>
+#include <linux/pkeys.h>
#include <asm/firmware.h>
+#include <asm/interrupt.h>
#include <asm/page.h>
-#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/mmu_context.h>
-#include <asm/uaccess.h>
-#include <asm/tlbflush.h>
#include <asm/siginfo.h>
#include <asm/debug.h>
-#include <mm/mmu_decl.h>
+#include <asm/kup.h>
+#include <asm/inst.h>
+
-#include "icswx.h"
+/*
+ * do_page_fault error handling helpers
+ */
-#ifdef CONFIG_KPROBES
-static inline int notify_page_fault(struct pt_regs *regs)
+static int
+__bad_area_nosemaphore(struct pt_regs *regs, unsigned long address, int si_code)
{
- int ret = 0;
+ /*
+ * If we are in kernel mode, bail out with a SEGV, this will
+ * be caught by the assembly which will restore the non-volatile
+ * registers before calling bad_page_fault()
+ */
+ if (!user_mode(regs))
+ return SIGSEGV;
- /* kprobe_running() needs smp_processor_id() */
- if (!user_mode(regs)) {
- preempt_disable();
- if (kprobe_running() && kprobe_fault_handler(regs, 11))
- ret = 1;
- preempt_enable();
- }
+ _exception(SIGSEGV, regs, si_code, address);
- return ret;
+ return 0;
}
-#else
-static inline int notify_page_fault(struct pt_regs *regs)
+
+static noinline int bad_area_nosemaphore(struct pt_regs *regs, unsigned long address)
{
- return 0;
+ return __bad_area_nosemaphore(regs, address, SEGV_MAPERR);
}
-#endif
-/*
- * Check whether the instruction at regs->nip is a store using
- * an update addressing form which will update r1.
- */
-static int store_updates_sp(struct pt_regs *regs)
+static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code,
+ struct mm_struct *mm, struct vm_area_struct *vma)
{
- unsigned int inst;
- if (get_user(inst, (unsigned int __user *)regs->nip))
- return 0;
- /* check for 1 in the rA field */
- if (((inst >> 16) & 0x1f) != 1)
- return 0;
- /* check major opcode */
- switch (inst >> 26) {
- case 37: /* stwu */
- case 39: /* stbu */
- case 45: /* sthu */
- case 53: /* stfsu */
- case 55: /* stfdu */
- return 1;
- case 62: /* std or stdu */
- return (inst & 3) == 1;
- case 31:
- /* check minor opcode */
- switch ((inst >> 1) & 0x3ff) {
- case 181: /* stdux */
- case 183: /* stwux */
- case 247: /* stbux */
- case 439: /* sthux */
- case 695: /* stfsux */
- case 759: /* stfdux */
- return 1;
- }
- }
+ /*
+ * Something tried to access memory that isn't in our memory map..
+ * Fix it, but check if it's kernel or user first..
+ */
+ if (mm)
+ mmap_read_unlock(mm);
+ else
+ vma_end_read(vma);
+
+ return __bad_area_nosemaphore(regs, address, si_code);
+}
+
+static noinline int bad_access_pkey(struct pt_regs *regs, unsigned long address,
+ struct mm_struct *mm,
+ struct vm_area_struct *vma)
+{
+ int pkey;
+
+ /*
+ * We don't try to fetch the pkey from page table because reading
+ * page table without locking doesn't guarantee stable pte value.
+ * Hence the pkey value that we return to userspace can be different
+ * from the pkey that actually caused access error.
+ *
+ * It does *not* guarantee that the VMA we find here
+ * was the one that we faulted on.
+ *
+ * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4);
+ * 2. T1 : set AMR to deny access to pkey=4, touches, page
+ * 3. T1 : faults...
+ * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
+ * 5. T1 : enters fault handler, takes mmap_lock, etc...
+ * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really
+ * faulted on a pte with its pkey=4.
+ */
+ pkey = vma_pkey(vma);
+
+ if (mm)
+ mmap_read_unlock(mm);
+ else
+ vma_end_read(vma);
+
+ /*
+ * If we are in kernel mode, bail out with a SEGV, this will
+ * be caught by the assembly which will restore the non-volatile
+ * registers before calling bad_page_fault()
+ */
+ if (!user_mode(regs))
+ return SIGSEGV;
+
+ _exception_pkey(regs, address, pkey);
+
return 0;
}
-/*
- * do_page_fault error handling helpers
- */
-#define MM_FAULT_RETURN 0
-#define MM_FAULT_CONTINUE -1
-#define MM_FAULT_ERR(sig) (sig)
+static noinline int bad_access(struct pt_regs *regs, unsigned long address,
+ struct mm_struct *mm, struct vm_area_struct *vma)
+{
+ return __bad_area(regs, address, SEGV_ACCERR, mm, vma);
+}
-static int do_sigbus(struct pt_regs *regs, unsigned long address)
+static int do_sigbus(struct pt_regs *regs, unsigned long address,
+ vm_fault_t fault)
{
- siginfo_t info;
-
- up_read(&current->mm->mmap_sem);
-
- if (user_mode(regs)) {
- current->thread.trap_nr = BUS_ADRERR;
- info.si_signo = SIGBUS;
- info.si_errno = 0;
- info.si_code = BUS_ADRERR;
- info.si_addr = (void __user *)address;
- force_sig_info(SIGBUS, &info, current);
- return MM_FAULT_RETURN;
+ if (!user_mode(regs))
+ return SIGBUS;
+
+ current->thread.trap_nr = BUS_ADRERR;
+#ifdef CONFIG_MEMORY_FAILURE
+ if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
+ unsigned int lsb = 0; /* shutup gcc */
+
+ pr_err("MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
+ current->comm, current->pid, address);
+
+ if (fault & VM_FAULT_HWPOISON_LARGE)
+ lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
+ if (fault & VM_FAULT_HWPOISON)
+ lsb = PAGE_SHIFT;
+
+ force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb);
+ return 0;
}
- return MM_FAULT_ERR(SIGBUS);
+
+#endif
+ force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
+ return 0;
}
-static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
+static int mm_fault_error(struct pt_regs *regs, unsigned long addr,
+ vm_fault_t fault)
{
/*
- * Pagefault was interrupted by SIGKILL. We have no reason to
- * continue the pagefault.
+ * Kernel page fault interrupted by SIGKILL. We have no reason to
+ * continue processing.
*/
- if (fatal_signal_pending(current)) {
- /*
- * If we have retry set, the mmap semaphore will have
- * alrady been released in __lock_page_or_retry(). Else
- * we release it now.
- */
- if (!(fault & VM_FAULT_RETRY))
- up_read(&current->mm->mmap_sem);
- /* Coming from kernel, we need to deal with uaccess fixups */
- if (user_mode(regs))
- return MM_FAULT_RETURN;
- return MM_FAULT_ERR(SIGKILL);
- }
-
- /* No fault: be happy */
- if (!(fault & VM_FAULT_ERROR))
- return MM_FAULT_CONTINUE;
+ if (fatal_signal_pending(current) && !user_mode(regs))
+ return SIGKILL;
/* Out of memory */
if (fault & VM_FAULT_OOM) {
- up_read(&current->mm->mmap_sem);
-
/*
* We ran out of memory, or some other thing happened to us that
* made us unable to handle the page fault gracefully.
*/
if (!user_mode(regs))
- return MM_FAULT_ERR(SIGKILL);
+ return SIGSEGV;
pagefault_out_of_memory();
- return MM_FAULT_RETURN;
+ } else {
+ if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
+ VM_FAULT_HWPOISON_LARGE))
+ return do_sigbus(regs, addr, fault);
+ else if (fault & VM_FAULT_SIGSEGV)
+ return bad_area_nosemaphore(regs, addr);
+ else
+ BUG();
+ }
+ return 0;
+}
+
+/* Is this a bad kernel fault ? */
+static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address, bool is_write)
+{
+ int is_exec = TRAP(regs) == INTERRUPT_INST_STORAGE;
+
+ if (is_exec) {
+ pr_crit_ratelimited("kernel tried to execute %s page (%lx) - exploit attempt? (uid: %d)\n",
+ address >= TASK_SIZE ? "exec-protected" : "user",
+ address,
+ from_kuid(&init_user_ns, current_uid()));
+
+ // Kernel exec fault is always bad
+ return true;
+ }
+
+ // Kernel fault on kernel address is bad
+ if (address >= TASK_SIZE)
+ return true;
+
+ // Read/write fault blocked by KUAP is bad, it can never succeed.
+ if (bad_kuap_fault(regs, address, is_write)) {
+ pr_crit_ratelimited("Kernel attempted to %s user page (%lx) - exploit attempt? (uid: %d)\n",
+ str_write_read(is_write), address,
+ from_kuid(&init_user_ns, current_uid()));
+
+ // Fault on user outside of certain regions (eg. copy_tofrom_user()) is bad
+ if (!search_exception_tables(regs->nip))
+ return true;
+
+ // Read/write fault in a valid region (the exception table search passed
+ // above), but blocked by KUAP is bad, it can never succeed.
+ return WARN(true, "Bug: %s fault blocked by KUAP!", is_write ? "Write" : "Read");
+ }
+
+ // What's left? Kernel fault on user and allowed by KUAP in the faulting context.
+ return false;
+}
+
+static bool access_pkey_error(bool is_write, bool is_exec, bool is_pkey,
+ struct vm_area_struct *vma)
+{
+ /*
+ * Make sure to check the VMA so that we do not perform
+ * faults just to hit a pkey fault as soon as we fill in a
+ * page. Only called for current mm, hence foreign == 0
+ */
+ if (!arch_vma_access_permitted(vma, is_write, is_exec, 0))
+ return true;
+
+ return false;
+}
+
+static bool access_error(bool is_write, bool is_exec, struct vm_area_struct *vma)
+{
+ /*
+ * Allow execution from readable areas if the MMU does not
+ * provide separate controls over reading and executing.
+ *
+ * Note: That code used to not be enabled for 4xx/BookE.
+ * It is now as I/D cache coherency for these is done at
+ * set_pte_at() time and I see no reason why the test
+ * below wouldn't be valid on those processors. This -may-
+ * break programs compiled with a really old ABI though.
+ */
+ if (is_exec) {
+ return !(vma->vm_flags & VM_EXEC) &&
+ (cpu_has_feature(CPU_FTR_NOEXECUTE) ||
+ !(vma->vm_flags & (VM_READ | VM_WRITE)));
}
- /* Bus error. x86 handles HWPOISON here, we'll add this if/when
- * we support the feature in HW
+ if (is_write) {
+ if (unlikely(!(vma->vm_flags & VM_WRITE)))
+ return true;
+ return false;
+ }
+
+ /*
+ * VM_READ, VM_WRITE and VM_EXEC may imply read permissions, as
+ * defined in protection_map[]. In that case Read faults can only be
+ * caused by a PROT_NONE mapping. However a non exec access on a
+ * VM_EXEC only mapping is invalid anyway, so report it as such.
+ */
+ if (unlikely(!vma_is_accessible(vma)))
+ return true;
+
+ if ((vma->vm_flags & VM_ACCESS_FLAGS) == VM_EXEC)
+ return true;
+
+ /*
+ * We should ideally do the vma pkey access check here. But in the
+ * fault path, handle_mm_fault() also does the same check. To avoid
+ * these multiple checks, we skip it here and handle access error due
+ * to pkeys later.
+ */
+ return false;
+}
+
+#ifdef CONFIG_PPC_SMLPAR
+static inline void cmo_account_page_fault(void)
+{
+ if (firmware_has_feature(FW_FEATURE_CMO)) {
+ u32 page_ins;
+
+ preempt_disable();
+ page_ins = be32_to_cpu(get_lppaca()->page_ins);
+ page_ins += 1 << PAGE_FACTOR;
+ get_lppaca()->page_ins = cpu_to_be32(page_ins);
+ preempt_enable();
+ }
+}
+#else
+static inline void cmo_account_page_fault(void) { }
+#endif /* CONFIG_PPC_SMLPAR */
+
+static void sanity_check_fault(bool is_write, bool is_user,
+ unsigned long error_code, unsigned long address)
+{
+ /*
+ * Userspace trying to access kernel address, we get PROTFAULT for that.
+ */
+ if (is_user && address >= TASK_SIZE) {
+ if ((long)address == -1)
+ return;
+
+ pr_crit_ratelimited("%s[%d]: User access of kernel address (%lx) - exploit attempt? (uid: %d)\n",
+ current->comm, current->pid, address,
+ from_kuid(&init_user_ns, current_uid()));
+ return;
+ }
+
+ if (!IS_ENABLED(CONFIG_PPC_BOOK3S))
+ return;
+
+ /*
+ * For hash translation mode, we should never get a
+ * PROTFAULT. Any update to pte to reduce access will result in us
+ * removing the hash page table entry, thus resulting in a DSISR_NOHPTE
+ * fault instead of DSISR_PROTFAULT.
+ *
+ * A pte update to relax the access will not result in a hash page table
+ * entry invalidate and hence can result in DSISR_PROTFAULT.
+ * ptep_set_access_flags() doesn't do a hpte flush. This is why we have
+ * the special !is_write in the below conditional.
+ *
+ * For platforms that doesn't supports coherent icache and do support
+ * per page noexec bit, we do setup things such that we do the
+ * sync between D/I cache via fault. But that is handled via low level
+ * hash fault code (hash_page_do_lazy_icache()) and we should not reach
+ * here in such case.
+ *
+ * For wrong access that can result in PROTFAULT, the above vma->vm_flags
+ * check should handle those and hence we should fall to the bad_area
+ * handling correctly.
+ *
+ * For embedded with per page exec support that doesn't support coherent
+ * icache we do get PROTFAULT and we handle that D/I cache sync in
+ * set_pte_at while taking the noexec/prot fault. Hence this is WARN_ON
+ * is conditional for server MMU.
+ *
+ * For radix, we can get prot fault for autonuma case, because radix
+ * page table will have them marked noaccess for user.
+ */
+ if (radix_enabled() || is_write)
+ return;
+
+ WARN_ON_ONCE(error_code & DSISR_PROTFAULT);
+}
+
+/*
+ * Define the correct "is_write" bit in error_code based
+ * on the processor family
+ */
+#ifdef CONFIG_BOOKE
+#define page_fault_is_write(__err) ((__err) & ESR_DST)
+#else
+#define page_fault_is_write(__err) ((__err) & DSISR_ISSTORE)
+#endif
+
+#ifdef CONFIG_BOOKE
+#define page_fault_is_bad(__err) (0)
+#elif defined(CONFIG_PPC_8xx)
+#define page_fault_is_bad(__err) ((__err) & DSISR_NOEXEC_OR_G)
+#elif defined(CONFIG_PPC64)
+static int page_fault_is_bad(unsigned long err)
+{
+ unsigned long flag = DSISR_BAD_FAULT_64S;
+
+ /*
+ * PAPR+ v2.11 § 14.15.3.4.1 (unreleased)
+ * If byte 0, bit 3 of pi-attribute-specifier-type in
+ * ibm,pi-features property is defined, ignore the DSI error
+ * which is caused by the paste instruction on the
+ * suspended NX window.
*/
- if (fault & VM_FAULT_SIGBUS)
- return do_sigbus(regs, addr);
+ if (mmu_has_feature(MMU_FTR_NX_DSI))
+ flag &= ~DSISR_BAD_COPYPASTE;
- /* We don't understand the fault code, this is fatal */
- BUG();
- return MM_FAULT_CONTINUE;
+ return err & flag;
}
+#else
+#define page_fault_is_bad(__err) ((__err) & DSISR_BAD_FAULT_32S)
+#endif
/*
* For 600- and 800-family processors, the error_code parameter is DSISR
- * for a data fault, SRR1 for an instruction fault. For 400-family processors
- * the error_code parameter is ESR for a data fault, 0 for an instruction
- * fault.
- * For 64-bit processors, the error_code parameter is
- * - DSISR for a non-SLB data access fault,
- * - SRR1 & 0x08000000 for a non-SLB instruction access fault
- * - 0 any SLB fault.
+ * for a data fault, SRR1 for an instruction fault.
+ * For 400-family processors the error_code parameter is ESR for a data fault,
+ * 0 for an instruction fault.
+ * For 64-bit processors, the error_code parameter is DSISR for a data access
+ * fault, SRR1 & 0x08000000 for an instruction access fault.
*
* The return value is 0 if the fault was handled, or the signal
* number if this is a kernel fault that can't be handled here.
*/
-int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
- unsigned long error_code)
+static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
+ unsigned long error_code)
{
- enum ctx_state prev_state = exception_enter();
struct vm_area_struct * vma;
struct mm_struct *mm = current->mm;
- unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
- int code = SEGV_MAPERR;
- int is_write = 0;
- int trap = TRAP(regs);
- int is_exec = trap == 0x400;
- int fault;
- int rc = 0;
-
-#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
+ unsigned int flags = FAULT_FLAG_DEFAULT;
+ int is_exec = TRAP(regs) == INTERRUPT_INST_STORAGE;
+ int is_user = user_mode(regs);
+ int is_write = page_fault_is_write(error_code);
+ vm_fault_t fault, major = 0;
+ bool kprobe_fault = kprobe_page_fault(regs, 11);
+
+ if (unlikely(debugger_fault_handler(regs) || kprobe_fault))
+ return 0;
+
+ if (unlikely(page_fault_is_bad(error_code))) {
+ if (is_user) {
+ _exception(SIGBUS, regs, BUS_OBJERR, address);
+ return 0;
+ }
+ return SIGBUS;
+ }
+
+ /* Additional sanity check(s) */
+ sanity_check_fault(is_write, is_user, error_code, address);
+
/*
- * Fortunately the bit assignments in SRR1 for an instruction
- * fault and DSISR for a data fault are mostly the same for the
- * bits we are interested in. But there are some bits which
- * indicate errors in DSISR but can validly be set in SRR1.
+ * The kernel should never take an execute fault nor should it
+ * take a page fault to a kernel address or a page fault to a user
+ * address outside of dedicated places.
+ *
+ * Rather than kfence directly reporting false negatives, search whether
+ * the NIP belongs to the fixup table for cases where fault could come
+ * from functions like copy_from_kernel_nofault().
*/
- if (trap == 0x400)
- error_code &= 0x48200000;
- else
- is_write = error_code & DSISR_ISSTORE;
-#else
- is_write = error_code & ESR_DST;
-#endif /* CONFIG_4xx || CONFIG_BOOKE */
+ if (unlikely(!is_user && bad_kernel_fault(regs, error_code, address, is_write))) {
+ if (is_kfence_address((void *)address) &&
+ !search_exception_tables(instruction_pointer(regs)) &&
+ kfence_handle_page_fault(address, is_write, regs))
+ return 0;
- if (is_write)
- flags |= FAULT_FLAG_WRITE;
+ return SIGSEGV;
+ }
-#ifdef CONFIG_PPC_ICSWX
/*
- * we need to do this early because this "data storage
- * interrupt" does not update the DAR/DEAR so we don't want to
- * look at it
+ * If we're in an interrupt, have no user context or are running
+ * in a region with pagefaults disabled then we must not take the fault
*/
- if (error_code & ICSWX_DSI_UCT) {
- rc = acop_handle_fault(regs, address, error_code);
- if (rc)
- goto bail;
+ if (unlikely(faulthandler_disabled() || !mm)) {
+ if (is_user)
+ printk_ratelimited(KERN_ERR "Page fault in user mode"
+ " with faulthandler_disabled()=%d"
+ " mm=%p\n",
+ faulthandler_disabled(), mm);
+ return bad_area_nosemaphore(regs, address);
}
-#endif /* CONFIG_PPC_ICSWX */
- if (notify_page_fault(regs))
- goto bail;
+ interrupt_cond_local_irq_enable(regs);
+
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
- if (unlikely(debugger_fault_handler(regs)))
- goto bail;
+ /*
+ * We want to do this outside mmap_lock, because reading code around nip
+ * can result in fault, which will cause a deadlock when called with
+ * mmap_lock held
+ */
+ if (is_user)
+ flags |= FAULT_FLAG_USER;
+ if (is_write)
+ flags |= FAULT_FLAG_WRITE;
+ if (is_exec)
+ flags |= FAULT_FLAG_INSTRUCTION;
- /* On a kernel SLB miss we can only check for a valid exception entry */
- if (!user_mode(regs) && (address >= TASK_SIZE)) {
- rc = SIGSEGV;
- goto bail;
+ if (!(flags & FAULT_FLAG_USER))
+ goto lock_mmap;
+
+ vma = lock_vma_under_rcu(mm, address);
+ if (!vma)
+ goto lock_mmap;
+
+ if (unlikely(access_pkey_error(is_write, is_exec,
+ (error_code & DSISR_KEYFAULT), vma))) {
+ count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+ return bad_access_pkey(regs, address, NULL, vma);
}
-#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE) || \
- defined(CONFIG_PPC_BOOK3S_64))
- if (error_code & DSISR_DABRMATCH) {
- /* breakpoint match */
- do_break(regs, address, error_code);
- goto bail;
+ if (unlikely(access_error(is_write, is_exec, vma))) {
+ count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+ return bad_access(regs, address, NULL, vma);
}
-#endif
- /* We restore the interrupt state now */
- if (!arch_irq_disabled_regs(regs))
- local_irq_enable();
+ fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
+ if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
+ vma_end_read(vma);
- if (in_atomic() || mm == NULL) {
- if (!user_mode(regs)) {
- rc = SIGSEGV;
- goto bail;
- }
- /* in_atomic() in user mode is really bad,
- as is current->mm == NULL. */
- printk(KERN_EMERG "Page fault in user mode with "
- "in_atomic() = %d mm = %p\n", in_atomic(), mm);
- printk(KERN_EMERG "NIP = %lx MSR = %lx\n",
- regs->nip, regs->msr);
- die("Weird page fault", regs, SIGSEGV);
+ if (!(fault & VM_FAULT_RETRY)) {
+ count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+ goto done;
}
+ count_vm_vma_lock_event(VMA_LOCK_RETRY);
+ if (fault & VM_FAULT_MAJOR)
+ flags |= FAULT_FLAG_TRIED;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ if (fault_signal_pending(fault, regs))
+ return user_mode(regs) ? 0 : SIGBUS;
+
+lock_mmap:
/* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in the
* kernel and should generate an OOPS. Unfortunately, in the case of an
- * erroneous fault occurring in a code path which already holds mmap_sem
+ * erroneous fault occurring in a code path which already holds mmap_lock
* we will deadlock attempting to validate the fault against the
* address space. Luckily the kernel only validly references user
* space from well defined areas of code, which are listed in the
- * exceptions table.
- *
- * As the vast majority of faults will be valid we will only perform
- * the source reference check when there is a possibility of a deadlock.
- * Attempt to lock the address space, if we cannot we then validate the
- * source. If this is invalid we can skip the address space check,
- * thus avoiding the deadlock.
+ * exceptions table. lock_mm_and_find_vma() handles that logic.
*/
- if (!down_read_trylock(&mm->mmap_sem)) {
- if (!user_mode(regs) && !search_exception_tables(regs->nip))
- goto bad_area_nosemaphore;
-
retry:
- down_read(&mm->mmap_sem);
- } else {
- /*
- * The above down_read_trylock() might have succeeded in
- * which case we'll have missed the might_sleep() from
- * down_read():
- */
- might_sleep();
- }
+ vma = lock_mm_and_find_vma(mm, address, regs);
+ if (unlikely(!vma))
+ return bad_area_nosemaphore(regs, address);
- vma = find_vma(mm, address);
- if (!vma)
- goto bad_area;
- if (vma->vm_start <= address)
- goto good_area;
- if (!(vma->vm_flags & VM_GROWSDOWN))
- goto bad_area;
+ if (unlikely(access_pkey_error(is_write, is_exec,
+ (error_code & DSISR_KEYFAULT), vma)))
+ return bad_access_pkey(regs, address, mm, vma);
- /*
- * N.B. The POWER/Open ABI allows programs to access up to
- * 288 bytes below the stack pointer.
- * The kernel signal delivery code writes up to about 1.5kB
- * below the stack pointer (r1) before decrementing it.
- * The exec code can write slightly over 640kB to the stack
- * before setting the user r1. Thus we allow the stack to
- * expand to 1MB without further checks.
- */
- if (address + 0x100000 < vma->vm_end) {
- /* get user regs even if this fault is in kernel mode */
- struct pt_regs *uregs = current->thread.regs;
- if (uregs == NULL)
- goto bad_area;
-
- /*
- * A user-mode access to an address a long way below
- * the stack pointer is only valid if the instruction
- * is one which would update the stack pointer to the
- * address accessed if the instruction completed,
- * i.e. either stwu rs,n(r1) or stwux rs,r1,rb
- * (or the byte, halfword, float or double forms).
- *
- * If we don't check this then any write to the area
- * between the last mapped region and the stack will
- * expand the stack rather than segfaulting.
- */
- if (address + 2048 < uregs->gpr[1]
- && (!user_mode(regs) || !store_updates_sp(regs)))
- goto bad_area;
- }
- if (expand_stack(vma, address))
- goto bad_area;
-
-good_area:
- code = SEGV_ACCERR;
-#if defined(CONFIG_6xx)
- if (error_code & 0x95700000)
- /* an error such as lwarx to I/O controller space,
- address matching DABR, eciwx, etc. */
- goto bad_area;
-#endif /* CONFIG_6xx */
-#if defined(CONFIG_8xx)
- /* 8xx sometimes need to load a invalid/non-present TLBs.
- * These must be invalidated separately as linux mm don't.
- */
- if (error_code & 0x40000000) /* no translation? */
- _tlbil_va(address, 0, 0, 0);
-
- /* The MPC8xx seems to always set 0x80000000, which is
- * "undefined". Of those that can be set, this is the only
- * one which seems bad.
- */
- if (error_code & 0x10000000)
- /* Guarded storage error. */
- goto bad_area;
-#endif /* CONFIG_8xx */
-
- if (is_exec) {
-#ifdef CONFIG_PPC_STD_MMU
- /* Protection fault on exec go straight to failure on
- * Hash based MMUs as they either don't support per-page
- * execute permission, or if they do, it's handled already
- * at the hash level. This test would probably have to
- * be removed if we change the way this works to make hash
- * processors use the same I/D cache coherency mechanism
- * as embedded.
- */
- if (error_code & DSISR_PROTFAULT)
- goto bad_area;
-#endif /* CONFIG_PPC_STD_MMU */
-
- /*
- * Allow execution from readable areas if the MMU does not
- * provide separate controls over reading and executing.
- *
- * Note: That code used to not be enabled for 4xx/BookE.
- * It is now as I/D cache coherency for these is done at
- * set_pte_at() time and I see no reason why the test
- * below wouldn't be valid on those processors. This -may-
- * break programs compiled with a really old ABI though.
- */
- if (!(vma->vm_flags & VM_EXEC) &&
- (cpu_has_feature(CPU_FTR_NOEXECUTE) ||
- !(vma->vm_flags & (VM_READ | VM_WRITE))))
- goto bad_area;
- /* a write */
- } else if (is_write) {
- if (!(vma->vm_flags & VM_WRITE))
- goto bad_area;
- /* a read */
- } else {
- /* protection fault */
- if (error_code & 0x08000000)
- goto bad_area;
- if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
- goto bad_area;
- }
+ if (unlikely(access_error(is_write, is_exec, vma)))
+ return bad_access(regs, address, mm, vma);
/*
* If for any reason at all we couldn't handle the fault,
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
- fault = handle_mm_fault(mm, vma, address, flags);
- if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
- rc = mm_fault_error(regs, address, fault);
- if (rc >= MM_FAULT_RETURN)
- goto bail;
- else
- rc = 0;
- }
+ fault = handle_mm_fault(vma, address, flags, regs);
+
+ major |= fault & VM_FAULT_MAJOR;
+
+ if (fault_signal_pending(fault, regs))
+ return user_mode(regs) ? 0 : SIGBUS;
+
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ goto out;
/*
- * Major/minor page fault accounting is only done on the
- * initial attempt. If we go through a retry, it is extremely
- * likely that the page will be found in page cache at that point.
+ * Handle the retry right now, the mmap_lock has been released in that
+ * case.
*/
- if (flags & FAULT_FLAG_ALLOW_RETRY) {
- if (fault & VM_FAULT_MAJOR) {
- current->maj_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
- regs, address);
-#ifdef CONFIG_PPC_SMLPAR
- if (firmware_has_feature(FW_FEATURE_CMO)) {
- preempt_disable();
- get_lppaca()->page_ins += (1 << PAGE_FACTOR);
- preempt_enable();
- }
-#endif /* CONFIG_PPC_SMLPAR */
- } else {
- current->min_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
- regs, address);
- }
- if (fault & VM_FAULT_RETRY) {
- /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
- * of starvation. */
- flags &= ~FAULT_FLAG_ALLOW_RETRY;
- flags |= FAULT_FLAG_TRIED;
- goto retry;
- }
+ if (unlikely(fault & VM_FAULT_RETRY)) {
+ flags |= FAULT_FLAG_TRIED;
+ goto retry;
}
- up_read(&mm->mmap_sem);
- goto bail;
+ mmap_read_unlock(current->mm);
-bad_area:
- up_read(&mm->mmap_sem);
+done:
+ if (unlikely(fault & VM_FAULT_ERROR))
+ return mm_fault_error(regs, address, fault);
-bad_area_nosemaphore:
- /* User mode accesses cause a SIGSEGV */
- if (user_mode(regs)) {
- _exception(SIGSEGV, regs, code, address);
- goto bail;
- }
+out:
+ /*
+ * Major/minor page fault accounting.
+ */
+ if (major)
+ cmo_account_page_fault();
- if (is_exec && (error_code & DSISR_PROTFAULT))
- printk_ratelimited(KERN_CRIT "kernel tried to execute NX-protected"
- " page (%lx) - exploit attempt? (uid: %d)\n",
- address, from_kuid(&init_user_ns, current_uid()));
+ return 0;
+}
+NOKPROBE_SYMBOL(___do_page_fault);
- rc = SIGSEGV;
+static __always_inline void __do_page_fault(struct pt_regs *regs)
+{
+ long err;
-bail:
- exception_exit(prev_state);
- return rc;
+ err = ___do_page_fault(regs, regs->dar, regs->dsisr);
+ if (unlikely(err))
+ bad_page_fault(regs, err);
+}
+DEFINE_INTERRUPT_HANDLER(do_page_fault)
+{
+ __do_page_fault(regs);
}
+#ifdef CONFIG_PPC_BOOK3S_64
+/* Same as do_page_fault but interrupt entry has already run in do_hash_fault */
+void hash__do_page_fault(struct pt_regs *regs)
+{
+ __do_page_fault(regs);
+}
+NOKPROBE_SYMBOL(hash__do_page_fault);
+#endif
+
/*
* bad_page_fault is called when we have a bad access from the kernel.
* It is called from the DSI and ISI handlers in head.S and from some
* of the procedures in traps.c.
*/
-void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
+static void __bad_page_fault(struct pt_regs *regs, int sig)
{
- const struct exception_table_entry *entry;
- unsigned long *stackend;
-
- /* Are we prepared to handle this fault? */
- if ((entry = search_exception_tables(regs->nip)) != NULL) {
- regs->nip = entry->fixup;
- return;
- }
+ int is_write = page_fault_is_write(regs->dsisr);
+ const char *msg;
/* kernel has accessed a bad area */
- switch (regs->trap) {
- case 0x300:
- case 0x380:
- printk(KERN_ALERT "Unable to handle kernel paging request for "
- "data at address 0x%08lx\n", regs->dar);
+ if (regs->dar < PAGE_SIZE)
+ msg = "Kernel NULL pointer dereference";
+ else
+ msg = "Unable to handle kernel data access";
+
+ switch (TRAP(regs)) {
+ case INTERRUPT_DATA_STORAGE:
+ case INTERRUPT_H_DATA_STORAGE:
+ pr_alert("BUG: %s on %s at 0x%08lx\n", msg,
+ str_write_read(is_write), regs->dar);
+ break;
+ case INTERRUPT_DATA_SEGMENT:
+ pr_alert("BUG: %s at 0x%08lx\n", msg, regs->dar);
break;
- case 0x400:
- case 0x480:
- printk(KERN_ALERT "Unable to handle kernel paging request for "
- "instruction fetch\n");
+ case INTERRUPT_INST_STORAGE:
+ case INTERRUPT_INST_SEGMENT:
+ pr_alert("BUG: Unable to handle kernel instruction fetch%s",
+ regs->nip < PAGE_SIZE ? " (NULL pointer?)\n" : "\n");
+ break;
+ case INTERRUPT_ALIGNMENT:
+ pr_alert("BUG: Unable to handle kernel unaligned access at 0x%08lx\n",
+ regs->dar);
break;
default:
- printk(KERN_ALERT "Unable to handle kernel paging request for "
- "unknown fault\n");
+ pr_alert("BUG: Unable to handle unknown paging fault at 0x%08lx\n",
+ regs->dar);
break;
}
printk(KERN_ALERT "Faulting instruction address: 0x%08lx\n",
regs->nip);
- stackend = end_of_stack(current);
- if (current != &init_task && *stackend != STACK_END_MAGIC)
+ if (task_stack_end_corrupted(current))
printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
die("Kernel access of bad area", regs, sig);
}
+
+void bad_page_fault(struct pt_regs *regs, int sig)
+{
+ const struct exception_table_entry *entry;
+
+ /* Are we prepared to handle this fault? */
+ entry = search_exception_tables(instruction_pointer(regs));
+ if (entry)
+ instruction_pointer_set(regs, extable_fixup(entry));
+ else
+ __bad_page_fault(regs, sig);
+}
+
+#ifdef CONFIG_PPC_BOOK3S_64
+DEFINE_INTERRUPT_HANDLER(do_bad_page_fault_segv)
+{
+ bad_page_fault(regs, SIGSEGV);
+}
+
+/*
+ * In radix, segment interrupts indicate the EA is not addressable by the
+ * page table geometry, so they are always sent here.
+ *
+ * In hash, this is called if do_slb_fault returns error. Typically it is
+ * because the EA was outside the region allowed by software.
+ */
+DEFINE_INTERRUPT_HANDLER(do_bad_segment_interrupt)
+{
+ int err = regs->result;
+
+ if (err == -EFAULT) {
+ if (user_mode(regs))
+ _exception(SIGSEGV, regs, SEGV_BNDERR, regs->dar);
+ else
+ bad_page_fault(regs, SIGSEGV);
+ } else if (err == -EINVAL) {
+ unrecoverable_exception(regs);
+ } else {
+ BUG();
+ }
+}
+#endif
diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c
deleted file mode 100644
index 49822d90ea96..000000000000
--- a/arch/powerpc/mm/gup.c
+++ /dev/null
@@ -1,216 +0,0 @@
-/*
- * Lockless get_user_pages_fast for powerpc
- *
- * Copyright (C) 2008 Nick Piggin
- * Copyright (C) 2008 Novell Inc.
- */
-#undef DEBUG
-
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/hugetlb.h>
-#include <linux/vmstat.h>
-#include <linux/pagemap.h>
-#include <linux/rwsem.h>
-#include <asm/pgtable.h>
-
-#ifdef __HAVE_ARCH_PTE_SPECIAL
-
-/*
- * The performance critical leaf functions are made noinline otherwise gcc
- * inlines everything into a single function which results in too much
- * register pressure.
- */
-static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr)
-{
- unsigned long mask, result;
- pte_t *ptep;
-
- result = _PAGE_PRESENT|_PAGE_USER;
- if (write)
- result |= _PAGE_RW;
- mask = result | _PAGE_SPECIAL;
-
- ptep = pte_offset_kernel(&pmd, addr);
- do {
- pte_t pte = ACCESS_ONCE(*ptep);
- struct page *page;
-
- if ((pte_val(pte) & mask) != result)
- return 0;
- VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
- page = pte_page(pte);
- if (!page_cache_get_speculative(page))
- return 0;
- if (unlikely(pte_val(pte) != pte_val(*ptep))) {
- put_page(page);
- return 0;
- }
- pages[*nr] = page;
- (*nr)++;
-
- } while (ptep++, addr += PAGE_SIZE, addr != end);
-
- return 1;
-}
-
-static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
-{
- unsigned long next;
- pmd_t *pmdp;
-
- pmdp = pmd_offset(&pud, addr);
- do {
- pmd_t pmd = ACCESS_ONCE(*pmdp);
-
- next = pmd_addr_end(addr, end);
- /*
- * If we find a splitting transparent hugepage we
- * return zero. That will result in taking the slow
- * path which will call wait_split_huge_page()
- * if the pmd is still in splitting state
- */
- if (pmd_none(pmd) || pmd_trans_splitting(pmd))
- return 0;
- if (pmd_huge(pmd) || pmd_large(pmd)) {
- if (!gup_hugepte((pte_t *)pmdp, PMD_SIZE, addr, next,
- write, pages, nr))
- return 0;
- } else if (is_hugepd(pmdp)) {
- if (!gup_hugepd((hugepd_t *)pmdp, PMD_SHIFT,
- addr, next, write, pages, nr))
- return 0;
- } else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
- return 0;
- } while (pmdp++, addr = next, addr != end);
-
- return 1;
-}
-
-static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
-{
- unsigned long next;
- pud_t *pudp;
-
- pudp = pud_offset(&pgd, addr);
- do {
- pud_t pud = ACCESS_ONCE(*pudp);
-
- next = pud_addr_end(addr, end);
- if (pud_none(pud))
- return 0;
- if (pud_huge(pud)) {
- if (!gup_hugepte((pte_t *)pudp, PUD_SIZE, addr, next,
- write, pages, nr))
- return 0;
- } else if (is_hugepd(pudp)) {
- if (!gup_hugepd((hugepd_t *)pudp, PUD_SHIFT,
- addr, next, write, pages, nr))
- return 0;
- } else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
- return 0;
- } while (pudp++, addr = next, addr != end);
-
- return 1;
-}
-
-int get_user_pages_fast(unsigned long start, int nr_pages, int write,
- struct page **pages)
-{
- struct mm_struct *mm = current->mm;
- unsigned long addr, len, end;
- unsigned long next;
- pgd_t *pgdp;
- int nr = 0;
-
- pr_devel("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read");
-
- start &= PAGE_MASK;
- addr = start;
- len = (unsigned long) nr_pages << PAGE_SHIFT;
- end = start + len;
-
- if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
- start, len)))
- goto slow_irqon;
-
- pr_devel(" aligned: %lx .. %lx\n", start, end);
-
- /*
- * XXX: batch / limit 'nr', to avoid large irq off latency
- * needs some instrumenting to determine the common sizes used by
- * important workloads (eg. DB2), and whether limiting the batch size
- * will decrease performance.
- *
- * It seems like we're in the clear for the moment. Direct-IO is
- * the main guy that batches up lots of get_user_pages, and even
- * they are limited to 64-at-a-time which is not so many.
- */
- /*
- * This doesn't prevent pagetable teardown, but does prevent
- * the pagetables from being freed on powerpc.
- *
- * So long as we atomically load page table pointers versus teardown,
- * we can follow the address down to the the page and take a ref on it.
- */
- local_irq_disable();
-
- pgdp = pgd_offset(mm, addr);
- do {
- pgd_t pgd = ACCESS_ONCE(*pgdp);
-
- pr_devel(" %016lx: normal pgd %p\n", addr,
- (void *)pgd_val(pgd));
- next = pgd_addr_end(addr, end);
- if (pgd_none(pgd))
- goto slow;
- if (pgd_huge(pgd)) {
- if (!gup_hugepte((pte_t *)pgdp, PGDIR_SIZE, addr, next,
- write, pages, &nr))
- goto slow;
- } else if (is_hugepd(pgdp)) {
- if (!gup_hugepd((hugepd_t *)pgdp, PGDIR_SHIFT,
- addr, next, write, pages, &nr))
- goto slow;
- } else if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
- goto slow;
- } while (pgdp++, addr = next, addr != end);
-
- local_irq_enable();
-
- VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
- return nr;
-
- {
- int ret;
-
-slow:
- local_irq_enable();
-slow_irqon:
- pr_devel(" slow path ! nr = %d\n", nr);
-
- /* Try to get the remaining pages with get_user_pages */
- start += nr << PAGE_SHIFT;
- pages += nr;
-
- down_read(&mm->mmap_sem);
- ret = get_user_pages(current, mm, start,
- (end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
- up_read(&mm->mmap_sem);
-
- /* Have to be a bit careful with return values */
- if (nr > 0) {
- if (ret < 0)
- ret = nr;
- else
- ret += nr;
- }
-
- return ret;
- }
-}
-
-#endif /* __HAVE_ARCH_PTE_SPECIAL */
diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S
deleted file mode 100644
index d3cbda62857b..000000000000
--- a/arch/powerpc/mm/hash_low_64.S
+++ /dev/null
@@ -1,981 +0,0 @@
-/*
- * ppc64 MMU hashtable management routines
- *
- * (c) Copyright IBM Corp. 2003, 2005
- *
- * Maintained by: Benjamin Herrenschmidt
- * <benh@kernel.crashing.org>
- *
- * This file is covered by the GNU Public Licence v2 as
- * described in the kernel's COPYING file.
- */
-
-#include <asm/reg.h>
-#include <asm/pgtable.h>
-#include <asm/mmu.h>
-#include <asm/page.h>
-#include <asm/types.h>
-#include <asm/ppc_asm.h>
-#include <asm/asm-offsets.h>
-#include <asm/cputable.h>
-
- .text
-
-/*
- * Stackframe:
- *
- * +-> Back chain (SP + 256)
- * | General register save area (SP + 112)
- * | Parameter save area (SP + 48)
- * | TOC save area (SP + 40)
- * | link editor doubleword (SP + 32)
- * | compiler doubleword (SP + 24)
- * | LR save area (SP + 16)
- * | CR save area (SP + 8)
- * SP ---> +-- Back chain (SP + 0)
- */
-
-#ifndef CONFIG_PPC_64K_PAGES
-
-/*****************************************************************************
- * *
- * 4K SW & 4K HW pages implementation *
- * *
- *****************************************************************************/
-
-
-/*
- * _hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
- * pte_t *ptep, unsigned long trap, int local, int ssize)
- *
- * Adds a 4K page to the hash table in a segment of 4K pages only
- */
-
-_GLOBAL(__hash_page_4K)
- mflr r0
- std r0,16(r1)
- stdu r1,-STACKFRAMESIZE(r1)
- /* Save all params that we need after a function call */
- std r6,STK_PARAM(R6)(r1)
- std r8,STK_PARAM(R8)(r1)
- std r9,STK_PARAM(R9)(r1)
-
- /* Save non-volatile registers.
- * r31 will hold "old PTE"
- * r30 is "new PTE"
- * r29 is vpn
- * r28 is a hash value
- * r27 is hashtab mask (maybe dynamic patched instead ?)
- */
- std r27,STK_REG(R27)(r1)
- std r28,STK_REG(R28)(r1)
- std r29,STK_REG(R29)(r1)
- std r30,STK_REG(R30)(r1)
- std r31,STK_REG(R31)(r1)
-
- /* Step 1:
- *
- * Check permissions, atomically mark the linux PTE busy
- * and hashed.
- */
-1:
- ldarx r31,0,r6
- /* Check access rights (access & ~(pte_val(*ptep))) */
- andc. r0,r4,r31
- bne- htab_wrong_access
- /* Check if PTE is busy */
- andi. r0,r31,_PAGE_BUSY
- /* If so, just bail out and refault if needed. Someone else
- * is changing this PTE anyway and might hash it.
- */
- bne- htab_bail_ok
-
- /* Prepare new PTE value (turn access RW into DIRTY, then
- * add BUSY,HASHPTE and ACCESSED)
- */
- rlwinm r30,r4,32-9+7,31-7,31-7 /* _PAGE_RW -> _PAGE_DIRTY */
- or r30,r30,r31
- ori r30,r30,_PAGE_BUSY | _PAGE_ACCESSED | _PAGE_HASHPTE
- /* Write the linux PTE atomically (setting busy) */
- stdcx. r30,0,r6
- bne- 1b
- isync
-
- /* Step 2:
- *
- * Insert/Update the HPTE in the hash table. At this point,
- * r4 (access) is re-useable, we use it for the new HPTE flags
- */
-
-BEGIN_FTR_SECTION
- cmpdi r9,0 /* check segment size */
- bne 3f
-END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
- /* Calc vpn and put it in r29 */
- sldi r29,r5,SID_SHIFT - VPN_SHIFT
- rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT - VPN_SHIFT)
- or r29,r28,r29
- /*
- * Calculate hash value for primary slot and store it in r28
- * r3 = va, r5 = vsid
- * r0 = (va >> 12) & ((1ul << (28 - 12)) -1)
- */
- rldicl r0,r3,64-12,48
- xor r28,r5,r0 /* hash */
- b 4f
-
-3: /* Calc vpn and put it in r29 */
- sldi r29,r5,SID_SHIFT_1T - VPN_SHIFT
- rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT_1T - VPN_SHIFT)
- or r29,r28,r29
-
- /*
- * calculate hash value for primary slot and
- * store it in r28 for 1T segment
- * r3 = va, r5 = vsid
- */
- sldi r28,r5,25 /* vsid << 25 */
- /* r0 = (va >> 12) & ((1ul << (40 - 12)) -1) */
- rldicl r0,r3,64-12,36
- xor r28,r28,r5 /* vsid ^ ( vsid << 25) */
- xor r28,r28,r0 /* hash */
-
- /* Convert linux PTE bits into HW equivalents */
-4: andi. r3,r30,0x1fe /* Get basic set of flags */
- xori r3,r3,HPTE_R_N /* _PAGE_EXEC -> NOEXEC */
- rlwinm r0,r30,32-9+1,30,30 /* _PAGE_RW -> _PAGE_USER (r0) */
- rlwinm r4,r30,32-7+1,30,30 /* _PAGE_DIRTY -> _PAGE_USER (r4) */
- and r0,r0,r4 /* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/
- andc r0,r30,r0 /* r0 = pte & ~r0 */
- rlwimi r3,r0,32-1,31,31 /* Insert result into PP lsb */
- ori r3,r3,HPTE_R_C /* Always add "C" bit for perf. */
-
- /* We eventually do the icache sync here (maybe inline that
- * code rather than call a C function...)
- */
-BEGIN_FTR_SECTION
- mr r4,r30
- mr r5,r7
- bl .hash_page_do_lazy_icache
-END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
-
- /* At this point, r3 contains new PP bits, save them in
- * place of "access" in the param area (sic)
- */
- std r3,STK_PARAM(R4)(r1)
-
- /* Get htab_hash_mask */
- ld r4,htab_hash_mask@got(2)
- ld r27,0(r4) /* htab_hash_mask -> r27 */
-
- /* Check if we may already be in the hashtable, in this case, we
- * go to out-of-line code to try to modify the HPTE
- */
- andi. r0,r31,_PAGE_HASHPTE
- bne htab_modify_pte
-
-htab_insert_pte:
- /* Clear hpte bits in new pte (we also clear BUSY btw) and
- * add _PAGE_HASHPTE
- */
- lis r0,_PAGE_HPTEFLAGS@h
- ori r0,r0,_PAGE_HPTEFLAGS@l
- andc r30,r30,r0
- ori r30,r30,_PAGE_HASHPTE
-
- /* physical address r5 */
- rldicl r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT
- sldi r5,r5,PAGE_SHIFT
-
- /* Calculate primary group hash */
- and r0,r28,r27
- rldicr r3,r0,3,63-3 /* r3 = (hash & mask) << 3 */
-
- /* Call ppc_md.hpte_insert */
- ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */
- mr r4,r29 /* Retrieve vpn */
- li r7,0 /* !bolted, !secondary */
- li r8,MMU_PAGE_4K /* page size */
- li r9,MMU_PAGE_4K /* actual page size */
- ld r10,STK_PARAM(R9)(r1) /* segment size */
-_GLOBAL(htab_call_hpte_insert1)
- bl . /* Patched by htab_finish_init() */
- cmpdi 0,r3,0
- bge htab_pte_insert_ok /* Insertion successful */
- cmpdi 0,r3,-2 /* Critical failure */
- beq- htab_pte_insert_failure
-
- /* Now try secondary slot */
-
- /* physical address r5 */
- rldicl r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT
- sldi r5,r5,PAGE_SHIFT
-
- /* Calculate secondary group hash */
- andc r0,r27,r28
- rldicr r3,r0,3,63-3 /* r0 = (~hash & mask) << 3 */
-
- /* Call ppc_md.hpte_insert */
- ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */
- mr r4,r29 /* Retrieve vpn */
- li r7,HPTE_V_SECONDARY /* !bolted, secondary */
- li r8,MMU_PAGE_4K /* page size */
- li r9,MMU_PAGE_4K /* actual page size */
- ld r10,STK_PARAM(R9)(r1) /* segment size */
-_GLOBAL(htab_call_hpte_insert2)
- bl . /* Patched by htab_finish_init() */
- cmpdi 0,r3,0
- bge+ htab_pte_insert_ok /* Insertion successful */
- cmpdi 0,r3,-2 /* Critical failure */
- beq- htab_pte_insert_failure
-
- /* Both are full, we need to evict something */
- mftb r0
- /* Pick a random group based on TB */
- andi. r0,r0,1
- mr r5,r28
- bne 2f
- not r5,r5
-2: and r0,r5,r27
- rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */
- /* Call ppc_md.hpte_remove */
-_GLOBAL(htab_call_hpte_remove)
- bl . /* Patched by htab_finish_init() */
-
- /* Try all again */
- b htab_insert_pte
-
-htab_bail_ok:
- li r3,0
- b htab_bail
-
-htab_pte_insert_ok:
- /* Insert slot number & secondary bit in PTE */
- rldimi r30,r3,12,63-15
-
- /* Write out the PTE with a normal write
- * (maybe add eieio may be good still ?)
- */
-htab_write_out_pte:
- ld r6,STK_PARAM(R6)(r1)
- std r30,0(r6)
- li r3, 0
-htab_bail:
- ld r27,STK_REG(R27)(r1)
- ld r28,STK_REG(R28)(r1)
- ld r29,STK_REG(R29)(r1)
- ld r30,STK_REG(R30)(r1)
- ld r31,STK_REG(R31)(r1)
- addi r1,r1,STACKFRAMESIZE
- ld r0,16(r1)
- mtlr r0
- blr
-
-htab_modify_pte:
- /* Keep PP bits in r4 and slot idx from the PTE around in r3 */
- mr r4,r3
- rlwinm r3,r31,32-12,29,31
-
- /* Secondary group ? if yes, get a inverted hash value */
- mr r5,r28
- andi. r0,r31,_PAGE_SECONDARY
- beq 1f
- not r5,r5
-1:
- /* Calculate proper slot value for ppc_md.hpte_updatepp */
- and r0,r5,r27
- rldicr r0,r0,3,63-3 /* r0 = (hash & mask) << 3 */
- add r3,r0,r3 /* add slot idx */
-
- /* Call ppc_md.hpte_updatepp */
- mr r5,r29 /* vpn */
- li r6,MMU_PAGE_4K /* base page size */
- li r7,MMU_PAGE_4K /* actual page size */
- ld r8,STK_PARAM(R9)(r1) /* segment size */
- ld r9,STK_PARAM(R8)(r1) /* get "local" param */
-_GLOBAL(htab_call_hpte_updatepp)
- bl . /* Patched by htab_finish_init() */
-
- /* if we failed because typically the HPTE wasn't really here
- * we try an insertion.
- */
- cmpdi 0,r3,-1
- beq- htab_insert_pte
-
- /* Clear the BUSY bit and Write out the PTE */
- li r0,_PAGE_BUSY
- andc r30,r30,r0
- b htab_write_out_pte
-
-htab_wrong_access:
- /* Bail out clearing reservation */
- stdcx. r31,0,r6
- li r3,1
- b htab_bail
-
-htab_pte_insert_failure:
- /* Bail out restoring old PTE */
- ld r6,STK_PARAM(R6)(r1)
- std r31,0(r6)
- li r3,-1
- b htab_bail
-
-
-#else /* CONFIG_PPC_64K_PAGES */
-
-
-/*****************************************************************************
- * *
- * 64K SW & 4K or 64K HW in a 4K segment pages implementation *
- * *
- *****************************************************************************/
-
-/* _hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
- * pte_t *ptep, unsigned long trap, int local, int ssize,
- * int subpg_prot)
- */
-
-/*
- * For now, we do NOT implement Admixed pages
- */
-_GLOBAL(__hash_page_4K)
- mflr r0
- std r0,16(r1)
- stdu r1,-STACKFRAMESIZE(r1)
- /* Save all params that we need after a function call */
- std r6,STK_PARAM(R6)(r1)
- std r8,STK_PARAM(R8)(r1)
- std r9,STK_PARAM(R9)(r1)
-
- /* Save non-volatile registers.
- * r31 will hold "old PTE"
- * r30 is "new PTE"
- * r29 is vpn
- * r28 is a hash value
- * r27 is hashtab mask (maybe dynamic patched instead ?)
- * r26 is the hidx mask
- * r25 is the index in combo page
- */
- std r25,STK_REG(R25)(r1)
- std r26,STK_REG(R26)(r1)
- std r27,STK_REG(R27)(r1)
- std r28,STK_REG(R28)(r1)
- std r29,STK_REG(R29)(r1)
- std r30,STK_REG(R30)(r1)
- std r31,STK_REG(R31)(r1)
-
- /* Step 1:
- *
- * Check permissions, atomically mark the linux PTE busy
- * and hashed.
- */
-1:
- ldarx r31,0,r6
- /* Check access rights (access & ~(pte_val(*ptep))) */
- andc. r0,r4,r31
- bne- htab_wrong_access
- /* Check if PTE is busy */
- andi. r0,r31,_PAGE_BUSY
- /* If so, just bail out and refault if needed. Someone else
- * is changing this PTE anyway and might hash it.
- */
- bne- htab_bail_ok
- /* Prepare new PTE value (turn access RW into DIRTY, then
- * add BUSY and ACCESSED)
- */
- rlwinm r30,r4,32-9+7,31-7,31-7 /* _PAGE_RW -> _PAGE_DIRTY */
- or r30,r30,r31
- ori r30,r30,_PAGE_BUSY | _PAGE_ACCESSED
- oris r30,r30,_PAGE_COMBO@h
- /* Write the linux PTE atomically (setting busy) */
- stdcx. r30,0,r6
- bne- 1b
- isync
-
- /* Step 2:
- *
- * Insert/Update the HPTE in the hash table. At this point,
- * r4 (access) is re-useable, we use it for the new HPTE flags
- */
-
- /* Load the hidx index */
- rldicl r25,r3,64-12,60
-
-BEGIN_FTR_SECTION
- cmpdi r9,0 /* check segment size */
- bne 3f
-END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
- /* Calc vpn and put it in r29 */
- sldi r29,r5,SID_SHIFT - VPN_SHIFT
- /*
- * clrldi r3,r3,64 - SID_SHIFT --> ea & 0xfffffff
- * srdi r28,r3,VPN_SHIFT
- */
- rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT - VPN_SHIFT)
- or r29,r28,r29
- /*
- * Calculate hash value for primary slot and store it in r28
- * r3 = va, r5 = vsid
- * r0 = (va >> 12) & ((1ul << (28 - 12)) -1)
- */
- rldicl r0,r3,64-12,48
- xor r28,r5,r0 /* hash */
- b 4f
-
-3: /* Calc vpn and put it in r29 */
- sldi r29,r5,SID_SHIFT_1T - VPN_SHIFT
- /*
- * clrldi r3,r3,64 - SID_SHIFT_1T --> ea & 0xffffffffff
- * srdi r28,r3,VPN_SHIFT
- */
- rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT_1T - VPN_SHIFT)
- or r29,r28,r29
-
- /*
- * Calculate hash value for primary slot and
- * store it in r28 for 1T segment
- * r3 = va, r5 = vsid
- */
- sldi r28,r5,25 /* vsid << 25 */
- /* r0 = (va >> 12) & ((1ul << (40 - 12)) -1) */
- rldicl r0,r3,64-12,36
- xor r28,r28,r5 /* vsid ^ ( vsid << 25) */
- xor r28,r28,r0 /* hash */
-
- /* Convert linux PTE bits into HW equivalents */
-4:
-#ifdef CONFIG_PPC_SUBPAGE_PROT
- andc r10,r30,r10
- andi. r3,r10,0x1fe /* Get basic set of flags */
- rlwinm r0,r10,32-9+1,30,30 /* _PAGE_RW -> _PAGE_USER (r0) */
-#else
- andi. r3,r30,0x1fe /* Get basic set of flags */
- rlwinm r0,r30,32-9+1,30,30 /* _PAGE_RW -> _PAGE_USER (r0) */
-#endif
- xori r3,r3,HPTE_R_N /* _PAGE_EXEC -> NOEXEC */
- rlwinm r4,r30,32-7+1,30,30 /* _PAGE_DIRTY -> _PAGE_USER (r4) */
- and r0,r0,r4 /* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/
- andc r0,r3,r0 /* r0 = pte & ~r0 */
- rlwimi r3,r0,32-1,31,31 /* Insert result into PP lsb */
- ori r3,r3,HPTE_R_C /* Always add "C" bit for perf. */
-
- /* We eventually do the icache sync here (maybe inline that
- * code rather than call a C function...)
- */
-BEGIN_FTR_SECTION
- mr r4,r30
- mr r5,r7
- bl .hash_page_do_lazy_icache
-END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
-
- /* At this point, r3 contains new PP bits, save them in
- * place of "access" in the param area (sic)
- */
- std r3,STK_PARAM(R4)(r1)
-
- /* Get htab_hash_mask */
- ld r4,htab_hash_mask@got(2)
- ld r27,0(r4) /* htab_hash_mask -> r27 */
-
- /* Check if we may already be in the hashtable, in this case, we
- * go to out-of-line code to try to modify the HPTE. We look for
- * the bit at (1 >> (index + 32))
- */
- rldicl. r0,r31,64-12,48
- li r26,0 /* Default hidx */
- beq htab_insert_pte
-
- /*
- * Check if the pte was already inserted into the hash table
- * as a 64k HW page, and invalidate the 64k HPTE if so.
- */
- andis. r0,r31,_PAGE_COMBO@h
- beq htab_inval_old_hpte
-
- ld r6,STK_PARAM(R6)(r1)
- ori r26,r6,PTE_PAGE_HIDX_OFFSET /* Load the hidx mask. */
- ld r26,0(r26)
- addi r5,r25,36 /* Check actual HPTE_SUB bit, this */
- rldcr. r0,r31,r5,0 /* must match pgtable.h definition */
- bne htab_modify_pte
-
-htab_insert_pte:
- /* real page number in r5, PTE RPN value + index */
- andis. r0,r31,_PAGE_4K_PFN@h
- srdi r5,r31,PTE_RPN_SHIFT
- bne- htab_special_pfn
- sldi r5,r5,PAGE_SHIFT-HW_PAGE_SHIFT
- add r5,r5,r25
-htab_special_pfn:
- sldi r5,r5,HW_PAGE_SHIFT
-
- /* Calculate primary group hash */
- and r0,r28,r27
- rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */
-
- /* Call ppc_md.hpte_insert */
- ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */
- mr r4,r29 /* Retrieve vpn */
- li r7,0 /* !bolted, !secondary */
- li r8,MMU_PAGE_4K /* page size */
- li r9,MMU_PAGE_4K /* actual page size */
- ld r10,STK_PARAM(R9)(r1) /* segment size */
-_GLOBAL(htab_call_hpte_insert1)
- bl . /* patched by htab_finish_init() */
- cmpdi 0,r3,0
- bge htab_pte_insert_ok /* Insertion successful */
- cmpdi 0,r3,-2 /* Critical failure */
- beq- htab_pte_insert_failure
-
- /* Now try secondary slot */
-
- /* real page number in r5, PTE RPN value + index */
- andis. r0,r31,_PAGE_4K_PFN@h
- srdi r5,r31,PTE_RPN_SHIFT
- bne- 3f
- sldi r5,r5,PAGE_SHIFT-HW_PAGE_SHIFT
- add r5,r5,r25
-3: sldi r5,r5,HW_PAGE_SHIFT
-
- /* Calculate secondary group hash */
- andc r0,r27,r28
- rldicr r3,r0,3,63-3 /* r0 = (~hash & mask) << 3 */
-
- /* Call ppc_md.hpte_insert */
- ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */
- mr r4,r29 /* Retrieve vpn */
- li r7,HPTE_V_SECONDARY /* !bolted, secondary */
- li r8,MMU_PAGE_4K /* page size */
- li r9,MMU_PAGE_4K /* actual page size */
- ld r10,STK_PARAM(R9)(r1) /* segment size */
-_GLOBAL(htab_call_hpte_insert2)
- bl . /* patched by htab_finish_init() */
- cmpdi 0,r3,0
- bge+ htab_pte_insert_ok /* Insertion successful */
- cmpdi 0,r3,-2 /* Critical failure */
- beq- htab_pte_insert_failure
-
- /* Both are full, we need to evict something */
- mftb r0
- /* Pick a random group based on TB */
- andi. r0,r0,1
- mr r5,r28
- bne 2f
- not r5,r5
-2: and r0,r5,r27
- rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */
- /* Call ppc_md.hpte_remove */
-_GLOBAL(htab_call_hpte_remove)
- bl . /* patched by htab_finish_init() */
-
- /* Try all again */
- b htab_insert_pte
-
- /*
- * Call out to C code to invalidate an 64k HW HPTE that is
- * useless now that the segment has been switched to 4k pages.
- */
-htab_inval_old_hpte:
- mr r3,r29 /* vpn */
- mr r4,r31 /* PTE.pte */
- li r5,0 /* PTE.hidx */
- li r6,MMU_PAGE_64K /* psize */
- ld r7,STK_PARAM(R9)(r1) /* ssize */
- ld r8,STK_PARAM(R8)(r1) /* local */
- bl .flush_hash_page
- /* Clear out _PAGE_HPTE_SUB bits in the new linux PTE */
- lis r0,_PAGE_HPTE_SUB@h
- ori r0,r0,_PAGE_HPTE_SUB@l
- andc r30,r30,r0
- b htab_insert_pte
-
-htab_bail_ok:
- li r3,0
- b htab_bail
-
-htab_pte_insert_ok:
- /* Insert slot number & secondary bit in PTE second half,
- * clear _PAGE_BUSY and set approriate HPTE slot bit
- */
- ld r6,STK_PARAM(R6)(r1)
- li r0,_PAGE_BUSY
- andc r30,r30,r0
- /* HPTE SUB bit */
- li r0,1
- subfic r5,r25,27 /* Must match bit position in */
- sld r0,r0,r5 /* pgtable.h */
- or r30,r30,r0
- /* hindx */
- sldi r5,r25,2
- sld r3,r3,r5
- li r4,0xf
- sld r4,r4,r5
- andc r26,r26,r4
- or r26,r26,r3
- ori r5,r6,PTE_PAGE_HIDX_OFFSET
- std r26,0(r5)
- lwsync
- std r30,0(r6)
- li r3, 0
-htab_bail:
- ld r25,STK_REG(R25)(r1)
- ld r26,STK_REG(R26)(r1)
- ld r27,STK_REG(R27)(r1)
- ld r28,STK_REG(R28)(r1)
- ld r29,STK_REG(R29)(r1)
- ld r30,STK_REG(R30)(r1)
- ld r31,STK_REG(R31)(r1)
- addi r1,r1,STACKFRAMESIZE
- ld r0,16(r1)
- mtlr r0
- blr
-
-htab_modify_pte:
- /* Keep PP bits in r4 and slot idx from the PTE around in r3 */
- mr r4,r3
- sldi r5,r25,2
- srd r3,r26,r5
-
- /* Secondary group ? if yes, get a inverted hash value */
- mr r5,r28
- andi. r0,r3,0x8 /* page secondary ? */
- beq 1f
- not r5,r5
-1: andi. r3,r3,0x7 /* extract idx alone */
-
- /* Calculate proper slot value for ppc_md.hpte_updatepp */
- and r0,r5,r27
- rldicr r0,r0,3,63-3 /* r0 = (hash & mask) << 3 */
- add r3,r0,r3 /* add slot idx */
-
- /* Call ppc_md.hpte_updatepp */
- mr r5,r29 /* vpn */
- li r6,MMU_PAGE_4K /* base page size */
- li r7,MMU_PAGE_4K /* actual page size */
- ld r8,STK_PARAM(R9)(r1) /* segment size */
- ld r9,STK_PARAM(R8)(r1) /* get "local" param */
-_GLOBAL(htab_call_hpte_updatepp)
- bl . /* patched by htab_finish_init() */
-
- /* if we failed because typically the HPTE wasn't really here
- * we try an insertion.
- */
- cmpdi 0,r3,-1
- beq- htab_insert_pte
-
- /* Clear the BUSY bit and Write out the PTE */
- li r0,_PAGE_BUSY
- andc r30,r30,r0
- ld r6,STK_PARAM(R6)(r1)
- std r30,0(r6)
- li r3,0
- b htab_bail
-
-htab_wrong_access:
- /* Bail out clearing reservation */
- stdcx. r31,0,r6
- li r3,1
- b htab_bail
-
-htab_pte_insert_failure:
- /* Bail out restoring old PTE */
- ld r6,STK_PARAM(R6)(r1)
- std r31,0(r6)
- li r3,-1
- b htab_bail
-
-#endif /* CONFIG_PPC_64K_PAGES */
-
-#ifdef CONFIG_PPC_HAS_HASH_64K
-
-/*****************************************************************************
- * *
- * 64K SW & 64K HW in a 64K segment pages implementation *
- * *
- *****************************************************************************/
-
-_GLOBAL(__hash_page_64K)
- mflr r0
- std r0,16(r1)
- stdu r1,-STACKFRAMESIZE(r1)
- /* Save all params that we need after a function call */
- std r6,STK_PARAM(R6)(r1)
- std r8,STK_PARAM(R8)(r1)
- std r9,STK_PARAM(R9)(r1)
-
- /* Save non-volatile registers.
- * r31 will hold "old PTE"
- * r30 is "new PTE"
- * r29 is vpn
- * r28 is a hash value
- * r27 is hashtab mask (maybe dynamic patched instead ?)
- */
- std r27,STK_REG(R27)(r1)
- std r28,STK_REG(R28)(r1)
- std r29,STK_REG(R29)(r1)
- std r30,STK_REG(R30)(r1)
- std r31,STK_REG(R31)(r1)
-
- /* Step 1:
- *
- * Check permissions, atomically mark the linux PTE busy
- * and hashed.
- */
-1:
- ldarx r31,0,r6
- /* Check access rights (access & ~(pte_val(*ptep))) */
- andc. r0,r4,r31
- bne- ht64_wrong_access
- /* Check if PTE is busy */
- andi. r0,r31,_PAGE_BUSY
- /* If so, just bail out and refault if needed. Someone else
- * is changing this PTE anyway and might hash it.
- */
- bne- ht64_bail_ok
-BEGIN_FTR_SECTION
- /* Check if PTE has the cache-inhibit bit set */
- andi. r0,r31,_PAGE_NO_CACHE
- /* If so, bail out and refault as a 4k page */
- bne- ht64_bail_ok
-END_MMU_FTR_SECTION_IFCLR(MMU_FTR_CI_LARGE_PAGE)
- /* Prepare new PTE value (turn access RW into DIRTY, then
- * add BUSY and ACCESSED)
- */
- rlwinm r30,r4,32-9+7,31-7,31-7 /* _PAGE_RW -> _PAGE_DIRTY */
- or r30,r30,r31
- ori r30,r30,_PAGE_BUSY | _PAGE_ACCESSED
- /* Write the linux PTE atomically (setting busy) */
- stdcx. r30,0,r6
- bne- 1b
- isync
-
- /* Step 2:
- *
- * Insert/Update the HPTE in the hash table. At this point,
- * r4 (access) is re-useable, we use it for the new HPTE flags
- */
-
-BEGIN_FTR_SECTION
- cmpdi r9,0 /* check segment size */
- bne 3f
-END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
- /* Calc vpn and put it in r29 */
- sldi r29,r5,SID_SHIFT - VPN_SHIFT
- rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT - VPN_SHIFT)
- or r29,r28,r29
-
- /* Calculate hash value for primary slot and store it in r28
- * r3 = va, r5 = vsid
- * r0 = (va >> 16) & ((1ul << (28 - 16)) -1)
- */
- rldicl r0,r3,64-16,52
- xor r28,r5,r0 /* hash */
- b 4f
-
-3: /* Calc vpn and put it in r29 */
- sldi r29,r5,SID_SHIFT_1T - VPN_SHIFT
- rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT_1T - VPN_SHIFT)
- or r29,r28,r29
- /*
- * calculate hash value for primary slot and
- * store it in r28 for 1T segment
- * r3 = va, r5 = vsid
- */
- sldi r28,r5,25 /* vsid << 25 */
- /* r0 = (va >> 16) & ((1ul << (40 - 16)) -1) */
- rldicl r0,r3,64-16,40
- xor r28,r28,r5 /* vsid ^ ( vsid << 25) */
- xor r28,r28,r0 /* hash */
-
- /* Convert linux PTE bits into HW equivalents */
-4: andi. r3,r30,0x1fe /* Get basic set of flags */
- xori r3,r3,HPTE_R_N /* _PAGE_EXEC -> NOEXEC */
- rlwinm r0,r30,32-9+1,30,30 /* _PAGE_RW -> _PAGE_USER (r0) */
- rlwinm r4,r30,32-7+1,30,30 /* _PAGE_DIRTY -> _PAGE_USER (r4) */
- and r0,r0,r4 /* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/
- andc r0,r30,r0 /* r0 = pte & ~r0 */
- rlwimi r3,r0,32-1,31,31 /* Insert result into PP lsb */
- ori r3,r3,HPTE_R_C /* Always add "C" bit for perf. */
-
- /* We eventually do the icache sync here (maybe inline that
- * code rather than call a C function...)
- */
-BEGIN_FTR_SECTION
- mr r4,r30
- mr r5,r7
- bl .hash_page_do_lazy_icache
-END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
-
- /* At this point, r3 contains new PP bits, save them in
- * place of "access" in the param area (sic)
- */
- std r3,STK_PARAM(R4)(r1)
-
- /* Get htab_hash_mask */
- ld r4,htab_hash_mask@got(2)
- ld r27,0(r4) /* htab_hash_mask -> r27 */
-
- /* Check if we may already be in the hashtable, in this case, we
- * go to out-of-line code to try to modify the HPTE
- */
- rldicl. r0,r31,64-12,48
- bne ht64_modify_pte
-
-ht64_insert_pte:
- /* Clear hpte bits in new pte (we also clear BUSY btw) and
- * add _PAGE_HPTE_SUB0
- */
- lis r0,_PAGE_HPTEFLAGS@h
- ori r0,r0,_PAGE_HPTEFLAGS@l
- andc r30,r30,r0
-#ifdef CONFIG_PPC_64K_PAGES
- oris r30,r30,_PAGE_HPTE_SUB0@h
-#else
- ori r30,r30,_PAGE_HASHPTE
-#endif
- /* Phyical address in r5 */
- rldicl r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT
- sldi r5,r5,PAGE_SHIFT
-
- /* Calculate primary group hash */
- and r0,r28,r27
- rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */
-
- /* Call ppc_md.hpte_insert */
- ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */
- mr r4,r29 /* Retrieve vpn */
- li r7,0 /* !bolted, !secondary */
- li r8,MMU_PAGE_64K
- li r9,MMU_PAGE_64K /* actual page size */
- ld r10,STK_PARAM(R9)(r1) /* segment size */
-_GLOBAL(ht64_call_hpte_insert1)
- bl . /* patched by htab_finish_init() */
- cmpdi 0,r3,0
- bge ht64_pte_insert_ok /* Insertion successful */
- cmpdi 0,r3,-2 /* Critical failure */
- beq- ht64_pte_insert_failure
-
- /* Now try secondary slot */
-
- /* Phyical address in r5 */
- rldicl r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT
- sldi r5,r5,PAGE_SHIFT
-
- /* Calculate secondary group hash */
- andc r0,r27,r28
- rldicr r3,r0,3,63-3 /* r0 = (~hash & mask) << 3 */
-
- /* Call ppc_md.hpte_insert */
- ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */
- mr r4,r29 /* Retrieve vpn */
- li r7,HPTE_V_SECONDARY /* !bolted, secondary */
- li r8,MMU_PAGE_64K
- li r9,MMU_PAGE_64K /* actual page size */
- ld r10,STK_PARAM(R9)(r1) /* segment size */
-_GLOBAL(ht64_call_hpte_insert2)
- bl . /* patched by htab_finish_init() */
- cmpdi 0,r3,0
- bge+ ht64_pte_insert_ok /* Insertion successful */
- cmpdi 0,r3,-2 /* Critical failure */
- beq- ht64_pte_insert_failure
-
- /* Both are full, we need to evict something */
- mftb r0
- /* Pick a random group based on TB */
- andi. r0,r0,1
- mr r5,r28
- bne 2f
- not r5,r5
-2: and r0,r5,r27
- rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */
- /* Call ppc_md.hpte_remove */
-_GLOBAL(ht64_call_hpte_remove)
- bl . /* patched by htab_finish_init() */
-
- /* Try all again */
- b ht64_insert_pte
-
-ht64_bail_ok:
- li r3,0
- b ht64_bail
-
-ht64_pte_insert_ok:
- /* Insert slot number & secondary bit in PTE */
- rldimi r30,r3,12,63-15
-
- /* Write out the PTE with a normal write
- * (maybe add eieio may be good still ?)
- */
-ht64_write_out_pte:
- ld r6,STK_PARAM(R6)(r1)
- std r30,0(r6)
- li r3, 0
-ht64_bail:
- ld r27,STK_REG(R27)(r1)
- ld r28,STK_REG(R28)(r1)
- ld r29,STK_REG(R29)(r1)
- ld r30,STK_REG(R30)(r1)
- ld r31,STK_REG(R31)(r1)
- addi r1,r1,STACKFRAMESIZE
- ld r0,16(r1)
- mtlr r0
- blr
-
-ht64_modify_pte:
- /* Keep PP bits in r4 and slot idx from the PTE around in r3 */
- mr r4,r3
- rlwinm r3,r31,32-12,29,31
-
- /* Secondary group ? if yes, get a inverted hash value */
- mr r5,r28
- andi. r0,r31,_PAGE_F_SECOND
- beq 1f
- not r5,r5
-1:
- /* Calculate proper slot value for ppc_md.hpte_updatepp */
- and r0,r5,r27
- rldicr r0,r0,3,63-3 /* r0 = (hash & mask) << 3 */
- add r3,r0,r3 /* add slot idx */
-
- /* Call ppc_md.hpte_updatepp */
- mr r5,r29 /* vpn */
- li r6,MMU_PAGE_64K /* base page size */
- li r7,MMU_PAGE_64K /* actual page size */
- ld r8,STK_PARAM(R9)(r1) /* segment size */
- ld r9,STK_PARAM(R8)(r1) /* get "local" param */
-_GLOBAL(ht64_call_hpte_updatepp)
- bl . /* patched by htab_finish_init() */
-
- /* if we failed because typically the HPTE wasn't really here
- * we try an insertion.
- */
- cmpdi 0,r3,-1
- beq- ht64_insert_pte
-
- /* Clear the BUSY bit and Write out the PTE */
- li r0,_PAGE_BUSY
- andc r30,r30,r0
- b ht64_write_out_pte
-
-ht64_wrong_access:
- /* Bail out clearing reservation */
- stdcx. r31,0,r6
- li r3,1
- b ht64_bail
-
-ht64_pte_insert_failure:
- /* Bail out restoring old PTE */
- ld r6,STK_PARAM(R6)(r1)
- std r31,0(r6)
- li r3,-1
- b ht64_bail
-
-
-#endif /* CONFIG_PPC_HAS_HASH_64K */
-
-
-/*****************************************************************************
- * *
- * Huge pages implementation is in hugetlbpage.c *
- * *
- *****************************************************************************/
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
deleted file mode 100644
index 6ecc38bd5b24..000000000000
--- a/arch/powerpc/mm/hash_utils_64.c
+++ /dev/null
@@ -1,1436 +0,0 @@
-/*
- * PowerPC64 port by Mike Corrigan and Dave Engebretsen
- * {mikejc|engebret}@us.ibm.com
- *
- * Copyright (c) 2000 Mike Corrigan <mikejc@us.ibm.com>
- *
- * SMP scalability work:
- * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
- *
- * Module name: htab.c
- *
- * Description:
- * PowerPC Hashed Page Table functions
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#undef DEBUG
-#undef DEBUG_LOW
-
-#include <linux/spinlock.h>
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/proc_fs.h>
-#include <linux/stat.h>
-#include <linux/sysctl.h>
-#include <linux/export.h>
-#include <linux/ctype.h>
-#include <linux/cache.h>
-#include <linux/init.h>
-#include <linux/signal.h>
-#include <linux/memblock.h>
-#include <linux/context_tracking.h>
-
-#include <asm/processor.h>
-#include <asm/pgtable.h>
-#include <asm/mmu.h>
-#include <asm/mmu_context.h>
-#include <asm/page.h>
-#include <asm/types.h>
-#include <asm/uaccess.h>
-#include <asm/machdep.h>
-#include <asm/prom.h>
-#include <asm/tlbflush.h>
-#include <asm/io.h>
-#include <asm/eeh.h>
-#include <asm/tlb.h>
-#include <asm/cacheflush.h>
-#include <asm/cputable.h>
-#include <asm/sections.h>
-#include <asm/spu.h>
-#include <asm/udbg.h>
-#include <asm/code-patching.h>
-#include <asm/fadump.h>
-#include <asm/firmware.h>
-#include <asm/tm.h>
-
-#ifdef DEBUG
-#define DBG(fmt...) udbg_printf(fmt)
-#else
-#define DBG(fmt...)
-#endif
-
-#ifdef DEBUG_LOW
-#define DBG_LOW(fmt...) udbg_printf(fmt)
-#else
-#define DBG_LOW(fmt...)
-#endif
-
-#define KB (1024)
-#define MB (1024*KB)
-#define GB (1024L*MB)
-
-/*
- * Note: pte --> Linux PTE
- * HPTE --> PowerPC Hashed Page Table Entry
- *
- * Execution context:
- * htab_initialize is called with the MMU off (of course), but
- * the kernel has been copied down to zero so it can directly
- * reference global data. At this point it is very difficult
- * to print debug info.
- *
- */
-
-#ifdef CONFIG_U3_DART
-extern unsigned long dart_tablebase;
-#endif /* CONFIG_U3_DART */
-
-static unsigned long _SDR1;
-struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
-
-struct hash_pte *htab_address;
-unsigned long htab_size_bytes;
-unsigned long htab_hash_mask;
-EXPORT_SYMBOL_GPL(htab_hash_mask);
-int mmu_linear_psize = MMU_PAGE_4K;
-int mmu_virtual_psize = MMU_PAGE_4K;
-int mmu_vmalloc_psize = MMU_PAGE_4K;
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-int mmu_vmemmap_psize = MMU_PAGE_4K;
-#endif
-int mmu_io_psize = MMU_PAGE_4K;
-int mmu_kernel_ssize = MMU_SEGSIZE_256M;
-int mmu_highuser_ssize = MMU_SEGSIZE_256M;
-u16 mmu_slb_size = 64;
-EXPORT_SYMBOL_GPL(mmu_slb_size);
-#ifdef CONFIG_PPC_64K_PAGES
-int mmu_ci_restrictions;
-#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
-static u8 *linear_map_hash_slots;
-static unsigned long linear_map_hash_count;
-static DEFINE_SPINLOCK(linear_map_hash_lock);
-#endif /* CONFIG_DEBUG_PAGEALLOC */
-
-/* There are definitions of page sizes arrays to be used when none
- * is provided by the firmware.
- */
-
-/* Pre-POWER4 CPUs (4k pages only)
- */
-static struct mmu_psize_def mmu_psize_defaults_old[] = {
- [MMU_PAGE_4K] = {
- .shift = 12,
- .sllp = 0,
- .penc = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1},
- .avpnm = 0,
- .tlbiel = 0,
- },
-};
-
-/* POWER4, GPUL, POWER5
- *
- * Support for 16Mb large pages
- */
-static struct mmu_psize_def mmu_psize_defaults_gp[] = {
- [MMU_PAGE_4K] = {
- .shift = 12,
- .sllp = 0,
- .penc = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1},
- .avpnm = 0,
- .tlbiel = 1,
- },
- [MMU_PAGE_16M] = {
- .shift = 24,
- .sllp = SLB_VSID_L,
- .penc = {[0 ... MMU_PAGE_16M - 1] = -1, [MMU_PAGE_16M] = 0,
- [MMU_PAGE_16M + 1 ... MMU_PAGE_COUNT - 1] = -1 },
- .avpnm = 0x1UL,
- .tlbiel = 0,
- },
-};
-
-static unsigned long htab_convert_pte_flags(unsigned long pteflags)
-{
- unsigned long rflags = pteflags & 0x1fa;
-
- /* _PAGE_EXEC -> NOEXEC */
- if ((pteflags & _PAGE_EXEC) == 0)
- rflags |= HPTE_R_N;
-
- /* PP bits. PAGE_USER is already PP bit 0x2, so we only
- * need to add in 0x1 if it's a read-only user page
- */
- if ((pteflags & _PAGE_USER) && !((pteflags & _PAGE_RW) &&
- (pteflags & _PAGE_DIRTY)))
- rflags |= 1;
-
- /* Always add C */
- return rflags | HPTE_R_C;
-}
-
-int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
- unsigned long pstart, unsigned long prot,
- int psize, int ssize)
-{
- unsigned long vaddr, paddr;
- unsigned int step, shift;
- int ret = 0;
-
- shift = mmu_psize_defs[psize].shift;
- step = 1 << shift;
-
- prot = htab_convert_pte_flags(prot);
-
- DBG("htab_bolt_mapping(%lx..%lx -> %lx (%lx,%d,%d)\n",
- vstart, vend, pstart, prot, psize, ssize);
-
- for (vaddr = vstart, paddr = pstart; vaddr < vend;
- vaddr += step, paddr += step) {
- unsigned long hash, hpteg;
- unsigned long vsid = get_kernel_vsid(vaddr, ssize);
- unsigned long vpn = hpt_vpn(vaddr, vsid, ssize);
- unsigned long tprot = prot;
-
- /*
- * If we hit a bad address return error.
- */
- if (!vsid)
- return -1;
- /* Make kernel text executable */
- if (overlaps_kernel_text(vaddr, vaddr + step))
- tprot &= ~HPTE_R_N;
-
- hash = hpt_hash(vpn, shift, ssize);
- hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
-
- BUG_ON(!ppc_md.hpte_insert);
- ret = ppc_md.hpte_insert(hpteg, vpn, paddr, tprot,
- HPTE_V_BOLTED, psize, psize, ssize);
-
- if (ret < 0)
- break;
-#ifdef CONFIG_DEBUG_PAGEALLOC
- if ((paddr >> PAGE_SHIFT) < linear_map_hash_count)
- linear_map_hash_slots[paddr >> PAGE_SHIFT] = ret | 0x80;
-#endif /* CONFIG_DEBUG_PAGEALLOC */
- }
- return ret < 0 ? ret : 0;
-}
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-static int htab_remove_mapping(unsigned long vstart, unsigned long vend,
- int psize, int ssize)
-{
- unsigned long vaddr;
- unsigned int step, shift;
-
- shift = mmu_psize_defs[psize].shift;
- step = 1 << shift;
-
- if (!ppc_md.hpte_removebolted) {
- printk(KERN_WARNING "Platform doesn't implement "
- "hpte_removebolted\n");
- return -EINVAL;
- }
-
- for (vaddr = vstart; vaddr < vend; vaddr += step)
- ppc_md.hpte_removebolted(vaddr, psize, ssize);
-
- return 0;
-}
-#endif /* CONFIG_MEMORY_HOTPLUG */
-
-static int __init htab_dt_scan_seg_sizes(unsigned long node,
- const char *uname, int depth,
- void *data)
-{
- char *type = of_get_flat_dt_prop(node, "device_type", NULL);
- u32 *prop;
- unsigned long size = 0;
-
- /* We are scanning "cpu" nodes only */
- if (type == NULL || strcmp(type, "cpu") != 0)
- return 0;
-
- prop = (u32 *)of_get_flat_dt_prop(node, "ibm,processor-segment-sizes",
- &size);
- if (prop == NULL)
- return 0;
- for (; size >= 4; size -= 4, ++prop) {
- if (prop[0] == 40) {
- DBG("1T segment support detected\n");
- cur_cpu_spec->mmu_features |= MMU_FTR_1T_SEGMENT;
- return 1;
- }
- }
- cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
- return 0;
-}
-
-static void __init htab_init_seg_sizes(void)
-{
- of_scan_flat_dt(htab_dt_scan_seg_sizes, NULL);
-}
-
-static int __init get_idx_from_shift(unsigned int shift)
-{
- int idx = -1;
-
- switch (shift) {
- case 0xc:
- idx = MMU_PAGE_4K;
- break;
- case 0x10:
- idx = MMU_PAGE_64K;
- break;
- case 0x14:
- idx = MMU_PAGE_1M;
- break;
- case 0x18:
- idx = MMU_PAGE_16M;
- break;
- case 0x22:
- idx = MMU_PAGE_16G;
- break;
- }
- return idx;
-}
-
-static int __init htab_dt_scan_page_sizes(unsigned long node,
- const char *uname, int depth,
- void *data)
-{
- char *type = of_get_flat_dt_prop(node, "device_type", NULL);
- u32 *prop;
- unsigned long size = 0;
-
- /* We are scanning "cpu" nodes only */
- if (type == NULL || strcmp(type, "cpu") != 0)
- return 0;
-
- prop = (u32 *)of_get_flat_dt_prop(node,
- "ibm,segment-page-sizes", &size);
- if (prop != NULL) {
- pr_info("Page sizes from device-tree:\n");
- size /= 4;
- cur_cpu_spec->mmu_features &= ~(MMU_FTR_16M_PAGE);
- while(size > 0) {
- unsigned int base_shift = prop[0];
- unsigned int slbenc = prop[1];
- unsigned int lpnum = prop[2];
- struct mmu_psize_def *def;
- int idx, base_idx;
-
- size -= 3; prop += 3;
- base_idx = get_idx_from_shift(base_shift);
- if (base_idx < 0) {
- /*
- * skip the pte encoding also
- */
- prop += lpnum * 2; size -= lpnum * 2;
- continue;
- }
- def = &mmu_psize_defs[base_idx];
- if (base_idx == MMU_PAGE_16M)
- cur_cpu_spec->mmu_features |= MMU_FTR_16M_PAGE;
-
- def->shift = base_shift;
- if (base_shift <= 23)
- def->avpnm = 0;
- else
- def->avpnm = (1 << (base_shift - 23)) - 1;
- def->sllp = slbenc;
- /*
- * We don't know for sure what's up with tlbiel, so
- * for now we only set it for 4K and 64K pages
- */
- if (base_idx == MMU_PAGE_4K || base_idx == MMU_PAGE_64K)
- def->tlbiel = 1;
- else
- def->tlbiel = 0;
-
- while (size > 0 && lpnum) {
- unsigned int shift = prop[0];
- int penc = prop[1];
-
- prop += 2; size -= 2;
- lpnum--;
-
- idx = get_idx_from_shift(shift);
- if (idx < 0)
- continue;
-
- if (penc == -1)
- pr_err("Invalid penc for base_shift=%d "
- "shift=%d\n", base_shift, shift);
-
- def->penc[idx] = penc;
- pr_info("base_shift=%d: shift=%d, sllp=0x%04lx,"
- " avpnm=0x%08lx, tlbiel=%d, penc=%d\n",
- base_shift, shift, def->sllp,
- def->avpnm, def->tlbiel, def->penc[idx]);
- }
- }
- return 1;
- }
- return 0;
-}
-
-#ifdef CONFIG_HUGETLB_PAGE
-/* Scan for 16G memory blocks that have been set aside for huge pages
- * and reserve those blocks for 16G huge pages.
- */
-static int __init htab_dt_scan_hugepage_blocks(unsigned long node,
- const char *uname, int depth,
- void *data) {
- char *type = of_get_flat_dt_prop(node, "device_type", NULL);
- unsigned long *addr_prop;
- u32 *page_count_prop;
- unsigned int expected_pages;
- long unsigned int phys_addr;
- long unsigned int block_size;
-
- /* We are scanning "memory" nodes only */
- if (type == NULL || strcmp(type, "memory") != 0)
- return 0;
-
- /* This property is the log base 2 of the number of virtual pages that
- * will represent this memory block. */
- page_count_prop = of_get_flat_dt_prop(node, "ibm,expected#pages", NULL);
- if (page_count_prop == NULL)
- return 0;
- expected_pages = (1 << page_count_prop[0]);
- addr_prop = of_get_flat_dt_prop(node, "reg", NULL);
- if (addr_prop == NULL)
- return 0;
- phys_addr = addr_prop[0];
- block_size = addr_prop[1];
- if (block_size != (16 * GB))
- return 0;
- printk(KERN_INFO "Huge page(16GB) memory: "
- "addr = 0x%lX size = 0x%lX pages = %d\n",
- phys_addr, block_size, expected_pages);
- if (phys_addr + (16 * GB) <= memblock_end_of_DRAM()) {
- memblock_reserve(phys_addr, block_size * expected_pages);
- add_gpage(phys_addr, block_size, expected_pages);
- }
- return 0;
-}
-#endif /* CONFIG_HUGETLB_PAGE */
-
-static void mmu_psize_set_default_penc(void)
-{
- int bpsize, apsize;
- for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++)
- for (apsize = 0; apsize < MMU_PAGE_COUNT; apsize++)
- mmu_psize_defs[bpsize].penc[apsize] = -1;
-}
-
-static void __init htab_init_page_sizes(void)
-{
- int rc;
-
- /* se the invalid penc to -1 */
- mmu_psize_set_default_penc();
-
- /* Default to 4K pages only */
- memcpy(mmu_psize_defs, mmu_psize_defaults_old,
- sizeof(mmu_psize_defaults_old));
-
- /*
- * Try to find the available page sizes in the device-tree
- */
- rc = of_scan_flat_dt(htab_dt_scan_page_sizes, NULL);
- if (rc != 0) /* Found */
- goto found;
-
- /*
- * Not in the device-tree, let's fallback on known size
- * list for 16M capable GP & GR
- */
- if (mmu_has_feature(MMU_FTR_16M_PAGE))
- memcpy(mmu_psize_defs, mmu_psize_defaults_gp,
- sizeof(mmu_psize_defaults_gp));
- found:
-#ifndef CONFIG_DEBUG_PAGEALLOC
- /*
- * Pick a size for the linear mapping. Currently, we only support
- * 16M, 1M and 4K which is the default
- */
- if (mmu_psize_defs[MMU_PAGE_16M].shift)
- mmu_linear_psize = MMU_PAGE_16M;
- else if (mmu_psize_defs[MMU_PAGE_1M].shift)
- mmu_linear_psize = MMU_PAGE_1M;
-#endif /* CONFIG_DEBUG_PAGEALLOC */
-
-#ifdef CONFIG_PPC_64K_PAGES
- /*
- * Pick a size for the ordinary pages. Default is 4K, we support
- * 64K for user mappings and vmalloc if supported by the processor.
- * We only use 64k for ioremap if the processor
- * (and firmware) support cache-inhibited large pages.
- * If not, we use 4k and set mmu_ci_restrictions so that
- * hash_page knows to switch processes that use cache-inhibited
- * mappings to 4k pages.
- */
- if (mmu_psize_defs[MMU_PAGE_64K].shift) {
- mmu_virtual_psize = MMU_PAGE_64K;
- mmu_vmalloc_psize = MMU_PAGE_64K;
- if (mmu_linear_psize == MMU_PAGE_4K)
- mmu_linear_psize = MMU_PAGE_64K;
- if (mmu_has_feature(MMU_FTR_CI_LARGE_PAGE)) {
- /*
- * Don't use 64k pages for ioremap on pSeries, since
- * that would stop us accessing the HEA ethernet.
- */
- if (!machine_is(pseries))
- mmu_io_psize = MMU_PAGE_64K;
- } else
- mmu_ci_restrictions = 1;
- }
-#endif /* CONFIG_PPC_64K_PAGES */
-
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
- /* We try to use 16M pages for vmemmap if that is supported
- * and we have at least 1G of RAM at boot
- */
- if (mmu_psize_defs[MMU_PAGE_16M].shift &&
- memblock_phys_mem_size() >= 0x40000000)
- mmu_vmemmap_psize = MMU_PAGE_16M;
- else if (mmu_psize_defs[MMU_PAGE_64K].shift)
- mmu_vmemmap_psize = MMU_PAGE_64K;
- else
- mmu_vmemmap_psize = MMU_PAGE_4K;
-#endif /* CONFIG_SPARSEMEM_VMEMMAP */
-
- printk(KERN_DEBUG "Page orders: linear mapping = %d, "
- "virtual = %d, io = %d"
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
- ", vmemmap = %d"
-#endif
- "\n",
- mmu_psize_defs[mmu_linear_psize].shift,
- mmu_psize_defs[mmu_virtual_psize].shift,
- mmu_psize_defs[mmu_io_psize].shift
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
- ,mmu_psize_defs[mmu_vmemmap_psize].shift
-#endif
- );
-
-#ifdef CONFIG_HUGETLB_PAGE
- /* Reserve 16G huge page memory sections for huge pages */
- of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL);
-#endif /* CONFIG_HUGETLB_PAGE */
-}
-
-static int __init htab_dt_scan_pftsize(unsigned long node,
- const char *uname, int depth,
- void *data)
-{
- char *type = of_get_flat_dt_prop(node, "device_type", NULL);
- u32 *prop;
-
- /* We are scanning "cpu" nodes only */
- if (type == NULL || strcmp(type, "cpu") != 0)
- return 0;
-
- prop = (u32 *)of_get_flat_dt_prop(node, "ibm,pft-size", NULL);
- if (prop != NULL) {
- /* pft_size[0] is the NUMA CEC cookie */
- ppc64_pft_size = prop[1];
- return 1;
- }
- return 0;
-}
-
-static unsigned long __init htab_get_table_size(void)
-{
- unsigned long mem_size, rnd_mem_size, pteg_count, psize;
-
- /* If hash size isn't already provided by the platform, we try to
- * retrieve it from the device-tree. If it's not there neither, we
- * calculate it now based on the total RAM size
- */
- if (ppc64_pft_size == 0)
- of_scan_flat_dt(htab_dt_scan_pftsize, NULL);
- if (ppc64_pft_size)
- return 1UL << ppc64_pft_size;
-
- /* round mem_size up to next power of 2 */
- mem_size = memblock_phys_mem_size();
- rnd_mem_size = 1UL << __ilog2(mem_size);
- if (rnd_mem_size < mem_size)
- rnd_mem_size <<= 1;
-
- /* # pages / 2 */
- psize = mmu_psize_defs[mmu_virtual_psize].shift;
- pteg_count = max(rnd_mem_size >> (psize + 1), 1UL << 11);
-
- return pteg_count << 7;
-}
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-int create_section_mapping(unsigned long start, unsigned long end)
-{
- return htab_bolt_mapping(start, end, __pa(start),
- pgprot_val(PAGE_KERNEL), mmu_linear_psize,
- mmu_kernel_ssize);
-}
-
-int remove_section_mapping(unsigned long start, unsigned long end)
-{
- return htab_remove_mapping(start, end, mmu_linear_psize,
- mmu_kernel_ssize);
-}
-#endif /* CONFIG_MEMORY_HOTPLUG */
-
-#define FUNCTION_TEXT(A) ((*(unsigned long *)(A)))
-
-static void __init htab_finish_init(void)
-{
- extern unsigned int *htab_call_hpte_insert1;
- extern unsigned int *htab_call_hpte_insert2;
- extern unsigned int *htab_call_hpte_remove;
- extern unsigned int *htab_call_hpte_updatepp;
-
-#ifdef CONFIG_PPC_HAS_HASH_64K
- extern unsigned int *ht64_call_hpte_insert1;
- extern unsigned int *ht64_call_hpte_insert2;
- extern unsigned int *ht64_call_hpte_remove;
- extern unsigned int *ht64_call_hpte_updatepp;
-
- patch_branch(ht64_call_hpte_insert1,
- FUNCTION_TEXT(ppc_md.hpte_insert),
- BRANCH_SET_LINK);
- patch_branch(ht64_call_hpte_insert2,
- FUNCTION_TEXT(ppc_md.hpte_insert),
- BRANCH_SET_LINK);
- patch_branch(ht64_call_hpte_remove,
- FUNCTION_TEXT(ppc_md.hpte_remove),
- BRANCH_SET_LINK);
- patch_branch(ht64_call_hpte_updatepp,
- FUNCTION_TEXT(ppc_md.hpte_updatepp),
- BRANCH_SET_LINK);
-
-#endif /* CONFIG_PPC_HAS_HASH_64K */
-
- patch_branch(htab_call_hpte_insert1,
- FUNCTION_TEXT(ppc_md.hpte_insert),
- BRANCH_SET_LINK);
- patch_branch(htab_call_hpte_insert2,
- FUNCTION_TEXT(ppc_md.hpte_insert),
- BRANCH_SET_LINK);
- patch_branch(htab_call_hpte_remove,
- FUNCTION_TEXT(ppc_md.hpte_remove),
- BRANCH_SET_LINK);
- patch_branch(htab_call_hpte_updatepp,
- FUNCTION_TEXT(ppc_md.hpte_updatepp),
- BRANCH_SET_LINK);
-}
-
-static void __init htab_initialize(void)
-{
- unsigned long table;
- unsigned long pteg_count;
- unsigned long prot;
- unsigned long base = 0, size = 0, limit;
- struct memblock_region *reg;
-
- DBG(" -> htab_initialize()\n");
-
- /* Initialize segment sizes */
- htab_init_seg_sizes();
-
- /* Initialize page sizes */
- htab_init_page_sizes();
-
- if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) {
- mmu_kernel_ssize = MMU_SEGSIZE_1T;
- mmu_highuser_ssize = MMU_SEGSIZE_1T;
- printk(KERN_INFO "Using 1TB segments\n");
- }
-
- /*
- * Calculate the required size of the htab. We want the number of
- * PTEGs to equal one half the number of real pages.
- */
- htab_size_bytes = htab_get_table_size();
- pteg_count = htab_size_bytes >> 7;
-
- htab_hash_mask = pteg_count - 1;
-
- if (firmware_has_feature(FW_FEATURE_LPAR)) {
- /* Using a hypervisor which owns the htab */
- htab_address = NULL;
- _SDR1 = 0;
-#ifdef CONFIG_FA_DUMP
- /*
- * If firmware assisted dump is active firmware preserves
- * the contents of htab along with entire partition memory.
- * Clear the htab if firmware assisted dump is active so
- * that we dont end up using old mappings.
- */
- if (is_fadump_active() && ppc_md.hpte_clear_all)
- ppc_md.hpte_clear_all();
-#endif
- } else {
- /* Find storage for the HPT. Must be contiguous in
- * the absolute address space. On cell we want it to be
- * in the first 2 Gig so we can use it for IOMMU hacks.
- */
- if (machine_is(cell))
- limit = 0x80000000;
- else
- limit = MEMBLOCK_ALLOC_ANYWHERE;
-
- table = memblock_alloc_base(htab_size_bytes, htab_size_bytes, limit);
-
- DBG("Hash table allocated at %lx, size: %lx\n", table,
- htab_size_bytes);
-
- htab_address = __va(table);
-
- /* htab absolute addr + encoded htabsize */
- _SDR1 = table + __ilog2(pteg_count) - 11;
-
- /* Initialize the HPT with no entries */
- memset((void *)table, 0, htab_size_bytes);
-
- /* Set SDR1 */
- mtspr(SPRN_SDR1, _SDR1);
- }
-
- prot = pgprot_val(PAGE_KERNEL);
-
-#ifdef CONFIG_DEBUG_PAGEALLOC
- linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT;
- linear_map_hash_slots = __va(memblock_alloc_base(linear_map_hash_count,
- 1, ppc64_rma_size));
- memset(linear_map_hash_slots, 0, linear_map_hash_count);
-#endif /* CONFIG_DEBUG_PAGEALLOC */
-
- /* On U3 based machines, we need to reserve the DART area and
- * _NOT_ map it to avoid cache paradoxes as it's remapped non
- * cacheable later on
- */
-
- /* create bolted the linear mapping in the hash table */
- for_each_memblock(memory, reg) {
- base = (unsigned long)__va(reg->base);
- size = reg->size;
-
- DBG("creating mapping for region: %lx..%lx (prot: %lx)\n",
- base, size, prot);
-
-#ifdef CONFIG_U3_DART
- /* Do not map the DART space. Fortunately, it will be aligned
- * in such a way that it will not cross two memblock regions and
- * will fit within a single 16Mb page.
- * The DART space is assumed to be a full 16Mb region even if
- * we only use 2Mb of that space. We will use more of it later
- * for AGP GART. We have to use a full 16Mb large page.
- */
- DBG("DART base: %lx\n", dart_tablebase);
-
- if (dart_tablebase != 0 && dart_tablebase >= base
- && dart_tablebase < (base + size)) {
- unsigned long dart_table_end = dart_tablebase + 16 * MB;
- if (base != dart_tablebase)
- BUG_ON(htab_bolt_mapping(base, dart_tablebase,
- __pa(base), prot,
- mmu_linear_psize,
- mmu_kernel_ssize));
- if ((base + size) > dart_table_end)
- BUG_ON(htab_bolt_mapping(dart_tablebase+16*MB,
- base + size,
- __pa(dart_table_end),
- prot,
- mmu_linear_psize,
- mmu_kernel_ssize));
- continue;
- }
-#endif /* CONFIG_U3_DART */
- BUG_ON(htab_bolt_mapping(base, base + size, __pa(base),
- prot, mmu_linear_psize, mmu_kernel_ssize));
- }
- memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
-
- /*
- * If we have a memory_limit and we've allocated TCEs then we need to
- * explicitly map the TCE area at the top of RAM. We also cope with the
- * case that the TCEs start below memory_limit.
- * tce_alloc_start/end are 16MB aligned so the mapping should work
- * for either 4K or 16MB pages.
- */
- if (tce_alloc_start) {
- tce_alloc_start = (unsigned long)__va(tce_alloc_start);
- tce_alloc_end = (unsigned long)__va(tce_alloc_end);
-
- if (base + size >= tce_alloc_start)
- tce_alloc_start = base + size + 1;
-
- BUG_ON(htab_bolt_mapping(tce_alloc_start, tce_alloc_end,
- __pa(tce_alloc_start), prot,
- mmu_linear_psize, mmu_kernel_ssize));
- }
-
- htab_finish_init();
-
- DBG(" <- htab_initialize()\n");
-}
-#undef KB
-#undef MB
-
-void __init early_init_mmu(void)
-{
- /* Setup initial STAB address in the PACA */
- get_paca()->stab_real = __pa((u64)&initial_stab);
- get_paca()->stab_addr = (u64)&initial_stab;
-
- /* Initialize the MMU Hash table and create the linear mapping
- * of memory. Has to be done before stab/slb initialization as
- * this is currently where the page size encoding is obtained
- */
- htab_initialize();
-
- /* Initialize stab / SLB management */
- if (mmu_has_feature(MMU_FTR_SLB))
- slb_initialize();
- else
- stab_initialize(get_paca()->stab_real);
-}
-
-#ifdef CONFIG_SMP
-void early_init_mmu_secondary(void)
-{
- /* Initialize hash table for that CPU */
- if (!firmware_has_feature(FW_FEATURE_LPAR))
- mtspr(SPRN_SDR1, _SDR1);
-
- /* Initialize STAB/SLB. We use a virtual address as it works
- * in real mode on pSeries.
- */
- if (mmu_has_feature(MMU_FTR_SLB))
- slb_initialize();
- else
- stab_initialize(get_paca()->stab_addr);
-}
-#endif /* CONFIG_SMP */
-
-/*
- * Called by asm hashtable.S for doing lazy icache flush
- */
-unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
-{
- struct page *page;
-
- if (!pfn_valid(pte_pfn(pte)))
- return pp;
-
- page = pte_page(pte);
-
- /* page is dirty */
- if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
- if (trap == 0x400) {
- flush_dcache_icache_page(page);
- set_bit(PG_arch_1, &page->flags);
- } else
- pp |= HPTE_R_N;
- }
- return pp;
-}
-
-#ifdef CONFIG_PPC_MM_SLICES
-unsigned int get_paca_psize(unsigned long addr)
-{
- u64 lpsizes;
- unsigned char *hpsizes;
- unsigned long index, mask_index;
-
- if (addr < SLICE_LOW_TOP) {
- lpsizes = get_paca()->context.low_slices_psize;
- index = GET_LOW_SLICE_INDEX(addr);
- return (lpsizes >> (index * 4)) & 0xF;
- }
- hpsizes = get_paca()->context.high_slices_psize;
- index = GET_HIGH_SLICE_INDEX(addr);
- mask_index = index & 0x1;
- return (hpsizes[index >> 1] >> (mask_index * 4)) & 0xF;
-}
-
-#else
-unsigned int get_paca_psize(unsigned long addr)
-{
- return get_paca()->context.user_psize;
-}
-#endif
-
-/*
- * Demote a segment to using 4k pages.
- * For now this makes the whole process use 4k pages.
- */
-#ifdef CONFIG_PPC_64K_PAGES
-void demote_segment_4k(struct mm_struct *mm, unsigned long addr)
-{
- if (get_slice_psize(mm, addr) == MMU_PAGE_4K)
- return;
- slice_set_range_psize(mm, addr, 1, MMU_PAGE_4K);
-#ifdef CONFIG_SPU_BASE
- spu_flush_all_slbs(mm);
-#endif
- if (get_paca_psize(addr) != MMU_PAGE_4K) {
- get_paca()->context = mm->context;
- slb_flush_and_rebolt();
- }
-}
-#endif /* CONFIG_PPC_64K_PAGES */
-
-#ifdef CONFIG_PPC_SUBPAGE_PROT
-/*
- * This looks up a 2-bit protection code for a 4k subpage of a 64k page.
- * Userspace sets the subpage permissions using the subpage_prot system call.
- *
- * Result is 0: full permissions, _PAGE_RW: read-only,
- * _PAGE_USER or _PAGE_USER|_PAGE_RW: no access.
- */
-static int subpage_protection(struct mm_struct *mm, unsigned long ea)
-{
- struct subpage_prot_table *spt = &mm->context.spt;
- u32 spp = 0;
- u32 **sbpm, *sbpp;
-
- if (ea >= spt->maxaddr)
- return 0;
- if (ea < 0x100000000) {
- /* addresses below 4GB use spt->low_prot */
- sbpm = spt->low_prot;
- } else {
- sbpm = spt->protptrs[ea >> SBP_L3_SHIFT];
- if (!sbpm)
- return 0;
- }
- sbpp = sbpm[(ea >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)];
- if (!sbpp)
- return 0;
- spp = sbpp[(ea >> PAGE_SHIFT) & (SBP_L1_COUNT - 1)];
-
- /* extract 2-bit bitfield for this 4k subpage */
- spp >>= 30 - 2 * ((ea >> 12) & 0xf);
-
- /* turn 0,1,2,3 into combination of _PAGE_USER and _PAGE_RW */
- spp = ((spp & 2) ? _PAGE_USER : 0) | ((spp & 1) ? _PAGE_RW : 0);
- return spp;
-}
-
-#else /* CONFIG_PPC_SUBPAGE_PROT */
-static inline int subpage_protection(struct mm_struct *mm, unsigned long ea)
-{
- return 0;
-}
-#endif
-
-void hash_failure_debug(unsigned long ea, unsigned long access,
- unsigned long vsid, unsigned long trap,
- int ssize, int psize, int lpsize, unsigned long pte)
-{
- if (!printk_ratelimit())
- return;
- pr_info("mm: Hashing failure ! EA=0x%lx access=0x%lx current=%s\n",
- ea, access, current->comm);
- pr_info(" trap=0x%lx vsid=0x%lx ssize=%d base psize=%d psize %d pte=0x%lx\n",
- trap, vsid, ssize, psize, lpsize, pte);
-}
-
-/* Result code is:
- * 0 - handled
- * 1 - normal page fault
- * -1 - critical hash insertion error
- * -2 - access not permitted by subpage protection mechanism
- */
-int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
-{
- enum ctx_state prev_state = exception_enter();
- pgd_t *pgdir;
- unsigned long vsid;
- struct mm_struct *mm;
- pte_t *ptep;
- unsigned hugeshift;
- const struct cpumask *tmp;
- int rc, user_region = 0, local = 0;
- int psize, ssize;
-
- DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n",
- ea, access, trap);
-
- /* Get region & vsid */
- switch (REGION_ID(ea)) {
- case USER_REGION_ID:
- user_region = 1;
- mm = current->mm;
- if (! mm) {
- DBG_LOW(" user region with no mm !\n");
- rc = 1;
- goto bail;
- }
- psize = get_slice_psize(mm, ea);
- ssize = user_segment_size(ea);
- vsid = get_vsid(mm->context.id, ea, ssize);
- break;
- case VMALLOC_REGION_ID:
- mm = &init_mm;
- vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
- if (ea < VMALLOC_END)
- psize = mmu_vmalloc_psize;
- else
- psize = mmu_io_psize;
- ssize = mmu_kernel_ssize;
- break;
- default:
- /* Not a valid range
- * Send the problem up to do_page_fault
- */
- rc = 1;
- goto bail;
- }
- DBG_LOW(" mm=%p, mm->pgdir=%p, vsid=%016lx\n", mm, mm->pgd, vsid);
-
- /* Bad address. */
- if (!vsid) {
- DBG_LOW("Bad address!\n");
- rc = 1;
- goto bail;
- }
- /* Get pgdir */
- pgdir = mm->pgd;
- if (pgdir == NULL) {
- rc = 1;
- goto bail;
- }
-
- /* Check CPU locality */
- tmp = cpumask_of(smp_processor_id());
- if (user_region && cpumask_equal(mm_cpumask(mm), tmp))
- local = 1;
-
-#ifndef CONFIG_PPC_64K_PAGES
- /* If we use 4K pages and our psize is not 4K, then we might
- * be hitting a special driver mapping, and need to align the
- * address before we fetch the PTE.
- *
- * It could also be a hugepage mapping, in which case this is
- * not necessary, but it's not harmful, either.
- */
- if (psize != MMU_PAGE_4K)
- ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
-#endif /* CONFIG_PPC_64K_PAGES */
-
- /* Get PTE and page size from page tables */
- ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugeshift);
- if (ptep == NULL || !pte_present(*ptep)) {
- DBG_LOW(" no PTE !\n");
- rc = 1;
- goto bail;
- }
-
- /* Add _PAGE_PRESENT to the required access perm */
- access |= _PAGE_PRESENT;
-
- /* Pre-check access permissions (will be re-checked atomically
- * in __hash_page_XX but this pre-check is a fast path
- */
- if (access & ~pte_val(*ptep)) {
- DBG_LOW(" no access !\n");
- rc = 1;
- goto bail;
- }
-
- if (hugeshift) {
- if (pmd_trans_huge(*(pmd_t *)ptep))
- rc = __hash_page_thp(ea, access, vsid, (pmd_t *)ptep,
- trap, local, ssize, psize);
-#ifdef CONFIG_HUGETLB_PAGE
- else
- rc = __hash_page_huge(ea, access, vsid, ptep, trap,
- local, ssize, hugeshift, psize);
-#else
- else {
- /*
- * if we have hugeshift, and is not transhuge with
- * hugetlb disabled, something is really wrong.
- */
- rc = 1;
- WARN_ON(1);
- }
-#endif
- goto bail;
- }
-
-#ifndef CONFIG_PPC_64K_PAGES
- DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep));
-#else
- DBG_LOW(" i-pte: %016lx %016lx\n", pte_val(*ptep),
- pte_val(*(ptep + PTRS_PER_PTE)));
-#endif
- /* Do actual hashing */
-#ifdef CONFIG_PPC_64K_PAGES
- /* If _PAGE_4K_PFN is set, make sure this is a 4k segment */
- if ((pte_val(*ptep) & _PAGE_4K_PFN) && psize == MMU_PAGE_64K) {
- demote_segment_4k(mm, ea);
- psize = MMU_PAGE_4K;
- }
-
- /* If this PTE is non-cacheable and we have restrictions on
- * using non cacheable large pages, then we switch to 4k
- */
- if (mmu_ci_restrictions && psize == MMU_PAGE_64K &&
- (pte_val(*ptep) & _PAGE_NO_CACHE)) {
- if (user_region) {
- demote_segment_4k(mm, ea);
- psize = MMU_PAGE_4K;
- } else if (ea < VMALLOC_END) {
- /*
- * some driver did a non-cacheable mapping
- * in vmalloc space, so switch vmalloc
- * to 4k pages
- */
- printk(KERN_ALERT "Reducing vmalloc segment "
- "to 4kB pages because of "
- "non-cacheable mapping\n");
- psize = mmu_vmalloc_psize = MMU_PAGE_4K;
-#ifdef CONFIG_SPU_BASE
- spu_flush_all_slbs(mm);
-#endif
- }
- }
- if (user_region) {
- if (psize != get_paca_psize(ea)) {
- get_paca()->context = mm->context;
- slb_flush_and_rebolt();
- }
- } else if (get_paca()->vmalloc_sllp !=
- mmu_psize_defs[mmu_vmalloc_psize].sllp) {
- get_paca()->vmalloc_sllp =
- mmu_psize_defs[mmu_vmalloc_psize].sllp;
- slb_vmalloc_update();
- }
-#endif /* CONFIG_PPC_64K_PAGES */
-
-#ifdef CONFIG_PPC_HAS_HASH_64K
- if (psize == MMU_PAGE_64K)
- rc = __hash_page_64K(ea, access, vsid, ptep, trap, local, ssize);
- else
-#endif /* CONFIG_PPC_HAS_HASH_64K */
- {
- int spp = subpage_protection(mm, ea);
- if (access & spp)
- rc = -2;
- else
- rc = __hash_page_4K(ea, access, vsid, ptep, trap,
- local, ssize, spp);
- }
-
- /* Dump some info in case of hash insertion failure, they should
- * never happen so it is really useful to know if/when they do
- */
- if (rc == -1)
- hash_failure_debug(ea, access, vsid, trap, ssize, psize,
- psize, pte_val(*ptep));
-#ifndef CONFIG_PPC_64K_PAGES
- DBG_LOW(" o-pte: %016lx\n", pte_val(*ptep));
-#else
- DBG_LOW(" o-pte: %016lx %016lx\n", pte_val(*ptep),
- pte_val(*(ptep + PTRS_PER_PTE)));
-#endif
- DBG_LOW(" -> rc=%d\n", rc);
-
-bail:
- exception_exit(prev_state);
- return rc;
-}
-EXPORT_SYMBOL_GPL(hash_page);
-
-void hash_preload(struct mm_struct *mm, unsigned long ea,
- unsigned long access, unsigned long trap)
-{
- int hugepage_shift;
- unsigned long vsid;
- pgd_t *pgdir;
- pte_t *ptep;
- unsigned long flags;
- int rc, ssize, local = 0;
-
- BUG_ON(REGION_ID(ea) != USER_REGION_ID);
-
-#ifdef CONFIG_PPC_MM_SLICES
- /* We only prefault standard pages for now */
- if (unlikely(get_slice_psize(mm, ea) != mm->context.user_psize))
- return;
-#endif
-
- DBG_LOW("hash_preload(mm=%p, mm->pgdir=%p, ea=%016lx, access=%lx,"
- " trap=%lx\n", mm, mm->pgd, ea, access, trap);
-
- /* Get Linux PTE if available */
- pgdir = mm->pgd;
- if (pgdir == NULL)
- return;
-
- /* Get VSID */
- ssize = user_segment_size(ea);
- vsid = get_vsid(mm->context.id, ea, ssize);
- if (!vsid)
- return;
- /*
- * Hash doesn't like irqs. Walking linux page table with irq disabled
- * saves us from holding multiple locks.
- */
- local_irq_save(flags);
-
- /*
- * THP pages use update_mmu_cache_pmd. We don't do
- * hash preload there. Hence can ignore THP here
- */
- ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugepage_shift);
- if (!ptep)
- goto out_exit;
-
- WARN_ON(hugepage_shift);
-#ifdef CONFIG_PPC_64K_PAGES
- /* If either _PAGE_4K_PFN or _PAGE_NO_CACHE is set (and we are on
- * a 64K kernel), then we don't preload, hash_page() will take
- * care of it once we actually try to access the page.
- * That way we don't have to duplicate all of the logic for segment
- * page size demotion here
- */
- if (pte_val(*ptep) & (_PAGE_4K_PFN | _PAGE_NO_CACHE))
- goto out_exit;
-#endif /* CONFIG_PPC_64K_PAGES */
-
- /* Is that local to this CPU ? */
- if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
- local = 1;
-
- /* Hash it in */
-#ifdef CONFIG_PPC_HAS_HASH_64K
- if (mm->context.user_psize == MMU_PAGE_64K)
- rc = __hash_page_64K(ea, access, vsid, ptep, trap, local, ssize);
- else
-#endif /* CONFIG_PPC_HAS_HASH_64K */
- rc = __hash_page_4K(ea, access, vsid, ptep, trap, local, ssize,
- subpage_protection(mm, ea));
-
- /* Dump some info in case of hash insertion failure, they should
- * never happen so it is really useful to know if/when they do
- */
- if (rc == -1)
- hash_failure_debug(ea, access, vsid, trap, ssize,
- mm->context.user_psize,
- mm->context.user_psize,
- pte_val(*ptep));
-out_exit:
- local_irq_restore(flags);
-}
-
-/* WARNING: This is called from hash_low_64.S, if you change this prototype,
- * do not forget to update the assembly call site !
- */
-void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize,
- int local)
-{
- unsigned long hash, index, shift, hidx, slot;
-
- DBG_LOW("flush_hash_page(vpn=%016lx)\n", vpn);
- pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
- hash = hpt_hash(vpn, shift, ssize);
- hidx = __rpte_to_hidx(pte, index);
- if (hidx & _PTEIDX_SECONDARY)
- hash = ~hash;
- slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
- slot += hidx & _PTEIDX_GROUP_IX;
- DBG_LOW(" sub %ld: hash=%lx, hidx=%lx\n", index, slot, hidx);
- /*
- * We use same base page size and actual psize, because we don't
- * use these functions for hugepage
- */
- ppc_md.hpte_invalidate(slot, vpn, psize, psize, ssize, local);
- } pte_iterate_hashed_end();
-
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
- /* Transactions are not aborted by tlbiel, only tlbie.
- * Without, syncing a page back to a block device w/ PIO could pick up
- * transactional data (bad!) so we force an abort here. Before the
- * sync the page will be made read-only, which will flush_hash_page.
- * BIG ISSUE here: if the kernel uses a page from userspace without
- * unmapping it first, it may see the speculated version.
- */
- if (local && cpu_has_feature(CPU_FTR_TM) &&
- current->thread.regs &&
- MSR_TM_ACTIVE(current->thread.regs->msr)) {
- tm_enable();
- tm_abort(TM_CAUSE_TLBI);
- }
-#endif
-}
-
-void flush_hash_range(unsigned long number, int local)
-{
- if (ppc_md.flush_hash_range)
- ppc_md.flush_hash_range(number, local);
- else {
- int i;
- struct ppc64_tlb_batch *batch =
- &__get_cpu_var(ppc64_tlb_batch);
-
- for (i = 0; i < number; i++)
- flush_hash_page(batch->vpn[i], batch->pte[i],
- batch->psize, batch->ssize, local);
- }
-}
-
-/*
- * low_hash_fault is called when we the low level hash code failed
- * to instert a PTE due to an hypervisor error
- */
-void low_hash_fault(struct pt_regs *regs, unsigned long address, int rc)
-{
- enum ctx_state prev_state = exception_enter();
-
- if (user_mode(regs)) {
-#ifdef CONFIG_PPC_SUBPAGE_PROT
- if (rc == -2)
- _exception(SIGSEGV, regs, SEGV_ACCERR, address);
- else
-#endif
- _exception(SIGBUS, regs, BUS_ADRERR, address);
- } else
- bad_page_fault(regs, address, SIGBUS);
-
- exception_exit(prev_state);
-}
-
-long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
- unsigned long pa, unsigned long rflags,
- unsigned long vflags, int psize, int ssize)
-{
- unsigned long hpte_group;
- long slot;
-
-repeat:
- hpte_group = ((hash & htab_hash_mask) *
- HPTES_PER_GROUP) & ~0x7UL;
-
- /* Insert into the hash table, primary slot */
- slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, vflags,
- psize, psize, ssize);
-
- /* Primary is full, try the secondary */
- if (unlikely(slot == -1)) {
- hpte_group = ((~hash & htab_hash_mask) *
- HPTES_PER_GROUP) & ~0x7UL;
- slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags,
- vflags | HPTE_V_SECONDARY,
- psize, psize, ssize);
- if (slot == -1) {
- if (mftb() & 0x1)
- hpte_group = ((hash & htab_hash_mask) *
- HPTES_PER_GROUP)&~0x7UL;
-
- ppc_md.hpte_remove(hpte_group);
- goto repeat;
- }
- }
-
- return slot;
-}
-
-#ifdef CONFIG_DEBUG_PAGEALLOC
-static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
-{
- unsigned long hash;
- unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
- unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
- unsigned long mode = htab_convert_pte_flags(PAGE_KERNEL);
- long ret;
-
- hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
-
- /* Don't create HPTE entries for bad address */
- if (!vsid)
- return;
-
- ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode,
- HPTE_V_BOLTED,
- mmu_linear_psize, mmu_kernel_ssize);
-
- BUG_ON (ret < 0);
- spin_lock(&linear_map_hash_lock);
- BUG_ON(linear_map_hash_slots[lmi] & 0x80);
- linear_map_hash_slots[lmi] = ret | 0x80;
- spin_unlock(&linear_map_hash_lock);
-}
-
-static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
-{
- unsigned long hash, hidx, slot;
- unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
- unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
-
- hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
- spin_lock(&linear_map_hash_lock);
- BUG_ON(!(linear_map_hash_slots[lmi] & 0x80));
- hidx = linear_map_hash_slots[lmi] & 0x7f;
- linear_map_hash_slots[lmi] = 0;
- spin_unlock(&linear_map_hash_lock);
- if (hidx & _PTEIDX_SECONDARY)
- hash = ~hash;
- slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
- slot += hidx & _PTEIDX_GROUP_IX;
- ppc_md.hpte_invalidate(slot, vpn, mmu_linear_psize, mmu_linear_psize,
- mmu_kernel_ssize, 0);
-}
-
-void kernel_map_pages(struct page *page, int numpages, int enable)
-{
- unsigned long flags, vaddr, lmi;
- int i;
-
- local_irq_save(flags);
- for (i = 0; i < numpages; i++, page++) {
- vaddr = (unsigned long)page_address(page);
- lmi = __pa(vaddr) >> PAGE_SHIFT;
- if (lmi >= linear_map_hash_count)
- continue;
- if (enable)
- kernel_map_linear_page(vaddr, lmi);
- else
- kernel_unmap_linear_page(vaddr, lmi);
- }
- local_irq_restore(flags);
-}
-#endif /* CONFIG_DEBUG_PAGEALLOC */
-
-void setup_initial_memory_limit(phys_addr_t first_memblock_base,
- phys_addr_t first_memblock_size)
-{
- /* We don't currently support the first MEMBLOCK not mapping 0
- * physical on those processors
- */
- BUG_ON(first_memblock_base != 0);
-
- /* On LPAR systems, the first entry is our RMA region,
- * non-LPAR 64-bit hash MMU systems don't have a limitation
- * on real mode access, but using the first entry works well
- * enough. We also clamp it to 1G to avoid some funky things
- * such as RTAS bugs etc...
- */
- ppc64_rma_size = min_t(u64, first_memblock_size, 0x40000000);
-
- /* Finally limit subsequent allocations */
- memblock_set_current_limit(ppc64_rma_size);
-}
diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c
deleted file mode 100644
index e7450bdbe83a..000000000000
--- a/arch/powerpc/mm/highmem.c
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * highmem.c: virtual kernel memory mappings for high memory
- *
- * PowerPC version, stolen from the i386 version.
- *
- * Used in CONFIG_HIGHMEM systems for memory pages which
- * are not addressable by direct kernel virtual addresses.
- *
- * Copyright (C) 1999 Gerhard Wichert, Siemens AG
- * Gerhard.Wichert@pdb.siemens.de
- *
- *
- * Redesigned the x86 32-bit VM architecture to deal with
- * up to 16 Terrabyte physical memory. With current x86 CPUs
- * we now support up to 64 Gigabytes physical RAM.
- *
- * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
- *
- * Reworked for PowerPC by various contributors. Moved from
- * highmem.h by Benjamin Herrenschmidt (c) 2009 IBM Corp.
- */
-
-#include <linux/highmem.h>
-#include <linux/module.h>
-
-/*
- * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap
- * gives a more generic (and caching) interface. But kmap_atomic can
- * be used in IRQ contexts, so in some (very limited) cases we need
- * it.
- */
-void *kmap_atomic_prot(struct page *page, pgprot_t prot)
-{
- unsigned long vaddr;
- int idx, type;
-
- /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
- pagefault_disable();
- if (!PageHighMem(page))
- return page_address(page);
-
- type = kmap_atomic_idx_push();
- idx = type + KM_TYPE_NR*smp_processor_id();
- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
-#ifdef CONFIG_DEBUG_HIGHMEM
- BUG_ON(!pte_none(*(kmap_pte-idx)));
-#endif
- __set_pte_at(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot), 1);
- local_flush_tlb_page(NULL, vaddr);
-
- return (void*) vaddr;
-}
-EXPORT_SYMBOL(kmap_atomic_prot);
-
-void __kunmap_atomic(void *kvaddr)
-{
- unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
- int type;
-
- if (vaddr < __fix_to_virt(FIX_KMAP_END)) {
- pagefault_enable();
- return;
- }
-
- type = kmap_atomic_idx();
-
-#ifdef CONFIG_DEBUG_HIGHMEM
- {
- unsigned int idx;
-
- idx = type + KM_TYPE_NR * smp_processor_id();
- BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx));
-
- /*
- * force other mappings to Oops if they'll try to access
- * this pte without first remap it
- */
- pte_clear(&init_mm, vaddr, kmap_pte-idx);
- local_flush_tlb_page(NULL, vaddr);
- }
-#endif
-
- kmap_atomic_idx_pop();
- pagefault_enable();
-}
-EXPORT_SYMBOL(__kunmap_atomic);
diff --git a/arch/powerpc/mm/hugetlbpage-book3e.c b/arch/powerpc/mm/hugetlbpage-book3e.c
deleted file mode 100644
index 3bc700655fc8..000000000000
--- a/arch/powerpc/mm/hugetlbpage-book3e.c
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * PPC Huge TLB Page Support for Book3E MMU
- *
- * Copyright (C) 2009 David Gibson, IBM Corporation.
- * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
- *
- */
-#include <linux/mm.h>
-#include <linux/hugetlb.h>
-
-static inline int mmu_get_tsize(int psize)
-{
- return mmu_psize_defs[psize].enc;
-}
-
-static inline int book3e_tlb_exists(unsigned long ea, unsigned long pid)
-{
- int found = 0;
-
- mtspr(SPRN_MAS6, pid << 16);
- if (mmu_has_feature(MMU_FTR_USE_TLBRSRV)) {
- asm volatile(
- "li %0,0\n"
- "tlbsx. 0,%1\n"
- "bne 1f\n"
- "li %0,1\n"
- "1:\n"
- : "=&r"(found) : "r"(ea));
- } else {
- asm volatile(
- "tlbsx 0,%1\n"
- "mfspr %0,0x271\n"
- "srwi %0,%0,31\n"
- : "=&r"(found) : "r"(ea));
- }
-
- return found;
-}
-
-void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea,
- pte_t pte)
-{
- unsigned long mas1, mas2;
- u64 mas7_3;
- unsigned long psize, tsize, shift;
- unsigned long flags;
- struct mm_struct *mm;
-
-#ifdef CONFIG_PPC_FSL_BOOK3E
- int index, ncams;
-#endif
-
- if (unlikely(is_kernel_addr(ea)))
- return;
-
- mm = vma->vm_mm;
-
-#ifdef CONFIG_PPC_MM_SLICES
- psize = get_slice_psize(mm, ea);
- tsize = mmu_get_tsize(psize);
- shift = mmu_psize_defs[psize].shift;
-#else
- psize = vma_mmu_pagesize(vma);
- shift = __ilog2(psize);
- tsize = shift - 10;
-#endif
-
- /*
- * We can't be interrupted while we're setting up the MAS
- * regusters or after we've confirmed that no tlb exists.
- */
- local_irq_save(flags);
-
- if (unlikely(book3e_tlb_exists(ea, mm->context.id))) {
- local_irq_restore(flags);
- return;
- }
-
-#ifdef CONFIG_PPC_FSL_BOOK3E
- ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY;
-
- /* We have to use the CAM(TLB1) on FSL parts for hugepages */
- index = __get_cpu_var(next_tlbcam_idx);
- mtspr(SPRN_MAS0, MAS0_ESEL(index) | MAS0_TLBSEL(1));
-
- /* Just round-robin the entries and wrap when we hit the end */
- if (unlikely(index == ncams - 1))
- __get_cpu_var(next_tlbcam_idx) = tlbcam_index;
- else
- __get_cpu_var(next_tlbcam_idx)++;
-#endif
- mas1 = MAS1_VALID | MAS1_TID(mm->context.id) | MAS1_TSIZE(tsize);
- mas2 = ea & ~((1UL << shift) - 1);
- mas2 |= (pte_val(pte) >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK;
- mas7_3 = (u64)pte_pfn(pte) << PAGE_SHIFT;
- mas7_3 |= (pte_val(pte) >> PTE_BAP_SHIFT) & MAS3_BAP_MASK;
- if (!pte_dirty(pte))
- mas7_3 &= ~(MAS3_SW|MAS3_UW);
-
- mtspr(SPRN_MAS1, mas1);
- mtspr(SPRN_MAS2, mas2);
-
- if (mmu_has_feature(MMU_FTR_USE_PAIRED_MAS)) {
- mtspr(SPRN_MAS7_MAS3, mas7_3);
- } else {
- mtspr(SPRN_MAS7, upper_32_bits(mas7_3));
- mtspr(SPRN_MAS3, lower_32_bits(mas7_3));
- }
-
- asm volatile ("tlbwe");
-
- local_irq_restore(flags);
-}
-
-void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
-{
- struct hstate *hstate = hstate_file(vma->vm_file);
- unsigned long tsize = huge_page_shift(hstate) - 10;
-
- __flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, tsize, 0);
-
-}
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
deleted file mode 100644
index 0b7fb6761015..000000000000
--- a/arch/powerpc/mm/hugetlbpage-hash64.c
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * PPC64 Huge TLB Page Support for hash based MMUs (POWER4 and later)
- *
- * Copyright (C) 2003 David Gibson, IBM Corporation.
- *
- * Based on the IA-32 version:
- * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
- */
-
-#include <linux/mm.h>
-#include <linux/hugetlb.h>
-#include <asm/pgtable.h>
-#include <asm/pgalloc.h>
-#include <asm/cacheflush.h>
-#include <asm/machdep.h>
-
-extern long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
- unsigned long pa, unsigned long rlags,
- unsigned long vflags, int psize, int ssize);
-
-int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
- pte_t *ptep, unsigned long trap, int local, int ssize,
- unsigned int shift, unsigned int mmu_psize)
-{
- unsigned long vpn;
- unsigned long old_pte, new_pte;
- unsigned long rflags, pa, sz;
- long slot;
-
- BUG_ON(shift != mmu_psize_defs[mmu_psize].shift);
-
- /* Search the Linux page table for a match with va */
- vpn = hpt_vpn(ea, vsid, ssize);
-
- /* At this point, we have a pte (old_pte) which can be used to build
- * or update an HPTE. There are 2 cases:
- *
- * 1. There is a valid (present) pte with no associated HPTE (this is
- * the most common case)
- * 2. There is a valid (present) pte with an associated HPTE. The
- * current values of the pp bits in the HPTE prevent access
- * because we are doing software DIRTY bit management and the
- * page is currently not DIRTY.
- */
-
-
- do {
- old_pte = pte_val(*ptep);
- /* If PTE busy, retry the access */
- if (unlikely(old_pte & _PAGE_BUSY))
- return 0;
- /* If PTE permissions don't match, take page fault */
- if (unlikely(access & ~old_pte))
- return 1;
- /* Try to lock the PTE, add ACCESSED and DIRTY if it was
- * a write access */
- new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
- if (access & _PAGE_RW)
- new_pte |= _PAGE_DIRTY;
- } while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
- old_pte, new_pte));
-
- rflags = 0x2 | (!(new_pte & _PAGE_RW));
- /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
- rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
- sz = ((1UL) << shift);
- if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
- /* No CPU has hugepages but lacks no execute, so we
- * don't need to worry about that case */
- rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
-
- /* Check if pte already has an hpte (case 2) */
- if (unlikely(old_pte & _PAGE_HASHPTE)) {
- /* There MIGHT be an HPTE for this pte */
- unsigned long hash, slot;
-
- hash = hpt_hash(vpn, shift, ssize);
- if (old_pte & _PAGE_F_SECOND)
- hash = ~hash;
- slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
- slot += (old_pte & _PAGE_F_GIX) >> 12;
-
- if (ppc_md.hpte_updatepp(slot, rflags, vpn, mmu_psize,
- mmu_psize, ssize, local) == -1)
- old_pte &= ~_PAGE_HPTEFLAGS;
- }
-
- if (likely(!(old_pte & _PAGE_HASHPTE))) {
- unsigned long hash = hpt_hash(vpn, shift, ssize);
-
- pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
-
- /* clear HPTE slot informations in new PTE */
-#ifdef CONFIG_PPC_64K_PAGES
- new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0;
-#else
- new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
-#endif
- /* Add in WIMG bits */
- rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
- _PAGE_COHERENT | _PAGE_GUARDED));
-
- slot = hpte_insert_repeating(hash, vpn, pa, rflags, 0,
- mmu_psize, ssize);
-
- /*
- * Hypervisor failure. Restore old pte and return -1
- * similar to __hash_page_*
- */
- if (unlikely(slot == -2)) {
- *ptep = __pte(old_pte);
- hash_failure_debug(ea, access, vsid, trap, ssize,
- mmu_psize, mmu_psize, old_pte);
- return -1;
- }
-
- new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX);
- }
-
- /*
- * No need to use ldarx/stdcx here
- */
- *ptep = __pte(new_pte & ~_PAGE_BUSY);
- return 0;
-}
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 834ca8eb38f2..d3c1b749dcfc 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -15,393 +15,84 @@
#include <linux/export.h>
#include <linux/of_fdt.h>
#include <linux/memblock.h>
-#include <linux/bootmem.h>
#include <linux/moduleparam.h>
-#include <asm/pgtable.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/kmemleak.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/setup.h>
#include <asm/hugetlb.h>
+#include <asm/pte-walk.h>
+#include <asm/firmware.h>
-#ifdef CONFIG_HUGETLB_PAGE
+bool hugetlb_disabled = false;
-#define PAGE_SHIFT_64K 16
-#define PAGE_SHIFT_16M 24
-#define PAGE_SHIFT_16G 34
+#define PTE_T_ORDER (__builtin_ffs(sizeof(pte_basic_t)) - \
+ __builtin_ffs(sizeof(void *)))
-unsigned int HPAGE_SHIFT;
-
-/*
- * Tracks gpages after the device tree is scanned and before the
- * huge_boot_pages list is ready. On non-Freescale implementations, this is
- * just used to track 16G pages and so is a single array. FSL-based
- * implementations may have more than one gpage size, so we need multiple
- * arrays
- */
-#ifdef CONFIG_PPC_FSL_BOOK3E
-#define MAX_NUMBER_GPAGES 128
-struct psize_gpages {
- u64 gpage_list[MAX_NUMBER_GPAGES];
- unsigned int nr_gpages;
-};
-static struct psize_gpages gpage_freearray[MMU_PAGE_COUNT];
-#else
-#define MAX_NUMBER_GPAGES 1024
-static u64 gpage_freearray[MAX_NUMBER_GPAGES];
-static unsigned nr_gpages;
-#endif
-
-#define hugepd_none(hpd) ((hpd).pd == 0)
-
-#ifdef CONFIG_PPC_BOOK3S_64
-/*
- * At this point we do the placement change only for BOOK3S 64. This would
- * possibly work on other subarchs.
- */
-
-/*
- * We have PGD_INDEX_SIZ = 12 and PTE_INDEX_SIZE = 8, so that we can have
- * 16GB hugepage pte in PGD and 16MB hugepage pte at PMD;
- */
-int pmd_huge(pmd_t pmd)
-{
- /*
- * leaf pte for huge page, bottom two bits != 00
- */
- return ((pmd_val(pmd) & 0x3) != 0x0);
-}
-
-int pud_huge(pud_t pud)
-{
- /*
- * leaf pte for huge page, bottom two bits != 00
- */
- return ((pud_val(pud) & 0x3) != 0x0);
-}
-
-int pgd_huge(pgd_t pgd)
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
/*
- * leaf pte for huge page, bottom two bits != 00
+ * Only called for hugetlbfs pages, hence can ignore THP and the
+ * irq disabled walk.
*/
- return ((pgd_val(pgd) & 0x3) != 0x0);
-}
-#else
-int pmd_huge(pmd_t pmd)
-{
- return 0;
+ return __find_linux_pte(mm->pgd, addr, NULL, NULL);
}
-int pud_huge(pud_t pud)
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, unsigned long sz)
{
- return 0;
-}
-
-int pgd_huge(pgd_t pgd)
-{
- return 0;
-}
-#endif
-
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
-{
- /* Only called for hugetlbfs pages, hence can ignore THP */
- return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
-}
-
-static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
- unsigned long address, unsigned pdshift, unsigned pshift)
-{
- struct kmem_cache *cachep;
- pte_t *new;
-
-#ifdef CONFIG_PPC_FSL_BOOK3E
- int i;
- int num_hugepd = 1 << (pshift - pdshift);
- cachep = hugepte_cache;
-#else
- cachep = PGT_CACHE(pdshift - pshift);
-#endif
-
- new = kmem_cache_zalloc(cachep, GFP_KERNEL|__GFP_REPEAT);
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
- BUG_ON(pshift > HUGEPD_SHIFT_MASK);
- BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
+ addr &= ~(sz - 1);
- if (! new)
- return -ENOMEM;
+ p4d = p4d_offset(pgd_offset(mm, addr), addr);
+ if (!mm_pud_folded(mm) && sz >= P4D_SIZE)
+ return (pte_t *)p4d;
- spin_lock(&mm->page_table_lock);
-#ifdef CONFIG_PPC_FSL_BOOK3E
- /*
- * We have multiple higher-level entries that point to the same
- * actual pte location. Fill in each as we go and backtrack on error.
- * We need all of these so the DTLB pgtable walk code can find the
- * right higher-level entry without knowing if it's a hugepage or not.
- */
- for (i = 0; i < num_hugepd; i++, hpdp++) {
- if (unlikely(!hugepd_none(*hpdp)))
- break;
- else
- /* We use the old format for PPC_FSL_BOOK3E */
- hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
- }
- /* If we bailed from the for loop early, an error occurred, clean up */
- if (i < num_hugepd) {
- for (i = i - 1 ; i >= 0; i--, hpdp--)
- hpdp->pd = 0;
- kmem_cache_free(cachep, new);
- }
-#else
- if (!hugepd_none(*hpdp))
- kmem_cache_free(cachep, new);
- else {
-#ifdef CONFIG_PPC_BOOK3S_64
- hpdp->pd = (unsigned long)new |
- (shift_to_mmu_psize(pshift) << 2);
-#else
- hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
-#endif
- }
-#endif
- spin_unlock(&mm->page_table_lock);
- return 0;
-}
-
-/*
- * These macros define how to determine which level of the page table holds
- * the hpdp.
- */
-#ifdef CONFIG_PPC_FSL_BOOK3E
-#define HUGEPD_PGD_SHIFT PGDIR_SHIFT
-#define HUGEPD_PUD_SHIFT PUD_SHIFT
-#else
-#define HUGEPD_PGD_SHIFT PUD_SHIFT
-#define HUGEPD_PUD_SHIFT PMD_SHIFT
-#endif
-
-#ifdef CONFIG_PPC_BOOK3S_64
-/*
- * At this point we do the placement change only for BOOK3S 64. This would
- * possibly work on other subarchs.
- */
-pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
-{
- pgd_t *pg;
- pud_t *pu;
- pmd_t *pm;
- hugepd_t *hpdp = NULL;
- unsigned pshift = __ffs(sz);
- unsigned pdshift = PGDIR_SHIFT;
-
- addr &= ~(sz-1);
- pg = pgd_offset(mm, addr);
-
- if (pshift == PGDIR_SHIFT)
- /* 16GB huge page */
- return (pte_t *) pg;
- else if (pshift > PUD_SHIFT)
- /*
- * We need to use hugepd table
- */
- hpdp = (hugepd_t *)pg;
- else {
- pdshift = PUD_SHIFT;
- pu = pud_alloc(mm, pg, addr);
- if (pshift == PUD_SHIFT)
- return (pte_t *)pu;
- else if (pshift > PMD_SHIFT)
- hpdp = (hugepd_t *)pu;
- else {
- pdshift = PMD_SHIFT;
- pm = pmd_alloc(mm, pu, addr);
- if (pshift == PMD_SHIFT)
- /* 16MB hugepage */
- return (pte_t *)pm;
- else
- hpdp = (hugepd_t *)pm;
- }
- }
- if (!hpdp)
+ pud = pud_alloc(mm, p4d, addr);
+ if (!pud)
return NULL;
+ if (!mm_pmd_folded(mm) && sz >= PUD_SIZE)
+ return (pte_t *)pud;
- BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
-
- if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
+ pmd = pmd_alloc(mm, pud, addr);
+ if (!pmd)
return NULL;
- return hugepte_offset(hpdp, addr, pdshift);
-}
-
-#else
-
-pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
-{
- pgd_t *pg;
- pud_t *pu;
- pmd_t *pm;
- hugepd_t *hpdp = NULL;
- unsigned pshift = __ffs(sz);
- unsigned pdshift = PGDIR_SHIFT;
-
- addr &= ~(sz-1);
-
- pg = pgd_offset(mm, addr);
+ if (sz >= PMD_SIZE) {
+ /* On 8xx, all hugepages are handled as contiguous PTEs */
+ if (IS_ENABLED(CONFIG_PPC_8xx)) {
+ int i;
- if (pshift >= HUGEPD_PGD_SHIFT) {
- hpdp = (hugepd_t *)pg;
- } else {
- pdshift = PUD_SHIFT;
- pu = pud_alloc(mm, pg, addr);
- if (pshift >= HUGEPD_PUD_SHIFT) {
- hpdp = (hugepd_t *)pu;
- } else {
- pdshift = PMD_SHIFT;
- pm = pmd_alloc(mm, pu, addr);
- hpdp = (hugepd_t *)pm;
+ for (i = 0; i < sz / PMD_SIZE; i++) {
+ if (!pte_alloc_huge(mm, pmd + i, addr))
+ return NULL;
+ }
}
+ return (pte_t *)pmd;
}
- if (!hpdp)
- return NULL;
-
- BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
-
- if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
- return NULL;
-
- return hugepte_offset(hpdp, addr, pdshift);
-}
-#endif
-
-#ifdef CONFIG_PPC_FSL_BOOK3E
-/* Build list of addresses of gigantic pages. This function is used in early
- * boot before the buddy or bootmem allocator is setup.
- */
-void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
-{
- unsigned int idx = shift_to_mmu_psize(__ffs(page_size));
- int i;
-
- if (addr == 0)
- return;
-
- gpage_freearray[idx].nr_gpages = number_of_pages;
-
- for (i = 0; i < number_of_pages; i++) {
- gpage_freearray[idx].gpage_list[i] = addr;
- addr += page_size;
- }
+ return pte_alloc_huge(mm, pmd, addr);
}
+#ifdef CONFIG_PPC_BOOK3S_64
/*
- * Moves the gigantic page addresses from the temporary list to the
- * huge_boot_pages list.
- */
-int alloc_bootmem_huge_page(struct hstate *hstate)
-{
- struct huge_bootmem_page *m;
- int idx = shift_to_mmu_psize(huge_page_shift(hstate));
- int nr_gpages = gpage_freearray[idx].nr_gpages;
-
- if (nr_gpages == 0)
- return 0;
-
-#ifdef CONFIG_HIGHMEM
- /*
- * If gpages can be in highmem we can't use the trick of storing the
- * data structure in the page; allocate space for this
- */
- m = alloc_bootmem(sizeof(struct huge_bootmem_page));
- m->phys = gpage_freearray[idx].gpage_list[--nr_gpages];
-#else
- m = phys_to_virt(gpage_freearray[idx].gpage_list[--nr_gpages]);
-#endif
-
- list_add(&m->list, &huge_boot_pages);
- gpage_freearray[idx].nr_gpages = nr_gpages;
- gpage_freearray[idx].gpage_list[nr_gpages] = 0;
- m->hstate = hstate;
-
- return 1;
-}
-/*
- * Scan the command line hugepagesz= options for gigantic pages; store those in
- * a list that we use to allocate the memory once all options are parsed.
+ * Tracks gpages after the device tree is scanned and before the
+ * huge_boot_pages list is ready on pseries.
*/
-
-unsigned long gpage_npages[MMU_PAGE_COUNT];
-
-static int __init do_gpage_early_setup(char *param, char *val,
- const char *unused)
-{
- static phys_addr_t size;
- unsigned long npages;
-
- /*
- * The hugepagesz and hugepages cmdline options are interleaved. We
- * use the size variable to keep track of whether or not this was done
- * properly and skip over instances where it is incorrect. Other
- * command-line parsing code will issue warnings, so we don't need to.
- *
- */
- if ((strcmp(param, "default_hugepagesz") == 0) ||
- (strcmp(param, "hugepagesz") == 0)) {
- size = memparse(val, NULL);
- } else if (strcmp(param, "hugepages") == 0) {
- if (size != 0) {
- if (sscanf(val, "%lu", &npages) <= 0)
- npages = 0;
- gpage_npages[shift_to_mmu_psize(__ffs(size))] = npages;
- size = 0;
- }
- }
- return 0;
-}
-
+#define MAX_NUMBER_GPAGES 1024
+__initdata static u64 gpage_freearray[MAX_NUMBER_GPAGES];
+__initdata static unsigned nr_gpages;
/*
- * This function allocates physical space for pages that are larger than the
- * buddy allocator can handle. We want to allocate these in highmem because
- * the amount of lowmem is limited. This means that this function MUST be
- * called before lowmem_end_addr is set up in MMU_init() in order for the lmb
- * allocate to grab highmem.
+ * Build list of addresses of gigantic pages. This function is used in early
+ * boot before the buddy allocator is setup.
*/
-void __init reserve_hugetlb_gpages(void)
-{
- static __initdata char cmdline[COMMAND_LINE_SIZE];
- phys_addr_t size, base;
- int i;
-
- strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE);
- parse_args("hugetlb gpages", cmdline, NULL, 0, 0, 0,
- &do_gpage_early_setup);
-
- /*
- * Walk gpage list in reverse, allocating larger page sizes first.
- * Skip over unsupported sizes, or sizes that have 0 gpages allocated.
- * When we reach the point in the list where pages are no longer
- * considered gpages, we're done.
- */
- for (i = MMU_PAGE_COUNT-1; i >= 0; i--) {
- if (mmu_psize_defs[i].shift == 0 || gpage_npages[i] == 0)
- continue;
- else if (mmu_psize_to_shift(i) < (MAX_ORDER + PAGE_SHIFT))
- break;
-
- size = (phys_addr_t)(1ULL << mmu_psize_to_shift(i));
- base = memblock_alloc_base(size * gpage_npages[i], size,
- MEMBLOCK_ALLOC_ANYWHERE);
- add_gpage(base, size, gpage_npages[i]);
- }
-}
-
-#else /* !PPC_FSL_BOOK3E */
-
-/* Build list of addresses of gigantic pages. This function is used in early
- * boot before the buddy or bootmem allocator is setup.
- */
-void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
+void __init pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
{
if (!addr)
return;
@@ -413,458 +104,82 @@ void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
}
}
-/* Moves the gigantic page addresses from the temporary list to the
- * huge_boot_pages list.
- */
-int alloc_bootmem_huge_page(struct hstate *hstate)
+static int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate)
{
struct huge_bootmem_page *m;
if (nr_gpages == 0)
return 0;
m = phys_to_virt(gpage_freearray[--nr_gpages]);
gpage_freearray[nr_gpages] = 0;
- list_add(&m->list, &huge_boot_pages);
+ list_add(&m->list, &huge_boot_pages[0]);
m->hstate = hstate;
+ m->flags = 0;
return 1;
}
-#endif
-
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
-{
- return 0;
-}
-
-#ifdef CONFIG_PPC_FSL_BOOK3E
-#define HUGEPD_FREELIST_SIZE \
- ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))
-
-struct hugepd_freelist {
- struct rcu_head rcu;
- unsigned int index;
- void *ptes[0];
-};
-
-static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);
-
-static void hugepd_free_rcu_callback(struct rcu_head *head)
-{
- struct hugepd_freelist *batch =
- container_of(head, struct hugepd_freelist, rcu);
- unsigned int i;
-
- for (i = 0; i < batch->index; i++)
- kmem_cache_free(hugepte_cache, batch->ptes[i]);
-
- free_page((unsigned long)batch);
-}
-
-static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
-{
- struct hugepd_freelist **batchp;
-
- batchp = &__get_cpu_var(hugepd_freelist_cur);
-
- if (atomic_read(&tlb->mm->mm_users) < 2 ||
- cpumask_equal(mm_cpumask(tlb->mm),
- cpumask_of(smp_processor_id()))) {
- kmem_cache_free(hugepte_cache, hugepte);
- return;
- }
-
- if (*batchp == NULL) {
- *batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
- (*batchp)->index = 0;
- }
-
- (*batchp)->ptes[(*batchp)->index++] = hugepte;
- if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
- call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
- *batchp = NULL;
- }
-}
-#endif
-
-static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
- unsigned long start, unsigned long end,
- unsigned long floor, unsigned long ceiling)
-{
- pte_t *hugepte = hugepd_page(*hpdp);
- int i;
-
- unsigned long pdmask = ~((1UL << pdshift) - 1);
- unsigned int num_hugepd = 1;
-
-#ifdef CONFIG_PPC_FSL_BOOK3E
- /* Note: On fsl the hpdp may be the first of several */
- num_hugepd = (1 << (hugepd_shift(*hpdp) - pdshift));
-#else
- unsigned int shift = hugepd_shift(*hpdp);
-#endif
-
- start &= pdmask;
- if (start < floor)
- return;
- if (ceiling) {
- ceiling &= pdmask;
- if (! ceiling)
- return;
- }
- if (end - 1 > ceiling - 1)
- return;
-
- for (i = 0; i < num_hugepd; i++, hpdp++)
- hpdp->pd = 0;
-
- tlb->need_flush = 1;
-
-#ifdef CONFIG_PPC_FSL_BOOK3E
- hugepd_free(tlb, hugepte);
-#else
- pgtable_free_tlb(tlb, hugepte, pdshift - shift);
-#endif
-}
-static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
- unsigned long addr, unsigned long end,
- unsigned long floor, unsigned long ceiling)
+bool __init hugetlb_node_alloc_supported(void)
{
- pmd_t *pmd;
- unsigned long next;
- unsigned long start;
-
- start = addr;
- do {
- pmd = pmd_offset(pud, addr);
- next = pmd_addr_end(addr, end);
- if (!is_hugepd(pmd)) {
- /*
- * if it is not hugepd pointer, we should already find
- * it cleared.
- */
- WARN_ON(!pmd_none_or_clear_bad(pmd));
- continue;
- }
-#ifdef CONFIG_PPC_FSL_BOOK3E
- /*
- * Increment next by the size of the huge mapping since
- * there may be more than one entry at this level for a
- * single hugepage, but all of them point to
- * the same kmem cache that holds the hugepte.
- */
- next = addr + (1 << hugepd_shift(*(hugepd_t *)pmd));
-#endif
- free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
- addr, next, floor, ceiling);
- } while (addr = next, addr != end);
-
- start &= PUD_MASK;
- if (start < floor)
- return;
- if (ceiling) {
- ceiling &= PUD_MASK;
- if (!ceiling)
- return;
- }
- if (end - 1 > ceiling - 1)
- return;
-
- pmd = pmd_offset(pud, start);
- pud_clear(pud);
- pmd_free_tlb(tlb, pmd, start);
+ return false;
}
-
-static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
- unsigned long addr, unsigned long end,
- unsigned long floor, unsigned long ceiling)
-{
- pud_t *pud;
- unsigned long next;
- unsigned long start;
-
- start = addr;
- do {
- pud = pud_offset(pgd, addr);
- next = pud_addr_end(addr, end);
- if (!is_hugepd(pud)) {
- if (pud_none_or_clear_bad(pud))
- continue;
- hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
- ceiling);
- } else {
-#ifdef CONFIG_PPC_FSL_BOOK3E
- /*
- * Increment next by the size of the huge mapping since
- * there may be more than one entry at this level for a
- * single hugepage, but all of them point to
- * the same kmem cache that holds the hugepte.
- */
- next = addr + (1 << hugepd_shift(*(hugepd_t *)pud));
#endif
- free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
- addr, next, floor, ceiling);
- }
- } while (addr = next, addr != end);
- start &= PGDIR_MASK;
- if (start < floor)
- return;
- if (ceiling) {
- ceiling &= PGDIR_MASK;
- if (!ceiling)
- return;
- }
- if (end - 1 > ceiling - 1)
- return;
- pud = pud_offset(pgd, start);
- pgd_clear(pgd);
- pud_free_tlb(tlb, pud, start);
-}
-
-/*
- * This function frees user-level page tables of a process.
- *
- * Must be called with pagetable lock held.
- */
-void hugetlb_free_pgd_range(struct mmu_gather *tlb,
- unsigned long addr, unsigned long end,
- unsigned long floor, unsigned long ceiling)
+int __init alloc_bootmem_huge_page(struct hstate *h, int nid)
{
- pgd_t *pgd;
- unsigned long next;
-
- /*
- * Because there are a number of different possible pagetable
- * layouts for hugepage ranges, we limit knowledge of how
- * things should be laid out to the allocation path
- * (huge_pte_alloc(), above). Everything else works out the
- * structure as it goes from information in the hugepd
- * pointers. That means that we can't here use the
- * optimization used in the normal page free_pgd_range(), of
- * checking whether we're actually covering a large enough
- * range to have to do anything at the top level of the walk
- * instead of at the bottom.
- *
- * To make sense of this, you should probably go read the big
- * block comment at the top of the normal free_pgd_range(),
- * too.
- */
- do {
- next = pgd_addr_end(addr, end);
- pgd = pgd_offset(tlb->mm, addr);
- if (!is_hugepd(pgd)) {
- if (pgd_none_or_clear_bad(pgd))
- continue;
- hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
- } else {
-#ifdef CONFIG_PPC_FSL_BOOK3E
- /*
- * Increment next by the size of the huge mapping since
- * there may be more than one entry at the pgd level
- * for a single hugepage, but all of them point to the
- * same kmem cache that holds the hugepte.
- */
- next = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
-#endif
- free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
- addr, next, floor, ceiling);
- }
- } while (addr = next, addr != end);
-}
-
-struct page *
-follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
-{
- pte_t *ptep;
- struct page *page;
- unsigned shift;
- unsigned long mask;
- /*
- * Transparent hugepages are handled by generic code. We can skip them
- * here.
- */
- ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);
-
- /* Verify it is a huge page else bail. */
- if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep))
- return ERR_PTR(-EINVAL);
-
- mask = (1UL << shift) - 1;
- page = pte_page(*ptep);
- if (page)
- page += (address & mask) / PAGE_SIZE;
-
- return page;
-}
-
-struct page *
-follow_huge_pmd(struct mm_struct *mm, unsigned long address,
- pmd_t *pmd, int write)
-{
- BUG();
- return NULL;
-}
-
-static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
- unsigned long sz)
-{
- unsigned long __boundary = (addr + sz) & ~(sz-1);
- return (__boundary - 1 < end - 1) ? __boundary : end;
-}
-
-int gup_hugepd(hugepd_t *hugepd, unsigned pdshift,
- unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
-{
- pte_t *ptep;
- unsigned long sz = 1UL << hugepd_shift(*hugepd);
- unsigned long next;
-
- ptep = hugepte_offset(hugepd, addr, pdshift);
- do {
- next = hugepte_addr_end(addr, end, sz);
- if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
- return 0;
- } while (ptep++, addr = next, addr != end);
-
- return 1;
-}
-
-#ifdef CONFIG_PPC_MM_SLICES
-unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
- unsigned long len, unsigned long pgoff,
- unsigned long flags)
-{
- struct hstate *hstate = hstate_file(file);
- int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
-
- return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1);
-}
-#endif
-
-unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
-{
-#ifdef CONFIG_PPC_MM_SLICES
- unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
-
- return 1UL << mmu_psize_to_shift(psize);
-#else
- if (!is_vm_hugetlb_page(vma))
- return PAGE_SIZE;
-
- return huge_page_size(hstate_vma(vma));
+#ifdef CONFIG_PPC_BOOK3S_64
+ if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled())
+ return pseries_alloc_bootmem_huge_page(h);
#endif
+ return __alloc_bootmem_huge_page(h, nid);
}
-static inline bool is_power_of_4(unsigned long x)
-{
- if (is_power_of_2(x))
- return (__ilog2(x) % 2) ? false : true;
- return false;
-}
-
-static int __init add_huge_page_size(unsigned long long size)
+bool __init arch_hugetlb_valid_size(unsigned long size)
{
int shift = __ffs(size);
int mmu_psize;
/* Check that it is a page size supported by the hardware and
* that it fits within pagetable and slice limits. */
-#ifdef CONFIG_PPC_FSL_BOOK3E
- if ((size < PAGE_SIZE) || !is_power_of_4(size))
- return -EINVAL;
-#else
- if (!is_power_of_2(size)
- || (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT))
- return -EINVAL;
-#endif
+ if (size <= PAGE_SIZE || !is_power_of_2(size))
+ return false;
- if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
- return -EINVAL;
-
-#ifdef CONFIG_SPU_FS_64K_LS
- /* Disable support for 64K huge pages when 64K SPU local store
- * support is enabled as the current implementation conflicts.
- */
- if (shift == PAGE_SHIFT_64K)
- return -EINVAL;
-#endif /* CONFIG_SPU_FS_64K_LS */
+ mmu_psize = check_and_get_huge_psize(shift);
+ if (mmu_psize < 0)
+ return false;
BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);
- /* Return if huge page size has already been setup */
- if (size_to_hstate(size))
- return 0;
-
- hugetlb_add_hstate(shift - PAGE_SHIFT);
-
- return 0;
+ return true;
}
-static int __init hugepage_setup_sz(char *str)
+static int __init add_huge_page_size(unsigned long long size)
{
- unsigned long long size;
-
- size = memparse(str, &str);
+ int shift = __ffs(size);
- if (add_huge_page_size(size) != 0)
- printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size);
+ if (!arch_hugetlb_valid_size((unsigned long)size))
+ return -EINVAL;
- return 1;
+ hugetlb_add_hstate(shift - PAGE_SHIFT);
+ return 0;
}
-__setup("hugepagesz=", hugepage_setup_sz);
-#ifdef CONFIG_PPC_FSL_BOOK3E
-struct kmem_cache *hugepte_cache;
static int __init hugetlbpage_init(void)
{
+ bool configured = false;
int psize;
- for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
- unsigned shift;
-
- if (!mmu_psize_defs[psize].shift)
- continue;
-
- shift = mmu_psize_to_shift(psize);
-
- /* Don't treat normal page sizes as huge... */
- if (shift != PAGE_SHIFT)
- if (add_huge_page_size(1ULL << shift) < 0)
- continue;
+ if (hugetlb_disabled) {
+ pr_info("HugeTLB support is disabled!\n");
+ return 0;
}
- /*
- * Create a kmem cache for hugeptes. The bottom bits in the pte have
- * size information encoded in them, so align them to allow this
- */
- hugepte_cache = kmem_cache_create("hugepte-cache", sizeof(pte_t),
- HUGEPD_SHIFT_MASK + 1, 0, NULL);
- if (hugepte_cache == NULL)
- panic("%s: Unable to create kmem cache for hugeptes\n",
- __func__);
-
- /* Default hpage size = 4M */
- if (mmu_psize_defs[MMU_PAGE_4M].shift)
- HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
- else
- panic("%s: Unable to set default huge page size\n", __func__);
-
-
- return 0;
-}
-#else
-static int __init hugetlbpage_init(void)
-{
- int psize;
-
- if (!mmu_has_feature(MMU_FTR_16M_PAGE))
+ if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !radix_enabled() &&
+ !mmu_has_feature(MMU_FTR_16M_PAGE))
return -ENODEV;
for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
unsigned shift;
- unsigned pdshift;
if (!mmu_psize_defs[psize].shift)
continue;
@@ -874,216 +189,29 @@ static int __init hugetlbpage_init(void)
if (add_huge_page_size(1ULL << shift) < 0)
continue;
- if (shift < PMD_SHIFT)
- pdshift = PMD_SHIFT;
- else if (shift < PUD_SHIFT)
- pdshift = PUD_SHIFT;
- else
- pdshift = PGDIR_SHIFT;
- /*
- * if we have pdshift and shift value same, we don't
- * use pgt cache for hugepd.
- */
- if (pdshift != shift) {
- pgtable_cache_add(pdshift - shift, NULL);
- if (!PGT_CACHE(pdshift - shift))
- panic("hugetlbpage_init(): could not create "
- "pgtable cache for %d bit pagesize\n", shift);
- }
+ configured = true;
}
- /* Set default large page size. Currently, we pick 16M or 1M
- * depending on what is available
- */
- if (mmu_psize_defs[MMU_PAGE_16M].shift)
- HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
- else if (mmu_psize_defs[MMU_PAGE_1M].shift)
- HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
+ if (!configured)
+ pr_info("Failed to initialize. Disabling HugeTLB");
return 0;
}
-#endif
-module_init(hugetlbpage_init);
-
-void flush_dcache_icache_hugepage(struct page *page)
-{
- int i;
- void *start;
-
- BUG_ON(!PageCompound(page));
- for (i = 0; i < (1UL << compound_order(page)); i++) {
- if (!PageHighMem(page)) {
- __flush_dcache_icache(page_address(page+i));
- } else {
- start = kmap_atomic(page+i);
- __flush_dcache_icache(start);
- kunmap_atomic(start);
- }
- }
-}
+arch_initcall(hugetlbpage_init);
-#endif /* CONFIG_HUGETLB_PAGE */
-
-/*
- * We have 4 cases for pgds and pmds:
- * (1) invalid (all zeroes)
- * (2) pointer to next table, as normal; bottom 6 bits == 0
- * (3) leaf pte for huge page, bottom two bits != 00
- * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table
- *
- * So long as we atomically load page table pointers we are safe against teardown,
- * we can follow the address down to the the page and take a ref on it.
- */
-
-pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
+void __init gigantic_hugetlb_cma_reserve(void)
{
- pgd_t pgd, *pgdp;
- pud_t pud, *pudp;
- pmd_t pmd, *pmdp;
- pte_t *ret_pte;
- hugepd_t *hpdp = NULL;
- unsigned pdshift = PGDIR_SHIFT;
-
- if (shift)
- *shift = 0;
+ unsigned long order = 0;
- pgdp = pgdir + pgd_index(ea);
- pgd = ACCESS_ONCE(*pgdp);
- /*
- * Always operate on the local stack value. This make sure the
- * value don't get updated by a parallel THP split/collapse,
- * page fault or a page unmap. The return pte_t * is still not
- * stable. So should be checked there for above conditions.
- */
- if (pgd_none(pgd))
- return NULL;
- else if (pgd_huge(pgd)) {
- ret_pte = (pte_t *) pgdp;
- goto out;
- } else if (is_hugepd(&pgd))
- hpdp = (hugepd_t *)&pgd;
- else {
+ if (radix_enabled())
+ order = PUD_SHIFT - PAGE_SHIFT;
+ else if (!firmware_has_feature(FW_FEATURE_LPAR) && mmu_psize_defs[MMU_PAGE_16G].shift)
/*
- * Even if we end up with an unmap, the pgtable will not
- * be freed, because we do an rcu free and here we are
- * irq disabled
+ * For pseries we do use ibm,expected#pages for reserving 16G pages.
*/
- pdshift = PUD_SHIFT;
- pudp = pud_offset(&pgd, ea);
- pud = ACCESS_ONCE(*pudp);
+ order = mmu_psize_to_shift(MMU_PAGE_16G) - PAGE_SHIFT;
- if (pud_none(pud))
- return NULL;
- else if (pud_huge(pud)) {
- ret_pte = (pte_t *) pudp;
- goto out;
- } else if (is_hugepd(&pud))
- hpdp = (hugepd_t *)&pud;
- else {
- pdshift = PMD_SHIFT;
- pmdp = pmd_offset(&pud, ea);
- pmd = ACCESS_ONCE(*pmdp);
- /*
- * A hugepage collapse is captured by pmd_none, because
- * it mark the pmd none and do a hpte invalidate.
- *
- * A hugepage split is captured by pmd_trans_splitting
- * because we mark the pmd trans splitting and do a
- * hpte invalidate
- *
- */
- if (pmd_none(pmd) || pmd_trans_splitting(pmd))
- return NULL;
-
- if (pmd_huge(pmd) || pmd_large(pmd)) {
- ret_pte = (pte_t *) pmdp;
- goto out;
- } else if (is_hugepd(&pmd))
- hpdp = (hugepd_t *)&pmd;
- else
- return pte_offset_kernel(&pmd, ea);
- }
- }
- if (!hpdp)
- return NULL;
-
- ret_pte = hugepte_offset(hpdp, ea, pdshift);
- pdshift = hugepd_shift(*hpdp);
-out:
- if (shift)
- *shift = pdshift;
- return ret_pte;
-}
-EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte);
-
-int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr)
-{
- unsigned long mask;
- unsigned long pte_end;
- struct page *head, *page, *tail;
- pte_t pte;
- int refs;
-
- pte_end = (addr + sz) & ~(sz-1);
- if (pte_end < end)
- end = pte_end;
-
- pte = ACCESS_ONCE(*ptep);
- mask = _PAGE_PRESENT | _PAGE_USER;
- if (write)
- mask |= _PAGE_RW;
-
- if ((pte_val(pte) & mask) != mask)
- return 0;
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- /*
- * check for splitting here
- */
- if (pmd_trans_splitting(pte_pmd(pte)))
- return 0;
-#endif
-
- /* hugepages are never "special" */
- VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
-
- refs = 0;
- head = pte_page(pte);
-
- page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
- tail = page;
- do {
- VM_BUG_ON(compound_head(page) != head);
- pages[*nr] = page;
- (*nr)++;
- page++;
- refs++;
- } while (addr += PAGE_SIZE, addr != end);
-
- if (!page_cache_add_speculative(head, refs)) {
- *nr -= refs;
- return 0;
- }
-
- if (unlikely(pte_val(pte) != pte_val(*ptep))) {
- /* Could be optimized better */
- *nr -= refs;
- while (refs--)
- put_page(head);
- return 0;
- }
-
- /*
- * Any tail page need their mapcount reference taken before we
- * return.
- */
- while (refs--) {
- if (PageTail(tail))
- get_huge_page_tail(tail);
- tail++;
- }
-
- return 1;
+ if (order)
+ hugetlb_cma_reserve(order);
}
diff --git a/arch/powerpc/mm/icswx.c b/arch/powerpc/mm/icswx.c
deleted file mode 100644
index 915412e4d5ba..000000000000
--- a/arch/powerpc/mm/icswx.c
+++ /dev/null
@@ -1,292 +0,0 @@
-/*
- * ICSWX and ACOP Management
- *
- * Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- */
-
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/spinlock.h>
-#include <linux/module.h>
-#include <linux/uaccess.h>
-
-#include "icswx.h"
-
-/*
- * The processor and its L2 cache cause the icswx instruction to
- * generate a COP_REQ transaction on PowerBus. The transaction has no
- * address, and the processor does not perform an MMU access to
- * authenticate the transaction. The command portion of the PowerBus
- * COP_REQ transaction includes the LPAR_ID (LPID) and the coprocessor
- * Process ID (PID), which the coprocessor compares to the authorized
- * LPID and PID held in the coprocessor, to determine if the process
- * is authorized to generate the transaction. The data of the COP_REQ
- * transaction is 128-byte or less in size and is placed in cacheable
- * memory on a 128-byte cache line boundary.
- *
- * The task to use a coprocessor should use use_cop() to mark the use
- * of the Coprocessor Type (CT) and context switching. On a server
- * class processor, the PID register is used only for coprocessor
- * management + * and so a coprocessor PID is allocated before
- * executing icswx + * instruction. Drop_cop() is used to free the
- * coprocessor PID.
- *
- * Example:
- * Host Fabric Interface (HFI) is a PowerPC network coprocessor.
- * Each HFI have multiple windows. Each HFI window serves as a
- * network device sending to and receiving from HFI network.
- * HFI immediate send function uses icswx instruction. The immediate
- * send function allows small (single cache-line) packets be sent
- * without using the regular HFI send FIFO and doorbell, which are
- * much slower than immediate send.
- *
- * For each task intending to use HFI immediate send, the HFI driver
- * calls use_cop() to obtain a coprocessor PID for the task.
- * The HFI driver then allocate a free HFI window and save the
- * coprocessor PID to the HFI window to allow the task to use the
- * HFI window.
- *
- * The HFI driver repeatedly creates immediate send packets and
- * issues icswx instruction to send data through the HFI window.
- * The HFI compares the coprocessor PID in the CPU PID register
- * to the PID held in the HFI window to determine if the transaction
- * is allowed.
- *
- * When the task to release the HFI window, the HFI driver calls
- * drop_cop() to release the coprocessor PID.
- */
-
-void switch_cop(struct mm_struct *next)
-{
-#ifdef CONFIG_PPC_ICSWX_PID
- mtspr(SPRN_PID, next->context.cop_pid);
-#endif
- mtspr(SPRN_ACOP, next->context.acop);
-}
-
-/**
- * Start using a coprocessor.
- * @acop: mask of coprocessor to be used.
- * @mm: The mm the coprocessor to associate with. Most likely current mm.
- *
- * Return a positive PID if successful. Negative errno otherwise.
- * The returned PID will be fed to the coprocessor to determine if an
- * icswx transaction is authenticated.
- */
-int use_cop(unsigned long acop, struct mm_struct *mm)
-{
- int ret;
-
- if (!cpu_has_feature(CPU_FTR_ICSWX))
- return -ENODEV;
-
- if (!mm || !acop)
- return -EINVAL;
-
- /* The page_table_lock ensures mm_users won't change under us */
- spin_lock(&mm->page_table_lock);
- spin_lock(mm->context.cop_lockp);
-
- ret = get_cop_pid(mm);
- if (ret < 0)
- goto out;
-
- /* update acop */
- mm->context.acop |= acop;
-
- sync_cop(mm);
-
- /*
- * If this is a threaded process then there might be other threads
- * running. We need to send an IPI to force them to pick up any
- * change in PID and ACOP.
- */
- if (atomic_read(&mm->mm_users) > 1)
- smp_call_function(sync_cop, mm, 1);
-
-out:
- spin_unlock(mm->context.cop_lockp);
- spin_unlock(&mm->page_table_lock);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(use_cop);
-
-/**
- * Stop using a coprocessor.
- * @acop: mask of coprocessor to be stopped.
- * @mm: The mm the coprocessor associated with.
- */
-void drop_cop(unsigned long acop, struct mm_struct *mm)
-{
- int free_pid;
-
- if (!cpu_has_feature(CPU_FTR_ICSWX))
- return;
-
- if (WARN_ON_ONCE(!mm))
- return;
-
- /* The page_table_lock ensures mm_users won't change under us */
- spin_lock(&mm->page_table_lock);
- spin_lock(mm->context.cop_lockp);
-
- mm->context.acop &= ~acop;
-
- free_pid = disable_cop_pid(mm);
- sync_cop(mm);
-
- /*
- * If this is a threaded process then there might be other threads
- * running. We need to send an IPI to force them to pick up any
- * change in PID and ACOP.
- */
- if (atomic_read(&mm->mm_users) > 1)
- smp_call_function(sync_cop, mm, 1);
-
- if (free_pid != COP_PID_NONE)
- free_cop_pid(free_pid);
-
- spin_unlock(mm->context.cop_lockp);
- spin_unlock(&mm->page_table_lock);
-}
-EXPORT_SYMBOL_GPL(drop_cop);
-
-static int acop_use_cop(int ct)
-{
- /* There is no alternate policy, yet */
- return -1;
-}
-
-/*
- * Get the instruction word at the NIP
- */
-static u32 acop_get_inst(struct pt_regs *regs)
-{
- u32 inst;
- u32 __user *p;
-
- p = (u32 __user *)regs->nip;
- if (!access_ok(VERIFY_READ, p, sizeof(*p)))
- return 0;
-
- if (__get_user(inst, p))
- return 0;
-
- return inst;
-}
-
-/**
- * @regs: regsiters at time of interrupt
- * @address: storage address
- * @error_code: Fault code, usually the DSISR or ESR depending on
- * processor type
- *
- * Return 0 if we are able to resolve the data storage fault that
- * results from a CT miss in the ACOP register.
- */
-int acop_handle_fault(struct pt_regs *regs, unsigned long address,
- unsigned long error_code)
-{
- int ct;
- u32 inst = 0;
-
- if (!cpu_has_feature(CPU_FTR_ICSWX)) {
- pr_info("No coprocessors available");
- _exception(SIGILL, regs, ILL_ILLOPN, address);
- }
-
- if (!user_mode(regs)) {
- /* this could happen if the HV denies the
- * kernel access, for now we just die */
- die("ICSWX from kernel failed", regs, SIGSEGV);
- }
-
- /* Some implementations leave us a hint for the CT */
- ct = ICSWX_GET_CT_HINT(error_code);
- if (ct < 0) {
- /* we have to peek at the instruction word to figure out CT */
- u32 ccw;
- u32 rs;
-
- inst = acop_get_inst(regs);
- if (inst == 0)
- return -1;
-
- rs = (inst >> (31 - 10)) & 0x1f;
- ccw = regs->gpr[rs];
- ct = (ccw >> 16) & 0x3f;
- }
-
- /*
- * We could be here because another thread has enabled acop
- * but the ACOP register has yet to be updated.
- *
- * This should have been taken care of by the IPI to sync all
- * the threads (see smp_call_function(sync_cop, mm, 1)), but
- * that could take forever if there are a significant amount
- * of threads.
- *
- * Given the number of threads on some of these systems,
- * perhaps this is the best way to sync ACOP rather than whack
- * every thread with an IPI.
- */
- if ((acop_copro_type_bit(ct) & current->active_mm->context.acop) != 0) {
- sync_cop(current->active_mm);
- return 0;
- }
-
- /* check for alternate policy */
- if (!acop_use_cop(ct))
- return 0;
-
- /* at this point the CT is unknown to the system */
- pr_warn("%s[%d]: Coprocessor %d is unavailable\n",
- current->comm, current->pid, ct);
-
- /* get inst if we don't already have it */
- if (inst == 0) {
- inst = acop_get_inst(regs);
- if (inst == 0)
- return -1;
- }
-
- /* Check if the instruction is the "record form" */
- if (inst & 1) {
- /*
- * the instruction is "record" form so we can reject
- * using CR0
- */
- regs->ccr &= ~(0xful << 28);
- regs->ccr |= ICSWX_RC_NOT_FOUND << 28;
-
- /* Move on to the next instruction */
- regs->nip += 4;
- } else {
- /*
- * There is no architected mechanism to report a bad
- * CT so we could either SIGILL or report nothing.
- * Since the non-record version should only bu used
- * for "hints" or "don't care" we should probably do
- * nothing. However, I could see how some people
- * might want an SIGILL so it here if you want it.
- */
-#ifdef CONFIG_PPC_ICSWX_USE_SIGILL
- _exception(SIGILL, regs, ILL_ILLOPN, address);
-#else
- regs->nip += 4;
-#endif
- }
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(acop_handle_fault);
diff --git a/arch/powerpc/mm/icswx.h b/arch/powerpc/mm/icswx.h
deleted file mode 100644
index 6dedc08e62c8..000000000000
--- a/arch/powerpc/mm/icswx.h
+++ /dev/null
@@ -1,68 +0,0 @@
-#ifndef _ARCH_POWERPC_MM_ICSWX_H_
-#define _ARCH_POWERPC_MM_ICSWX_H_
-
-/*
- * ICSWX and ACOP Management
- *
- * Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- */
-
-#include <asm/mmu_context.h>
-
-/* also used to denote that PIDs are not used */
-#define COP_PID_NONE 0
-
-static inline void sync_cop(void *arg)
-{
- struct mm_struct *mm = arg;
-
- if (mm == current->active_mm)
- switch_cop(current->active_mm);
-}
-
-#ifdef CONFIG_PPC_ICSWX_PID
-extern int get_cop_pid(struct mm_struct *mm);
-extern int disable_cop_pid(struct mm_struct *mm);
-extern void free_cop_pid(int free_pid);
-#else
-#define get_cop_pid(m) (COP_PID_NONE)
-#define disable_cop_pid(m) (COP_PID_NONE)
-#define free_cop_pid(p)
-#endif
-
-/*
- * These are implementation bits for architected registers. If this
- * ever becomes architecture the should be moved to reg.h et. al.
- */
-/* UCT is the same bit for Server and Embedded */
-#define ICSWX_DSI_UCT 0x00004000 /* Unavailable Coprocessor Type */
-
-#ifdef CONFIG_PPC_BOOK3E
-/* Embedded implementation gives us no hints as to what the CT is */
-#define ICSWX_GET_CT_HINT(x) (-1)
-#else
-/* Server implementation contains the CT value in the DSISR */
-#define ICSWX_DSISR_CTMASK 0x00003f00
-#define ICSWX_GET_CT_HINT(x) (((x) & ICSWX_DSISR_CTMASK) >> 8)
-#endif
-
-#define ICSWX_RC_STARTED 0x8 /* The request has been started */
-#define ICSWX_RC_NOT_IDLE 0x4 /* No coprocessor found idle */
-#define ICSWX_RC_NOT_FOUND 0x2 /* No coprocessor found */
-#define ICSWX_RC_UNDEFINED 0x1 /* Reserved */
-
-extern int acop_handle_fault(struct pt_regs *regs, unsigned long address,
- unsigned long error_code);
-
-static inline u64 acop_copro_type_bit(unsigned int type)
-{
- return 1ULL << (63 - type);
-}
-
-#endif /* !_ARCH_POWERPC_MM_ICSWX_H_ */
diff --git a/arch/powerpc/mm/icswx_pid.c b/arch/powerpc/mm/icswx_pid.c
deleted file mode 100644
index 91e30eb7d054..000000000000
--- a/arch/powerpc/mm/icswx_pid.c
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * ICSWX and ACOP/PID Management
- *
- * Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- */
-
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/spinlock.h>
-#include <linux/idr.h>
-#include <linux/module.h>
-#include "icswx.h"
-
-#define COP_PID_MIN (COP_PID_NONE + 1)
-#define COP_PID_MAX (0xFFFF)
-
-static DEFINE_SPINLOCK(mmu_context_acop_lock);
-static DEFINE_IDA(cop_ida);
-
-static int new_cop_pid(struct ida *ida, int min_id, int max_id,
- spinlock_t *lock)
-{
- int index;
- int err;
-
-again:
- if (!ida_pre_get(ida, GFP_KERNEL))
- return -ENOMEM;
-
- spin_lock(lock);
- err = ida_get_new_above(ida, min_id, &index);
- spin_unlock(lock);
-
- if (err == -EAGAIN)
- goto again;
- else if (err)
- return err;
-
- if (index > max_id) {
- spin_lock(lock);
- ida_remove(ida, index);
- spin_unlock(lock);
- return -ENOMEM;
- }
-
- return index;
-}
-
-int get_cop_pid(struct mm_struct *mm)
-{
- int pid;
-
- if (mm->context.cop_pid == COP_PID_NONE) {
- pid = new_cop_pid(&cop_ida, COP_PID_MIN, COP_PID_MAX,
- &mmu_context_acop_lock);
- if (pid >= 0)
- mm->context.cop_pid = pid;
- }
- return mm->context.cop_pid;
-}
-
-int disable_cop_pid(struct mm_struct *mm)
-{
- int free_pid = COP_PID_NONE;
-
- if ((!mm->context.acop) && (mm->context.cop_pid != COP_PID_NONE)) {
- free_pid = mm->context.cop_pid;
- mm->context.cop_pid = COP_PID_NONE;
- }
- return free_pid;
-}
-
-void free_cop_pid(int free_pid)
-{
- spin_lock(&mmu_context_acop_lock);
- ida_remove(&cop_ida, free_pid);
- spin_unlock(&mmu_context_acop_lock);
-}
diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c
new file mode 100644
index 000000000000..745097554bea
--- /dev/null
+++ b/arch/powerpc/mm/init-common.c
@@ -0,0 +1,167 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * PowerPC version
+ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ * and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ * Copyright (C) 1996 Paul Mackerras
+ *
+ * Derived from "arch/i386/mm/init.c"
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ *
+ * Dave Engebretsen <engebret@us.ibm.com>
+ * Rework for PPC64 port.
+ */
+
+#undef DEBUG
+
+#include <linux/string.h>
+#include <linux/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/kup.h>
+#include <asm/smp.h>
+
+phys_addr_t memstart_addr __ro_after_init = (phys_addr_t)~0ull;
+EXPORT_SYMBOL_GPL(memstart_addr);
+phys_addr_t kernstart_addr __ro_after_init;
+EXPORT_SYMBOL_GPL(kernstart_addr);
+unsigned long kernstart_virt_addr __ro_after_init = KERNELBASE;
+EXPORT_SYMBOL_GPL(kernstart_virt_addr);
+
+bool disable_kuep = !IS_ENABLED(CONFIG_PPC_KUEP);
+bool disable_kuap = !IS_ENABLED(CONFIG_PPC_KUAP);
+#ifdef CONFIG_KFENCE
+bool __ro_after_init kfence_disabled;
+bool __ro_after_init kfence_early_init = !!CONFIG_KFENCE_SAMPLE_INTERVAL;
+#endif
+
+static int __init parse_nosmep(char *p)
+{
+ if (!IS_ENABLED(CONFIG_PPC_BOOK3S_64))
+ return 0;
+
+ disable_kuep = true;
+ pr_warn("Disabling Kernel Userspace Execution Prevention\n");
+ return 0;
+}
+early_param("nosmep", parse_nosmep);
+
+static int __init parse_nosmap(char *p)
+{
+ disable_kuap = true;
+ pr_warn("Disabling Kernel Userspace Access Protection\n");
+ return 0;
+}
+early_param("nosmap", parse_nosmap);
+
+void __weak setup_kuep(bool disabled)
+{
+ if (!IS_ENABLED(CONFIG_PPC_KUEP) || disabled)
+ return;
+
+ if (smp_processor_id() != boot_cpuid)
+ return;
+
+ pr_info("Activating Kernel Userspace Execution Prevention\n");
+}
+
+void setup_kup(void)
+{
+ setup_kuap(disable_kuap);
+ setup_kuep(disable_kuep);
+}
+
+#define CTOR(shift) static void ctor_##shift(void *addr) \
+{ \
+ memset(addr, 0, sizeof(pgd_t) << (shift)); \
+}
+
+CTOR(0); CTOR(1); CTOR(2); CTOR(3); CTOR(4); CTOR(5); CTOR(6); CTOR(7);
+CTOR(8); CTOR(9); CTOR(10); CTOR(11); CTOR(12); CTOR(13); CTOR(14); CTOR(15);
+
+static inline void (*ctor(int shift))(void *)
+{
+ BUILD_BUG_ON(MAX_PGTABLE_INDEX_SIZE != 15);
+
+ switch (shift) {
+ case 0: return ctor_0;
+ case 1: return ctor_1;
+ case 2: return ctor_2;
+ case 3: return ctor_3;
+ case 4: return ctor_4;
+ case 5: return ctor_5;
+ case 6: return ctor_6;
+ case 7: return ctor_7;
+ case 8: return ctor_8;
+ case 9: return ctor_9;
+ case 10: return ctor_10;
+ case 11: return ctor_11;
+ case 12: return ctor_12;
+ case 13: return ctor_13;
+ case 14: return ctor_14;
+ case 15: return ctor_15;
+ }
+ return NULL;
+}
+
+struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE + 1];
+EXPORT_SYMBOL_GPL(pgtable_cache); /* used by kvm_hv module */
+
+/*
+ * Create a kmem_cache() for pagetables. This is not used for PTE
+ * pages - they're linked to struct page, come from the normal free
+ * pages pool and have a different entry size (see real_pte_t) to
+ * everything else. Caches created by this function are used for all
+ * the higher level pagetables, and for hugepage pagetables.
+ */
+void pgtable_cache_add(unsigned int shift)
+{
+ char *name;
+ unsigned long table_size = sizeof(pgd_t) << shift;
+ unsigned long align = table_size;
+
+ /* When batching pgtable pointers for RCU freeing, we store
+ * the index size in the low bits. Table alignment must be
+ * big enough to fit it.
+ */
+ unsigned long minalign = MAX_PGTABLE_INDEX_SIZE + 1;
+ struct kmem_cache *new = NULL;
+
+ /* It would be nice if this was a BUILD_BUG_ON(), but at the
+ * moment, gcc doesn't seem to recognize is_power_of_2 as a
+ * constant expression, so so much for that. */
+ BUG_ON(!is_power_of_2(minalign));
+ BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
+
+ if (PGT_CACHE(shift))
+ return; /* Already have a cache of this size */
+
+ align = max_t(unsigned long, align, minalign);
+ name = kasprintf(GFP_KERNEL, "pgtable-2^%d", shift);
+ if (name)
+ new = kmem_cache_create(name, table_size, align, 0, ctor(shift));
+ if (!new)
+ panic("Could not allocate pgtable cache for order %d", shift);
+
+ kfree(name);
+ pgtable_cache[shift] = new;
+
+ pr_debug("Allocated pgtable cache for order %d\n", shift);
+}
+EXPORT_SYMBOL_GPL(pgtable_cache_add); /* used by kvm_hv module */
+
+void pgtable_cache_init(void)
+{
+ pgtable_cache_add(PGD_INDEX_SIZE);
+
+ if (PMD_CACHE_INDEX)
+ pgtable_cache_add(PMD_CACHE_INDEX);
+ /*
+ * In all current configs, when the PUD index exists it's the
+ * same size as either the pgd or pmd index except with THP enabled
+ * on book3s 64
+ */
+ if (PUD_CACHE_INDEX)
+ pgtable_cache_add(PUD_CACHE_INDEX);
+}
diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index 01e2db97a210..4e71dfe7d026 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* PowerPC version
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
@@ -9,12 +10,6 @@
*
* Derived from "arch/i386/mm/init.c"
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#include <linux/module.h>
@@ -26,7 +21,6 @@
#include <linux/mm.h>
#include <linux/stddef.h>
#include <linux/init.h>
-#include <linux/bootmem.h>
#include <linux/highmem.h>
#include <linux/initrd.h>
#include <linux/pagemap.h>
@@ -35,10 +29,7 @@
#include <linux/slab.h>
#include <linux/hugetlb.h>
-#include <asm/pgalloc.h>
-#include <asm/prom.h>
#include <asm/io.h>
-#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/smp.h>
#include <asm/machdep.h>
@@ -46,13 +37,16 @@
#include <asm/tlb.h>
#include <asm/sections.h>
#include <asm/hugetlb.h>
+#include <asm/kup.h>
+#include <asm/kasan.h>
+#include <asm/fixmap.h>
-#include "mmu_decl.h"
+#include <mm/mmu_decl.h>
#if defined(CONFIG_KERNEL_START_BOOL) || defined(CONFIG_LOWMEM_SIZE_BOOL)
/* The amount of lowmem must be within 0xF0000000 - KERNELBASE. */
#if (CONFIG_LOWMEM_SIZE > (0xF0000000 - PAGE_OFFSET))
-#error "You must adjust CONFIG_LOWMEM_SIZE or CONFIG_START_KERNEL"
+#error "You must adjust CONFIG_LOWMEM_SIZE or CONFIG_KERNEL_START"
#endif
#endif
#define MAX_LOW_MEM CONFIG_LOWMEM_SIZE
@@ -60,12 +54,7 @@
phys_addr_t total_memory;
phys_addr_t total_lowmem;
-phys_addr_t memstart_addr = (phys_addr_t)~0ull;
-EXPORT_SYMBOL(memstart_addr);
-phys_addr_t kernstart_addr;
-EXPORT_SYMBOL(kernstart_addr);
-
-#ifdef CONFIG_RELOCATABLE_PPC32
+#ifdef CONFIG_RELOCATABLE
/* Used in __va()/__pa() */
long long virt_phys_offset;
EXPORT_SYMBOL(virt_phys_offset);
@@ -81,45 +70,10 @@ EXPORT_SYMBOL(agp_special_page);
void MMU_init(void);
-/* XXX should be in current.h -- paulus */
-extern struct task_struct *current_set[NR_CPUS];
-
-/*
- * this tells the system to map all of ram with the segregs
- * (i.e. page tables) instead of the bats.
- * -- Cort
- */
-int __map_without_bats;
-int __map_without_ltlbs;
-
-/*
- * This tells the system to allow ioremapping memory marked as reserved.
- */
-int __allow_ioremap_reserved;
-
/* max amount of low RAM to map in */
unsigned long __max_low_memory = MAX_LOW_MEM;
/*
- * Check for command-line options that affect what MMU_init will do.
- */
-void MMU_setup(void)
-{
- /* Check for nobats option (used in mapin_ram). */
- if (strstr(cmd_line, "nobats")) {
- __map_without_bats = 1;
- }
-
- if (strstr(cmd_line, "noltlbs")) {
- __map_without_ltlbs = 1;
- }
-#ifdef CONFIG_DEBUG_PAGEALLOC
- __map_without_bats = 1;
- __map_without_ltlbs = 1;
-#endif
-}
-
-/*
* MMU_init sets up the basic memory mappings for the kernel,
* including both RAM and possibly some I/O regions,
* and sets up the page tables and the MMU hardware ready to go.
@@ -129,33 +83,15 @@ void __init MMU_init(void)
if (ppc_md.progress)
ppc_md.progress("MMU:enter", 0x111);
- /* parse args from command line */
- MMU_setup();
-
- /*
- * Reserve gigantic pages for hugetlb. This MUST occur before
- * lowmem_end_addr is initialized below.
- */
- reserve_hugetlb_gpages();
-
- if (memblock.memory.cnt > 1) {
-#ifndef CONFIG_WII
- memblock_enforce_memory_limit(memblock.memory.regions[0].size);
- printk(KERN_WARNING "Only using first contiguous memory region");
-#else
- wii_memory_fixups();
-#endif
- }
-
total_lowmem = total_memory = memblock_end_of_DRAM() - memstart_addr;
lowmem_end_addr = memstart_addr + total_lowmem;
-#ifdef CONFIG_FSL_BOOKE
+#ifdef CONFIG_PPC_85xx
/* Freescale Book-E parts expect lowmem to be mapped by fixed TLB
* entries, so we need to adjust lowmem to match the amount we can map
* in the fixed entries */
adjust_total_lowmem();
-#endif /* CONFIG_FSL_BOOKE */
+#endif /* CONFIG_PPC_85xx */
if (total_lowmem > __max_low_memory) {
total_lowmem = __max_low_memory;
@@ -179,10 +115,6 @@ void __init MMU_init(void)
/* Initialize early top-down ioremap allocator */
ioremap_bot = IOREMAP_TOP;
- /* Map in I/O resources */
- if (ppc_md.progress)
- ppc_md.progress("MMU:setio", 0x302);
-
if (ppc_md.progress)
ppc_md.progress("MMU:exit", 0x211);
@@ -191,29 +123,12 @@ void __init MMU_init(void)
btext_unmap();
#endif
- /* Shortly after that, the entire linear mapping will be available */
- memblock_set_current_limit(lowmem_end_addr);
-}
+ kasan_mmu_init();
-/* This is only called until mem_init is done. */
-void __init *early_get_page(void)
-{
- if (init_bootmem_done)
- return alloc_bootmem_pages(PAGE_SIZE);
- else
- return __va(memblock_alloc(PAGE_SIZE, PAGE_SIZE));
-}
+ setup_kup();
-#ifdef CONFIG_8xx /* No 8xx specific .c file to put that in ... */
-void setup_initial_memory_limit(phys_addr_t first_memblock_base,
- phys_addr_t first_memblock_size)
-{
- /* We don't currently support the first MEMBLOCK not mapping 0
- * physical on those processors
- */
- BUG_ON(first_memblock_base != 0);
+ update_mmu_feature_fixups(MMU_FTR_KUAP);
- /* 8xx can only access 8MB at the moment */
- memblock_set_current_limit(min_t(u64, first_memblock_size, 0x00800000));
+ /* Shortly after that, the entire linear mapping will be available */
+ memblock_set_current_limit(lowmem_end_addr);
}
-#endif /* CONFIG_8xx */
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index d0cd9e4c6837..b6f3ae03ca9e 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* PowerPC version
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
@@ -11,12 +12,6 @@
*
* Dave Engebretsen <engebret@us.ibm.com>
* Rework for PPC64 port.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#undef DEBUG
@@ -34,7 +29,6 @@
#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/delay.h>
-#include <linux/bootmem.h>
#include <linux/highmem.h>
#include <linux/idr.h>
#include <linux/nodemask.h>
@@ -43,6 +37,11 @@
#include <linux/memblock.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
+#include <linux/of_fdt.h>
+#include <linux/libfdt.h>
+#include <linux/memremap.h>
+#include <linux/memory.h>
+#include <linux/bootmem_info.h>
#include <asm/pgalloc.h>
#include <asm/page.h>
@@ -50,9 +49,8 @@
#include <asm/rtas.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
-#include <asm/pgtable.h>
#include <asm/mmu.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/smp.h>
#include <asm/machdep.h>
#include <asm/tlb.h>
@@ -63,177 +61,97 @@
#include <asm/sections.h>
#include <asm/iommu.h>
#include <asm/vdso.h>
+#include <asm/hugetlb.h>
-#include "mmu_decl.h"
-
-#ifdef CONFIG_PPC_STD_MMU_64
-#if PGTABLE_RANGE > USER_VSID_RANGE
-#warning Limited user VSID range means pagetable space is wasted
-#endif
-
-#if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE)
-#warning TASK_SIZE is smaller than it needs to be.
-#endif
-#endif /* CONFIG_PPC_STD_MMU_64 */
-
-phys_addr_t memstart_addr = ~0;
-EXPORT_SYMBOL_GPL(memstart_addr);
-phys_addr_t kernstart_addr;
-EXPORT_SYMBOL_GPL(kernstart_addr);
-
-static void pgd_ctor(void *addr)
-{
- memset(addr, 0, PGD_TABLE_SIZE);
-}
-
-static void pmd_ctor(void *addr)
-{
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- memset(addr, 0, PMD_TABLE_SIZE * 2);
-#else
- memset(addr, 0, PMD_TABLE_SIZE);
-#endif
-}
-
-struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE];
-
-/*
- * Create a kmem_cache() for pagetables. This is not used for PTE
- * pages - they're linked to struct page, come from the normal free
- * pages pool and have a different entry size (see real_pte_t) to
- * everything else. Caches created by this function are used for all
- * the higher level pagetables, and for hugepage pagetables.
- */
-void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
-{
- char *name;
- unsigned long table_size = sizeof(void *) << shift;
- unsigned long align = table_size;
-
- /* When batching pgtable pointers for RCU freeing, we store
- * the index size in the low bits. Table alignment must be
- * big enough to fit it.
- *
- * Likewise, hugeapge pagetable pointers contain a (different)
- * shift value in the low bits. All tables must be aligned so
- * as to leave enough 0 bits in the address to contain it. */
- unsigned long minalign = max(MAX_PGTABLE_INDEX_SIZE + 1,
- HUGEPD_SHIFT_MASK + 1);
- struct kmem_cache *new;
-
- /* It would be nice if this was a BUILD_BUG_ON(), but at the
- * moment, gcc doesn't seem to recognize is_power_of_2 as a
- * constant expression, so so much for that. */
- BUG_ON(!is_power_of_2(minalign));
- BUG_ON((shift < 1) || (shift > MAX_PGTABLE_INDEX_SIZE));
-
- if (PGT_CACHE(shift))
- return; /* Already have a cache of this size */
-
- align = max_t(unsigned long, align, minalign);
- name = kasprintf(GFP_KERNEL, "pgtable-2^%d", shift);
- new = kmem_cache_create(name, table_size, align, 0, ctor);
- pgtable_cache[shift - 1] = new;
- pr_debug("Allocated pgtable cache for order %d\n", shift);
-}
-
-
-void pgtable_cache_init(void)
-{
- pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor);
- pgtable_cache_add(PMD_CACHE_INDEX, pmd_ctor);
- if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_CACHE_INDEX))
- panic("Couldn't allocate pgtable caches");
- /* In all current configs, when the PUD index exists it's the
- * same size as either the pgd or pmd index. Verify that the
- * initialization above has also created a PUD cache. This
- * will need re-examiniation if we add new possibilities for
- * the pagetable layout. */
- BUG_ON(PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE));
-}
+#include <mm/mmu_decl.h>
#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
- * Given an address within the vmemmap, determine the pfn of the page that
- * represents the start of the section it is within. Note that we have to
+ * Given an address within the vmemmap, determine the page that
+ * represents the start of the subsection it is within. Note that we have to
* do this by hand as the proffered address may not be correctly aligned.
* Subtraction of non-aligned pointers produces undefined results.
*/
-static unsigned long __meminit vmemmap_section_start(unsigned long page)
+static struct page * __meminit vmemmap_subsection_start(unsigned long vmemmap_addr)
{
- unsigned long offset = page - ((unsigned long)(vmemmap));
+ unsigned long start_pfn;
+ unsigned long offset = vmemmap_addr - ((unsigned long)(vmemmap));
/* Return the pfn of the start of the section. */
- return (offset / sizeof(struct page)) & PAGE_SECTION_MASK;
+ start_pfn = (offset / sizeof(struct page)) & PAGE_SUBSECTION_MASK;
+ return pfn_to_page(start_pfn);
}
/*
- * Check if this vmemmap page is already initialised. If any section
- * which overlaps this vmemmap page is initialised then this page is
- * initialised already.
+ * Since memory is added in sub-section chunks, before creating a new vmemmap
+ * mapping, the kernel should check whether there is an existing memmap mapping
+ * covering the new subsection added. This is needed because kernel can map
+ * vmemmap area using 16MB pages which will cover a memory range of 16G. Such
+ * a range covers multiple subsections (2M)
+ *
+ * If any subsection in the 16G range mapped by vmemmap is valid we consider the
+ * vmemmap populated (There is a page table entry already present). We can't do
+ * a page table lookup here because with the hash translation we don't keep
+ * vmemmap details in linux page table.
*/
-static int __meminit vmemmap_populated(unsigned long start, int page_size)
+int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size)
{
- unsigned long end = start + page_size;
-
- for (; start < end; start += (PAGES_PER_SECTION * sizeof(struct page)))
- if (pfn_valid(vmemmap_section_start(start)))
+ struct page *start;
+ unsigned long vmemmap_end = vmemmap_addr + vmemmap_map_size;
+ start = vmemmap_subsection_start(vmemmap_addr);
+
+ for (; (unsigned long)start < vmemmap_end; start += PAGES_PER_SUBSECTION)
+ /*
+ * pfn valid check here is intended to really check
+ * whether we have any subsection already initialized
+ * in this range.
+ */
+ if (pfn_valid(page_to_pfn(start)))
return 1;
return 0;
}
-/* On hash-based CPUs, the vmemmap is bolted in the hash table.
- *
- * On Book3E CPUs, the vmemmap is currently mapped in the top half of
- * the vmalloc space using normal page tables, though the size of
- * pages encoded in the PTEs can be different
+/*
+ * vmemmap virtual address space management does not have a traditional page
+ * table to track which virtual struct pages are backed by physical mapping.
+ * The virtual to physical mappings are tracked in a simple linked list
+ * format. 'vmemmap_list' maintains the entire vmemmap physical mapping at
+ * all times where as the 'next' list maintains the available
+ * vmemmap_backing structures which have been deleted from the
+ * 'vmemmap_global' list during system runtime (memory hotplug remove
+ * operation). The freed 'vmemmap_backing' structures are reused later when
+ * new requests come in without allocating fresh memory. This pointer also
+ * tracks the allocated 'vmemmap_backing' structures as we allocate one
+ * full page memory at a time when we dont have any.
*/
-
-#ifdef CONFIG_PPC_BOOK3E
-static void __meminit vmemmap_create_mapping(unsigned long start,
- unsigned long page_size,
- unsigned long phys)
-{
- /* Create a PTE encoding without page size */
- unsigned long i, flags = _PAGE_PRESENT | _PAGE_ACCESSED |
- _PAGE_KERNEL_RW;
-
- /* PTEs only contain page size encodings up to 32M */
- BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf);
-
- /* Encode the size in the PTE */
- flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8;
-
- /* For each PTE for that area, map things. Note that we don't
- * increment phys because all PTEs are of the large size and
- * thus must have the low bits clear
- */
- for (i = 0; i < page_size; i += PAGE_SIZE)
- BUG_ON(map_kernel_page(start + i, phys, flags));
-}
-#else /* CONFIG_PPC_BOOK3E */
-static void __meminit vmemmap_create_mapping(unsigned long start,
- unsigned long page_size,
- unsigned long phys)
-{
- int mapped = htab_bolt_mapping(start, start + page_size, phys,
- pgprot_val(PAGE_KERNEL),
- mmu_vmemmap_psize,
- mmu_kernel_ssize);
- BUG_ON(mapped < 0);
-}
-#endif /* CONFIG_PPC_BOOK3E */
-
struct vmemmap_backing *vmemmap_list;
+static struct vmemmap_backing *next;
+
+/*
+ * The same pointer 'next' tracks individual chunks inside the allocated
+ * full page during the boot time and again tracks the freed nodes during
+ * runtime. It is racy but it does not happen as they are separated by the
+ * boot process. Will create problem if some how we have memory hotplug
+ * operation during boot !!
+ */
+static int num_left;
+static int num_freed;
static __meminit struct vmemmap_backing * vmemmap_list_alloc(int node)
{
- static struct vmemmap_backing *next;
- static int num_left;
+ struct vmemmap_backing *vmem_back;
+ /* get from freed entries first */
+ if (num_freed) {
+ num_freed--;
+ vmem_back = next;
+ next = next->list;
+
+ return vmem_back;
+ }
/* allocate a page when required and hand out chunks */
- if (!next || !num_left) {
+ if (!num_left) {
next = vmemmap_alloc_block(PAGE_SIZE, node);
if (unlikely(!next)) {
WARN_ON(1);
@@ -247,16 +165,16 @@ static __meminit struct vmemmap_backing * vmemmap_list_alloc(int node)
return next++;
}
-static __meminit void vmemmap_list_populate(unsigned long phys,
- unsigned long start,
- int node)
+static __meminit int vmemmap_list_populate(unsigned long phys,
+ unsigned long start,
+ int node)
{
struct vmemmap_backing *vmem_back;
vmem_back = vmemmap_list_alloc(node);
if (unlikely(!vmem_back)) {
- WARN_ON(1);
- return;
+ pr_debug("vmemap list allocation failed\n");
+ return -ENOMEM;
}
vmem_back->phys = phys;
@@ -264,41 +182,499 @@ static __meminit void vmemmap_list_populate(unsigned long phys,
vmem_back->list = vmemmap_list;
vmemmap_list = vmem_back;
+ return 0;
}
-int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
+bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start,
+ unsigned long page_size)
{
+ unsigned long nr_pfn = page_size / sizeof(struct page);
+ unsigned long start_pfn = page_to_pfn((struct page *)start);
+
+ if ((start_pfn + nr_pfn - 1) > altmap->end_pfn)
+ return true;
+
+ if (start_pfn < altmap->base_pfn)
+ return true;
+
+ return false;
+}
+
+static int __meminit __vmemmap_populate(unsigned long start, unsigned long end, int node,
+ struct vmem_altmap *altmap)
+{
+ bool altmap_alloc;
unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
/* Align to the page size of the linear mapping. */
- start = _ALIGN_DOWN(start, page_size);
+ start = ALIGN_DOWN(start, page_size);
pr_debug("vmemmap_populate %lx..%lx, node %d\n", start, end, node);
for (; start < end; start += page_size) {
- void *p;
-
+ void *p = NULL;
+ int rc;
+
+ /*
+ * This vmemmap range is backing different subsections. If any
+ * of that subsection is marked valid, that means we already
+ * have initialized a page table covering this range and hence
+ * the vmemmap range is populated.
+ */
if (vmemmap_populated(start, page_size))
continue;
- p = vmemmap_alloc_block(page_size, node);
+ /*
+ * Allocate from the altmap first if we have one. This may
+ * fail due to alignment issues when using 16MB hugepages, so
+ * fall back to system memory if the altmap allocation fail.
+ */
+ if (altmap && !altmap_cross_boundary(altmap, start, page_size)) {
+ p = vmemmap_alloc_block_buf(page_size, node, altmap);
+ if (!p)
+ pr_debug("altmap block allocation failed, falling back to system memory");
+ else
+ altmap_alloc = true;
+ }
+ if (!p) {
+ p = vmemmap_alloc_block_buf(page_size, node, NULL);
+ altmap_alloc = false;
+ }
if (!p)
return -ENOMEM;
- vmemmap_list_populate(__pa(p), start, node);
+ if (vmemmap_list_populate(__pa(p), start, node)) {
+ /*
+ * If we don't populate vmemap list, we don't have
+ * the ability to free the allocated vmemmap
+ * pages in section_deactivate. Hence free them
+ * here.
+ */
+ int nr_pfns = page_size >> PAGE_SHIFT;
+ unsigned long page_order = get_order(page_size);
+
+ if (altmap_alloc)
+ vmem_altmap_free(altmap, nr_pfns);
+ else
+ free_pages((unsigned long)p, page_order);
+ return -ENOMEM;
+ }
pr_debug(" * %016lx..%016lx allocated at %p\n",
start, start + page_size, p);
- vmemmap_create_mapping(start, page_size, __pa(p));
+ rc = vmemmap_create_mapping(start, page_size, __pa(p));
+ if (rc < 0) {
+ pr_warn("%s: Unable to create vmemmap mapping: %d\n",
+ __func__, rc);
+ return -EFAULT;
+ }
}
return 0;
}
-void vmemmap_free(unsigned long start, unsigned long end)
+int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
+ struct vmem_altmap *altmap)
+{
+
+#ifdef CONFIG_PPC_BOOK3S_64
+ if (radix_enabled())
+ return radix__vmemmap_populate(start, end, node, altmap);
+#endif
+
+ return __vmemmap_populate(start, end, node, altmap);
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static unsigned long vmemmap_list_free(unsigned long start)
+{
+ struct vmemmap_backing *vmem_back, *vmem_back_prev;
+
+ vmem_back_prev = vmem_back = vmemmap_list;
+
+ /* look for it with prev pointer recorded */
+ for (; vmem_back; vmem_back = vmem_back->list) {
+ if (vmem_back->virt_addr == start)
+ break;
+ vmem_back_prev = vmem_back;
+ }
+
+ if (unlikely(!vmem_back))
+ return 0;
+
+ /* remove it from vmemmap_list */
+ if (vmem_back == vmemmap_list) /* remove head */
+ vmemmap_list = vmem_back->list;
+ else
+ vmem_back_prev->list = vmem_back->list;
+
+ /* next point to this freed entry */
+ vmem_back->list = next;
+ next = vmem_back;
+ num_freed++;
+
+ return vmem_back->phys;
+}
+
+static void __ref __vmemmap_free(unsigned long start, unsigned long end,
+ struct vmem_altmap *altmap)
+{
+ unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
+ unsigned long page_order = get_order(page_size);
+ unsigned long alt_start = ~0, alt_end = ~0;
+ unsigned long base_pfn;
+
+ start = ALIGN_DOWN(start, page_size);
+ if (altmap) {
+ alt_start = altmap->base_pfn;
+ alt_end = altmap->base_pfn + altmap->reserve + altmap->free;
+ }
+
+ pr_debug("vmemmap_free %lx...%lx\n", start, end);
+
+ for (; start < end; start += page_size) {
+ unsigned long nr_pages, addr;
+ struct page *page;
+
+ /*
+ * We have already marked the subsection we are trying to remove
+ * invalid. So if we want to remove the vmemmap range, we
+ * need to make sure there is no subsection marked valid
+ * in this range.
+ */
+ if (vmemmap_populated(start, page_size))
+ continue;
+
+ addr = vmemmap_list_free(start);
+ if (!addr)
+ continue;
+
+ page = pfn_to_page(addr >> PAGE_SHIFT);
+ nr_pages = 1 << page_order;
+ base_pfn = PHYS_PFN(addr);
+
+ if (base_pfn >= alt_start && base_pfn < alt_end) {
+ vmem_altmap_free(altmap, nr_pages);
+ } else if (PageReserved(page)) {
+ /* allocated from bootmem */
+ if (page_size < PAGE_SIZE) {
+ /*
+ * this shouldn't happen, but if it is
+ * the case, leave the memory there
+ */
+ WARN_ON_ONCE(1);
+ } else {
+ while (nr_pages--)
+ free_reserved_page(page++);
+ }
+ } else {
+ free_pages((unsigned long)(__va(addr)), page_order);
+ }
+
+ vmemmap_remove_mapping(start, page_size);
+ }
+}
+
+void __ref vmemmap_free(unsigned long start, unsigned long end,
+ struct vmem_altmap *altmap)
{
+#ifdef CONFIG_PPC_BOOK3S_64
+ if (radix_enabled())
+ return radix__vmemmap_free(start, end, altmap);
+#endif
+ return __vmemmap_free(start, end, altmap);
}
+#endif
+
+#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
+void register_page_bootmem_memmap(unsigned long section_nr,
+ struct page *start_page, unsigned long size)
+{
+}
+#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
+
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+#ifdef CONFIG_PPC_BOOK3S_64
+unsigned int mmu_lpid_bits;
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+EXPORT_SYMBOL_GPL(mmu_lpid_bits);
+#endif
+unsigned int mmu_pid_bits;
+
+static bool disable_radix = !IS_ENABLED(CONFIG_PPC_RADIX_MMU_DEFAULT);
+
+static int __init parse_disable_radix(char *p)
+{
+ bool val;
+
+ if (!p)
+ val = true;
+ else if (kstrtobool(p, &val))
+ return -EINVAL;
+
+ disable_radix = val;
+
+ return 0;
+}
+early_param("disable_radix", parse_disable_radix);
+
+/*
+ * If we're running under a hypervisor, we need to check the contents of
+ * /chosen/ibm,architecture-vec-5 to see if the hypervisor is willing to do
+ * radix. If not, we clear the radix feature bit so we fall back to hash.
+ */
+static void __init early_check_vec5(void)
+{
+ unsigned long root, chosen;
+ int size;
+ const u8 *vec5;
+ u8 mmu_supported;
+
+ root = of_get_flat_dt_root();
+ chosen = of_get_flat_dt_subnode_by_name(root, "chosen");
+ if (chosen == -FDT_ERR_NOTFOUND) {
+ cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
+ return;
+ }
+ vec5 = of_get_flat_dt_prop(chosen, "ibm,architecture-vec-5", &size);
+ if (!vec5) {
+ cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
+ return;
+ }
+ if (size <= OV5_INDX(OV5_MMU_SUPPORT)) {
+ cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
+ return;
+ }
+
+ /* Check for supported configuration */
+ mmu_supported = vec5[OV5_INDX(OV5_MMU_SUPPORT)] &
+ OV5_FEAT(OV5_MMU_SUPPORT);
+ if (mmu_supported == OV5_FEAT(OV5_MMU_RADIX)) {
+ /* Hypervisor only supports radix - check enabled && GTSE */
+ if (!early_radix_enabled()) {
+ pr_warn("WARNING: Ignoring cmdline option disable_radix\n");
+ }
+ if (!(vec5[OV5_INDX(OV5_RADIX_GTSE)] &
+ OV5_FEAT(OV5_RADIX_GTSE))) {
+ cur_cpu_spec->mmu_features &= ~MMU_FTR_GTSE;
+ } else
+ cur_cpu_spec->mmu_features |= MMU_FTR_GTSE;
+ /* Do radix anyway - the hypervisor said we had to */
+ cur_cpu_spec->mmu_features |= MMU_FTR_TYPE_RADIX;
+ } else if (mmu_supported == OV5_FEAT(OV5_MMU_HASH)) {
+ /* Hypervisor only supports hash - disable radix */
+ cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
+ cur_cpu_spec->mmu_features &= ~MMU_FTR_GTSE;
+ }
+}
+
+static int __init dt_scan_mmu_pid_width(unsigned long node,
+ const char *uname, int depth,
+ void *data)
+{
+ int size = 0;
+ const __be32 *prop;
+ const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+
+ /* We are scanning "cpu" nodes only */
+ if (type == NULL || strcmp(type, "cpu") != 0)
+ return 0;
+
+ /* Find MMU LPID, PID register size */
+ prop = of_get_flat_dt_prop(node, "ibm,mmu-lpid-bits", &size);
+ if (prop && size == 4)
+ mmu_lpid_bits = be32_to_cpup(prop);
+
+ prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size);
+ if (prop && size == 4)
+ mmu_pid_bits = be32_to_cpup(prop);
+
+ if (!mmu_pid_bits && !mmu_lpid_bits)
+ return 0;
+
+ return 1;
+}
+
+/*
+ * Outside hotplug the kernel uses this value to map the kernel direct map
+ * with radix. To be compatible with older kernels, let's keep this value
+ * as 16M which is also SECTION_SIZE with SPARSEMEM. We can ideally map
+ * things with 1GB size in the case where we don't support hotplug.
+ */
+#ifndef CONFIG_MEMORY_HOTPLUG
+#define DEFAULT_MEMORY_BLOCK_SIZE SZ_16M
+#else
+#define DEFAULT_MEMORY_BLOCK_SIZE MIN_MEMORY_BLOCK_SIZE
+#endif
+
+static void update_memory_block_size(unsigned long *block_size, unsigned long mem_size)
+{
+ unsigned long min_memory_block_size = DEFAULT_MEMORY_BLOCK_SIZE;
+
+ for (; *block_size > min_memory_block_size; *block_size >>= 2) {
+ if ((mem_size & *block_size) == 0)
+ break;
+ }
+}
+
+static int __init probe_memory_block_size(unsigned long node, const char *uname, int
+ depth, void *data)
+{
+ const char *type;
+ unsigned long *block_size = (unsigned long *)data;
+ const __be32 *reg, *endp;
+ int l;
+
+ if (depth != 1)
+ return 0;
+ /*
+ * If we have dynamic-reconfiguration-memory node, use the
+ * lmb value.
+ */
+ if (strcmp(uname, "ibm,dynamic-reconfiguration-memory") == 0) {
+
+ const __be32 *prop;
+
+ prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &l);
+
+ if (!prop || l < dt_root_size_cells * sizeof(__be32))
+ /*
+ * Nothing in the device tree
+ */
+ *block_size = DEFAULT_MEMORY_BLOCK_SIZE;
+ else
+ *block_size = of_read_number(prop, dt_root_size_cells);
+ /*
+ * We have found the final value. Don't probe further.
+ */
+ return 1;
+ }
+ /*
+ * Find all the device tree nodes of memory type and make sure
+ * the area can be mapped using the memory block size value
+ * we end up using. We start with 1G value and keep reducing
+ * it such that we can map the entire area using memory_block_size.
+ * This will be used on powernv and older pseries that don't
+ * have ibm,lmb-size node.
+ * For ex: with P5 we can end up with
+ * memory@0 -> 128MB
+ * memory@128M -> 64M
+ * This will end up using 64MB memory block size value.
+ */
+ type = of_get_flat_dt_prop(node, "device_type", NULL);
+ if (type == NULL || strcmp(type, "memory") != 0)
+ return 0;
+
+ reg = of_get_flat_dt_prop(node, "linux,usable-memory", &l);
+ if (!reg)
+ reg = of_get_flat_dt_prop(node, "reg", &l);
+ if (!reg)
+ return 0;
+
+ endp = reg + (l / sizeof(__be32));
+ while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) {
+ const char *compatible;
+ u64 size;
+
+ dt_mem_next_cell(dt_root_addr_cells, &reg);
+ size = dt_mem_next_cell(dt_root_size_cells, &reg);
+
+ if (size) {
+ update_memory_block_size(block_size, size);
+ continue;
+ }
+ /*
+ * ibm,coherent-device-memory with linux,usable-memory = 0
+ * Force 256MiB block size. Work around for GPUs on P9 PowerNV
+ * linux,usable-memory == 0 implies driver managed memory and
+ * we can't use large memory block size due to hotplug/unplug
+ * limitations.
+ */
+ compatible = of_get_flat_dt_prop(node, "compatible", NULL);
+ if (compatible && !strcmp(compatible, "ibm,coherent-device-memory")) {
+ if (*block_size > SZ_256M)
+ *block_size = SZ_256M;
+ /*
+ * We keep 256M as the upper limit with GPU present.
+ */
+ return 0;
+ }
+ }
+ /* continue looking for other memory device types */
+ return 0;
+}
+
+/*
+ * start with 1G memory block size. Early init will
+ * fix this with correct value.
+ */
+unsigned long memory_block_size __ro_after_init = 1UL << 30;
+static void __init early_init_memory_block_size(void)
+{
+ /*
+ * We need to do memory_block_size probe early so that
+ * radix__early_init_mmu() can use this as limit for
+ * mapping page size.
+ */
+ of_scan_flat_dt(probe_memory_block_size, &memory_block_size);
+}
+
+void __init mmu_early_init_devtree(void)
+{
+ bool hvmode = !!(mfmsr() & MSR_HV);
+
+ /* Disable radix mode based on kernel command line. */
+ if (disable_radix) {
+ if (IS_ENABLED(CONFIG_PPC_64S_HASH_MMU))
+ cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
+ else
+ pr_warn("WARNING: Ignoring cmdline option disable_radix\n");
+ }
+
+ of_scan_flat_dt(dt_scan_mmu_pid_width, NULL);
+ if (hvmode && !mmu_lpid_bits) {
+ if (early_cpu_has_feature(CPU_FTR_ARCH_207S))
+ mmu_lpid_bits = 12; /* POWER8-10 */
+ else
+ mmu_lpid_bits = 10; /* POWER7 */
+ }
+ if (!mmu_pid_bits) {
+ if (early_cpu_has_feature(CPU_FTR_ARCH_300))
+ mmu_pid_bits = 20; /* POWER9-10 */
+ }
+
+ /*
+ * Check /chosen/ibm,architecture-vec-5 if running as a guest.
+ * When running bare-metal, we can use radix if we like
+ * even though the ibm,architecture-vec-5 property created by
+ * skiboot doesn't have the necessary bits set.
+ */
+ if (!hvmode)
+ early_check_vec5();
+
+ early_init_memory_block_size();
+
+ if (early_radix_enabled()) {
+ radix__early_init_devtree();
+
+ /*
+ * We have finalized the translation we are going to use by now.
+ * Radix mode is not limited by RMA / VRMA addressing.
+ * Hence don't limit memblock allocations.
+ */
+ ppc64_rma_size = ULONG_MAX;
+ memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
+ } else
+ hash__early_init_devtree();
+
+ if (IS_ENABLED(CONFIG_HUGETLB_PAGE_SIZE_VARIABLE))
+ hugetlbpage_init_defaultsize();
+
+ if (!(cur_cpu_spec->mmu_features & MMU_FTR_HPTE_TABLE) &&
+ !(cur_cpu_spec->mmu_features & MMU_FTR_TYPE_RADIX))
+ panic("kernel does not support any MMU type offered by platform");
+}
+#endif /* CONFIG_PPC_BOOK3S_64 */
diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c
new file mode 100644
index 000000000000..4b4feba9873b
--- /dev/null
+++ b/arch/powerpc/mm/ioremap.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/mmzone.h>
+#include <linux/vmalloc.h>
+
+unsigned long ioremap_bot;
+EXPORT_SYMBOL(ioremap_bot);
+
+void __iomem *ioremap(phys_addr_t addr, unsigned long size)
+{
+ pgprot_t prot = pgprot_noncached(PAGE_KERNEL);
+ void *caller = __builtin_return_address(0);
+
+ return __ioremap_caller(addr, size, prot, caller);
+}
+EXPORT_SYMBOL(ioremap);
+
+void __iomem *ioremap_wc(phys_addr_t addr, unsigned long size)
+{
+ pgprot_t prot = pgprot_noncached_wc(PAGE_KERNEL);
+ void *caller = __builtin_return_address(0);
+
+ return __ioremap_caller(addr, size, prot, caller);
+}
+EXPORT_SYMBOL(ioremap_wc);
+
+void __iomem *ioremap_coherent(phys_addr_t addr, unsigned long size)
+{
+ pgprot_t prot = pgprot_cached(PAGE_KERNEL);
+ void *caller = __builtin_return_address(0);
+
+ return __ioremap_caller(addr, size, prot, caller);
+}
+
+void __iomem *ioremap_prot(phys_addr_t addr, size_t size, pgprot_t prot)
+{
+ pte_t pte = __pte(pgprot_val(prot));
+ void *caller = __builtin_return_address(0);
+
+ /* writeable implies dirty for kernel addresses */
+ if (pte_write(pte))
+ pte = pte_mkdirty(pte);
+
+ return __ioremap_caller(addr, size, pte_pgprot(pte), caller);
+}
+EXPORT_SYMBOL(ioremap_prot);
+
+int early_ioremap_range(unsigned long ea, phys_addr_t pa,
+ unsigned long size, pgprot_t prot)
+{
+ unsigned long i;
+
+ for (i = 0; i < size; i += PAGE_SIZE) {
+ int err = map_kernel_page(ea + i, pa + i, pgprot_nx(prot));
+
+ if (WARN_ON_ONCE(err)) /* Should clean up */
+ return err;
+ }
+
+ return 0;
+}
diff --git a/arch/powerpc/mm/ioremap_32.c b/arch/powerpc/mm/ioremap_32.c
new file mode 100644
index 000000000000..ca5bc6be3e6f
--- /dev/null
+++ b/arch/powerpc/mm/ioremap_32.c
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include <mm/mmu_decl.h>
+
+void __iomem *ioremap_wt(phys_addr_t addr, unsigned long size)
+{
+ pgprot_t prot = pgprot_cached_wthru(PAGE_KERNEL);
+
+ return __ioremap_caller(addr, size, prot, __builtin_return_address(0));
+}
+EXPORT_SYMBOL(ioremap_wt);
+
+void __iomem *
+__ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *caller)
+{
+ unsigned long v;
+ phys_addr_t p, offset;
+ int err;
+
+ /*
+ * If the address lies within the first 16 MB, assume it's in ISA
+ * memory space
+ */
+ if (addr < SZ_16M)
+ addr += _ISA_MEM_BASE;
+
+ /*
+ * Choose an address to map it to.
+ * Once the vmalloc system is running, we use it.
+ * Before then, we use space going down from IOREMAP_TOP
+ * (ioremap_bot records where we're up to).
+ */
+ p = addr & PAGE_MASK;
+ offset = addr & ~PAGE_MASK;
+ size = PAGE_ALIGN(addr + size) - p;
+
+#ifndef CONFIG_CRASH_DUMP
+ /*
+ * Don't allow anybody to remap normal RAM that we're using.
+ * mem_init() sets high_memory so only do the check after that.
+ */
+ if (slab_is_available() && p <= virt_to_phys(high_memory - 1) &&
+ page_is_ram(__phys_to_pfn(p))) {
+ pr_warn("%s(): phys addr 0x%llx is RAM lr %ps\n", __func__,
+ (unsigned long long)p, __builtin_return_address(0));
+ return NULL;
+ }
+#endif
+
+ if (size == 0)
+ return NULL;
+
+ /*
+ * Is it already mapped? Perhaps overlapped by a previous
+ * mapping.
+ */
+ v = p_block_mapped(p);
+ if (v)
+ return (void __iomem *)v + offset;
+
+ if (slab_is_available())
+ return generic_ioremap_prot(addr, size, prot);
+
+ /*
+ * Should check if it is a candidate for a BAT mapping
+ */
+ pr_warn("ioremap() called early from %pS. Use early_ioremap() instead\n", caller);
+
+ err = early_ioremap_range(ioremap_bot - size - PAGE_SIZE, p, size, prot);
+ if (err)
+ return NULL;
+ ioremap_bot -= size + PAGE_SIZE;
+
+ return (void __iomem *)ioremap_bot + offset;
+}
+
+void iounmap(volatile void __iomem *addr)
+{
+ /*
+ * If mapped by BATs then there is nothing to do.
+ * Calling vfree() generates a benign warning.
+ */
+ if (v_block_mapped((unsigned long)addr))
+ return;
+
+ generic_iounmap(addr);
+}
+EXPORT_SYMBOL(iounmap);
diff --git a/arch/powerpc/mm/ioremap_64.c b/arch/powerpc/mm/ioremap_64.c
new file mode 100644
index 000000000000..fb8b55bd2cd5
--- /dev/null
+++ b/arch/powerpc/mm/ioremap_64.c
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size,
+ pgprot_t prot, void *caller)
+{
+ phys_addr_t paligned, offset;
+ void __iomem *ret;
+ int err;
+
+ /* We don't support the 4K PFN hack with ioremap */
+ if (pgprot_val(prot) & H_PAGE_4K_PFN)
+ return NULL;
+
+ /*
+ * Choose an address to map it to. Once the vmalloc system is running,
+ * we use it. Before that, we map using addresses going up from
+ * ioremap_bot. vmalloc will use the addresses from IOREMAP_BASE
+ * through ioremap_bot.
+ */
+ paligned = addr & PAGE_MASK;
+ offset = addr & ~PAGE_MASK;
+ size = PAGE_ALIGN(addr + size) - paligned;
+
+ if (size == 0 || paligned == 0)
+ return NULL;
+
+ if (slab_is_available())
+ return generic_ioremap_prot(addr, size, prot);
+
+ pr_warn("ioremap() called early from %pS. Use early_ioremap() instead\n", caller);
+
+ err = early_ioremap_range(ioremap_bot, paligned, size, prot);
+ if (err)
+ return NULL;
+
+ ret = (void __iomem *)ioremap_bot + offset;
+ ioremap_bot += size + PAGE_SIZE;
+
+ return ret;
+}
+
+/*
+ * Unmap an IO region and remove it from vmalloc'd list.
+ * Access to IO memory should be serialized by driver.
+ */
+void iounmap(volatile void __iomem *token)
+{
+ if (!slab_is_available())
+ return;
+
+ generic_iounmap(token);
+}
+EXPORT_SYMBOL(iounmap);
diff --git a/arch/powerpc/mm/kasan/8xx.c b/arch/powerpc/mm/kasan/8xx.c
new file mode 100644
index 000000000000..989d6cdf4141
--- /dev/null
+++ b/arch/powerpc/mm/kasan/8xx.c
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define DISABLE_BRANCH_PROFILING
+
+#include <linux/kasan.h>
+#include <linux/memblock.h>
+#include <linux/hugetlb.h>
+
+#include <asm/pgalloc.h>
+
+static int __init
+kasan_init_shadow_8M(unsigned long k_start, unsigned long k_end, void *block)
+{
+ pmd_t *pmd = pmd_off_k(k_start);
+ unsigned long k_cur, k_next;
+
+ for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pmd++, block += SZ_4M) {
+ pte_t *ptep;
+ int i;
+
+ k_next = pgd_addr_end(k_cur, k_end);
+ if ((void *)pmd_page_vaddr(*pmd) != kasan_early_shadow_pte)
+ continue;
+
+ ptep = memblock_alloc(PTE_FRAG_SIZE, PTE_FRAG_SIZE);
+ if (!ptep)
+ return -ENOMEM;
+
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ pte_t pte = pte_mkhuge(pfn_pte(PHYS_PFN(__pa(block + i * PAGE_SIZE)), PAGE_KERNEL));
+
+ __set_pte_at(&init_mm, k_cur, ptep + i, pte, 1);
+ }
+ pmd_populate_kernel(&init_mm, pmd, ptep);
+ *pmd = __pmd(pmd_val(*pmd) | _PMD_PAGE_8M);
+ }
+ return 0;
+}
+
+int __init kasan_init_region(void *start, size_t size)
+{
+ unsigned long k_start = (unsigned long)kasan_mem_to_shadow(start);
+ unsigned long k_end = (unsigned long)kasan_mem_to_shadow(start + size);
+ unsigned long k_cur;
+ int ret;
+ void *block;
+
+ block = memblock_alloc(k_end - k_start, SZ_8M);
+ if (!block)
+ return -ENOMEM;
+
+ if (IS_ALIGNED(k_start, SZ_8M)) {
+ kasan_init_shadow_8M(k_start, ALIGN_DOWN(k_end, SZ_8M), block);
+ k_cur = ALIGN_DOWN(k_end, SZ_8M);
+ if (k_cur == k_end)
+ goto finish;
+ } else {
+ k_cur = k_start;
+ }
+
+ ret = kasan_init_shadow_page_tables(k_start, k_end);
+ if (ret)
+ return ret;
+
+ for (; k_cur < k_end; k_cur += PAGE_SIZE) {
+ pmd_t *pmd = pmd_off_k(k_cur);
+ void *va = block + k_cur - k_start;
+ pte_t pte = pfn_pte(PHYS_PFN(__pa(va)), PAGE_KERNEL);
+
+ if (k_cur < ALIGN_DOWN(k_end, SZ_512K))
+ pte = pte_mkhuge(pte);
+
+ __set_pte_at(&init_mm, k_cur, pte_offset_kernel(pmd, k_cur), pte, 0);
+ }
+finish:
+ flush_tlb_kernel_range(k_start, k_end);
+ return 0;
+}
diff --git a/arch/powerpc/mm/kasan/Makefile b/arch/powerpc/mm/kasan/Makefile
new file mode 100644
index 000000000000..f9522fd70b2f
--- /dev/null
+++ b/arch/powerpc/mm/kasan/Makefile
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0
+
+KASAN_SANITIZE := n
+KCOV_INSTRUMENT := n
+
+obj-$(CONFIG_PPC32) += init_32.o
+obj-$(CONFIG_PPC_8xx) += 8xx.o
+obj-$(CONFIG_PPC_BOOK3S_32) += book3s_32.o
+obj-$(CONFIG_PPC_BOOK3S_64) += init_book3s_64.o
+obj-$(CONFIG_PPC_BOOK3E_64) += init_book3e_64.o
diff --git a/arch/powerpc/mm/kasan/book3s_32.c b/arch/powerpc/mm/kasan/book3s_32.c
new file mode 100644
index 000000000000..450a67ef0bbe
--- /dev/null
+++ b/arch/powerpc/mm/kasan/book3s_32.c
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define DISABLE_BRANCH_PROFILING
+
+#include <linux/kasan.h>
+#include <linux/memblock.h>
+#include <mm/mmu_decl.h>
+
+int __init kasan_init_region(void *start, size_t size)
+{
+ unsigned long k_start = (unsigned long)kasan_mem_to_shadow(start);
+ unsigned long k_end = (unsigned long)kasan_mem_to_shadow(start + size);
+ unsigned long k_nobat = k_start;
+ unsigned long k_cur;
+ phys_addr_t phys;
+ int ret;
+
+ while (k_nobat < k_end) {
+ unsigned int k_size = bat_block_size(k_nobat, k_end);
+ int idx = find_free_bat();
+
+ if (idx == -1)
+ break;
+ if (k_size < SZ_128K)
+ break;
+ phys = memblock_phys_alloc_range(k_size, k_size, 0,
+ MEMBLOCK_ALLOC_ANYWHERE);
+ if (!phys)
+ break;
+
+ setbat(idx, k_nobat, phys, k_size, PAGE_KERNEL);
+ k_nobat += k_size;
+ }
+ if (k_nobat != k_start)
+ update_bats();
+
+ if (k_nobat < k_end) {
+ phys = memblock_phys_alloc_range(k_end - k_nobat, PAGE_SIZE, 0,
+ MEMBLOCK_ALLOC_ANYWHERE);
+ if (!phys)
+ return -ENOMEM;
+ }
+
+ ret = kasan_init_shadow_page_tables(k_start, k_end);
+ if (ret)
+ return ret;
+
+ kasan_update_early_region(k_start, k_nobat, __pte(0));
+
+ for (k_cur = k_nobat; k_cur < k_end; k_cur += PAGE_SIZE) {
+ pmd_t *pmd = pmd_off_k(k_cur);
+ pte_t pte = pfn_pte(PHYS_PFN(phys + k_cur - k_nobat), PAGE_KERNEL);
+
+ __set_pte_at(&init_mm, k_cur, pte_offset_kernel(pmd, k_cur), pte, 0);
+ }
+ flush_tlb_kernel_range(k_start, k_end);
+ memset(kasan_mem_to_shadow(start), 0, k_end - k_start);
+
+ return 0;
+}
diff --git a/arch/powerpc/mm/kasan/init_32.c b/arch/powerpc/mm/kasan/init_32.c
new file mode 100644
index 000000000000..1d083597464f
--- /dev/null
+++ b/arch/powerpc/mm/kasan/init_32.c
@@ -0,0 +1,192 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define DISABLE_BRANCH_PROFILING
+
+#include <linux/kasan.h>
+#include <linux/printk.h>
+#include <linux/memblock.h>
+#include <linux/sched/task.h>
+#include <asm/pgalloc.h>
+#include <asm/text-patching.h>
+#include <mm/mmu_decl.h>
+
+static pgprot_t __init kasan_prot_ro(void)
+{
+ if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE))
+ return PAGE_READONLY;
+
+ return PAGE_KERNEL_RO;
+}
+
+static void __init kasan_populate_pte(pte_t *ptep, pgprot_t prot)
+{
+ unsigned long va = (unsigned long)kasan_early_shadow_page;
+ phys_addr_t pa = __pa(kasan_early_shadow_page);
+ int i;
+
+ for (i = 0; i < PTRS_PER_PTE; i++, ptep++)
+ __set_pte_at(&init_mm, va, ptep, pfn_pte(PHYS_PFN(pa), prot), 1);
+}
+
+int __init kasan_init_shadow_page_tables(unsigned long k_start, unsigned long k_end)
+{
+ pmd_t *pmd;
+ unsigned long k_cur, k_next;
+
+ pmd = pmd_off_k(k_start);
+
+ for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pmd++) {
+ pte_t *new;
+
+ k_next = pgd_addr_end(k_cur, k_end);
+ if ((void *)pmd_page_vaddr(*pmd) != kasan_early_shadow_pte)
+ continue;
+
+ new = memblock_alloc(PTE_FRAG_SIZE, PTE_FRAG_SIZE);
+
+ if (!new)
+ return -ENOMEM;
+ kasan_populate_pte(new, PAGE_KERNEL);
+ pmd_populate_kernel(&init_mm, pmd, new);
+ }
+ return 0;
+}
+
+int __init __weak kasan_init_region(void *start, size_t size)
+{
+ unsigned long k_start = (unsigned long)kasan_mem_to_shadow(start);
+ unsigned long k_end = (unsigned long)kasan_mem_to_shadow(start + size);
+ unsigned long k_cur;
+ int ret;
+ void *block;
+
+ ret = kasan_init_shadow_page_tables(k_start, k_end);
+ if (ret)
+ return ret;
+
+ k_start = k_start & PAGE_MASK;
+ block = memblock_alloc(k_end - k_start, PAGE_SIZE);
+ if (!block)
+ return -ENOMEM;
+
+ for (k_cur = k_start & PAGE_MASK; k_cur < k_end; k_cur += PAGE_SIZE) {
+ pmd_t *pmd = pmd_off_k(k_cur);
+ void *va = block + k_cur - k_start;
+ pte_t pte = pfn_pte(PHYS_PFN(__pa(va)), PAGE_KERNEL);
+
+ __set_pte_at(&init_mm, k_cur, pte_offset_kernel(pmd, k_cur), pte, 0);
+ }
+ flush_tlb_kernel_range(k_start, k_end);
+ return 0;
+}
+
+void __init
+kasan_update_early_region(unsigned long k_start, unsigned long k_end, pte_t pte)
+{
+ unsigned long k_cur;
+
+ for (k_cur = k_start; k_cur != k_end; k_cur += PAGE_SIZE) {
+ pmd_t *pmd = pmd_off_k(k_cur);
+ pte_t *ptep = pte_offset_kernel(pmd, k_cur);
+
+ if (pte_page(*ptep) != virt_to_page(lm_alias(kasan_early_shadow_page)))
+ continue;
+
+ __set_pte_at(&init_mm, k_cur, ptep, pte, 0);
+ }
+
+ flush_tlb_kernel_range(k_start, k_end);
+}
+
+static void __init kasan_remap_early_shadow_ro(void)
+{
+ pgprot_t prot = kasan_prot_ro();
+ phys_addr_t pa = __pa(kasan_early_shadow_page);
+
+ kasan_populate_pte(kasan_early_shadow_pte, prot);
+
+ kasan_update_early_region(KASAN_SHADOW_START, KASAN_SHADOW_END,
+ pfn_pte(PHYS_PFN(pa), prot));
+}
+
+static void __init kasan_unmap_early_shadow_vmalloc(void)
+{
+ unsigned long k_start = (unsigned long)kasan_mem_to_shadow((void *)VMALLOC_START);
+ unsigned long k_end = (unsigned long)kasan_mem_to_shadow((void *)VMALLOC_END);
+
+ kasan_update_early_region(k_start, k_end, __pte(0));
+
+#ifdef MODULES_VADDR
+ k_start = (unsigned long)kasan_mem_to_shadow((void *)MODULES_VADDR);
+ k_end = (unsigned long)kasan_mem_to_shadow((void *)MODULES_END);
+ kasan_update_early_region(k_start, k_end, __pte(0));
+#endif
+}
+
+void __init kasan_mmu_init(void)
+{
+ int ret;
+
+ if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) {
+ ret = kasan_init_shadow_page_tables(KASAN_SHADOW_START, KASAN_SHADOW_END);
+
+ if (ret)
+ panic("kasan: kasan_init_shadow_page_tables() failed");
+ }
+}
+
+void __init kasan_init(void)
+{
+ phys_addr_t base, end;
+ u64 i;
+ int ret;
+
+ for_each_mem_range(i, &base, &end) {
+ phys_addr_t top = min(end, total_lowmem);
+
+ if (base >= top)
+ continue;
+
+ ret = kasan_init_region(__va(base), top - base);
+ if (ret)
+ panic("kasan: kasan_init_region() failed");
+ }
+
+ if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) {
+ ret = kasan_init_shadow_page_tables(KASAN_SHADOW_START, KASAN_SHADOW_END);
+
+ if (ret)
+ panic("kasan: kasan_init_shadow_page_tables() failed");
+ }
+
+ kasan_remap_early_shadow_ro();
+
+ clear_page(kasan_early_shadow_page);
+
+ /* At this point kasan is fully initialized. Enable error messages */
+ init_task.kasan_depth = 0;
+ kasan_init_generic();
+}
+
+void __init kasan_late_init(void)
+{
+ if (IS_ENABLED(CONFIG_KASAN_VMALLOC))
+ kasan_unmap_early_shadow_vmalloc();
+}
+
+void __init kasan_early_init(void)
+{
+ unsigned long addr = KASAN_SHADOW_START;
+ unsigned long end = KASAN_SHADOW_END;
+ unsigned long next;
+ pmd_t *pmd = pmd_off_k(addr);
+
+ BUILD_BUG_ON(KASAN_SHADOW_START & ~PGDIR_MASK);
+
+ kasan_populate_pte(kasan_early_shadow_pte, PAGE_KERNEL);
+
+ do {
+ next = pgd_addr_end(addr, end);
+ pmd_populate_kernel(&init_mm, pmd, kasan_early_shadow_pte);
+ } while (pmd++, addr = next, addr != end);
+}
diff --git a/arch/powerpc/mm/kasan/init_book3e_64.c b/arch/powerpc/mm/kasan/init_book3e_64.c
new file mode 100644
index 000000000000..0d3a73d6d4b0
--- /dev/null
+++ b/arch/powerpc/mm/kasan/init_book3e_64.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KASAN for 64-bit Book3e powerpc
+ *
+ * Copyright 2022, Christophe Leroy, CS GROUP France
+ */
+
+#define DISABLE_BRANCH_PROFILING
+
+#include <linux/kasan.h>
+#include <linux/printk.h>
+#include <linux/memblock.h>
+#include <linux/set_memory.h>
+
+#include <asm/pgalloc.h>
+
+static inline bool kasan_pud_table(p4d_t p4d)
+{
+ return p4d_page(p4d) == virt_to_page(lm_alias(kasan_early_shadow_pud));
+}
+
+static inline bool kasan_pmd_table(pud_t pud)
+{
+ return pud_page(pud) == virt_to_page(lm_alias(kasan_early_shadow_pmd));
+}
+
+static inline bool kasan_pte_table(pmd_t pmd)
+{
+ return pmd_page(pmd) == virt_to_page(lm_alias(kasan_early_shadow_pte));
+}
+
+static int __init kasan_map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
+{
+ pgd_t *pgdp;
+ p4d_t *p4dp;
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t *ptep;
+
+ pgdp = pgd_offset_k(ea);
+ p4dp = p4d_offset(pgdp, ea);
+ if (kasan_pud_table(*p4dp)) {
+ pudp = memblock_alloc_or_panic(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
+ memcpy(pudp, kasan_early_shadow_pud, PUD_TABLE_SIZE);
+ p4d_populate(&init_mm, p4dp, pudp);
+ }
+ pudp = pud_offset(p4dp, ea);
+ if (kasan_pmd_table(*pudp)) {
+ pmdp = memblock_alloc_or_panic(PMD_TABLE_SIZE, PMD_TABLE_SIZE);
+ memcpy(pmdp, kasan_early_shadow_pmd, PMD_TABLE_SIZE);
+ pud_populate(&init_mm, pudp, pmdp);
+ }
+ pmdp = pmd_offset(pudp, ea);
+ if (kasan_pte_table(*pmdp)) {
+ ptep = memblock_alloc_or_panic(PTE_TABLE_SIZE, PTE_TABLE_SIZE);
+ memcpy(ptep, kasan_early_shadow_pte, PTE_TABLE_SIZE);
+ pmd_populate_kernel(&init_mm, pmdp, ptep);
+ }
+ ptep = pte_offset_kernel(pmdp, ea);
+
+ __set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot), 0);
+
+ return 0;
+}
+
+static void __init kasan_init_phys_region(void *start, void *end)
+{
+ unsigned long k_start, k_end, k_cur;
+ void *va;
+
+ if (start >= end)
+ return;
+
+ k_start = ALIGN_DOWN((unsigned long)kasan_mem_to_shadow(start), PAGE_SIZE);
+ k_end = ALIGN((unsigned long)kasan_mem_to_shadow(end), PAGE_SIZE);
+
+ va = memblock_alloc_or_panic(k_end - k_start, PAGE_SIZE);
+ for (k_cur = k_start; k_cur < k_end; k_cur += PAGE_SIZE, va += PAGE_SIZE)
+ kasan_map_kernel_page(k_cur, __pa(va), PAGE_KERNEL);
+}
+
+void __init kasan_early_init(void)
+{
+ int i;
+ unsigned long addr;
+ pgd_t *pgd = pgd_offset_k(KASAN_SHADOW_START);
+ pte_t zero_pte = pfn_pte(virt_to_pfn(kasan_early_shadow_page), PAGE_KERNEL);
+
+ BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, PGDIR_SIZE));
+ BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, PGDIR_SIZE));
+
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ __set_pte_at(&init_mm, (unsigned long)kasan_early_shadow_page,
+ &kasan_early_shadow_pte[i], zero_pte, 0);
+
+ for (i = 0; i < PTRS_PER_PMD; i++)
+ pmd_populate_kernel(&init_mm, &kasan_early_shadow_pmd[i],
+ kasan_early_shadow_pte);
+
+ for (i = 0; i < PTRS_PER_PUD; i++)
+ pud_populate(&init_mm, &kasan_early_shadow_pud[i],
+ kasan_early_shadow_pmd);
+
+ for (addr = KASAN_SHADOW_START; addr != KASAN_SHADOW_END; addr += PGDIR_SIZE)
+ p4d_populate(&init_mm, p4d_offset(pgd++, addr), kasan_early_shadow_pud);
+}
+
+void __init kasan_init(void)
+{
+ phys_addr_t start, end;
+ u64 i;
+ pte_t zero_pte = pfn_pte(virt_to_pfn(kasan_early_shadow_page), PAGE_KERNEL_RO);
+
+ for_each_mem_range(i, &start, &end)
+ kasan_init_phys_region(phys_to_virt(start), phys_to_virt(end));
+
+ if (IS_ENABLED(CONFIG_KASAN_VMALLOC))
+ kasan_remove_zero_shadow((void *)VMALLOC_START, VMALLOC_SIZE);
+
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ __set_pte_at(&init_mm, (unsigned long)kasan_early_shadow_page,
+ &kasan_early_shadow_pte[i], zero_pte, 0);
+
+ flush_tlb_kernel_range(KASAN_SHADOW_START, KASAN_SHADOW_END);
+
+ memset(kasan_early_shadow_page, 0, PAGE_SIZE);
+
+ /* Enable error messages */
+ init_task.kasan_depth = 0;
+ kasan_init_generic();
+}
+
+void __init kasan_late_init(void) { }
diff --git a/arch/powerpc/mm/kasan/init_book3s_64.c b/arch/powerpc/mm/kasan/init_book3s_64.c
new file mode 100644
index 000000000000..dcafa641804c
--- /dev/null
+++ b/arch/powerpc/mm/kasan/init_book3s_64.c
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KASAN for 64-bit Book3S powerpc
+ *
+ * Copyright 2019-2022, Daniel Axtens, IBM Corporation.
+ */
+
+/*
+ * ppc64 turns on virtual memory late in boot, after calling into generic code
+ * like the device-tree parser, so it uses this in conjunction with a hook in
+ * outline mode to avoid invalid access early in boot.
+ */
+
+#define DISABLE_BRANCH_PROFILING
+
+#include <linux/kasan.h>
+#include <linux/printk.h>
+#include <linux/sched/task.h>
+#include <linux/memblock.h>
+#include <asm/pgalloc.h>
+
+static void __init kasan_init_phys_region(void *start, void *end)
+{
+ unsigned long k_start, k_end, k_cur;
+ void *va;
+
+ if (start >= end)
+ return;
+
+ k_start = ALIGN_DOWN((unsigned long)kasan_mem_to_shadow(start), PAGE_SIZE);
+ k_end = ALIGN((unsigned long)kasan_mem_to_shadow(end), PAGE_SIZE);
+
+ va = memblock_alloc_or_panic(k_end - k_start, PAGE_SIZE);
+ for (k_cur = k_start; k_cur < k_end; k_cur += PAGE_SIZE, va += PAGE_SIZE)
+ map_kernel_page(k_cur, __pa(va), PAGE_KERNEL);
+}
+
+void __init kasan_init(void)
+{
+ /*
+ * We want to do the following things:
+ * 1) Map real memory into the shadow for all physical memblocks
+ * This takes us from c000... to c008...
+ * 2) Leave a hole over the shadow of vmalloc space. KASAN_VMALLOC
+ * will manage this for us.
+ * This takes us from c008... to c00a...
+ * 3) Map the 'early shadow'/zero page over iomap and vmemmap space.
+ * This takes us up to where we start at c00e...
+ */
+
+ void *k_start = kasan_mem_to_shadow((void *)RADIX_VMALLOC_END);
+ void *k_end = kasan_mem_to_shadow((void *)RADIX_VMEMMAP_END);
+ phys_addr_t start, end;
+ u64 i;
+ pte_t zero_pte = pfn_pte(virt_to_pfn(kasan_early_shadow_page), PAGE_KERNEL);
+
+ if (!early_radix_enabled()) {
+ pr_warn("KASAN not enabled as it requires radix!");
+ return;
+ }
+
+ for_each_mem_range(i, &start, &end)
+ kasan_init_phys_region(phys_to_virt(start), phys_to_virt(end));
+
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ __set_pte_at(&init_mm, (unsigned long)kasan_early_shadow_page,
+ &kasan_early_shadow_pte[i], zero_pte, 0);
+
+ for (i = 0; i < PTRS_PER_PMD; i++)
+ pmd_populate_kernel(&init_mm, &kasan_early_shadow_pmd[i],
+ kasan_early_shadow_pte);
+
+ for (i = 0; i < PTRS_PER_PUD; i++)
+ pud_populate(&init_mm, &kasan_early_shadow_pud[i],
+ kasan_early_shadow_pmd);
+
+ /* map the early shadow over the iomap and vmemmap space */
+ kasan_populate_early_shadow(k_start, k_end);
+
+ /* mark early shadow region as RO and wipe it */
+ zero_pte = pfn_pte(virt_to_pfn(kasan_early_shadow_page), PAGE_KERNEL_RO);
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ __set_pte_at(&init_mm, (unsigned long)kasan_early_shadow_page,
+ &kasan_early_shadow_pte[i], zero_pte, 0);
+
+ /*
+ * clear_page relies on some cache info that hasn't been set up yet.
+ * It ends up looping ~forever and blows up other data.
+ * Use memset instead.
+ */
+ memset(kasan_early_shadow_page, 0, PAGE_SIZE);
+
+ /* Enable error messages */
+ init_task.kasan_depth = 0;
+ kasan_init_generic();
+}
+
+void __init kasan_early_init(void) { }
+
+void __init kasan_late_init(void) { }
diff --git a/arch/powerpc/mm/maccess.c b/arch/powerpc/mm/maccess.c
new file mode 100644
index 000000000000..ea821d0ffe16
--- /dev/null
+++ b/arch/powerpc/mm/maccess.c
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+
+#include <asm/disassemble.h>
+#include <asm/inst.h>
+#include <asm/ppc-opcode.h>
+
+bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size)
+{
+ return is_kernel_addr((unsigned long)unsafe_src);
+}
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 7f4bea162026..3ddbfdbfa941 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* PowerPC version
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
@@ -9,251 +10,218 @@
*
* Derived from "arch/i386/mm/init.c"
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
-#include <linux/export.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/gfp.h>
-#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/stddef.h>
-#include <linux/init.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/highmem.h>
-#include <linux/initrd.h>
-#include <linux/pagemap.h>
#include <linux/suspend.h>
-#include <linux/memblock.h>
-#include <linux/hugetlb.h>
-#include <linux/slab.h>
-
-#include <asm/pgalloc.h>
-#include <asm/prom.h>
-#include <asm/io.h>
-#include <asm/mmu_context.h>
-#include <asm/pgtable.h>
-#include <asm/mmu.h>
-#include <asm/smp.h>
-#include <asm/machdep.h>
-#include <asm/btext.h>
-#include <asm/tlb.h>
-#include <asm/sections.h>
-#include <asm/sparsemem.h>
-#include <asm/vdso.h>
-#include <asm/fixmap.h>
+#include <linux/dma-direct.h>
+#include <linux/execmem.h>
+#include <linux/vmalloc.h>
+
#include <asm/swiotlb.h>
+#include <asm/machdep.h>
#include <asm/rtas.h>
+#include <asm/kasan.h>
+#include <asm/svm.h>
+#include <asm/mmzone.h>
+#include <asm/ftrace.h>
+#include <asm/text-patching.h>
+#include <asm/setup.h>
+#include <asm/fixmap.h>
-#include "mmu_decl.h"
-
-#ifndef CPU_FTR_COHERENT_ICACHE
-#define CPU_FTR_COHERENT_ICACHE 0 /* XXX for now */
-#define CPU_FTR_NOEXECUTE 0
-#endif
-
-int init_bootmem_done;
-int mem_init_done;
-unsigned long long memory_limit;
-
-#ifdef CONFIG_HIGHMEM
-pte_t *kmap_pte;
-EXPORT_SYMBOL(kmap_pte);
-pgprot_t kmap_prot;
-EXPORT_SYMBOL(kmap_prot);
-
-static inline pte_t *virt_to_kpte(unsigned long vaddr)
-{
- return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
- vaddr), vaddr), vaddr);
-}
-#endif
+#include <mm/mmu_decl.h>
-int page_is_ram(unsigned long pfn)
-{
-#ifndef CONFIG_PPC64 /* XXX for now */
- return pfn < max_pfn;
-#else
- unsigned long paddr = (pfn << PAGE_SHIFT);
- struct memblock_region *reg;
+unsigned long long memory_limit __initdata;
- for_each_memblock(memory, reg)
- if (paddr >= reg->base && paddr < (reg->base + reg->size))
- return 1;
- return 0;
-#endif
-}
+unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss;
+EXPORT_SYMBOL(empty_zero_page);
-pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
- unsigned long size, pgprot_t vma_prot)
+pgprot_t __phys_mem_access_prot(unsigned long pfn, unsigned long size,
+ pgprot_t vma_prot)
{
if (ppc_md.phys_mem_access_prot)
- return ppc_md.phys_mem_access_prot(file, pfn, size, vma_prot);
+ return ppc_md.phys_mem_access_prot(pfn, size, vma_prot);
if (!page_is_ram(pfn))
vma_prot = pgprot_noncached(vma_prot);
return vma_prot;
}
-EXPORT_SYMBOL(phys_mem_access_prot);
+EXPORT_SYMBOL(__phys_mem_access_prot);
#ifdef CONFIG_MEMORY_HOTPLUG
+static DEFINE_MUTEX(linear_mapping_mutex);
#ifdef CONFIG_NUMA
int memory_add_physaddr_to_nid(u64 start)
{
return hot_add_scn_to_nid(start);
}
+EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif
-int arch_add_memory(int nid, u64 start, u64 size)
+int __weak create_section_mapping(unsigned long start, unsigned long end,
+ int nid, pgprot_t prot)
{
- struct pglist_data *pgdata;
- struct zone *zone;
- unsigned long start_pfn = start >> PAGE_SHIFT;
- unsigned long nr_pages = size >> PAGE_SHIFT;
-
- pgdata = NODE_DATA(nid);
+ return -ENODEV;
+}
- start = (unsigned long)__va(start);
- if (create_section_mapping(start, start + size))
- return -EINVAL;
+int __weak remove_section_mapping(unsigned long start, unsigned long end)
+{
+ return -ENODEV;
+}
- /* this should work for most non-highmem platforms */
- zone = pgdata->node_zones;
+int __ref arch_create_linear_mapping(int nid, u64 start, u64 size,
+ struct mhp_params *params)
+{
+ int rc;
- return __add_pages(nid, zone, start_pfn, nr_pages);
+ start = (unsigned long)__va(start);
+ mutex_lock(&linear_mapping_mutex);
+ rc = create_section_mapping(start, start + size, nid,
+ params->pgprot);
+ mutex_unlock(&linear_mapping_mutex);
+ if (rc) {
+ pr_warn("Unable to create linear mapping for 0x%llx..0x%llx: %d\n",
+ start, start + size, rc);
+ return -EFAULT;
+ }
+ return 0;
}
-#ifdef CONFIG_MEMORY_HOTREMOVE
-int arch_remove_memory(u64 start, u64 size)
+void __ref arch_remove_linear_mapping(u64 start, u64 size)
{
- unsigned long start_pfn = start >> PAGE_SHIFT;
- unsigned long nr_pages = size >> PAGE_SHIFT;
- struct zone *zone;
+ int ret;
- zone = page_zone(pfn_to_page(start_pfn));
- return __remove_pages(zone, start_pfn, nr_pages);
+ /* Remove htab bolted mappings for this section of memory */
+ start = (unsigned long)__va(start);
+
+ mutex_lock(&linear_mapping_mutex);
+ ret = remove_section_mapping(start, start + size);
+ mutex_unlock(&linear_mapping_mutex);
+ if (ret)
+ pr_warn("Unable to remove linear mapping for 0x%llx..0x%llx: %d\n",
+ start, start + size, ret);
+
+ /* Ensure all vmalloc mappings are flushed in case they also
+ * hit that section of memory
+ */
+ vm_unmap_aliases();
}
-#endif
-#endif /* CONFIG_MEMORY_HOTPLUG */
/*
- * walk_memory_resource() needs to make sure there is no holes in a given
- * memory range. PPC64 does not maintain the memory layout in /proc/iomem.
- * Instead it maintains it in memblock.memory structures. Walk through the
- * memory regions, find holes and callback for contiguous regions.
+ * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need
+ * updating.
*/
-int
-walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
- void *arg, int (*func)(unsigned long, unsigned long, void *))
+static void update_end_of_memory_vars(u64 start, u64 size)
{
- struct memblock_region *reg;
- unsigned long end_pfn = start_pfn + nr_pages;
- unsigned long tstart, tend;
- int ret = -1;
-
- for_each_memblock(memory, reg) {
- tstart = max(start_pfn, memblock_region_memory_base_pfn(reg));
- tend = min(end_pfn, memblock_region_memory_end_pfn(reg));
- if (tstart >= tend)
- continue;
- ret = (*func)(tstart, tend - tstart, arg);
- if (ret)
- break;
+ unsigned long end_pfn = PFN_UP(start + size);
+
+ if (end_pfn > max_pfn) {
+ max_pfn = end_pfn;
+ max_low_pfn = end_pfn;
+ high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
}
+}
+
+int __ref add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
+ struct mhp_params *params)
+{
+ int ret;
+
+ ret = __add_pages(nid, start_pfn, nr_pages, params);
+ if (ret)
+ return ret;
+
+ /* update max_pfn, max_low_pfn and high_memory */
+ update_end_of_memory_vars(start_pfn << PAGE_SHIFT,
+ nr_pages << PAGE_SHIFT);
+
return ret;
}
-EXPORT_SYMBOL_GPL(walk_system_ram_range);
-/*
- * Initialize the bootmem system and give it all the memory we
- * have available. If we are using highmem, we only put the
- * lowmem into the bootmem system.
- */
-#ifndef CONFIG_NEED_MULTIPLE_NODES
-void __init do_init_bootmem(void)
+int __ref arch_add_memory(int nid, u64 start, u64 size,
+ struct mhp_params *params)
+{
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+ int rc;
+
+ rc = arch_create_linear_mapping(nid, start, size, params);
+ if (rc)
+ return rc;
+ rc = add_pages(nid, start_pfn, nr_pages, params);
+ if (rc)
+ arch_remove_linear_mapping(start, size);
+ return rc;
+}
+
+void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
{
- unsigned long start, bootmap_pages;
- unsigned long total_pages;
- struct memblock_region *reg;
- int boot_mapsize;
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+
+ __remove_pages(start_pfn, nr_pages, altmap);
+ arch_remove_linear_mapping(start, size);
+}
+#endif
+#ifndef CONFIG_NUMA
+void __init mem_topology_setup(void)
+{
max_low_pfn = max_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
- total_pages = (memblock_end_of_DRAM() - memstart_addr) >> PAGE_SHIFT;
+ min_low_pfn = MEMORY_START >> PAGE_SHIFT;
#ifdef CONFIG_HIGHMEM
- total_pages = total_lowmem >> PAGE_SHIFT;
max_low_pfn = lowmem_end_addr >> PAGE_SHIFT;
#endif
- /*
- * Find an area to use for the bootmem bitmap. Calculate the size of
- * bitmap required as (Total Memory) / PAGE_SIZE / BITS_PER_BYTE.
- * Add 1 additional page in case the address isn't page-aligned.
- */
- bootmap_pages = bootmem_bootmap_pages(total_pages);
-
- start = memblock_alloc(bootmap_pages << PAGE_SHIFT, PAGE_SIZE);
-
- min_low_pfn = MEMORY_START >> PAGE_SHIFT;
- boot_mapsize = init_bootmem_node(NODE_DATA(0), start >> PAGE_SHIFT, min_low_pfn, max_low_pfn);
-
/* Place all memblock_regions in the same node and merge contiguous
* memblock_regions
*/
- memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
-
- /* Add all physical memory to the bootmem map, mark each area
- * present.
- */
-#ifdef CONFIG_HIGHMEM
- free_bootmem_with_active_regions(0, lowmem_end_addr >> PAGE_SHIFT);
-
- /* reserve the sections we're already using */
- for_each_memblock(reserved, reg) {
- unsigned long top = reg->base + reg->size - 1;
- if (top < lowmem_end_addr)
- reserve_bootmem(reg->base, reg->size, BOOTMEM_DEFAULT);
- else if (reg->base < lowmem_end_addr) {
- unsigned long trunc_size = lowmem_end_addr - reg->base;
- reserve_bootmem(reg->base, trunc_size, BOOTMEM_DEFAULT);
- }
- }
-#else
- free_bootmem_with_active_regions(0, max_pfn);
-
- /* reserve the sections we're already using */
- for_each_memblock(reserved, reg)
- reserve_bootmem(reg->base, reg->size, BOOTMEM_DEFAULT);
-#endif
- /* XXX need to clip this if using highmem? */
- sparse_memory_present_with_active_regions(0);
+ memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0);
+}
- init_bootmem_done = 1;
+void __init initmem_init(void)
+{
+ sparse_init();
}
/* mark pages that don't exist as nosave */
static int __init mark_nonram_nosave(void)
{
- struct memblock_region *reg, *prev = NULL;
-
- for_each_memblock(memory, reg) {
- if (prev &&
- memblock_region_memory_end_pfn(prev) < memblock_region_memory_base_pfn(reg))
- register_nosave_region(memblock_region_memory_end_pfn(prev),
- memblock_region_memory_base_pfn(reg));
- prev = reg;
+ unsigned long spfn, epfn, prev = 0;
+ int i;
+
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &spfn, &epfn, NULL) {
+ if (prev && prev < spfn)
+ register_nosave_region(prev, spfn);
+
+ prev = epfn;
}
+
return 0;
}
+#else /* CONFIG_NUMA */
+static int __init mark_nonram_nosave(void)
+{
+ return 0;
+}
+#endif
+
+/*
+ * Zones usage:
+ *
+ * We setup ZONE_DMA to be 31-bits on all platforms and ZONE_NORMAL to be
+ * everything else. GFP_DMA32 page allocations automatically fall back to
+ * ZONE_DMA.
+ *
+ * By using 31-bit unconditionally, we can exploit zone_dma_limit to inform the
+ * generic DMA mapping code. 32-bit only devices (if not handled by an IOMMU
+ * anyway) will take a first dip into ZONE_NORMAL and get otherwise served by
+ * ZONE_DMA.
+ */
+static unsigned long max_zone_pfns[MAX_NR_ZONES];
/*
* paging_init() sets up the page tables - in fact we've already done this.
@@ -262,66 +230,72 @@ void __init paging_init(void)
{
unsigned long long total_ram = memblock_phys_mem_size();
phys_addr_t top_of_ram = memblock_end_of_DRAM();
- unsigned long max_zone_pfns[MAX_NR_ZONES];
+ int zone_dma_bits;
-#ifdef CONFIG_PPC32
- unsigned long v = __fix_to_virt(__end_of_fixed_addresses - 1);
- unsigned long end = __fix_to_virt(FIX_HOLE);
+#ifdef CONFIG_HIGHMEM
+ unsigned long v = __fix_to_virt(FIX_KMAP_END);
+ unsigned long end = __fix_to_virt(FIX_KMAP_BEGIN);
for (; v < end; v += PAGE_SIZE)
- map_page(v, 0, 0); /* XXX gross */
-#endif
+ map_kernel_page(v, 0, __pgprot(0)); /* XXX gross */
-#ifdef CONFIG_HIGHMEM
- map_page(PKMAP_BASE, 0, 0); /* XXX gross */
+ map_kernel_page(PKMAP_BASE, 0, __pgprot(0)); /* XXX gross */
pkmap_page_table = virt_to_kpte(PKMAP_BASE);
-
- kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN));
- kmap_prot = PAGE_KERNEL;
#endif /* CONFIG_HIGHMEM */
printk(KERN_DEBUG "Top of RAM: 0x%llx, Total RAM: 0x%llx\n",
(unsigned long long)top_of_ram, total_ram);
printk(KERN_DEBUG "Memory hole size: %ldMB\n",
(long int)((top_of_ram - total_ram) >> 20));
- memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
+
+ /*
+ * Allow 30-bit DMA for very limited Broadcom wifi chips on many
+ * powerbooks.
+ */
+ if (IS_ENABLED(CONFIG_PPC32))
+ zone_dma_bits = 30;
+ else
+ zone_dma_bits = 31;
+
+ zone_dma_limit = DMA_BIT_MASK(zone_dma_bits);
+
+#ifdef CONFIG_ZONE_DMA
+ max_zone_pfns[ZONE_DMA] = min(max_low_pfn,
+ 1UL << (zone_dma_bits - PAGE_SHIFT));
+#endif
+ max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
#ifdef CONFIG_HIGHMEM
- max_zone_pfns[ZONE_DMA] = lowmem_end_addr >> PAGE_SHIFT;
- max_zone_pfns[ZONE_HIGHMEM] = top_of_ram >> PAGE_SHIFT;
-#else
- max_zone_pfns[ZONE_DMA] = top_of_ram >> PAGE_SHIFT;
+ max_zone_pfns[ZONE_HIGHMEM] = max_pfn;
#endif
- free_area_init_nodes(max_zone_pfns);
+
+ free_area_init(max_zone_pfns);
mark_nonram_nosave();
}
-#endif /* ! CONFIG_NEED_MULTIPLE_NODES */
-void __init mem_init(void)
+void __init arch_mm_preinit(void)
{
+ /*
+ * book3s is limited to 16 page sizes due to encoding this in
+ * a 4-bit field for slices.
+ */
+ BUILD_BUG_ON(MMU_PAGE_COUNT > 16);
+
#ifdef CONFIG_SWIOTLB
- swiotlb_init(0);
+ /*
+ * Some platforms (e.g. 85xx) limit DMA-able memory way below
+ * 4G. We force memblock to bottom-up mode to ensure that the
+ * memory allocated in swiotlb_init() is DMA-able.
+ * As it's the last memblock allocation, no need to reset it
+ * back to to-down.
+ */
+ memblock_set_bottom_up(true);
+ swiotlb_init(ppc_swiotlb_enable, ppc_swiotlb_flags);
#endif
- high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
- set_max_mapnr(max_pfn);
- free_all_bootmem();
-
-#ifdef CONFIG_HIGHMEM
- {
- unsigned long pfn, highmem_mapnr;
-
- highmem_mapnr = lowmem_end_addr >> PAGE_SHIFT;
- for (pfn = highmem_mapnr; pfn < max_mapnr; ++pfn) {
- phys_addr_t paddr = (phys_addr_t)pfn << PAGE_SHIFT;
- struct page *page = pfn_to_page(pfn);
- if (!memblock_is_reserved(paddr))
- free_highmem_page(page);
- }
- }
-#endif /* CONFIG_HIGHMEM */
+ kasan_late_init();
-#if defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_SMP)
+#if defined(CONFIG_PPC_E500) && !defined(CONFIG_SMP)
/*
* If smp is enabled, next_tlbcam_idx is initialized in the cpu up
* functions.... do it here for the non-smp case.
@@ -329,193 +303,42 @@ void __init mem_init(void)
per_cpu(next_tlbcam_idx, smp_processor_id()) =
(mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) - 1;
#endif
-
- mem_init_print_info(NULL);
-#ifdef CONFIG_PPC32
- pr_info("Kernel virtual memory layout:\n");
- pr_info(" * 0x%08lx..0x%08lx : fixmap\n", FIXADDR_START, FIXADDR_TOP);
-#ifdef CONFIG_HIGHMEM
- pr_info(" * 0x%08lx..0x%08lx : highmem PTEs\n",
- PKMAP_BASE, PKMAP_ADDR(LAST_PKMAP));
-#endif /* CONFIG_HIGHMEM */
-#ifdef CONFIG_NOT_COHERENT_CACHE
- pr_info(" * 0x%08lx..0x%08lx : consistent mem\n",
- IOREMAP_TOP, IOREMAP_TOP + CONFIG_CONSISTENT_SIZE);
-#endif /* CONFIG_NOT_COHERENT_CACHE */
- pr_info(" * 0x%08lx..0x%08lx : early ioremap\n",
- ioremap_bot, IOREMAP_TOP);
- pr_info(" * 0x%08lx..0x%08lx : vmalloc & ioremap\n",
- VMALLOC_START, VMALLOC_END);
-#endif /* CONFIG_PPC32 */
-
- mem_init_done = 1;
}
void free_initmem(void)
{
ppc_md.progress = ppc_printk_progress;
+ mark_initmem_nx();
free_initmem_default(POISON_FREE_INITMEM);
-}
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void __init free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
-
-/*
- * This is called when a page has been modified by the kernel.
- * It just marks the page as not i-cache clean. We do the i-cache
- * flush later when the page is given to a user process, if necessary.
- */
-void flush_dcache_page(struct page *page)
-{
- if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
- return;
- /* avoid an atomic op if possible */
- if (test_bit(PG_arch_1, &page->flags))
- clear_bit(PG_arch_1, &page->flags);
-}
-EXPORT_SYMBOL(flush_dcache_page);
-
-void flush_dcache_icache_page(struct page *page)
-{
-#ifdef CONFIG_HUGETLB_PAGE
- if (PageCompound(page)) {
- flush_dcache_icache_hugepage(page);
- return;
- }
-#endif
-#ifdef CONFIG_BOOKE
- {
- void *start = kmap_atomic(page);
- __flush_dcache_icache(start);
- kunmap_atomic(start);
- }
-#elif defined(CONFIG_8xx) || defined(CONFIG_PPC64)
- /* On 8xx there is no need to kmap since highmem is not supported */
- __flush_dcache_icache(page_address(page));
-#else
- __flush_dcache_icache_phys(page_to_pfn(page) << PAGE_SHIFT);
-#endif
-}
-EXPORT_SYMBOL(flush_dcache_icache_page);
-
-void clear_user_page(void *page, unsigned long vaddr, struct page *pg)
-{
- clear_page(page);
-
- /*
- * We shouldn't have to do this, but some versions of glibc
- * require it (ld.so assumes zero filled pages are icache clean)
- * - Anton
- */
- flush_dcache_page(pg);
-}
-EXPORT_SYMBOL(clear_user_page);
-
-void copy_user_page(void *vto, void *vfrom, unsigned long vaddr,
- struct page *pg)
-{
- copy_page(vto, vfrom);
-
- /*
- * We should be able to use the following optimisation, however
- * there are two problems.
- * Firstly a bug in some versions of binutils meant PLT sections
- * were not marked executable.
- * Secondly the first word in the GOT section is blrl, used
- * to establish the GOT address. Until recently the GOT was
- * not marked executable.
- * - Anton
- */
-#if 0
- if (!vma->vm_file && ((vma->vm_flags & VM_EXEC) == 0))
- return;
-#endif
-
- flush_dcache_page(pg);
-}
-
-void flush_icache_user_range(struct vm_area_struct *vma, struct page *page,
- unsigned long addr, int len)
-{
- unsigned long maddr;
-
- maddr = (unsigned long) kmap(page) + (addr & ~PAGE_MASK);
- flush_icache_range(maddr, maddr + len);
- kunmap(page);
-}
-EXPORT_SYMBOL(flush_icache_user_range);
-
-/*
- * This is called at the end of handling a user page fault, when the
- * fault has been handled by updating a PTE in the linux page tables.
- * We use it to preload an HPTE into the hash table corresponding to
- * the updated linux PTE.
- *
- * This must always be called with the pte lock held.
- */
-void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
- pte_t *ptep)
-{
-#ifdef CONFIG_PPC_STD_MMU
- /*
- * We don't need to worry about _PAGE_PRESENT here because we are
- * called with either mm->page_table_lock held or ptl lock held
- */
- unsigned long access = 0, trap;
-
- /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
- if (!pte_young(*ptep) || address >= TASK_SIZE)
- return;
-
- /* We try to figure out if we are coming from an instruction
- * access fault and pass that down to __hash_page so we avoid
- * double-faulting on execution of fresh text. We have to test
- * for regs NULL since init will get here first thing at boot
- *
- * We also avoid filling the hash if not coming from a fault
- */
- if (current->thread.regs == NULL)
- return;
- trap = TRAP(current->thread.regs);
- if (trap == 0x400)
- access |= _PAGE_EXEC;
- else if (trap != 0x300)
- return;
- hash_preload(vma->vm_mm, address, access, trap);
-#endif /* CONFIG_PPC_STD_MMU */
-#if (defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_FSL_BOOK3E)) \
- && defined(CONFIG_HUGETLB_PAGE)
- if (is_vm_hugetlb_page(vma))
- book3e_hugetlb_preload(vma, address, *ptep);
-#endif
+ ftrace_free_init_tramp();
}
/*
* System memory should not be in /proc/iomem but various tools expect it
* (eg kdump).
*/
-static int add_system_ram_resources(void)
+static int __init add_system_ram_resources(void)
{
- struct memblock_region *reg;
+ phys_addr_t start, end;
+ u64 i;
- for_each_memblock(memory, reg) {
+ for_each_mem_range(i, &start, &end) {
struct resource *res;
- unsigned long base = reg->base;
- unsigned long size = reg->size;
res = kzalloc(sizeof(struct resource), GFP_KERNEL);
WARN_ON(!res);
if (res) {
res->name = "System RAM";
- res->start = base;
- res->end = base + size - 1;
- res->flags = IORESOURCE_MEM;
- WARN_ON(request_resource(&iomem_resource, res) < 0);
+ res->start = start;
+ /*
+ * In memblock, end points to the first byte after
+ * the range while in resourses, end points to the
+ * last byte in the range.
+ */
+ res->end = end - 1;
+ res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+ WARN_ON(insert_resource(&iomem_resource, res) < 0);
}
}
@@ -533,12 +356,95 @@ subsys_initcall(add_system_ram_resources);
*/
int devmem_is_allowed(unsigned long pfn)
{
- if (iomem_is_exclusive(pfn << PAGE_SHIFT))
+ if (page_is_rtas_user_buf(pfn))
+ return 1;
+ if (iomem_is_exclusive(PFN_PHYS(pfn)))
return 0;
if (!page_is_ram(pfn))
return 1;
- if (page_is_rtas_user_buf(pfn))
- return 1;
return 0;
}
#endif /* CONFIG_STRICT_DEVMEM */
+
+/*
+ * This is defined in kernel/resource.c but only powerpc needs to export it, for
+ * the EHEA driver. Drop this when drivers/net/ethernet/ibm/ehea is removed.
+ */
+EXPORT_SYMBOL_GPL(walk_system_ram_range);
+
+#ifdef CONFIG_EXECMEM
+static struct execmem_info execmem_info __ro_after_init;
+
+#if defined(CONFIG_PPC_8xx) || defined(CONFIG_PPC_BOOK3S_603)
+static void prealloc_execmem_pgtable(void)
+{
+ unsigned long va;
+
+ for (va = ALIGN_DOWN(MODULES_VADDR, PGDIR_SIZE); va < MODULES_END; va += PGDIR_SIZE)
+ pte_alloc_kernel(pmd_off_k(va), va);
+}
+#else
+static void prealloc_execmem_pgtable(void) { }
+#endif
+
+struct execmem_info __init *execmem_arch_setup(void)
+{
+ pgprot_t kprobes_prot = strict_module_rwx_enabled() ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC;
+ pgprot_t prot = strict_module_rwx_enabled() ? PAGE_KERNEL : PAGE_KERNEL_EXEC;
+ unsigned long fallback_start = 0, fallback_end = 0;
+ unsigned long start, end;
+
+ /*
+ * BOOK3S_32 and 8xx define MODULES_VADDR for text allocations and
+ * allow allocating data in the entire vmalloc space
+ */
+#ifdef MODULES_VADDR
+ unsigned long limit = (unsigned long)_etext - SZ_32M;
+
+ BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR);
+
+ /* First try within 32M limit from _etext to avoid branch trampolines */
+ if (MODULES_VADDR < PAGE_OFFSET && MODULES_END > limit) {
+ start = limit;
+ fallback_start = MODULES_VADDR;
+ fallback_end = MODULES_END;
+ } else {
+ start = MODULES_VADDR;
+ }
+
+ end = MODULES_END;
+#else
+ start = VMALLOC_START;
+ end = VMALLOC_END;
+#endif
+
+ prealloc_execmem_pgtable();
+
+ execmem_info = (struct execmem_info){
+ .ranges = {
+ [EXECMEM_DEFAULT] = {
+ .start = start,
+ .end = end,
+ .pgprot = prot,
+ .alignment = 1,
+ .fallback_start = fallback_start,
+ .fallback_end = fallback_end,
+ },
+ [EXECMEM_KPROBES] = {
+ .start = VMALLOC_START,
+ .end = VMALLOC_END,
+ .pgprot = kprobes_prot,
+ .alignment = 1,
+ },
+ [EXECMEM_MODULE_DATA] = {
+ .start = VMALLOC_START,
+ .end = VMALLOC_END,
+ .pgprot = PAGE_KERNEL,
+ .alignment = 1,
+ },
+ },
+ };
+
+ return &execmem_info;
+}
+#endif /* CONFIG_EXECMEM */
diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c
deleted file mode 100644
index cb8bdbe4972f..000000000000
--- a/arch/powerpc/mm/mmap.c
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * flexible mmap layout support
- *
- * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- *
- * Started by Ingo Molnar <mingo@elte.hu>
- */
-
-#include <linux/personality.h>
-#include <linux/mm.h>
-#include <linux/random.h>
-#include <linux/sched.h>
-
-/*
- * Top of mmap area (just below the process stack).
- *
- * Leave at least a ~128 MB hole on 32bit applications.
- *
- * On 64bit applications we randomise the stack by 1GB so we need to
- * space our mmap start address by a further 1GB, otherwise there is a
- * chance the mmap area will end up closer to the stack than our ulimit
- * requires.
- */
-#define MIN_GAP32 (128*1024*1024)
-#define MIN_GAP64 ((128 + 1024)*1024*1024UL)
-#define MIN_GAP ((is_32bit_task()) ? MIN_GAP32 : MIN_GAP64)
-#define MAX_GAP (TASK_SIZE/6*5)
-
-static inline int mmap_is_legacy(void)
-{
- if (current->personality & ADDR_COMPAT_LAYOUT)
- return 1;
-
- if (rlimit(RLIMIT_STACK) == RLIM_INFINITY)
- return 1;
-
- return sysctl_legacy_va_layout;
-}
-
-static unsigned long mmap_rnd(void)
-{
- unsigned long rnd = 0;
-
- if (current->flags & PF_RANDOMIZE) {
- /* 8MB for 32bit, 1GB for 64bit */
- if (is_32bit_task())
- rnd = (long)(get_random_int() % (1<<(23-PAGE_SHIFT)));
- else
- rnd = (long)(get_random_int() % (1<<(30-PAGE_SHIFT)));
- }
- return rnd << PAGE_SHIFT;
-}
-
-static inline unsigned long mmap_base(void)
-{
- unsigned long gap = rlimit(RLIMIT_STACK);
-
- if (gap < MIN_GAP)
- gap = MIN_GAP;
- else if (gap > MAX_GAP)
- gap = MAX_GAP;
-
- return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd());
-}
-
-/*
- * This function, called very early during the creation of a new
- * process VM image, sets up which VM layout function to use:
- */
-void arch_pick_mmap_layout(struct mm_struct *mm)
-{
- /*
- * Fall back to the standard layout if the personality
- * bit is set, or if the expected stack growth is unlimited:
- */
- if (mmap_is_legacy()) {
- mm->mmap_base = TASK_UNMAPPED_BASE;
- mm->get_unmapped_area = arch_get_unmapped_area;
- } else {
- mm->mmap_base = mmap_base();
- mm->get_unmapped_area = arch_get_unmapped_area_topdown;
- }
-}
diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c
new file mode 100644
index 000000000000..3e3af29b4523
--- /dev/null
+++ b/arch/powerpc/mm/mmu_context.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Common implementation of switch_mm_irqs_off
+ *
+ * Copyright IBM Corp. 2017
+ */
+
+#include <linux/mm.h>
+#include <linux/cpu.h>
+#include <linux/sched/mm.h>
+
+#include <asm/mmu_context.h>
+#include <asm/pgalloc.h>
+
+#if defined(CONFIG_PPC32)
+static inline void switch_mm_pgdir(struct task_struct *tsk,
+ struct mm_struct *mm)
+{
+ /* 32-bit keeps track of the current PGDIR in the thread struct */
+ tsk->thread.pgdir = mm->pgd;
+#ifdef CONFIG_PPC_BOOK3S_32
+ tsk->thread.sr0 = mm->context.sr0;
+#endif
+#if defined(CONFIG_BOOKE) && defined(CONFIG_PPC_KUAP)
+ tsk->thread.pid = mm->context.id;
+#endif
+}
+#elif defined(CONFIG_PPC_BOOK3E_64)
+static inline void switch_mm_pgdir(struct task_struct *tsk,
+ struct mm_struct *mm)
+{
+ /* 64-bit Book3E keeps track of current PGD in the PACA */
+ get_paca()->pgd = mm->pgd;
+#ifdef CONFIG_PPC_KUAP
+ tsk->thread.pid = mm->context.id;
+#endif
+}
+#else
+static inline void switch_mm_pgdir(struct task_struct *tsk,
+ struct mm_struct *mm) { }
+#endif
+
+void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk)
+{
+ int cpu = smp_processor_id();
+ bool new_on_cpu = false;
+
+ /* Mark this context has been used on the new CPU */
+ if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
+ VM_WARN_ON_ONCE(next == &init_mm);
+ cpumask_set_cpu(cpu, mm_cpumask(next));
+ inc_mm_active_cpus(next);
+
+ /*
+ * This full barrier orders the store to the cpumask above vs
+ * a subsequent load which allows this CPU/MMU to begin loading
+ * translations for 'next' from page table PTEs into the TLB.
+ *
+ * When using the radix MMU, that operation is the load of the
+ * MMU context id, which is then moved to SPRN_PID.
+ *
+ * For the hash MMU it is either the first load from slb_cache
+ * in switch_slb() to preload the SLBs, or the load of
+ * get_user_context which loads the context for the VSID hash
+ * to insert a new SLB, in the SLB fault handler.
+ *
+ * On the other side, the barrier is in mm/tlb-radix.c for
+ * radix which orders earlier stores to clear the PTEs before
+ * the load of mm_cpumask to check which CPU TLBs should be
+ * flushed. For hash, pte_xchg to clear the PTE includes the
+ * barrier.
+ *
+ * This full barrier is also needed by membarrier when
+ * switching between processes after store to rq->curr, before
+ * user-space memory accesses.
+ */
+ smp_mb();
+
+ new_on_cpu = true;
+ }
+
+ /* Some subarchs need to track the PGD elsewhere */
+ switch_mm_pgdir(tsk, next);
+
+ /* Nothing else to do if we aren't actually switching */
+ if (prev == next)
+ return;
+
+ /*
+ * We must stop all altivec streams before changing the HW
+ * context
+ */
+ if (cpu_has_feature(CPU_FTR_ALTIVEC))
+ asm volatile (PPC_DSSALL);
+
+ if (!new_on_cpu)
+ membarrier_arch_switch_mm(prev, next, tsk);
+
+ /*
+ * The actual HW switching method differs between the various
+ * sub architectures. Out of line for now
+ */
+ switch_mmu_context(prev, next, tsk);
+
+ VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(prev)));
+}
+
+#ifndef CONFIG_PPC_BOOK3S_64
+void arch_exit_mmap(struct mm_struct *mm)
+{
+ void *frag = pte_frag_get(&mm->context);
+
+ if (frag)
+ pte_frag_destroy(frag);
+}
+#endif
diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c
deleted file mode 100644
index 178876aef40f..000000000000
--- a/arch/powerpc/mm/mmu_context_hash64.c
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- * MMU context allocation for 64-bit kernels.
- *
- * Copyright (C) 2004 Anton Blanchard, IBM Corp. <anton@samba.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- */
-
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/spinlock.h>
-#include <linux/idr.h>
-#include <linux/export.h>
-#include <linux/gfp.h>
-#include <linux/slab.h>
-
-#include <asm/mmu_context.h>
-#include <asm/pgalloc.h>
-
-#include "icswx.h"
-
-static DEFINE_SPINLOCK(mmu_context_lock);
-static DEFINE_IDA(mmu_context_ida);
-
-int __init_new_context(void)
-{
- int index;
- int err;
-
-again:
- if (!ida_pre_get(&mmu_context_ida, GFP_KERNEL))
- return -ENOMEM;
-
- spin_lock(&mmu_context_lock);
- err = ida_get_new_above(&mmu_context_ida, 1, &index);
- spin_unlock(&mmu_context_lock);
-
- if (err == -EAGAIN)
- goto again;
- else if (err)
- return err;
-
- if (index > MAX_USER_CONTEXT) {
- spin_lock(&mmu_context_lock);
- ida_remove(&mmu_context_ida, index);
- spin_unlock(&mmu_context_lock);
- return -ENOMEM;
- }
-
- return index;
-}
-EXPORT_SYMBOL_GPL(__init_new_context);
-
-int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
-{
- int index;
-
- index = __init_new_context();
- if (index < 0)
- return index;
-
- /* The old code would re-promote on fork, we don't do that
- * when using slices as it could cause problem promoting slices
- * that have been forced down to 4K
- */
- if (slice_mm_new_context(mm))
- slice_set_user_psize(mm, mmu_virtual_psize);
- subpage_prot_init_new_context(mm);
- mm->context.id = index;
-#ifdef CONFIG_PPC_ICSWX
- mm->context.cop_lockp = kmalloc(sizeof(spinlock_t), GFP_KERNEL);
- if (!mm->context.cop_lockp) {
- __destroy_context(index);
- subpage_prot_free(mm);
- mm->context.id = MMU_NO_CONTEXT;
- return -ENOMEM;
- }
- spin_lock_init(mm->context.cop_lockp);
-#endif /* CONFIG_PPC_ICSWX */
-
-#ifdef CONFIG_PPC_64K_PAGES
- mm->context.pte_frag = NULL;
-#endif
- return 0;
-}
-
-void __destroy_context(int context_id)
-{
- spin_lock(&mmu_context_lock);
- ida_remove(&mmu_context_ida, context_id);
- spin_unlock(&mmu_context_lock);
-}
-EXPORT_SYMBOL_GPL(__destroy_context);
-
-#ifdef CONFIG_PPC_64K_PAGES
-static void destroy_pagetable_page(struct mm_struct *mm)
-{
- int count;
- void *pte_frag;
- struct page *page;
-
- pte_frag = mm->context.pte_frag;
- if (!pte_frag)
- return;
-
- page = virt_to_page(pte_frag);
- /* drop all the pending references */
- count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT;
- /* We allow PTE_FRAG_NR fragments from a PTE page */
- count = atomic_sub_return(PTE_FRAG_NR - count, &page->_count);
- if (!count) {
- pgtable_page_dtor(page);
- free_hot_cold_page(page, 0);
- }
-}
-
-#else
-static inline void destroy_pagetable_page(struct mm_struct *mm)
-{
- return;
-}
-#endif
-
-
-void destroy_context(struct mm_struct *mm)
-{
-
-#ifdef CONFIG_PPC_ICSWX
- drop_cop(mm->context.acop, mm);
- kfree(mm->context.cop_lockp);
- mm->context.cop_lockp = NULL;
-#endif /* CONFIG_PPC_ICSWX */
-
- destroy_pagetable_page(mm);
- __destroy_context(mm->context.id);
- subpage_prot_free(mm);
- mm->context.id = MMU_NO_CONTEXT;
-}
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index 83eb5d5f53d5..b2d1eea09761 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Declarations of procedures and variables shared between files
* in arch/ppc/mm/.
@@ -11,53 +12,50 @@
*
* Derived from "arch/i386/mm/init.c"
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#include <linux/mm.h>
-#include <asm/tlbflush.h>
#include <asm/mmu.h>
#ifdef CONFIG_PPC_MMU_NOHASH
+#include <asm/trace.h>
/*
- * On 40x and 8xx, we directly inline tlbia and tlbivax
+ * On 8xx, we directly inline tlbia
*/
-#if defined(CONFIG_40x) || defined(CONFIG_8xx)
+#ifdef CONFIG_PPC_8xx
static inline void _tlbil_all(void)
{
asm volatile ("sync; tlbia; isync" : : : "memory");
+ trace_tlbia(MMU_NO_CONTEXT);
}
static inline void _tlbil_pid(unsigned int pid)
{
asm volatile ("sync; tlbia; isync" : : : "memory");
+ trace_tlbia(pid);
}
#define _tlbil_pid_noind(pid) _tlbil_pid(pid)
-#else /* CONFIG_40x || CONFIG_8xx */
+#else /* CONFIG_PPC_8xx */
extern void _tlbil_all(void);
extern void _tlbil_pid(unsigned int pid);
-#ifdef CONFIG_PPC_BOOK3E
+#ifdef CONFIG_PPC_BOOK3E_64
extern void _tlbil_pid_noind(unsigned int pid);
#else
#define _tlbil_pid_noind(pid) _tlbil_pid(pid)
#endif
-#endif /* !(CONFIG_40x || CONFIG_8xx) */
+#endif /* !CONFIG_PPC_8xx */
/*
* On 8xx, we directly inline tlbie, on others, it's extern
*/
-#ifdef CONFIG_8xx
+#ifdef CONFIG_PPC_8xx
static inline void _tlbil_va(unsigned long address, unsigned int pid,
unsigned int tsize, unsigned int ind)
{
asm volatile ("tlbie %0; sync" : : "r" (address) : "memory");
+ trace_tlbie(0, 0, address, pid, 0, 0, 0);
}
-#elif defined(CONFIG_PPC_BOOK3E)
+#elif defined(CONFIG_PPC_BOOK3E_64)
extern void _tlbil_va(unsigned long address, unsigned int pid,
unsigned int tsize, unsigned int ind);
#else
@@ -67,9 +65,9 @@ static inline void _tlbil_va(unsigned long address, unsigned int pid,
{
__tlbil_va(address, pid);
}
-#endif /* CONIFG_8xx */
+#endif /* CONFIG_PPC_8xx */
-#if defined(CONFIG_PPC_BOOK3E) || defined(CONFIG_PPC_47x)
+#if defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_47x)
extern void _tlbivax_bcast(unsigned long address, unsigned int pid,
unsigned int tsize, unsigned int ind);
#else
@@ -80,76 +78,62 @@ static inline void _tlbivax_bcast(unsigned long address, unsigned int pid,
}
#endif
-#else /* CONFIG_PPC_MMU_NOHASH */
-
-extern void hash_preload(struct mm_struct *mm, unsigned long ea,
- unsigned long access, unsigned long trap);
+static inline void print_system_hash_info(void) {}
+#else /* CONFIG_PPC_MMU_NOHASH */
-extern void _tlbie(unsigned long address);
-extern void _tlbia(void);
+void print_system_hash_info(void);
#endif /* CONFIG_PPC_MMU_NOHASH */
#ifdef CONFIG_PPC32
extern void mapin_ram(void);
-extern int map_page(unsigned long va, phys_addr_t pa, int flags);
extern void setbat(int index, unsigned long virt, phys_addr_t phys,
- unsigned int size, int flags);
+ unsigned int size, pgprot_t prot);
-extern int __map_without_bats;
-extern int __allow_ioremap_reserved;
-extern unsigned long ioremap_base;
-extern unsigned int rtas_data, rtas_size;
-
-struct hash_pte;
-extern struct hash_pte *Hash, *Hash_end;
-extern unsigned long Hash_size, Hash_mask;
+extern u8 early_hash[];
#endif /* CONFIG_PPC32 */
-#ifdef CONFIG_PPC64
-extern int map_kernel_page(unsigned long ea, unsigned long pa, int flags);
-#endif /* CONFIG_PPC64 */
-
-extern unsigned long ioremap_bot;
extern unsigned long __max_low_memory;
-extern phys_addr_t __initial_memory_limit_addr;
extern phys_addr_t total_memory;
extern phys_addr_t total_lowmem;
extern phys_addr_t memstart_addr;
extern phys_addr_t lowmem_end_addr;
-#ifdef CONFIG_WII
-extern unsigned long wii_hole_start;
-extern unsigned long wii_hole_size;
-
-extern unsigned long wii_mmu_mapin_mem2(unsigned long top);
-extern void wii_memory_fixups(void);
-#endif
-
/* ...and now those things that may be slightly different between processor
* architectures. -- Dan
*/
-#if defined(CONFIG_8xx)
-#define MMU_init_hw() do { } while(0)
-#define mmu_mapin_ram(top) (0UL)
-
-#elif defined(CONFIG_4xx)
+#ifdef CONFIG_PPC32
extern void MMU_init_hw(void);
-extern unsigned long mmu_mapin_ram(unsigned long top);
+void MMU_init_hw_patch(void);
+unsigned long mmu_mapin_ram(unsigned long base, unsigned long top);
+#endif
+void mmu_init_secondary(int cpu);
-#elif defined(CONFIG_PPC_FSL_BOOK3E)
-extern unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx);
-extern unsigned long calc_cam_sz(unsigned long ram, unsigned long virt,
- phys_addr_t phys);
+#ifdef CONFIG_PPC_E500
+extern unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx,
+ bool dryrun, bool init);
#ifdef CONFIG_PPC32
-extern void MMU_init_hw(void);
-extern unsigned long mmu_mapin_ram(unsigned long top);
extern void adjust_total_lowmem(void);
+extern int switch_to_as1(void);
+extern void restore_to_as0(int esel, int offset, void *dt_ptr, int bootcpu);
+void create_kaslr_tlb_entry(int entry, unsigned long virt, phys_addr_t phys);
+void reloc_kernel_entry(void *fdt, int addr);
+void relocate_init(u64 dt_ptr, phys_addr_t start);
+extern int is_second_reloc;
#endif
extern void loadcam_entry(unsigned int index);
+extern void loadcam_multi(int first_idx, int num, int tmp_idx);
+
+#ifdef CONFIG_RANDOMIZE_BASE
+void kaslr_early_init(void *dt_ptr, phys_addr_t size);
+void kaslr_late_init(void);
+#else
+static inline void kaslr_early_init(void *dt_ptr, phys_addr_t size) {}
+static inline void kaslr_late_init(void) {}
+#endif
struct tlbcam {
u32 MAS0;
@@ -158,8 +142,43 @@ struct tlbcam {
u32 MAS3;
u32 MAS7;
};
-#elif defined(CONFIG_PPC32)
-/* anything 32-bit except 4xx or 8xx */
-extern void MMU_init_hw(void);
-extern unsigned long mmu_mapin_ram(unsigned long top);
+
+#define NUM_TLBCAMS 64
+
+extern struct tlbcam TLBCAM[NUM_TLBCAMS];
+#endif
+
+#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_PPC_85xx) || defined(CONFIG_PPC_8xx)
+/* 6xx have BATS */
+/* PPC_85xx have TLBCAM */
+/* 8xx have LTLB */
+phys_addr_t v_block_mapped(unsigned long va);
+unsigned long p_block_mapped(phys_addr_t pa);
+#else
+static inline phys_addr_t v_block_mapped(unsigned long va) { return 0; }
+static inline unsigned long p_block_mapped(phys_addr_t pa) { return 0; }
+#endif
+
+#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_PPC_8xx) || defined(CONFIG_PPC_E500)
+int mmu_mark_initmem_nx(void);
+int mmu_mark_rodata_ro(void);
+#else
+static inline int mmu_mark_initmem_nx(void) { return 0; }
+static inline int mmu_mark_rodata_ro(void) { return 0; }
#endif
+
+#ifdef CONFIG_PPC_8xx
+void __init mmu_mapin_immr(void);
+#endif
+
+static inline bool debug_pagealloc_enabled_or_kfence(void)
+{
+ return IS_ENABLED(CONFIG_KFENCE) || debug_pagealloc_enabled();
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int create_section_mapping(unsigned long start, unsigned long end,
+ int nid, pgprot_t prot);
+#endif
+
+int hash__kernel_map_pages(struct page *page, int numpages, int enable);
diff --git a/arch/powerpc/mm/44x_mmu.c b/arch/powerpc/mm/nohash/44x.c
index 82b1ff759e26..6d10c6d8be71 100644
--- a/arch/powerpc/mm/44x_mmu.c
+++ b/arch/powerpc/mm/nohash/44x.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Modifications by Matt Porter (mporter@mvista.com) to support
* PPC44x Book E processors.
@@ -15,12 +16,6 @@
*
* Derived from "arch/i386/mm/init.c"
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#include <linux/init.h>
@@ -29,8 +24,10 @@
#include <asm/mmu.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
+#include <asm/text-patching.h>
+#include <asm/smp.h>
-#include "mmu_decl.h"
+#include <mm/mmu_decl.h>
/* Used by the 44x TLB replacement exception handler.
* Just needed it declared someplace.
@@ -41,24 +38,15 @@ int icache_44x_need_flush;
unsigned long tlb_47x_boltmap[1024/8];
-static void ppc44x_update_tlb_hwater(void)
+static void __init ppc44x_update_tlb_hwater(void)
{
- extern unsigned int tlb_44x_patch_hwater_D[];
- extern unsigned int tlb_44x_patch_hwater_I[];
-
/* The TLB miss handlers hard codes the watermark in a cmpli
* instruction to improve performances rather than loading it
* from the global variable. Thus, we patch the instructions
* in the 2 TLB miss handlers when updating the value
*/
- tlb_44x_patch_hwater_D[0] = (tlb_44x_patch_hwater_D[0] & 0xffff0000) |
- tlb_44x_hwater;
- flush_icache_range((unsigned long)&tlb_44x_patch_hwater_D[0],
- (unsigned long)&tlb_44x_patch_hwater_D[1]);
- tlb_44x_patch_hwater_I[0] = (tlb_44x_patch_hwater_I[0] & 0xffff0000) |
- tlb_44x_hwater;
- flush_icache_range((unsigned long)&tlb_44x_patch_hwater_I[0],
- (unsigned long)&tlb_44x_patch_hwater_I[1]);
+ modify_instruction_site(&patch__tlb_44x_hwater_D, 0xffff, tlb_44x_hwater);
+ modify_instruction_site(&patch__tlb_44x_hwater_I, 0xffff, tlb_44x_hwater);
}
/*
@@ -134,7 +122,7 @@ static void __init ppc47x_update_boltmap(void)
/*
* "Pins" a 256MB TLB entry in AS0 for kernel lowmem for 47x type MMU
*/
-static void ppc47x_pin_tlb(unsigned int virt, unsigned int phys)
+static void __init ppc47x_pin_tlb(unsigned int virt, unsigned int phys)
{
unsigned int rA;
int bolted;
@@ -178,7 +166,7 @@ void __init MMU_init_hw(void)
flush_instruction_cache();
}
-unsigned long __init mmu_mapin_ram(unsigned long top)
+unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top)
{
unsigned long addr;
unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1);
@@ -229,7 +217,7 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base,
}
#ifdef CONFIG_SMP
-void mmu_init_secondary(int cpu)
+void __init mmu_init_secondary(int cpu)
{
unsigned long addr;
unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1);
diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c
new file mode 100644
index 000000000000..ab1505cf42bf
--- /dev/null
+++ b/arch/powerpc/mm/nohash/8xx.c
@@ -0,0 +1,224 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * This file contains the routines for initializing the MMU
+ * on the 8xx series of chips.
+ * -- christophe
+ *
+ * Derived from arch/powerpc/mm/40x_mmu.c:
+ */
+
+#include <linux/memblock.h>
+#include <linux/hugetlb.h>
+
+#include <asm/fixmap.h>
+#include <asm/pgalloc.h>
+
+#include <mm/mmu_decl.h>
+
+#define IMMR_SIZE (FIX_IMMR_SIZE << PAGE_SHIFT)
+
+static unsigned long block_mapped_ram;
+
+/*
+ * Return PA for this VA if it is in an area mapped with LTLBs or fixmap.
+ * Otherwise, returns 0
+ */
+phys_addr_t v_block_mapped(unsigned long va)
+{
+ unsigned long p = PHYS_IMMR_BASE;
+
+ if (va >= VIRT_IMMR_BASE && va < VIRT_IMMR_BASE + IMMR_SIZE)
+ return p + va - VIRT_IMMR_BASE;
+ if (va >= PAGE_OFFSET && va < PAGE_OFFSET + block_mapped_ram)
+ return __pa(va);
+ return 0;
+}
+
+/*
+ * Return VA for a given PA mapped with LTLBs or fixmap
+ * Return 0 if not mapped
+ */
+unsigned long p_block_mapped(phys_addr_t pa)
+{
+ unsigned long p = PHYS_IMMR_BASE;
+
+ if (pa >= p && pa < p + IMMR_SIZE)
+ return VIRT_IMMR_BASE + pa - p;
+ if (pa < block_mapped_ram)
+ return (unsigned long)__va(pa);
+ return 0;
+}
+
+static int __ref __early_map_kernel_hugepage(unsigned long va, phys_addr_t pa,
+ pgprot_t prot, int psize, bool new)
+{
+ pmd_t *pmdp = pmd_off_k(va);
+ pte_t *ptep;
+ unsigned int shift = mmu_psize_to_shift(psize);
+
+ if (new) {
+ if (WARN_ON(slab_is_available()))
+ return -EINVAL;
+
+ if (psize == MMU_PAGE_8M) {
+ if (WARN_ON(!pmd_none(*pmdp) || !pmd_none(*(pmdp + 1))))
+ return -EINVAL;
+
+ ptep = early_alloc_pgtable(PTE_FRAG_SIZE);
+ pmd_populate_kernel(&init_mm, pmdp, ptep);
+
+ ptep = early_alloc_pgtable(PTE_FRAG_SIZE);
+ pmd_populate_kernel(&init_mm, pmdp + 1, ptep);
+
+ ptep = (pte_t *)pmdp;
+ } else {
+ ptep = early_pte_alloc_kernel(pmdp, va);
+ /* The PTE should never be already present */
+ if (WARN_ON(pte_present(*ptep) && pgprot_val(prot)))
+ return -EINVAL;
+ }
+ } else {
+ if (psize == MMU_PAGE_8M)
+ ptep = (pte_t *)pmdp;
+ else
+ ptep = pte_offset_kernel(pmdp, va);
+ }
+
+ if (WARN_ON(!ptep))
+ return -ENOMEM;
+
+ set_huge_pte_at(&init_mm, va, ptep,
+ arch_make_huge_pte(pfn_pte(pa >> PAGE_SHIFT, prot), shift, 0),
+ 1UL << shift);
+
+ return 0;
+}
+
+/*
+ * MMU_init_hw does the chip-specific initialization of the MMU hardware.
+ */
+void __init MMU_init_hw(void)
+{
+}
+
+static bool immr_is_mapped __initdata;
+
+void __init mmu_mapin_immr(void)
+{
+ if (immr_is_mapped)
+ return;
+
+ immr_is_mapped = true;
+
+ __early_map_kernel_hugepage(VIRT_IMMR_BASE, PHYS_IMMR_BASE,
+ PAGE_KERNEL_NCG, MMU_PAGE_512K, true);
+}
+
+static int mmu_mapin_ram_chunk(unsigned long offset, unsigned long top,
+ pgprot_t prot, bool new)
+{
+ unsigned long v = PAGE_OFFSET + offset;
+ unsigned long p = offset;
+ int err = 0;
+
+ WARN_ON(!IS_ALIGNED(offset, SZ_16K) || !IS_ALIGNED(top, SZ_16K));
+
+ for (; p < ALIGN(p, SZ_512K) && p < top && !err; p += SZ_16K, v += SZ_16K)
+ err = __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_16K, new);
+ for (; p < ALIGN(p, SZ_8M) && p < top && !err; p += SZ_512K, v += SZ_512K)
+ err = __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_512K, new);
+ for (; p < ALIGN_DOWN(top, SZ_8M) && p < top && !err; p += SZ_8M, v += SZ_8M)
+ err = __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_8M, new);
+ for (; p < ALIGN_DOWN(top, SZ_512K) && p < top && !err; p += SZ_512K, v += SZ_512K)
+ err = __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_512K, new);
+ for (; p < ALIGN_DOWN(top, SZ_16K) && p < top && !err; p += SZ_16K, v += SZ_16K)
+ err = __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_16K, new);
+
+ if (!new)
+ flush_tlb_kernel_range(PAGE_OFFSET + v, PAGE_OFFSET + top);
+
+ return err;
+}
+
+unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top)
+{
+ unsigned long etext8 = ALIGN(__pa(_etext), SZ_8M);
+ unsigned long sinittext = __pa(_sinittext);
+ bool strict_boundary = strict_kernel_rwx_enabled() || debug_pagealloc_enabled_or_kfence();
+ unsigned long boundary = strict_boundary ? sinittext : etext8;
+ unsigned long einittext8 = ALIGN(__pa(_einittext), SZ_8M);
+
+ WARN_ON(top < einittext8);
+
+ mmu_mapin_immr();
+
+ mmu_mapin_ram_chunk(0, boundary, PAGE_KERNEL_X, true);
+ if (debug_pagealloc_enabled_or_kfence()) {
+ top = boundary;
+ } else {
+ mmu_mapin_ram_chunk(boundary, einittext8, PAGE_KERNEL_X, true);
+ mmu_mapin_ram_chunk(einittext8, top, PAGE_KERNEL, true);
+ }
+
+ if (top > SZ_32M)
+ memblock_set_current_limit(top);
+
+ block_mapped_ram = top;
+
+ return top;
+}
+
+int mmu_mark_initmem_nx(void)
+{
+ unsigned long etext8 = ALIGN(__pa(_etext), SZ_8M);
+ unsigned long sinittext = __pa(_sinittext);
+ unsigned long boundary = strict_kernel_rwx_enabled() ? sinittext : etext8;
+ unsigned long einittext8 = ALIGN(__pa(_einittext), SZ_8M);
+ int err = 0;
+
+ if (!debug_pagealloc_enabled_or_kfence())
+ err = mmu_mapin_ram_chunk(boundary, einittext8, PAGE_KERNEL, false);
+
+ if (IS_ENABLED(CONFIG_PIN_TLB_TEXT))
+ mmu_pin_tlb(block_mapped_ram, false);
+
+ return err;
+}
+
+#ifdef CONFIG_STRICT_KERNEL_RWX
+int mmu_mark_rodata_ro(void)
+{
+ unsigned long sinittext = __pa(_sinittext);
+ int err;
+
+ err = mmu_mapin_ram_chunk(0, sinittext, PAGE_KERNEL_ROX, false);
+ if (IS_ENABLED(CONFIG_PIN_TLB_DATA))
+ mmu_pin_tlb(block_mapped_ram, true);
+
+ return err;
+}
+#endif
+
+void __init setup_initial_memory_limit(phys_addr_t first_memblock_base,
+ phys_addr_t first_memblock_size)
+{
+ /* We don't currently support the first MEMBLOCK not mapping 0
+ * physical on those processors
+ */
+ BUG_ON(first_memblock_base != 0);
+
+ /* 8xx can only access 32MB at the moment */
+ memblock_set_current_limit(min_t(u64, first_memblock_size, SZ_32M));
+
+ BUILD_BUG_ON(ALIGN_DOWN(MODULES_VADDR, PGDIR_SIZE) < TASK_SIZE);
+}
+
+int pud_clear_huge(pud_t *pud)
+{
+ return 0;
+}
+
+int pmd_clear_huge(pmd_t *pmd)
+{
+ return 0;
+}
diff --git a/arch/powerpc/mm/nohash/Makefile b/arch/powerpc/mm/nohash/Makefile
new file mode 100644
index 000000000000..cf60c776c883
--- /dev/null
+++ b/arch/powerpc/mm/nohash/Makefile
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-y += mmu_context.o tlb.o tlb_low.o kup.o
+obj-$(CONFIG_PPC_BOOK3E_64) += tlb_64e.o tlb_low_64e.o book3e_pgtable.o
+obj-$(CONFIG_44x) += 44x.o
+obj-$(CONFIG_PPC_8xx) += 8xx.o
+obj-$(CONFIG_PPC_E500) += e500.o
+obj-$(CONFIG_RANDOMIZE_BASE) += kaslr_booke.o
+ifdef CONFIG_HUGETLB_PAGE
+obj-$(CONFIG_PPC_E500) += e500_hugetlbpage.o
+endif
+
+# Disable kcov instrumentation on sensitive code
+# This is necessary for booting with kcov enabled on book3e machines
+KCOV_INSTRUMENT_tlb.o := n
+KCOV_INSTRUMENT_e500.o := n
diff --git a/arch/powerpc/mm/nohash/book3e_pgtable.c b/arch/powerpc/mm/nohash/book3e_pgtable.c
new file mode 100644
index 000000000000..062e8785c1bb
--- /dev/null
+++ b/arch/powerpc/mm/nohash/book3e_pgtable.c
@@ -0,0 +1,132 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2005, Paul Mackerras, IBM Corporation.
+ * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation.
+ * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
+ */
+
+#include <linux/sched.h>
+#include <linux/memblock.h>
+#include <asm/pgalloc.h>
+#include <asm/tlb.h>
+#include <asm/dma.h>
+#include <asm/text-patching.h>
+
+#include <mm/mmu_decl.h>
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+/*
+ * On Book3E CPUs, the vmemmap is currently mapped in the top half of
+ * the vmalloc space using normal page tables, though the size of
+ * pages encoded in the PTEs can be different
+ */
+int __meminit vmemmap_create_mapping(unsigned long start,
+ unsigned long page_size,
+ unsigned long phys)
+{
+ /* Create a PTE encoding without page size */
+ unsigned long i, flags = _PAGE_PRESENT | _PAGE_ACCESSED |
+ _PAGE_KERNEL_RW;
+
+ /* PTEs only contain page size encodings up to 32M */
+ BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].shift - 10 > 0xf);
+
+ /* Encode the size in the PTE */
+ flags |= (mmu_psize_defs[mmu_vmemmap_psize].shift - 10) << 8;
+
+ /* For each PTE for that area, map things. Note that we don't
+ * increment phys because all PTEs are of the large size and
+ * thus must have the low bits clear
+ */
+ for (i = 0; i < page_size; i += PAGE_SIZE)
+ BUG_ON(map_kernel_page(start + i, phys, __pgprot(flags)));
+
+ return 0;
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+void vmemmap_remove_mapping(unsigned long start,
+ unsigned long page_size)
+{
+}
+#endif
+#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+
+static void __init *early_alloc_pgtable(unsigned long size)
+{
+ void *ptr;
+
+ ptr = memblock_alloc_try_nid(size, size, MEMBLOCK_LOW_LIMIT,
+ __pa(MAX_DMA_ADDRESS), NUMA_NO_NODE);
+
+ if (!ptr)
+ panic("%s: Failed to allocate %lu bytes align=0x%lx max_addr=%lx\n",
+ __func__, size, size, __pa(MAX_DMA_ADDRESS));
+
+ return ptr;
+}
+
+/*
+ * map_kernel_page currently only called by __ioremap
+ * map_kernel_page adds an entry to the ioremap page table
+ * and adds an entry to the HPT, possibly bolting it
+ */
+int __ref map_kernel_page(unsigned long ea, phys_addr_t pa, pgprot_t prot)
+{
+ pgd_t *pgdp;
+ p4d_t *p4dp;
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t *ptep;
+
+ BUILD_BUG_ON(TASK_SIZE_USER64 > PGTABLE_RANGE);
+ if (slab_is_available()) {
+ pgdp = pgd_offset_k(ea);
+ p4dp = p4d_offset(pgdp, ea);
+ pudp = pud_alloc(&init_mm, p4dp, ea);
+ if (!pudp)
+ return -ENOMEM;
+ pmdp = pmd_alloc(&init_mm, pudp, ea);
+ if (!pmdp)
+ return -ENOMEM;
+ ptep = pte_alloc_kernel(pmdp, ea);
+ if (!ptep)
+ return -ENOMEM;
+ } else {
+ pgdp = pgd_offset_k(ea);
+ p4dp = p4d_offset(pgdp, ea);
+ if (p4d_none(*p4dp)) {
+ pudp = early_alloc_pgtable(PUD_TABLE_SIZE);
+ p4d_populate(&init_mm, p4dp, pudp);
+ }
+ pudp = pud_offset(p4dp, ea);
+ if (pud_none(*pudp)) {
+ pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
+ pud_populate(&init_mm, pudp, pmdp);
+ }
+ pmdp = pmd_offset(pudp, ea);
+ if (!pmd_present(*pmdp)) {
+ ptep = early_alloc_pgtable(PTE_TABLE_SIZE);
+ pmd_populate_kernel(&init_mm, pmdp, ptep);
+ }
+ ptep = pte_offset_kernel(pmdp, ea);
+ }
+ set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot));
+
+ smp_wmb();
+ return 0;
+}
+
+void __patch_exception(int exc, unsigned long addr)
+{
+ unsigned int *ibase = &interrupt_base_book3e;
+
+ /*
+ * Our exceptions vectors start with a NOP and -then- a branch
+ * to deal with single stepping from userspace which stops on
+ * the second instruction. Thus we need to patch the second
+ * instruction of the exception, not the first one.
+ */
+
+ patch_branch(ibase + (exc / 4) + 1, addr, 0);
+}
diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/nohash/e500.c
index 07ba45b0f07c..266fb22131fc 100644
--- a/arch/powerpc/mm/fsl_booke_mmu.c
+++ b/arch/powerpc/mm/nohash/e500.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Modifications by Kumar Gala (galak@kernel.crashing.org) to support
* E500 Book E processors.
@@ -17,12 +18,6 @@
*
* Derived from "arch/i386/mm/init.c"
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#include <linux/signal.h>
@@ -41,42 +36,34 @@
#include <linux/delay.h>
#include <linux/highmem.h>
#include <linux/memblock.h>
+#include <linux/of_fdt.h>
-#include <asm/pgalloc.h>
-#include <asm/prom.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
-#include <asm/pgtable.h>
#include <asm/mmu.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/smp.h>
#include <asm/machdep.h>
#include <asm/setup.h>
+#include <asm/paca.h>
-#include "mmu_decl.h"
+#include <mm/mmu_decl.h>
unsigned int tlbcam_index;
-#define NUM_TLBCAMS (64)
struct tlbcam TLBCAM[NUM_TLBCAMS];
-struct tlbcamrange {
+static struct {
unsigned long start;
unsigned long limit;
phys_addr_t phys;
} tlbcam_addrs[NUM_TLBCAMS];
-extern unsigned int tlbcam_index;
-
-unsigned long tlbcam_sz(int idx)
-{
- return tlbcam_addrs[idx].limit - tlbcam_addrs[idx].start + 1;
-}
-
+#ifdef CONFIG_PPC_85xx
/*
* Return PA for this VA if it is mapped by a CAM, or 0
*/
-phys_addr_t v_mapped_by_tlbcam(unsigned long va)
+phys_addr_t v_block_mapped(unsigned long va)
{
int b;
for (b = 0; b < tlbcam_index; ++b)
@@ -88,7 +75,7 @@ phys_addr_t v_mapped_by_tlbcam(unsigned long va)
/*
* Return VA for a given PA or 0 if not mapped
*/
-unsigned long p_mapped_by_tlbcam(phys_addr_t pa)
+unsigned long p_block_mapped(phys_addr_t pa)
{
int b;
for (b = 0; b < tlbcam_index; ++b)
@@ -98,6 +85,7 @@ unsigned long p_mapped_by_tlbcam(phys_addr_t pa)
return tlbcam_addrs[b].start+(pa-tlbcam_addrs[b].phys);
return 0;
}
+#endif
/*
* Set up a variable-size TLB entry (tlbcam). The parameters are not checked;
@@ -113,7 +101,7 @@ static void settlbcam(int index, unsigned long virt, phys_addr_t phys,
tsize = __ilog2(size) - 10;
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) || defined(CONFIG_PPC_E500MC)
if ((flags & _PAGE_NO_CACHE) == 0)
flags |= _PAGE_COHERENT;
#endif
@@ -128,26 +116,27 @@ static void settlbcam(int index, unsigned long virt, phys_addr_t phys,
TLBCAM[index].MAS2 |= (flags & _PAGE_GUARDED) ? MAS2_G : 0;
TLBCAM[index].MAS2 |= (flags & _PAGE_ENDIAN) ? MAS2_E : 0;
- TLBCAM[index].MAS3 = (phys & MAS3_RPN) | MAS3_SX | MAS3_SR;
- TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_SW : 0);
+ TLBCAM[index].MAS3 = (phys & MAS3_RPN) | MAS3_SR;
+ TLBCAM[index].MAS3 |= (flags & _PAGE_WRITE) ? MAS3_SW : 0;
if (mmu_has_feature(MMU_FTR_BIG_PHYS))
TLBCAM[index].MAS7 = (u64)phys >> 32;
/* Below is unlikely -- only for large user pages or similar */
- if (pte_user(flags)) {
- TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR;
- TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_UW : 0);
+ if (!is_kernel_addr(virt)) {
+ TLBCAM[index].MAS3 |= MAS3_UR;
+ TLBCAM[index].MAS3 |= (flags & _PAGE_EXEC) ? MAS3_UX : 0;
+ TLBCAM[index].MAS3 |= (flags & _PAGE_WRITE) ? MAS3_UW : 0;
+ } else {
+ TLBCAM[index].MAS3 |= (flags & _PAGE_EXEC) ? MAS3_SX : 0;
}
tlbcam_addrs[index].start = virt;
tlbcam_addrs[index].limit = virt + size - 1;
tlbcam_addrs[index].phys = phys;
-
- loadcam_entry(index);
}
-unsigned long calc_cam_sz(unsigned long ram, unsigned long virt,
- phys_addr_t phys)
+static unsigned long calc_cam_sz(unsigned long ram, unsigned long virt,
+ phys_addr_t phys)
{
unsigned int camsize = __ilog2(ram);
unsigned int align = __ffs(virt | phys);
@@ -171,41 +160,96 @@ unsigned long calc_cam_sz(unsigned long ram, unsigned long virt,
return 1UL << camsize;
}
-unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx)
+static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt,
+ unsigned long ram, int max_cam_idx,
+ bool dryrun, bool init)
{
int i;
- unsigned long virt = PAGE_OFFSET;
- phys_addr_t phys = memstart_addr;
unsigned long amount_mapped = 0;
+ unsigned long boundary;
+
+ if (strict_kernel_rwx_enabled())
+ boundary = (unsigned long)(_sinittext - _stext);
+ else
+ boundary = ram;
/* Calculate CAM values */
- for (i = 0; ram && i < max_cam_idx; i++) {
+ for (i = 0; boundary && i < max_cam_idx; i++) {
+ unsigned long cam_sz;
+ pgprot_t prot = init ? PAGE_KERNEL_X : PAGE_KERNEL_ROX;
+
+ cam_sz = calc_cam_sz(boundary, virt, phys);
+ if (!dryrun)
+ settlbcam(i, virt, phys, cam_sz, pgprot_val(prot), 0);
+
+ boundary -= cam_sz;
+ amount_mapped += cam_sz;
+ virt += cam_sz;
+ phys += cam_sz;
+ }
+ for (ram -= amount_mapped; ram && i < max_cam_idx; i++) {
unsigned long cam_sz;
+ pgprot_t prot = init ? PAGE_KERNEL_X : PAGE_KERNEL;
cam_sz = calc_cam_sz(ram, virt, phys);
- settlbcam(i, virt, phys, cam_sz, PAGE_KERNEL_X, 0);
+ if (!dryrun)
+ settlbcam(i, virt, phys, cam_sz, pgprot_val(prot), 0);
ram -= cam_sz;
amount_mapped += cam_sz;
virt += cam_sz;
phys += cam_sz;
}
- tlbcam_index = i;
+
+ if (dryrun)
+ return amount_mapped;
+
+ if (init) {
+ loadcam_multi(0, i, max_cam_idx);
+ tlbcam_index = i;
+ } else {
+ loadcam_multi(0, i, 0);
+ WARN_ON(i > tlbcam_index);
+ }
+
+#ifdef CONFIG_PPC64
+ get_paca()->tcd.esel_next = i;
+ get_paca()->tcd.esel_max = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY;
+ get_paca()->tcd.esel_first = i;
+#endif
return amount_mapped;
}
+unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx, bool dryrun, bool init)
+{
+ unsigned long virt = PAGE_OFFSET;
+ phys_addr_t phys = memstart_addr;
+
+ return map_mem_in_cams_addr(phys, virt, ram, max_cam_idx, dryrun, init);
+}
+
#ifdef CONFIG_PPC32
#if defined(CONFIG_LOWMEM_CAM_NUM_BOOL) && (CONFIG_LOWMEM_CAM_NUM >= NUM_TLBCAMS)
#error "LOWMEM_CAM_NUM must be less than NUM_TLBCAMS"
#endif
-unsigned long __init mmu_mapin_ram(unsigned long top)
+unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top)
{
return tlbcam_addrs[tlbcam_index - 1].limit - PAGE_OFFSET + 1;
}
+void flush_instruction_cache(void)
+{
+ unsigned long tmp;
+
+ tmp = mfspr(SPRN_L1CSR1);
+ tmp |= L1CSR1_ICFI | L1CSR1_ICLFR;
+ mtspr(SPRN_L1CSR1, tmp);
+ isync();
+}
+
/*
* MMU_init_hw does the chip-specific initialization of the MMU hardware.
*/
@@ -214,6 +258,11 @@ void __init MMU_init_hw(void)
flush_instruction_cache();
}
+static unsigned long __init tlbcam_sz(int idx)
+{
+ return tlbcam_addrs[idx].limit - tlbcam_addrs[idx].start + 1;
+}
+
void __init adjust_total_lowmem(void)
{
unsigned long ram;
@@ -222,7 +271,9 @@ void __init adjust_total_lowmem(void)
/* adjust lowmem size to __max_low_memory */
ram = min((phys_addr_t)__max_low_memory, (phys_addr_t)total_lowmem);
- __max_low_memory = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM);
+ i = switch_to_as1();
+ __max_low_memory = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, false, true);
+ restore_to_as0(i, 0, NULL, 1);
pr_info("Memory CAM mapping: ");
for (i = 0; i < tlbcam_index - 1; i++)
@@ -233,6 +284,26 @@ void __init adjust_total_lowmem(void)
memblock_set_current_limit(memstart_addr + __max_low_memory);
}
+#ifdef CONFIG_STRICT_KERNEL_RWX
+int mmu_mark_rodata_ro(void)
+{
+ unsigned long remapped;
+
+ remapped = map_mem_in_cams(__max_low_memory, CONFIG_LOWMEM_CAM_NUM, false, false);
+
+ if (WARN_ON(__max_low_memory != remapped))
+ return -EINVAL;
+
+ return 0;
+}
+#endif
+
+int mmu_mark_initmem_nx(void)
+{
+ /* Everything is done in mmu_mark_rodata_ro() */
+ return 0;
+}
+
void setup_initial_memory_limit(phys_addr_t first_memblock_base,
phys_addr_t first_memblock_size)
{
@@ -241,4 +312,68 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base,
/* 64M mapped initially according to head_fsl_booke.S */
memblock_set_current_limit(min_t(u64, limit, 0x04000000));
}
+
+#ifdef CONFIG_RELOCATABLE
+int __initdata is_second_reloc;
+notrace void __init relocate_init(u64 dt_ptr, phys_addr_t start)
+{
+ unsigned long base = kernstart_virt_addr;
+ phys_addr_t size;
+
+ kernstart_addr = start;
+ if (is_second_reloc) {
+ virt_phys_offset = PAGE_OFFSET - memstart_addr;
+ kaslr_late_init();
+ return;
+ }
+
+ /*
+ * Relocatable kernel support based on processing of dynamic
+ * relocation entries. Before we get the real memstart_addr,
+ * We will compute the virt_phys_offset like this:
+ * virt_phys_offset = stext.run - kernstart_addr
+ *
+ * stext.run = (KERNELBASE & ~0x3ffffff) +
+ * (kernstart_addr & 0x3ffffff)
+ * When we relocate, we have :
+ *
+ * (kernstart_addr & 0x3ffffff) = (stext.run & 0x3ffffff)
+ *
+ * hence:
+ * virt_phys_offset = (KERNELBASE & ~0x3ffffff) -
+ * (kernstart_addr & ~0x3ffffff)
+ *
+ */
+ start &= ~0x3ffffff;
+ base &= ~0x3ffffff;
+ virt_phys_offset = base - start;
+ early_get_first_memblock_info(__va(dt_ptr), &size);
+ /*
+ * We now get the memstart_addr, then we should check if this
+ * address is the same as what the PAGE_OFFSET map to now. If
+ * not we have to change the map of PAGE_OFFSET to memstart_addr
+ * and do a second relocation.
+ */
+ if (start != memstart_addr) {
+ int n;
+ long offset = start - memstart_addr;
+
+ is_second_reloc = 1;
+ n = switch_to_as1();
+ /* map a 64M area for the second relocation */
+ if (memstart_addr > start)
+ map_mem_in_cams(0x4000000, CONFIG_LOWMEM_CAM_NUM,
+ false, true);
+ else
+ map_mem_in_cams_addr(start, PAGE_OFFSET + offset,
+ 0x4000000, CONFIG_LOWMEM_CAM_NUM,
+ false, true);
+ restore_to_as0(n, offset, __va(dt_ptr), 1);
+ /* We should never reach here */
+ panic("Relocation error");
+ }
+
+ kaslr_early_init(__va(dt_ptr), size);
+}
+#endif
#endif
diff --git a/arch/powerpc/mm/nohash/e500_hugetlbpage.c b/arch/powerpc/mm/nohash/e500_hugetlbpage.c
new file mode 100644
index 000000000000..a134d28a0e4d
--- /dev/null
+++ b/arch/powerpc/mm/nohash/e500_hugetlbpage.c
@@ -0,0 +1,193 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * PPC Huge TLB Page Support for Book3E MMU
+ *
+ * Copyright (C) 2009 David Gibson, IBM Corporation.
+ * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
+ *
+ */
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+
+#include <asm/mmu.h>
+
+#ifdef CONFIG_PPC64
+#include <asm/paca.h>
+
+static inline int tlb1_next(void)
+{
+ struct paca_struct *paca = get_paca();
+ struct tlb_core_data *tcd;
+ int this, next;
+
+ tcd = paca->tcd_ptr;
+ this = tcd->esel_next;
+
+ next = this + 1;
+ if (next >= tcd->esel_max)
+ next = tcd->esel_first;
+
+ tcd->esel_next = next;
+ return this;
+}
+
+static inline void book3e_tlb_lock(void)
+{
+ struct paca_struct *paca = get_paca();
+ unsigned long tmp;
+ int token = smp_processor_id() + 1;
+
+ /*
+ * Besides being unnecessary in the absence of SMT, this
+ * check prevents trying to do lbarx/stbcx. on e5500 which
+ * doesn't implement either feature.
+ */
+ if (!cpu_has_feature(CPU_FTR_SMT))
+ return;
+
+ asm volatile(".machine push;"
+ ".machine e6500;"
+ "1: lbarx %0, 0, %1;"
+ "cmpwi %0, 0;"
+ "bne 2f;"
+ "stbcx. %2, 0, %1;"
+ "bne 1b;"
+ "b 3f;"
+ "2: lbzx %0, 0, %1;"
+ "cmpwi %0, 0;"
+ "bne 2b;"
+ "b 1b;"
+ "3:"
+ ".machine pop;"
+ : "=&r" (tmp)
+ : "r" (&paca->tcd_ptr->lock), "r" (token)
+ : "memory");
+}
+
+static inline void book3e_tlb_unlock(void)
+{
+ struct paca_struct *paca = get_paca();
+
+ if (!cpu_has_feature(CPU_FTR_SMT))
+ return;
+
+ isync();
+ paca->tcd_ptr->lock = 0;
+}
+#else
+static inline int tlb1_next(void)
+{
+ int index, ncams;
+
+ ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY;
+
+ index = this_cpu_read(next_tlbcam_idx);
+
+ /* Just round-robin the entries and wrap when we hit the end */
+ if (unlikely(index == ncams - 1))
+ __this_cpu_write(next_tlbcam_idx, tlbcam_index);
+ else
+ __this_cpu_inc(next_tlbcam_idx);
+
+ return index;
+}
+
+static inline void book3e_tlb_lock(void)
+{
+}
+
+static inline void book3e_tlb_unlock(void)
+{
+}
+#endif
+
+static inline int book3e_tlb_exists(unsigned long ea, unsigned long pid)
+{
+ int found = 0;
+
+ mtspr(SPRN_MAS6, pid << 16);
+ asm volatile(
+ "tlbsx 0,%1\n"
+ "mfspr %0,0x271\n"
+ "srwi %0,%0,31\n"
+ : "=&r"(found) : "r"(ea));
+
+ return found;
+}
+
+static void
+book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, pte_t pte)
+{
+ unsigned long mas1, mas2;
+ u64 mas7_3;
+ unsigned long psize, tsize, shift;
+ unsigned long flags;
+ struct mm_struct *mm;
+ int index;
+
+ if (unlikely(is_kernel_addr(ea)))
+ return;
+
+ mm = vma->vm_mm;
+
+ psize = vma_mmu_pagesize(vma);
+ shift = __ilog2(psize);
+ tsize = shift - 10;
+ /*
+ * We can't be interrupted while we're setting up the MAS
+ * registers or after we've confirmed that no tlb exists.
+ */
+ local_irq_save(flags);
+
+ book3e_tlb_lock();
+
+ if (unlikely(book3e_tlb_exists(ea, mm->context.id))) {
+ book3e_tlb_unlock();
+ local_irq_restore(flags);
+ return;
+ }
+
+ /* We have to use the CAM(TLB1) on FSL parts for hugepages */
+ index = tlb1_next();
+ mtspr(SPRN_MAS0, MAS0_ESEL(index) | MAS0_TLBSEL(1));
+
+ mas1 = MAS1_VALID | MAS1_TID(mm->context.id) | MAS1_TSIZE(tsize);
+ mas2 = ea & ~((1UL << shift) - 1);
+ mas2 |= (pte_val(pte) >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK;
+ mas7_3 = (u64)pte_pfn(pte) << PAGE_SHIFT;
+ mas7_3 |= (pte_val(pte) >> PTE_BAP_SHIFT) & MAS3_BAP_MASK;
+ if (!pte_dirty(pte))
+ mas7_3 &= ~(MAS3_SW|MAS3_UW);
+
+ mtspr(SPRN_MAS1, mas1);
+ mtspr(SPRN_MAS2, mas2);
+
+ if (mmu_has_feature(MMU_FTR_BIG_PHYS))
+ mtspr(SPRN_MAS7, upper_32_bits(mas7_3));
+ mtspr(SPRN_MAS3, lower_32_bits(mas7_3));
+
+ asm volatile ("tlbwe");
+
+ book3e_tlb_unlock();
+ local_irq_restore(flags);
+}
+
+/*
+ * This is called at the end of handling a user page fault, when the
+ * fault has been handled by updating a PTE in the linux page tables.
+ *
+ * This must always be called with the pte lock held.
+ */
+void __update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep)
+{
+ if (is_vm_hugetlb_page(vma))
+ book3e_hugetlb_preload(vma, address, *ptep);
+}
+
+void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+ struct hstate *hstate = hstate_file(vma->vm_file);
+ unsigned long tsize = huge_page_shift(hstate) - 10;
+
+ __flush_tlb_page(vma->vm_mm, vmaddr, tsize, 0);
+}
diff --git a/arch/powerpc/mm/nohash/kaslr_booke.c b/arch/powerpc/mm/nohash/kaslr_booke.c
new file mode 100644
index 000000000000..5e4897daaaea
--- /dev/null
+++ b/arch/powerpc/mm/nohash/kaslr_booke.c
@@ -0,0 +1,395 @@
+// SPDX-License-Identifier: GPL-2.0-only
+//
+// Copyright (C) 2019 Jason Yan <yanaijie@huawei.com>
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/stddef.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/memblock.h>
+#include <linux/libfdt.h>
+#include <linux/crash_reserve.h>
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <asm/cacheflush.h>
+#include <asm/kdump.h>
+#include <mm/mmu_decl.h>
+
+struct regions {
+ unsigned long pa_start;
+ unsigned long pa_end;
+ unsigned long kernel_size;
+ unsigned long dtb_start;
+ unsigned long dtb_end;
+ unsigned long initrd_start;
+ unsigned long initrd_end;
+ unsigned long crash_start;
+ unsigned long crash_end;
+ int reserved_mem;
+ int reserved_mem_addr_cells;
+ int reserved_mem_size_cells;
+};
+
+struct regions __initdata regions;
+
+static __init void kaslr_get_cmdline(void *fdt)
+{
+ early_init_dt_scan_chosen(boot_command_line);
+}
+
+static unsigned long __init rotate_xor(unsigned long hash, const void *area,
+ size_t size)
+{
+ size_t i;
+ const unsigned long *ptr = area;
+
+ for (i = 0; i < size / sizeof(hash); i++) {
+ /* Rotate by odd number of bits and XOR. */
+ hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7);
+ hash ^= ptr[i];
+ }
+
+ return hash;
+}
+
+/* Attempt to create a simple starting entropy. This can make it defferent for
+ * every build but it is still not enough. Stronger entropy should
+ * be added to make it change for every boot.
+ */
+static unsigned long __init get_boot_seed(void *fdt)
+{
+ unsigned long hash = 0;
+
+ /* build-specific string for starting entropy. */
+ hash = rotate_xor(hash, linux_banner, strlen(linux_banner));
+ hash = rotate_xor(hash, fdt, fdt_totalsize(fdt));
+
+ return hash;
+}
+
+static __init u64 get_kaslr_seed(void *fdt)
+{
+ int node, len;
+ fdt64_t *prop;
+ u64 ret;
+
+ node = fdt_path_offset(fdt, "/chosen");
+ if (node < 0)
+ return 0;
+
+ prop = fdt_getprop_w(fdt, node, "kaslr-seed", &len);
+ if (!prop || len != sizeof(u64))
+ return 0;
+
+ ret = fdt64_to_cpu(*prop);
+ *prop = 0;
+ return ret;
+}
+
+static __init bool regions_overlap(u32 s1, u32 e1, u32 s2, u32 e2)
+{
+ return e1 >= s2 && e2 >= s1;
+}
+
+static __init bool overlaps_reserved_region(const void *fdt, u32 start,
+ u32 end)
+{
+ int subnode, len, i;
+ u64 base, size;
+
+ /* check for overlap with /memreserve/ entries */
+ for (i = 0; i < fdt_num_mem_rsv(fdt); i++) {
+ if (fdt_get_mem_rsv(fdt, i, &base, &size) < 0)
+ continue;
+ if (regions_overlap(start, end, base, base + size))
+ return true;
+ }
+
+ if (regions.reserved_mem < 0)
+ return false;
+
+ /* check for overlap with static reservations in /reserved-memory */
+ for (subnode = fdt_first_subnode(fdt, regions.reserved_mem);
+ subnode >= 0;
+ subnode = fdt_next_subnode(fdt, subnode)) {
+ const fdt32_t *reg;
+ u64 rsv_end;
+
+ len = 0;
+ reg = fdt_getprop(fdt, subnode, "reg", &len);
+ while (len >= (regions.reserved_mem_addr_cells +
+ regions.reserved_mem_size_cells)) {
+ base = fdt32_to_cpu(reg[0]);
+ if (regions.reserved_mem_addr_cells == 2)
+ base = (base << 32) | fdt32_to_cpu(reg[1]);
+
+ reg += regions.reserved_mem_addr_cells;
+ len -= 4 * regions.reserved_mem_addr_cells;
+
+ size = fdt32_to_cpu(reg[0]);
+ if (regions.reserved_mem_size_cells == 2)
+ size = (size << 32) | fdt32_to_cpu(reg[1]);
+
+ reg += regions.reserved_mem_size_cells;
+ len -= 4 * regions.reserved_mem_size_cells;
+
+ if (base >= regions.pa_end)
+ continue;
+
+ rsv_end = min(base + size, (u64)U32_MAX);
+
+ if (regions_overlap(start, end, base, rsv_end))
+ return true;
+ }
+ }
+ return false;
+}
+
+static __init bool overlaps_region(const void *fdt, u32 start,
+ u32 end)
+{
+ if (regions_overlap(start, end, __pa(_stext), __pa(_end)))
+ return true;
+
+ if (regions_overlap(start, end, regions.dtb_start,
+ regions.dtb_end))
+ return true;
+
+ if (regions_overlap(start, end, regions.initrd_start,
+ regions.initrd_end))
+ return true;
+
+ if (regions_overlap(start, end, regions.crash_start,
+ regions.crash_end))
+ return true;
+
+ return overlaps_reserved_region(fdt, start, end);
+}
+
+static void __init get_crash_kernel(void *fdt, unsigned long size)
+{
+#ifdef CONFIG_CRASH_RESERVE
+ unsigned long long crash_size, crash_base;
+ int ret;
+
+ ret = parse_crashkernel(boot_command_line, size, &crash_size,
+ &crash_base, NULL, NULL, NULL);
+ if (ret != 0 || crash_size == 0)
+ return;
+ if (crash_base == 0)
+ crash_base = KDUMP_KERNELBASE;
+
+ regions.crash_start = (unsigned long)crash_base;
+ regions.crash_end = (unsigned long)(crash_base + crash_size);
+
+ pr_debug("crash_base=0x%llx crash_size=0x%llx\n", crash_base, crash_size);
+#endif
+}
+
+static void __init get_initrd_range(void *fdt)
+{
+ u64 start, end;
+ int node, len;
+ const __be32 *prop;
+
+ node = fdt_path_offset(fdt, "/chosen");
+ if (node < 0)
+ return;
+
+ prop = fdt_getprop(fdt, node, "linux,initrd-start", &len);
+ if (!prop)
+ return;
+ start = of_read_number(prop, len / 4);
+
+ prop = fdt_getprop(fdt, node, "linux,initrd-end", &len);
+ if (!prop)
+ return;
+ end = of_read_number(prop, len / 4);
+
+ regions.initrd_start = (unsigned long)start;
+ regions.initrd_end = (unsigned long)end;
+
+ pr_debug("initrd_start=0x%llx initrd_end=0x%llx\n", start, end);
+}
+
+static __init unsigned long get_usable_address(const void *fdt,
+ unsigned long start,
+ unsigned long offset)
+{
+ unsigned long pa;
+ unsigned long pa_end;
+
+ for (pa = offset; (long)pa > (long)start; pa -= SZ_16K) {
+ pa_end = pa + regions.kernel_size;
+ if (overlaps_region(fdt, pa, pa_end))
+ continue;
+
+ return pa;
+ }
+ return 0;
+}
+
+static __init void get_cell_sizes(const void *fdt, int node, int *addr_cells,
+ int *size_cells)
+{
+ const int *prop;
+ int len;
+
+ /*
+ * Retrieve the #address-cells and #size-cells properties
+ * from the 'node', or use the default if not provided.
+ */
+ *addr_cells = *size_cells = 1;
+
+ prop = fdt_getprop(fdt, node, "#address-cells", &len);
+ if (len == 4)
+ *addr_cells = fdt32_to_cpu(*prop);
+ prop = fdt_getprop(fdt, node, "#size-cells", &len);
+ if (len == 4)
+ *size_cells = fdt32_to_cpu(*prop);
+}
+
+static unsigned long __init kaslr_legal_offset(void *dt_ptr, unsigned long index,
+ unsigned long offset)
+{
+ unsigned long koffset = 0;
+ unsigned long start;
+
+ while ((long)index >= 0) {
+ offset = memstart_addr + index * SZ_64M + offset;
+ start = memstart_addr + index * SZ_64M;
+ koffset = get_usable_address(dt_ptr, start, offset);
+ if (koffset)
+ break;
+ index--;
+ }
+
+ if (koffset != 0)
+ koffset -= memstart_addr;
+
+ return koffset;
+}
+
+static inline __init bool kaslr_disabled(void)
+{
+ return strstr(boot_command_line, "nokaslr") != NULL;
+}
+
+static unsigned long __init kaslr_choose_location(void *dt_ptr, phys_addr_t size,
+ unsigned long kernel_sz)
+{
+ unsigned long offset, random;
+ unsigned long ram, linear_sz;
+ u64 seed;
+ unsigned long index;
+
+ kaslr_get_cmdline(dt_ptr);
+ if (kaslr_disabled())
+ return 0;
+
+ random = get_boot_seed(dt_ptr);
+
+ seed = get_tb() << 32;
+ seed ^= get_tb();
+ random = rotate_xor(random, &seed, sizeof(seed));
+
+ /*
+ * Retrieve (and wipe) the seed from the FDT
+ */
+ seed = get_kaslr_seed(dt_ptr);
+ if (seed)
+ random = rotate_xor(random, &seed, sizeof(seed));
+ else
+ pr_warn("KASLR: No safe seed for randomizing the kernel base.\n");
+
+ ram = min_t(phys_addr_t, __max_low_memory, size);
+ ram = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, true, true);
+ linear_sz = min_t(unsigned long, ram, SZ_512M);
+
+ /* If the linear size is smaller than 64M, do not randomize */
+ if (linear_sz < SZ_64M)
+ return 0;
+
+ /* check for a reserved-memory node and record its cell sizes */
+ regions.reserved_mem = fdt_path_offset(dt_ptr, "/reserved-memory");
+ if (regions.reserved_mem >= 0)
+ get_cell_sizes(dt_ptr, regions.reserved_mem,
+ &regions.reserved_mem_addr_cells,
+ &regions.reserved_mem_size_cells);
+
+ regions.pa_start = memstart_addr;
+ regions.pa_end = memstart_addr + linear_sz;
+ regions.dtb_start = __pa(dt_ptr);
+ regions.dtb_end = __pa(dt_ptr) + fdt_totalsize(dt_ptr);
+ regions.kernel_size = kernel_sz;
+
+ get_initrd_range(dt_ptr);
+ get_crash_kernel(dt_ptr, ram);
+
+ /*
+ * Decide which 64M we want to start
+ * Only use the low 8 bits of the random seed
+ */
+ index = random & 0xFF;
+ index %= linear_sz / SZ_64M;
+
+ /* Decide offset inside 64M */
+ offset = random % (SZ_64M - kernel_sz);
+ offset = round_down(offset, SZ_16K);
+
+ return kaslr_legal_offset(dt_ptr, index, offset);
+}
+
+/*
+ * To see if we need to relocate the kernel to a random offset
+ * void *dt_ptr - address of the device tree
+ * phys_addr_t size - size of the first memory block
+ */
+notrace void __init kaslr_early_init(void *dt_ptr, phys_addr_t size)
+{
+ unsigned long tlb_virt;
+ phys_addr_t tlb_phys;
+ unsigned long offset;
+ unsigned long kernel_sz;
+
+ kernel_sz = (unsigned long)_end - (unsigned long)_stext;
+
+ offset = kaslr_choose_location(dt_ptr, size, kernel_sz);
+ if (offset == 0)
+ return;
+
+ kernstart_virt_addr += offset;
+ kernstart_addr += offset;
+
+ is_second_reloc = 1;
+
+ if (offset >= SZ_64M) {
+ tlb_virt = round_down(kernstart_virt_addr, SZ_64M);
+ tlb_phys = round_down(kernstart_addr, SZ_64M);
+
+ /* Create kernel map to relocate in */
+ create_kaslr_tlb_entry(1, tlb_virt, tlb_phys);
+ }
+
+ /* Copy the kernel to its new location and run */
+ memcpy((void *)kernstart_virt_addr, (void *)_stext, kernel_sz);
+ flush_icache_range(kernstart_virt_addr, kernstart_virt_addr + kernel_sz);
+
+ reloc_kernel_entry(dt_ptr, kernstart_virt_addr);
+}
+
+void __init kaslr_late_init(void)
+{
+ /* If randomized, clear the original kernel */
+ if (kernstart_virt_addr != KERNELBASE) {
+ unsigned long kernel_sz;
+
+ kernel_sz = (unsigned long)_end - kernstart_virt_addr;
+ memzero_explicit((void *)KERNELBASE, kernel_sz);
+ }
+}
diff --git a/arch/powerpc/mm/nohash/kup.c b/arch/powerpc/mm/nohash/kup.c
new file mode 100644
index 000000000000..c20c4f357fbf
--- /dev/null
+++ b/arch/powerpc/mm/nohash/kup.c
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * This file contains the routines for initializing kernel userspace protection
+ */
+
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/printk.h>
+#include <linux/smp.h>
+
+#include <asm/kup.h>
+#include <asm/smp.h>
+
+#ifdef CONFIG_PPC_KUAP
+void setup_kuap(bool disabled)
+{
+ if (disabled) {
+ if (smp_processor_id() == boot_cpuid)
+ cur_cpu_spec->mmu_features &= ~MMU_FTR_KUAP;
+ return;
+ }
+
+ pr_info("Activating Kernel Userspace Access Protection\n");
+
+ prevent_user_access(KUAP_READ_WRITE);
+}
+#endif
diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/nohash/mmu_context.c
index af3d78e19302..28a96a10c907 100644
--- a/arch/powerpc/mm/mmu_context_nohash.c
+++ b/arch/powerpc/mm/nohash/mmu_context.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* This file contains the routines for handling the MMU on those
* PowerPC implementations where the MMU is not using the hash
@@ -9,11 +10,6 @@
* Derived from previous arch/powerpc/mm/mmu_context.c
* and arch/powerpc/include/asm/mmu_context.h
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* TODO:
*
* - The global context lock will not scale very well
@@ -25,34 +21,54 @@
* also clear mm->cpu_vm_mask bits when processes are migrated
*/
-//#define DEBUG_MAP_CONSISTENCY
-//#define DEBUG_CLAMP_LAST_CONTEXT 31
-//#define DEBUG_HARDER
-
-/* We don't use DEBUG because it tends to be compiled in always nowadays
- * and this would generate way too much output
- */
-#ifdef DEBUG_HARDER
-#define pr_hard(args...) printk(KERN_DEBUG args)
-#define pr_hardcont(args...) printk(KERN_CONT args)
-#else
-#define pr_hard(args...) do { } while(0)
-#define pr_hardcont(args...) do { } while(0)
-#endif
-
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/spinlock.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/slab.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
+#include <asm/smp.h>
+#include <asm/kup.h>
+
+#include <mm/mmu_decl.h>
+
+/*
+ * Room for two PTE table pointers, usually the kernel and current user
+ * pointer to their respective root page table (pgdir).
+ */
+void *abatron_pteptrs[2];
+
+/*
+ * The MPC8xx has only 16 contexts. We rotate through them on each task switch.
+ * A better way would be to keep track of tasks that own contexts, and implement
+ * an LRU usage. That way very active tasks don't always have to pay the TLB
+ * reload overhead. The kernel pages are mapped shared, so the kernel can run on
+ * behalf of any task that makes a kernel entry. Shared does not mean they are
+ * not protected, just that the ASID comparison is not performed. -- Dan
+ *
+ * The IBM4xx has 256 contexts, so we can just rotate through these as a way of
+ * "switching" contexts. If the TID of the TLB is zero, the PID/TID comparison
+ * is disabled, so we can use a TID of zero to represent all kernel pages as
+ * shared among all contexts. -- Dan
+ *
+ * The IBM 47x core supports 16-bit PIDs, thus 65535 contexts. We should
+ * normally never have to steal though the facility is present if needed.
+ * -- BenH
+ */
+#define FIRST_CONTEXT 1
+#if defined(CONFIG_PPC_8xx)
+#define LAST_CONTEXT 16
+#elif defined(CONFIG_PPC_47x)
+#define LAST_CONTEXT 65535
+#else
+#define LAST_CONTEXT 255
+#endif
-static unsigned int first_context, last_context;
static unsigned int next_context, nr_free_contexts;
static unsigned long *context_map;
static unsigned long *stale_map[NR_CPUS];
@@ -60,7 +76,7 @@ static struct mm_struct **context_mm;
static DEFINE_RAW_SPINLOCK(context_lock);
#define CTX_MAP_SIZE \
- (sizeof(unsigned long) * (last_context / BITS_PER_LONG + 1))
+ (sizeof(unsigned long) * (LAST_CONTEXT / BITS_PER_LONG + 1))
/* Steal a context from a task that has one at the moment.
@@ -78,13 +94,12 @@ static DEFINE_RAW_SPINLOCK(context_lock);
* the stale map as we can just flush the local CPU
* -- benh
*/
-#ifdef CONFIG_SMP
static unsigned int steal_context_smp(unsigned int id)
{
struct mm_struct *mm;
unsigned int cpu, max, i;
- max = last_context - first_context;
+ max = LAST_CONTEXT - FIRST_CONTEXT;
/* Attempt to free next_context first and then loop until we manage */
while (max--) {
@@ -96,11 +111,10 @@ static unsigned int steal_context_smp(unsigned int id)
*/
if (mm->context.active) {
id++;
- if (id > last_context)
- id = first_context;
+ if (id > LAST_CONTEXT)
+ id = FIRST_CONTEXT;
continue;
}
- pr_hardcont(" | steal %d from 0x%p", id, mm);
/* Mark this mm has having no context anymore */
mm->context.id = MMU_NO_CONTEXT;
@@ -131,7 +145,34 @@ static unsigned int steal_context_smp(unsigned int id)
/* This will cause the caller to try again */
return MMU_NO_CONTEXT;
}
-#endif /* CONFIG_SMP */
+
+static unsigned int steal_all_contexts(void)
+{
+ struct mm_struct *mm;
+ int cpu = smp_processor_id();
+ unsigned int id;
+
+ for (id = FIRST_CONTEXT; id <= LAST_CONTEXT; id++) {
+ /* Pick up the victim mm */
+ mm = context_mm[id];
+
+ /* Mark this mm as having no context anymore */
+ mm->context.id = MMU_NO_CONTEXT;
+ if (id != FIRST_CONTEXT) {
+ context_mm[id] = NULL;
+ __clear_bit(id, context_map);
+ }
+ if (IS_ENABLED(CONFIG_SMP))
+ __clear_bit(id, stale_map[cpu]);
+ }
+
+ /* Flush the TLB for all contexts (not to be used on SMP) */
+ _tlbil_all();
+
+ nr_free_contexts = LAST_CONTEXT - FIRST_CONTEXT;
+
+ return FIRST_CONTEXT;
+}
/* Note that this will also be called on SMP if all other CPUs are
* offlined, which means that it may be called for cpu != 0. For
@@ -146,8 +187,6 @@ static unsigned int steal_context_up(unsigned int id)
/* Pick up the victim mm */
mm = context_mm[id];
- pr_hardcont(" | steal %d from 0x%p", id, mm);
-
/* Flush the TLB for that context */
local_flush_tlb_mm(mm);
@@ -155,120 +194,93 @@ static unsigned int steal_context_up(unsigned int id)
mm->context.id = MMU_NO_CONTEXT;
/* XXX This clear should ultimately be part of local_flush_tlb_mm */
- __clear_bit(id, stale_map[cpu]);
+ if (IS_ENABLED(CONFIG_SMP))
+ __clear_bit(id, stale_map[cpu]);
return id;
}
-#ifdef DEBUG_MAP_CONSISTENCY
-static void context_check_map(void)
+static void set_context(unsigned long id, pgd_t *pgd)
{
- unsigned int id, nrf, nact;
-
- nrf = nact = 0;
- for (id = first_context; id <= last_context; id++) {
- int used = test_bit(id, context_map);
- if (!used)
- nrf++;
- if (used != (context_mm[id] != NULL))
- pr_err("MMU: Context %d is %s and MM is %p !\n",
- id, used ? "used" : "free", context_mm[id]);
- if (context_mm[id] != NULL)
- nact += context_mm[id]->context.active;
- }
- if (nrf != nr_free_contexts) {
- pr_err("MMU: Free context count out of sync ! (%d vs %d)\n",
- nr_free_contexts, nrf);
- nr_free_contexts = nrf;
+ if (IS_ENABLED(CONFIG_PPC_8xx)) {
+ mtspr(SPRN_M_TWB, __pa(pgd));
+
+ /* Update context */
+ mtspr(SPRN_M_CASID, id - 1);
+
+ /* sync */
+ mb();
+ } else if (kuap_is_disabled()) {
+ mtspr(SPRN_PID, id);
+ isync();
}
- if (nact > num_online_cpus())
- pr_err("MMU: More active contexts than CPUs ! (%d vs %d)\n",
- nact, num_online_cpus());
- if (first_context > 0 && !test_bit(0, context_map))
- pr_err("MMU: Context 0 has been freed !!!\n");
}
-#else
-static void context_check_map(void) { }
-#endif
-void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
+void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk)
{
- unsigned int i, id, cpu = smp_processor_id();
+ unsigned int id;
+ unsigned int i, cpu = smp_processor_id();
unsigned long *map;
/* No lockless fast path .. yet */
raw_spin_lock(&context_lock);
- pr_hard("[%d] activating context for mm @%p, active=%d, id=%d",
- cpu, next, next->context.active, next->context.id);
-
-#ifdef CONFIG_SMP
- /* Mark us active and the previous one not anymore */
- next->context.active++;
- if (prev) {
- pr_hardcont(" (old=0x%p a=%d)", prev, prev->context.active);
- WARN_ON(prev->context.active < 1);
- prev->context.active--;
+ if (IS_ENABLED(CONFIG_SMP)) {
+ /* Mark us active and the previous one not anymore */
+ next->context.active++;
+ if (prev) {
+ WARN_ON(prev->context.active < 1);
+ prev->context.active--;
+ }
}
again:
-#endif /* CONFIG_SMP */
/* If we already have a valid assigned context, skip all that */
id = next->context.id;
- if (likely(id != MMU_NO_CONTEXT)) {
-#ifdef DEBUG_MAP_CONSISTENCY
- if (context_mm[id] != next)
- pr_err("MMU: mm 0x%p has id %d but context_mm[%d] says 0x%p\n",
- next, id, id, context_mm[id]);
-#endif
+ if (likely(id != MMU_NO_CONTEXT))
goto ctxt_ok;
- }
/* We really don't have a context, let's try to acquire one */
id = next_context;
- if (id > last_context)
- id = first_context;
+ if (id > LAST_CONTEXT)
+ id = FIRST_CONTEXT;
map = context_map;
/* No more free contexts, let's try to steal one */
if (nr_free_contexts == 0) {
-#ifdef CONFIG_SMP
if (num_online_cpus() > 1) {
id = steal_context_smp(id);
if (id == MMU_NO_CONTEXT)
goto again;
goto stolen;
}
-#endif /* CONFIG_SMP */
- id = steal_context_up(id);
+ if (IS_ENABLED(CONFIG_PPC_8xx))
+ id = steal_all_contexts();
+ else
+ id = steal_context_up(id);
goto stolen;
}
nr_free_contexts--;
/* We know there's at least one free context, try to find it */
while (__test_and_set_bit(id, map)) {
- id = find_next_zero_bit(map, last_context+1, id);
- if (id > last_context)
- id = first_context;
+ id = find_next_zero_bit(map, LAST_CONTEXT+1, id);
+ if (id > LAST_CONTEXT)
+ id = FIRST_CONTEXT;
}
stolen:
next_context = id + 1;
context_mm[id] = next;
next->context.id = id;
- pr_hardcont(" | new id=%d,nrf=%d", id, nr_free_contexts);
- context_check_map();
ctxt_ok:
/* If that context got marked stale on this CPU, then flush the
* local TLB for it and unmark it before we use it
*/
- if (test_bit(id, stale_map[cpu])) {
- pr_hardcont(" | stale flush %d [%d..%d]",
- id, cpu_first_thread_sibling(cpu),
- cpu_last_thread_sibling(cpu));
-
+ if (IS_ENABLED(CONFIG_SMP) && test_bit(id, stale_map[cpu])) {
local_flush_tlb_mm(next);
/* XXX This clear should ultimately be part of local_flush_tlb_mm */
@@ -280,8 +292,12 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
}
/* Flick the MMU and release lock */
- pr_hardcont(" -> %d\n", id);
+ if (IS_ENABLED(CONFIG_BDI_SWITCH))
+ abatron_pteptrs[1] = next->pgd;
set_context(id, next->pgd);
+#if defined(CONFIG_BOOKE) && defined(CONFIG_PPC_KUAP)
+ tsk->thread.pid = id;
+#endif
raw_spin_unlock(&context_lock);
}
@@ -290,16 +306,9 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
*/
int init_new_context(struct task_struct *t, struct mm_struct *mm)
{
- pr_hard("initing context for mm @%p\n", mm);
-
mm->context.id = MMU_NO_CONTEXT;
mm->context.active = 0;
-
-#ifdef CONFIG_PPC_MM_SLICES
- if (slice_mm_new_context(mm))
- slice_set_user_psize(mm, mmu_virtual_psize);
-#endif
-
+ pte_frag_set(&mm->context, NULL);
return 0;
}
@@ -321,56 +330,38 @@ void destroy_context(struct mm_struct *mm)
if (id != MMU_NO_CONTEXT) {
__clear_bit(id, context_map);
mm->context.id = MMU_NO_CONTEXT;
-#ifdef DEBUG_MAP_CONSISTENCY
- mm->context.active = 0;
-#endif
context_mm[id] = NULL;
nr_free_contexts++;
}
raw_spin_unlock_irqrestore(&context_lock, flags);
}
-#ifdef CONFIG_SMP
-
-static int mmu_context_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
+static int mmu_ctx_cpu_prepare(unsigned int cpu)
{
- unsigned int cpu = (unsigned int)(long)hcpu;
-
/* We don't touch CPU 0 map, it's allocated at aboot and kept
* around forever
*/
if (cpu == boot_cpuid)
- return NOTIFY_OK;
-
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- pr_devel("MMU: Allocating stale context map for CPU %d\n", cpu);
- stale_map[cpu] = kzalloc(CTX_MAP_SIZE, GFP_KERNEL);
- break;
-#ifdef CONFIG_HOTPLUG_CPU
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- pr_devel("MMU: Freeing stale context map for CPU %d\n", cpu);
- kfree(stale_map[cpu]);
- stale_map[cpu] = NULL;
-
- /* We also clear the cpu_vm_mask bits of CPUs going away */
- clear_tasks_mm_cpumask(cpu);
- break;
-#endif /* CONFIG_HOTPLUG_CPU */
- }
- return NOTIFY_OK;
+ return 0;
+
+ stale_map[cpu] = kzalloc(CTX_MAP_SIZE, GFP_KERNEL);
+ return 0;
}
-static struct notifier_block mmu_context_cpu_nb = {
- .notifier_call = mmu_context_cpu_notify,
-};
+static int mmu_ctx_cpu_dead(unsigned int cpu)
+{
+#ifdef CONFIG_HOTPLUG_CPU
+ if (cpu == boot_cpuid)
+ return 0;
-#endif /* CONFIG_SMP */
+ kfree(stale_map[cpu]);
+ stale_map[cpu] = NULL;
+
+ /* We also clear the cpu_vm_mask bits of CPUs going away */
+ clear_tasks_mm_cpumask(cpu);
+#endif
+ return 0;
+}
/*
* Initialize the context management stuff.
@@ -384,76 +375,30 @@ void __init mmu_context_init(void)
init_mm.context.active = NR_CPUS;
/*
- * The MPC8xx has only 16 contexts. We rotate through them on each
- * task switch. A better way would be to keep track of tasks that
- * own contexts, and implement an LRU usage. That way very active
- * tasks don't always have to pay the TLB reload overhead. The
- * kernel pages are mapped shared, so the kernel can run on behalf
- * of any task that makes a kernel entry. Shared does not mean they
- * are not protected, just that the ASID comparison is not performed.
- * -- Dan
- *
- * The IBM4xx has 256 contexts, so we can just rotate through these
- * as a way of "switching" contexts. If the TID of the TLB is zero,
- * the PID/TID comparison is disabled, so we can use a TID of zero
- * to represent all kernel pages as shared among all contexts.
- * -- Dan
- *
- * The IBM 47x core supports 16-bit PIDs, thus 65535 contexts. We
- * should normally never have to steal though the facility is
- * present if needed.
- * -- BenH
- */
- if (mmu_has_feature(MMU_FTR_TYPE_8xx)) {
- first_context = 0;
- last_context = 15;
- } else if (mmu_has_feature(MMU_FTR_TYPE_47x)) {
- first_context = 1;
- last_context = 65535;
- } else
-#ifdef CONFIG_PPC_BOOK3E_MMU
- if (mmu_has_feature(MMU_FTR_TYPE_3E)) {
- u32 mmucfg = mfspr(SPRN_MMUCFG);
- u32 pid_bits = (mmucfg & MMUCFG_PIDSIZE_MASK)
- >> MMUCFG_PIDSIZE_SHIFT;
- first_context = 1;
- last_context = (1UL << (pid_bits + 1)) - 1;
- } else
-#endif
- {
- first_context = 1;
- last_context = 255;
- }
-
-#ifdef DEBUG_CLAMP_LAST_CONTEXT
- last_context = DEBUG_CLAMP_LAST_CONTEXT;
-#endif
- /*
* Allocate the maps used by context management
*/
- context_map = alloc_bootmem(CTX_MAP_SIZE);
- context_mm = alloc_bootmem(sizeof(void *) * (last_context + 1));
-#ifndef CONFIG_SMP
- stale_map[0] = alloc_bootmem(CTX_MAP_SIZE);
-#else
- stale_map[boot_cpuid] = alloc_bootmem(CTX_MAP_SIZE);
-
- register_cpu_notifier(&mmu_context_cpu_nb);
-#endif
+ context_map = memblock_alloc_or_panic(CTX_MAP_SIZE, SMP_CACHE_BYTES);
+ context_mm = memblock_alloc_or_panic(sizeof(void *) * (LAST_CONTEXT + 1),
+ SMP_CACHE_BYTES);
+ if (IS_ENABLED(CONFIG_SMP)) {
+ stale_map[boot_cpuid] = memblock_alloc_or_panic(CTX_MAP_SIZE, SMP_CACHE_BYTES);
+ cpuhp_setup_state_nocalls(CPUHP_POWERPC_MMU_CTX_PREPARE,
+ "powerpc/mmu/ctx:prepare",
+ mmu_ctx_cpu_prepare, mmu_ctx_cpu_dead);
+ }
printk(KERN_INFO
"MMU: Allocated %zu bytes of context maps for %d contexts\n",
- 2 * CTX_MAP_SIZE + (sizeof(void *) * (last_context + 1)),
- last_context - first_context + 1);
+ 2 * CTX_MAP_SIZE + (sizeof(void *) * (LAST_CONTEXT + 1)),
+ LAST_CONTEXT - FIRST_CONTEXT + 1);
/*
* Some processors have too few contexts to reserve one for
* init_mm, and require using context 0 for a normal task.
* Other processors reserve the use of context zero for the kernel.
- * This code assumes first_context < 32.
+ * This code assumes FIRST_CONTEXT < 32.
*/
- context_map[0] = (1 << first_context) - 1;
- next_context = first_context;
- nr_free_contexts = last_context - first_context + 1;
+ context_map[0] = (1 << FIRST_CONTEXT) - 1;
+ next_context = FIRST_CONTEXT;
+ nr_free_contexts = LAST_CONTEXT - FIRST_CONTEXT + 1;
}
-
diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c
new file mode 100644
index 000000000000..0a650742f3a0
--- /dev/null
+++ b/arch/powerpc/mm/nohash/tlb.c
@@ -0,0 +1,341 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * This file contains the routines for TLB flushing.
+ * On machines where the MMU does not use a hash table to store virtual to
+ * physical translations (ie, SW loaded TLBs or Book3E compilant processors,
+ * this does -not- include 603 however which shares the implementation with
+ * hash based processors)
+ *
+ * -- BenH
+ *
+ * Copyright 2008,2009 Ben Herrenschmidt <benh@kernel.crashing.org>
+ * IBM Corp.
+ *
+ * Derived from arch/ppc/mm/init.c:
+ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ * and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ * Copyright (C) 1996 Paul Mackerras
+ *
+ * Derived from "arch/i386/mm/init.c"
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ */
+
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/preempt.h>
+#include <linux/spinlock.h>
+#include <linux/memblock.h>
+#include <linux/of_fdt.h>
+#include <linux/hugetlb.h>
+
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+#include <asm/text-patching.h>
+#include <asm/cputhreads.h>
+#include <asm/hugetlb.h>
+#include <asm/paca.h>
+
+#include <mm/mmu_decl.h>
+
+/*
+ * This struct lists the sw-supported page sizes. The hardawre MMU may support
+ * other sizes not listed here. The .ind field is only used on MMUs that have
+ * indirect page table entries.
+ */
+#ifdef CONFIG_PPC_E500
+struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
+ [MMU_PAGE_4K] = {
+ .shift = 12,
+ },
+ [MMU_PAGE_2M] = {
+ .shift = 21,
+ },
+ [MMU_PAGE_4M] = {
+ .shift = 22,
+ },
+ [MMU_PAGE_16M] = {
+ .shift = 24,
+ },
+ [MMU_PAGE_64M] = {
+ .shift = 26,
+ },
+ [MMU_PAGE_256M] = {
+ .shift = 28,
+ },
+ [MMU_PAGE_1G] = {
+ .shift = 30,
+ },
+};
+
+static inline int mmu_get_tsize(int psize)
+{
+ return mmu_psize_defs[psize].shift - 10;
+}
+#else
+static inline int mmu_get_tsize(int psize)
+{
+ /* This isn't used on !Book3E for now */
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_PPC_8xx
+struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
+ [MMU_PAGE_4K] = {
+ .shift = 12,
+ },
+ [MMU_PAGE_16K] = {
+ .shift = 14,
+ },
+ [MMU_PAGE_512K] = {
+ .shift = 19,
+ },
+ [MMU_PAGE_8M] = {
+ .shift = 23,
+ },
+};
+#endif
+
+#ifdef CONFIG_PPC_E500
+/* next_tlbcam_idx is used to round-robin tlbcam entry assignment */
+DEFINE_PER_CPU(int, next_tlbcam_idx);
+EXPORT_PER_CPU_SYMBOL(next_tlbcam_idx);
+#endif
+
+/*
+ * Base TLB flushing operations:
+ *
+ * - flush_tlb_mm(mm) flushes the specified mm context TLB's
+ * - flush_tlb_page(vma, vmaddr) flushes one page
+ * - flush_tlb_range(vma, start, end) flushes a range of pages
+ * - flush_tlb_kernel_range(start, end) flushes kernel pages
+ *
+ * - local_* variants of page and mm only apply to the current
+ * processor
+ */
+
+#ifndef CONFIG_PPC_8xx
+/*
+ * These are the base non-SMP variants of page and mm flushing
+ */
+void local_flush_tlb_mm(struct mm_struct *mm)
+{
+ unsigned int pid;
+
+ preempt_disable();
+ pid = mm->context.id;
+ if (pid != MMU_NO_CONTEXT)
+ _tlbil_pid(pid);
+ preempt_enable();
+}
+EXPORT_SYMBOL(local_flush_tlb_mm);
+
+void __local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
+ int tsize, int ind)
+{
+ unsigned int pid;
+
+ preempt_disable();
+ pid = mm ? mm->context.id : 0;
+ if (pid != MMU_NO_CONTEXT)
+ _tlbil_va(vmaddr, pid, tsize, ind);
+ preempt_enable();
+}
+
+void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+ __local_flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr,
+ mmu_get_tsize(mmu_virtual_psize), 0);
+}
+EXPORT_SYMBOL(local_flush_tlb_page);
+
+void local_flush_tlb_page_psize(struct mm_struct *mm,
+ unsigned long vmaddr, int psize)
+{
+ __local_flush_tlb_page(mm, vmaddr, mmu_get_tsize(psize), 0);
+}
+EXPORT_SYMBOL(local_flush_tlb_page_psize);
+
+#endif
+
+/*
+ * And here are the SMP non-local implementations
+ */
+#ifdef CONFIG_SMP
+
+static DEFINE_RAW_SPINLOCK(tlbivax_lock);
+
+struct tlb_flush_param {
+ unsigned long addr;
+ unsigned int pid;
+ unsigned int tsize;
+ unsigned int ind;
+};
+
+static void do_flush_tlb_mm_ipi(void *param)
+{
+ struct tlb_flush_param *p = param;
+
+ _tlbil_pid(p ? p->pid : 0);
+}
+
+static void do_flush_tlb_page_ipi(void *param)
+{
+ struct tlb_flush_param *p = param;
+
+ _tlbil_va(p->addr, p->pid, p->tsize, p->ind);
+}
+
+
+/* Note on invalidations and PID:
+ *
+ * We snapshot the PID with preempt disabled. At this point, it can still
+ * change either because:
+ * - our context is being stolen (PID -> NO_CONTEXT) on another CPU
+ * - we are invaliating some target that isn't currently running here
+ * and is concurrently acquiring a new PID on another CPU
+ * - some other CPU is re-acquiring a lost PID for this mm
+ * etc...
+ *
+ * However, this shouldn't be a problem as we only guarantee
+ * invalidation of TLB entries present prior to this call, so we
+ * don't care about the PID changing, and invalidating a stale PID
+ * is generally harmless.
+ */
+
+void flush_tlb_mm(struct mm_struct *mm)
+{
+ unsigned int pid;
+
+ preempt_disable();
+ pid = mm->context.id;
+ if (unlikely(pid == MMU_NO_CONTEXT))
+ goto no_context;
+ if (!mm_is_core_local(mm)) {
+ struct tlb_flush_param p = { .pid = pid };
+ /* Ignores smp_processor_id() even if set. */
+ smp_call_function_many(mm_cpumask(mm),
+ do_flush_tlb_mm_ipi, &p, 1);
+ }
+ _tlbil_pid(pid);
+ no_context:
+ preempt_enable();
+}
+EXPORT_SYMBOL(flush_tlb_mm);
+
+void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
+ int tsize, int ind)
+{
+ struct cpumask *cpu_mask;
+ unsigned int pid;
+
+ /*
+ * This function as well as __local_flush_tlb_page() must only be called
+ * for user contexts.
+ */
+ if (WARN_ON(!mm))
+ return;
+
+ preempt_disable();
+ pid = mm->context.id;
+ if (unlikely(pid == MMU_NO_CONTEXT))
+ goto bail;
+ cpu_mask = mm_cpumask(mm);
+ if (!mm_is_core_local(mm)) {
+ /* If broadcast tlbivax is supported, use it */
+ if (mmu_has_feature(MMU_FTR_USE_TLBIVAX_BCAST)) {
+ int lock = mmu_has_feature(MMU_FTR_LOCK_BCAST_INVAL);
+ if (lock)
+ raw_spin_lock(&tlbivax_lock);
+ _tlbivax_bcast(vmaddr, pid, tsize, ind);
+ if (lock)
+ raw_spin_unlock(&tlbivax_lock);
+ goto bail;
+ } else {
+ struct tlb_flush_param p = {
+ .pid = pid,
+ .addr = vmaddr,
+ .tsize = tsize,
+ .ind = ind,
+ };
+ /* Ignores smp_processor_id() even if set in cpu_mask */
+ smp_call_function_many(cpu_mask,
+ do_flush_tlb_page_ipi, &p, 1);
+ }
+ }
+ _tlbil_va(vmaddr, pid, tsize, ind);
+ bail:
+ preempt_enable();
+}
+
+void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+#ifdef CONFIG_HUGETLB_PAGE
+ if (vma && is_vm_hugetlb_page(vma))
+ flush_hugetlb_page(vma, vmaddr);
+#endif
+
+ __flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr,
+ mmu_get_tsize(mmu_virtual_psize), 0);
+}
+EXPORT_SYMBOL(flush_tlb_page);
+
+#endif /* CONFIG_SMP */
+
+/*
+ * Flush kernel TLB entries in the given range
+ */
+#ifndef CONFIG_PPC_8xx
+void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+#ifdef CONFIG_SMP
+ preempt_disable();
+ smp_call_function(do_flush_tlb_mm_ipi, NULL, 1);
+ _tlbil_pid(0);
+ preempt_enable();
+#else
+ _tlbil_pid(0);
+#endif
+}
+EXPORT_SYMBOL(flush_tlb_kernel_range);
+#endif
+
+/*
+ * Currently, for range flushing, we just do a full mm flush. This should
+ * be optimized based on a threshold on the size of the range, since
+ * some implementation can stack multiple tlbivax before a tlbsync but
+ * for now, we keep it that way
+ */
+void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end)
+
+{
+ if (end - start == PAGE_SIZE && !(start & ~PAGE_MASK))
+ flush_tlb_page(vma, start);
+ else
+ flush_tlb_mm(vma->vm_mm);
+}
+EXPORT_SYMBOL(flush_tlb_range);
+
+void tlb_flush(struct mmu_gather *tlb)
+{
+ flush_tlb_mm(tlb->mm);
+}
+
+#ifndef CONFIG_PPC64
+void __init early_init_mmu(void)
+{
+ unsigned long root = of_get_flat_dt_root();
+
+ if (IS_ENABLED(CONFIG_PPC_47x) && IS_ENABLED(CONFIG_SMP) &&
+ of_get_flat_dt_prop(root, "cooperative-partition", NULL))
+ mmu_clear_feature(MMU_FTR_USE_TLBIVAX_BCAST);
+}
+#endif /* CONFIG_PPC64 */
diff --git a/arch/powerpc/mm/nohash/tlb_64e.c b/arch/powerpc/mm/nohash/tlb_64e.c
new file mode 100644
index 000000000000..4f925adf2695
--- /dev/null
+++ b/arch/powerpc/mm/nohash/tlb_64e.c
@@ -0,0 +1,314 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2008,2009 Ben Herrenschmidt <benh@kernel.crashing.org>
+ * IBM Corp.
+ *
+ * Derived from arch/ppc/mm/init.c:
+ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ * and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ * Copyright (C) 1996 Paul Mackerras
+ *
+ * Derived from "arch/i386/mm/init.c"
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ */
+
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/pagemap.h>
+#include <linux/memblock.h>
+
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+#include <asm/text-patching.h>
+#include <asm/cputhreads.h>
+
+#include <mm/mmu_decl.h>
+
+/* The variables below are currently only used on 64-bit Book3E
+ * though this will probably be made common with other nohash
+ * implementations at some point
+ */
+static int mmu_pte_psize; /* Page size used for PTE pages */
+int mmu_vmemmap_psize; /* Page size used for the virtual mem map */
+int book3e_htw_mode; /* HW tablewalk? Value is PPC_HTW_* */
+unsigned long linear_map_top; /* Top of linear mapping */
+
+
+/*
+ * Number of bytes to add to SPRN_SPRG_TLB_EXFRAME on crit/mcheck/debug
+ * exceptions. This is used for bolted and e6500 TLB miss handlers which
+ * do not modify this SPRG in the TLB miss code; for other TLB miss handlers,
+ * this is set to zero.
+ */
+int extlb_level_exc;
+
+/*
+ * Handling of virtual linear page tables or indirect TLB entries
+ * flushing when PTE pages are freed
+ */
+void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address)
+{
+ int tsize = mmu_psize_defs[mmu_pte_psize].shift - 10;
+
+ if (book3e_htw_mode != PPC_HTW_NONE) {
+ unsigned long start = address & PMD_MASK;
+ unsigned long end = address + PMD_SIZE;
+ unsigned long size = 1UL << mmu_psize_defs[mmu_pte_psize].shift;
+
+ /* This isn't the most optimal, ideally we would factor out the
+ * while preempt & CPU mask mucking around, or even the IPI but
+ * it will do for now
+ */
+ while (start < end) {
+ __flush_tlb_page(tlb->mm, start, tsize, 1);
+ start += size;
+ }
+ } else {
+ unsigned long rmask = 0xf000000000000000ul;
+ unsigned long rid = (address & rmask) | 0x1000000000000000ul;
+ unsigned long vpte = address & ~rmask;
+
+ vpte = (vpte >> (PAGE_SHIFT - 3)) & ~0xffful;
+ vpte |= rid;
+ __flush_tlb_page(tlb->mm, vpte, tsize, 0);
+ }
+}
+
+static void __init setup_page_sizes(void)
+{
+ unsigned int tlb0cfg;
+ unsigned int eptcfg;
+ int psize;
+
+ unsigned int mmucfg = mfspr(SPRN_MMUCFG);
+
+ if ((mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V1) {
+ unsigned int tlb1cfg = mfspr(SPRN_TLB1CFG);
+ unsigned int min_pg, max_pg;
+
+ min_pg = (tlb1cfg & TLBnCFG_MINSIZE) >> TLBnCFG_MINSIZE_SHIFT;
+ max_pg = (tlb1cfg & TLBnCFG_MAXSIZE) >> TLBnCFG_MAXSIZE_SHIFT;
+
+ for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+ struct mmu_psize_def *def;
+ unsigned int shift;
+
+ def = &mmu_psize_defs[psize];
+ shift = def->shift;
+
+ if (shift == 0 || shift & 1)
+ continue;
+
+ /* adjust to be in terms of 4^shift Kb */
+ shift = (shift - 10) >> 1;
+
+ if ((shift >= min_pg) && (shift <= max_pg))
+ def->flags |= MMU_PAGE_SIZE_DIRECT;
+ }
+
+ goto out;
+ }
+
+ if ((mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2) {
+ u32 tlb1cfg, tlb1ps;
+
+ tlb0cfg = mfspr(SPRN_TLB0CFG);
+ tlb1cfg = mfspr(SPRN_TLB1CFG);
+ tlb1ps = mfspr(SPRN_TLB1PS);
+ eptcfg = mfspr(SPRN_EPTCFG);
+
+ if ((tlb1cfg & TLBnCFG_IND) && (tlb0cfg & TLBnCFG_PT))
+ book3e_htw_mode = PPC_HTW_E6500;
+
+ /*
+ * We expect 4K subpage size and unrestricted indirect size.
+ * The lack of a restriction on indirect size is a Freescale
+ * extension, indicated by PSn = 0 but SPSn != 0.
+ */
+ if (eptcfg != 2)
+ book3e_htw_mode = PPC_HTW_NONE;
+
+ for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+ struct mmu_psize_def *def = &mmu_psize_defs[psize];
+
+ if (!def->shift)
+ continue;
+
+ if (tlb1ps & (1U << (def->shift - 10))) {
+ def->flags |= MMU_PAGE_SIZE_DIRECT;
+
+ if (book3e_htw_mode && psize == MMU_PAGE_2M)
+ def->flags |= MMU_PAGE_SIZE_INDIRECT;
+ }
+ }
+
+ goto out;
+ }
+out:
+ /* Cleanup array and print summary */
+ pr_info("MMU: Supported page sizes\n");
+ for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+ struct mmu_psize_def *def = &mmu_psize_defs[psize];
+ const char *__page_type_names[] = {
+ "unsupported",
+ "direct",
+ "indirect",
+ "direct & indirect"
+ };
+ if (def->flags == 0) {
+ def->shift = 0;
+ continue;
+ }
+ pr_info(" %8ld KB as %s\n", 1ul << (def->shift - 10),
+ __page_type_names[def->flags & 0x3]);
+ }
+}
+
+/*
+ * Early initialization of the MMU TLB code
+ */
+static void early_init_this_mmu(void)
+{
+ unsigned int mas4;
+
+ /* Set MAS4 based on page table setting */
+
+ mas4 = 0x4 << MAS4_WIMGED_SHIFT;
+ switch (book3e_htw_mode) {
+ case PPC_HTW_E6500:
+ mas4 |= MAS4_INDD;
+ mas4 |= BOOK3E_PAGESZ_2M << MAS4_TSIZED_SHIFT;
+ mas4 |= MAS4_TLBSELD(1);
+ mmu_pte_psize = MMU_PAGE_2M;
+ break;
+
+ case PPC_HTW_NONE:
+ mas4 |= BOOK3E_PAGESZ_4K << MAS4_TSIZED_SHIFT;
+ mmu_pte_psize = mmu_virtual_psize;
+ break;
+ }
+ mtspr(SPRN_MAS4, mas4);
+
+ unsigned int num_cams;
+ bool map = true;
+
+ /* use a quarter of the TLBCAM for bolted linear map */
+ num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4;
+
+ /*
+ * Only do the mapping once per core, or else the
+ * transient mapping would cause problems.
+ */
+#ifdef CONFIG_SMP
+ if (hweight32(get_tensr()) > 1)
+ map = false;
+#endif
+
+ if (map)
+ linear_map_top = map_mem_in_cams(linear_map_top,
+ num_cams, false, true);
+
+ /* A sync won't hurt us after mucking around with
+ * the MMU configuration
+ */
+ mb();
+}
+
+static void __init early_init_mmu_global(void)
+{
+ /*
+ * Freescale booke only supports 4K pages in TLB0, so use that.
+ */
+ mmu_vmemmap_psize = MMU_PAGE_4K;
+
+ /* XXX This code only checks for TLB 0 capabilities and doesn't
+ * check what page size combos are supported by the HW. It
+ * also doesn't handle the case where a separate array holds
+ * the IND entries from the array loaded by the PT.
+ */
+ /* Look for supported page sizes */
+ setup_page_sizes();
+
+ /*
+ * If we want to use HW tablewalk, enable it by patching the TLB miss
+ * handlers to branch to the one dedicated to it.
+ */
+ extlb_level_exc = EX_TLB_SIZE;
+ switch (book3e_htw_mode) {
+ case PPC_HTW_E6500:
+ patch_exception(0x1c0, exc_data_tlb_miss_e6500_book3e);
+ patch_exception(0x1e0, exc_instruction_tlb_miss_e6500_book3e);
+ break;
+ }
+
+ pr_info("MMU: Book3E HW tablewalk %s\n",
+ book3e_htw_mode != PPC_HTW_NONE ? "enabled" : "not supported");
+
+ /* Set the global containing the top of the linear mapping
+ * for use by the TLB miss code
+ */
+ linear_map_top = memblock_end_of_DRAM();
+
+ ioremap_bot = IOREMAP_BASE;
+}
+
+static void __init early_mmu_set_memory_limit(void)
+{
+ /*
+ * Limit memory so we dont have linear faults.
+ * Unlike memblock_set_current_limit, which limits
+ * memory available during early boot, this permanently
+ * reduces the memory available to Linux. We need to
+ * do this because highmem is not supported on 64-bit.
+ */
+ memblock_enforce_memory_limit(linear_map_top);
+
+ memblock_set_current_limit(linear_map_top);
+}
+
+/* boot cpu only */
+void __init early_init_mmu(void)
+{
+ early_init_mmu_global();
+ early_init_this_mmu();
+ early_mmu_set_memory_limit();
+}
+
+void early_init_mmu_secondary(void)
+{
+ early_init_this_mmu();
+}
+
+void setup_initial_memory_limit(phys_addr_t first_memblock_base,
+ phys_addr_t first_memblock_size)
+{
+ /*
+ * On FSL Embedded 64-bit, usually all RAM is bolted, but with
+ * unusual memory sizes it's possible for some RAM to not be mapped
+ * (such RAM is not used at all by Linux, since we don't support
+ * highmem on 64-bit). We limit ppc64_rma_size to what would be
+ * mappable if this memblock is the only one. Additional memblocks
+ * can only increase, not decrease, the amount that ends up getting
+ * mapped. We still limit max to 1G even if we'll eventually map
+ * more. This is due to what the early init code is set up to do.
+ *
+ * We crop it to the size of the first MEMBLOCK to
+ * avoid going over total available memory just in case...
+ */
+ unsigned long linear_sz;
+ unsigned int num_cams;
+
+ /* use a quarter of the TLBCAM for bolted linear map */
+ num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4;
+
+ linear_sz = map_mem_in_cams(first_memblock_size, num_cams, true, true);
+ ppc64_rma_size = min_t(u64, linear_sz, 0x40000000);
+
+ /* Finally limit subsequent allocations */
+ memblock_set_current_limit(first_memblock_base + ppc64_rma_size);
+}
diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/nohash/tlb_low.S
index 626ad081639f..c4d296e73731 100644
--- a/arch/powerpc/mm/tlb_nohash_low.S
+++ b/arch/powerpc/mm/nohash/tlb_low.S
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* This file contains low-level functions for performing various
* types of TLB invalidations on various processors with no hash
@@ -18,12 +19,6 @@
*
* Partially rewritten by Cort Dougan (cort@cs.nmt.edu)
* Paul Mackerras, Kumar Gala and Benjamin Herrenschmidt.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#include <asm/reg.h>
@@ -34,33 +29,10 @@
#include <asm/asm-offsets.h>
#include <asm/processor.h>
#include <asm/bug.h>
+#include <asm/asm-compat.h>
+#include <asm/feature-fixups.h>
-#if defined(CONFIG_40x)
-
-/*
- * 40x implementation needs only tlbil_va
- */
-_GLOBAL(__tlbil_va)
- /* We run the search with interrupts disabled because we have to change
- * the PID and I don't want to preempt when that happens.
- */
- mfmsr r5
- mfspr r6,SPRN_PID
- wrteei 0
- mtspr SPRN_PID,r4
- tlbsx. r3, 0, r3
- mtspr SPRN_PID,r6
- wrtee r5
- bne 1f
- sync
- /* There are only 64 TLB entries, so r3 < 64, which means bit 25 is
- * clear. Since 25 is the V bit in the TLB_TAG, loading this value
- * will invalidate the TLB entry. */
- tlbwe r3, r3, TLB_TAG
- isync
-1: blr
-
-#elif defined(CONFIG_8xx)
+#if defined(CONFIG_PPC_8xx)
/*
* Nothing to do for 8xx, everything is inline
@@ -95,36 +67,25 @@ _GLOBAL(__tlbil_va)
tlbsx. r6,0,r3
bne 10f
sync
-BEGIN_MMU_FTR_SECTION
- b 2f
-END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x)
+#ifndef CONFIG_PPC_47x
/* On 440 There are only 64 TLB entries, so r3 < 64, which means bit
* 22, is clear. Since 22 is the V bit in the TLB_PAGEID, loading this
* value will invalidate the TLB entry.
*/
tlbwe r6,r6,PPC44x_TLB_PAGEID
- isync
-10: wrtee r10
- blr
-2:
-#ifdef CONFIG_PPC_47x
- oris r7,r6,0x8000 /* specify way explicitely */
+#else
+ oris r7,r6,0x8000 /* specify way explicitly */
clrrwi r4,r3,12 /* get an EPN for the hashing with V = 0 */
ori r4,r4,PPC47x_TLBE_SIZE
tlbwe r4,r7,0 /* write it */
+#endif /* !CONFIG_PPC_47x */
isync
- wrtee r10
+10: wrtee r10
blr
-#else /* CONFIG_PPC_47x */
-1: trap
- EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0;
-#endif /* !CONFIG_PPC_47x */
_GLOBAL(_tlbil_all)
_GLOBAL(_tlbil_pid)
-BEGIN_MMU_FTR_SECTION
- b 2f
-END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x)
+#ifndef CONFIG_PPC_47x
li r3,0
sync
@@ -139,8 +100,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x)
isync
blr
-2:
-#ifdef CONFIG_PPC_47x
+#else
/* 476 variant. There's not simple way to do this, hopefully we'll
* try to limit the amount of such full invalidates
*/
@@ -149,7 +109,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x)
li r3,-1 /* Current set */
lis r10,tlb_47x_boltmap@h
ori r10,r10,tlb_47x_boltmap@l
- lis r7,0x8000 /* Specify way explicitely */
+ lis r7,0x8000 /* Specify way explicitly */
b 9f /* For each set */
@@ -182,11 +142,8 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x)
b 1b /* Then loop */
1: isync /* Sync shadows */
wrtee r11
-#else /* CONFIG_PPC_47x */
-1: trap
- EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0;
-#endif /* !CONFIG_PPC_47x */
blr
+#endif /* !CONFIG_PPC_47x */
#ifdef CONFIG_PPC_47x
@@ -204,7 +161,7 @@ _GLOBAL(_tlbivax_bcast)
isync
PPC_TLBIVAX(0, R3)
isync
- eieio
+ mbar
tlbsync
BEGIN_FTR_SECTION
b 1f
@@ -217,7 +174,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_476_DD2)
* Touch enough instruction cache lines to ensure cache hits
*/
1: mflr r9
- bl 2f
+ bcl 20,31,$+4
2: mflr r6
li r7,32
PPC_ICBT(0,R6,R7) /* touch next cache line */
@@ -239,7 +196,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_476_DD2)
blr
#endif /* CONFIG_PPC_47x */
-#elif defined(CONFIG_FSL_BOOKE)
+#elif defined(CONFIG_PPC_85xx)
/*
* FSL BookE implementations.
*
@@ -312,7 +269,7 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_TLBILX)
isync
1: wrtee r10
blr
-#elif defined(CONFIG_PPC_BOOK3E)
+#elif defined(CONFIG_PPC_BOOK3E_64)
/*
* New Book3E (>= 2.06) implementation
*
@@ -373,36 +330,26 @@ _GLOBAL(_tlbivax_bcast)
rlwimi r4,r6,MAS6_SIND_SHIFT,MAS6_SIND
1: mtspr SPRN_MAS6,r4 /* assume AS=0 for now */
PPC_TLBIVAX(0,R3)
- eieio
+ mbar
tlbsync
sync
wrtee r10
blr
-
-_GLOBAL(set_context)
-#ifdef CONFIG_BDI_SWITCH
- /* Context switch the PTE pointer for the Abatron BDI2000.
- * The PGDIR is the second parameter.
- */
- lis r5, abatron_pteptrs@h
- ori r5, r5, abatron_pteptrs@l
- stw r4, 0x4(r5)
-#endif
- mtspr SPRN_PID,r3
- isync /* Force context change */
- blr
#else
#error Unsupported processor type !
#endif
-#if defined(CONFIG_PPC_FSL_BOOK3E)
+#if defined(CONFIG_PPC_E500)
/*
* extern void loadcam_entry(unsigned int index)
*
* Load TLBCAM[index] entry in to the L2 CAM MMU
+ * Must preserve r7, r8, r9, r10, r11, r12
*/
_GLOBAL(loadcam_entry)
- LOAD_REG_ADDR(r4, TLBCAM)
+ mflr r5
+ LOAD_REG_ADDR_PIC(r4, TLBCAM)
+ mtlr r5
mulli r5,r3,TLBCAM_SIZE
add r3,r5,r4
lwz r4,TLBCAM_MAS0(r3)
@@ -421,4 +368,80 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS)
tlbwe
isync
blr
+
+/*
+ * Load multiple TLB entries at once, using an alternate-space
+ * trampoline so that we don't have to care about whether the same
+ * TLB entry maps us before and after.
+ *
+ * r3 = first entry to write
+ * r4 = number of entries to write
+ * r5 = temporary tlb entry (0 means no switch to AS1)
+ */
+_GLOBAL(loadcam_multi)
+ mflr r8
+ /* Don't switch to AS=1 if already there */
+ mfmsr r11
+ andi. r11,r11,MSR_IS
+ bne 10f
+ mr. r12, r5
+ beq 10f
+
+ /*
+ * Set up temporary TLB entry that is the same as what we're
+ * running from, but in AS=1.
+ */
+ bcl 20,31,$+4
+1: mflr r6
+ tlbsx 0,r8
+ mfspr r6,SPRN_MAS1
+ ori r6,r6,MAS1_TS
+ mtspr SPRN_MAS1,r6
+ mfspr r6,SPRN_MAS0
+ rlwimi r6,r5,MAS0_ESEL_SHIFT,MAS0_ESEL_MASK
+ mr r7,r5
+ mtspr SPRN_MAS0,r6
+ isync
+ tlbwe
+ isync
+
+ /* Switch to AS=1 */
+ mfmsr r6
+ ori r6,r6,MSR_IS|MSR_DS
+ mtmsr r6
+ isync
+
+10:
+ mr r9,r3
+ add r10,r3,r4
+2: bl loadcam_entry
+ addi r9,r9,1
+ cmpw r9,r10
+ mr r3,r9
+ blt 2b
+
+ /* Don't return to AS=0 if we were in AS=1 at function start */
+ andi. r11,r11,MSR_IS
+ bne 3f
+ cmpwi r12, 0
+ beq 3f
+
+ /* Return to AS=0 and clear the temporary entry */
+ mfmsr r6
+ rlwinm. r6,r6,0,~(MSR_IS|MSR_DS)
+ mtmsr r6
+ isync
+
+ li r6,0
+ mtspr SPRN_MAS1,r6
+ rlwinm r6,r7,MAS0_ESEL_SHIFT,MAS0_ESEL_MASK
+ oris r6,r6,MAS0_TLBSEL(1)@h
+ mtspr SPRN_MAS0,r6
+ isync
+ tlbwe
+ isync
+
+3:
+ mtlr r8
+ blr
#endif
diff --git a/arch/powerpc/mm/nohash/tlb_low_64e.S b/arch/powerpc/mm/nohash/tlb_low_64e.S
new file mode 100644
index 000000000000..de568297d5c5
--- /dev/null
+++ b/arch/powerpc/mm/nohash/tlb_low_64e.S
@@ -0,0 +1,743 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Low level TLB miss handlers for Book3E
+ *
+ * Copyright (C) 2008-2009
+ * Ben. Herrenschmidt (benh@kernel.crashing.org), IBM Corp.
+ */
+
+#include <linux/pgtable.h>
+#include <asm/processor.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/mmu.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/cputable.h>
+#include <asm/exception-64e.h>
+#include <asm/ppc-opcode.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_booke_hv_asm.h>
+#include <asm/feature-fixups.h>
+
+#define VPTE_PMD_SHIFT (PTE_INDEX_SIZE)
+#define VPTE_PUD_SHIFT (VPTE_PMD_SHIFT + PMD_INDEX_SIZE)
+#define VPTE_PGD_SHIFT (VPTE_PUD_SHIFT + PUD_INDEX_SIZE)
+#define VPTE_INDEX_SIZE (VPTE_PGD_SHIFT + PGD_INDEX_SIZE)
+
+/**********************************************************************
+ * *
+ * TLB miss handling for Book3E with a bolted linear mapping *
+ * No virtual page table, no nested TLB misses *
+ * *
+ **********************************************************************/
+
+/*
+ * Note that, unlike non-bolted handlers, TLB_EXFRAME is not
+ * modified by the TLB miss handlers themselves, since the TLB miss
+ * handler code will not itself cause a recursive TLB miss.
+ *
+ * TLB_EXFRAME will be modified when crit/mc/debug exceptions are
+ * entered/exited.
+ */
+.macro tlb_prolog_bolted intnum addr
+ mtspr SPRN_SPRG_GEN_SCRATCH,r12
+ mfspr r12,SPRN_SPRG_TLB_EXFRAME
+ std r13,EX_TLB_R13(r12)
+ std r10,EX_TLB_R10(r12)
+ mfspr r13,SPRN_SPRG_PACA
+
+ mfcr r10
+ std r11,EX_TLB_R11(r12)
+#ifdef CONFIG_KVM_BOOKE_HV
+BEGIN_FTR_SECTION
+ mfspr r11, SPRN_SRR1
+END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)
+#endif
+ DO_KVM \intnum, SPRN_SRR1
+ std r16,EX_TLB_R16(r12)
+ mfspr r16,\addr /* get faulting address */
+ std r14,EX_TLB_R14(r12)
+ ld r14,PACAPGD(r13)
+ std r15,EX_TLB_R15(r12)
+ std r10,EX_TLB_CR(r12)
+START_BTB_FLUSH_SECTION
+ mfspr r11, SPRN_SRR1
+ andi. r10,r11,MSR_PR
+ beq 1f
+ BTB_FLUSH(r10)
+1:
+END_BTB_FLUSH_SECTION
+ std r7,EX_TLB_R7(r12)
+.endm
+
+.macro tlb_epilog_bolted
+ ld r14,EX_TLB_CR(r12)
+ ld r7,EX_TLB_R7(r12)
+ ld r10,EX_TLB_R10(r12)
+ ld r11,EX_TLB_R11(r12)
+ ld r13,EX_TLB_R13(r12)
+ mtcr r14
+ ld r14,EX_TLB_R14(r12)
+ ld r15,EX_TLB_R15(r12)
+ ld r16,EX_TLB_R16(r12)
+ mfspr r12,SPRN_SPRG_GEN_SCRATCH
+.endm
+
+/* Data TLB miss */
+ START_EXCEPTION(data_tlb_miss_bolted)
+ tlb_prolog_bolted BOOKE_INTERRUPT_DTLB_MISS SPRN_DEAR
+
+ /* We need _PAGE_PRESENT and _PAGE_ACCESSED set */
+
+ /* We do the user/kernel test for the PID here along with the RW test
+ */
+ /* We pre-test some combination of permissions to avoid double
+ * faults:
+ *
+ * We move the ESR:ST bit into the position of _PAGE_BAP_SW in the PTE
+ * ESR_ST is 0x00800000
+ * _PAGE_BAP_SW is 0x00000010
+ * So the shift is >> 19. This tests for supervisor writeability.
+ * If the page happens to be supervisor writeable and not user
+ * writeable, we will take a new fault later, but that should be
+ * a rare enough case.
+ *
+ * We also move ESR_ST in _PAGE_DIRTY position
+ * _PAGE_DIRTY is 0x00001000 so the shift is >> 11
+ *
+ * MAS1 is preset for all we need except for TID that needs to
+ * be cleared for kernel translations
+ */
+
+ mfspr r11,SPRN_ESR
+
+ srdi r15,r16,60 /* get region */
+ rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4
+ bne- dtlb_miss_fault_bolted /* Bail if fault addr is invalid */
+
+ rlwinm r10,r11,32-19,27,27
+ rlwimi r10,r11,32-16,19,19
+ cmpwi r15,0 /* user vs kernel check */
+ ori r10,r10,_PAGE_PRESENT
+ oris r11,r10,_PAGE_ACCESSED@h
+
+ bne tlb_miss_kernel_bolted
+
+tlb_miss_user_bolted:
+#ifdef CONFIG_PPC_KUAP
+ mfspr r10,SPRN_MAS1
+ rlwinm. r10,r10,0,0x3fff0000
+ beq- tlb_miss_fault_bolted /* KUAP fault */
+#endif
+
+tlb_miss_common_bolted:
+/*
+ * This is the guts of the TLB miss handler for bolted-linear.
+ * We are entered with:
+ *
+ * r16 = faulting address
+ * r15 = crap (free to use)
+ * r14 = page table base
+ * r13 = PACA
+ * r11 = PTE permission mask
+ * r10 = crap (free to use)
+ */
+ rldicl r15,r16,64-PGDIR_SHIFT+3,64-PGD_INDEX_SIZE-3
+ cmpldi cr0,r14,0
+ clrrdi r15,r15,3
+ beq tlb_miss_fault_bolted /* No PGDIR, bail */
+
+ ldx r14,r14,r15 /* grab pgd entry */
+
+ rldicl r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3
+ clrrdi r15,r15,3
+ cmpdi cr0,r14,0
+ bge tlb_miss_fault_bolted /* Bad pgd entry or hugepage; bail */
+ ldx r14,r14,r15 /* grab pud entry */
+
+ rldicl r15,r16,64-PMD_SHIFT+3,64-PMD_INDEX_SIZE-3
+ clrrdi r15,r15,3
+ cmpdi cr0,r14,0
+ bge tlb_miss_fault_bolted
+ ldx r14,r14,r15 /* Grab pmd entry */
+
+ rldicl r15,r16,64-PAGE_SHIFT+3,64-PTE_INDEX_SIZE-3
+ clrrdi r15,r15,3
+ cmpdi cr0,r14,0
+ bge tlb_miss_fault_bolted
+ ldx r14,r14,r15 /* Grab PTE, normal (!huge) page */
+
+ /* Check if required permissions are met */
+ andc. r15,r11,r14
+ rldicr r15,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT
+ bne- tlb_miss_fault_bolted
+
+ /* Now we build the MAS:
+ *
+ * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG
+ * MAS 1 : Almost fully setup
+ * - PID already updated by caller if necessary
+ * - TSIZE need change if !base page size, not
+ * yet implemented for now
+ * MAS 2 : Defaults not useful, need to be redone
+ * MAS 3+7 : Needs to be done
+ */
+ clrrdi r11,r16,12 /* Clear low crap in EA */
+ clrldi r15,r15,12 /* Clear crap at the top */
+ rlwimi r11,r14,32-19,27,31 /* Insert WIMGE */
+ rlwimi r15,r14,32-8,22,25 /* Move in U bits */
+ mtspr SPRN_MAS2,r11
+ andi. r11,r14,_PAGE_DIRTY
+ rlwimi r15,r14,32-2,26,31 /* Move in BAP bits */
+
+ /* Mask out SW and UW if !DIRTY (XXX optimize this !) */
+ bne 1f
+ li r11,MAS3_SW|MAS3_UW
+ andc r15,r15,r11
+1:
+ mtspr SPRN_MAS7_MAS3,r15
+ tlbwe
+
+tlb_miss_done_bolted:
+ tlb_epilog_bolted
+ rfi
+
+itlb_miss_kernel_bolted:
+ li r11,_PAGE_PRESENT|_PAGE_BAP_SX /* Base perm */
+ oris r11,r11,_PAGE_ACCESSED@h
+tlb_miss_kernel_bolted:
+ mfspr r10,SPRN_MAS1
+ ld r14,PACA_KERNELPGD(r13)
+ srdi r15,r16,44 /* get kernel region */
+ andi. r15,r15,1 /* Check for vmalloc region */
+ rlwinm r10,r10,0,16,1 /* Clear TID */
+ mtspr SPRN_MAS1,r10
+ bne+ tlb_miss_common_bolted
+
+tlb_miss_fault_bolted:
+ /* We need to check if it was an instruction miss */
+ andi. r10,r11,_PAGE_BAP_UX|_PAGE_BAP_SX
+ bne itlb_miss_fault_bolted
+dtlb_miss_fault_bolted:
+ tlb_epilog_bolted
+ b exc_data_storage_book3e
+itlb_miss_fault_bolted:
+ tlb_epilog_bolted
+ b exc_instruction_storage_book3e
+
+/* Instruction TLB miss */
+ START_EXCEPTION(instruction_tlb_miss_bolted)
+ tlb_prolog_bolted BOOKE_INTERRUPT_ITLB_MISS SPRN_SRR0
+
+ rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4
+ srdi r15,r16,60 /* get region */
+ bne- itlb_miss_fault_bolted
+
+ li r11,_PAGE_PRESENT|_PAGE_BAP_UX /* Base perm */
+
+ /* We do the user/kernel test for the PID here along with the RW test
+ */
+
+ cmpldi cr0,r15,0 /* Check for user region */
+ oris r11,r11,_PAGE_ACCESSED@h
+ beq tlb_miss_user_bolted
+ b itlb_miss_kernel_bolted
+
+/*
+ * TLB miss handling for e6500 and derivatives, using hardware tablewalk.
+ *
+ * Linear mapping is bolted: no virtual page table or nested TLB misses
+ * Indirect entries in TLB1, hardware loads resulting direct entries
+ * into TLB0
+ * No HES or NV hint on TLB1, so we need to do software round-robin
+ * No tlbsrx. so we need a spinlock, and we have to deal
+ * with MAS-damage caused by tlbsx
+ * 4K pages only
+ */
+
+ START_EXCEPTION(instruction_tlb_miss_e6500)
+ tlb_prolog_bolted BOOKE_INTERRUPT_ITLB_MISS SPRN_SRR0
+
+ ld r11,PACA_TCD_PTR(r13)
+ srdi. r15,r16,60 /* get region */
+ ori r16,r16,1
+
+ bne tlb_miss_kernel_e6500 /* user/kernel test */
+
+ b tlb_miss_common_e6500
+
+ START_EXCEPTION(data_tlb_miss_e6500)
+ tlb_prolog_bolted BOOKE_INTERRUPT_DTLB_MISS SPRN_DEAR
+
+ ld r11,PACA_TCD_PTR(r13)
+ srdi. r15,r16,60 /* get region */
+ rldicr r16,r16,0,62
+
+ bne tlb_miss_kernel_e6500 /* user vs kernel check */
+
+/*
+ * This is the guts of the TLB miss handler for e6500 and derivatives.
+ * We are entered with:
+ *
+ * r16 = page of faulting address (low bit 0 if data, 1 if instruction)
+ * r15 = crap (free to use)
+ * r14 = page table base
+ * r13 = PACA
+ * r11 = tlb_per_core ptr
+ * r10 = crap (free to use)
+ * r7 = esel_next
+ */
+tlb_miss_common_e6500:
+ crmove cr2*4+2,cr0*4+2 /* cr2.eq != 0 if kernel address */
+
+BEGIN_FTR_SECTION /* CPU_FTR_SMT */
+ /*
+ * Search if we already have an indirect entry for that virtual
+ * address, and if we do, bail out.
+ *
+ * MAS6:IND should be already set based on MAS4
+ */
+ lhz r10,PACAPACAINDEX(r13)
+ addi r10,r10,1
+ crclr cr1*4+eq /* set cr1.eq = 0 for non-recursive */
+1: lbarx r15,0,r11
+ cmpdi r15,0
+ bne 2f
+ stbcx. r10,0,r11
+ bne 1b
+3:
+ .subsection 1
+2: cmpd cr1,r15,r10 /* recursive lock due to mcheck/crit/etc? */
+ beq cr1,3b /* unlock will happen if cr1.eq = 0 */
+10: lbz r15,0(r11)
+ cmpdi r15,0
+ bne 10b
+ b 1b
+ .previous
+END_FTR_SECTION_IFSET(CPU_FTR_SMT)
+
+ lbz r7,TCD_ESEL_NEXT(r11)
+
+BEGIN_FTR_SECTION /* CPU_FTR_SMT */
+ /*
+ * Erratum A-008139 says that we can't use tlbwe to change
+ * an indirect entry in any way (including replacing or
+ * invalidating) if the other thread could be in the process
+ * of a lookup. The workaround is to invalidate the entry
+ * with tlbilx before overwriting.
+ */
+
+ rlwinm r10,r7,16,0xff0000
+ oris r10,r10,MAS0_TLBSEL(1)@h
+ mtspr SPRN_MAS0,r10
+ isync
+ tlbre
+ mfspr r15,SPRN_MAS1
+ andis. r15,r15,MAS1_VALID@h
+ beq 5f
+
+BEGIN_FTR_SECTION_NESTED(532)
+ mfspr r10,SPRN_MAS8
+ rlwinm r10,r10,0,0x80000fff /* tgs,tlpid -> sgs,slpid */
+ mtspr SPRN_MAS5,r10
+END_FTR_SECTION_NESTED(CPU_FTR_EMB_HV,CPU_FTR_EMB_HV,532)
+
+ mfspr r10,SPRN_MAS1
+ rlwinm r15,r10,0,0x3fff0000 /* tid -> spid */
+ rlwimi r15,r10,20,0x00000003 /* ind,ts -> sind,sas */
+ mfspr r10,SPRN_MAS6
+ mtspr SPRN_MAS6,r15
+
+ mfspr r15,SPRN_MAS2
+ isync
+ PPC_TLBILX_VA(0,R15)
+ isync
+
+ mtspr SPRN_MAS6,r10
+
+5:
+BEGIN_FTR_SECTION_NESTED(532)
+ li r10,0
+ mtspr SPRN_MAS8,r10
+ mtspr SPRN_MAS5,r10
+END_FTR_SECTION_NESTED(CPU_FTR_EMB_HV,CPU_FTR_EMB_HV,532)
+
+ tlbsx 0,r16
+ mfspr r10,SPRN_MAS1
+ andis. r15,r10,MAS1_VALID@h
+ bne tlb_miss_done_e6500
+FTR_SECTION_ELSE
+ mfspr r10,SPRN_MAS1
+ALT_FTR_SECTION_END_IFSET(CPU_FTR_SMT)
+
+ oris r10,r10,MAS1_VALID@h
+ beq cr2,4f
+ rlwinm r10,r10,0,16,1 /* Clear TID */
+4: mtspr SPRN_MAS1,r10
+
+ /* Now, we need to walk the page tables. First check if we are in
+ * range.
+ */
+ rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4
+ bne- tlb_miss_fault_e6500
+
+ rldicl r15,r16,64-PGDIR_SHIFT+3,64-PGD_INDEX_SIZE-3
+ cmpldi cr0,r14,0
+ clrrdi r15,r15,3
+ beq- tlb_miss_fault_e6500 /* No PGDIR, bail */
+ ldx r14,r14,r15 /* grab pgd entry */
+
+ rldicl r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3
+ clrrdi r15,r15,3
+ cmpdi cr0,r14,0
+ bge tlb_miss_huge_e6500 /* Bad pgd entry or hugepage; bail */
+ ldx r14,r14,r15 /* grab pud entry */
+
+ rldicl r15,r16,64-PMD_SHIFT+3,64-PMD_INDEX_SIZE-3
+ clrrdi r15,r15,3
+ cmpdi cr0,r14,0
+ bge tlb_miss_huge_e6500
+ ldx r14,r14,r15 /* Grab pmd entry */
+
+ mfspr r10,SPRN_MAS0
+ cmpdi cr0,r14,0
+ bge tlb_miss_huge_e6500
+
+ /* Now we build the MAS for a 2M indirect page:
+ *
+ * MAS 0 : ESEL needs to be filled by software round-robin
+ * MAS 1 : Fully set up
+ * - PID already updated by caller if necessary
+ * - TSIZE for now is base ind page size always
+ * - TID already cleared if necessary
+ * MAS 2 : Default not 2M-aligned, need to be redone
+ * MAS 3+7 : Needs to be done
+ */
+
+ ori r14,r14,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT)
+ mtspr SPRN_MAS7_MAS3,r14
+
+ clrrdi r15,r16,21 /* make EA 2M-aligned */
+ mtspr SPRN_MAS2,r15
+
+tlb_miss_huge_done_e6500:
+ lbz r16,TCD_ESEL_MAX(r11)
+ lbz r14,TCD_ESEL_FIRST(r11)
+ rlwimi r10,r7,16,0x00ff0000 /* insert esel_next into MAS0 */
+ addi r7,r7,1 /* increment esel_next */
+ mtspr SPRN_MAS0,r10
+ cmpw r7,r16
+ iseleq r7,r14,r7 /* if next == last use first */
+ stb r7,TCD_ESEL_NEXT(r11)
+
+ tlbwe
+
+tlb_miss_done_e6500:
+ .macro tlb_unlock_e6500
+BEGIN_FTR_SECTION
+ beq cr1,1f /* no unlock if lock was recursively grabbed */
+ li r15,0
+ isync
+ stb r15,0(r11)
+1:
+END_FTR_SECTION_IFSET(CPU_FTR_SMT)
+ .endm
+
+ tlb_unlock_e6500
+ tlb_epilog_bolted
+ rfi
+
+tlb_miss_huge_e6500:
+ beq tlb_miss_fault_e6500
+ rlwinm r15,r14,32-_PAGE_PSIZE_SHIFT,0x1e
+
+ /*
+ * Now we build the MAS for a huge page.
+ *
+ * MAS 0 : ESEL needs to be filled by software round-robin
+ * - can be handled by indirect code
+ * MAS 1 : Need to clear IND and set TSIZE
+ * MAS 2,3+7: Needs to be redone similar to non-tablewalk handler
+ */
+
+ mfspr r10,SPRN_MAS1
+ rlwinm r10,r10,0,~MAS1_IND
+ rlwimi r10,r15,MAS1_TSIZE_SHIFT,MAS1_TSIZE_MASK
+ mtspr SPRN_MAS1,r10
+
+ li r10,-0x400
+ sld r15,r10,r15 /* Generate mask based on size */
+ and r10,r16,r15
+ rldicr r15,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT
+ rlwimi r10,r14,32-19,27,31 /* Insert WIMGE */
+ clrldi r15,r15,PAGE_SHIFT /* Clear crap at the top */
+ rlwimi r15,r14,32-8,22,25 /* Move in U bits */
+ mtspr SPRN_MAS2,r10
+ andi. r10,r14,_PAGE_DIRTY
+ rlwimi r15,r14,32-2,26,31 /* Move in BAP bits */
+
+ /* Mask out SW and UW if !DIRTY (XXX optimize this !) */
+ bne 1f
+ li r10,MAS3_SW|MAS3_UW
+ andc r15,r15,r10
+1:
+ mtspr SPRN_MAS7_MAS3,r15
+
+ mfspr r10,SPRN_MAS0
+ b tlb_miss_huge_done_e6500
+
+tlb_miss_kernel_e6500:
+ ld r14,PACA_KERNELPGD(r13)
+ srdi r15,r16,44 /* get kernel region */
+ xoris r15,r15,0xc /* Check for vmalloc region */
+ cmplwi cr1,r15,1
+ beq+ cr1,tlb_miss_common_e6500
+
+tlb_miss_fault_e6500:
+ tlb_unlock_e6500
+ /* We need to check if it was an instruction miss */
+ andi. r16,r16,1
+ bne itlb_miss_fault_e6500
+dtlb_miss_fault_e6500:
+ tlb_epilog_bolted
+ b exc_data_storage_book3e
+itlb_miss_fault_e6500:
+ tlb_epilog_bolted
+ b exc_instruction_storage_book3e
+
+/*
+ * This is the guts of the second-level TLB miss handler for direct
+ * misses. We are entered with:
+ *
+ * r16 = virtual page table faulting address
+ * r15 = region (top 4 bits of address)
+ * r14 = crap (free to use)
+ * r13 = PACA
+ * r12 = TLB exception frame in PACA
+ * r11 = crap (free to use)
+ * r10 = crap (free to use)
+ *
+ * Note that this should only ever be called as a second level handler
+ * with the current scheme when using SW load.
+ * That means we can always get the original fault DEAR at
+ * EX_TLB_DEAR-EX_TLB_SIZE(r12)
+ *
+ * It can be re-entered by the linear mapping miss handler. However, to
+ * avoid too much complication, it will restart the whole fault at level
+ * 0 so we don't care too much about clobbers
+ *
+ * XXX That code was written back when we couldn't clobber r14. We can now,
+ * so we could probably optimize things a bit
+ */
+virt_page_table_tlb_miss:
+ /* Are we hitting a kernel page table ? */
+ srdi r15,r16,60
+ andi. r10,r15,0x8
+
+ /* The cool thing now is that r10 contains 0 for user and 8 for kernel,
+ * and we happen to have the swapper_pg_dir at offset 8 from the user
+ * pgdir in the PACA :-).
+ */
+ add r11,r10,r13
+
+ /* If kernel, we need to clear MAS1 TID */
+ beq 1f
+ /* XXX replace the RMW cycles with immediate loads + writes */
+ mfspr r10,SPRN_MAS1
+ rlwinm r10,r10,0,16,1 /* Clear TID */
+ mtspr SPRN_MAS1,r10
+#ifdef CONFIG_PPC_KUAP
+ b 2f
+1:
+ mfspr r10,SPRN_MAS1
+ rlwinm. r10,r10,0,0x3fff0000
+ beq- virt_page_table_tlb_miss_fault /* KUAP fault */
+2:
+#else
+1:
+#endif
+
+ /* Now, we need to walk the page tables. First check if we are in
+ * range.
+ */
+ rldicl r10,r16,64-(VPTE_INDEX_SIZE+3),VPTE_INDEX_SIZE+3+4
+ cmpldi r10,0x80
+ bne- virt_page_table_tlb_miss_fault
+
+ /* Get the PGD pointer */
+ ld r15,PACAPGD(r11)
+ cmpldi cr0,r15,0
+ beq- virt_page_table_tlb_miss_fault
+
+ /* Get to PGD entry */
+ rldicl r11,r16,64-VPTE_PGD_SHIFT,64-PGD_INDEX_SIZE-3
+ clrrdi r10,r11,3
+ ldx r15,r10,r15
+ cmpdi cr0,r15,0
+ bge virt_page_table_tlb_miss_fault
+
+ /* Get to PUD entry */
+ rldicl r11,r16,64-VPTE_PUD_SHIFT,64-PUD_INDEX_SIZE-3
+ clrrdi r10,r11,3
+ ldx r15,r10,r15
+ cmpdi cr0,r15,0
+ bge virt_page_table_tlb_miss_fault
+
+ /* Get to PMD entry */
+ rldicl r11,r16,64-VPTE_PMD_SHIFT,64-PMD_INDEX_SIZE-3
+ clrrdi r10,r11,3
+ ldx r15,r10,r15
+ cmpdi cr0,r15,0
+ bge virt_page_table_tlb_miss_fault
+
+ /* Ok, we're all right, we can now create a kernel translation for
+ * a 4K or 64K page from r16 -> r15.
+ */
+ /* Now we build the MAS:
+ *
+ * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG
+ * MAS 1 : Almost fully setup
+ * - PID already updated by caller if necessary
+ * - TSIZE for now is base page size always
+ * MAS 2 : Use defaults
+ * MAS 3+7 : Needs to be done
+ *
+ * So we only do MAS 2 and 3 for now...
+ */
+ clrldi r11,r15,4 /* remove region ID from RPN */
+ ori r10,r11,1 /* Or-in SR */
+
+ srdi r16,r10,32
+ mtspr SPRN_MAS3,r10
+ mtspr SPRN_MAS7,r16
+
+ tlbwe
+
+ /* Return to caller, normal case */
+ TLB_MISS_EPILOG_SUCCESS
+ rfi
+
+virt_page_table_tlb_miss_fault:
+ /* If we fault here, things are a little bit tricky. We need to call
+ * either data or instruction store fault, and we need to retrieve
+ * the original fault address and ESR (for data).
+ *
+ * The thing is, we know that in normal circumstances, this is
+ * always called as a second level tlb miss for SW load or as a first
+ * level TLB miss for HW load, so we should be able to peek at the
+ * relevant information in the first exception frame in the PACA.
+ *
+ * However, we do need to double check that, because we may just hit
+ * a stray kernel pointer or a userland attack trying to hit those
+ * areas. If that is the case, we do a data fault. (We can't get here
+ * from an instruction tlb miss anyway).
+ *
+ * Note also that when going to a fault, we must unwind the previous
+ * level as well. Since we are doing that, we don't need to clear or
+ * restore the TLB reservation neither.
+ */
+ subf r10,r13,r12
+ cmpldi cr0,r10,PACA_EXTLB+EX_TLB_SIZE
+ bne- virt_page_table_tlb_miss_whacko_fault
+
+ /* We dig the original DEAR and ESR from slot 0 */
+ ld r15,EX_TLB_DEAR+PACA_EXTLB(r13)
+ ld r16,EX_TLB_ESR+PACA_EXTLB(r13)
+
+ /* We check for the "special" ESR value for instruction faults */
+ cmpdi cr0,r16,-1
+ beq 1f
+ mtspr SPRN_DEAR,r15
+ mtspr SPRN_ESR,r16
+ TLB_MISS_EPILOG_ERROR
+ b exc_data_storage_book3e
+1: TLB_MISS_EPILOG_ERROR
+ b exc_instruction_storage_book3e
+
+virt_page_table_tlb_miss_whacko_fault:
+ /* The linear fault will restart everything so ESR and DEAR will
+ * not have been clobbered, let's just fault with what we have
+ */
+ TLB_MISS_EPILOG_ERROR
+ b exc_data_storage_book3e
+
+/*
+ * This is the guts of "any" level TLB miss handler for kernel linear
+ * mapping misses. We are entered with:
+ *
+ *
+ * r16 = faulting address
+ * r15 = crap (free to use)
+ * r14 = ESR (data) or -1 (instruction)
+ * r13 = PACA
+ * r12 = TLB exception frame in PACA
+ * r11 = crap (free to use)
+ * r10 = crap (free to use)
+ *
+ * In addition we know that we will not re-enter, so in theory, we could
+ * use a simpler epilog not restoring SRR0/1 etc.. but we'll do that later.
+ *
+ * We also need to be careful about MAS registers here & TLB reservation,
+ * as we know we'll have clobbered them if we interrupt the main TLB miss
+ * handlers in which case we probably want to do a full restart at level
+ * 0 rather than saving / restoring the MAS.
+ *
+ * Note: If we care about performance of that core, we can easily shuffle
+ * a few things around
+ */
+tlb_load_linear:
+ /* For now, we assume the linear mapping is contiguous and stops at
+ * linear_map_top. We also assume the size is a multiple of 1G, thus
+ * we only use 1G pages for now. That might have to be changed in a
+ * final implementation, especially when dealing with hypervisors
+ */
+ __LOAD_PACA_TOC(r11)
+ LOAD_REG_ADDR_ALTTOC(r11, r11, linear_map_top)
+ ld r10,0(r11)
+ tovirt(10,10)
+ cmpld cr0,r16,r10
+ bge tlb_load_linear_fault
+
+ /* MAS1 need whole new setup. */
+ li r15,(BOOK3E_PAGESZ_1GB<<MAS1_TSIZE_SHIFT)
+ oris r15,r15,MAS1_VALID@h /* MAS1 needs V and TSIZE */
+ mtspr SPRN_MAS1,r15
+
+ /* Already somebody there ? */
+ PPC_TLBSRX_DOT(0,R16)
+ beq tlb_load_linear_done
+
+ /* Now we build the remaining MAS. MAS0 and 2 should be fine
+ * with their defaults, which leaves us with MAS 3 and 7. The
+ * mapping is linear, so we just take the address, clear the
+ * region bits, and or in the permission bits which are currently
+ * hard wired
+ */
+ clrrdi r10,r16,30 /* 1G page index */
+ clrldi r10,r10,4 /* clear region bits */
+ ori r10,r10,MAS3_SR|MAS3_SW|MAS3_SX
+
+ srdi r16,r10,32
+ mtspr SPRN_MAS3,r10
+ mtspr SPRN_MAS7,r16
+
+ tlbwe
+
+tlb_load_linear_done:
+ /* We use the "error" epilog for success as we do want to
+ * restore to the initial faulting context, whatever it was.
+ * We do that because we can't resume a fault within a TLB
+ * miss handler, due to MAS and TLB reservation being clobbered.
+ */
+ TLB_MISS_EPILOG_ERROR
+ rfi
+
+tlb_load_linear_fault:
+ /* We keep the DEAR and ESR around, this shouldn't have happened */
+ cmpdi cr0,r14,-1
+ beq 1f
+ TLB_MISS_EPILOG_ERROR_SPECIAL
+ b exc_data_storage_book3e
+1: TLB_MISS_EPILOG_ERROR_SPECIAL
+ b exc_instruction_storage_book3e
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 08397217e8ac..603a0f652ba6 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1,15 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* pSeries NUMA support
*
* Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
+#define pr_fmt(fmt) "numa: " fmt
+
#include <linux/threads.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
@@ -17,8 +15,8 @@
#include <linux/nodemask.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
-#include <linux/memblock.h>
#include <linux/of.h>
+#include <linux/of_address.h>
#include <linux/pfn.h>
#include <linux/cpuset.h>
#include <linux/node.h>
@@ -27,38 +25,44 @@
#include <linux/seq_file.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
+#include <asm/cputhreads.h>
#include <asm/sparsemem.h>
-#include <asm/prom.h>
#include <asm/smp.h>
+#include <asm/topology.h>
#include <asm/firmware.h>
#include <asm/paca.h>
#include <asm/hvcall.h>
#include <asm/setup.h>
#include <asm/vdso.h>
+#include <asm/vphn.h>
+#include <asm/drmem.h>
static int numa_enabled = 1;
static char *cmdline __initdata;
-static int numa_debug;
-#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }
-
int numa_cpu_lookup_table[NR_CPUS];
cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
-struct pglist_data *node_data[MAX_NUMNODES];
EXPORT_SYMBOL(numa_cpu_lookup_table);
EXPORT_SYMBOL(node_to_cpumask_map);
-EXPORT_SYMBOL(node_data);
-static int min_common_depth;
+static int primary_domain_index;
static int n_mem_addr_cells, n_mem_size_cells;
-static int form1_affinity;
+
+#define FORM0_AFFINITY 0
+#define FORM1_AFFINITY 1
+#define FORM2_AFFINITY 2
+static int affinity_form;
#define MAX_DISTANCE_REF_POINTS 4
static int distance_ref_points_depth;
-static const unsigned int *distance_ref_points;
+static const __be32 *distance_ref_points;
static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
+static int numa_distance_table[MAX_NUMNODES][MAX_NUMNODES] = {
+ [0 ... MAX_NUMNODES - 1] = { [0 ... MAX_NUMNODES - 1] = -1 }
+};
+static int numa_id_index_table[MAX_NUMNODES] = { [0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE };
/*
* Allocate node_to_cpumask_map based on number of available nodes
@@ -75,11 +79,11 @@ static void __init setup_node_to_cpumask_map(void)
setup_nr_node_ids();
/* allocate the map */
- for (node = 0; node < nr_node_ids; node++)
+ for_each_node(node)
alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
/* cpumask_of_node() will now work */
- dbg("Node to cpumask map for %d nodes\n", nr_node_ids);
+ pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
}
static int __init fake_numa_create_new_node(unsigned long end_pfn,
@@ -123,145 +127,152 @@ static int __init fake_numa_create_new_node(unsigned long end_pfn,
cmdline = p;
fake_nid++;
*nid = fake_nid;
- dbg("created new fake_node with id %d\n", fake_nid);
+ pr_debug("created new fake_node with id %d\n", fake_nid);
return 1;
}
return 0;
}
-/*
- * get_node_active_region - Return active region containing pfn
- * Active range returned is empty if none found.
- * @pfn: The page to return the region for
- * @node_ar: Returned set to the active region containing @pfn
- */
-static void __init get_node_active_region(unsigned long pfn,
- struct node_active_region *node_ar)
+static void __init reset_numa_cpu_lookup_table(void)
{
- unsigned long start_pfn, end_pfn;
- int i, nid;
+ unsigned int cpu;
- for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
- if (pfn >= start_pfn && pfn < end_pfn) {
- node_ar->nid = nid;
- node_ar->start_pfn = start_pfn;
- node_ar->end_pfn = end_pfn;
- break;
- }
- }
+ for_each_possible_cpu(cpu)
+ numa_cpu_lookup_table[cpu] = -1;
}
-static void map_cpu_to_node(int cpu, int node)
+void map_cpu_to_node(int cpu, int node)
{
- numa_cpu_lookup_table[cpu] = node;
+ update_numa_cpu_lookup_table(cpu, node);
- dbg("adding cpu %d to node %d\n", cpu, node);
-
- if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node])))
+ if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node]))) {
+ pr_debug("adding cpu %d to node %d\n", cpu, node);
cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
+ }
}
#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR)
-static void unmap_cpu_from_node(unsigned long cpu)
+void unmap_cpu_from_node(unsigned long cpu)
{
int node = numa_cpu_lookup_table[cpu];
- dbg("removing cpu %lu from node %d\n", cpu, node);
-
if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) {
cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
+ pr_debug("removing cpu %lu from node %d\n", cpu, node);
} else {
- printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
- cpu, node);
+ pr_warn("Warning: cpu %lu not found in node %d\n", cpu, node);
}
}
#endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
-/* must hold reference to node during call */
-static const int *of_get_associativity(struct device_node *dev)
+static int __associativity_to_nid(const __be32 *associativity,
+ int max_array_sz)
{
- return of_get_property(dev, "ibm,associativity", NULL);
-}
+ int nid;
+ /*
+ * primary_domain_index is 1 based array index.
+ */
+ int index = primary_domain_index - 1;
+ if (!numa_enabled || index >= max_array_sz)
+ return NUMA_NO_NODE;
+
+ nid = of_read_number(&associativity[index], 1);
+
+ /* POWER4 LPAR uses 0xffff as invalid node */
+ if (nid == 0xffff || nid >= nr_node_ids)
+ nid = NUMA_NO_NODE;
+ return nid;
+}
/*
- * Returns the property linux,drconf-usable-memory if
- * it exists (the property exists only in kexec/kdump kernels,
- * added by kexec-tools)
+ * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
+ * info is found.
*/
-static const u32 *of_get_usable_memory(struct device_node *memory)
+static int associativity_to_nid(const __be32 *associativity)
{
- const u32 *prop;
- u32 len;
- prop = of_get_property(memory, "linux,drconf-usable-memory", &len);
- if (!prop || len < sizeof(unsigned int))
+ int array_sz = of_read_number(associativity, 1);
+
+ /* Skip the first element in the associativity array */
+ return __associativity_to_nid((associativity + 1), array_sz);
+}
+
+static int __cpu_form2_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
+{
+ int dist;
+ int node1, node2;
+
+ node1 = associativity_to_nid(cpu1_assoc);
+ node2 = associativity_to_nid(cpu2_assoc);
+
+ dist = numa_distance_table[node1][node2];
+ if (dist <= LOCAL_DISTANCE)
return 0;
- return prop;
+ else if (dist <= REMOTE_DISTANCE)
+ return 1;
+ else
+ return 2;
}
-int __node_distance(int a, int b)
+static int __cpu_form1_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
{
- int i;
- int distance = LOCAL_DISTANCE;
+ int dist = 0;
- if (!form1_affinity)
- return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
+ int i, index;
for (i = 0; i < distance_ref_points_depth; i++) {
- if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
+ index = be32_to_cpu(distance_ref_points[i]);
+ if (cpu1_assoc[index] == cpu2_assoc[index])
break;
-
- /* Double the distance for each NUMA level */
- distance *= 2;
+ dist++;
}
- return distance;
+ return dist;
}
-static void initialize_distance_lookup_table(int nid,
- const unsigned int *associativity)
+int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
{
- int i;
-
- if (!form1_affinity)
- return;
-
- for (i = 0; i < distance_ref_points_depth; i++) {
- distance_lookup_table[nid][i] =
- associativity[distance_ref_points[i]];
- }
+ /* We should not get called with FORM0 */
+ VM_WARN_ON(affinity_form == FORM0_AFFINITY);
+ if (affinity_form == FORM1_AFFINITY)
+ return __cpu_form1_relative_distance(cpu1_assoc, cpu2_assoc);
+ return __cpu_form2_relative_distance(cpu1_assoc, cpu2_assoc);
}
-/* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
- * info is found.
- */
-static int associativity_to_nid(const unsigned int *associativity)
+/* must hold reference to node during call */
+static const __be32 *of_get_associativity(struct device_node *dev)
{
- int nid = -1;
+ return of_get_property(dev, "ibm,associativity", NULL);
+}
- if (min_common_depth == -1)
- goto out;
+int __node_distance(int a, int b)
+{
+ int i;
+ int distance = LOCAL_DISTANCE;
- if (associativity[0] >= min_common_depth)
- nid = associativity[min_common_depth];
+ if (affinity_form == FORM2_AFFINITY)
+ return numa_distance_table[a][b];
+ else if (affinity_form == FORM0_AFFINITY)
+ return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
- /* POWER4 LPAR uses 0xffff as invalid node */
- if (nid == 0xffff || nid >= MAX_NUMNODES)
- nid = -1;
+ for (i = 0; i < distance_ref_points_depth; i++) {
+ if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
+ break;
- if (nid > 0 && associativity[0] >= distance_ref_points_depth)
- initialize_distance_lookup_table(nid, associativity);
+ /* Double the distance for each NUMA level */
+ distance *= 2;
+ }
-out:
- return nid;
+ return distance;
}
+EXPORT_SYMBOL(__node_distance);
/* Returns the nid associated with the given device tree node,
* or -1 if not found.
*/
static int of_node_to_nid_single(struct device_node *device)
{
- int nid = -1;
- const unsigned int *tmp;
+ int nid = NUMA_NO_NODE;
+ const __be32 *tmp;
tmp = of_get_associativity(device);
if (tmp)
@@ -272,8 +283,7 @@ static int of_node_to_nid_single(struct device_node *device)
/* Walk the device tree upwards, looking for an associativity id */
int of_node_to_nid(struct device_node *device)
{
- struct device_node *tmp;
- int nid = -1;
+ int nid = NUMA_NO_NODE;
of_node_get(device);
while (device) {
@@ -281,20 +291,163 @@ int of_node_to_nid(struct device_node *device)
if (nid != -1)
break;
- tmp = device;
- device = of_get_parent(tmp);
- of_node_put(tmp);
+ device = of_get_next_parent(device);
}
of_node_put(device);
return nid;
}
-EXPORT_SYMBOL_GPL(of_node_to_nid);
+EXPORT_SYMBOL(of_node_to_nid);
-static int __init find_min_common_depth(void)
+static void __initialize_form1_numa_distance(const __be32 *associativity,
+ int max_array_sz)
{
- int depth;
+ int i, nid;
+
+ if (affinity_form != FORM1_AFFINITY)
+ return;
+
+ nid = __associativity_to_nid(associativity, max_array_sz);
+ if (nid != NUMA_NO_NODE) {
+ for (i = 0; i < distance_ref_points_depth; i++) {
+ const __be32 *entry;
+ int index = be32_to_cpu(distance_ref_points[i]) - 1;
+
+ /*
+ * broken hierarchy, return with broken distance table
+ */
+ if (WARN(index >= max_array_sz, "Broken ibm,associativity property"))
+ return;
+
+ entry = &associativity[index];
+ distance_lookup_table[nid][i] = of_read_number(entry, 1);
+ }
+ }
+}
+
+static void initialize_form1_numa_distance(const __be32 *associativity)
+{
+ int array_sz;
+
+ array_sz = of_read_number(associativity, 1);
+ /* Skip the first element in the associativity array */
+ __initialize_form1_numa_distance(associativity + 1, array_sz);
+}
+
+/*
+ * Used to update distance information w.r.t newly added node.
+ */
+void update_numa_distance(struct device_node *node)
+{
+ int nid;
+
+ if (affinity_form == FORM0_AFFINITY)
+ return;
+ else if (affinity_form == FORM1_AFFINITY) {
+ const __be32 *associativity;
+
+ associativity = of_get_associativity(node);
+ if (!associativity)
+ return;
+
+ initialize_form1_numa_distance(associativity);
+ return;
+ }
+
+ /* FORM2 affinity */
+ nid = of_node_to_nid_single(node);
+ if (nid == NUMA_NO_NODE)
+ return;
+
+ /*
+ * With FORM2 we expect NUMA distance of all possible NUMA
+ * nodes to be provided during boot.
+ */
+ WARN(numa_distance_table[nid][nid] == -1,
+ "NUMA distance details for node %d not provided\n", nid);
+}
+EXPORT_SYMBOL_GPL(update_numa_distance);
+
+/*
+ * ibm,numa-lookup-index-table= {N, domainid1, domainid2, ..... domainidN}
+ * ibm,numa-distance-table = { N, 1, 2, 4, 5, 1, 6, .... N elements}
+ */
+static void __init initialize_form2_numa_distance_lookup_table(void)
+{
+ int i, j;
struct device_node *root;
+ const __u8 *form2_distances;
+ const __be32 *numa_lookup_index;
+ int form2_distances_length;
+ int max_numa_index, distance_index;
+
+ if (firmware_has_feature(FW_FEATURE_OPAL))
+ root = of_find_node_by_path("/ibm,opal");
+ else
+ root = of_find_node_by_path("/rtas");
+ if (!root)
+ root = of_find_node_by_path("/");
+
+ numa_lookup_index = of_get_property(root, "ibm,numa-lookup-index-table", NULL);
+ max_numa_index = of_read_number(&numa_lookup_index[0], 1);
+
+ /* first element of the array is the size and is encode-int */
+ form2_distances = of_get_property(root, "ibm,numa-distance-table", NULL);
+ form2_distances_length = of_read_number((const __be32 *)&form2_distances[0], 1);
+ /* Skip the size which is encoded int */
+ form2_distances += sizeof(__be32);
+
+ pr_debug("form2_distances_len = %d, numa_dist_indexes_len = %d\n",
+ form2_distances_length, max_numa_index);
+
+ for (i = 0; i < max_numa_index; i++)
+ /* +1 skip the max_numa_index in the property */
+ numa_id_index_table[i] = of_read_number(&numa_lookup_index[i + 1], 1);
+
+
+ if (form2_distances_length != max_numa_index * max_numa_index) {
+ WARN(1, "Wrong NUMA distance information\n");
+ form2_distances = NULL; // don't use it
+ }
+ distance_index = 0;
+ for (i = 0; i < max_numa_index; i++) {
+ for (j = 0; j < max_numa_index; j++) {
+ int nodeA = numa_id_index_table[i];
+ int nodeB = numa_id_index_table[j];
+ int dist;
+
+ if (form2_distances)
+ dist = form2_distances[distance_index++];
+ else if (nodeA == nodeB)
+ dist = LOCAL_DISTANCE;
+ else
+ dist = REMOTE_DISTANCE;
+ numa_distance_table[nodeA][nodeB] = dist;
+ pr_debug("dist[%d][%d]=%d ", nodeA, nodeB, dist);
+ }
+ }
+
+ of_node_put(root);
+}
+
+static int __init find_primary_domain_index(void)
+{
+ int index;
+ struct device_node *root;
+
+ /*
+ * Check for which form of affinity.
+ */
+ if (firmware_has_feature(FW_FEATURE_OPAL)) {
+ affinity_form = FORM1_AFFINITY;
+ } else if (firmware_has_feature(FW_FEATURE_FORM2_AFFINITY)) {
+ pr_debug("Using form 2 affinity\n");
+ affinity_form = FORM2_AFFINITY;
+ } else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) {
+ pr_debug("Using form 1 affinity\n");
+ affinity_form = FORM1_AFFINITY;
+ } else
+ affinity_form = FORM0_AFFINITY;
if (firmware_has_feature(FW_FEATURE_OPAL))
root = of_find_node_by_path("/ibm,opal");
@@ -320,42 +473,37 @@ static int __init find_min_common_depth(void)
&distance_ref_points_depth);
if (!distance_ref_points) {
- dbg("NUMA: ibm,associativity-reference-points not found.\n");
+ pr_debug("ibm,associativity-reference-points not found.\n");
goto err;
}
distance_ref_points_depth /= sizeof(int);
-
- if (firmware_has_feature(FW_FEATURE_OPAL) ||
- firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) {
- dbg("Using form 1 affinity\n");
- form1_affinity = 1;
- }
-
- if (form1_affinity) {
- depth = distance_ref_points[0];
- } else {
+ if (affinity_form == FORM0_AFFINITY) {
if (distance_ref_points_depth < 2) {
- printk(KERN_WARNING "NUMA: "
- "short ibm,associativity-reference-points\n");
+ pr_warn("short ibm,associativity-reference-points\n");
goto err;
}
- depth = distance_ref_points[1];
+ index = of_read_number(&distance_ref_points[1], 1);
+ } else {
+ /*
+ * Both FORM1 and FORM2 affinity find the primary domain details
+ * at the same offset.
+ */
+ index = of_read_number(distance_ref_points, 1);
}
-
/*
* Warn and cap if the hardware supports more than
* MAX_DISTANCE_REF_POINTS domains.
*/
if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
- printk(KERN_WARNING "NUMA: distance array capped at "
- "%d entries\n", MAX_DISTANCE_REF_POINTS);
+ pr_warn("distance array capped at %d entries\n",
+ MAX_DISTANCE_REF_POINTS);
distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
}
of_node_put(root);
- return depth;
+ return index;
err:
of_node_put(root);
@@ -375,84 +523,21 @@ static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
of_node_put(memory);
}
-static unsigned long read_n_cells(int n, const unsigned int **buf)
+static unsigned long read_n_cells(int n, const __be32 **buf)
{
unsigned long result = 0;
while (n--) {
- result = (result << 32) | **buf;
+ result = (result << 32) | of_read_number(*buf, 1);
(*buf)++;
}
return result;
}
-/*
- * Read the next memblock list entry from the ibm,dynamic-memory property
- * and return the information in the provided of_drconf_cell structure.
- */
-static void read_drconf_cell(struct of_drconf_cell *drmem, const u32 **cellp)
-{
- const u32 *cp;
-
- drmem->base_addr = read_n_cells(n_mem_addr_cells, cellp);
-
- cp = *cellp;
- drmem->drc_index = cp[0];
- drmem->reserved = cp[1];
- drmem->aa_index = cp[2];
- drmem->flags = cp[3];
-
- *cellp = cp + 4;
-}
-
-/*
- * Retrieve and validate the ibm,dynamic-memory property of the device tree.
- *
- * The layout of the ibm,dynamic-memory property is a number N of memblock
- * list entries followed by N memblock list entries. Each memblock list entry
- * contains information as laid out in the of_drconf_cell struct above.
- */
-static int of_get_drconf_memory(struct device_node *memory, const u32 **dm)
-{
- const u32 *prop;
- u32 len, entries;
-
- prop = of_get_property(memory, "ibm,dynamic-memory", &len);
- if (!prop || len < sizeof(unsigned int))
- return 0;
-
- entries = *prop++;
-
- /* Now that we know the number of entries, revalidate the size
- * of the property read in to ensure we have everything
- */
- if (len < (entries * (n_mem_addr_cells + 4) + 1) * sizeof(unsigned int))
- return 0;
-
- *dm = prop;
- return entries;
-}
-
-/*
- * Retrieve and validate the ibm,lmb-size property for drconf memory
- * from the device tree.
- */
-static u64 of_get_lmb_size(struct device_node *memory)
-{
- const u32 *prop;
- u32 len;
-
- prop = of_get_property(memory, "ibm,lmb-size", &len);
- if (!prop || len < sizeof(unsigned int))
- return 0;
-
- return read_n_cells(n_mem_size_cells, &prop);
-}
-
struct assoc_arrays {
u32 n_arrays;
u32 array_sz;
- const u32 *arrays;
+ const __be32 *arrays;
};
/*
@@ -465,18 +550,26 @@ struct assoc_arrays {
* indicating the size of each associativity array, followed by a list
* of N associativity arrays.
*/
-static int of_get_assoc_arrays(struct device_node *memory,
- struct assoc_arrays *aa)
+static int of_get_assoc_arrays(struct assoc_arrays *aa)
{
- const u32 *prop;
+ struct device_node *memory;
+ const __be32 *prop;
u32 len;
+ memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
+ if (!memory)
+ return -1;
+
prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len);
- if (!prop || len < 2 * sizeof(unsigned int))
+ if (!prop || len < 2 * sizeof(unsigned int)) {
+ of_node_put(memory);
return -1;
+ }
+
+ aa->n_arrays = of_read_number(prop++, 1);
+ aa->array_sz = of_read_number(prop++, 1);
- aa->n_arrays = *prop++;
- aa->array_sz = *prop++;
+ of_node_put(memory);
/* Now that we know the number of arrays and size of each array,
* revalidate the size of the property read in.
@@ -488,79 +581,222 @@ static int of_get_assoc_arrays(struct device_node *memory,
return 0;
}
+static int __init get_nid_and_numa_distance(struct drmem_lmb *lmb)
+{
+ struct assoc_arrays aa = { .arrays = NULL };
+ int default_nid = NUMA_NO_NODE;
+ int nid = default_nid;
+ int rc, index;
+
+ if ((primary_domain_index < 0) || !numa_enabled)
+ return default_nid;
+
+ rc = of_get_assoc_arrays(&aa);
+ if (rc)
+ return default_nid;
+
+ if (primary_domain_index <= aa.array_sz &&
+ !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
+ const __be32 *associativity;
+
+ index = lmb->aa_index * aa.array_sz;
+ associativity = &aa.arrays[index];
+ nid = __associativity_to_nid(associativity, aa.array_sz);
+ if (nid > 0 && affinity_form == FORM1_AFFINITY) {
+ /*
+ * lookup array associativity entries have
+ * no length of the array as the first element.
+ */
+ __initialize_form1_numa_distance(associativity, aa.array_sz);
+ }
+ }
+ return nid;
+}
+
/*
* This is like of_node_to_nid_single() for memory represented in the
* ibm,dynamic-reconfiguration-memory node.
*/
-static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,
- struct assoc_arrays *aa)
+int of_drconf_to_nid_single(struct drmem_lmb *lmb)
{
- int default_nid = 0;
+ struct assoc_arrays aa = { .arrays = NULL };
+ int default_nid = NUMA_NO_NODE;
int nid = default_nid;
- int index;
+ int rc, index;
- if (min_common_depth > 0 && min_common_depth <= aa->array_sz &&
- !(drmem->flags & DRCONF_MEM_AI_INVALID) &&
- drmem->aa_index < aa->n_arrays) {
- index = drmem->aa_index * aa->array_sz + min_common_depth - 1;
- nid = aa->arrays[index];
+ if ((primary_domain_index < 0) || !numa_enabled)
+ return default_nid;
- if (nid == 0xffff || nid >= MAX_NUMNODES)
- nid = default_nid;
- }
+ rc = of_get_assoc_arrays(&aa);
+ if (rc)
+ return default_nid;
+
+ if (primary_domain_index <= aa.array_sz &&
+ !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
+ const __be32 *associativity;
+ index = lmb->aa_index * aa.array_sz;
+ associativity = &aa.arrays[index];
+ nid = __associativity_to_nid(associativity, aa.array_sz);
+ }
return nid;
}
+#ifdef CONFIG_PPC_SPLPAR
+
+static int __vphn_get_associativity(long lcpu, __be32 *associativity)
+{
+ long rc, hwid;
+
+ /*
+ * On a shared lpar, device tree will not have node associativity.
+ * At this time lppaca, or its __old_status field may not be
+ * updated. Hence kernel cannot detect if its on a shared lpar. So
+ * request an explicit associativity irrespective of whether the
+ * lpar is shared or dedicated. Use the device tree property as a
+ * fallback. cpu_to_phys_id is only valid between
+ * smp_setup_cpu_maps() and smp_setup_pacas().
+ */
+ if (firmware_has_feature(FW_FEATURE_VPHN)) {
+ if (cpu_to_phys_id)
+ hwid = cpu_to_phys_id[lcpu];
+ else
+ hwid = get_hard_smp_processor_id(lcpu);
+
+ rc = hcall_vphn(hwid, VPHN_FLAG_VCPU, associativity);
+ if (rc == H_SUCCESS)
+ return 0;
+ }
+
+ return -1;
+}
+
+static int vphn_get_nid(long lcpu)
+{
+ __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
+
+
+ if (!__vphn_get_associativity(lcpu, associativity))
+ return associativity_to_nid(associativity);
+
+ return NUMA_NO_NODE;
+
+}
+#else
+
+static int __vphn_get_associativity(long lcpu, __be32 *associativity)
+{
+ return -1;
+}
+
+static int vphn_get_nid(long unused)
+{
+ return NUMA_NO_NODE;
+}
+#endif /* CONFIG_PPC_SPLPAR */
+
/*
* Figure out to which domain a cpu belongs and stick it there.
* Return the id of the domain used.
*/
static int numa_setup_cpu(unsigned long lcpu)
{
- int nid = 0;
- struct device_node *cpu = of_get_cpu_node(lcpu, NULL);
+ struct device_node *cpu;
+ int fcpu = cpu_first_thread_sibling(lcpu);
+ int nid = NUMA_NO_NODE;
+
+ if (!cpu_present(lcpu)) {
+ set_cpu_numa_node(lcpu, first_online_node);
+ return first_online_node;
+ }
+
+ /*
+ * If a valid cpu-to-node mapping is already available, use it
+ * directly instead of querying the firmware, since it represents
+ * the most recent mapping notified to us by the platform (eg: VPHN).
+ * Since cpu_to_node binding remains the same for all threads in the
+ * core. If a valid cpu-to-node mapping is already available, for
+ * the first thread in the core, use it.
+ */
+ nid = numa_cpu_lookup_table[fcpu];
+ if (nid >= 0) {
+ map_cpu_to_node(lcpu, nid);
+ return nid;
+ }
+
+ nid = vphn_get_nid(lcpu);
+ if (nid != NUMA_NO_NODE)
+ goto out_present;
+
+ cpu = of_get_cpu_node(lcpu, NULL);
if (!cpu) {
WARN_ON(1);
- goto out;
+ if (cpu_present(lcpu))
+ goto out_present;
+ else
+ goto out;
}
nid = of_node_to_nid_single(cpu);
+ of_node_put(cpu);
- if (nid < 0 || !node_online(nid))
+out_present:
+ if (nid < 0 || !node_possible(nid))
nid = first_online_node;
-out:
- map_cpu_to_node(lcpu, nid);
- of_node_put(cpu);
+ /*
+ * Update for the first thread of the core. All threads of a core
+ * have to be part of the same node. This not only avoids querying
+ * for every other thread in the core, but always avoids a case
+ * where virtual node associativity change causes subsequent threads
+ * of a core to be associated with different nid. However if first
+ * thread is already online, expect it to have a valid mapping.
+ */
+ if (fcpu != lcpu) {
+ WARN_ON(cpu_online(fcpu));
+ map_cpu_to_node(fcpu, nid);
+ }
+ map_cpu_to_node(lcpu, nid);
+out:
return nid;
}
-static int cpu_numa_callback(struct notifier_block *nfb, unsigned long action,
- void *hcpu)
+static void verify_cpu_node_mapping(int cpu, int node)
{
- unsigned long lcpu = (unsigned long)hcpu;
- int ret = NOTIFY_DONE;
-
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- numa_setup_cpu(lcpu);
- ret = NOTIFY_OK;
- break;
-#ifdef CONFIG_HOTPLUG_CPU
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- unmap_cpu_from_node(lcpu);
- break;
- ret = NOTIFY_OK;
-#endif
+ int base, sibling, i;
+
+ /* Verify that all the threads in the core belong to the same node */
+ base = cpu_first_thread_sibling(cpu);
+
+ for (i = 0; i < threads_per_core; i++) {
+ sibling = base + i;
+
+ if (sibling == cpu || cpu_is_offline(sibling))
+ continue;
+
+ if (cpu_to_node(sibling) != node) {
+ WARN(1, "CPU thread siblings %d and %d don't belong"
+ " to the same node!\n", cpu, sibling);
+ break;
+ }
}
- return ret;
+}
+
+/* Must run before sched domains notifier. */
+static int ppc_numa_cpu_prepare(unsigned int cpu)
+{
+ int nid;
+
+ nid = numa_setup_cpu(cpu);
+ verify_cpu_node_mapping(cpu, nid);
+ return 0;
+}
+
+static int ppc_numa_cpu_dead(unsigned int cpu)
+{
+ return 0;
}
/*
@@ -594,7 +830,7 @@ static unsigned long __init numa_enforce_memory_limit(unsigned long start,
* Reads the counter for a given entry in
* linux,drconf-usable-memory property
*/
-static inline int __init read_usm_ranges(const u32 **usm)
+static inline int __init read_usm_ranges(const __be32 **usm)
{
/*
* For each lmb in ibm,dynamic-memory a corresponding
@@ -609,85 +845,83 @@ static inline int __init read_usm_ranges(const u32 **usm)
* Extract NUMA information from the ibm,dynamic-reconfiguration-memory
* node. This assumes n_mem_{addr,size}_cells have been set.
*/
-static void __init parse_drconf_memory(struct device_node *memory)
+static int __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
+ const __be32 **usm,
+ void *data)
{
- const u32 *uninitialized_var(dm), *usm;
- unsigned int n, rc, ranges, is_kexec_kdump = 0;
- unsigned long lmb_size, base, size, sz;
+ unsigned int ranges, is_kexec_kdump = 0;
+ unsigned long base, size, sz;
int nid;
- struct assoc_arrays aa = { .arrays = NULL };
-
- n = of_get_drconf_memory(memory, &dm);
- if (!n)
- return;
- lmb_size = of_get_lmb_size(memory);
- if (!lmb_size)
- return;
-
- rc = of_get_assoc_arrays(memory, &aa);
- if (rc)
- return;
+ /*
+ * Skip this block if the reserved bit is set in flags (0x80)
+ * or if the block is not assigned to this partition (0x8)
+ */
+ if ((lmb->flags & DRCONF_MEM_RESERVED)
+ || !(lmb->flags & DRCONF_MEM_ASSIGNED))
+ return 0;
- /* check if this is a kexec/kdump kernel */
- usm = of_get_usable_memory(memory);
- if (usm != NULL)
+ if (*usm)
is_kexec_kdump = 1;
- for (; n != 0; --n) {
- struct of_drconf_cell drmem;
+ base = lmb->base_addr;
+ size = drmem_lmb_size();
+ ranges = 1;
- read_drconf_cell(&drmem, &dm);
-
- /* skip this block if the reserved bit is set in flags (0x80)
- or if the block is not assigned to this partition (0x8) */
- if ((drmem.flags & DRCONF_MEM_RESERVED)
- || !(drmem.flags & DRCONF_MEM_ASSIGNED))
- continue;
-
- base = drmem.base_addr;
- size = lmb_size;
- ranges = 1;
+ if (is_kexec_kdump) {
+ ranges = read_usm_ranges(usm);
+ if (!ranges) /* there are no (base, size) duple */
+ return 0;
+ }
+ do {
if (is_kexec_kdump) {
- ranges = read_usm_ranges(&usm);
- if (!ranges) /* there are no (base, size) duple */
- continue;
+ base = read_n_cells(n_mem_addr_cells, usm);
+ size = read_n_cells(n_mem_size_cells, usm);
}
- do {
- if (is_kexec_kdump) {
- base = read_n_cells(n_mem_addr_cells, &usm);
- size = read_n_cells(n_mem_size_cells, &usm);
- }
- nid = of_drconf_to_nid_single(&drmem, &aa);
- fake_numa_create_new_node(
- ((base + size) >> PAGE_SHIFT),
- &nid);
- node_set_online(nid);
- sz = numa_enforce_memory_limit(base, size);
- if (sz)
- memblock_set_node(base, sz, nid);
- } while (--ranges);
- }
+
+ nid = get_nid_and_numa_distance(lmb);
+ fake_numa_create_new_node(((base + size) >> PAGE_SHIFT),
+ &nid);
+ node_set_online(nid);
+ sz = numa_enforce_memory_limit(base, size);
+ if (sz)
+ memblock_set_node(base, sz, &memblock.memory, nid);
+ } while (--ranges);
+
+ return 0;
}
static int __init parse_numa_properties(void)
{
- struct device_node *memory;
+ struct device_node *memory, *pci;
int default_nid = 0;
unsigned long i;
+ const __be32 *associativity;
if (numa_enabled == 0) {
- printk(KERN_WARNING "NUMA disabled by user\n");
+ pr_warn("disabled by user\n");
return -1;
}
- min_common_depth = find_min_common_depth();
+ primary_domain_index = find_primary_domain_index();
- if (min_common_depth < 0)
- return min_common_depth;
+ if (primary_domain_index < 0) {
+ /*
+ * if we fail to parse primary_domain_index from device tree
+ * mark the numa disabled, boot with numa disabled.
+ */
+ numa_enabled = false;
+ return primary_domain_index;
+ }
- dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
+ pr_debug("associativity depth for CPU/Memory: %d\n", primary_domain_index);
+
+ /*
+ * If it is FORM2 initialize the distance table here.
+ */
+ if (affinity_form == FORM2_AFFINITY)
+ initialize_form2_numa_distance_lookup_table();
/*
* Even though we connect cpus to numa domains later in SMP
@@ -695,22 +929,36 @@ static int __init parse_numa_properties(void)
* each node to be onlined must have NODE_DATA etc backing it.
*/
for_each_present_cpu(i) {
+ __be32 vphn_assoc[VPHN_ASSOC_BUFSIZE];
struct device_node *cpu;
- int nid;
+ int nid = NUMA_NO_NODE;
- cpu = of_get_cpu_node(i, NULL);
- BUG_ON(!cpu);
- nid = of_node_to_nid_single(cpu);
- of_node_put(cpu);
+ memset(vphn_assoc, 0, VPHN_ASSOC_BUFSIZE * sizeof(__be32));
- /*
- * Don't fall back to default_nid yet -- we will plug
- * cpus into nodes once the memory scan has discovered
- * the topology.
- */
- if (nid < 0)
- continue;
- node_set_online(nid);
+ if (__vphn_get_associativity(i, vphn_assoc) == 0) {
+ nid = associativity_to_nid(vphn_assoc);
+ initialize_form1_numa_distance(vphn_assoc);
+ } else {
+
+ /*
+ * Don't fall back to default_nid yet -- we will plug
+ * cpus into nodes once the memory scan has discovered
+ * the topology.
+ */
+ cpu = of_get_cpu_node(i, NULL);
+ BUG_ON(!cpu);
+
+ associativity = of_get_associativity(cpu);
+ if (associativity) {
+ nid = associativity_to_nid(associativity);
+ initialize_form1_numa_distance(associativity);
+ }
+ of_node_put(cpu);
+ }
+
+ /* node_set_online() is an UB if 'nid' is negative */
+ if (likely(nid >= 0))
+ node_set_online(nid);
}
get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);
@@ -720,7 +968,7 @@ static int __init parse_numa_properties(void)
unsigned long size;
int nid;
int ranges;
- const unsigned int *memcell_buf;
+ const __be32 *memcell_buf;
unsigned int len;
memcell_buf = of_get_property(memory,
@@ -742,34 +990,46 @@ new_range:
* have associativity properties. If none, then
* everything goes to default_nid.
*/
- nid = of_node_to_nid_single(memory);
- if (nid < 0)
+ associativity = of_get_associativity(memory);
+ if (associativity) {
+ nid = associativity_to_nid(associativity);
+ initialize_form1_numa_distance(associativity);
+ } else
nid = default_nid;
fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
node_set_online(nid);
- if (!(size = numa_enforce_memory_limit(start, size))) {
- if (--ranges)
- goto new_range;
- else
- continue;
- }
-
- memblock_set_node(start, size, nid);
+ size = numa_enforce_memory_limit(start, size);
+ if (size)
+ memblock_set_node(start, size, &memblock.memory, nid);
if (--ranges)
goto new_range;
}
+ for_each_node_by_name(pci, "pci") {
+ int nid = NUMA_NO_NODE;
+
+ associativity = of_get_associativity(pci);
+ if (associativity) {
+ nid = associativity_to_nid(associativity);
+ initialize_form1_numa_distance(associativity);
+ }
+ if (likely(nid >= 0) && !node_online(nid))
+ node_set_online(nid);
+ }
+
/*
* Now do the same thing for each MEMBLOCK listed in the
* ibm,dynamic-memory property in the
* ibm,dynamic-reconfiguration-memory node.
*/
memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
- if (memory)
- parse_drconf_memory(memory);
+ if (memory) {
+ walk_drmem_lmbs(memory, NULL, numa_setup_drmem_lmb);
+ of_node_put(memory);
+ }
return 0;
}
@@ -780,20 +1040,16 @@ static void __init setup_nonnuma(void)
unsigned long total_ram = memblock_phys_mem_size();
unsigned long start_pfn, end_pfn;
unsigned int nid = 0;
- struct memblock_region *reg;
-
- printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
- top_of_ram, total_ram);
- printk(KERN_DEBUG "Memory hole size: %ldMB\n",
- (top_of_ram - total_ram) >> 20);
+ int i;
- for_each_memblock(memory, reg) {
- start_pfn = memblock_region_memory_base_pfn(reg);
- end_pfn = memblock_region_memory_end_pfn(reg);
+ pr_debug("Top of RAM: 0x%lx, Total RAM: 0x%lx\n", top_of_ram, total_ram);
+ pr_debug("Memory hole size: %ldMB\n", (top_of_ram - total_ram) >> 20);
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
fake_numa_create_new_node(end_pfn, &nid);
memblock_set_node(PFN_PHYS(start_pfn),
- PFN_PHYS(end_pfn - start_pfn), nid);
+ PFN_PHYS(end_pfn - start_pfn),
+ &memblock.memory, nid);
node_set_online(nid);
}
}
@@ -803,11 +1059,11 @@ void __init dump_numa_cpu_topology(void)
unsigned int node;
unsigned int cpu, count;
- if (min_common_depth == -1 || !numa_enabled)
+ if (!numa_enabled)
return;
for_each_online_node(node) {
- printk(KERN_DEBUG "Node %d CPUs:", node);
+ pr_info("Node %d CPUs:", node);
count = 0;
/*
@@ -818,263 +1074,156 @@ void __init dump_numa_cpu_topology(void)
if (cpumask_test_cpu(cpu,
node_to_cpumask_map[node])) {
if (count == 0)
- printk(" %u", cpu);
+ pr_cont(" %u", cpu);
++count;
} else {
if (count > 1)
- printk("-%u", cpu - 1);
+ pr_cont("-%u", cpu - 1);
count = 0;
}
}
if (count > 1)
- printk("-%u", nr_cpu_ids - 1);
- printk("\n");
+ pr_cont("-%u", nr_cpu_ids - 1);
+ pr_cont("\n");
}
}
-static void __init dump_numa_memory_topology(void)
+/* Initialize NODE_DATA for a node on the local memory */
+static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
{
- unsigned int node;
- unsigned int count;
+ u64 spanned_pages = end_pfn - start_pfn;
- if (min_common_depth == -1 || !numa_enabled)
- return;
+ alloc_node_data(nid);
- for_each_online_node(node) {
- unsigned long i;
+ NODE_DATA(nid)->node_id = nid;
+ NODE_DATA(nid)->node_start_pfn = start_pfn;
+ NODE_DATA(nid)->node_spanned_pages = spanned_pages;
+}
- printk(KERN_DEBUG "Node %d Memory:", node);
+static void __init find_possible_nodes(void)
+{
+ struct device_node *rtas, *root;
+ const __be32 *domains = NULL;
+ int prop_length, max_nodes;
+ u32 i;
- count = 0;
+ if (!numa_enabled)
+ return;
- for (i = 0; i < memblock_end_of_DRAM();
- i += (1 << SECTION_SIZE_BITS)) {
- if (early_pfn_to_nid(i >> PAGE_SHIFT) == node) {
- if (count == 0)
- printk(" 0x%lx", i);
- ++count;
- } else {
- if (count > 0)
- printk("-0x%lx", i);
- count = 0;
- }
- }
+ rtas = of_find_node_by_path("/rtas");
+ if (!rtas)
+ return;
- if (count > 0)
- printk("-0x%lx", i);
- printk("\n");
+ /*
+ * ibm,current-associativity-domains is a fairly recent property. If
+ * it doesn't exist, then fallback on ibm,max-associativity-domains.
+ * Current denotes what the platform can support compared to max
+ * which denotes what the Hypervisor can support.
+ *
+ * If the LPAR is migratable, new nodes might be activated after a LPM,
+ * so we should consider the max number in that case.
+ */
+ root = of_find_node_by_path("/");
+ if (!of_get_property(root, "ibm,migratable-partition", NULL))
+ domains = of_get_property(rtas,
+ "ibm,current-associativity-domains",
+ &prop_length);
+ of_node_put(root);
+ if (!domains) {
+ domains = of_get_property(rtas, "ibm,max-associativity-domains",
+ &prop_length);
+ if (!domains)
+ goto out;
}
-}
-/*
- * Allocate some memory, satisfying the memblock or bootmem allocator where
- * required. nid is the preferred node and end is the physical address of
- * the highest address in the node.
- *
- * Returns the virtual address of the memory.
- */
-static void __init *careful_zallocation(int nid, unsigned long size,
- unsigned long align,
- unsigned long end_pfn)
-{
- void *ret;
- int new_nid;
- unsigned long ret_paddr;
+ max_nodes = of_read_number(&domains[primary_domain_index], 1);
+ pr_info("Partition configured for %d NUMA nodes.\n", max_nodes);
- ret_paddr = __memblock_alloc_base(size, align, end_pfn << PAGE_SHIFT);
+ for (i = 0; i < max_nodes; i++) {
+ if (!node_possible(i))
+ node_set(i, node_possible_map);
+ }
- /* retry over all memory */
- if (!ret_paddr)
- ret_paddr = __memblock_alloc_base(size, align, memblock_end_of_DRAM());
+ prop_length /= sizeof(int);
+ if (prop_length > primary_domain_index + 2)
+ coregroup_enabled = 1;
- if (!ret_paddr)
- panic("numa.c: cannot allocate %lu bytes for node %d",
- size, nid);
+out:
+ of_node_put(rtas);
+}
+
+void __init mem_topology_setup(void)
+{
+ int cpu;
- ret = __va(ret_paddr);
+ max_low_pfn = max_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
+ min_low_pfn = MEMORY_START >> PAGE_SHIFT;
/*
- * We initialize the nodes in numeric order: 0, 1, 2...
- * and hand over control from the MEMBLOCK allocator to the
- * bootmem allocator. If this function is called for
- * node 5, then we know that all nodes <5 are using the
- * bootmem allocator instead of the MEMBLOCK allocator.
- *
- * So, check the nid from which this allocation came
- * and double check to see if we need to use bootmem
- * instead of the MEMBLOCK. We don't free the MEMBLOCK memory
- * since it would be useless.
+ * Linux/mm assumes node 0 to be online at boot. However this is not
+ * true on PowerPC, where node 0 is similar to any other node, it
+ * could be cpuless, memoryless node. So force node 0 to be offline
+ * for now. This will prevent cpuless, memoryless node 0 showing up
+ * unnecessarily as online. If a node has cpus or memory that need
+ * to be online, then node will anyway be marked online.
*/
- new_nid = early_pfn_to_nid(ret_paddr >> PAGE_SHIFT);
- if (new_nid < nid) {
- ret = __alloc_bootmem_node(NODE_DATA(new_nid),
- size, align, 0);
+ node_set_offline(0);
- dbg("alloc_bootmem %p %lx\n", ret, size);
- }
+ if (parse_numa_properties())
+ setup_nonnuma();
- memset(ret, 0, size);
- return ret;
-}
+ /*
+ * Modify the set of possible NUMA nodes to reflect information
+ * available about the set of online nodes, and the set of nodes
+ * that we expect to make use of for this platform's affinity
+ * calculations.
+ */
+ nodes_and(node_possible_map, node_possible_map, node_online_map);
-static struct notifier_block ppc64_numa_nb = {
- .notifier_call = cpu_numa_callback,
- .priority = 1 /* Must run before sched domains notifier. */
-};
+ find_possible_nodes();
-static void __init mark_reserved_regions_for_nid(int nid)
-{
- struct pglist_data *node = NODE_DATA(nid);
- struct memblock_region *reg;
-
- for_each_memblock(reserved, reg) {
- unsigned long physbase = reg->base;
- unsigned long size = reg->size;
- unsigned long start_pfn = physbase >> PAGE_SHIFT;
- unsigned long end_pfn = PFN_UP(physbase + size);
- struct node_active_region node_ar;
- unsigned long node_end_pfn = node->node_start_pfn +
- node->node_spanned_pages;
+ setup_node_to_cpumask_map();
+
+ reset_numa_cpu_lookup_table();
+ for_each_possible_cpu(cpu) {
/*
- * Check to make sure that this memblock.reserved area is
- * within the bounds of the node that we care about.
- * Checking the nid of the start and end points is not
- * sufficient because the reserved area could span the
- * entire node.
+ * Powerpc with CONFIG_NUMA always used to have a node 0,
+ * even if it was memoryless or cpuless. For all cpus that
+ * are possible but not present, cpu_to_node() would point
+ * to node 0. To remove a cpuless, memoryless dummy node,
+ * powerpc need to make sure all possible but not present
+ * cpu_to_node are set to a proper node.
*/
- if (end_pfn <= node->node_start_pfn ||
- start_pfn >= node_end_pfn)
- continue;
-
- get_node_active_region(start_pfn, &node_ar);
- while (start_pfn < end_pfn &&
- node_ar.start_pfn < node_ar.end_pfn) {
- unsigned long reserve_size = size;
- /*
- * if reserved region extends past active region
- * then trim size to active region
- */
- if (end_pfn > node_ar.end_pfn)
- reserve_size = (node_ar.end_pfn << PAGE_SHIFT)
- - physbase;
- /*
- * Only worry about *this* node, others may not
- * yet have valid NODE_DATA().
- */
- if (node_ar.nid == nid) {
- dbg("reserve_bootmem %lx %lx nid=%d\n",
- physbase, reserve_size, node_ar.nid);
- reserve_bootmem_node(NODE_DATA(node_ar.nid),
- physbase, reserve_size,
- BOOTMEM_DEFAULT);
- }
- /*
- * if reserved region is contained in the active region
- * then done.
- */
- if (end_pfn <= node_ar.end_pfn)
- break;
-
- /*
- * reserved region extends past the active region
- * get next active region that contains this
- * reserved region
- */
- start_pfn = node_ar.end_pfn;
- physbase = start_pfn << PAGE_SHIFT;
- size = size - reserve_size;
- get_node_active_region(start_pfn, &node_ar);
- }
+ numa_setup_cpu(cpu);
}
}
-
-void __init do_init_bootmem(void)
+void __init initmem_init(void)
{
int nid;
- min_low_pfn = 0;
- max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
- max_pfn = max_low_pfn;
-
- if (parse_numa_properties())
- setup_nonnuma();
- else
- dump_numa_memory_topology();
+ memblock_dump_all();
for_each_online_node(nid) {
unsigned long start_pfn, end_pfn;
- void *bootmem_vaddr;
- unsigned long bootmap_pages;
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
-
- /*
- * Allocate the node structure node local if possible
- *
- * Be careful moving this around, as it relies on all
- * previous nodes' bootmem to be initialized and have
- * all reserved areas marked.
- */
- NODE_DATA(nid) = careful_zallocation(nid,
- sizeof(struct pglist_data),
- SMP_CACHE_BYTES, end_pfn);
-
- dbg("node %d\n", nid);
- dbg("NODE_DATA() = %p\n", NODE_DATA(nid));
-
- NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
- NODE_DATA(nid)->node_start_pfn = start_pfn;
- NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
-
- if (NODE_DATA(nid)->node_spanned_pages == 0)
- continue;
-
- dbg("start_paddr = %lx\n", start_pfn << PAGE_SHIFT);
- dbg("end_paddr = %lx\n", end_pfn << PAGE_SHIFT);
-
- bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
- bootmem_vaddr = careful_zallocation(nid,
- bootmap_pages << PAGE_SHIFT,
- PAGE_SIZE, end_pfn);
-
- dbg("bootmap_vaddr = %p\n", bootmem_vaddr);
-
- init_bootmem_node(NODE_DATA(nid),
- __pa(bootmem_vaddr) >> PAGE_SHIFT,
- start_pfn, end_pfn);
-
- free_bootmem_with_active_regions(nid, end_pfn);
- /*
- * Be very careful about moving this around. Future
- * calls to careful_zallocation() depend on this getting
- * done correctly.
- */
- mark_reserved_regions_for_nid(nid);
- sparse_memory_present_with_active_regions(nid);
+ setup_node_data(nid, start_pfn, end_pfn);
}
- init_bootmem_done = 1;
+ sparse_init();
/*
- * Now bootmem is initialised we can create the node to cpumask
- * lookup tables and setup the cpu callback to populate them.
+ * We need the numa_cpu_lookup_table to be accurate for all CPUs,
+ * even before we online them, so that we can use cpu_to_{node,mem}
+ * early in boot, cf. smp_prepare_cpus().
+ * _nocalls() + manual invocation is used because cpuhp is not yet
+ * initialized for the boot CPU.
*/
- setup_node_to_cpumask_map();
-
- register_cpu_notifier(&ppc64_numa_nb);
- cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
- (void *)(unsigned long)boot_cpuid);
-}
-
-void __init paging_init(void)
-{
- unsigned long max_zone_pfns[MAX_NR_ZONES];
- memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
- max_zone_pfns[ZONE_DMA] = memblock_end_of_DRAM() >> PAGE_SHIFT;
- free_area_init_nodes(max_zone_pfns);
+ cpuhp_setup_state_nocalls(CPUHP_POWER_NUMA_PREPARE, "powerpc/numa:prepare",
+ ppc_numa_cpu_prepare, ppc_numa_cpu_dead);
}
static int __init early_numa(char *p)
@@ -1085,9 +1234,6 @@ static int __init early_numa(char *p)
if (strstr(p, "off"))
numa_enabled = 0;
- if (strstr(p, "debug"))
- numa_debug = 1;
-
p = strstr(p, "fake=");
if (p)
cmdline = p + strlen("fake=");
@@ -1102,43 +1248,26 @@ early_param("numa", early_numa);
* memory represented in the device tree by the property
* ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory.
*/
-static int hot_add_drconf_scn_to_nid(struct device_node *memory,
- unsigned long scn_addr)
+static int hot_add_drconf_scn_to_nid(unsigned long scn_addr)
{
- const u32 *dm;
- unsigned int drconf_cell_cnt, rc;
+ struct drmem_lmb *lmb;
unsigned long lmb_size;
- struct assoc_arrays aa;
- int nid = -1;
+ int nid = NUMA_NO_NODE;
- drconf_cell_cnt = of_get_drconf_memory(memory, &dm);
- if (!drconf_cell_cnt)
- return -1;
-
- lmb_size = of_get_lmb_size(memory);
- if (!lmb_size)
- return -1;
-
- rc = of_get_assoc_arrays(memory, &aa);
- if (rc)
- return -1;
-
- for (; drconf_cell_cnt != 0; --drconf_cell_cnt) {
- struct of_drconf_cell drmem;
-
- read_drconf_cell(&drmem, &dm);
+ lmb_size = drmem_lmb_size();
+ for_each_drmem_lmb(lmb) {
/* skip this block if it is reserved or not assigned to
* this partition */
- if ((drmem.flags & DRCONF_MEM_RESERVED)
- || !(drmem.flags & DRCONF_MEM_ASSIGNED))
+ if ((lmb->flags & DRCONF_MEM_RESERVED)
+ || !(lmb->flags & DRCONF_MEM_ASSIGNED))
continue;
- if ((scn_addr < drmem.base_addr)
- || (scn_addr >= (drmem.base_addr + lmb_size)))
+ if ((scn_addr < lmb->base_addr)
+ || (scn_addr >= (lmb->base_addr + lmb_size)))
continue;
- nid = of_drconf_to_nid_single(&drmem, &aa);
+ nid = of_drconf_to_nid_single(lmb);
break;
}
@@ -1150,29 +1279,21 @@ static int hot_add_drconf_scn_to_nid(struct device_node *memory,
* represented in the device tree as a node (i.e. memory@XXXX) for
* each memblock.
*/
-int hot_add_node_scn_to_nid(unsigned long scn_addr)
+static int hot_add_node_scn_to_nid(unsigned long scn_addr)
{
struct device_node *memory;
- int nid = -1;
+ int nid = NUMA_NO_NODE;
for_each_node_by_type(memory, "memory") {
- unsigned long start, size;
- int ranges;
- const unsigned int *memcell_buf;
- unsigned int len;
+ int i = 0;
- memcell_buf = of_get_property(memory, "reg", &len);
- if (!memcell_buf || len <= 0)
- continue;
-
- /* ranges in cell */
- ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
+ while (1) {
+ struct resource res;
- while (ranges--) {
- start = read_n_cells(n_mem_addr_cells, &memcell_buf);
- size = read_n_cells(n_mem_size_cells, &memcell_buf);
+ if (of_address_to_resource(memory, i++, &res))
+ break;
- if ((scn_addr < start) || (scn_addr >= (start + size)))
+ if ((scn_addr < res.start) || (scn_addr > res.end))
continue;
nid = of_node_to_nid_single(memory);
@@ -1196,50 +1317,45 @@ int hot_add_node_scn_to_nid(unsigned long scn_addr)
int hot_add_scn_to_nid(unsigned long scn_addr)
{
struct device_node *memory = NULL;
- int nid, found = 0;
+ int nid;
- if (!numa_enabled || (min_common_depth < 0))
+ if (!numa_enabled)
return first_online_node;
memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
if (memory) {
- nid = hot_add_drconf_scn_to_nid(memory, scn_addr);
+ nid = hot_add_drconf_scn_to_nid(scn_addr);
of_node_put(memory);
} else {
nid = hot_add_node_scn_to_nid(scn_addr);
}
- if (nid < 0 || !node_online(nid))
+ if (nid < 0 || !node_possible(nid))
nid = first_online_node;
- if (NODE_DATA(nid)->node_spanned_pages)
- return nid;
-
- for_each_online_node(nid) {
- if (NODE_DATA(nid)->node_spanned_pages) {
- found = 1;
- break;
- }
- }
-
- BUG_ON(!found);
return nid;
}
-static u64 hot_add_drconf_memory_max(void)
+u64 hot_add_drconf_memory_max(void)
{
- struct device_node *memory = NULL;
- unsigned int drconf_cell_cnt = 0;
- u64 lmb_size = 0;
- const u32 *dm = 0;
-
- memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
- if (memory) {
- drconf_cell_cnt = of_get_drconf_memory(memory, &dm);
- lmb_size = of_get_lmb_size(memory);
- of_node_put(memory);
- }
- return lmb_size * drconf_cell_cnt;
+ struct device_node *memory = NULL;
+ struct device_node *dn = NULL;
+ const __be64 *lrdr = NULL;
+
+ dn = of_find_node_by_path("/rtas");
+ if (dn) {
+ lrdr = of_get_property(dn, "ibm,lrdr-capacity", NULL);
+ of_node_put(dn);
+ if (lrdr)
+ return be64_to_cpup(lrdr);
+ }
+
+ memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
+ if (memory) {
+ of_node_put(memory);
+ return drmem_lmb_memory_max();
+ }
+ return 0;
}
/*
@@ -1256,417 +1372,95 @@ u64 memory_hotplug_max(void)
/* Virtual Processor Home Node (VPHN) support */
#ifdef CONFIG_PPC_SPLPAR
-struct topology_update_data {
- struct topology_update_data *next;
- unsigned int cpu;
- int old_nid;
- int new_nid;
-};
-
-static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
-static cpumask_t cpu_associativity_changes_mask;
-static int vphn_enabled;
-static int prrn_enabled;
-static void reset_topology_timer(void);
-
-/*
- * Store the current values of the associativity change counters in the
- * hypervisor.
- */
-static void setup_cpu_associativity_change_counters(void)
-{
- int cpu;
-
- /* The VPHN feature supports a maximum of 8 reference points */
- BUILD_BUG_ON(MAX_DISTANCE_REF_POINTS > 8);
-
- for_each_possible_cpu(cpu) {
- int i;
- u8 *counts = vphn_cpu_change_counts[cpu];
- volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts;
-
- for (i = 0; i < distance_ref_points_depth; i++)
- counts[i] = hypervisor_counts[i];
- }
-}
-
-/*
- * The hypervisor maintains a set of 8 associativity change counters in
- * the VPA of each cpu that correspond to the associativity levels in the
- * ibm,associativity-reference-points property. When an associativity
- * level changes, the corresponding counter is incremented.
- *
- * Set a bit in cpu_associativity_changes_mask for each cpu whose home
- * node associativity levels have changed.
- *
- * Returns the number of cpus with unhandled associativity changes.
- */
-static int update_cpu_associativity_changes_mask(void)
-{
- int cpu;
- cpumask_t *changes = &cpu_associativity_changes_mask;
-
- for_each_possible_cpu(cpu) {
- int i, changed = 0;
- u8 *counts = vphn_cpu_change_counts[cpu];
- volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts;
-
- for (i = 0; i < distance_ref_points_depth; i++) {
- if (hypervisor_counts[i] != counts[i]) {
- counts[i] = hypervisor_counts[i];
- changed = 1;
- }
- }
- if (changed) {
- cpumask_set_cpu(cpu, changes);
- }
- }
-
- return cpumask_weight(changes);
-}
-
-/*
- * 6 64-bit registers unpacked into 12 32-bit associativity values. To form
- * the complete property we have to add the length in the first cell.
- */
-#define VPHN_ASSOC_BUFSIZE (6*sizeof(u64)/sizeof(u32) + 1)
-
-/*
- * Convert the associativity domain numbers returned from the hypervisor
- * to the sequence they would appear in the ibm,associativity property.
- */
-static int vphn_unpack_associativity(const long *packed, unsigned int *unpacked)
-{
- int i, nr_assoc_doms = 0;
- const u16 *field = (const u16*) packed;
-
-#define VPHN_FIELD_UNUSED (0xffff)
-#define VPHN_FIELD_MSB (0x8000)
-#define VPHN_FIELD_MASK (~VPHN_FIELD_MSB)
-
- for (i = 1; i < VPHN_ASSOC_BUFSIZE; i++) {
- if (*field == VPHN_FIELD_UNUSED) {
- /* All significant fields processed, and remaining
- * fields contain the reserved value of all 1's.
- * Just store them.
- */
- unpacked[i] = *((u32*)field);
- field += 2;
- } else if (*field & VPHN_FIELD_MSB) {
- /* Data is in the lower 15 bits of this field */
- unpacked[i] = *field & VPHN_FIELD_MASK;
- field++;
- nr_assoc_doms++;
- } else {
- /* Data is in the lower 15 bits of this field
- * concatenated with the next 16 bit field
- */
- unpacked[i] = *((u32*)field);
- field += 2;
- nr_assoc_doms++;
- }
- }
-
- /* The first cell contains the length of the property */
- unpacked[0] = nr_assoc_doms;
-
- return nr_assoc_doms;
-}
+static int topology_inited;
/*
* Retrieve the new associativity information for a virtual processor's
* home node.
*/
-static long hcall_vphn(unsigned long cpu, unsigned int *associativity)
-{
- long rc;
- long retbuf[PLPAR_HCALL9_BUFSIZE] = {0};
- u64 flags = 1;
- int hwcpu = get_hard_smp_processor_id(cpu);
-
- rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, retbuf, flags, hwcpu);
- vphn_unpack_associativity(retbuf, associativity);
-
- return rc;
-}
-
static long vphn_get_associativity(unsigned long cpu,
- unsigned int *associativity)
+ __be32 *associativity)
{
long rc;
- rc = hcall_vphn(cpu, associativity);
+ rc = hcall_vphn(get_hard_smp_processor_id(cpu),
+ VPHN_FLAG_VCPU, associativity);
switch (rc) {
+ case H_SUCCESS:
+ pr_debug("VPHN hcall succeeded. Reset polling...\n");
+ goto out;
+
case H_FUNCTION:
- printk(KERN_INFO
- "VPHN is not supported. Disabling polling...\n");
- stop_topology_update();
+ pr_err_ratelimited("VPHN unsupported. Disabling polling...\n");
break;
case H_HARDWARE:
- printk(KERN_ERR
- "hcall_vphn() experienced a hardware fault "
+ pr_err_ratelimited("hcall_vphn() experienced a hardware fault "
"preventing VPHN. Disabling polling...\n");
- stop_topology_update();
- }
-
- return rc;
-}
-
-/*
- * Update the CPU maps and sysfs entries for a single CPU when its NUMA
- * characteristics change. This function doesn't perform any locking and is
- * only safe to call from stop_machine().
- */
-static int update_cpu_topology(void *data)
-{
- struct topology_update_data *update;
- unsigned long cpu;
-
- if (!data)
- return -EINVAL;
-
- cpu = get_cpu();
-
- for (update = data; update; update = update->next) {
- if (cpu != update->cpu)
- continue;
-
- unmap_cpu_from_node(update->cpu);
- map_cpu_to_node(update->cpu, update->new_nid);
- vdso_getcpu_init();
- }
-
- return 0;
-}
-
-/*
- * Update the node maps and sysfs entries for each cpu whose home node
- * has changed. Returns 1 when the topology has changed, and 0 otherwise.
- */
-int arch_update_cpu_topology(void)
-{
- unsigned int cpu, changed = 0;
- struct topology_update_data *updates, *ud;
- unsigned int associativity[VPHN_ASSOC_BUFSIZE] = {0};
- cpumask_t updated_cpus;
- struct device *dev;
- int weight, i = 0;
-
- weight = cpumask_weight(&cpu_associativity_changes_mask);
- if (!weight)
- return 0;
-
- updates = kzalloc(weight * (sizeof(*updates)), GFP_KERNEL);
- if (!updates)
- return 0;
-
- cpumask_clear(&updated_cpus);
-
- for_each_cpu(cpu, &cpu_associativity_changes_mask) {
- ud = &updates[i++];
- ud->cpu = cpu;
- vphn_get_associativity(cpu, associativity);
- ud->new_nid = associativity_to_nid(associativity);
-
- if (ud->new_nid < 0 || !node_online(ud->new_nid))
- ud->new_nid = first_online_node;
-
- ud->old_nid = numa_cpu_lookup_table[cpu];
- cpumask_set_cpu(cpu, &updated_cpus);
-
- if (i < weight)
- ud->next = &updates[i];
- }
-
- stop_machine(update_cpu_topology, &updates[0], &updated_cpus);
-
- for (ud = &updates[0]; ud; ud = ud->next) {
- unregister_cpu_under_node(ud->cpu, ud->old_nid);
- register_cpu_under_node(ud->cpu, ud->new_nid);
-
- dev = get_cpu_device(ud->cpu);
- if (dev)
- kobject_uevent(&dev->kobj, KOBJ_CHANGE);
- cpumask_clear_cpu(ud->cpu, &cpu_associativity_changes_mask);
- changed = 1;
- }
-
- kfree(updates);
- return changed;
-}
-
-static void topology_work_fn(struct work_struct *work)
-{
- rebuild_sched_domains();
-}
-static DECLARE_WORK(topology_work, topology_work_fn);
-
-void topology_schedule_update(void)
-{
- schedule_work(&topology_work);
-}
-
-static void topology_timer_fn(unsigned long ignored)
-{
- if (prrn_enabled && cpumask_weight(&cpu_associativity_changes_mask))
- topology_schedule_update();
- else if (vphn_enabled) {
- if (update_cpu_associativity_changes_mask() > 0)
- topology_schedule_update();
- reset_topology_timer();
- }
-}
-static struct timer_list topology_timer =
- TIMER_INITIALIZER(topology_timer_fn, 0, 0);
-
-static void reset_topology_timer(void)
-{
- topology_timer.data = 0;
- topology_timer.expires = jiffies + 60 * HZ;
- mod_timer(&topology_timer, topology_timer.expires);
-}
-
-#ifdef CONFIG_SMP
-
-static void stage_topology_update(int core_id)
-{
- cpumask_or(&cpu_associativity_changes_mask,
- &cpu_associativity_changes_mask, cpu_sibling_mask(core_id));
- reset_topology_timer();
-}
-
-static int dt_update_callback(struct notifier_block *nb,
- unsigned long action, void *data)
-{
- struct of_prop_reconfig *update;
- int rc = NOTIFY_DONE;
-
- switch (action) {
- case OF_RECONFIG_UPDATE_PROPERTY:
- update = (struct of_prop_reconfig *)data;
- if (!of_prop_cmp(update->dn->type, "cpu") &&
- !of_prop_cmp(update->prop->name, "ibm,associativity")) {
- u32 core_id;
- of_property_read_u32(update->dn, "reg", &core_id);
- stage_topology_update(core_id);
- rc = NOTIFY_OK;
- }
+ break;
+ case H_PARAMETER:
+ pr_err_ratelimited("hcall_vphn() was passed an invalid parameter. "
+ "Disabling polling...\n");
+ break;
+ default:
+ pr_err_ratelimited("hcall_vphn() returned %ld. Disabling polling...\n"
+ , rc);
break;
}
-
- return rc;
-}
-
-static struct notifier_block dt_update_nb = {
- .notifier_call = dt_update_callback,
-};
-
-#endif
-
-/*
- * Start polling for associativity changes.
- */
-int start_topology_update(void)
-{
- int rc = 0;
-
- if (firmware_has_feature(FW_FEATURE_PRRN)) {
- if (!prrn_enabled) {
- prrn_enabled = 1;
- vphn_enabled = 0;
-#ifdef CONFIG_SMP
- rc = of_reconfig_notifier_register(&dt_update_nb);
-#endif
- }
- } else if (firmware_has_feature(FW_FEATURE_VPHN) &&
- get_lppaca()->shared_proc) {
- if (!vphn_enabled) {
- prrn_enabled = 0;
- vphn_enabled = 1;
- setup_cpu_associativity_change_counters();
- init_timer_deferrable(&topology_timer);
- reset_topology_timer();
- }
- }
-
+out:
return rc;
}
-/*
- * Disable polling for VPHN associativity changes.
- */
-int stop_topology_update(void)
+void find_and_update_cpu_nid(int cpu)
{
- int rc = 0;
-
- if (prrn_enabled) {
- prrn_enabled = 0;
-#ifdef CONFIG_SMP
- rc = of_reconfig_notifier_unregister(&dt_update_nb);
-#endif
- } else if (vphn_enabled) {
- vphn_enabled = 0;
- rc = del_timer_sync(&topology_timer);
- }
+ __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
+ int new_nid;
- return rc;
-}
+ /* Use associativity from first thread for all siblings */
+ if (vphn_get_associativity(cpu, associativity))
+ return;
-int prrn_is_enabled(void)
-{
- return prrn_enabled;
-}
+ /* Do not have previous associativity, so find it now. */
+ new_nid = associativity_to_nid(associativity);
-static int topology_read(struct seq_file *file, void *v)
-{
- if (vphn_enabled || prrn_enabled)
- seq_puts(file, "on\n");
+ if (new_nid < 0 || !node_possible(new_nid))
+ new_nid = first_online_node;
else
- seq_puts(file, "off\n");
+ // Associate node <-> cpu, so cpu_up() calls
+ // try_online_node() on the right node.
+ set_cpu_numa_node(cpu, new_nid);
- return 0;
+ pr_debug("%s:%d cpu %d nid %d\n", __func__, __LINE__, cpu, new_nid);
}
-static int topology_open(struct inode *inode, struct file *file)
+int cpu_to_coregroup_id(int cpu)
{
- return single_open(file, topology_read, NULL);
-}
+ __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
+ int index;
-static ssize_t topology_write(struct file *file, const char __user *buf,
- size_t count, loff_t *off)
-{
- char kbuf[4]; /* "on" or "off" plus null. */
- int read_len;
+ if (cpu < 0 || cpu > nr_cpu_ids)
+ return -1;
- read_len = count < 3 ? count : 3;
- if (copy_from_user(kbuf, buf, read_len))
- return -EINVAL;
+ if (!coregroup_enabled)
+ goto out;
- kbuf[read_len] = '\0';
+ if (!firmware_has_feature(FW_FEATURE_VPHN))
+ goto out;
- if (!strncmp(kbuf, "on", 2))
- start_topology_update();
- else if (!strncmp(kbuf, "off", 3))
- stop_topology_update();
- else
- return -EINVAL;
+ if (vphn_get_associativity(cpu, associativity))
+ goto out;
- return count;
-}
+ index = of_read_number(associativity, 1);
+ if (index > primary_domain_index + 1)
+ return of_read_number(&associativity[index - 1], 1);
-static const struct file_operations topology_ops = {
- .read = seq_read,
- .write = topology_write,
- .open = topology_open,
- .release = single_release
-};
+out:
+ return cpu_to_core_id(cpu);
+}
static int topology_update_init(void)
{
- start_topology_update();
- proc_create("powerpc/topology_updates", 644, NULL, &topology_ops);
-
+ topology_inited = 1;
return 0;
}
device_initcall(topology_update_init);
diff --git a/arch/powerpc/mm/pageattr.c b/arch/powerpc/mm/pageattr.c
new file mode 100644
index 000000000000..ac22bf28086f
--- /dev/null
+++ b/arch/powerpc/mm/pageattr.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * MMU-generic set_memory implementation for powerpc
+ *
+ * Copyright 2019-2021, IBM Corporation.
+ */
+
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/set_memory.h>
+
+#include <asm/mmu.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+
+#include <mm/mmu_decl.h>
+
+static pte_basic_t pte_update_delta(pte_t *ptep, unsigned long addr,
+ unsigned long old, unsigned long new)
+{
+ return pte_update(&init_mm, addr, ptep, old & ~new, new & ~old, 0);
+}
+
+/*
+ * Updates the attributes of a page atomically.
+ *
+ * This sequence is safe against concurrent updates, and also allows updating the
+ * attributes of a page currently being executed or accessed.
+ */
+static int change_page_attr(pte_t *ptep, unsigned long addr, void *data)
+{
+ long action = (long)data;
+
+ addr &= PAGE_MASK;
+ /* modify the PTE bits as desired */
+ switch (action) {
+ case SET_MEMORY_RO:
+ /* Don't clear DIRTY bit */
+ pte_update_delta(ptep, addr, _PAGE_KERNEL_RW & ~_PAGE_DIRTY, _PAGE_KERNEL_RO);
+ break;
+ case SET_MEMORY_ROX:
+ /* Don't clear DIRTY bit */
+ pte_update_delta(ptep, addr, _PAGE_KERNEL_RW & ~_PAGE_DIRTY, _PAGE_KERNEL_ROX);
+ break;
+ case SET_MEMORY_RW:
+ pte_update_delta(ptep, addr, _PAGE_KERNEL_RO, _PAGE_KERNEL_RW);
+ break;
+ case SET_MEMORY_NX:
+ pte_update_delta(ptep, addr, _PAGE_KERNEL_ROX, _PAGE_KERNEL_RO);
+ break;
+ case SET_MEMORY_X:
+ pte_update_delta(ptep, addr, _PAGE_KERNEL_RO, _PAGE_KERNEL_ROX);
+ break;
+ case SET_MEMORY_NP:
+ pte_update(&init_mm, addr, ptep, _PAGE_PRESENT, 0, 0);
+ break;
+ case SET_MEMORY_P:
+ pte_update(&init_mm, addr, ptep, 0, _PAGE_PRESENT, 0);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ /* See ptesync comment in radix__set_pte_at() */
+ if (radix_enabled())
+ asm volatile("ptesync": : :"memory");
+
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+
+ return 0;
+}
+
+int change_memory_attr(unsigned long addr, int numpages, long action)
+{
+ unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE);
+ unsigned long size = numpages * PAGE_SIZE;
+
+ if (!numpages)
+ return 0;
+
+ if (WARN_ON_ONCE(is_vmalloc_or_module_addr((void *)addr) &&
+ is_vm_area_hugepages((void *)addr)))
+ return -EINVAL;
+
+#ifdef CONFIG_PPC_BOOK3S_64
+ /*
+ * On hash, the linear mapping is not in the Linux page table so
+ * apply_to_existing_page_range() will have no effect. If in the future
+ * the set_memory_* functions are used on the linear map this will need
+ * to be updated.
+ */
+ if (!radix_enabled()) {
+ int region = get_region_id(addr);
+
+ if (WARN_ON_ONCE(region != VMALLOC_REGION_ID && region != IO_REGION_ID))
+ return -EINVAL;
+ }
+#endif
+
+ return apply_to_existing_page_range(&init_mm, start, size,
+ change_page_attr, (void *)action);
+}
+
+#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE)
+#ifdef CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC
+void __kernel_map_pages(struct page *page, int numpages, int enable)
+{
+ int err;
+ unsigned long addr = (unsigned long)page_address(page);
+
+ if (PageHighMem(page))
+ return;
+
+ if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !radix_enabled())
+ err = hash__kernel_map_pages(page, numpages, enable);
+ else if (enable)
+ err = set_memory_p(addr, numpages);
+ else
+ err = set_memory_np(addr, numpages);
+
+ if (err)
+ panic("%s: changing memory protections failed\n", __func__);
+}
+#endif
+#endif
diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c
new file mode 100644
index 000000000000..77e55eac16e4
--- /dev/null
+++ b/arch/powerpc/mm/pgtable-frag.c
@@ -0,0 +1,141 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Handling Page Tables through page fragments
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/mm.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/hugetlb.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+
+void pte_frag_destroy(void *pte_frag)
+{
+ int count;
+ struct ptdesc *ptdesc;
+
+ ptdesc = virt_to_ptdesc(pte_frag);
+ /* drop all the pending references */
+ count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT;
+ /* We allow PTE_FRAG_NR fragments from a PTE page */
+ if (atomic_sub_and_test(PTE_FRAG_NR - count, &ptdesc->pt_frag_refcount)) {
+ pagetable_dtor(ptdesc);
+ pagetable_free(ptdesc);
+ }
+}
+
+static pte_t *get_pte_from_cache(struct mm_struct *mm)
+{
+ void *pte_frag, *ret;
+
+ if (PTE_FRAG_NR == 1)
+ return NULL;
+
+ spin_lock(&mm->page_table_lock);
+ ret = pte_frag_get(&mm->context);
+ if (ret) {
+ pte_frag = ret + PTE_FRAG_SIZE;
+ /*
+ * If we have taken up all the fragments mark PTE page NULL
+ */
+ if (((unsigned long)pte_frag & ~PAGE_MASK) == 0)
+ pte_frag = NULL;
+ pte_frag_set(&mm->context, pte_frag);
+ }
+ spin_unlock(&mm->page_table_lock);
+ return (pte_t *)ret;
+}
+
+static pte_t *__alloc_for_ptecache(struct mm_struct *mm, int kernel)
+{
+ void *ret = NULL;
+ struct ptdesc *ptdesc;
+ gfp_t gfp = PGALLOC_GFP;
+
+ if (!kernel)
+ gfp |= __GFP_ACCOUNT;
+
+ ptdesc = pagetable_alloc(gfp, 0);
+ if (!ptdesc)
+ return NULL;
+ if (!pagetable_pte_ctor(mm, ptdesc)) {
+ pagetable_free(ptdesc);
+ return NULL;
+ }
+
+ atomic_set(&ptdesc->pt_frag_refcount, 1);
+
+ ret = ptdesc_address(ptdesc);
+ /*
+ * if we support only one fragment just return the
+ * allocated page.
+ */
+ if (PTE_FRAG_NR == 1)
+ return ret;
+ spin_lock(&mm->page_table_lock);
+ /*
+ * If we find ptdesc_page set, we return
+ * the allocated page with single fragment
+ * count.
+ */
+ if (likely(!pte_frag_get(&mm->context))) {
+ atomic_set(&ptdesc->pt_frag_refcount, PTE_FRAG_NR);
+ pte_frag_set(&mm->context, ret + PTE_FRAG_SIZE);
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ return (pte_t *)ret;
+}
+
+pte_t *pte_fragment_alloc(struct mm_struct *mm, int kernel)
+{
+ pte_t *pte;
+
+ pte = get_pte_from_cache(mm);
+ if (pte)
+ return pte;
+
+ return __alloc_for_ptecache(mm, kernel);
+}
+
+static void pte_free_now(struct rcu_head *head)
+{
+ struct ptdesc *ptdesc;
+
+ ptdesc = container_of(head, struct ptdesc, pt_rcu_head);
+ pagetable_dtor(ptdesc);
+ pagetable_free(ptdesc);
+}
+
+void pte_fragment_free(unsigned long *table, int kernel)
+{
+ struct ptdesc *ptdesc = virt_to_ptdesc(table);
+
+ if (pagetable_is_reserved(ptdesc))
+ return free_reserved_ptdesc(ptdesc);
+
+ BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0);
+ if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) {
+ if (kernel || !folio_test_clear_active(ptdesc_folio(ptdesc)))
+ pte_free_now(&ptdesc->pt_rcu_head);
+ else
+ call_rcu(&ptdesc->pt_rcu_head, pte_free_now);
+ }
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
+{
+ struct folio *folio;
+
+ folio = virt_to_folio(pgtable);
+ folio_set_active(folio);
+ pte_fragment_free((unsigned long *)pgtable, 0);
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index edda589795c3..56d7e8960e77 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* This file contains common routines for dealing with free of page tables
* Along with common page table handling code
@@ -14,25 +15,26 @@
*
* Dave Engebretsen <engebret@us.ibm.com>
* Rework for PPC64 port.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
#include <linux/gfp.h>
#include <linux/mm.h>
-#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/hugetlb.h>
-#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
+#include <asm/hugetlb.h>
+#include <asm/pte-walk.h>
+
+#ifdef CONFIG_PPC64
+#define PGD_ALIGN (sizeof(pgd_t) * MAX_PTRS_PER_PGD)
+#else
+#define PGD_ALIGN PAGE_SIZE
+#endif
-#include "mmu_decl.h"
+pgd_t swapper_pg_dir[MAX_PTRS_PER_PGD] __section(".bss..page_aligned") __aligned(PGD_ALIGN);
static inline int is_exec_fault(void)
{
@@ -41,17 +43,22 @@ static inline int is_exec_fault(void)
/* We only try to do i/d cache coherency on stuff that looks like
* reasonably "normal" PTEs. We currently require a PTE to be present
- * and we avoid _PAGE_SPECIAL and _PAGE_NO_CACHE. We also only do that
+ * and we avoid _PAGE_SPECIAL and cache inhibited pte. We also only do that
* on userspace PTEs
*/
-static inline int pte_looks_normal(pte_t pte)
+static inline int pte_looks_normal(pte_t pte, unsigned long addr)
{
- return (pte_val(pte) &
- (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE | _PAGE_USER)) ==
- (_PAGE_PRESENT | _PAGE_USER);
+
+ if (pte_present(pte) && !pte_special(pte)) {
+ if (pte_ci(pte))
+ return 0;
+ if (!is_kernel_addr(addr))
+ return 1;
+ }
+ return 0;
}
-struct page * maybe_pte_to_page(pte_t pte)
+static struct folio *maybe_pte_to_folio(pte_t pte)
{
unsigned long pfn = pte_pfn(pte);
struct page *page;
@@ -61,10 +68,10 @@ struct page * maybe_pte_to_page(pte_t pte)
page = pfn_to_page(pfn);
if (PageReserved(page))
return NULL;
- return page;
+ return page_folio(page);
}
-#if defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0
+#ifdef CONFIG_PPC_BOOK3S
/* Server-style MMU handles coherency when hashing if HW exec permission
* is supposed per page (currently 64-bit only). If not, then, we always
@@ -72,84 +79,85 @@ struct page * maybe_pte_to_page(pte_t pte)
* support falls into the same category.
*/
-static pte_t set_pte_filter(pte_t pte, unsigned long addr)
+static pte_t set_pte_filter_hash(pte_t pte, unsigned long addr)
{
pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
- if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) ||
- cpu_has_feature(CPU_FTR_NOEXECUTE))) {
- struct page *pg = maybe_pte_to_page(pte);
- if (!pg)
+ if (pte_looks_normal(pte, addr) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) ||
+ cpu_has_feature(CPU_FTR_NOEXECUTE))) {
+ struct folio *folio = maybe_pte_to_folio(pte);
+ if (!folio)
return pte;
- if (!test_bit(PG_arch_1, &pg->flags)) {
-#ifdef CONFIG_8xx
- /* On 8xx, cache control instructions (particularly
- * "dcbst" from flush_dcache_icache) fault as write
- * operation if there is an unpopulated TLB entry
- * for the address in question. To workaround that,
- * we invalidate the TLB here, thus avoiding dcbst
- * misbehaviour.
- */
- /* 8xx doesn't care about PID, size or ind args */
- _tlbil_va(addr, 0, 0, 0);
-#endif /* CONFIG_8xx */
- flush_dcache_icache_page(pg);
- set_bit(PG_arch_1, &pg->flags);
+ if (!test_bit(PG_dcache_clean, &folio->flags.f)) {
+ flush_dcache_icache_folio(folio);
+ set_bit(PG_dcache_clean, &folio->flags.f);
}
}
return pte;
}
-static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
- int dirty)
-{
- return pte;
-}
+#else /* CONFIG_PPC_BOOK3S */
-#else /* defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0 */
+static pte_t set_pte_filter_hash(pte_t pte, unsigned long addr) { return pte; }
+
+#endif /* CONFIG_PPC_BOOK3S */
/* Embedded type MMU with HW exec support. This is a bit more complicated
* as we don't have two bits to spare for _PAGE_EXEC and _PAGE_HWEXEC so
* instead we "filter out" the exec permission for non clean pages.
+ *
+ * This is also called once for the folio. So only work with folio->flags here.
*/
-static pte_t set_pte_filter(pte_t pte, unsigned long addr)
+static inline pte_t set_pte_filter(pte_t pte, unsigned long addr)
{
- struct page *pg;
+ struct folio *folio;
+
+ if (radix_enabled())
+ return pte;
+
+ if (mmu_has_feature(MMU_FTR_HPTE_TABLE))
+ return set_pte_filter_hash(pte, addr);
/* No exec permission in the first place, move on */
- if (!(pte_val(pte) & _PAGE_EXEC) || !pte_looks_normal(pte))
+ if (!pte_exec(pte) || !pte_looks_normal(pte, addr))
return pte;
/* If you set _PAGE_EXEC on weird pages you're on your own */
- pg = maybe_pte_to_page(pte);
- if (unlikely(!pg))
+ folio = maybe_pte_to_folio(pte);
+ if (unlikely(!folio))
return pte;
/* If the page clean, we move on */
- if (test_bit(PG_arch_1, &pg->flags))
+ if (test_bit(PG_dcache_clean, &folio->flags.f))
return pte;
/* If it's an exec fault, we flush the cache and make it clean */
if (is_exec_fault()) {
- flush_dcache_icache_page(pg);
- set_bit(PG_arch_1, &pg->flags);
+ flush_dcache_icache_folio(folio);
+ set_bit(PG_dcache_clean, &folio->flags.f);
return pte;
}
/* Else, we filter out _PAGE_EXEC */
- return __pte(pte_val(pte) & ~_PAGE_EXEC);
+ return pte_exprotect(pte);
}
static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
int dirty)
{
- struct page *pg;
+ struct folio *folio;
+
+ if (IS_ENABLED(CONFIG_PPC_BOOK3S_64))
+ return pte;
+
+ if (mmu_has_feature(MMU_FTR_HPTE_TABLE))
+ return pte;
/* So here, we only care about exec faults, as we use them
* to recover lost _PAGE_EXEC and perform I$/D$ coherency
* if necessary. Also if _PAGE_EXEC is already set, same deal,
* we just bail out
*/
- if (dirty || (pte_val(pte) & _PAGE_EXEC) || !is_exec_fault())
+ if (dirty || pte_exec(pte) || !is_exec_fault())
return pte;
#ifdef CONFIG_DEBUG_VM
@@ -162,41 +170,67 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
#endif /* CONFIG_DEBUG_VM */
/* If you set _PAGE_EXEC on weird pages you're on your own */
- pg = maybe_pte_to_page(pte);
- if (unlikely(!pg))
+ folio = maybe_pte_to_folio(pte);
+ if (unlikely(!folio))
goto bail;
/* If the page is already clean, we move on */
- if (test_bit(PG_arch_1, &pg->flags))
+ if (test_bit(PG_dcache_clean, &folio->flags.f))
goto bail;
- /* Clean the page and set PG_arch_1 */
- flush_dcache_icache_page(pg);
- set_bit(PG_arch_1, &pg->flags);
+ /* Clean the page and set PG_dcache_clean */
+ flush_dcache_icache_folio(folio);
+ set_bit(PG_dcache_clean, &folio->flags.f);
bail:
- return __pte(pte_val(pte) | _PAGE_EXEC);
+ return pte_mkexec(pte);
}
-#endif /* !(defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0) */
-
/*
* set_pte stores a linux PTE into the linux page table.
*/
-void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
- pte_t pte)
+void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
+ pte_t pte, unsigned int nr)
{
-#ifdef CONFIG_DEBUG_VM
- WARN_ON(pte_present(*ptep));
-#endif
+
/* Note: mm->context.id might not yet have been assigned as
* this context might not have been activated yet when this
- * is called.
+ * is called. Filter the pte value and use the filtered value
+ * to setup all the ptes in the range.
*/
pte = set_pte_filter(pte, addr);
- /* Perform the setting of the PTE */
- __set_pte_at(mm, addr, ptep, pte, 0);
+ /*
+ * We don't need to call arch_enter/leave_lazy_mmu_mode()
+ * because we expect set_ptes to be only be used on not present
+ * and not hw_valid ptes. Hence there is no translation cache flush
+ * involved that need to be batched.
+ */
+ for (;;) {
+
+ /*
+ * Make sure hardware valid bit is not set. We don't do
+ * tlb flush for this update.
+ */
+ VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));
+
+ /* Perform the setting of the PTE */
+ __set_pte_at(mm, addr, ptep, pte, 0);
+ if (--nr == 0)
+ break;
+ ptep++;
+ addr += PAGE_SIZE;
+ pte = pte_next_pfn(pte);
+ }
+}
+
+void unmap_kernel_page(unsigned long va)
+{
+ pmd_t *pmdp = pmd_off_k(va);
+ pte_t *ptep = pte_offset_kernel(pmdp, va);
+
+ pte_clear(&init_mm, va, ptep);
+ flush_tlb_kernel_range(va, va + PAGE_SIZE);
}
/*
@@ -213,38 +247,309 @@ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address,
entry = set_access_flags_filter(entry, vma, dirty);
changed = !pte_same(*(ptep), entry);
if (changed) {
- if (!is_vm_hugetlb_page(vma))
- assert_pte_locked(vma->vm_mm, address);
- __ptep_set_access_flags(ptep, entry);
- flush_tlb_page_nohash(vma, address);
+ assert_pte_locked(vma->vm_mm, address);
+ __ptep_set_access_flags(vma, ptep, entry,
+ address, mmu_virtual_psize);
+ }
+ return changed;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+int huge_ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t pte, int dirty)
+{
+#ifdef HUGETLB_NEED_PRELOAD
+ /*
+ * The "return 1" forces a call of update_mmu_cache, which will write a
+ * TLB entry. Without this, platforms that don't do a write of the TLB
+ * entry in the TLB miss handler asm will fault ad infinitum.
+ */
+ ptep_set_access_flags(vma, addr, ptep, pte, dirty);
+ return 1;
+#else
+ int changed, psize;
+
+ pte = set_access_flags_filter(pte, vma, dirty);
+ changed = !pte_same(*(ptep), pte);
+ if (changed) {
+
+#ifdef CONFIG_PPC_BOOK3S_64
+ struct hstate *h = hstate_vma(vma);
+
+ psize = hstate_get_psize(h);
+#ifdef CONFIG_DEBUG_VM
+ assert_spin_locked(huge_pte_lockptr(h, vma->vm_mm, ptep));
+#endif
+
+#else
+ /*
+ * Not used on non book3s64 platforms.
+ * 8xx compares it with mmu_virtual_psize to
+ * know if it is a huge page or not.
+ */
+ psize = MMU_PAGE_COUNT;
+#endif
+ __ptep_set_access_flags(vma, ptep, pte, addr, psize);
}
return changed;
+#endif
+}
+
+#if defined(CONFIG_PPC_8xx)
+
+#if defined(CONFIG_SPLIT_PTE_PTLOCKS) || defined(CONFIG_SPLIT_PMD_PTLOCKS)
+/* We need the same lock to protect the PMD table and the two PTE tables. */
+#error "8M hugetlb folios are incompatible with split page table locks"
+#endif
+
+static void __set_huge_pte_at(pmd_t *pmd, pte_t *ptep, pte_basic_t val)
+{
+ pte_basic_t *entry = (pte_basic_t *)ptep;
+ int num, i;
+
+ /*
+ * Make sure hardware valid bit is not set. We don't do
+ * tlb flush for this update.
+ */
+ VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));
+
+ num = number_of_cells_per_pte(pmd, val, 1);
+
+ for (i = 0; i < num; i++, entry++, val += SZ_4K)
+ *entry = val;
+}
+
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
+ pte_t pte, unsigned long sz)
+{
+ pmd_t *pmdp = pmd_off(mm, addr);
+
+ pte = set_pte_filter(pte, addr);
+
+ if (sz == SZ_8M) { /* Flag both PMD entries as 8M and fill both page tables */
+ *pmdp = __pmd(pmd_val(*pmdp) | _PMD_PAGE_8M);
+ *(pmdp + 1) = __pmd(pmd_val(*(pmdp + 1)) | _PMD_PAGE_8M);
+
+ __set_huge_pte_at(pmdp, pte_offset_kernel(pmdp, 0), pte_val(pte));
+ __set_huge_pte_at(pmdp, pte_offset_kernel(pmdp + 1, 0), pte_val(pte) + SZ_4M);
+ } else {
+ __set_huge_pte_at(pmdp, ptep, pte_val(pte));
+ }
+}
+#else
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
+ pte_t pte, unsigned long sz)
+{
+ unsigned long pdsize;
+ int i;
+
+ pte = set_pte_filter(pte, addr);
+
+ /*
+ * Make sure hardware valid bit is not set. We don't do
+ * tlb flush for this update.
+ */
+ VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));
+
+ if (sz < PMD_SIZE)
+ pdsize = PAGE_SIZE;
+ else if (sz < PUD_SIZE)
+ pdsize = PMD_SIZE;
+ else if (sz < P4D_SIZE)
+ pdsize = PUD_SIZE;
+ else if (sz < PGDIR_SIZE)
+ pdsize = P4D_SIZE;
+ else
+ pdsize = PGDIR_SIZE;
+
+ for (i = 0; i < sz / pdsize; i++, ptep++, addr += pdsize) {
+ __set_pte_at(mm, addr, ptep, pte, 0);
+ pte = __pte(pte_val(pte) + ((unsigned long long)pdsize / PAGE_SIZE << PFN_PTE_SHIFT));
+ }
}
+#endif
+#endif /* CONFIG_HUGETLB_PAGE */
#ifdef CONFIG_DEBUG_VM
void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
+ pte_t *pte;
+ spinlock_t *ptl;
if (mm == &init_mm)
return;
pgd = mm->pgd + pgd_index(addr);
BUG_ON(pgd_none(*pgd));
- pud = pud_offset(pgd, addr);
+ p4d = p4d_offset(pgd, addr);
+ BUG_ON(p4d_none(*p4d));
+ pud = pud_offset(p4d, addr);
BUG_ON(pud_none(*pud));
pmd = pmd_offset(pud, addr);
/*
* khugepaged to collapse normal pages to hugepage, first set
- * pmd to none to force page fault/gup to take mmap_sem. After
+ * pmd to none to force page fault/gup to take mmap_lock. After
* pmd is set to none, we do a pte_clear which does this assertion
* so if we find pmd none, return.
*/
if (pmd_none(*pmd))
return;
- BUG_ON(!pmd_present(*pmd));
- assert_spin_locked(pte_lockptr(mm, pmd));
+ pte = pte_offset_map_ro_nolock(mm, pmd, addr, &ptl);
+ BUG_ON(!pte);
+ assert_spin_locked(ptl);
+ pte_unmap(pte);
}
#endif /* CONFIG_DEBUG_VM */
+unsigned long vmalloc_to_phys(void *va)
+{
+ unsigned long pfn = vmalloc_to_pfn(va);
+
+ BUG_ON(!pfn);
+ return __pa(pfn_to_kaddr(pfn)) + offset_in_page(va);
+}
+EXPORT_SYMBOL_GPL(vmalloc_to_phys);
+
+/*
+ * We have 3 cases for pgds and pmds:
+ * (1) invalid (all zeroes)
+ * (2) pointer to next table, as normal; bottom 6 bits == 0
+ * (3) leaf pte for huge page _PAGE_PTE set
+ *
+ * So long as we atomically load page table pointers we are safe against teardown,
+ * we can follow the address down to the page and take a ref on it.
+ * This function need to be called with interrupts disabled. We use this variant
+ * when we have MSR[EE] = 0 but the paca->irq_soft_mask = IRQS_ENABLED
+ */
+pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
+ bool *is_thp, unsigned *hpage_shift)
+{
+ pgd_t *pgdp;
+#ifdef CONFIG_PPC64
+ p4d_t p4d, *p4dp;
+ pud_t pud, *pudp;
+#endif
+ pmd_t pmd, *pmdp;
+ pte_t *ret_pte;
+ unsigned pdshift;
+
+ if (hpage_shift)
+ *hpage_shift = 0;
+
+ if (is_thp)
+ *is_thp = false;
+
+ /*
+ * Always operate on the local stack value. This make sure the
+ * value don't get updated by a parallel THP split/collapse,
+ * page fault or a page unmap. The return pte_t * is still not
+ * stable. So should be checked there for above conditions.
+ * Top level is an exception because it is folded into p4d.
+ *
+ * On PPC32, P4D/PUD/PMD are folded into PGD so go straight to
+ * PMD level.
+ */
+ pgdp = pgdir + pgd_index(ea);
+#ifdef CONFIG_PPC64
+ p4dp = p4d_offset(pgdp, ea);
+ p4d = READ_ONCE(*p4dp);
+ pdshift = P4D_SHIFT;
+
+ if (p4d_none(p4d))
+ return NULL;
+
+ if (p4d_leaf(p4d)) {
+ ret_pte = (pte_t *)p4dp;
+ goto out;
+ }
+
+ /*
+ * Even if we end up with an unmap, the pgtable will not
+ * be freed, because we do an rcu free and here we are
+ * irq disabled
+ */
+ pdshift = PUD_SHIFT;
+ pudp = pud_offset(&p4d, ea);
+ pud = READ_ONCE(*pudp);
+
+ if (pud_none(pud))
+ return NULL;
+
+ if (pud_leaf(pud)) {
+ ret_pte = (pte_t *)pudp;
+ goto out;
+ }
+
+ pmdp = pmd_offset(&pud, ea);
+#else
+ pmdp = pmd_offset(pud_offset(p4d_offset(pgdp, ea), ea), ea);
+#endif
+ pdshift = PMD_SHIFT;
+ pmd = READ_ONCE(*pmdp);
+
+ /*
+ * A hugepage collapse is captured by this condition, see
+ * pmdp_collapse_flush.
+ */
+ if (pmd_none(pmd))
+ return NULL;
+
+#ifdef CONFIG_PPC_BOOK3S_64
+ /*
+ * A hugepage split is captured by this condition, see
+ * pmdp_invalidate.
+ *
+ * Huge page modification can be caught here too.
+ */
+ if (pmd_is_serializing(pmd))
+ return NULL;
+#endif
+
+ if (pmd_trans_huge(pmd)) {
+ if (is_thp)
+ *is_thp = true;
+ ret_pte = (pte_t *)pmdp;
+ goto out;
+ }
+
+ if (pmd_leaf(pmd)) {
+ ret_pte = (pte_t *)pmdp;
+ goto out;
+ }
+
+ return pte_offset_kernel(&pmd, ea);
+
+out:
+ if (hpage_shift)
+ *hpage_shift = pdshift;
+ return ret_pte;
+}
+EXPORT_SYMBOL_GPL(__find_linux_pte);
+
+/* Note due to the way vm flags are laid out, the bits are XWR */
+const pgprot_t protection_map[16] = {
+ [VM_NONE] = PAGE_NONE,
+ [VM_READ] = PAGE_READONLY,
+ [VM_WRITE] = PAGE_COPY,
+ [VM_WRITE | VM_READ] = PAGE_COPY,
+ [VM_EXEC] = PAGE_EXECONLY_X,
+ [VM_EXEC | VM_READ] = PAGE_READONLY_X,
+ [VM_EXEC | VM_WRITE] = PAGE_COPY_X,
+ [VM_EXEC | VM_WRITE | VM_READ] = PAGE_COPY_X,
+ [VM_SHARED] = PAGE_NONE,
+ [VM_SHARED | VM_READ] = PAGE_READONLY,
+ [VM_SHARED | VM_WRITE] = PAGE_SHARED,
+ [VM_SHARED | VM_WRITE | VM_READ] = PAGE_SHARED,
+ [VM_SHARED | VM_EXEC] = PAGE_EXECONLY_X,
+ [VM_SHARED | VM_EXEC | VM_READ] = PAGE_READONLY_X,
+ [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_SHARED_X,
+ [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED_X
+};
+
+#ifndef CONFIG_PPC_BOOK3S_64
+DECLARE_VM_GET_PAGE_PROT
+#endif
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index 6c856fb8c15b..0c9ef705803e 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* This file contains the routines setting up the linux page tables.
* -- paulus
@@ -11,12 +12,6 @@
*
* Derived from "arch/i386/mm/init.c"
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#include <linux/kernel.h>
@@ -28,297 +23,88 @@
#include <linux/highmem.h>
#include <linux/memblock.h>
#include <linux/slab.h>
+#include <linux/set_memory.h>
-#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/fixmap.h>
-#include <asm/io.h>
#include <asm/setup.h>
+#include <asm/sections.h>
+#include <asm/early_ioremap.h>
-#include "mmu_decl.h"
-
-unsigned long ioremap_base;
-unsigned long ioremap_bot;
-EXPORT_SYMBOL(ioremap_bot); /* aka VMALLOC_END */
-
-#if defined(CONFIG_6xx) || defined(CONFIG_POWER3)
-#define HAVE_BATS 1
-#endif
-
-#if defined(CONFIG_FSL_BOOKE)
-#define HAVE_TLBCAM 1
-#endif
-
-extern char etext[], _stext[];
-
-#ifdef HAVE_BATS
-extern phys_addr_t v_mapped_by_bats(unsigned long va);
-extern unsigned long p_mapped_by_bats(phys_addr_t pa);
-void setbat(int index, unsigned long virt, phys_addr_t phys,
- unsigned int size, int flags);
-
-#else /* !HAVE_BATS */
-#define v_mapped_by_bats(x) (0UL)
-#define p_mapped_by_bats(x) (0UL)
-#endif /* HAVE_BATS */
-
-#ifdef HAVE_TLBCAM
-extern unsigned int tlbcam_index;
-extern phys_addr_t v_mapped_by_tlbcam(unsigned long va);
-extern unsigned long p_mapped_by_tlbcam(phys_addr_t pa);
-#else /* !HAVE_TLBCAM */
-#define v_mapped_by_tlbcam(x) (0UL)
-#define p_mapped_by_tlbcam(x) (0UL)
-#endif /* HAVE_TLBCAM */
-
-#define PGDIR_ORDER (32 + PGD_T_LOG2 - PGDIR_SHIFT)
+#include <mm/mmu_decl.h>
-pgd_t *pgd_alloc(struct mm_struct *mm)
-{
- pgd_t *ret;
-
- /* pgdir take page or two with 4K pages and a page fraction otherwise */
-#ifndef CONFIG_PPC_4K_PAGES
- ret = kzalloc(1 << PGDIR_ORDER, GFP_KERNEL);
-#else
- ret = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO,
- PGDIR_ORDER - PAGE_SHIFT);
-#endif
- return ret;
-}
-
-void pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
-#ifndef CONFIG_PPC_4K_PAGES
- kfree((void *)pgd);
-#else
- free_pages((unsigned long)pgd, PGDIR_ORDER - PAGE_SHIFT);
-#endif
-}
-
-__init_refok pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
-{
- pte_t *pte;
- extern int mem_init_done;
- extern void *early_get_page(void);
-
- if (mem_init_done) {
- pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
- } else {
- pte = (pte_t *)early_get_page();
- if (pte)
- clear_page(pte);
- }
- return pte;
-}
+static u8 early_fixmap_pagetable[FIXMAP_PTE_SIZE] __page_aligned_data;
-pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
+notrace void __init early_ioremap_init(void)
{
- struct page *ptepage;
-
- gfp_t flags = GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO;
-
- ptepage = alloc_pages(flags, 0);
- if (!ptepage)
- return NULL;
- pgtable_page_ctor(ptepage);
- return ptepage;
-}
+ unsigned long addr = ALIGN_DOWN(FIXADDR_START, PGDIR_SIZE);
+ pte_t *ptep = (pte_t *)early_fixmap_pagetable;
+ pmd_t *pmdp = pmd_off_k(addr);
-void __iomem *
-ioremap(phys_addr_t addr, unsigned long size)
-{
- return __ioremap_caller(addr, size, _PAGE_NO_CACHE | _PAGE_GUARDED,
- __builtin_return_address(0));
-}
-EXPORT_SYMBOL(ioremap);
+ for (; (s32)(FIXADDR_TOP - addr) > 0;
+ addr += PGDIR_SIZE, ptep += PTRS_PER_PTE, pmdp++)
+ pmd_populate_kernel(&init_mm, pmdp, ptep);
-void __iomem *
-ioremap_wc(phys_addr_t addr, unsigned long size)
-{
- return __ioremap_caller(addr, size, _PAGE_NO_CACHE,
- __builtin_return_address(0));
+ early_ioremap_setup();
}
-EXPORT_SYMBOL(ioremap_wc);
-void __iomem *
-ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags)
+void __init *early_alloc_pgtable(unsigned long size)
{
- /* writeable implies dirty for kernel addresses */
- if (flags & _PAGE_RW)
- flags |= _PAGE_DIRTY | _PAGE_HWWRITE;
-
- /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
- flags &= ~(_PAGE_USER | _PAGE_EXEC);
-
-#ifdef _PAGE_BAP_SR
- /* _PAGE_USER contains _PAGE_BAP_SR on BookE using the new PTE format
- * which means that we just cleared supervisor access... oops ;-) This
- * restores it
- */
- flags |= _PAGE_BAP_SR;
-#endif
-
- return __ioremap_caller(addr, size, flags, __builtin_return_address(0));
-}
-EXPORT_SYMBOL(ioremap_prot);
+ return memblock_alloc_or_panic(size, size);
-void __iomem *
-__ioremap(phys_addr_t addr, unsigned long size, unsigned long flags)
-{
- return __ioremap_caller(addr, size, flags, __builtin_return_address(0));
}
-void __iomem *
-__ioremap_caller(phys_addr_t addr, unsigned long size, unsigned long flags,
- void *caller)
+pte_t __init *early_pte_alloc_kernel(pmd_t *pmdp, unsigned long va)
{
- unsigned long v, i;
- phys_addr_t p;
- int err;
-
- /* Make sure we have the base flags */
- if ((flags & _PAGE_PRESENT) == 0)
- flags |= PAGE_KERNEL;
-
- /* Non-cacheable page cannot be coherent */
- if (flags & _PAGE_NO_CACHE)
- flags &= ~_PAGE_COHERENT;
-
- /*
- * Choose an address to map it to.
- * Once the vmalloc system is running, we use it.
- * Before then, we use space going down from ioremap_base
- * (ioremap_bot records where we're up to).
- */
- p = addr & PAGE_MASK;
- size = PAGE_ALIGN(addr + size) - p;
-
- /*
- * If the address lies within the first 16 MB, assume it's in ISA
- * memory space
- */
- if (p < 16*1024*1024)
- p += _ISA_MEM_BASE;
+ if (pmd_none(*pmdp)) {
+ pte_t *ptep = early_alloc_pgtable(PTE_FRAG_SIZE);
-#ifndef CONFIG_CRASH_DUMP
- /*
- * Don't allow anybody to remap normal RAM that we're using.
- * mem_init() sets high_memory so only do the check after that.
- */
- if (mem_init_done && (p < virt_to_phys(high_memory)) &&
- !(__allow_ioremap_reserved && memblock_is_region_reserved(p, size))) {
- printk("__ioremap(): phys addr 0x%llx is RAM lr %pf\n",
- (unsigned long long)p, __builtin_return_address(0));
- return NULL;
- }
-#endif
-
- if (size == 0)
- return NULL;
-
- /*
- * Is it already mapped? Perhaps overlapped by a previous
- * BAT mapping. If the whole area is mapped then we're done,
- * otherwise remap it since we want to keep the virt addrs for
- * each request contiguous.
- *
- * We make the assumption here that if the bottom and top
- * of the range we want are mapped then it's mapped to the
- * same virt address (and this is contiguous).
- * -- Cort
- */
- if ((v = p_mapped_by_bats(p)) /*&& p_mapped_by_bats(p+size-1)*/ )
- goto out;
-
- if ((v = p_mapped_by_tlbcam(p)))
- goto out;
-
- if (mem_init_done) {
- struct vm_struct *area;
- area = get_vm_area_caller(size, VM_IOREMAP, caller);
- if (area == 0)
- return NULL;
- area->phys_addr = p;
- v = (unsigned long) area->addr;
- } else {
- v = (ioremap_bot -= size);
- }
-
- /*
- * Should check if it is a candidate for a BAT mapping
- */
-
- err = 0;
- for (i = 0; i < size && err == 0; i += PAGE_SIZE)
- err = map_page(v+i, p+i, flags);
- if (err) {
- if (mem_init_done)
- vunmap((void *)v);
- return NULL;
+ pmd_populate_kernel(&init_mm, pmdp, ptep);
}
-
-out:
- return (void __iomem *) (v + ((unsigned long)addr & ~PAGE_MASK));
+ return pte_offset_kernel(pmdp, va);
}
-EXPORT_SYMBOL(__ioremap);
-
-void iounmap(volatile void __iomem *addr)
-{
- /*
- * If mapped by BATs then there is nothing to do.
- * Calling vfree() generates a benign warning.
- */
- if (v_mapped_by_bats((unsigned long)addr)) return;
- if (addr > high_memory && (unsigned long) addr < ioremap_bot)
- vunmap((void *) (PAGE_MASK & (unsigned long)addr));
-}
-EXPORT_SYMBOL(iounmap);
-int map_page(unsigned long va, phys_addr_t pa, int flags)
+int __ref map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot)
{
pmd_t *pd;
pte_t *pg;
int err = -ENOMEM;
/* Use upper 10 bits of VA to index the first level map */
- pd = pmd_offset(pud_offset(pgd_offset_k(va), va), va);
+ pd = pmd_off_k(va);
/* Use middle 10 bits of VA to index the second-level map */
- pg = pte_alloc_kernel(pd, va);
- if (pg != 0) {
+ if (likely(slab_is_available()))
+ pg = pte_alloc_kernel(pd, va);
+ else
+ pg = early_pte_alloc_kernel(pd, va);
+ if (pg) {
err = 0;
/* The PTE should never be already set nor present in the
* hash table
*/
- BUG_ON((pte_val(*pg) & (_PAGE_PRESENT | _PAGE_HASHPTE)) &&
- flags);
- set_pte_at(&init_mm, va, pg, pfn_pte(pa >> PAGE_SHIFT,
- __pgprot(flags)));
+ BUG_ON((pte_present(*pg) | pte_hashpte(*pg)) && pgprot_val(prot));
+ set_pte_at(&init_mm, va, pg, pfn_pte(pa >> PAGE_SHIFT, prot));
}
+ smp_wmb();
return err;
}
/*
* Map in a chunk of physical memory starting at start.
*/
-void __init __mapin_ram_chunk(unsigned long offset, unsigned long top)
+static void __init __mapin_ram_chunk(unsigned long offset, unsigned long top)
{
- unsigned long v, s, f;
+ unsigned long v, s;
phys_addr_t p;
- int ktext;
+ bool ktext;
s = offset;
v = PAGE_OFFSET + s;
p = memstart_addr + s;
for (; s < top; s += PAGE_SIZE) {
- ktext = ((char *) v >= _stext && (char *) v < etext);
- f = ktext ? PAGE_KERNEL_TEXT : PAGE_KERNEL;
- map_page(v, p, f);
-#ifdef CONFIG_PPC_STD_MMU_32
- if (ktext)
- hash_preload(&init_mm, v, 0, 0x300);
-#endif
+ ktext = core_kernel_text(v);
+ map_kernel_page(v, p, ktext ? PAGE_KERNEL_X : PAGE_KERNEL);
v += PAGE_SIZE;
p += PAGE_SIZE;
}
@@ -326,131 +112,71 @@ void __init __mapin_ram_chunk(unsigned long offset, unsigned long top)
void __init mapin_ram(void)
{
- unsigned long s, top;
+ phys_addr_t base, end;
+ u64 i;
-#ifndef CONFIG_WII
- top = total_lowmem;
- s = mmu_mapin_ram(top);
- __mapin_ram_chunk(s, top);
-#else
- if (!wii_hole_size) {
- s = mmu_mapin_ram(total_lowmem);
- __mapin_ram_chunk(s, total_lowmem);
- } else {
- top = wii_hole_start;
- s = mmu_mapin_ram(top);
- __mapin_ram_chunk(s, top);
+ for_each_mem_range(i, &base, &end) {
+ phys_addr_t top = min(end, total_lowmem);
- top = memblock_end_of_DRAM();
- s = wii_mmu_mapin_mem2(top);
- __mapin_ram_chunk(s, top);
+ if (base >= top)
+ continue;
+ base = mmu_mapin_ram(base, top);
+ __mapin_ram_chunk(base, top);
}
-#endif
}
-/* Scan the real Linux page tables and return a PTE pointer for
- * a virtual address in a context.
- * Returns true (1) if PTE was found, zero otherwise. The pointer to
- * the PTE pointer is unmodified if PTE is not found.
- */
-int
-get_pteptr(struct mm_struct *mm, unsigned long addr, pte_t **ptep, pmd_t **pmdp)
+static int __mark_initmem_nx(void)
{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
- int retval = 0;
-
- pgd = pgd_offset(mm, addr & PAGE_MASK);
- if (pgd) {
- pud = pud_offset(pgd, addr & PAGE_MASK);
- if (pud && pud_present(*pud)) {
- pmd = pmd_offset(pud, addr & PAGE_MASK);
- if (pmd_present(*pmd)) {
- pte = pte_offset_map(pmd, addr & PAGE_MASK);
- if (pte) {
- retval = 1;
- *ptep = pte;
- if (pmdp)
- *pmdp = pmd;
- /* XXX caller needs to do pte_unmap, yuck */
- }
- }
- }
- }
- return(retval);
-}
-
-#ifdef CONFIG_DEBUG_PAGEALLOC
-
-static int __change_page_attr(struct page *page, pgprot_t prot)
-{
- pte_t *kpte;
- pmd_t *kpmd;
- unsigned long address;
-
- BUG_ON(PageHighMem(page));
- address = (unsigned long)page_address(page);
-
- if (v_mapped_by_bats(address) || v_mapped_by_tlbcam(address))
- return 0;
- if (!get_pteptr(&init_mm, address, &kpte, &kpmd))
- return -EINVAL;
- __set_pte_at(&init_mm, address, kpte, mk_pte(page, prot), 0);
- wmb();
- flush_tlb_page(NULL, address);
- pte_unmap(kpte);
+ unsigned long numpages = PFN_UP((unsigned long)_einittext) -
+ PFN_DOWN((unsigned long)_sinittext);
+ int err;
- return 0;
-}
+ err = mmu_mark_initmem_nx();
-/*
- * Change the page attributes of an page in the linear mapping.
- *
- * THIS CONFLICTS WITH BAT MAPPINGS, DEBUG USE ONLY
- */
-static int change_page_attr(struct page *page, int numpages, pgprot_t prot)
-{
- int i, err = 0;
- unsigned long flags;
-
- local_irq_save(flags);
- for (i = 0; i < numpages; i++, page++) {
- err = __change_page_attr(page, prot);
+ if (!v_block_mapped((unsigned long)_sinittext)) {
+ err = set_memory_nx((unsigned long)_sinittext, numpages);
if (err)
- break;
+ return err;
+ err = set_memory_rw((unsigned long)_sinittext, numpages);
}
- local_irq_restore(flags);
return err;
}
-
-void kernel_map_pages(struct page *page, int numpages, int enable)
+void mark_initmem_nx(void)
{
- if (PageHighMem(page))
- return;
+ int err = __mark_initmem_nx();
- change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0));
+ if (err)
+ panic("%s() failed, err = %d\n", __func__, err);
}
-#endif /* CONFIG_DEBUG_PAGEALLOC */
-static int fixmaps;
-
-void __set_fixmap (enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags)
+#ifdef CONFIG_STRICT_KERNEL_RWX
+static int __mark_rodata_ro(void)
{
- unsigned long address = __fix_to_virt(idx);
+ unsigned long numpages;
- if (idx >= __end_of_fixed_addresses) {
- BUG();
- return;
- }
+ if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX) && mmu_has_feature(MMU_FTR_HPTE_TABLE))
+ pr_warn("This platform has HASH MMU, STRICT_MODULE_RWX won't work\n");
+
+ if (v_block_mapped((unsigned long)_stext + 1))
+ return mmu_mark_rodata_ro();
- map_page(address, phys, pgprot_val(flags));
- fixmaps++;
+ /*
+ * mark text and rodata as read only. __end_rodata is set by
+ * powerpc's linker script and includes tables and data
+ * requiring relocation which are not put in RO_DATA.
+ */
+ numpages = PFN_UP((unsigned long)__end_rodata) -
+ PFN_DOWN((unsigned long)_stext);
+
+ return set_memory_ro((unsigned long)_stext, numpages);
}
-void __this_fixmap_does_not_exist(void)
+void mark_rodata_ro(void)
{
- WARN_ON(1);
+ int err = __mark_rodata_ro();
+
+ if (err)
+ panic("%s() failed, err = %d\n", __func__, err);
}
+#endif
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 536eec72c0f7..6621cfc3baf8 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This file contains ioremap and related functions for 64-bit machines.
+ * This file contains pgtable related functions for 64-bit machines.
*
* Derived from arch/ppc64/mm/init.c
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
@@ -13,12 +14,6 @@
*
* Dave Engebretsen <engebret@us.ibm.com>
* Rework for PPC64 port.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#include <linux/signal.h>
@@ -33,17 +28,11 @@
#include <linux/swap.h>
#include <linux/stddef.h>
#include <linux/vmalloc.h>
-#include <linux/init.h>
-#include <linux/bootmem.h>
-#include <linux/memblock.h>
#include <linux/slab.h>
+#include <linux/hugetlb.h>
-#include <asm/pgalloc.h>
#include <asm/page.h>
-#include <asm/prom.h>
-#include <asm/io.h>
#include <asm/mmu_context.h>
-#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/smp.h>
#include <asm/machdep.h>
@@ -52,820 +41,122 @@
#include <asm/cputable.h>
#include <asm/sections.h>
#include <asm/firmware.h>
+#include <asm/dma.h>
-#include "mmu_decl.h"
-
-/* Some sanity checking */
-#if TASK_SIZE_USER64 > PGTABLE_RANGE
-#error TASK_SIZE_USER64 exceeds pagetable range
-#endif
-
-#ifdef CONFIG_PPC_STD_MMU_64
-#if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT))
-#error TASK_SIZE_USER64 exceeds user VSID range
-#endif
-#endif
-
-unsigned long ioremap_bot = IOREMAP_BASE;
-
-#ifdef CONFIG_PPC_MMU_NOHASH
-static void *early_alloc_pgtable(unsigned long size)
-{
- void *pt;
+#include <mm/mmu_decl.h>
- if (init_bootmem_done)
- pt = __alloc_bootmem(size, size, __pa(MAX_DMA_ADDRESS));
- else
- pt = __va(memblock_alloc_base(size, size,
- __pa(MAX_DMA_ADDRESS)));
- memset(pt, 0, size);
-
- return pt;
-}
-#endif /* CONFIG_PPC_MMU_NOHASH */
+#ifdef CONFIG_PPC_BOOK3S_64
/*
- * map_kernel_page currently only called by __ioremap
- * map_kernel_page adds an entry to the ioremap page table
- * and adds an entry to the HPT, possibly bolting it
+ * partition table and process table for ISA 3.0
*/
-int map_kernel_page(unsigned long ea, unsigned long pa, int flags)
-{
- pgd_t *pgdp;
- pud_t *pudp;
- pmd_t *pmdp;
- pte_t *ptep;
-
- if (slab_is_available()) {
- pgdp = pgd_offset_k(ea);
- pudp = pud_alloc(&init_mm, pgdp, ea);
- if (!pudp)
- return -ENOMEM;
- pmdp = pmd_alloc(&init_mm, pudp, ea);
- if (!pmdp)
- return -ENOMEM;
- ptep = pte_alloc_kernel(pmdp, ea);
- if (!ptep)
- return -ENOMEM;
- set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
- __pgprot(flags)));
- } else {
-#ifdef CONFIG_PPC_MMU_NOHASH
- /* Warning ! This will blow up if bootmem is not initialized
- * which our ppc64 code is keen to do that, we'll need to
- * fix it and/or be more careful
- */
- pgdp = pgd_offset_k(ea);
-#ifdef PUD_TABLE_SIZE
- if (pgd_none(*pgdp)) {
- pudp = early_alloc_pgtable(PUD_TABLE_SIZE);
- BUG_ON(pudp == NULL);
- pgd_populate(&init_mm, pgdp, pudp);
- }
-#endif /* PUD_TABLE_SIZE */
- pudp = pud_offset(pgdp, ea);
- if (pud_none(*pudp)) {
- pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
- BUG_ON(pmdp == NULL);
- pud_populate(&init_mm, pudp, pmdp);
- }
- pmdp = pmd_offset(pudp, ea);
- if (!pmd_present(*pmdp)) {
- ptep = early_alloc_pgtable(PAGE_SIZE);
- BUG_ON(ptep == NULL);
- pmd_populate_kernel(&init_mm, pmdp, ptep);
- }
- ptep = pte_offset_kernel(pmdp, ea);
- set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
- __pgprot(flags)));
-#else /* CONFIG_PPC_MMU_NOHASH */
- /*
- * If the mm subsystem is not fully up, we cannot create a
- * linux page table entry for this mapping. Simply bolt an
- * entry in the hardware page table.
- *
- */
- if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags,
- mmu_io_psize, mmu_kernel_ssize)) {
- printk(KERN_ERR "Failed to do bolted mapping IO "
- "memory at %016lx !\n", pa);
- return -ENOMEM;
- }
-#endif /* !CONFIG_PPC_MMU_NOHASH */
- }
- return 0;
-}
-
-
-/**
- * __ioremap_at - Low level function to establish the page tables
- * for an IO mapping
- */
-void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size,
- unsigned long flags)
-{
- unsigned long i;
-
- /* Make sure we have the base flags */
- if ((flags & _PAGE_PRESENT) == 0)
- flags |= pgprot_val(PAGE_KERNEL);
-
- /* Non-cacheable page cannot be coherent */
- if (flags & _PAGE_NO_CACHE)
- flags &= ~_PAGE_COHERENT;
-
- /* We don't support the 4K PFN hack with ioremap */
- if (flags & _PAGE_4K_PFN)
- return NULL;
-
- WARN_ON(pa & ~PAGE_MASK);
- WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
- WARN_ON(size & ~PAGE_MASK);
-
- for (i = 0; i < size; i += PAGE_SIZE)
- if (map_kernel_page((unsigned long)ea+i, pa+i, flags))
- return NULL;
-
- return (void __iomem *)ea;
-}
-
-/**
- * __iounmap_from - Low level function to tear down the page tables
- * for an IO mapping. This is used for mappings that
- * are manipulated manually, like partial unmapping of
- * PCI IOs or ISA space.
+struct prtb_entry *process_tb;
+struct patb_entry *partition_tb;
+/*
+ * page table size
*/
-void __iounmap_at(void *ea, unsigned long size)
-{
- WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
- WARN_ON(size & ~PAGE_MASK);
-
- unmap_kernel_range((unsigned long)ea, size);
-}
+unsigned long __pte_index_size;
+EXPORT_SYMBOL(__pte_index_size);
+unsigned long __pmd_index_size;
+EXPORT_SYMBOL(__pmd_index_size);
+unsigned long __pud_index_size;
+EXPORT_SYMBOL(__pud_index_size);
+unsigned long __pgd_index_size;
+EXPORT_SYMBOL(__pgd_index_size);
+unsigned long __pud_cache_index;
+EXPORT_SYMBOL(__pud_cache_index);
+unsigned long __pte_table_size;
+EXPORT_SYMBOL(__pte_table_size);
+unsigned long __pmd_table_size;
+EXPORT_SYMBOL(__pmd_table_size);
+unsigned long __pud_table_size;
+EXPORT_SYMBOL(__pud_table_size);
+unsigned long __pgd_table_size;
+EXPORT_SYMBOL(__pgd_table_size);
+unsigned long __pmd_val_bits;
+EXPORT_SYMBOL(__pmd_val_bits);
+unsigned long __pud_val_bits;
+EXPORT_SYMBOL(__pud_val_bits);
+unsigned long __pgd_val_bits;
+EXPORT_SYMBOL(__pgd_val_bits);
+unsigned long __kernel_virt_start;
+EXPORT_SYMBOL(__kernel_virt_start);
+unsigned long __vmalloc_start;
+EXPORT_SYMBOL(__vmalloc_start);
+unsigned long __vmalloc_end;
+EXPORT_SYMBOL(__vmalloc_end);
+unsigned long __kernel_io_start;
+EXPORT_SYMBOL(__kernel_io_start);
+unsigned long __kernel_io_end;
+struct page *vmemmap;
+EXPORT_SYMBOL(vmemmap);
+unsigned long __pte_frag_nr;
+EXPORT_SYMBOL(__pte_frag_nr);
+unsigned long __pte_frag_size_shift;
+EXPORT_SYMBOL(__pte_frag_size_shift);
+#endif
-void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size,
- unsigned long flags, void *caller)
+#ifndef __PAGETABLE_PUD_FOLDED
+/* 4 level page table */
+struct page *p4d_page(p4d_t p4d)
{
- phys_addr_t paligned;
- void __iomem *ret;
-
- /*
- * Choose an address to map it to.
- * Once the imalloc system is running, we use it.
- * Before that, we map using addresses going
- * up from ioremap_bot. imalloc will use
- * the addresses from ioremap_bot through
- * IMALLOC_END
- *
- */
- paligned = addr & PAGE_MASK;
- size = PAGE_ALIGN(addr + size) - paligned;
-
- if ((size == 0) || (paligned == 0))
- return NULL;
-
- if (mem_init_done) {
- struct vm_struct *area;
-
- area = __get_vm_area_caller(size, VM_IOREMAP,
- ioremap_bot, IOREMAP_END,
- caller);
- if (area == NULL)
- return NULL;
-
- area->phys_addr = paligned;
- ret = __ioremap_at(paligned, area->addr, size, flags);
- if (!ret)
- vunmap(area->addr);
- } else {
- ret = __ioremap_at(paligned, (void *)ioremap_bot, size, flags);
- if (ret)
- ioremap_bot += size;
+ if (p4d_leaf(p4d)) {
+ if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+ VM_WARN_ON(!p4d_leaf(p4d));
+ return pte_page(p4d_pte(p4d));
}
-
- if (ret)
- ret += addr & ~PAGE_MASK;
- return ret;
-}
-
-void __iomem * __ioremap(phys_addr_t addr, unsigned long size,
- unsigned long flags)
-{
- return __ioremap_caller(addr, size, flags, __builtin_return_address(0));
-}
-
-void __iomem * ioremap(phys_addr_t addr, unsigned long size)
-{
- unsigned long flags = _PAGE_NO_CACHE | _PAGE_GUARDED;
- void *caller = __builtin_return_address(0);
-
- if (ppc_md.ioremap)
- return ppc_md.ioremap(addr, size, flags, caller);
- return __ioremap_caller(addr, size, flags, caller);
-}
-
-void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size)
-{
- unsigned long flags = _PAGE_NO_CACHE;
- void *caller = __builtin_return_address(0);
-
- if (ppc_md.ioremap)
- return ppc_md.ioremap(addr, size, flags, caller);
- return __ioremap_caller(addr, size, flags, caller);
+ return virt_to_page(p4d_pgtable(p4d));
}
-
-void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size,
- unsigned long flags)
-{
- void *caller = __builtin_return_address(0);
-
- /* writeable implies dirty for kernel addresses */
- if (flags & _PAGE_RW)
- flags |= _PAGE_DIRTY;
-
- /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
- flags &= ~(_PAGE_USER | _PAGE_EXEC);
-
-#ifdef _PAGE_BAP_SR
- /* _PAGE_USER contains _PAGE_BAP_SR on BookE using the new PTE format
- * which means that we just cleared supervisor access... oops ;-) This
- * restores it
- */
- flags |= _PAGE_BAP_SR;
#endif
- if (ppc_md.ioremap)
- return ppc_md.ioremap(addr, size, flags, caller);
- return __ioremap_caller(addr, size, flags, caller);
-}
-
-
-/*
- * Unmap an IO region and remove it from imalloc'd list.
- * Access to IO memory should be serialized by driver.
- */
-void __iounmap(volatile void __iomem *token)
+struct page *pud_page(pud_t pud)
{
- void *addr;
-
- if (!mem_init_done)
- return;
-
- addr = (void *) ((unsigned long __force)
- PCI_FIX_ADDR(token) & PAGE_MASK);
- if ((unsigned long)addr < ioremap_bot) {
- printk(KERN_WARNING "Attempt to iounmap early bolted mapping"
- " at 0x%p\n", addr);
- return;
+ if (pud_leaf(pud)) {
+ if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+ VM_WARN_ON(!pud_leaf(pud));
+ return pte_page(pud_pte(pud));
}
- vunmap(addr);
+ return virt_to_page(pud_pgtable(pud));
}
-void iounmap(volatile void __iomem *token)
-{
- if (ppc_md.iounmap)
- ppc_md.iounmap(token);
- else
- __iounmap(token);
-}
-
-EXPORT_SYMBOL(ioremap);
-EXPORT_SYMBOL(ioremap_wc);
-EXPORT_SYMBOL(ioremap_prot);
-EXPORT_SYMBOL(__ioremap);
-EXPORT_SYMBOL(__ioremap_at);
-EXPORT_SYMBOL(iounmap);
-EXPORT_SYMBOL(__iounmap);
-EXPORT_SYMBOL(__iounmap_at);
-
/*
* For hugepage we have pfn in the pmd, we use PTE_RPN_SHIFT bits for flags
* For PTE page, we have a PTE_FRAG_SIZE (4K) aligned virtual address.
*/
struct page *pmd_page(pmd_t pmd)
{
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (pmd_trans_huge(pmd))
- return pfn_to_page(pmd_pfn(pmd));
-#endif
- return virt_to_page(pmd_page_vaddr(pmd));
-}
-
-#ifdef CONFIG_PPC_64K_PAGES
-static pte_t *get_from_cache(struct mm_struct *mm)
-{
- void *pte_frag, *ret;
-
- spin_lock(&mm->page_table_lock);
- ret = mm->context.pte_frag;
- if (ret) {
- pte_frag = ret + PTE_FRAG_SIZE;
- /*
- * If we have taken up all the fragments mark PTE page NULL
- */
- if (((unsigned long)pte_frag & ~PAGE_MASK) == 0)
- pte_frag = NULL;
- mm->context.pte_frag = pte_frag;
- }
- spin_unlock(&mm->page_table_lock);
- return (pte_t *)ret;
-}
-
-static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel)
-{
- void *ret = NULL;
- struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK |
- __GFP_REPEAT | __GFP_ZERO);
- if (!page)
- return NULL;
-
- ret = page_address(page);
- spin_lock(&mm->page_table_lock);
- /*
- * If we find pgtable_page set, we return
- * the allocated page with single fragement
- * count.
- */
- if (likely(!mm->context.pte_frag)) {
- atomic_set(&page->_count, PTE_FRAG_NR);
- mm->context.pte_frag = ret + PTE_FRAG_SIZE;
- }
- spin_unlock(&mm->page_table_lock);
-
- if (!kernel)
- pgtable_page_ctor(page);
-
- return (pte_t *)ret;
-}
-
-pte_t *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel)
-{
- pte_t *pte;
-
- pte = get_from_cache(mm);
- if (pte)
- return pte;
-
- return __alloc_for_cache(mm, kernel);
-}
-
-void page_table_free(struct mm_struct *mm, unsigned long *table, int kernel)
-{
- struct page *page = virt_to_page(table);
- if (put_page_testzero(page)) {
- if (!kernel)
- pgtable_page_dtor(page);
- free_hot_cold_page(page, 0);
- }
-}
-
-#ifdef CONFIG_SMP
-static void page_table_free_rcu(void *table)
-{
- struct page *page = virt_to_page(table);
- if (put_page_testzero(page)) {
- pgtable_page_dtor(page);
- free_hot_cold_page(page, 0);
- }
-}
-
-void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
-{
- unsigned long pgf = (unsigned long)table;
-
- BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
- pgf |= shift;
- tlb_remove_table(tlb, (void *)pgf);
-}
-
-void __tlb_remove_table(void *_table)
-{
- void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
- unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;
-
- if (!shift)
- /* PTE page needs special handling */
- page_table_free_rcu(table);
- else {
- BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
- kmem_cache_free(PGT_CACHE(shift), table);
- }
-}
-#else
-void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
-{
- if (!shift) {
- /* PTE page needs special handling */
- struct page *page = virt_to_page(table);
- if (put_page_testzero(page)) {
- pgtable_page_dtor(page);
- free_hot_cold_page(page, 0);
- }
- } else {
- BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
- kmem_cache_free(PGT_CACHE(shift), table);
- }
-}
-#endif
-#endif /* CONFIG_PPC_64K_PAGES */
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-
-/*
- * This is called when relaxing access to a hugepage. It's also called in the page
- * fault path when we don't hit any of the major fault cases, ie, a minor
- * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have
- * handled those two for us, we additionally deal with missing execute
- * permission here on some processors
- */
-int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp, pmd_t entry, int dirty)
-{
- int changed;
-#ifdef CONFIG_DEBUG_VM
- WARN_ON(!pmd_trans_huge(*pmdp));
- assert_spin_locked(&vma->vm_mm->page_table_lock);
-#endif
- changed = !pmd_same(*(pmdp), entry);
- if (changed) {
- __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry));
- /*
- * Since we are not supporting SW TLB systems, we don't
- * have any thing similar to flush_tlb_page_nohash()
- */
- }
- return changed;
-}
-
-unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
- pmd_t *pmdp, unsigned long clr)
-{
-
- unsigned long old, tmp;
-
-#ifdef CONFIG_DEBUG_VM
- WARN_ON(!pmd_trans_huge(*pmdp));
- assert_spin_locked(&mm->page_table_lock);
-#endif
-
-#ifdef PTE_ATOMIC_UPDATES
- __asm__ __volatile__(
- "1: ldarx %0,0,%3\n\
- andi. %1,%0,%6\n\
- bne- 1b \n\
- andc %1,%0,%4 \n\
- stdcx. %1,0,%3 \n\
- bne- 1b"
- : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
- : "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY)
- : "cc" );
-#else
- old = pmd_val(*pmdp);
- *pmdp = __pmd(old & ~clr);
-#endif
- if (old & _PAGE_HASHPTE)
- hpte_do_hugepage_flush(mm, addr, pmdp);
- return old;
-}
-
-pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp)
-{
- pmd_t pmd;
-
- VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- if (pmd_trans_huge(*pmdp)) {
- pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
- } else {
- /*
- * khugepaged calls this for normal pmd
- */
- pmd = *pmdp;
- pmd_clear(pmdp);
+ if (pmd_leaf(pmd)) {
/*
- * Wait for all pending hash_page to finish. This is needed
- * in case of subpage collapse. When we collapse normal pages
- * to hugepage, we first clear the pmd, then invalidate all
- * the PTE entries. The assumption here is that any low level
- * page fault will see a none pmd and take the slow path that
- * will wait on mmap_sem. But we could very well be in a
- * hash_page with local ptep pointer value. Such a hash page
- * can result in adding new HPTE entries for normal subpages.
- * That means we could be modifying the page content as we
- * copy them to a huge page. So wait for parallel hash_page
- * to finish before invalidating HPTE entries. We can do this
- * by sending an IPI to all the cpus and executing a dummy
- * function there.
+ * vmalloc_to_page may be called on any vmap address (not only
+ * vmalloc), and it uses pmd_page() etc., when huge vmap is
+ * enabled so these checks can't be used.
*/
- kick_all_cpus_sync();
- /*
- * Now invalidate the hpte entries in the range
- * covered by pmd. This make sure we take a
- * fault and will find the pmd as none, which will
- * result in a major fault which takes mmap_sem and
- * hence wait for collapse to complete. Without this
- * the __collapse_huge_page_copy can result in copying
- * the old content.
- */
- flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
+ if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+ VM_WARN_ON(!pmd_leaf(pmd));
+ return pte_page(pmd_pte(pmd));
}
- return pmd;
-}
-
-int pmdp_test_and_clear_young(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmdp)
-{
- return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
-}
-
-/*
- * We currently remove entries from the hashtable regardless of whether
- * the entry was young or dirty. The generic routines only flush if the
- * entry was young or dirty which is not good enough.
- *
- * We should be more intelligent about this but for the moment we override
- * these functions and force a tlb flush unconditionally
- */
-int pmdp_clear_flush_young(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmdp)
-{
- return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
-}
-
-/*
- * We mark the pmd splitting and invalidate all the hpte
- * entries for this hugepage.
- */
-void pmdp_splitting_flush(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmdp)
-{
- unsigned long old, tmp;
-
- VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-
-#ifdef CONFIG_DEBUG_VM
- WARN_ON(!pmd_trans_huge(*pmdp));
- assert_spin_locked(&vma->vm_mm->page_table_lock);
-#endif
-
-#ifdef PTE_ATOMIC_UPDATES
-
- __asm__ __volatile__(
- "1: ldarx %0,0,%3\n\
- andi. %1,%0,%6\n\
- bne- 1b \n\
- ori %1,%0,%4 \n\
- stdcx. %1,0,%3 \n\
- bne- 1b"
- : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
- : "r" (pmdp), "i" (_PAGE_SPLITTING), "m" (*pmdp), "i" (_PAGE_BUSY)
- : "cc" );
-#else
- old = pmd_val(*pmdp);
- *pmdp = __pmd(old | _PAGE_SPLITTING);
-#endif
- /*
- * If we didn't had the splitting flag set, go and flush the
- * HPTE entries.
- */
- if (!(old & _PAGE_SPLITTING)) {
- /* We need to flush the hpte */
- if (old & _PAGE_HASHPTE)
- hpte_do_hugepage_flush(vma->vm_mm, address, pmdp);
- }
-}
-
-/*
- * We want to put the pgtable in pmd and use pgtable for tracking
- * the base page size hptes
- */
-void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
- pgtable_t pgtable)
-{
- pgtable_t *pgtable_slot;
- assert_spin_locked(&mm->page_table_lock);
- /*
- * we store the pgtable in the second half of PMD
- */
- pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
- *pgtable_slot = pgtable;
- /*
- * expose the deposited pgtable to other cpus.
- * before we set the hugepage PTE at pmd level
- * hash fault code looks at the deposted pgtable
- * to store hash index values.
- */
- smp_wmb();
-}
-
-pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
-{
- pgtable_t pgtable;
- pgtable_t *pgtable_slot;
-
- assert_spin_locked(&mm->page_table_lock);
- pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
- pgtable = *pgtable_slot;
- /*
- * Once we withdraw, mark the entry NULL.
- */
- *pgtable_slot = NULL;
- /*
- * We store HPTE information in the deposited PTE fragment.
- * zero out the content on withdraw.
- */
- memset(pgtable, 0, PTE_FRAG_SIZE);
- return pgtable;
-}
-
-/*
- * set a new huge pmd. We should not be called for updating
- * an existing pmd entry. That should go via pmd_hugepage_update.
- */
-void set_pmd_at(struct mm_struct *mm, unsigned long addr,
- pmd_t *pmdp, pmd_t pmd)
-{
-#ifdef CONFIG_DEBUG_VM
- WARN_ON(!pmd_none(*pmdp));
- assert_spin_locked(&mm->page_table_lock);
- WARN_ON(!pmd_trans_huge(pmd));
-#endif
- return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
+ return virt_to_page(pmd_page_vaddr(pmd));
}
-void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp)
+#ifdef CONFIG_STRICT_KERNEL_RWX
+void mark_rodata_ro(void)
{
- pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT);
-}
-
-/*
- * A linux hugepage PMD was changed and the corresponding hash table entries
- * neesd to be flushed.
- */
-void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
- pmd_t *pmdp)
-{
- int ssize, i;
- unsigned long s_addr;
- int max_hpte_count;
- unsigned int psize, valid;
- unsigned char *hpte_slot_array;
- unsigned long hidx, vpn, vsid, hash, shift, slot;
-
- /*
- * Flush all the hptes mapping this hugepage
- */
- s_addr = addr & HPAGE_PMD_MASK;
- hpte_slot_array = get_hpte_slot_array(pmdp);
- /*
- * IF we try to do a HUGE PTE update after a withdraw is done.
- * we will find the below NULL. This happens when we do
- * split_huge_page_pmd
- */
- if (!hpte_slot_array)
+ if (!mmu_has_feature(MMU_FTR_KERNEL_RO)) {
+ pr_warn("Warning: Unable to mark rodata read only on this CPU.\n");
return;
-
- /* get the base page size */
- psize = get_slice_psize(mm, s_addr);
-
- if (ppc_md.hugepage_invalidate)
- return ppc_md.hugepage_invalidate(mm, hpte_slot_array,
- s_addr, psize);
- /*
- * No bluk hpte removal support, invalidate each entry
- */
- shift = mmu_psize_defs[psize].shift;
- max_hpte_count = HPAGE_PMD_SIZE >> shift;
- for (i = 0; i < max_hpte_count; i++) {
- /*
- * 8 bits per each hpte entries
- * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
- */
- valid = hpte_valid(hpte_slot_array, i);
- if (!valid)
- continue;
- hidx = hpte_hash_index(hpte_slot_array, i);
-
- /* get the vpn */
- addr = s_addr + (i * (1ul << shift));
- if (!is_kernel_addr(addr)) {
- ssize = user_segment_size(addr);
- vsid = get_vsid(mm->context.id, addr, ssize);
- WARN_ON(vsid == 0);
- } else {
- vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
- ssize = mmu_kernel_ssize;
- }
-
- vpn = hpt_vpn(addr, vsid, ssize);
- hash = hpt_hash(vpn, shift, ssize);
- if (hidx & _PTEIDX_SECONDARY)
- hash = ~hash;
-
- slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
- slot += hidx & _PTEIDX_GROUP_IX;
- ppc_md.hpte_invalidate(slot, vpn, psize,
- MMU_PAGE_16M, ssize, 0);
}
-}
-
-static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
-{
- pmd_val(pmd) |= pgprot_val(pgprot);
- return pmd;
-}
-
-pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
-{
- pmd_t pmd;
- /*
- * For a valid pte, we would have _PAGE_PRESENT or _PAGE_FILE always
- * set. We use this to check THP page at pmd level.
- * leaf pte for huge page, bottom two bits != 00
- */
- pmd_val(pmd) = pfn << PTE_RPN_SHIFT;
- pmd_val(pmd) |= _PAGE_THP_HUGE;
- pmd = pmd_set_protbits(pmd, pgprot);
- return pmd;
-}
-
-pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
-{
- return pfn_pmd(page_to_pfn(page), pgprot);
-}
-
-pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
-{
- pmd_val(pmd) &= _HPAGE_CHG_MASK;
- pmd = pmd_set_protbits(pmd, newprot);
- return pmd;
-}
-
-/*
- * This is called at the end of handling a user page fault, when the
- * fault has been handled by updating a HUGE PMD entry in the linux page tables.
- * We use it to preload an HPTE into the hash table corresponding to
- * the updated linux HUGE PMD entry.
- */
-void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd)
-{
- return;
-}
-
-pmd_t pmdp_get_and_clear(struct mm_struct *mm,
- unsigned long addr, pmd_t *pmdp)
-{
- pmd_t old_pmd;
- pgtable_t pgtable;
- unsigned long old;
- pgtable_t *pgtable_slot;
-
- old = pmd_hugepage_update(mm, addr, pmdp, ~0UL);
- old_pmd = __pmd(old);
- /*
- * We have pmd == none and we are holding page_table_lock.
- * So we can safely go and clear the pgtable hash
- * index info.
- */
- pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
- pgtable = *pgtable_slot;
- /*
- * Let's zero out old valid and hash index details
- * hash fault look at them.
- */
- memset(pgtable, 0, PTE_FRAG_SIZE);
- return old_pmd;
+ if (radix_enabled())
+ radix__mark_rodata_ro();
+ else
+ hash__mark_rodata_ro();
}
-int has_transparent_hugepage(void)
+void mark_initmem_nx(void)
{
- if (!mmu_has_feature(MMU_FTR_16M_PAGE))
- return 0;
- /*
- * We support THP only if PMD_SIZE is 16MB.
- */
- if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
- return 0;
- /*
- * We need to make sure that we support 16MB hugepage in a segement
- * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
- * of 64K.
- */
- /*
- * If we have 64K HPTE, we will be using that by default
- */
- if (mmu_psize_defs[MMU_PAGE_64K].shift &&
- (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
- return 0;
- /*
- * Ok we only have 4K HPTE
- */
- if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
- return 0;
-
- return 1;
+ if (radix_enabled())
+ radix__mark_initmem_nx();
+ else
+ hash__mark_initmem_nx();
}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif
diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c
deleted file mode 100644
index 11571e118831..000000000000
--- a/arch/powerpc/mm/ppc_mmu_32.c
+++ /dev/null
@@ -1,288 +0,0 @@
-/*
- * This file contains the routines for handling the MMU on those
- * PowerPC implementations where the MMU substantially follows the
- * architecture specification. This includes the 6xx, 7xx, 7xxx,
- * 8260, and POWER3 implementations but excludes the 8xx and 4xx.
- * -- paulus
- *
- * Derived from arch/ppc/mm/init.c:
- * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
- *
- * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
- * and Cort Dougan (PReP) (cort@cs.nmt.edu)
- * Copyright (C) 1996 Paul Mackerras
- *
- * Derived from "arch/i386/mm/init.c"
- * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/init.h>
-#include <linux/highmem.h>
-#include <linux/memblock.h>
-
-#include <asm/prom.h>
-#include <asm/mmu.h>
-#include <asm/machdep.h>
-
-#include "mmu_decl.h"
-
-struct hash_pte *Hash, *Hash_end;
-unsigned long Hash_size, Hash_mask;
-unsigned long _SDR1;
-
-struct ppc_bat BATS[8][2]; /* 8 pairs of IBAT, DBAT */
-
-struct batrange { /* stores address ranges mapped by BATs */
- unsigned long start;
- unsigned long limit;
- phys_addr_t phys;
-} bat_addrs[8];
-
-/*
- * Return PA for this VA if it is mapped by a BAT, or 0
- */
-phys_addr_t v_mapped_by_bats(unsigned long va)
-{
- int b;
- for (b = 0; b < 4; ++b)
- if (va >= bat_addrs[b].start && va < bat_addrs[b].limit)
- return bat_addrs[b].phys + (va - bat_addrs[b].start);
- return 0;
-}
-
-/*
- * Return VA for a given PA or 0 if not mapped
- */
-unsigned long p_mapped_by_bats(phys_addr_t pa)
-{
- int b;
- for (b = 0; b < 4; ++b)
- if (pa >= bat_addrs[b].phys
- && pa < (bat_addrs[b].limit-bat_addrs[b].start)
- +bat_addrs[b].phys)
- return bat_addrs[b].start+(pa-bat_addrs[b].phys);
- return 0;
-}
-
-unsigned long __init mmu_mapin_ram(unsigned long top)
-{
- unsigned long tot, bl, done;
- unsigned long max_size = (256<<20);
-
- if (__map_without_bats) {
- printk(KERN_DEBUG "RAM mapped without BATs\n");
- return 0;
- }
-
- /* Set up BAT2 and if necessary BAT3 to cover RAM. */
-
- /* Make sure we don't map a block larger than the
- smallest alignment of the physical address. */
- tot = top;
- for (bl = 128<<10; bl < max_size; bl <<= 1) {
- if (bl * 2 > tot)
- break;
- }
-
- setbat(2, PAGE_OFFSET, 0, bl, PAGE_KERNEL_X);
- done = (unsigned long)bat_addrs[2].limit - PAGE_OFFSET + 1;
- if ((done < tot) && !bat_addrs[3].limit) {
- /* use BAT3 to cover a bit more */
- tot -= done;
- for (bl = 128<<10; bl < max_size; bl <<= 1)
- if (bl * 2 > tot)
- break;
- setbat(3, PAGE_OFFSET+done, done, bl, PAGE_KERNEL_X);
- done = (unsigned long)bat_addrs[3].limit - PAGE_OFFSET + 1;
- }
-
- return done;
-}
-
-/*
- * Set up one of the I/D BAT (block address translation) register pairs.
- * The parameters are not checked; in particular size must be a power
- * of 2 between 128k and 256M.
- */
-void __init setbat(int index, unsigned long virt, phys_addr_t phys,
- unsigned int size, int flags)
-{
- unsigned int bl;
- int wimgxpp;
- struct ppc_bat *bat = BATS[index];
-
- if ((flags & _PAGE_NO_CACHE) ||
- (cpu_has_feature(CPU_FTR_NEED_COHERENT) == 0))
- flags &= ~_PAGE_COHERENT;
-
- bl = (size >> 17) - 1;
- if (PVR_VER(mfspr(SPRN_PVR)) != 1) {
- /* 603, 604, etc. */
- /* Do DBAT first */
- wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE
- | _PAGE_COHERENT | _PAGE_GUARDED);
- wimgxpp |= (flags & _PAGE_RW)? BPP_RW: BPP_RX;
- bat[1].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */
- bat[1].batl = BAT_PHYS_ADDR(phys) | wimgxpp;
- if (flags & _PAGE_USER)
- bat[1].batu |= 1; /* Vp = 1 */
- if (flags & _PAGE_GUARDED) {
- /* G bit must be zero in IBATs */
- bat[0].batu = bat[0].batl = 0;
- } else {
- /* make IBAT same as DBAT */
- bat[0] = bat[1];
- }
- } else {
- /* 601 cpu */
- if (bl > BL_8M)
- bl = BL_8M;
- wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE
- | _PAGE_COHERENT);
- wimgxpp |= (flags & _PAGE_RW)?
- ((flags & _PAGE_USER)? PP_RWRW: PP_RWXX): PP_RXRX;
- bat->batu = virt | wimgxpp | 4; /* Ks=0, Ku=1 */
- bat->batl = phys | bl | 0x40; /* V=1 */
- }
-
- bat_addrs[index].start = virt;
- bat_addrs[index].limit = virt + ((bl + 1) << 17) - 1;
- bat_addrs[index].phys = phys;
-}
-
-/*
- * Preload a translation in the hash table
- */
-void hash_preload(struct mm_struct *mm, unsigned long ea,
- unsigned long access, unsigned long trap)
-{
- pmd_t *pmd;
-
- if (Hash == 0)
- return;
- pmd = pmd_offset(pud_offset(pgd_offset(mm, ea), ea), ea);
- if (!pmd_none(*pmd))
- add_hash_page(mm->context.id, ea, pmd_val(*pmd));
-}
-
-/*
- * Initialize the hash table and patch the instructions in hashtable.S.
- */
-void __init MMU_init_hw(void)
-{
- unsigned int hmask, mb, mb2;
- unsigned int n_hpteg, lg_n_hpteg;
-
- extern unsigned int hash_page_patch_A[];
- extern unsigned int hash_page_patch_B[], hash_page_patch_C[];
- extern unsigned int hash_page[];
- extern unsigned int flush_hash_patch_A[], flush_hash_patch_B[];
-
- if (!mmu_has_feature(MMU_FTR_HPTE_TABLE)) {
- /*
- * Put a blr (procedure return) instruction at the
- * start of hash_page, since we can still get DSI
- * exceptions on a 603.
- */
- hash_page[0] = 0x4e800020;
- flush_icache_range((unsigned long) &hash_page[0],
- (unsigned long) &hash_page[1]);
- return;
- }
-
- if ( ppc_md.progress ) ppc_md.progress("hash:enter", 0x105);
-
-#define LG_HPTEG_SIZE 6 /* 64 bytes per HPTEG */
-#define SDR1_LOW_BITS ((n_hpteg - 1) >> 10)
-#define MIN_N_HPTEG 1024 /* min 64kB hash table */
-
- /*
- * Allow 1 HPTE (1/8 HPTEG) for each page of memory.
- * This is less than the recommended amount, but then
- * Linux ain't AIX.
- */
- n_hpteg = total_memory / (PAGE_SIZE * 8);
- if (n_hpteg < MIN_N_HPTEG)
- n_hpteg = MIN_N_HPTEG;
- lg_n_hpteg = __ilog2(n_hpteg);
- if (n_hpteg & (n_hpteg - 1)) {
- ++lg_n_hpteg; /* round up if not power of 2 */
- n_hpteg = 1 << lg_n_hpteg;
- }
- Hash_size = n_hpteg << LG_HPTEG_SIZE;
-
- /*
- * Find some memory for the hash table.
- */
- if ( ppc_md.progress ) ppc_md.progress("hash:find piece", 0x322);
- Hash = __va(memblock_alloc(Hash_size, Hash_size));
- cacheable_memzero(Hash, Hash_size);
- _SDR1 = __pa(Hash) | SDR1_LOW_BITS;
-
- Hash_end = (struct hash_pte *) ((unsigned long)Hash + Hash_size);
-
- printk("Total memory = %lldMB; using %ldkB for hash table (at %p)\n",
- (unsigned long long)(total_memory >> 20), Hash_size >> 10, Hash);
-
-
- /*
- * Patch up the instructions in hashtable.S:create_hpte
- */
- if ( ppc_md.progress ) ppc_md.progress("hash:patch", 0x345);
- Hash_mask = n_hpteg - 1;
- hmask = Hash_mask >> (16 - LG_HPTEG_SIZE);
- mb2 = mb = 32 - LG_HPTEG_SIZE - lg_n_hpteg;
- if (lg_n_hpteg > 16)
- mb2 = 16 - LG_HPTEG_SIZE;
-
- hash_page_patch_A[0] = (hash_page_patch_A[0] & ~0xffff)
- | ((unsigned int)(Hash) >> 16);
- hash_page_patch_A[1] = (hash_page_patch_A[1] & ~0x7c0) | (mb << 6);
- hash_page_patch_A[2] = (hash_page_patch_A[2] & ~0x7c0) | (mb2 << 6);
- hash_page_patch_B[0] = (hash_page_patch_B[0] & ~0xffff) | hmask;
- hash_page_patch_C[0] = (hash_page_patch_C[0] & ~0xffff) | hmask;
-
- /*
- * Ensure that the locations we've patched have been written
- * out from the data cache and invalidated in the instruction
- * cache, on those machines with split caches.
- */
- flush_icache_range((unsigned long) &hash_page_patch_A[0],
- (unsigned long) &hash_page_patch_C[1]);
-
- /*
- * Patch up the instructions in hashtable.S:flush_hash_page
- */
- flush_hash_patch_A[0] = (flush_hash_patch_A[0] & ~0xffff)
- | ((unsigned int)(Hash) >> 16);
- flush_hash_patch_A[1] = (flush_hash_patch_A[1] & ~0x7c0) | (mb << 6);
- flush_hash_patch_A[2] = (flush_hash_patch_A[2] & ~0x7c0) | (mb2 << 6);
- flush_hash_patch_B[0] = (flush_hash_patch_B[0] & ~0xffff) | hmask;
- flush_icache_range((unsigned long) &flush_hash_patch_A[0],
- (unsigned long) &flush_hash_patch_B[1]);
-
- if ( ppc_md.progress ) ppc_md.progress("hash:done", 0x205);
-}
-
-void setup_initial_memory_limit(phys_addr_t first_memblock_base,
- phys_addr_t first_memblock_size)
-{
- /* We don't currently support the first MEMBLOCK not mapping 0
- * physical on those processors
- */
- BUG_ON(first_memblock_base != 0);
-
- /* 601 can only access 16MB at the moment */
- if (PVR_VER(mfspr(SPRN_PVR)) == 1)
- memblock_set_current_limit(min_t(u64, first_memblock_size, 0x01000000));
- else /* Anything else has 256M mapped */
- memblock_set_current_limit(min_t(u64, first_memblock_size, 0x10000000));
-}
diff --git a/arch/powerpc/mm/ptdump/8xx.c b/arch/powerpc/mm/ptdump/8xx.c
new file mode 100644
index 000000000000..ff845f251724
--- /dev/null
+++ b/arch/powerpc/mm/ptdump/8xx.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * From split of dump_linuxpagetables.c
+ * Copyright 2016, Rashmica Gupta, IBM Corp.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/pgtable.h>
+
+#include "ptdump.h"
+
+static const struct flag_info flag_array[] = {
+ {
+#ifdef CONFIG_PPC_16K_PAGES
+ .mask = _PAGE_HUGE,
+ .val = _PAGE_HUGE,
+#else
+ .mask = _PAGE_SPS,
+ .val = _PAGE_SPS,
+#endif
+ .set = "huge",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_RO | _PAGE_NA,
+ .val = 0,
+ .set = "rw",
+ }, {
+ .mask = _PAGE_RO | _PAGE_NA,
+ .val = _PAGE_RO,
+ .set = "r ",
+ }, {
+ .mask = _PAGE_RO | _PAGE_NA,
+ .val = _PAGE_NA,
+ .set = " ",
+ }, {
+ .mask = _PAGE_EXEC,
+ .val = _PAGE_EXEC,
+ .set = " X ",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_PRESENT,
+ .val = _PAGE_PRESENT,
+ .set = "present",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_GUARDED,
+ .val = _PAGE_GUARDED,
+ .set = "guarded",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_DIRTY,
+ .val = _PAGE_DIRTY,
+ .set = "dirty",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_ACCESSED,
+ .val = _PAGE_ACCESSED,
+ .set = "accessed",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_NO_CACHE,
+ .val = _PAGE_NO_CACHE,
+ .set = "no cache",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_SPECIAL,
+ .val = _PAGE_SPECIAL,
+ .set = "special",
+ }
+};
+
+struct ptdump_pg_level pg_level[5] = {
+ { /* pgd */
+ .name = "PGD",
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ }, { /* p4d */
+ .name = "P4D",
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ }, { /* pud */
+ .name = "PUD",
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ }, { /* pmd */
+ .name = "PMD",
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ }, { /* pte */
+ .name = "PTE",
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ },
+};
diff --git a/arch/powerpc/mm/ptdump/Makefile b/arch/powerpc/mm/ptdump/Makefile
new file mode 100644
index 000000000000..0f7a050f327e
--- /dev/null
+++ b/arch/powerpc/mm/ptdump/Makefile
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-y += ptdump.o
+
+obj-$(CONFIG_44x) += shared.o
+obj-$(CONFIG_PPC_8xx) += 8xx.o
+obj-$(CONFIG_PPC_E500) += shared.o
+obj-$(CONFIG_PPC_BOOK3S_32) += shared.o
+obj-$(CONFIG_PPC_BOOK3S_64) += book3s64.o
+
+ifdef CONFIG_PTDUMP_DEBUGFS
+obj-$(CONFIG_PPC_BOOK3S_32) += bats.o segment_regs.o
+obj-$(CONFIG_PPC_64S_HASH_MMU) += hashpagetable.o
+endif
diff --git a/arch/powerpc/mm/ptdump/bats.c b/arch/powerpc/mm/ptdump/bats.c
new file mode 100644
index 000000000000..820c119013e4
--- /dev/null
+++ b/arch/powerpc/mm/ptdump/bats.c
@@ -0,0 +1,99 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright 2018, Christophe Leroy CS S.I.
+ * <christophe.leroy@c-s.fr>
+ *
+ * This dumps the content of BATS
+ */
+
+#include <linux/pgtable.h>
+#include <linux/debugfs.h>
+#include <asm/cpu_has_feature.h>
+
+#include "ptdump.h"
+
+static void bat_show_603(struct seq_file *m, int idx, u32 lower, u32 upper, bool is_d)
+{
+ u32 bepi = upper & 0xfffe0000;
+ u32 bl = (upper >> 2) & 0x7ff;
+ u32 k = upper & 3;
+ phys_addr_t brpn = PHYS_BAT_ADDR(lower);
+ u32 size = (bl + 1) << 17;
+
+ seq_printf(m, "%d: ", idx);
+ if (k == 0) {
+ seq_puts(m, " -\n");
+ return;
+ }
+
+ seq_printf(m, "0x%08x-0x%08x ", bepi, bepi + size - 1);
+#ifdef CONFIG_PHYS_64BIT
+ seq_printf(m, "0x%016llx ", brpn);
+#else
+ seq_printf(m, "0x%08x ", brpn);
+#endif
+ pt_dump_size(m, size);
+
+ if (k == 1)
+ seq_puts(m, "User ");
+ else if (k == 2)
+ seq_puts(m, "Kernel ");
+ else
+ seq_puts(m, "Kernel/User ");
+
+ if (lower & BPP_RX)
+ seq_puts(m, is_d ? "r " : " x ");
+ else if (lower & BPP_RW)
+ seq_puts(m, is_d ? "rw " : " x ");
+ else
+ seq_puts(m, is_d ? " " : " ");
+
+ seq_puts(m, lower & _PAGE_WRITETHRU ? "w " : " ");
+ seq_puts(m, lower & _PAGE_NO_CACHE ? "i " : " ");
+ seq_puts(m, lower & _PAGE_COHERENT ? "m " : " ");
+ seq_puts(m, lower & _PAGE_GUARDED ? "g " : " ");
+ seq_puts(m, "\n");
+}
+
+#define BAT_SHOW_603(_m, _n, _l, _u, _d) bat_show_603(_m, _n, mfspr(_l), mfspr(_u), _d)
+
+static int bats_show(struct seq_file *m, void *v)
+{
+ seq_puts(m, "---[ Instruction Block Address Translation ]---\n");
+
+ BAT_SHOW_603(m, 0, SPRN_IBAT0L, SPRN_IBAT0U, false);
+ BAT_SHOW_603(m, 1, SPRN_IBAT1L, SPRN_IBAT1U, false);
+ BAT_SHOW_603(m, 2, SPRN_IBAT2L, SPRN_IBAT2U, false);
+ BAT_SHOW_603(m, 3, SPRN_IBAT3L, SPRN_IBAT3U, false);
+ if (mmu_has_feature(MMU_FTR_USE_HIGH_BATS)) {
+ BAT_SHOW_603(m, 4, SPRN_IBAT4L, SPRN_IBAT4U, false);
+ BAT_SHOW_603(m, 5, SPRN_IBAT5L, SPRN_IBAT5U, false);
+ BAT_SHOW_603(m, 6, SPRN_IBAT6L, SPRN_IBAT6U, false);
+ BAT_SHOW_603(m, 7, SPRN_IBAT7L, SPRN_IBAT7U, false);
+ }
+
+ seq_puts(m, "\n---[ Data Block Address Translation ]---\n");
+
+ BAT_SHOW_603(m, 0, SPRN_DBAT0L, SPRN_DBAT0U, true);
+ BAT_SHOW_603(m, 1, SPRN_DBAT1L, SPRN_DBAT1U, true);
+ BAT_SHOW_603(m, 2, SPRN_DBAT2L, SPRN_DBAT2U, true);
+ BAT_SHOW_603(m, 3, SPRN_DBAT3L, SPRN_DBAT3U, true);
+ if (mmu_has_feature(MMU_FTR_USE_HIGH_BATS)) {
+ BAT_SHOW_603(m, 4, SPRN_DBAT4L, SPRN_DBAT4U, true);
+ BAT_SHOW_603(m, 5, SPRN_DBAT5L, SPRN_DBAT5U, true);
+ BAT_SHOW_603(m, 6, SPRN_DBAT6L, SPRN_DBAT6U, true);
+ BAT_SHOW_603(m, 7, SPRN_DBAT7L, SPRN_DBAT7U, true);
+ }
+
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(bats);
+
+static int __init bats_init(void)
+{
+ debugfs_create_file("block_address_translation", 0400,
+ arch_debugfs_dir, NULL, &bats_fops);
+ return 0;
+}
+device_initcall(bats_init);
diff --git a/arch/powerpc/mm/ptdump/book3s64.c b/arch/powerpc/mm/ptdump/book3s64.c
new file mode 100644
index 000000000000..e8a21c6dc32e
--- /dev/null
+++ b/arch/powerpc/mm/ptdump/book3s64.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * From split of dump_linuxpagetables.c
+ * Copyright 2016, Rashmica Gupta, IBM Corp.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/pgtable.h>
+
+#include "ptdump.h"
+
+static const struct flag_info flag_array[] = {
+ {
+ .mask = _PAGE_PRIVILEGED,
+ .val = 0,
+ .set = "user",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_READ,
+ .val = _PAGE_READ,
+ .set = "r",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_WRITE,
+ .val = _PAGE_WRITE,
+ .set = "w",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_EXEC,
+ .val = _PAGE_EXEC,
+ .set = " X ",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_PTE,
+ .val = _PAGE_PTE,
+ .set = "pte",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_PRESENT,
+ .val = _PAGE_PRESENT,
+ .set = "valid",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_PRESENT | _PAGE_INVALID,
+ .val = 0,
+ .set = " ",
+ .clear = "present",
+ }, {
+ .mask = H_PAGE_HASHPTE,
+ .val = H_PAGE_HASHPTE,
+ .set = "hpte",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_DIRTY,
+ .val = _PAGE_DIRTY,
+ .set = "dirty",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_ACCESSED,
+ .val = _PAGE_ACCESSED,
+ .set = "accessed",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_NON_IDEMPOTENT,
+ .val = _PAGE_NON_IDEMPOTENT,
+ .set = "non-idempotent",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_TOLERANT,
+ .val = _PAGE_TOLERANT,
+ .set = "tolerant",
+ .clear = " ",
+ }, {
+ .mask = H_PAGE_BUSY,
+ .val = H_PAGE_BUSY,
+ .set = "busy",
+ }, {
+#ifdef CONFIG_PPC_64K_PAGES
+ .mask = H_PAGE_COMBO,
+ .val = H_PAGE_COMBO,
+ .set = "combo",
+ }, {
+ .mask = H_PAGE_4K_PFN,
+ .val = H_PAGE_4K_PFN,
+ .set = "4K_pfn",
+ }, {
+#else /* CONFIG_PPC_64K_PAGES */
+ .mask = H_PAGE_F_GIX,
+ .val = H_PAGE_F_GIX,
+ .set = "f_gix",
+ .is_val = true,
+ .shift = H_PAGE_F_GIX_SHIFT,
+ }, {
+ .mask = H_PAGE_F_SECOND,
+ .val = H_PAGE_F_SECOND,
+ .set = "f_second",
+ }, {
+#endif /* CONFIG_PPC_64K_PAGES */
+ .mask = _PAGE_SPECIAL,
+ .val = _PAGE_SPECIAL,
+ .set = "special",
+ }
+};
+
+struct ptdump_pg_level pg_level[5] = {
+ { /* pgd */
+ .name = "PGD",
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ }, { /* p4d */
+ .name = "P4D",
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ }, { /* pud */
+ .name = "PUD",
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ }, { /* pmd */
+ .name = "PMD",
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ }, { /* pte */
+ .name = "PTE",
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ },
+};
diff --git a/arch/powerpc/mm/ptdump/hashpagetable.c b/arch/powerpc/mm/ptdump/hashpagetable.c
new file mode 100644
index 000000000000..671d0dc00c6d
--- /dev/null
+++ b/arch/powerpc/mm/ptdump/hashpagetable.c
@@ -0,0 +1,549 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2016, Rashmica Gupta, IBM Corp.
+ *
+ * This traverses the kernel virtual memory and dumps the pages that are in
+ * the hash pagetable, along with their flags to
+ * /sys/kernel/debug/kernel_hash_pagetable.
+ *
+ * If radix is enabled then there is no hash page table and so no debugfs file
+ * is generated.
+ */
+#include <linux/debugfs.h>
+#include <linux/fs.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/const.h>
+#include <asm/page.h>
+#include <asm/plpar_wrappers.h>
+#include <linux/memblock.h>
+#include <asm/firmware.h>
+#include <asm/pgalloc.h>
+
+struct pg_state {
+ struct seq_file *seq;
+ const struct addr_marker *marker;
+ unsigned long start_address;
+ unsigned int level;
+ u64 current_flags;
+};
+
+struct addr_marker {
+ unsigned long start_address;
+ const char *name;
+};
+
+static struct addr_marker address_markers[] = {
+ { 0, "Start of kernel VM" },
+ { 0, "vmalloc() Area" },
+ { 0, "vmalloc() End" },
+ { 0, "isa I/O start" },
+ { 0, "isa I/O end" },
+ { 0, "phb I/O start" },
+ { 0, "phb I/O end" },
+ { 0, "I/O remap start" },
+ { 0, "I/O remap end" },
+ { 0, "vmemmap start" },
+ { -1, NULL },
+};
+
+struct flag_info {
+ u64 mask;
+ u64 val;
+ const char *set;
+ const char *clear;
+ bool is_val;
+ int shift;
+};
+
+static const struct flag_info v_flag_array[] = {
+ {
+ .mask = SLB_VSID_B,
+ .val = SLB_VSID_B_256M,
+ .set = "ssize: 256M",
+ .clear = "ssize: 1T ",
+ }, {
+ .mask = HPTE_V_SECONDARY,
+ .val = HPTE_V_SECONDARY,
+ .set = "secondary",
+ .clear = "primary ",
+ }, {
+ .mask = HPTE_V_VALID,
+ .val = HPTE_V_VALID,
+ .set = "valid ",
+ .clear = "invalid",
+ }, {
+ .mask = HPTE_V_BOLTED,
+ .val = HPTE_V_BOLTED,
+ .set = "bolted",
+ .clear = "",
+ }
+};
+
+static const struct flag_info r_flag_array[] = {
+ {
+ .mask = HPTE_R_PP0 | HPTE_R_PP,
+ .val = PP_RWXX,
+ .set = "prot:RW--",
+ }, {
+ .mask = HPTE_R_PP0 | HPTE_R_PP,
+ .val = PP_RWRX,
+ .set = "prot:RWR-",
+ }, {
+ .mask = HPTE_R_PP0 | HPTE_R_PP,
+ .val = PP_RWRW,
+ .set = "prot:RWRW",
+ }, {
+ .mask = HPTE_R_PP0 | HPTE_R_PP,
+ .val = PP_RXRX,
+ .set = "prot:R-R-",
+ }, {
+ .mask = HPTE_R_PP0 | HPTE_R_PP,
+ .val = PP_RXXX,
+ .set = "prot:R---",
+ }, {
+ .mask = HPTE_R_KEY_HI | HPTE_R_KEY_LO,
+ .val = HPTE_R_KEY_HI | HPTE_R_KEY_LO,
+ .set = "key",
+ .clear = "",
+ .is_val = true,
+ }, {
+ .mask = HPTE_R_R,
+ .val = HPTE_R_R,
+ .set = "ref",
+ .clear = " ",
+ }, {
+ .mask = HPTE_R_C,
+ .val = HPTE_R_C,
+ .set = "changed",
+ .clear = " ",
+ }, {
+ .mask = HPTE_R_N,
+ .val = HPTE_R_N,
+ .set = "no execute",
+ }, {
+ .mask = HPTE_R_WIMG,
+ .val = HPTE_R_W,
+ .set = "writethru",
+ }, {
+ .mask = HPTE_R_WIMG,
+ .val = HPTE_R_I,
+ .set = "no cache",
+ }, {
+ .mask = HPTE_R_WIMG,
+ .val = HPTE_R_G,
+ .set = "guarded",
+ }
+};
+
+static int calculate_pagesize(struct pg_state *st, int ps, char s[])
+{
+ static const char units[] = "BKMGTPE";
+ const char *unit = units;
+
+ while (ps > 9 && unit[1]) {
+ ps -= 10;
+ unit++;
+ }
+ seq_printf(st->seq, " %s_ps: %i%c\t", s, 1<<ps, *unit);
+ return ps;
+}
+
+static void dump_flag_info(struct pg_state *st, const struct flag_info
+ *flag, u64 pte, int num)
+{
+ unsigned int i;
+
+ for (i = 0; i < num; i++, flag++) {
+ const char *s = NULL;
+ u64 val;
+
+ /* flag not defined so don't check it */
+ if (flag->mask == 0)
+ continue;
+ /* Some 'flags' are actually values */
+ if (flag->is_val) {
+ val = pte & flag->val;
+ if (flag->shift)
+ val = val >> flag->shift;
+ seq_printf(st->seq, " %s:%llx", flag->set, val);
+ } else {
+ if ((pte & flag->mask) == flag->val)
+ s = flag->set;
+ else
+ s = flag->clear;
+ if (s)
+ seq_printf(st->seq, " %s", s);
+ }
+ }
+}
+
+static void dump_hpte_info(struct pg_state *st, unsigned long ea, u64 v, u64 r,
+ unsigned long rpn, int bps, int aps, unsigned long lp)
+{
+ int aps_index;
+
+ while (ea >= st->marker[1].start_address) {
+ st->marker++;
+ seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
+ }
+ seq_printf(st->seq, "0x%lx:\t", ea);
+ seq_printf(st->seq, "AVPN:%llx\t", HPTE_V_AVPN_VAL(v));
+ dump_flag_info(st, v_flag_array, v, ARRAY_SIZE(v_flag_array));
+ seq_printf(st->seq, " rpn: %lx\t", rpn);
+ dump_flag_info(st, r_flag_array, r, ARRAY_SIZE(r_flag_array));
+
+ calculate_pagesize(st, bps, "base");
+ aps_index = calculate_pagesize(st, aps, "actual");
+ if (aps_index != 2)
+ seq_printf(st->seq, "LP enc: %lx", lp);
+ seq_putc(st->seq, '\n');
+}
+
+
+static int native_find(unsigned long ea, int psize, bool primary, u64 *v, u64
+ *r)
+{
+ struct hash_pte *hptep;
+ unsigned long hash, vsid, vpn, hpte_group, want_v, hpte_v;
+ int i, ssize = mmu_kernel_ssize;
+ unsigned long shift = mmu_psize_defs[psize].shift;
+
+ /* calculate hash */
+ vsid = get_kernel_vsid(ea, ssize);
+ vpn = hpt_vpn(ea, vsid, ssize);
+ hash = hpt_hash(vpn, shift, ssize);
+ want_v = hpte_encode_avpn(vpn, psize, ssize);
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ want_v = hpte_old_to_new_v(want_v);
+
+ /* to check in the secondary hash table, we invert the hash */
+ if (!primary)
+ hash = ~hash;
+ hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ for (i = 0; i < HPTES_PER_GROUP; i++) {
+ hptep = htab_address + hpte_group;
+ hpte_v = be64_to_cpu(hptep->v);
+
+ if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) {
+ /* HPTE matches */
+ *v = be64_to_cpu(hptep->v);
+ *r = be64_to_cpu(hptep->r);
+ if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+ *v = hpte_new_to_old_v(*v, *r);
+ *r = hpte_new_to_old_r(*r);
+ }
+ return 0;
+ }
+ ++hpte_group;
+ }
+ return -1;
+}
+
+static int pseries_find(unsigned long ea, int psize, bool primary, u64 *v, u64 *r)
+{
+ struct {
+ unsigned long v;
+ unsigned long r;
+ } ptes[4];
+ unsigned long vsid, vpn, hash, hpte_group, want_v;
+ int i, j, ssize = mmu_kernel_ssize;
+ long lpar_rc = 0;
+ unsigned long shift = mmu_psize_defs[psize].shift;
+
+ /* calculate hash */
+ vsid = get_kernel_vsid(ea, ssize);
+ vpn = hpt_vpn(ea, vsid, ssize);
+ hash = hpt_hash(vpn, shift, ssize);
+ want_v = hpte_encode_avpn(vpn, psize, ssize);
+
+ /* to check in the secondary hash table, we invert the hash */
+ if (!primary)
+ hash = ~hash;
+ hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ /* see if we can find an entry in the hpte with this hash */
+ for (i = 0; i < HPTES_PER_GROUP; i += 4, hpte_group += 4) {
+ lpar_rc = plpar_pte_read_4(0, hpte_group, (void *)ptes);
+
+ if (lpar_rc)
+ continue;
+ for (j = 0; j < 4; j++) {
+ if (HPTE_V_COMPARE(ptes[j].v, want_v) &&
+ (ptes[j].v & HPTE_V_VALID)) {
+ /* HPTE matches */
+ *v = ptes[j].v;
+ *r = ptes[j].r;
+ return 0;
+ }
+ }
+ }
+ return -1;
+}
+
+static void decode_r(int bps, unsigned long r, unsigned long *rpn, int *aps,
+ unsigned long *lp_bits)
+{
+ struct mmu_psize_def entry;
+ unsigned long arpn, mask, lp;
+ int penc = -2, idx = 0, shift;
+
+ /*.
+ * The LP field has 8 bits. Depending on the actual page size, some of
+ * these bits are concatenated with the APRN to get the RPN. The rest
+ * of the bits in the LP field is the LP value and is an encoding for
+ * the base page size and the actual page size.
+ *
+ * - find the mmu entry for our base page size
+ * - go through all page encodings and use the associated mask to
+ * find an encoding that matches our encoding in the LP field.
+ */
+ arpn = (r & HPTE_R_RPN) >> HPTE_R_RPN_SHIFT;
+ lp = arpn & 0xff;
+
+ entry = mmu_psize_defs[bps];
+ while (idx < MMU_PAGE_COUNT) {
+ penc = entry.penc[idx];
+ if ((penc != -1) && (mmu_psize_defs[idx].shift)) {
+ shift = mmu_psize_defs[idx].shift - HPTE_R_RPN_SHIFT;
+ mask = (0x1 << (shift)) - 1;
+ if ((lp & mask) == penc) {
+ *aps = mmu_psize_to_shift(idx);
+ *lp_bits = lp & mask;
+ *rpn = arpn >> shift;
+ return;
+ }
+ }
+ idx++;
+ }
+}
+
+static int base_hpte_find(unsigned long ea, int psize, bool primary, u64 *v,
+ u64 *r)
+{
+ if (IS_ENABLED(CONFIG_PPC_PSERIES) && firmware_has_feature(FW_FEATURE_LPAR))
+ return pseries_find(ea, psize, primary, v, r);
+
+ return native_find(ea, psize, primary, v, r);
+}
+
+static unsigned long hpte_find(struct pg_state *st, unsigned long ea, int psize)
+{
+ unsigned long slot;
+ u64 v = 0, r = 0;
+ unsigned long rpn, lp_bits;
+ int base_psize = 0, actual_psize = 0;
+
+ if (ea < PAGE_OFFSET)
+ return -1;
+
+ /* Look in primary table */
+ slot = base_hpte_find(ea, psize, true, &v, &r);
+
+ /* Look in secondary table */
+ if (slot == -1)
+ slot = base_hpte_find(ea, psize, false, &v, &r);
+
+ /* No entry found */
+ if (slot == -1)
+ return -1;
+
+ /*
+ * We found an entry in the hash page table:
+ * - check that this has the same base page
+ * - find the actual page size
+ * - find the RPN
+ */
+ base_psize = mmu_psize_to_shift(psize);
+
+ if ((v & HPTE_V_LARGE) == HPTE_V_LARGE) {
+ decode_r(psize, r, &rpn, &actual_psize, &lp_bits);
+ } else {
+ /* 4K actual page size */
+ actual_psize = 12;
+ rpn = (r & HPTE_R_RPN) >> HPTE_R_RPN_SHIFT;
+ /* In this case there are no LP bits */
+ lp_bits = -1;
+ }
+ /*
+ * We didn't find a matching encoding, so the PTE we found isn't for
+ * this address.
+ */
+ if (actual_psize == -1)
+ return -1;
+
+ dump_hpte_info(st, ea, v, r, rpn, base_psize, actual_psize, lp_bits);
+ return 0;
+}
+
+static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start)
+{
+ pte_t *pte = pte_offset_kernel(pmd, 0);
+ unsigned long addr, pteval, psize;
+ int i, status;
+
+ for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
+ addr = start + i * PAGE_SIZE;
+ pteval = pte_val(*pte);
+
+ if (addr < VMALLOC_END)
+ psize = mmu_vmalloc_psize;
+ else
+ psize = mmu_io_psize;
+
+ /* check for secret 4K mappings */
+ if (IS_ENABLED(CONFIG_PPC_64K_PAGES) &&
+ ((pteval & H_PAGE_COMBO) == H_PAGE_COMBO ||
+ (pteval & H_PAGE_4K_PFN) == H_PAGE_4K_PFN))
+ psize = mmu_io_psize;
+
+ /* check for hashpte */
+ status = hpte_find(st, addr, psize);
+
+ if (((pteval & H_PAGE_HASHPTE) != H_PAGE_HASHPTE)
+ && (status != -1)) {
+ /* found a hpte that is not in the linux page tables */
+ seq_printf(st->seq, "page probably bolted before linux"
+ " pagetables were set: addr:%lx, pteval:%lx\n",
+ addr, pteval);
+ }
+ }
+}
+
+static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start)
+{
+ pmd_t *pmd = pmd_offset(pud, 0);
+ unsigned long addr;
+ unsigned int i;
+
+ for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
+ addr = start + i * PMD_SIZE;
+ if (!pmd_none(*pmd))
+ /* pmd exists */
+ walk_pte(st, pmd, addr);
+ }
+}
+
+static void walk_pud(struct pg_state *st, p4d_t *p4d, unsigned long start)
+{
+ pud_t *pud = pud_offset(p4d, 0);
+ unsigned long addr;
+ unsigned int i;
+
+ for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
+ addr = start + i * PUD_SIZE;
+ if (!pud_none(*pud))
+ /* pud exists */
+ walk_pmd(st, pud, addr);
+ }
+}
+
+static void walk_p4d(struct pg_state *st, pgd_t *pgd, unsigned long start)
+{
+ p4d_t *p4d = p4d_offset(pgd, 0);
+ unsigned long addr;
+ unsigned int i;
+
+ for (i = 0; i < PTRS_PER_P4D; i++, p4d++) {
+ addr = start + i * P4D_SIZE;
+ if (!p4d_none(*p4d))
+ /* p4d exists */
+ walk_pud(st, p4d, addr);
+ }
+}
+
+static void walk_pagetables(struct pg_state *st)
+{
+ pgd_t *pgd = pgd_offset_k(0UL);
+ unsigned int i;
+ unsigned long addr;
+
+ /*
+ * Traverse the linux pagetable structure and dump pages that are in
+ * the hash pagetable.
+ */
+ for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
+ addr = KERN_VIRT_START + i * PGDIR_SIZE;
+ if (!pgd_none(*pgd))
+ /* pgd exists */
+ walk_p4d(st, pgd, addr);
+ }
+}
+
+
+static void walk_linearmapping(struct pg_state *st)
+{
+ unsigned long addr;
+
+ /*
+ * Traverse the linear mapping section of virtual memory and dump pages
+ * that are in the hash pagetable.
+ */
+ unsigned long psize = 1 << mmu_psize_defs[mmu_linear_psize].shift;
+
+ for (addr = PAGE_OFFSET; addr < PAGE_OFFSET +
+ memblock_end_of_DRAM(); addr += psize)
+ hpte_find(st, addr, mmu_linear_psize);
+}
+
+static void walk_vmemmap(struct pg_state *st)
+{
+ struct vmemmap_backing *ptr = vmemmap_list;
+
+ if (!IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP))
+ return;
+ /*
+ * Traverse the vmemmaped memory and dump pages that are in the hash
+ * pagetable.
+ */
+ while (ptr) {
+ hpte_find(st, ptr->virt_addr, mmu_vmemmap_psize);
+ ptr = ptr->list;
+ }
+ seq_puts(st->seq, "---[ vmemmap end ]---\n");
+}
+
+static void populate_markers(void)
+{
+ address_markers[0].start_address = PAGE_OFFSET;
+ address_markers[1].start_address = VMALLOC_START;
+ address_markers[2].start_address = VMALLOC_END;
+ address_markers[3].start_address = ISA_IO_BASE;
+ address_markers[4].start_address = ISA_IO_END;
+ address_markers[5].start_address = PHB_IO_BASE;
+ address_markers[6].start_address = PHB_IO_END;
+ address_markers[7].start_address = IOREMAP_BASE;
+ address_markers[8].start_address = IOREMAP_END;
+ address_markers[9].start_address = H_VMEMMAP_START;
+}
+
+static int ptdump_show(struct seq_file *m, void *v)
+{
+ struct pg_state st = {
+ .seq = m,
+ .start_address = PAGE_OFFSET,
+ .marker = address_markers,
+ };
+ /*
+ * Traverse the 0xc, 0xd and 0xf areas of the kernel virtual memory and
+ * dump pages that are in the hash pagetable.
+ */
+ walk_linearmapping(&st);
+ walk_pagetables(&st);
+ walk_vmemmap(&st);
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(ptdump);
+
+static int ptdump_init(void)
+{
+ if (!radix_enabled()) {
+ populate_markers();
+ debugfs_create_file("kernel_hash_pagetable", 0400, NULL, NULL,
+ &ptdump_fops);
+ }
+ return 0;
+}
+device_initcall(ptdump_init);
diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
new file mode 100644
index 000000000000..0d499aebee72
--- /dev/null
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -0,0 +1,425 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2016, Rashmica Gupta, IBM Corp.
+ *
+ * This traverses the kernel pagetables and dumps the
+ * information about the used sections of memory to
+ * /sys/kernel/debug/kernel_pagetables.
+ *
+ * Derived from the arm64 implementation:
+ * Copyright (c) 2014, The Linux Foundation, Laura Abbott.
+ * (C) Copyright 2008 Intel Corporation, Arjan van de Ven.
+ */
+#include <linux/debugfs.h>
+#include <linux/fs.h>
+#include <linux/hugetlb.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/ptdump.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <asm/fixmap.h>
+#include <linux/const.h>
+#include <linux/kasan.h>
+#include <asm/page.h>
+#include <asm/hugetlb.h>
+
+#include <mm/mmu_decl.h>
+
+#include "ptdump.h"
+
+/*
+ * To visualise what is happening,
+ *
+ * - PTRS_PER_P** = how many entries there are in the corresponding P**
+ * - P**_SHIFT = how many bits of the address we use to index into the
+ * corresponding P**
+ * - P**_SIZE is how much memory we can access through the table - not the
+ * size of the table itself.
+ * P**={PGD, PUD, PMD, PTE}
+ *
+ *
+ * Each entry of the PGD points to a PUD. Each entry of a PUD points to a
+ * PMD. Each entry of a PMD points to a PTE. And every PTE entry points to
+ * a page.
+ *
+ * In the case where there are only 3 levels, the PUD is folded into the
+ * PGD: every PUD has only one entry which points to the PMD.
+ *
+ * The page dumper groups page table entries of the same type into a single
+ * description. It uses pg_state to track the range information while
+ * iterating over the PTE entries. When the continuity is broken it then
+ * dumps out a description of the range - ie PTEs that are virtually contiguous
+ * with the same PTE flags are chunked together. This is to make it clear how
+ * different areas of the kernel virtual memory are used.
+ *
+ */
+struct pg_state {
+ struct ptdump_state ptdump;
+ struct seq_file *seq;
+ const struct addr_marker *marker;
+ unsigned long start_address;
+ unsigned long start_pa;
+ int level;
+ u64 current_flags;
+ bool check_wx;
+ unsigned long wx_pages;
+};
+
+struct addr_marker {
+ unsigned long start_address;
+ const char *name;
+};
+
+static struct addr_marker address_markers[] = {
+ { 0, "Start of kernel VM" },
+#ifdef MODULES_VADDR
+ { 0, "modules start" },
+ { 0, "modules end" },
+#endif
+ { 0, "vmalloc() Area" },
+ { 0, "vmalloc() End" },
+#ifdef CONFIG_PPC64
+ { 0, "isa I/O start" },
+ { 0, "isa I/O end" },
+ { 0, "phb I/O start" },
+ { 0, "phb I/O end" },
+ { 0, "I/O remap start" },
+ { 0, "I/O remap end" },
+ { 0, "vmemmap start" },
+#else
+ { 0, "Early I/O remap start" },
+ { 0, "Early I/O remap end" },
+#ifdef CONFIG_HIGHMEM
+ { 0, "Highmem PTEs start" },
+ { 0, "Highmem PTEs end" },
+#endif
+ { 0, "Fixmap start" },
+ { 0, "Fixmap end" },
+#endif
+#ifdef CONFIG_KASAN
+ { 0, "kasan shadow mem start" },
+ { 0, "kasan shadow mem end" },
+#endif
+ { -1, NULL },
+};
+
+static struct ptdump_range ptdump_range[] __ro_after_init = {
+ {TASK_SIZE_MAX, ~0UL},
+ {0, 0}
+};
+
+#define pt_dump_seq_printf(m, fmt, args...) \
+({ \
+ if (m) \
+ seq_printf(m, fmt, ##args); \
+})
+
+#define pt_dump_seq_putc(m, c) \
+({ \
+ if (m) \
+ seq_putc(m, c); \
+})
+
+void pt_dump_size(struct seq_file *m, unsigned long size)
+{
+ static const char units[] = " KMGTPE";
+ const char *unit = units;
+
+ /* Work out what appropriate unit to use */
+ while (!(size & 1023) && unit[1]) {
+ size >>= 10;
+ unit++;
+ }
+ pt_dump_seq_printf(m, "%9lu%c ", size, *unit);
+}
+
+static void dump_flag_info(struct pg_state *st, const struct flag_info
+ *flag, u64 pte, int num)
+{
+ unsigned int i;
+
+ for (i = 0; i < num; i++, flag++) {
+ const char *s = NULL;
+ u64 val;
+
+ /* flag not defined so don't check it */
+ if (flag->mask == 0)
+ continue;
+ /* Some 'flags' are actually values */
+ if (flag->is_val) {
+ val = pte & flag->val;
+ if (flag->shift)
+ val = val >> flag->shift;
+ pt_dump_seq_printf(st->seq, " %s:%llx", flag->set, val);
+ } else {
+ if ((pte & flag->mask) == flag->val)
+ s = flag->set;
+ else
+ s = flag->clear;
+ if (s)
+ pt_dump_seq_printf(st->seq, " %s", s);
+ }
+ st->current_flags &= ~flag->mask;
+ }
+ if (st->current_flags != 0)
+ pt_dump_seq_printf(st->seq, " unknown flags:%llx", st->current_flags);
+}
+
+static void dump_addr(struct pg_state *st, unsigned long addr)
+{
+#ifdef CONFIG_PPC64
+#define REG "0x%016lx"
+#else
+#define REG "0x%08lx"
+#endif
+
+ pt_dump_seq_printf(st->seq, REG "-" REG " ", st->start_address, addr - 1);
+ pt_dump_seq_printf(st->seq, " " REG " ", st->start_pa);
+ pt_dump_size(st->seq, addr - st->start_address);
+ pt_dump_seq_printf(st->seq, "%s ", pg_level[st->level].name);
+}
+
+static void note_prot_wx(struct pg_state *st, unsigned long addr)
+{
+ pte_t pte = __pte(st->current_flags);
+
+ if (!st->check_wx)
+ return;
+
+ if (!pte_write(pte) || !pte_exec(pte))
+ return;
+
+ WARN_ONCE(IS_ENABLED(CONFIG_DEBUG_WX),
+ "powerpc/mm: Found insecure W+X mapping at address %p/%pS\n",
+ (void *)st->start_address, (void *)st->start_address);
+
+ st->wx_pages += (addr - st->start_address) / PAGE_SIZE;
+}
+
+static void note_page_update_state(struct pg_state *st, unsigned long addr, int level, u64 val)
+{
+ u64 flag = level >= 0 ? val & pg_level[level].mask : 0;
+ u64 pa = val & PTE_RPN_MASK;
+
+ st->level = level;
+ st->current_flags = flag;
+ st->start_address = addr;
+ st->start_pa = pa;
+
+ while (addr >= st->marker[1].start_address) {
+ st->marker++;
+ pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
+ }
+}
+
+static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, u64 val)
+{
+ u64 flag = level >= 0 ? val & pg_level[level].mask : 0;
+ struct pg_state *st = container_of(pt_st, struct pg_state, ptdump);
+
+ /* At first no level is set */
+ if (st->level == -1) {
+ pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
+ note_page_update_state(st, addr, level, val);
+ /*
+ * Dump the section of virtual memory when:
+ * - the PTE flags from one entry to the next differs.
+ * - we change levels in the tree.
+ * - the address is in a different section of memory and is thus
+ * used for a different purpose, regardless of the flags.
+ */
+ } else if (flag != st->current_flags || level != st->level ||
+ addr >= st->marker[1].start_address) {
+
+ /* Check the PTE flags */
+ if (st->current_flags) {
+ note_prot_wx(st, addr);
+ dump_addr(st, addr);
+
+ /* Dump all the flags */
+ if (pg_level[st->level].flag)
+ dump_flag_info(st, pg_level[st->level].flag,
+ st->current_flags,
+ pg_level[st->level].num);
+
+ pt_dump_seq_putc(st->seq, '\n');
+ }
+
+ /*
+ * Address indicates we have passed the end of the
+ * current section of virtual memory
+ */
+ note_page_update_state(st, addr, level, val);
+ }
+}
+
+static void populate_markers(void)
+{
+ int i = 0;
+
+#ifdef CONFIG_PPC64
+ address_markers[i++].start_address = PAGE_OFFSET;
+#else
+ address_markers[i++].start_address = TASK_SIZE;
+#endif
+#ifdef MODULES_VADDR
+ address_markers[i++].start_address = MODULES_VADDR;
+ address_markers[i++].start_address = MODULES_END;
+#endif
+ address_markers[i++].start_address = VMALLOC_START;
+ address_markers[i++].start_address = VMALLOC_END;
+#ifdef CONFIG_PPC64
+ address_markers[i++].start_address = ISA_IO_BASE;
+ address_markers[i++].start_address = ISA_IO_END;
+ address_markers[i++].start_address = PHB_IO_BASE;
+ address_markers[i++].start_address = PHB_IO_END;
+ address_markers[i++].start_address = IOREMAP_BASE;
+ address_markers[i++].start_address = IOREMAP_END;
+ /* What is the ifdef about? */
+#ifdef CONFIG_PPC_BOOK3S_64
+ address_markers[i++].start_address = H_VMEMMAP_START;
+#else
+ address_markers[i++].start_address = VMEMMAP_BASE;
+#endif
+#else /* !CONFIG_PPC64 */
+ address_markers[i++].start_address = ioremap_bot;
+ address_markers[i++].start_address = IOREMAP_TOP;
+#ifdef CONFIG_HIGHMEM
+ address_markers[i++].start_address = PKMAP_BASE;
+ address_markers[i++].start_address = PKMAP_ADDR(LAST_PKMAP);
+#endif
+ address_markers[i++].start_address = FIXADDR_START;
+ address_markers[i++].start_address = FIXADDR_TOP;
+#endif /* CONFIG_PPC64 */
+#ifdef CONFIG_KASAN
+ address_markers[i++].start_address = KASAN_SHADOW_START;
+ address_markers[i++].start_address = KASAN_SHADOW_END;
+#endif
+}
+
+static void note_page_pte(struct ptdump_state *pt_st, unsigned long addr, pte_t pte)
+{
+ note_page(pt_st, addr, 4, pte_val(pte));
+}
+
+static void note_page_pmd(struct ptdump_state *pt_st, unsigned long addr, pmd_t pmd)
+{
+ note_page(pt_st, addr, 3, pmd_val(pmd));
+}
+
+static void note_page_pud(struct ptdump_state *pt_st, unsigned long addr, pud_t pud)
+{
+ note_page(pt_st, addr, 2, pud_val(pud));
+}
+
+static void note_page_p4d(struct ptdump_state *pt_st, unsigned long addr, p4d_t p4d)
+{
+ note_page(pt_st, addr, 1, p4d_val(p4d));
+}
+
+static void note_page_pgd(struct ptdump_state *pt_st, unsigned long addr, pgd_t pgd)
+{
+ note_page(pt_st, addr, 0, pgd_val(pgd));
+}
+
+static void note_page_flush(struct ptdump_state *pt_st)
+{
+ pte_t pte_zero = {0};
+
+ note_page(pt_st, 0, -1, pte_val(pte_zero));
+}
+
+static int ptdump_show(struct seq_file *m, void *v)
+{
+ struct pg_state st = {
+ .seq = m,
+ .marker = address_markers,
+ .level = -1,
+ .ptdump = {
+ .note_page_pte = note_page_pte,
+ .note_page_pmd = note_page_pmd,
+ .note_page_pud = note_page_pud,
+ .note_page_p4d = note_page_p4d,
+ .note_page_pgd = note_page_pgd,
+ .note_page_flush = note_page_flush,
+ .range = ptdump_range,
+ }
+ };
+
+ /* Traverse kernel page tables */
+ ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(ptdump);
+
+static void __init build_pgtable_complete_mask(void)
+{
+ unsigned int i, j;
+
+ for (i = 0; i < ARRAY_SIZE(pg_level); i++)
+ if (pg_level[i].flag)
+ for (j = 0; j < pg_level[i].num; j++)
+ pg_level[i].mask |= pg_level[i].flag[j].mask;
+}
+
+bool ptdump_check_wx(void)
+{
+ struct pg_state st = {
+ .seq = NULL,
+ .marker = (struct addr_marker[]) {
+ { 0, NULL},
+ { -1, NULL},
+ },
+ .level = -1,
+ .check_wx = true,
+ .ptdump = {
+ .note_page_pte = note_page_pte,
+ .note_page_pmd = note_page_pmd,
+ .note_page_pud = note_page_pud,
+ .note_page_p4d = note_page_p4d,
+ .note_page_pgd = note_page_pgd,
+ .note_page_flush = note_page_flush,
+ .range = ptdump_range,
+ }
+ };
+
+ if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !mmu_has_feature(MMU_FTR_KERNEL_RO))
+ return true;
+
+ ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
+
+ if (st.wx_pages) {
+ pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n",
+ st.wx_pages);
+
+ return false;
+ } else {
+ pr_info("Checked W+X mappings: passed, no W+X pages found\n");
+
+ return true;
+ }
+}
+
+static int __init ptdump_init(void)
+{
+#ifdef CONFIG_PPC64
+ if (!radix_enabled())
+ ptdump_range[0].start = KERN_VIRT_START;
+ else
+ ptdump_range[0].start = PAGE_OFFSET;
+
+ ptdump_range[0].end = PAGE_OFFSET + (PGDIR_SIZE * PTRS_PER_PGD);
+#endif
+
+ populate_markers();
+ build_pgtable_complete_mask();
+
+ if (IS_ENABLED(CONFIG_PTDUMP_DEBUGFS))
+ debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, &ptdump_fops);
+
+ return 0;
+}
+device_initcall(ptdump_init);
diff --git a/arch/powerpc/mm/ptdump/ptdump.h b/arch/powerpc/mm/ptdump/ptdump.h
new file mode 100644
index 000000000000..12aa9eca8b0c
--- /dev/null
+++ b/arch/powerpc/mm/ptdump/ptdump.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/types.h>
+#include <linux/seq_file.h>
+
+struct flag_info {
+ u64 mask;
+ u64 val;
+ const char *set;
+ const char *clear;
+ bool is_val;
+ int shift;
+};
+
+struct ptdump_pg_level {
+ const struct flag_info *flag;
+ char name[4];
+ size_t num;
+ u64 mask;
+};
+
+extern struct ptdump_pg_level pg_level[5];
+
+void pt_dump_size(struct seq_file *m, unsigned long delta);
diff --git a/arch/powerpc/mm/ptdump/segment_regs.c b/arch/powerpc/mm/ptdump/segment_regs.c
new file mode 100644
index 000000000000..9df3af8d481f
--- /dev/null
+++ b/arch/powerpc/mm/ptdump/segment_regs.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright 2018, Christophe Leroy CS S.I.
+ * <christophe.leroy@c-s.fr>
+ *
+ * This dumps the content of Segment Registers
+ */
+
+#include <linux/debugfs.h>
+
+static void seg_show(struct seq_file *m, int i)
+{
+ u32 val = mfsr(i << 28);
+
+ seq_printf(m, "0x%01x0000000-0x%01xfffffff ", i, i);
+ seq_printf(m, "Kern key %d ", (val >> 30) & 1);
+ seq_printf(m, "User key %d ", (val >> 29) & 1);
+ if (val & 0x80000000) {
+ seq_printf(m, "Device 0x%03x", (val >> 20) & 0x1ff);
+ seq_printf(m, "-0x%05x", val & 0xfffff);
+ } else {
+ if (val & 0x10000000)
+ seq_puts(m, "No Exec ");
+ seq_printf(m, "VSID 0x%06x", val & 0xffffff);
+ }
+ seq_puts(m, "\n");
+}
+
+static int sr_show(struct seq_file *m, void *v)
+{
+ int i;
+
+ seq_puts(m, "---[ User Segments ]---\n");
+ for (i = 0; i < TASK_SIZE >> 28; i++)
+ seg_show(m, i);
+
+ seq_puts(m, "\n---[ Kernel Segments ]---\n");
+ for (; i < 16; i++)
+ seg_show(m, i);
+
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(sr);
+
+static int __init sr_init(void)
+{
+ debugfs_create_file("segment_registers", 0400, arch_debugfs_dir,
+ NULL, &sr_fops);
+ return 0;
+}
+device_initcall(sr_init);
diff --git a/arch/powerpc/mm/ptdump/shared.c b/arch/powerpc/mm/ptdump/shared.c
new file mode 100644
index 000000000000..edc69da19b85
--- /dev/null
+++ b/arch/powerpc/mm/ptdump/shared.c
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * From split of dump_linuxpagetables.c
+ * Copyright 2016, Rashmica Gupta, IBM Corp.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/pgtable.h>
+
+#include "ptdump.h"
+
+static const struct flag_info flag_array[] = {
+ {
+ .mask = _PAGE_READ,
+ .val = 0,
+ .set = " ",
+ .clear = "r",
+ }, {
+ .mask = _PAGE_WRITE,
+ .val = 0,
+ .set = " ",
+ .clear = "w",
+ }, {
+ .mask = _PAGE_EXEC,
+ .val = _PAGE_EXEC,
+ .set = " X ",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_PRESENT,
+ .val = _PAGE_PRESENT,
+ .set = "present",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_COHERENT,
+ .val = _PAGE_COHERENT,
+ .set = "coherent",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_GUARDED,
+ .val = _PAGE_GUARDED,
+ .set = "guarded",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_DIRTY,
+ .val = _PAGE_DIRTY,
+ .set = "dirty",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_ACCESSED,
+ .val = _PAGE_ACCESSED,
+ .set = "accessed",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_WRITETHRU,
+ .val = _PAGE_WRITETHRU,
+ .set = "write through",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_NO_CACHE,
+ .val = _PAGE_NO_CACHE,
+ .set = "no cache",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_SPECIAL,
+ .val = _PAGE_SPECIAL,
+ .set = "special",
+ }
+};
+
+struct ptdump_pg_level pg_level[5] = {
+ { /* pgd */
+ .name = "PGD",
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ }, { /* p4d */
+ .name = "P4D",
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ }, { /* pud */
+ .name = "PUD",
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ }, { /* pmd */
+ .name = "PMD",
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ }, { /* pte */
+ .name = "PTE",
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ },
+};
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
deleted file mode 100644
index a538c80db2df..000000000000
--- a/arch/powerpc/mm/slb.c
+++ /dev/null
@@ -1,332 +0,0 @@
-/*
- * PowerPC64 SLB support.
- *
- * Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM
- * Based on earlier code written by:
- * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
- * Copyright (c) 2001 Dave Engebretsen
- * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
- *
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <asm/pgtable.h>
-#include <asm/mmu.h>
-#include <asm/mmu_context.h>
-#include <asm/paca.h>
-#include <asm/cputable.h>
-#include <asm/cacheflush.h>
-#include <asm/smp.h>
-#include <linux/compiler.h>
-#include <asm/udbg.h>
-#include <asm/code-patching.h>
-
-
-extern void slb_allocate_realmode(unsigned long ea);
-extern void slb_allocate_user(unsigned long ea);
-
-static void slb_allocate(unsigned long ea)
-{
- /* Currently, we do real mode for all SLBs including user, but
- * that will change if we bring back dynamic VSIDs
- */
- slb_allocate_realmode(ea);
-}
-
-#define slb_esid_mask(ssize) \
- (((ssize) == MMU_SEGSIZE_256M)? ESID_MASK: ESID_MASK_1T)
-
-static inline unsigned long mk_esid_data(unsigned long ea, int ssize,
- unsigned long slot)
-{
- return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | slot;
-}
-
-#define slb_vsid_shift(ssize) \
- ((ssize) == MMU_SEGSIZE_256M? SLB_VSID_SHIFT: SLB_VSID_SHIFT_1T)
-
-static inline unsigned long mk_vsid_data(unsigned long ea, int ssize,
- unsigned long flags)
-{
- return (get_kernel_vsid(ea, ssize) << slb_vsid_shift(ssize)) | flags |
- ((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT);
-}
-
-static inline void slb_shadow_update(unsigned long ea, int ssize,
- unsigned long flags,
- unsigned long entry)
-{
- /*
- * Clear the ESID first so the entry is not valid while we are
- * updating it. No write barriers are needed here, provided
- * we only update the current CPU's SLB shadow buffer.
- */
- get_slb_shadow()->save_area[entry].esid = 0;
- get_slb_shadow()->save_area[entry].vsid = mk_vsid_data(ea, ssize, flags);
- get_slb_shadow()->save_area[entry].esid = mk_esid_data(ea, ssize, entry);
-}
-
-static inline void slb_shadow_clear(unsigned long entry)
-{
- get_slb_shadow()->save_area[entry].esid = 0;
-}
-
-static inline void create_shadowed_slbe(unsigned long ea, int ssize,
- unsigned long flags,
- unsigned long entry)
-{
- /*
- * Updating the shadow buffer before writing the SLB ensures
- * we don't get a stale entry here if we get preempted by PHYP
- * between these two statements.
- */
- slb_shadow_update(ea, ssize, flags, entry);
-
- asm volatile("slbmte %0,%1" :
- : "r" (mk_vsid_data(ea, ssize, flags)),
- "r" (mk_esid_data(ea, ssize, entry))
- : "memory" );
-}
-
-static void __slb_flush_and_rebolt(void)
-{
- /* If you change this make sure you change SLB_NUM_BOLTED
- * appropriately too. */
- unsigned long linear_llp, vmalloc_llp, lflags, vflags;
- unsigned long ksp_esid_data, ksp_vsid_data;
-
- linear_llp = mmu_psize_defs[mmu_linear_psize].sllp;
- vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp;
- lflags = SLB_VSID_KERNEL | linear_llp;
- vflags = SLB_VSID_KERNEL | vmalloc_llp;
-
- ksp_esid_data = mk_esid_data(get_paca()->kstack, mmu_kernel_ssize, 2);
- if ((ksp_esid_data & ~0xfffffffUL) <= PAGE_OFFSET) {
- ksp_esid_data &= ~SLB_ESID_V;
- ksp_vsid_data = 0;
- slb_shadow_clear(2);
- } else {
- /* Update stack entry; others don't change */
- slb_shadow_update(get_paca()->kstack, mmu_kernel_ssize, lflags, 2);
- ksp_vsid_data = get_slb_shadow()->save_area[2].vsid;
- }
-
- /* We need to do this all in asm, so we're sure we don't touch
- * the stack between the slbia and rebolting it. */
- asm volatile("isync\n"
- "slbia\n"
- /* Slot 1 - first VMALLOC segment */
- "slbmte %0,%1\n"
- /* Slot 2 - kernel stack */
- "slbmte %2,%3\n"
- "isync"
- :: "r"(mk_vsid_data(VMALLOC_START, mmu_kernel_ssize, vflags)),
- "r"(mk_esid_data(VMALLOC_START, mmu_kernel_ssize, 1)),
- "r"(ksp_vsid_data),
- "r"(ksp_esid_data)
- : "memory");
-}
-
-void slb_flush_and_rebolt(void)
-{
-
- WARN_ON(!irqs_disabled());
-
- /*
- * We can't take a PMU exception in the following code, so hard
- * disable interrupts.
- */
- hard_irq_disable();
-
- __slb_flush_and_rebolt();
- get_paca()->slb_cache_ptr = 0;
-}
-
-void slb_vmalloc_update(void)
-{
- unsigned long vflags;
-
- vflags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmalloc_psize].sllp;
- slb_shadow_update(VMALLOC_START, mmu_kernel_ssize, vflags, 1);
- slb_flush_and_rebolt();
-}
-
-/* Helper function to compare esids. There are four cases to handle.
- * 1. The system is not 1T segment size capable. Use the GET_ESID compare.
- * 2. The system is 1T capable, both addresses are < 1T, use the GET_ESID compare.
- * 3. The system is 1T capable, only one of the two addresses is > 1T. This is not a match.
- * 4. The system is 1T capable, both addresses are > 1T, use the GET_ESID_1T macro to compare.
- */
-static inline int esids_match(unsigned long addr1, unsigned long addr2)
-{
- int esid_1t_count;
-
- /* System is not 1T segment size capable. */
- if (!mmu_has_feature(MMU_FTR_1T_SEGMENT))
- return (GET_ESID(addr1) == GET_ESID(addr2));
-
- esid_1t_count = (((addr1 >> SID_SHIFT_1T) != 0) +
- ((addr2 >> SID_SHIFT_1T) != 0));
-
- /* both addresses are < 1T */
- if (esid_1t_count == 0)
- return (GET_ESID(addr1) == GET_ESID(addr2));
-
- /* One address < 1T, the other > 1T. Not a match */
- if (esid_1t_count == 1)
- return 0;
-
- /* Both addresses are > 1T. */
- return (GET_ESID_1T(addr1) == GET_ESID_1T(addr2));
-}
-
-/* Flush all user entries from the segment table of the current processor. */
-void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
-{
- unsigned long offset;
- unsigned long slbie_data = 0;
- unsigned long pc = KSTK_EIP(tsk);
- unsigned long stack = KSTK_ESP(tsk);
- unsigned long exec_base;
-
- /*
- * We need interrupts hard-disabled here, not just soft-disabled,
- * so that a PMU interrupt can't occur, which might try to access
- * user memory (to get a stack trace) and possible cause an SLB miss
- * which would update the slb_cache/slb_cache_ptr fields in the PACA.
- */
- hard_irq_disable();
- offset = get_paca()->slb_cache_ptr;
- if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) &&
- offset <= SLB_CACHE_ENTRIES) {
- int i;
- asm volatile("isync" : : : "memory");
- for (i = 0; i < offset; i++) {
- slbie_data = (unsigned long)get_paca()->slb_cache[i]
- << SID_SHIFT; /* EA */
- slbie_data |= user_segment_size(slbie_data)
- << SLBIE_SSIZE_SHIFT;
- slbie_data |= SLBIE_C; /* C set for user addresses */
- asm volatile("slbie %0" : : "r" (slbie_data));
- }
- asm volatile("isync" : : : "memory");
- } else {
- __slb_flush_and_rebolt();
- }
-
- /* Workaround POWER5 < DD2.1 issue */
- if (offset == 1 || offset > SLB_CACHE_ENTRIES)
- asm volatile("slbie %0" : : "r" (slbie_data));
-
- get_paca()->slb_cache_ptr = 0;
- get_paca()->context = mm->context;
-
- /*
- * preload some userspace segments into the SLB.
- * Almost all 32 and 64bit PowerPC executables are linked at
- * 0x10000000 so it makes sense to preload this segment.
- */
- exec_base = 0x10000000;
-
- if (is_kernel_addr(pc) || is_kernel_addr(stack) ||
- is_kernel_addr(exec_base))
- return;
-
- slb_allocate(pc);
-
- if (!esids_match(pc, stack))
- slb_allocate(stack);
-
- if (!esids_match(pc, exec_base) &&
- !esids_match(stack, exec_base))
- slb_allocate(exec_base);
-}
-
-static inline void patch_slb_encoding(unsigned int *insn_addr,
- unsigned int immed)
-{
- int insn = (*insn_addr & 0xffff0000) | immed;
- patch_instruction(insn_addr, insn);
-}
-
-void slb_set_size(u16 size)
-{
- extern unsigned int *slb_compare_rr_to_size;
-
- if (mmu_slb_size == size)
- return;
-
- mmu_slb_size = size;
- patch_slb_encoding(slb_compare_rr_to_size, mmu_slb_size);
-}
-
-void slb_initialize(void)
-{
- unsigned long linear_llp, vmalloc_llp, io_llp;
- unsigned long lflags, vflags;
- static int slb_encoding_inited;
- extern unsigned int *slb_miss_kernel_load_linear;
- extern unsigned int *slb_miss_kernel_load_io;
- extern unsigned int *slb_compare_rr_to_size;
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
- extern unsigned int *slb_miss_kernel_load_vmemmap;
- unsigned long vmemmap_llp;
-#endif
-
- /* Prepare our SLB miss handler based on our page size */
- linear_llp = mmu_psize_defs[mmu_linear_psize].sllp;
- io_llp = mmu_psize_defs[mmu_io_psize].sllp;
- vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp;
- get_paca()->vmalloc_sllp = SLB_VSID_KERNEL | vmalloc_llp;
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
- vmemmap_llp = mmu_psize_defs[mmu_vmemmap_psize].sllp;
-#endif
- if (!slb_encoding_inited) {
- slb_encoding_inited = 1;
- patch_slb_encoding(slb_miss_kernel_load_linear,
- SLB_VSID_KERNEL | linear_llp);
- patch_slb_encoding(slb_miss_kernel_load_io,
- SLB_VSID_KERNEL | io_llp);
- patch_slb_encoding(slb_compare_rr_to_size,
- mmu_slb_size);
-
- pr_devel("SLB: linear LLP = %04lx\n", linear_llp);
- pr_devel("SLB: io LLP = %04lx\n", io_llp);
-
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
- patch_slb_encoding(slb_miss_kernel_load_vmemmap,
- SLB_VSID_KERNEL | vmemmap_llp);
- pr_devel("SLB: vmemmap LLP = %04lx\n", vmemmap_llp);
-#endif
- }
-
- get_paca()->stab_rr = SLB_NUM_BOLTED;
-
- lflags = SLB_VSID_KERNEL | linear_llp;
- vflags = SLB_VSID_KERNEL | vmalloc_llp;
-
- /* Invalidate the entire SLB (even slot 0) & all the ERATS */
- asm volatile("isync":::"memory");
- asm volatile("slbmte %0,%0"::"r" (0) : "memory");
- asm volatile("isync; slbia; isync":::"memory");
- create_shadowed_slbe(PAGE_OFFSET, mmu_kernel_ssize, lflags, 0);
-
- create_shadowed_slbe(VMALLOC_START, mmu_kernel_ssize, vflags, 1);
-
- /* For the boot cpu, we're running on the stack in init_thread_union,
- * which is in the first segment of the linear mapping, and also
- * get_paca()->kstack hasn't been initialized yet.
- * For secondary cpus, we need to bolt the kernel stack entry now.
- */
- slb_shadow_clear(2);
- if (raw_smp_processor_id() != boot_cpuid &&
- (get_paca()->kstack & slb_esid_mask(mmu_kernel_ssize)) > PAGE_OFFSET)
- create_shadowed_slbe(get_paca()->kstack,
- mmu_kernel_ssize, lflags, 2);
-
- asm volatile("isync":::"memory");
-}
diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S
deleted file mode 100644
index 17aa6dfceb34..000000000000
--- a/arch/powerpc/mm/slb_low.S
+++ /dev/null
@@ -1,317 +0,0 @@
-/*
- * Low-level SLB routines
- *
- * Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM
- *
- * Based on earlier C version:
- * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
- * Copyright (c) 2001 Dave Engebretsen
- * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <asm/processor.h>
-#include <asm/ppc_asm.h>
-#include <asm/asm-offsets.h>
-#include <asm/cputable.h>
-#include <asm/page.h>
-#include <asm/mmu.h>
-#include <asm/pgtable.h>
-#include <asm/firmware.h>
-
-/* void slb_allocate_realmode(unsigned long ea);
- *
- * Create an SLB entry for the given EA (user or kernel).
- * r3 = faulting address, r13 = PACA
- * r9, r10, r11 are clobbered by this function
- * No other registers are examined or changed.
- */
-_GLOBAL(slb_allocate_realmode)
- /*
- * check for bad kernel/user address
- * (ea & ~REGION_MASK) >= PGTABLE_RANGE
- */
- rldicr. r9,r3,4,(63 - 46 - 4)
- bne- 8f
-
- srdi r9,r3,60 /* get region */
- srdi r10,r3,SID_SHIFT /* get esid */
- cmpldi cr7,r9,0xc /* cmp PAGE_OFFSET for later use */
-
- /* r3 = address, r10 = esid, cr7 = <> PAGE_OFFSET */
- blt cr7,0f /* user or kernel? */
-
- /* kernel address: proto-VSID = ESID */
- /* WARNING - MAGIC: we don't use the VSID 0xfffffffff, but
- * this code will generate the protoVSID 0xfffffffff for the
- * top segment. That's ok, the scramble below will translate
- * it to VSID 0, which is reserved as a bad VSID - one which
- * will never have any pages in it. */
-
- /* Check if hitting the linear mapping or some other kernel space
- */
- bne cr7,1f
-
- /* Linear mapping encoding bits, the "li" instruction below will
- * be patched by the kernel at boot
- */
-_GLOBAL(slb_miss_kernel_load_linear)
- li r11,0
- /*
- * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1
- * r9 = region id.
- */
- addis r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@ha
- addi r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@l
-
-
-BEGIN_FTR_SECTION
- b slb_finish_load
-END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
- b slb_finish_load_1T
-
-1:
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
- /* Check virtual memmap region. To be patches at kernel boot */
- cmpldi cr0,r9,0xf
- bne 1f
-_GLOBAL(slb_miss_kernel_load_vmemmap)
- li r11,0
- b 6f
-1:
-#endif /* CONFIG_SPARSEMEM_VMEMMAP */
-
- /* vmalloc mapping gets the encoding from the PACA as the mapping
- * can be demoted from 64K -> 4K dynamically on some machines
- */
- clrldi r11,r10,48
- cmpldi r11,(VMALLOC_SIZE >> 28) - 1
- bgt 5f
- lhz r11,PACAVMALLOCSLLP(r13)
- b 6f
-5:
- /* IO mapping */
- _GLOBAL(slb_miss_kernel_load_io)
- li r11,0
-6:
- /*
- * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1
- * r9 = region id.
- */
- addis r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@ha
- addi r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@l
-
-BEGIN_FTR_SECTION
- b slb_finish_load
-END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
- b slb_finish_load_1T
-
-0:
- /* when using slices, we extract the psize off the slice bitmaps
- * and then we need to get the sllp encoding off the mmu_psize_defs
- * array.
- *
- * XXX This is a bit inefficient especially for the normal case,
- * so we should try to implement a fast path for the standard page
- * size using the old sllp value so we avoid the array. We cannot
- * really do dynamic patching unfortunately as processes might flip
- * between 4k and 64k standard page size
- */
-#ifdef CONFIG_PPC_MM_SLICES
- /* r10 have esid */
- cmpldi r10,16
- /* below SLICE_LOW_TOP */
- blt 5f
- /*
- * Handle hpsizes,
- * r9 is get_paca()->context.high_slices_psize[index], r11 is mask_index
- */
- srdi r11,r10,(SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT + 1) /* index */
- addi r9,r11,PACAHIGHSLICEPSIZE
- lbzx r9,r13,r9 /* r9 is hpsizes[r11] */
- /* r11 = (r10 >> (SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT)) & 0x1 */
- rldicl r11,r10,(64 - (SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT)),63
- b 6f
-
-5:
- /*
- * Handle lpsizes
- * r9 is get_paca()->context.low_slices_psize, r11 is index
- */
- ld r9,PACALOWSLICESPSIZE(r13)
- mr r11,r10
-6:
- sldi r11,r11,2 /* index * 4 */
- /* Extract the psize and multiply to get an array offset */
- srd r9,r9,r11
- andi. r9,r9,0xf
- mulli r9,r9,MMUPSIZEDEFSIZE
-
- /* Now get to the array and obtain the sllp
- */
- ld r11,PACATOC(r13)
- ld r11,mmu_psize_defs@got(r11)
- add r11,r11,r9
- ld r11,MMUPSIZESLLP(r11)
- ori r11,r11,SLB_VSID_USER
-#else
- /* paca context sllp already contains the SLB_VSID_USER bits */
- lhz r11,PACACONTEXTSLLP(r13)
-#endif /* CONFIG_PPC_MM_SLICES */
-
- ld r9,PACACONTEXTID(r13)
-BEGIN_FTR_SECTION
- cmpldi r10,0x1000
- bge slb_finish_load_1T
-END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
- b slb_finish_load
-
-8: /* invalid EA */
- li r10,0 /* BAD_VSID */
- li r9,0 /* BAD_VSID */
- li r11,SLB_VSID_USER /* flags don't much matter */
- b slb_finish_load
-
-#ifdef __DISABLED__
-
-/* void slb_allocate_user(unsigned long ea);
- *
- * Create an SLB entry for the given EA (user or kernel).
- * r3 = faulting address, r13 = PACA
- * r9, r10, r11 are clobbered by this function
- * No other registers are examined or changed.
- *
- * It is called with translation enabled in order to be able to walk the
- * page tables. This is not currently used.
- */
-_GLOBAL(slb_allocate_user)
- /* r3 = faulting address */
- srdi r10,r3,28 /* get esid */
-
- crset 4*cr7+lt /* set "user" flag for later */
-
- /* check if we fit in the range covered by the pagetables*/
- srdi. r9,r3,PGTABLE_EADDR_SIZE
- crnot 4*cr0+eq,4*cr0+eq
- beqlr
-
- /* now we need to get to the page tables in order to get the page
- * size encoding from the PMD. In the future, we'll be able to deal
- * with 1T segments too by getting the encoding from the PGD instead
- */
- ld r9,PACAPGDIR(r13)
- cmpldi cr0,r9,0
- beqlr
- rlwinm r11,r10,8,25,28
- ldx r9,r9,r11 /* get pgd_t */
- cmpldi cr0,r9,0
- beqlr
- rlwinm r11,r10,3,17,28
- ldx r9,r9,r11 /* get pmd_t */
- cmpldi cr0,r9,0
- beqlr
-
- /* build vsid flags */
- andi. r11,r9,SLB_VSID_LLP
- ori r11,r11,SLB_VSID_USER
-
- /* get context to calculate proto-VSID */
- ld r9,PACACONTEXTID(r13)
- /* fall through slb_finish_load */
-
-#endif /* __DISABLED__ */
-
-
-/*
- * Finish loading of an SLB entry and return
- *
- * r3 = EA, r9 = context, r10 = ESID, r11 = flags, clobbers r9, cr7 = <> PAGE_OFFSET
- */
-slb_finish_load:
- rldimi r10,r9,ESID_BITS,0
- ASM_VSID_SCRAMBLE(r10,r9,256M)
- /*
- * bits above VSID_BITS_256M need to be ignored from r10
- * also combine VSID and flags
- */
- rldimi r11,r10,SLB_VSID_SHIFT,(64 - (SLB_VSID_SHIFT + VSID_BITS_256M))
-
- /* r3 = EA, r11 = VSID data */
- /*
- * Find a slot, round robin. Previously we tried to find a
- * free slot first but that took too long. Unfortunately we
- * dont have any LRU information to help us choose a slot.
- */
-
-7: ld r10,PACASTABRR(r13)
- addi r10,r10,1
- /* This gets soft patched on boot. */
-_GLOBAL(slb_compare_rr_to_size)
- cmpldi r10,0
-
- blt+ 4f
- li r10,SLB_NUM_BOLTED
-
-4:
- std r10,PACASTABRR(r13)
-
-3:
- rldimi r3,r10,0,36 /* r3= EA[0:35] | entry */
- oris r10,r3,SLB_ESID_V@h /* r3 |= SLB_ESID_V */
-
- /* r3 = ESID data, r11 = VSID data */
-
- /*
- * No need for an isync before or after this slbmte. The exception
- * we enter with and the rfid we exit with are context synchronizing.
- */
- slbmte r11,r10
-
- /* we're done for kernel addresses */
- crclr 4*cr0+eq /* set result to "success" */
- bgelr cr7
-
- /* Update the slb cache */
- lhz r3,PACASLBCACHEPTR(r13) /* offset = paca->slb_cache_ptr */
- cmpldi r3,SLB_CACHE_ENTRIES
- bge 1f
-
- /* still room in the slb cache */
- sldi r11,r3,2 /* r11 = offset * sizeof(u32) */
- srdi r10,r10,28 /* get the 36 bits of the ESID */
- add r11,r11,r13 /* r11 = (u32 *)paca + offset */
- stw r10,PACASLBCACHE(r11) /* paca->slb_cache[offset] = esid */
- addi r3,r3,1 /* offset++ */
- b 2f
-1: /* offset >= SLB_CACHE_ENTRIES */
- li r3,SLB_CACHE_ENTRIES+1
-2:
- sth r3,PACASLBCACHEPTR(r13) /* paca->slb_cache_ptr = offset */
- crclr 4*cr0+eq /* set result to "success" */
- blr
-
-/*
- * Finish loading of a 1T SLB entry (for the kernel linear mapping) and return.
- *
- * r3 = EA, r9 = context, r10 = ESID(256MB), r11 = flags, clobbers r9
- */
-slb_finish_load_1T:
- srdi r10,r10,(SID_SHIFT_1T - SID_SHIFT) /* get 1T ESID */
- rldimi r10,r9,ESID_BITS_1T,0
- ASM_VSID_SCRAMBLE(r10,r9,1T)
- /*
- * bits above VSID_BITS_1T need to be ignored from r10
- * also combine VSID and flags
- */
- rldimi r11,r10,SLB_VSID_SHIFT_1T,(64 - (SLB_VSID_SHIFT_1T + VSID_BITS_1T))
- li r10,MMU_SEGSIZE_1T
- rldimi r11,r10,SLB_VSID_SSIZE_SHIFT,0 /* insert segment size */
-
- /* r3 = EA, r11 = VSID data */
- clrrdi r3,r3,SID_SHIFT_1T /* clear out non-ESID bits */
- b 7b
-
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
deleted file mode 100644
index 3e99c149271a..000000000000
--- a/arch/powerpc/mm/slice.c
+++ /dev/null
@@ -1,731 +0,0 @@
-/*
- * address space "slices" (meta-segments) support
- *
- * Copyright (C) 2007 Benjamin Herrenschmidt, IBM Corporation.
- *
- * Based on hugetlb implementation
- *
- * Copyright (C) 2003 David Gibson, IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#undef DEBUG
-
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/err.h>
-#include <linux/spinlock.h>
-#include <linux/export.h>
-#include <asm/mman.h>
-#include <asm/mmu.h>
-#include <asm/spu.h>
-
-/* some sanity checks */
-#if (PGTABLE_RANGE >> 43) > SLICE_MASK_SIZE
-#error PGTABLE_RANGE exceeds slice_mask high_slices size
-#endif
-
-static DEFINE_SPINLOCK(slice_convert_lock);
-
-
-#ifdef DEBUG
-int _slice_debug = 1;
-
-static void slice_print_mask(const char *label, struct slice_mask mask)
-{
- char *p, buf[16 + 3 + 64 + 1];
- int i;
-
- if (!_slice_debug)
- return;
- p = buf;
- for (i = 0; i < SLICE_NUM_LOW; i++)
- *(p++) = (mask.low_slices & (1 << i)) ? '1' : '0';
- *(p++) = ' ';
- *(p++) = '-';
- *(p++) = ' ';
- for (i = 0; i < SLICE_NUM_HIGH; i++)
- *(p++) = (mask.high_slices & (1ul << i)) ? '1' : '0';
- *(p++) = 0;
-
- printk(KERN_DEBUG "%s:%s\n", label, buf);
-}
-
-#define slice_dbg(fmt...) do { if (_slice_debug) pr_debug(fmt); } while(0)
-
-#else
-
-static void slice_print_mask(const char *label, struct slice_mask mask) {}
-#define slice_dbg(fmt...)
-
-#endif
-
-static struct slice_mask slice_range_to_mask(unsigned long start,
- unsigned long len)
-{
- unsigned long end = start + len - 1;
- struct slice_mask ret = { 0, 0 };
-
- if (start < SLICE_LOW_TOP) {
- unsigned long mend = min(end, SLICE_LOW_TOP);
- unsigned long mstart = min(start, SLICE_LOW_TOP);
-
- ret.low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1))
- - (1u << GET_LOW_SLICE_INDEX(mstart));
- }
-
- if ((start + len) > SLICE_LOW_TOP)
- ret.high_slices = (1ul << (GET_HIGH_SLICE_INDEX(end) + 1))
- - (1ul << GET_HIGH_SLICE_INDEX(start));
-
- return ret;
-}
-
-static int slice_area_is_free(struct mm_struct *mm, unsigned long addr,
- unsigned long len)
-{
- struct vm_area_struct *vma;
-
- if ((mm->task_size - len) < addr)
- return 0;
- vma = find_vma(mm, addr);
- return (!vma || (addr + len) <= vma->vm_start);
-}
-
-static int slice_low_has_vma(struct mm_struct *mm, unsigned long slice)
-{
- return !slice_area_is_free(mm, slice << SLICE_LOW_SHIFT,
- 1ul << SLICE_LOW_SHIFT);
-}
-
-static int slice_high_has_vma(struct mm_struct *mm, unsigned long slice)
-{
- unsigned long start = slice << SLICE_HIGH_SHIFT;
- unsigned long end = start + (1ul << SLICE_HIGH_SHIFT);
-
- /* Hack, so that each addresses is controlled by exactly one
- * of the high or low area bitmaps, the first high area starts
- * at 4GB, not 0 */
- if (start == 0)
- start = SLICE_LOW_TOP;
-
- return !slice_area_is_free(mm, start, end - start);
-}
-
-static struct slice_mask slice_mask_for_free(struct mm_struct *mm)
-{
- struct slice_mask ret = { 0, 0 };
- unsigned long i;
-
- for (i = 0; i < SLICE_NUM_LOW; i++)
- if (!slice_low_has_vma(mm, i))
- ret.low_slices |= 1u << i;
-
- if (mm->task_size <= SLICE_LOW_TOP)
- return ret;
-
- for (i = 0; i < SLICE_NUM_HIGH; i++)
- if (!slice_high_has_vma(mm, i))
- ret.high_slices |= 1ul << i;
-
- return ret;
-}
-
-static struct slice_mask slice_mask_for_size(struct mm_struct *mm, int psize)
-{
- unsigned char *hpsizes;
- int index, mask_index;
- struct slice_mask ret = { 0, 0 };
- unsigned long i;
- u64 lpsizes;
-
- lpsizes = mm->context.low_slices_psize;
- for (i = 0; i < SLICE_NUM_LOW; i++)
- if (((lpsizes >> (i * 4)) & 0xf) == psize)
- ret.low_slices |= 1u << i;
-
- hpsizes = mm->context.high_slices_psize;
- for (i = 0; i < SLICE_NUM_HIGH; i++) {
- mask_index = i & 0x1;
- index = i >> 1;
- if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == psize)
- ret.high_slices |= 1ul << i;
- }
-
- return ret;
-}
-
-static int slice_check_fit(struct slice_mask mask, struct slice_mask available)
-{
- return (mask.low_slices & available.low_slices) == mask.low_slices &&
- (mask.high_slices & available.high_slices) == mask.high_slices;
-}
-
-static void slice_flush_segments(void *parm)
-{
- struct mm_struct *mm = parm;
- unsigned long flags;
-
- if (mm != current->active_mm)
- return;
-
- /* update the paca copy of the context struct */
- get_paca()->context = current->active_mm->context;
-
- local_irq_save(flags);
- slb_flush_and_rebolt();
- local_irq_restore(flags);
-}
-
-static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psize)
-{
- int index, mask_index;
- /* Write the new slice psize bits */
- unsigned char *hpsizes;
- u64 lpsizes;
- unsigned long i, flags;
-
- slice_dbg("slice_convert(mm=%p, psize=%d)\n", mm, psize);
- slice_print_mask(" mask", mask);
-
- /* We need to use a spinlock here to protect against
- * concurrent 64k -> 4k demotion ...
- */
- spin_lock_irqsave(&slice_convert_lock, flags);
-
- lpsizes = mm->context.low_slices_psize;
- for (i = 0; i < SLICE_NUM_LOW; i++)
- if (mask.low_slices & (1u << i))
- lpsizes = (lpsizes & ~(0xful << (i * 4))) |
- (((unsigned long)psize) << (i * 4));
-
- /* Assign the value back */
- mm->context.low_slices_psize = lpsizes;
-
- hpsizes = mm->context.high_slices_psize;
- for (i = 0; i < SLICE_NUM_HIGH; i++) {
- mask_index = i & 0x1;
- index = i >> 1;
- if (mask.high_slices & (1ul << i))
- hpsizes[index] = (hpsizes[index] &
- ~(0xf << (mask_index * 4))) |
- (((unsigned long)psize) << (mask_index * 4));
- }
-
- slice_dbg(" lsps=%lx, hsps=%lx\n",
- mm->context.low_slices_psize,
- mm->context.high_slices_psize);
-
- spin_unlock_irqrestore(&slice_convert_lock, flags);
-
-#ifdef CONFIG_SPU_BASE
- spu_flush_all_slbs(mm);
-#endif
-}
-
-/*
- * Compute which slice addr is part of;
- * set *boundary_addr to the start or end boundary of that slice
- * (depending on 'end' parameter);
- * return boolean indicating if the slice is marked as available in the
- * 'available' slice_mark.
- */
-static bool slice_scan_available(unsigned long addr,
- struct slice_mask available,
- int end,
- unsigned long *boundary_addr)
-{
- unsigned long slice;
- if (addr < SLICE_LOW_TOP) {
- slice = GET_LOW_SLICE_INDEX(addr);
- *boundary_addr = (slice + end) << SLICE_LOW_SHIFT;
- return !!(available.low_slices & (1u << slice));
- } else {
- slice = GET_HIGH_SLICE_INDEX(addr);
- *boundary_addr = (slice + end) ?
- ((slice + end) << SLICE_HIGH_SHIFT) : SLICE_LOW_TOP;
- return !!(available.high_slices & (1u << slice));
- }
-}
-
-static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
- unsigned long len,
- struct slice_mask available,
- int psize)
-{
- int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
- unsigned long addr, found, next_end;
- struct vm_unmapped_area_info info;
-
- info.flags = 0;
- info.length = len;
- info.align_mask = PAGE_MASK & ((1ul << pshift) - 1);
- info.align_offset = 0;
-
- addr = TASK_UNMAPPED_BASE;
- while (addr < TASK_SIZE) {
- info.low_limit = addr;
- if (!slice_scan_available(addr, available, 1, &addr))
- continue;
-
- next_slice:
- /*
- * At this point [info.low_limit; addr) covers
- * available slices only and ends at a slice boundary.
- * Check if we need to reduce the range, or if we can
- * extend it to cover the next available slice.
- */
- if (addr >= TASK_SIZE)
- addr = TASK_SIZE;
- else if (slice_scan_available(addr, available, 1, &next_end)) {
- addr = next_end;
- goto next_slice;
- }
- info.high_limit = addr;
-
- found = vm_unmapped_area(&info);
- if (!(found & ~PAGE_MASK))
- return found;
- }
-
- return -ENOMEM;
-}
-
-static unsigned long slice_find_area_topdown(struct mm_struct *mm,
- unsigned long len,
- struct slice_mask available,
- int psize)
-{
- int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
- unsigned long addr, found, prev;
- struct vm_unmapped_area_info info;
-
- info.flags = VM_UNMAPPED_AREA_TOPDOWN;
- info.length = len;
- info.align_mask = PAGE_MASK & ((1ul << pshift) - 1);
- info.align_offset = 0;
-
- addr = mm->mmap_base;
- while (addr > PAGE_SIZE) {
- info.high_limit = addr;
- if (!slice_scan_available(addr - 1, available, 0, &addr))
- continue;
-
- prev_slice:
- /*
- * At this point [addr; info.high_limit) covers
- * available slices only and starts at a slice boundary.
- * Check if we need to reduce the range, or if we can
- * extend it to cover the previous available slice.
- */
- if (addr < PAGE_SIZE)
- addr = PAGE_SIZE;
- else if (slice_scan_available(addr - 1, available, 0, &prev)) {
- addr = prev;
- goto prev_slice;
- }
- info.low_limit = addr;
-
- found = vm_unmapped_area(&info);
- if (!(found & ~PAGE_MASK))
- return found;
- }
-
- /*
- * A failed mmap() very likely causes application failure,
- * so fall back to the bottom-up function here. This scenario
- * can happen with large stack limits and large mmap()
- * allocations.
- */
- return slice_find_area_bottomup(mm, len, available, psize);
-}
-
-
-static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len,
- struct slice_mask mask, int psize,
- int topdown)
-{
- if (topdown)
- return slice_find_area_topdown(mm, len, mask, psize);
- else
- return slice_find_area_bottomup(mm, len, mask, psize);
-}
-
-#define or_mask(dst, src) do { \
- (dst).low_slices |= (src).low_slices; \
- (dst).high_slices |= (src).high_slices; \
-} while (0)
-
-#define andnot_mask(dst, src) do { \
- (dst).low_slices &= ~(src).low_slices; \
- (dst).high_slices &= ~(src).high_slices; \
-} while (0)
-
-#ifdef CONFIG_PPC_64K_PAGES
-#define MMU_PAGE_BASE MMU_PAGE_64K
-#else
-#define MMU_PAGE_BASE MMU_PAGE_4K
-#endif
-
-unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
- unsigned long flags, unsigned int psize,
- int topdown)
-{
- struct slice_mask mask = {0, 0};
- struct slice_mask good_mask;
- struct slice_mask potential_mask = {0,0} /* silence stupid warning */;
- struct slice_mask compat_mask = {0, 0};
- int fixed = (flags & MAP_FIXED);
- int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
- struct mm_struct *mm = current->mm;
- unsigned long newaddr;
-
- /* Sanity checks */
- BUG_ON(mm->task_size == 0);
-
- slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize);
- slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d\n",
- addr, len, flags, topdown);
-
- if (len > mm->task_size)
- return -ENOMEM;
- if (len & ((1ul << pshift) - 1))
- return -EINVAL;
- if (fixed && (addr & ((1ul << pshift) - 1)))
- return -EINVAL;
- if (fixed && addr > (mm->task_size - len))
- return -EINVAL;
-
- /* If hint, make sure it matches our alignment restrictions */
- if (!fixed && addr) {
- addr = _ALIGN_UP(addr, 1ul << pshift);
- slice_dbg(" aligned addr=%lx\n", addr);
- /* Ignore hint if it's too large or overlaps a VMA */
- if (addr > mm->task_size - len ||
- !slice_area_is_free(mm, addr, len))
- addr = 0;
- }
-
- /* First make up a "good" mask of slices that have the right size
- * already
- */
- good_mask = slice_mask_for_size(mm, psize);
- slice_print_mask(" good_mask", good_mask);
-
- /*
- * Here "good" means slices that are already the right page size,
- * "compat" means slices that have a compatible page size (i.e.
- * 4k in a 64k pagesize kernel), and "free" means slices without
- * any VMAs.
- *
- * If MAP_FIXED:
- * check if fits in good | compat => OK
- * check if fits in good | compat | free => convert free
- * else bad
- * If have hint:
- * check if hint fits in good => OK
- * check if hint fits in good | free => convert free
- * Otherwise:
- * search in good, found => OK
- * search in good | free, found => convert free
- * search in good | compat | free, found => convert free.
- */
-
-#ifdef CONFIG_PPC_64K_PAGES
- /* If we support combo pages, we can allow 64k pages in 4k slices */
- if (psize == MMU_PAGE_64K) {
- compat_mask = slice_mask_for_size(mm, MMU_PAGE_4K);
- if (fixed)
- or_mask(good_mask, compat_mask);
- }
-#endif
-
- /* First check hint if it's valid or if we have MAP_FIXED */
- if (addr != 0 || fixed) {
- /* Build a mask for the requested range */
- mask = slice_range_to_mask(addr, len);
- slice_print_mask(" mask", mask);
-
- /* Check if we fit in the good mask. If we do, we just return,
- * nothing else to do
- */
- if (slice_check_fit(mask, good_mask)) {
- slice_dbg(" fits good !\n");
- return addr;
- }
- } else {
- /* Now let's see if we can find something in the existing
- * slices for that size
- */
- newaddr = slice_find_area(mm, len, good_mask, psize, topdown);
- if (newaddr != -ENOMEM) {
- /* Found within the good mask, we don't have to setup,
- * we thus return directly
- */
- slice_dbg(" found area at 0x%lx\n", newaddr);
- return newaddr;
- }
- }
-
- /* We don't fit in the good mask, check what other slices are
- * empty and thus can be converted
- */
- potential_mask = slice_mask_for_free(mm);
- or_mask(potential_mask, good_mask);
- slice_print_mask(" potential", potential_mask);
-
- if ((addr != 0 || fixed) && slice_check_fit(mask, potential_mask)) {
- slice_dbg(" fits potential !\n");
- goto convert;
- }
-
- /* If we have MAP_FIXED and failed the above steps, then error out */
- if (fixed)
- return -EBUSY;
-
- slice_dbg(" search...\n");
-
- /* If we had a hint that didn't work out, see if we can fit
- * anywhere in the good area.
- */
- if (addr) {
- addr = slice_find_area(mm, len, good_mask, psize, topdown);
- if (addr != -ENOMEM) {
- slice_dbg(" found area at 0x%lx\n", addr);
- return addr;
- }
- }
-
- /* Now let's see if we can find something in the existing slices
- * for that size plus free slices
- */
- addr = slice_find_area(mm, len, potential_mask, psize, topdown);
-
-#ifdef CONFIG_PPC_64K_PAGES
- if (addr == -ENOMEM && psize == MMU_PAGE_64K) {
- /* retry the search with 4k-page slices included */
- or_mask(potential_mask, compat_mask);
- addr = slice_find_area(mm, len, potential_mask, psize,
- topdown);
- }
-#endif
-
- if (addr == -ENOMEM)
- return -ENOMEM;
-
- mask = slice_range_to_mask(addr, len);
- slice_dbg(" found potential area at 0x%lx\n", addr);
- slice_print_mask(" mask", mask);
-
- convert:
- andnot_mask(mask, good_mask);
- andnot_mask(mask, compat_mask);
- if (mask.low_slices || mask.high_slices) {
- slice_convert(mm, mask, psize);
- if (psize > MMU_PAGE_BASE)
- on_each_cpu(slice_flush_segments, mm, 1);
- }
- return addr;
-
-}
-EXPORT_SYMBOL_GPL(slice_get_unmapped_area);
-
-unsigned long arch_get_unmapped_area(struct file *filp,
- unsigned long addr,
- unsigned long len,
- unsigned long pgoff,
- unsigned long flags)
-{
- return slice_get_unmapped_area(addr, len, flags,
- current->mm->context.user_psize, 0);
-}
-
-unsigned long arch_get_unmapped_area_topdown(struct file *filp,
- const unsigned long addr0,
- const unsigned long len,
- const unsigned long pgoff,
- const unsigned long flags)
-{
- return slice_get_unmapped_area(addr0, len, flags,
- current->mm->context.user_psize, 1);
-}
-
-unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr)
-{
- unsigned char *hpsizes;
- int index, mask_index;
-
- if (addr < SLICE_LOW_TOP) {
- u64 lpsizes;
- lpsizes = mm->context.low_slices_psize;
- index = GET_LOW_SLICE_INDEX(addr);
- return (lpsizes >> (index * 4)) & 0xf;
- }
- hpsizes = mm->context.high_slices_psize;
- index = GET_HIGH_SLICE_INDEX(addr);
- mask_index = index & 0x1;
- return (hpsizes[index >> 1] >> (mask_index * 4)) & 0xf;
-}
-EXPORT_SYMBOL_GPL(get_slice_psize);
-
-/*
- * This is called by hash_page when it needs to do a lazy conversion of
- * an address space from real 64K pages to combo 4K pages (typically
- * when hitting a non cacheable mapping on a processor or hypervisor
- * that won't allow them for 64K pages).
- *
- * This is also called in init_new_context() to change back the user
- * psize from whatever the parent context had it set to
- * N.B. This may be called before mm->context.id has been set.
- *
- * This function will only change the content of the {low,high)_slice_psize
- * masks, it will not flush SLBs as this shall be handled lazily by the
- * caller.
- */
-void slice_set_user_psize(struct mm_struct *mm, unsigned int psize)
-{
- int index, mask_index;
- unsigned char *hpsizes;
- unsigned long flags, lpsizes;
- unsigned int old_psize;
- int i;
-
- slice_dbg("slice_set_user_psize(mm=%p, psize=%d)\n", mm, psize);
-
- spin_lock_irqsave(&slice_convert_lock, flags);
-
- old_psize = mm->context.user_psize;
- slice_dbg(" old_psize=%d\n", old_psize);
- if (old_psize == psize)
- goto bail;
-
- mm->context.user_psize = psize;
- wmb();
-
- lpsizes = mm->context.low_slices_psize;
- for (i = 0; i < SLICE_NUM_LOW; i++)
- if (((lpsizes >> (i * 4)) & 0xf) == old_psize)
- lpsizes = (lpsizes & ~(0xful << (i * 4))) |
- (((unsigned long)psize) << (i * 4));
- /* Assign the value back */
- mm->context.low_slices_psize = lpsizes;
-
- hpsizes = mm->context.high_slices_psize;
- for (i = 0; i < SLICE_NUM_HIGH; i++) {
- mask_index = i & 0x1;
- index = i >> 1;
- if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == old_psize)
- hpsizes[index] = (hpsizes[index] &
- ~(0xf << (mask_index * 4))) |
- (((unsigned long)psize) << (mask_index * 4));
- }
-
-
-
-
- slice_dbg(" lsps=%lx, hsps=%lx\n",
- mm->context.low_slices_psize,
- mm->context.high_slices_psize);
-
- bail:
- spin_unlock_irqrestore(&slice_convert_lock, flags);
-}
-
-void slice_set_psize(struct mm_struct *mm, unsigned long address,
- unsigned int psize)
-{
- unsigned char *hpsizes;
- unsigned long i, flags;
- u64 *lpsizes;
-
- spin_lock_irqsave(&slice_convert_lock, flags);
- if (address < SLICE_LOW_TOP) {
- i = GET_LOW_SLICE_INDEX(address);
- lpsizes = &mm->context.low_slices_psize;
- *lpsizes = (*lpsizes & ~(0xful << (i * 4))) |
- ((unsigned long) psize << (i * 4));
- } else {
- int index, mask_index;
- i = GET_HIGH_SLICE_INDEX(address);
- hpsizes = mm->context.high_slices_psize;
- mask_index = i & 0x1;
- index = i >> 1;
- hpsizes[index] = (hpsizes[index] &
- ~(0xf << (mask_index * 4))) |
- (((unsigned long)psize) << (mask_index * 4));
- }
-
- spin_unlock_irqrestore(&slice_convert_lock, flags);
-
-#ifdef CONFIG_SPU_BASE
- spu_flush_all_slbs(mm);
-#endif
-}
-
-void slice_set_range_psize(struct mm_struct *mm, unsigned long start,
- unsigned long len, unsigned int psize)
-{
- struct slice_mask mask = slice_range_to_mask(start, len);
-
- slice_convert(mm, mask, psize);
-}
-
-/*
- * is_hugepage_only_range() is used by generic code to verify whether
- * a normal mmap mapping (non hugetlbfs) is valid on a given area.
- *
- * until the generic code provides a more generic hook and/or starts
- * calling arch get_unmapped_area for MAP_FIXED (which our implementation
- * here knows how to deal with), we hijack it to keep standard mappings
- * away from us.
- *
- * because of that generic code limitation, MAP_FIXED mapping cannot
- * "convert" back a slice with no VMAs to the standard page size, only
- * get_unmapped_area() can. It would be possible to fix it here but I
- * prefer working on fixing the generic code instead.
- *
- * WARNING: This will not work if hugetlbfs isn't enabled since the
- * generic code will redefine that function as 0 in that. This is ok
- * for now as we only use slices with hugetlbfs enabled. This should
- * be fixed as the generic code gets fixed.
- */
-int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
- unsigned long len)
-{
- struct slice_mask mask, available;
- unsigned int psize = mm->context.user_psize;
-
- mask = slice_range_to_mask(addr, len);
- available = slice_mask_for_size(mm, psize);
-#ifdef CONFIG_PPC_64K_PAGES
- /* We need to account for 4k slices too */
- if (psize == MMU_PAGE_64K) {
- struct slice_mask compat_mask;
- compat_mask = slice_mask_for_size(mm, MMU_PAGE_4K);
- or_mask(available, compat_mask);
- }
-#endif
-
-#if 0 /* too verbose */
- slice_dbg("is_hugepage_only_range(mm=%p, addr=%lx, len=%lx)\n",
- mm, addr, len);
- slice_print_mask(" mask", mask);
- slice_print_mask(" available", available);
-#endif
- return !slice_check_fit(mask, available);
-}
-
diff --git a/arch/powerpc/mm/stab.c b/arch/powerpc/mm/stab.c
deleted file mode 100644
index 3f8efa6f2997..000000000000
--- a/arch/powerpc/mm/stab.c
+++ /dev/null
@@ -1,286 +0,0 @@
-/*
- * PowerPC64 Segment Translation Support.
- *
- * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
- * Copyright (c) 2001 Dave Engebretsen
- *
- * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/memblock.h>
-
-#include <asm/pgtable.h>
-#include <asm/mmu.h>
-#include <asm/mmu_context.h>
-#include <asm/paca.h>
-#include <asm/cputable.h>
-#include <asm/prom.h>
-
-struct stab_entry {
- unsigned long esid_data;
- unsigned long vsid_data;
-};
-
-#define NR_STAB_CACHE_ENTRIES 8
-static DEFINE_PER_CPU(long, stab_cache_ptr);
-static DEFINE_PER_CPU(long [NR_STAB_CACHE_ENTRIES], stab_cache);
-
-/*
- * Create a segment table entry for the given esid/vsid pair.
- */
-static int make_ste(unsigned long stab, unsigned long esid, unsigned long vsid)
-{
- unsigned long esid_data, vsid_data;
- unsigned long entry, group, old_esid, castout_entry, i;
- unsigned int global_entry;
- struct stab_entry *ste, *castout_ste;
- unsigned long kernel_segment = (esid << SID_SHIFT) >= PAGE_OFFSET;
-
- vsid_data = vsid << STE_VSID_SHIFT;
- esid_data = esid << SID_SHIFT | STE_ESID_KP | STE_ESID_V;
- if (! kernel_segment)
- esid_data |= STE_ESID_KS;
-
- /* Search the primary group first. */
- global_entry = (esid & 0x1f) << 3;
- ste = (struct stab_entry *)(stab | ((esid & 0x1f) << 7));
-
- /* Find an empty entry, if one exists. */
- for (group = 0; group < 2; group++) {
- for (entry = 0; entry < 8; entry++, ste++) {
- if (!(ste->esid_data & STE_ESID_V)) {
- ste->vsid_data = vsid_data;
- eieio();
- ste->esid_data = esid_data;
- return (global_entry | entry);
- }
- }
- /* Now search the secondary group. */
- global_entry = ((~esid) & 0x1f) << 3;
- ste = (struct stab_entry *)(stab | (((~esid) & 0x1f) << 7));
- }
-
- /*
- * Could not find empty entry, pick one with a round robin selection.
- * Search all entries in the two groups.
- */
- castout_entry = get_paca()->stab_rr;
- for (i = 0; i < 16; i++) {
- if (castout_entry < 8) {
- global_entry = (esid & 0x1f) << 3;
- ste = (struct stab_entry *)(stab | ((esid & 0x1f) << 7));
- castout_ste = ste + castout_entry;
- } else {
- global_entry = ((~esid) & 0x1f) << 3;
- ste = (struct stab_entry *)(stab | (((~esid) & 0x1f) << 7));
- castout_ste = ste + (castout_entry - 8);
- }
-
- /* Dont cast out the first kernel segment */
- if ((castout_ste->esid_data & ESID_MASK) != PAGE_OFFSET)
- break;
-
- castout_entry = (castout_entry + 1) & 0xf;
- }
-
- get_paca()->stab_rr = (castout_entry + 1) & 0xf;
-
- /* Modify the old entry to the new value. */
-
- /* Force previous translations to complete. DRENG */
- asm volatile("isync" : : : "memory");
-
- old_esid = castout_ste->esid_data >> SID_SHIFT;
- castout_ste->esid_data = 0; /* Invalidate old entry */
-
- asm volatile("sync" : : : "memory"); /* Order update */
-
- castout_ste->vsid_data = vsid_data;
- eieio(); /* Order update */
- castout_ste->esid_data = esid_data;
-
- asm volatile("slbie %0" : : "r" (old_esid << SID_SHIFT));
- /* Ensure completion of slbie */
- asm volatile("sync" : : : "memory");
-
- return (global_entry | (castout_entry & 0x7));
-}
-
-/*
- * Allocate a segment table entry for the given ea and mm
- */
-static int __ste_allocate(unsigned long ea, struct mm_struct *mm)
-{
- unsigned long vsid;
- unsigned char stab_entry;
- unsigned long offset;
-
- /* Kernel or user address? */
- if (is_kernel_addr(ea)) {
- vsid = get_kernel_vsid(ea, MMU_SEGSIZE_256M);
- } else {
- if ((ea >= TASK_SIZE_USER64) || (! mm))
- return 1;
-
- vsid = get_vsid(mm->context.id, ea, MMU_SEGSIZE_256M);
- }
-
- stab_entry = make_ste(get_paca()->stab_addr, GET_ESID(ea), vsid);
-
- if (!is_kernel_addr(ea)) {
- offset = __get_cpu_var(stab_cache_ptr);
- if (offset < NR_STAB_CACHE_ENTRIES)
- __get_cpu_var(stab_cache[offset++]) = stab_entry;
- else
- offset = NR_STAB_CACHE_ENTRIES+1;
- __get_cpu_var(stab_cache_ptr) = offset;
-
- /* Order update */
- asm volatile("sync":::"memory");
- }
-
- return 0;
-}
-
-int ste_allocate(unsigned long ea)
-{
- return __ste_allocate(ea, current->mm);
-}
-
-/*
- * Do the segment table work for a context switch: flush all user
- * entries from the table, then preload some probably useful entries
- * for the new task
- */
-void switch_stab(struct task_struct *tsk, struct mm_struct *mm)
-{
- struct stab_entry *stab = (struct stab_entry *) get_paca()->stab_addr;
- struct stab_entry *ste;
- unsigned long offset;
- unsigned long pc = KSTK_EIP(tsk);
- unsigned long stack = KSTK_ESP(tsk);
- unsigned long unmapped_base;
-
- /* Force previous translations to complete. DRENG */
- asm volatile("isync" : : : "memory");
-
- /*
- * We need interrupts hard-disabled here, not just soft-disabled,
- * so that a PMU interrupt can't occur, which might try to access
- * user memory (to get a stack trace) and possible cause an STAB miss
- * which would update the stab_cache/stab_cache_ptr per-cpu variables.
- */
- hard_irq_disable();
-
- offset = __get_cpu_var(stab_cache_ptr);
- if (offset <= NR_STAB_CACHE_ENTRIES) {
- int i;
-
- for (i = 0; i < offset; i++) {
- ste = stab + __get_cpu_var(stab_cache[i]);
- ste->esid_data = 0; /* invalidate entry */
- }
- } else {
- unsigned long entry;
-
- /* Invalidate all entries. */
- ste = stab;
-
- /* Never flush the first entry. */
- ste += 1;
- for (entry = 1;
- entry < (HW_PAGE_SIZE / sizeof(struct stab_entry));
- entry++, ste++) {
- unsigned long ea;
- ea = ste->esid_data & ESID_MASK;
- if (!is_kernel_addr(ea)) {
- ste->esid_data = 0;
- }
- }
- }
-
- asm volatile("sync; slbia; sync":::"memory");
-
- __get_cpu_var(stab_cache_ptr) = 0;
-
- /* Now preload some entries for the new task */
- if (test_tsk_thread_flag(tsk, TIF_32BIT))
- unmapped_base = TASK_UNMAPPED_BASE_USER32;
- else
- unmapped_base = TASK_UNMAPPED_BASE_USER64;
-
- __ste_allocate(pc, mm);
-
- if (GET_ESID(pc) == GET_ESID(stack))
- return;
-
- __ste_allocate(stack, mm);
-
- if ((GET_ESID(pc) == GET_ESID(unmapped_base))
- || (GET_ESID(stack) == GET_ESID(unmapped_base)))
- return;
-
- __ste_allocate(unmapped_base, mm);
-
- /* Order update */
- asm volatile("sync" : : : "memory");
-}
-
-/*
- * Allocate segment tables for secondary CPUs. These must all go in
- * the first (bolted) segment, so that do_stab_bolted won't get a
- * recursive segment miss on the segment table itself.
- */
-void __init stabs_alloc(void)
-{
- int cpu;
-
- if (mmu_has_feature(MMU_FTR_SLB))
- return;
-
- for_each_possible_cpu(cpu) {
- unsigned long newstab;
-
- if (cpu == 0)
- continue; /* stab for CPU 0 is statically allocated */
-
- newstab = memblock_alloc_base(HW_PAGE_SIZE, HW_PAGE_SIZE,
- 1<<SID_SHIFT);
- newstab = (unsigned long)__va(newstab);
-
- memset((void *)newstab, 0, HW_PAGE_SIZE);
-
- paca[cpu].stab_addr = newstab;
- paca[cpu].stab_real = __pa(newstab);
- printk(KERN_INFO "Segment table for CPU %d at 0x%llx "
- "virtual, 0x%llx absolute\n",
- cpu, paca[cpu].stab_addr, paca[cpu].stab_real);
- }
-}
-
-/*
- * Build an entry for the base kernel segment and put it into
- * the segment table or SLB. All other segment table or SLB
- * entries are faulted in.
- */
-void stab_initialize(unsigned long stab)
-{
- unsigned long vsid = get_kernel_vsid(PAGE_OFFSET, MMU_SEGSIZE_256M);
- unsigned long stabreal;
-
- asm volatile("isync; slbia; isync":::"memory");
- make_ste(stab, GET_ESID(PAGE_OFFSET), vsid);
-
- /* Order update */
- asm volatile("sync":::"memory");
-
- /* Set ASR */
- stabreal = get_paca()->stab_real | 0x1ul;
-
- mtspr(SPRN_ASR, stabreal);
-}
diff --git a/arch/powerpc/mm/tlb_hash32.c b/arch/powerpc/mm/tlb_hash32.c
deleted file mode 100644
index 558e30cce33e..000000000000
--- a/arch/powerpc/mm/tlb_hash32.c
+++ /dev/null
@@ -1,184 +0,0 @@
-/*
- * This file contains the routines for TLB flushing.
- * On machines where the MMU uses a hash table to store virtual to
- * physical translations, these routines flush entries from the
- * hash table also.
- * -- paulus
- *
- * Derived from arch/ppc/mm/init.c:
- * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
- *
- * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
- * and Cort Dougan (PReP) (cort@cs.nmt.edu)
- * Copyright (C) 1996 Paul Mackerras
- *
- * Derived from "arch/i386/mm/init.c"
- * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/init.h>
-#include <linux/highmem.h>
-#include <linux/pagemap.h>
-#include <linux/export.h>
-
-#include <asm/tlbflush.h>
-#include <asm/tlb.h>
-
-#include "mmu_decl.h"
-
-/*
- * Called when unmapping pages to flush entries from the TLB/hash table.
- */
-void flush_hash_entry(struct mm_struct *mm, pte_t *ptep, unsigned long addr)
-{
- unsigned long ptephys;
-
- if (Hash != 0) {
- ptephys = __pa(ptep) & PAGE_MASK;
- flush_hash_pages(mm->context.id, addr, ptephys, 1);
- }
-}
-EXPORT_SYMBOL(flush_hash_entry);
-
-/*
- * Called by ptep_set_access_flags, must flush on CPUs for which the
- * DSI handler can't just "fixup" the TLB on a write fault
- */
-void flush_tlb_page_nohash(struct vm_area_struct *vma, unsigned long addr)
-{
- if (Hash != 0)
- return;
- _tlbie(addr);
-}
-
-/*
- * Called at the end of a mmu_gather operation to make sure the
- * TLB flush is completely done.
- */
-void tlb_flush(struct mmu_gather *tlb)
-{
- if (Hash == 0) {
- /*
- * 603 needs to flush the whole TLB here since
- * it doesn't use a hash table.
- */
- _tlbia();
- }
-}
-
-/*
- * TLB flushing:
- *
- * - flush_tlb_mm(mm) flushes the specified mm context TLB's
- * - flush_tlb_page(vma, vmaddr) flushes one page
- * - flush_tlb_range(vma, start, end) flushes a range of pages
- * - flush_tlb_kernel_range(start, end) flushes kernel pages
- *
- * since the hardware hash table functions as an extension of the
- * tlb as far as the linux tables are concerned, flush it too.
- * -- Cort
- */
-
-static void flush_range(struct mm_struct *mm, unsigned long start,
- unsigned long end)
-{
- pmd_t *pmd;
- unsigned long pmd_end;
- int count;
- unsigned int ctx = mm->context.id;
-
- if (Hash == 0) {
- _tlbia();
- return;
- }
- start &= PAGE_MASK;
- if (start >= end)
- return;
- end = (end - 1) | ~PAGE_MASK;
- pmd = pmd_offset(pud_offset(pgd_offset(mm, start), start), start);
- for (;;) {
- pmd_end = ((start + PGDIR_SIZE) & PGDIR_MASK) - 1;
- if (pmd_end > end)
- pmd_end = end;
- if (!pmd_none(*pmd)) {
- count = ((pmd_end - start) >> PAGE_SHIFT) + 1;
- flush_hash_pages(ctx, start, pmd_val(*pmd), count);
- }
- if (pmd_end == end)
- break;
- start = pmd_end + 1;
- ++pmd;
- }
-}
-
-/*
- * Flush kernel TLB entries in the given range
- */
-void flush_tlb_kernel_range(unsigned long start, unsigned long end)
-{
- flush_range(&init_mm, start, end);
-}
-EXPORT_SYMBOL(flush_tlb_kernel_range);
-
-/*
- * Flush all the (user) entries for the address space described by mm.
- */
-void flush_tlb_mm(struct mm_struct *mm)
-{
- struct vm_area_struct *mp;
-
- if (Hash == 0) {
- _tlbia();
- return;
- }
-
- /*
- * It is safe to go down the mm's list of vmas when called
- * from dup_mmap, holding mmap_sem. It would also be safe from
- * unmap_region or exit_mmap, but not from vmtruncate on SMP -
- * but it seems dup_mmap is the only SMP case which gets here.
- */
- for (mp = mm->mmap; mp != NULL; mp = mp->vm_next)
- flush_range(mp->vm_mm, mp->vm_start, mp->vm_end);
-}
-EXPORT_SYMBOL(flush_tlb_mm);
-
-void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
-{
- struct mm_struct *mm;
- pmd_t *pmd;
-
- if (Hash == 0) {
- _tlbie(vmaddr);
- return;
- }
- mm = (vmaddr < TASK_SIZE)? vma->vm_mm: &init_mm;
- pmd = pmd_offset(pud_offset(pgd_offset(mm, vmaddr), vmaddr), vmaddr);
- if (!pmd_none(*pmd))
- flush_hash_pages(mm->context.id, vmaddr, pmd_val(*pmd), 1);
-}
-EXPORT_SYMBOL(flush_tlb_page);
-
-/*
- * For each address in the range, find the pte for the address
- * and check _PAGE_HASHPTE bit; if it is set, find and destroy
- * the corresponding HPTE.
- */
-void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
- unsigned long end)
-{
- flush_range(vma->vm_mm, start, end);
-}
-EXPORT_SYMBOL(flush_tlb_range);
-
-void __init early_init_mmu(void)
-{
-}
diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S
deleted file mode 100644
index b4113bf86353..000000000000
--- a/arch/powerpc/mm/tlb_low_64e.S
+++ /dev/null
@@ -1,980 +0,0 @@
-/*
- * Low level TLB miss handlers for Book3E
- *
- * Copyright (C) 2008-2009
- * Ben. Herrenschmidt (benh@kernel.crashing.org), IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <asm/processor.h>
-#include <asm/reg.h>
-#include <asm/page.h>
-#include <asm/mmu.h>
-#include <asm/ppc_asm.h>
-#include <asm/asm-offsets.h>
-#include <asm/cputable.h>
-#include <asm/pgtable.h>
-#include <asm/exception-64e.h>
-#include <asm/ppc-opcode.h>
-#include <asm/kvm_asm.h>
-#include <asm/kvm_booke_hv_asm.h>
-
-#ifdef CONFIG_PPC_64K_PAGES
-#define VPTE_PMD_SHIFT (PTE_INDEX_SIZE+1)
-#else
-#define VPTE_PMD_SHIFT (PTE_INDEX_SIZE)
-#endif
-#define VPTE_PUD_SHIFT (VPTE_PMD_SHIFT + PMD_INDEX_SIZE)
-#define VPTE_PGD_SHIFT (VPTE_PUD_SHIFT + PUD_INDEX_SIZE)
-#define VPTE_INDEX_SIZE (VPTE_PGD_SHIFT + PGD_INDEX_SIZE)
-
-/**********************************************************************
- * *
- * TLB miss handling for Book3E with a bolted linear mapping *
- * No virtual page table, no nested TLB misses *
- * *
- **********************************************************************/
-
-.macro tlb_prolog_bolted intnum addr
- mtspr SPRN_SPRG_GEN_SCRATCH,r13
- mfspr r13,SPRN_SPRG_PACA
- std r10,PACA_EXTLB+EX_TLB_R10(r13)
- mfcr r10
- std r11,PACA_EXTLB+EX_TLB_R11(r13)
-#ifdef CONFIG_KVM_BOOKE_HV
-BEGIN_FTR_SECTION
- mfspr r11, SPRN_SRR1
-END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)
-#endif
- DO_KVM \intnum, SPRN_SRR1
- std r16,PACA_EXTLB+EX_TLB_R16(r13)
- mfspr r16,\addr /* get faulting address */
- std r14,PACA_EXTLB+EX_TLB_R14(r13)
- ld r14,PACAPGD(r13)
- std r15,PACA_EXTLB+EX_TLB_R15(r13)
- std r10,PACA_EXTLB+EX_TLB_CR(r13)
- TLB_MISS_PROLOG_STATS_BOLTED
-.endm
-
-.macro tlb_epilog_bolted
- ld r14,PACA_EXTLB+EX_TLB_CR(r13)
- ld r10,PACA_EXTLB+EX_TLB_R10(r13)
- ld r11,PACA_EXTLB+EX_TLB_R11(r13)
- mtcr r14
- ld r14,PACA_EXTLB+EX_TLB_R14(r13)
- ld r15,PACA_EXTLB+EX_TLB_R15(r13)
- TLB_MISS_RESTORE_STATS_BOLTED
- ld r16,PACA_EXTLB+EX_TLB_R16(r13)
- mfspr r13,SPRN_SPRG_GEN_SCRATCH
-.endm
-
-/* Data TLB miss */
- START_EXCEPTION(data_tlb_miss_bolted)
- tlb_prolog_bolted BOOKE_INTERRUPT_DTLB_MISS SPRN_DEAR
-
- /* We need _PAGE_PRESENT and _PAGE_ACCESSED set */
-
- /* We do the user/kernel test for the PID here along with the RW test
- */
- /* We pre-test some combination of permissions to avoid double
- * faults:
- *
- * We move the ESR:ST bit into the position of _PAGE_BAP_SW in the PTE
- * ESR_ST is 0x00800000
- * _PAGE_BAP_SW is 0x00000010
- * So the shift is >> 19. This tests for supervisor writeability.
- * If the page happens to be supervisor writeable and not user
- * writeable, we will take a new fault later, but that should be
- * a rare enough case.
- *
- * We also move ESR_ST in _PAGE_DIRTY position
- * _PAGE_DIRTY is 0x00001000 so the shift is >> 11
- *
- * MAS1 is preset for all we need except for TID that needs to
- * be cleared for kernel translations
- */
-
- mfspr r11,SPRN_ESR
-
- srdi r15,r16,60 /* get region */
- rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4
- bne- dtlb_miss_fault_bolted /* Bail if fault addr is invalid */
-
- rlwinm r10,r11,32-19,27,27
- rlwimi r10,r11,32-16,19,19
- cmpwi r15,0 /* user vs kernel check */
- ori r10,r10,_PAGE_PRESENT
- oris r11,r10,_PAGE_ACCESSED@h
-
- TLB_MISS_STATS_SAVE_INFO_BOLTED
- bne tlb_miss_kernel_bolted
-
-tlb_miss_common_bolted:
-/*
- * This is the guts of the TLB miss handler for bolted-linear.
- * We are entered with:
- *
- * r16 = faulting address
- * r15 = crap (free to use)
- * r14 = page table base
- * r13 = PACA
- * r11 = PTE permission mask
- * r10 = crap (free to use)
- */
- rldicl r15,r16,64-PGDIR_SHIFT+3,64-PGD_INDEX_SIZE-3
- cmpldi cr0,r14,0
- clrrdi r15,r15,3
- beq tlb_miss_fault_bolted /* No PGDIR, bail */
-
-BEGIN_MMU_FTR_SECTION
- /* Set the TLB reservation and search for existing entry. Then load
- * the entry.
- */
- PPC_TLBSRX_DOT(0,R16)
- ldx r14,r14,r15 /* grab pgd entry */
- beq normal_tlb_miss_done /* tlb exists already, bail */
-MMU_FTR_SECTION_ELSE
- ldx r14,r14,r15 /* grab pgd entry */
-ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV)
-
-#ifndef CONFIG_PPC_64K_PAGES
- rldicl r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3
- clrrdi r15,r15,3
- cmpdi cr0,r14,0
- bge tlb_miss_fault_bolted /* Bad pgd entry or hugepage; bail */
- ldx r14,r14,r15 /* grab pud entry */
-#endif /* CONFIG_PPC_64K_PAGES */
-
- rldicl r15,r16,64-PMD_SHIFT+3,64-PMD_INDEX_SIZE-3
- clrrdi r15,r15,3
- cmpdi cr0,r14,0
- bge tlb_miss_fault_bolted
- ldx r14,r14,r15 /* Grab pmd entry */
-
- rldicl r15,r16,64-PAGE_SHIFT+3,64-PTE_INDEX_SIZE-3
- clrrdi r15,r15,3
- cmpdi cr0,r14,0
- bge tlb_miss_fault_bolted
- ldx r14,r14,r15 /* Grab PTE, normal (!huge) page */
-
- /* Check if required permissions are met */
- andc. r15,r11,r14
- rldicr r15,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT
- bne- tlb_miss_fault_bolted
-
- /* Now we build the MAS:
- *
- * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG
- * MAS 1 : Almost fully setup
- * - PID already updated by caller if necessary
- * - TSIZE need change if !base page size, not
- * yet implemented for now
- * MAS 2 : Defaults not useful, need to be redone
- * MAS 3+7 : Needs to be done
- */
- clrrdi r11,r16,12 /* Clear low crap in EA */
- clrldi r15,r15,12 /* Clear crap at the top */
- rlwimi r11,r14,32-19,27,31 /* Insert WIMGE */
- rlwimi r15,r14,32-8,22,25 /* Move in U bits */
- mtspr SPRN_MAS2,r11
- andi. r11,r14,_PAGE_DIRTY
- rlwimi r15,r14,32-2,26,31 /* Move in BAP bits */
-
- /* Mask out SW and UW if !DIRTY (XXX optimize this !) */
- bne 1f
- li r11,MAS3_SW|MAS3_UW
- andc r15,r15,r11
-1:
- mtspr SPRN_MAS7_MAS3,r15
- tlbwe
-
- TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK)
- tlb_epilog_bolted
- rfi
-
-itlb_miss_kernel_bolted:
- li r11,_PAGE_PRESENT|_PAGE_BAP_SX /* Base perm */
- oris r11,r11,_PAGE_ACCESSED@h
-tlb_miss_kernel_bolted:
- mfspr r10,SPRN_MAS1
- ld r14,PACA_KERNELPGD(r13)
- cmpldi cr0,r15,8 /* Check for vmalloc region */
- rlwinm r10,r10,0,16,1 /* Clear TID */
- mtspr SPRN_MAS1,r10
- beq+ tlb_miss_common_bolted
-
-tlb_miss_fault_bolted:
- /* We need to check if it was an instruction miss */
- andi. r10,r11,_PAGE_EXEC|_PAGE_BAP_SX
- bne itlb_miss_fault_bolted
-dtlb_miss_fault_bolted:
- TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT)
- tlb_epilog_bolted
- b exc_data_storage_book3e
-itlb_miss_fault_bolted:
- TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT)
- tlb_epilog_bolted
- b exc_instruction_storage_book3e
-
-/* Instruction TLB miss */
- START_EXCEPTION(instruction_tlb_miss_bolted)
- tlb_prolog_bolted BOOKE_INTERRUPT_ITLB_MISS SPRN_SRR0
-
- rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4
- srdi r15,r16,60 /* get region */
- TLB_MISS_STATS_SAVE_INFO_BOLTED
- bne- itlb_miss_fault_bolted
-
- li r11,_PAGE_PRESENT|_PAGE_EXEC /* Base perm */
-
- /* We do the user/kernel test for the PID here along with the RW test
- */
-
- cmpldi cr0,r15,0 /* Check for user region */
- oris r11,r11,_PAGE_ACCESSED@h
- beq tlb_miss_common_bolted
- b itlb_miss_kernel_bolted
-
-/**********************************************************************
- * *
- * TLB miss handling for Book3E with TLB reservation and HES support *
- * *
- **********************************************************************/
-
-
-/* Data TLB miss */
- START_EXCEPTION(data_tlb_miss)
- TLB_MISS_PROLOG
-
- /* Now we handle the fault proper. We only save DEAR in normal
- * fault case since that's the only interesting values here.
- * We could probably also optimize by not saving SRR0/1 in the
- * linear mapping case but I'll leave that for later
- */
- mfspr r14,SPRN_ESR
- mfspr r16,SPRN_DEAR /* get faulting address */
- srdi r15,r16,60 /* get region */
- cmpldi cr0,r15,0xc /* linear mapping ? */
- TLB_MISS_STATS_SAVE_INFO
- beq tlb_load_linear /* yes -> go to linear map load */
-
- /* The page tables are mapped virtually linear. At this point, though,
- * we don't know whether we are trying to fault in a first level
- * virtual address or a virtual page table address. We can get that
- * from bit 0x1 of the region ID which we have set for a page table
- */
- andi. r10,r15,0x1
- bne- virt_page_table_tlb_miss
-
- std r14,EX_TLB_ESR(r12); /* save ESR */
- std r16,EX_TLB_DEAR(r12); /* save DEAR */
-
- /* We need _PAGE_PRESENT and _PAGE_ACCESSED set */
- li r11,_PAGE_PRESENT
- oris r11,r11,_PAGE_ACCESSED@h
-
- /* We do the user/kernel test for the PID here along with the RW test
- */
- cmpldi cr0,r15,0 /* Check for user region */
-
- /* We pre-test some combination of permissions to avoid double
- * faults:
- *
- * We move the ESR:ST bit into the position of _PAGE_BAP_SW in the PTE
- * ESR_ST is 0x00800000
- * _PAGE_BAP_SW is 0x00000010
- * So the shift is >> 19. This tests for supervisor writeability.
- * If the page happens to be supervisor writeable and not user
- * writeable, we will take a new fault later, but that should be
- * a rare enough case.
- *
- * We also move ESR_ST in _PAGE_DIRTY position
- * _PAGE_DIRTY is 0x00001000 so the shift is >> 11
- *
- * MAS1 is preset for all we need except for TID that needs to
- * be cleared for kernel translations
- */
- rlwimi r11,r14,32-19,27,27
- rlwimi r11,r14,32-16,19,19
- beq normal_tlb_miss
- /* XXX replace the RMW cycles with immediate loads + writes */
-1: mfspr r10,SPRN_MAS1
- cmpldi cr0,r15,8 /* Check for vmalloc region */
- rlwinm r10,r10,0,16,1 /* Clear TID */
- mtspr SPRN_MAS1,r10
- beq+ normal_tlb_miss
-
- /* We got a crappy address, just fault with whatever DEAR and ESR
- * are here
- */
- TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT)
- TLB_MISS_EPILOG_ERROR
- b exc_data_storage_book3e
-
-/* Instruction TLB miss */
- START_EXCEPTION(instruction_tlb_miss)
- TLB_MISS_PROLOG
-
- /* If we take a recursive fault, the second level handler may need
- * to know whether we are handling a data or instruction fault in
- * order to get to the right store fault handler. We provide that
- * info by writing a crazy value in ESR in our exception frame
- */
- li r14,-1 /* store to exception frame is done later */
-
- /* Now we handle the fault proper. We only save DEAR in the non
- * linear mapping case since we know the linear mapping case will
- * not re-enter. We could indeed optimize and also not save SRR0/1
- * in the linear mapping case but I'll leave that for later
- *
- * Faulting address is SRR0 which is already in r16
- */
- srdi r15,r16,60 /* get region */
- cmpldi cr0,r15,0xc /* linear mapping ? */
- TLB_MISS_STATS_SAVE_INFO
- beq tlb_load_linear /* yes -> go to linear map load */
-
- /* We do the user/kernel test for the PID here along with the RW test
- */
- li r11,_PAGE_PRESENT|_PAGE_EXEC /* Base perm */
- oris r11,r11,_PAGE_ACCESSED@h
-
- cmpldi cr0,r15,0 /* Check for user region */
- std r14,EX_TLB_ESR(r12) /* write crazy -1 to frame */
- beq normal_tlb_miss
-
- li r11,_PAGE_PRESENT|_PAGE_BAP_SX /* Base perm */
- oris r11,r11,_PAGE_ACCESSED@h
- /* XXX replace the RMW cycles with immediate loads + writes */
- mfspr r10,SPRN_MAS1
- cmpldi cr0,r15,8 /* Check for vmalloc region */
- rlwinm r10,r10,0,16,1 /* Clear TID */
- mtspr SPRN_MAS1,r10
- beq+ normal_tlb_miss
-
- /* We got a crappy address, just fault */
- TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT)
- TLB_MISS_EPILOG_ERROR
- b exc_instruction_storage_book3e
-
-/*
- * This is the guts of the first-level TLB miss handler for direct
- * misses. We are entered with:
- *
- * r16 = faulting address
- * r15 = region ID
- * r14 = crap (free to use)
- * r13 = PACA
- * r12 = TLB exception frame in PACA
- * r11 = PTE permission mask
- * r10 = crap (free to use)
- */
-normal_tlb_miss:
- /* So we first construct the page table address. We do that by
- * shifting the bottom of the address (not the region ID) by
- * PAGE_SHIFT-3, clearing the bottom 3 bits (get a PTE ptr) and
- * or'ing the fourth high bit.
- *
- * NOTE: For 64K pages, we do things slightly differently in
- * order to handle the weird page table format used by linux
- */
- ori r10,r15,0x1
-#ifdef CONFIG_PPC_64K_PAGES
- /* For the top bits, 16 bytes per PTE */
- rldicl r14,r16,64-(PAGE_SHIFT-4),PAGE_SHIFT-4+4
- /* Now create the bottom bits as 0 in position 0x8000 and
- * the rest calculated for 8 bytes per PTE
- */
- rldicl r15,r16,64-(PAGE_SHIFT-3),64-15
- /* Insert the bottom bits in */
- rlwimi r14,r15,0,16,31
-#else
- rldicl r14,r16,64-(PAGE_SHIFT-3),PAGE_SHIFT-3+4
-#endif
- sldi r15,r10,60
- clrrdi r14,r14,3
- or r10,r15,r14
-
-BEGIN_MMU_FTR_SECTION
- /* Set the TLB reservation and search for existing entry. Then load
- * the entry.
- */
- PPC_TLBSRX_DOT(0,R16)
- ld r14,0(r10)
- beq normal_tlb_miss_done
-MMU_FTR_SECTION_ELSE
- ld r14,0(r10)
-ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV)
-
-finish_normal_tlb_miss:
- /* Check if required permissions are met */
- andc. r15,r11,r14
- bne- normal_tlb_miss_access_fault
-
- /* Now we build the MAS:
- *
- * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG
- * MAS 1 : Almost fully setup
- * - PID already updated by caller if necessary
- * - TSIZE need change if !base page size, not
- * yet implemented for now
- * MAS 2 : Defaults not useful, need to be redone
- * MAS 3+7 : Needs to be done
- *
- * TODO: mix up code below for better scheduling
- */
- clrrdi r11,r16,12 /* Clear low crap in EA */
- rlwimi r11,r14,32-19,27,31 /* Insert WIMGE */
- mtspr SPRN_MAS2,r11
-
- /* Check page size, if not standard, update MAS1 */
- rldicl r11,r14,64-8,64-8
-#ifdef CONFIG_PPC_64K_PAGES
- cmpldi cr0,r11,BOOK3E_PAGESZ_64K
-#else
- cmpldi cr0,r11,BOOK3E_PAGESZ_4K
-#endif
- beq- 1f
- mfspr r11,SPRN_MAS1
- rlwimi r11,r14,31,21,24
- rlwinm r11,r11,0,21,19
- mtspr SPRN_MAS1,r11
-1:
- /* Move RPN in position */
- rldicr r11,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT
- clrldi r15,r11,12 /* Clear crap at the top */
- rlwimi r15,r14,32-8,22,25 /* Move in U bits */
- rlwimi r15,r14,32-2,26,31 /* Move in BAP bits */
-
- /* Mask out SW and UW if !DIRTY (XXX optimize this !) */
- andi. r11,r14,_PAGE_DIRTY
- bne 1f
- li r11,MAS3_SW|MAS3_UW
- andc r15,r15,r11
-1:
-BEGIN_MMU_FTR_SECTION
- srdi r16,r15,32
- mtspr SPRN_MAS3,r15
- mtspr SPRN_MAS7,r16
-MMU_FTR_SECTION_ELSE
- mtspr SPRN_MAS7_MAS3,r15
-ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS)
-
- tlbwe
-
-normal_tlb_miss_done:
- /* We don't bother with restoring DEAR or ESR since we know we are
- * level 0 and just going back to userland. They are only needed
- * if you are going to take an access fault
- */
- TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK)
- TLB_MISS_EPILOG_SUCCESS
- rfi
-
-normal_tlb_miss_access_fault:
- /* We need to check if it was an instruction miss */
- andi. r10,r11,_PAGE_EXEC
- bne 1f
- ld r14,EX_TLB_DEAR(r12)
- ld r15,EX_TLB_ESR(r12)
- mtspr SPRN_DEAR,r14
- mtspr SPRN_ESR,r15
- TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT)
- TLB_MISS_EPILOG_ERROR
- b exc_data_storage_book3e
-1: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT)
- TLB_MISS_EPILOG_ERROR
- b exc_instruction_storage_book3e
-
-
-/*
- * This is the guts of the second-level TLB miss handler for direct
- * misses. We are entered with:
- *
- * r16 = virtual page table faulting address
- * r15 = region (top 4 bits of address)
- * r14 = crap (free to use)
- * r13 = PACA
- * r12 = TLB exception frame in PACA
- * r11 = crap (free to use)
- * r10 = crap (free to use)
- *
- * Note that this should only ever be called as a second level handler
- * with the current scheme when using SW load.
- * That means we can always get the original fault DEAR at
- * EX_TLB_DEAR-EX_TLB_SIZE(r12)
- *
- * It can be re-entered by the linear mapping miss handler. However, to
- * avoid too much complication, it will restart the whole fault at level
- * 0 so we don't care too much about clobbers
- *
- * XXX That code was written back when we couldn't clobber r14. We can now,
- * so we could probably optimize things a bit
- */
-virt_page_table_tlb_miss:
- /* Are we hitting a kernel page table ? */
- andi. r10,r15,0x8
-
- /* The cool thing now is that r10 contains 0 for user and 8 for kernel,
- * and we happen to have the swapper_pg_dir at offset 8 from the user
- * pgdir in the PACA :-).
- */
- add r11,r10,r13
-
- /* If kernel, we need to clear MAS1 TID */
- beq 1f
- /* XXX replace the RMW cycles with immediate loads + writes */
- mfspr r10,SPRN_MAS1
- rlwinm r10,r10,0,16,1 /* Clear TID */
- mtspr SPRN_MAS1,r10
-1:
-BEGIN_MMU_FTR_SECTION
- /* Search if we already have a TLB entry for that virtual address, and
- * if we do, bail out.
- */
- PPC_TLBSRX_DOT(0,R16)
- beq virt_page_table_tlb_miss_done
-END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV)
-
- /* Now, we need to walk the page tables. First check if we are in
- * range.
- */
- rldicl. r10,r16,64-(VPTE_INDEX_SIZE+3),VPTE_INDEX_SIZE+3+4
- bne- virt_page_table_tlb_miss_fault
-
- /* Get the PGD pointer */
- ld r15,PACAPGD(r11)
- cmpldi cr0,r15,0
- beq- virt_page_table_tlb_miss_fault
-
- /* Get to PGD entry */
- rldicl r11,r16,64-VPTE_PGD_SHIFT,64-PGD_INDEX_SIZE-3
- clrrdi r10,r11,3
- ldx r15,r10,r15
- cmpdi cr0,r15,0
- bge virt_page_table_tlb_miss_fault
-
-#ifndef CONFIG_PPC_64K_PAGES
- /* Get to PUD entry */
- rldicl r11,r16,64-VPTE_PUD_SHIFT,64-PUD_INDEX_SIZE-3
- clrrdi r10,r11,3
- ldx r15,r10,r15
- cmpdi cr0,r15,0
- bge virt_page_table_tlb_miss_fault
-#endif /* CONFIG_PPC_64K_PAGES */
-
- /* Get to PMD entry */
- rldicl r11,r16,64-VPTE_PMD_SHIFT,64-PMD_INDEX_SIZE-3
- clrrdi r10,r11,3
- ldx r15,r10,r15
- cmpdi cr0,r15,0
- bge virt_page_table_tlb_miss_fault
-
- /* Ok, we're all right, we can now create a kernel translation for
- * a 4K or 64K page from r16 -> r15.
- */
- /* Now we build the MAS:
- *
- * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG
- * MAS 1 : Almost fully setup
- * - PID already updated by caller if necessary
- * - TSIZE for now is base page size always
- * MAS 2 : Use defaults
- * MAS 3+7 : Needs to be done
- *
- * So we only do MAS 2 and 3 for now...
- */
- clrldi r11,r15,4 /* remove region ID from RPN */
- ori r10,r11,1 /* Or-in SR */
-
-BEGIN_MMU_FTR_SECTION
- srdi r16,r10,32
- mtspr SPRN_MAS3,r10
- mtspr SPRN_MAS7,r16
-MMU_FTR_SECTION_ELSE
- mtspr SPRN_MAS7_MAS3,r10
-ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS)
-
- tlbwe
-
-BEGIN_MMU_FTR_SECTION
-virt_page_table_tlb_miss_done:
-
- /* We have overriden MAS2:EPN but currently our primary TLB miss
- * handler will always restore it so that should not be an issue,
- * if we ever optimize the primary handler to not write MAS2 on
- * some cases, we'll have to restore MAS2:EPN here based on the
- * original fault's DEAR. If we do that we have to modify the
- * ITLB miss handler to also store SRR0 in the exception frame
- * as DEAR.
- *
- * However, one nasty thing we did is we cleared the reservation
- * (well, potentially we did). We do a trick here thus if we
- * are not a level 0 exception (we interrupted the TLB miss) we
- * offset the return address by -4 in order to replay the tlbsrx
- * instruction there
- */
- subf r10,r13,r12
- cmpldi cr0,r10,PACA_EXTLB+EX_TLB_SIZE
- bne- 1f
- ld r11,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13)
- addi r10,r11,-4
- std r10,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13)
-1:
-END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV)
- /* Return to caller, normal case */
- TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_OK);
- TLB_MISS_EPILOG_SUCCESS
- rfi
-
-virt_page_table_tlb_miss_fault:
- /* If we fault here, things are a little bit tricky. We need to call
- * either data or instruction store fault, and we need to retrieve
- * the original fault address and ESR (for data).
- *
- * The thing is, we know that in normal circumstances, this is
- * always called as a second level tlb miss for SW load or as a first
- * level TLB miss for HW load, so we should be able to peek at the
- * relevant information in the first exception frame in the PACA.
- *
- * However, we do need to double check that, because we may just hit
- * a stray kernel pointer or a userland attack trying to hit those
- * areas. If that is the case, we do a data fault. (We can't get here
- * from an instruction tlb miss anyway).
- *
- * Note also that when going to a fault, we must unwind the previous
- * level as well. Since we are doing that, we don't need to clear or
- * restore the TLB reservation neither.
- */
- subf r10,r13,r12
- cmpldi cr0,r10,PACA_EXTLB+EX_TLB_SIZE
- bne- virt_page_table_tlb_miss_whacko_fault
-
- /* We dig the original DEAR and ESR from slot 0 */
- ld r15,EX_TLB_DEAR+PACA_EXTLB(r13)
- ld r16,EX_TLB_ESR+PACA_EXTLB(r13)
-
- /* We check for the "special" ESR value for instruction faults */
- cmpdi cr0,r16,-1
- beq 1f
- mtspr SPRN_DEAR,r15
- mtspr SPRN_ESR,r16
- TLB_MISS_STATS_D(MMSTAT_TLB_MISS_PT_FAULT);
- TLB_MISS_EPILOG_ERROR
- b exc_data_storage_book3e
-1: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_PT_FAULT);
- TLB_MISS_EPILOG_ERROR
- b exc_instruction_storage_book3e
-
-virt_page_table_tlb_miss_whacko_fault:
- /* The linear fault will restart everything so ESR and DEAR will
- * not have been clobbered, let's just fault with what we have
- */
- TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_FAULT);
- TLB_MISS_EPILOG_ERROR
- b exc_data_storage_book3e
-
-
-/**************************************************************
- * *
- * TLB miss handling for Book3E with hw page table support *
- * *
- **************************************************************/
-
-
-/* Data TLB miss */
- START_EXCEPTION(data_tlb_miss_htw)
- TLB_MISS_PROLOG
-
- /* Now we handle the fault proper. We only save DEAR in normal
- * fault case since that's the only interesting values here.
- * We could probably also optimize by not saving SRR0/1 in the
- * linear mapping case but I'll leave that for later
- */
- mfspr r14,SPRN_ESR
- mfspr r16,SPRN_DEAR /* get faulting address */
- srdi r11,r16,60 /* get region */
- cmpldi cr0,r11,0xc /* linear mapping ? */
- TLB_MISS_STATS_SAVE_INFO
- beq tlb_load_linear /* yes -> go to linear map load */
-
- /* We do the user/kernel test for the PID here along with the RW test
- */
- cmpldi cr0,r11,0 /* Check for user region */
- ld r15,PACAPGD(r13) /* Load user pgdir */
- beq htw_tlb_miss
-
- /* XXX replace the RMW cycles with immediate loads + writes */
-1: mfspr r10,SPRN_MAS1
- cmpldi cr0,r11,8 /* Check for vmalloc region */
- rlwinm r10,r10,0,16,1 /* Clear TID */
- mtspr SPRN_MAS1,r10
- ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */
- beq+ htw_tlb_miss
-
- /* We got a crappy address, just fault with whatever DEAR and ESR
- * are here
- */
- TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT)
- TLB_MISS_EPILOG_ERROR
- b exc_data_storage_book3e
-
-/* Instruction TLB miss */
- START_EXCEPTION(instruction_tlb_miss_htw)
- TLB_MISS_PROLOG
-
- /* If we take a recursive fault, the second level handler may need
- * to know whether we are handling a data or instruction fault in
- * order to get to the right store fault handler. We provide that
- * info by keeping a crazy value for ESR in r14
- */
- li r14,-1 /* store to exception frame is done later */
-
- /* Now we handle the fault proper. We only save DEAR in the non
- * linear mapping case since we know the linear mapping case will
- * not re-enter. We could indeed optimize and also not save SRR0/1
- * in the linear mapping case but I'll leave that for later
- *
- * Faulting address is SRR0 which is already in r16
- */
- srdi r11,r16,60 /* get region */
- cmpldi cr0,r11,0xc /* linear mapping ? */
- TLB_MISS_STATS_SAVE_INFO
- beq tlb_load_linear /* yes -> go to linear map load */
-
- /* We do the user/kernel test for the PID here along with the RW test
- */
- cmpldi cr0,r11,0 /* Check for user region */
- ld r15,PACAPGD(r13) /* Load user pgdir */
- beq htw_tlb_miss
-
- /* XXX replace the RMW cycles with immediate loads + writes */
-1: mfspr r10,SPRN_MAS1
- cmpldi cr0,r11,8 /* Check for vmalloc region */
- rlwinm r10,r10,0,16,1 /* Clear TID */
- mtspr SPRN_MAS1,r10
- ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */
- beq+ htw_tlb_miss
-
- /* We got a crappy address, just fault */
- TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT)
- TLB_MISS_EPILOG_ERROR
- b exc_instruction_storage_book3e
-
-
-/*
- * This is the guts of the second-level TLB miss handler for direct
- * misses. We are entered with:
- *
- * r16 = virtual page table faulting address
- * r15 = PGD pointer
- * r14 = ESR
- * r13 = PACA
- * r12 = TLB exception frame in PACA
- * r11 = crap (free to use)
- * r10 = crap (free to use)
- *
- * It can be re-entered by the linear mapping miss handler. However, to
- * avoid too much complication, it will save/restore things for us
- */
-htw_tlb_miss:
- /* Search if we already have a TLB entry for that virtual address, and
- * if we do, bail out.
- *
- * MAS1:IND should be already set based on MAS4
- */
- PPC_TLBSRX_DOT(0,R16)
- beq htw_tlb_miss_done
-
- /* Now, we need to walk the page tables. First check if we are in
- * range.
- */
- rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4
- bne- htw_tlb_miss_fault
-
- /* Get the PGD pointer */
- cmpldi cr0,r15,0
- beq- htw_tlb_miss_fault
-
- /* Get to PGD entry */
- rldicl r11,r16,64-(PGDIR_SHIFT-3),64-PGD_INDEX_SIZE-3
- clrrdi r10,r11,3
- ldx r15,r10,r15
- cmpdi cr0,r15,0
- bge htw_tlb_miss_fault
-
-#ifndef CONFIG_PPC_64K_PAGES
- /* Get to PUD entry */
- rldicl r11,r16,64-(PUD_SHIFT-3),64-PUD_INDEX_SIZE-3
- clrrdi r10,r11,3
- ldx r15,r10,r15
- cmpdi cr0,r15,0
- bge htw_tlb_miss_fault
-#endif /* CONFIG_PPC_64K_PAGES */
-
- /* Get to PMD entry */
- rldicl r11,r16,64-(PMD_SHIFT-3),64-PMD_INDEX_SIZE-3
- clrrdi r10,r11,3
- ldx r15,r10,r15
- cmpdi cr0,r15,0
- bge htw_tlb_miss_fault
-
- /* Ok, we're all right, we can now create an indirect entry for
- * a 1M or 256M page.
- *
- * The last trick is now that because we use "half" pages for
- * the HTW (1M IND is 2K and 256M IND is 32K) we need to account
- * for an added LSB bit to the RPN. For 64K pages, there is no
- * problem as we already use 32K arrays (half PTE pages), but for
- * 4K page we need to extract a bit from the virtual address and
- * insert it into the "PA52" bit of the RPN.
- */
-#ifndef CONFIG_PPC_64K_PAGES
- rlwimi r15,r16,32-9,20,20
-#endif
- /* Now we build the MAS:
- *
- * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG
- * MAS 1 : Almost fully setup
- * - PID already updated by caller if necessary
- * - TSIZE for now is base ind page size always
- * MAS 2 : Use defaults
- * MAS 3+7 : Needs to be done
- */
-#ifdef CONFIG_PPC_64K_PAGES
- ori r10,r15,(BOOK3E_PAGESZ_64K << MAS3_SPSIZE_SHIFT)
-#else
- ori r10,r15,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT)
-#endif
-
-BEGIN_MMU_FTR_SECTION
- srdi r16,r10,32
- mtspr SPRN_MAS3,r10
- mtspr SPRN_MAS7,r16
-MMU_FTR_SECTION_ELSE
- mtspr SPRN_MAS7_MAS3,r10
-ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS)
-
- tlbwe
-
-htw_tlb_miss_done:
- /* We don't bother with restoring DEAR or ESR since we know we are
- * level 0 and just going back to userland. They are only needed
- * if you are going to take an access fault
- */
- TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_OK)
- TLB_MISS_EPILOG_SUCCESS
- rfi
-
-htw_tlb_miss_fault:
- /* We need to check if it was an instruction miss. We know this
- * though because r14 would contain -1
- */
- cmpdi cr0,r14,-1
- beq 1f
- mtspr SPRN_DEAR,r16
- mtspr SPRN_ESR,r14
- TLB_MISS_STATS_D(MMSTAT_TLB_MISS_PT_FAULT)
- TLB_MISS_EPILOG_ERROR
- b exc_data_storage_book3e
-1: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_PT_FAULT)
- TLB_MISS_EPILOG_ERROR
- b exc_instruction_storage_book3e
-
-/*
- * This is the guts of "any" level TLB miss handler for kernel linear
- * mapping misses. We are entered with:
- *
- *
- * r16 = faulting address
- * r15 = crap (free to use)
- * r14 = ESR (data) or -1 (instruction)
- * r13 = PACA
- * r12 = TLB exception frame in PACA
- * r11 = crap (free to use)
- * r10 = crap (free to use)
- *
- * In addition we know that we will not re-enter, so in theory, we could
- * use a simpler epilog not restoring SRR0/1 etc.. but we'll do that later.
- *
- * We also need to be careful about MAS registers here & TLB reservation,
- * as we know we'll have clobbered them if we interrupt the main TLB miss
- * handlers in which case we probably want to do a full restart at level
- * 0 rather than saving / restoring the MAS.
- *
- * Note: If we care about performance of that core, we can easily shuffle
- * a few things around
- */
-tlb_load_linear:
- /* For now, we assume the linear mapping is contiguous and stops at
- * linear_map_top. We also assume the size is a multiple of 1G, thus
- * we only use 1G pages for now. That might have to be changed in a
- * final implementation, especially when dealing with hypervisors
- */
- ld r11,PACATOC(r13)
- ld r11,linear_map_top@got(r11)
- ld r10,0(r11)
- cmpld cr0,r10,r16
- bge tlb_load_linear_fault
-
- /* MAS1 need whole new setup. */
- li r15,(BOOK3E_PAGESZ_1GB<<MAS1_TSIZE_SHIFT)
- oris r15,r15,MAS1_VALID@h /* MAS1 needs V and TSIZE */
- mtspr SPRN_MAS1,r15
-
- /* Already somebody there ? */
- PPC_TLBSRX_DOT(0,R16)
- beq tlb_load_linear_done
-
- /* Now we build the remaining MAS. MAS0 and 2 should be fine
- * with their defaults, which leaves us with MAS 3 and 7. The
- * mapping is linear, so we just take the address, clear the
- * region bits, and or in the permission bits which are currently
- * hard wired
- */
- clrrdi r10,r16,30 /* 1G page index */
- clrldi r10,r10,4 /* clear region bits */
- ori r10,r10,MAS3_SR|MAS3_SW|MAS3_SX
-
-BEGIN_MMU_FTR_SECTION
- srdi r16,r10,32
- mtspr SPRN_MAS3,r10
- mtspr SPRN_MAS7,r16
-MMU_FTR_SECTION_ELSE
- mtspr SPRN_MAS7_MAS3,r10
-ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS)
-
- tlbwe
-
-tlb_load_linear_done:
- /* We use the "error" epilog for success as we do want to
- * restore to the initial faulting context, whatever it was.
- * We do that because we can't resume a fault within a TLB
- * miss handler, due to MAS and TLB reservation being clobbered.
- */
- TLB_MISS_STATS_X(MMSTAT_TLB_MISS_LINEAR)
- TLB_MISS_EPILOG_ERROR
- rfi
-
-tlb_load_linear_fault:
- /* We keep the DEAR and ESR around, this shouldn't have happened */
- cmpdi cr0,r14,-1
- beq 1f
- TLB_MISS_EPILOG_ERROR_SPECIAL
- b exc_data_storage_book3e
-1: TLB_MISS_EPILOG_ERROR_SPECIAL
- b exc_instruction_storage_book3e
-
-
-#ifdef CONFIG_BOOK3E_MMU_TLB_STATS
-.tlb_stat_inc:
-1: ldarx r8,0,r9
- addi r8,r8,1
- stdcx. r8,0,r9
- bne- 1b
- blr
-#endif
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
deleted file mode 100644
index 41cd68dee681..000000000000
--- a/arch/powerpc/mm/tlb_nohash.c
+++ /dev/null
@@ -1,692 +0,0 @@
-/*
- * This file contains the routines for TLB flushing.
- * On machines where the MMU does not use a hash table to store virtual to
- * physical translations (ie, SW loaded TLBs or Book3E compilant processors,
- * this does -not- include 603 however which shares the implementation with
- * hash based processors)
- *
- * -- BenH
- *
- * Copyright 2008,2009 Ben Herrenschmidt <benh@kernel.crashing.org>
- * IBM Corp.
- *
- * Derived from arch/ppc/mm/init.c:
- * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
- *
- * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
- * and Cort Dougan (PReP) (cort@cs.nmt.edu)
- * Copyright (C) 1996 Paul Mackerras
- *
- * Derived from "arch/i386/mm/init.c"
- * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/mm.h>
-#include <linux/init.h>
-#include <linux/highmem.h>
-#include <linux/pagemap.h>
-#include <linux/preempt.h>
-#include <linux/spinlock.h>
-#include <linux/memblock.h>
-#include <linux/of_fdt.h>
-#include <linux/hugetlb.h>
-
-#include <asm/tlbflush.h>
-#include <asm/tlb.h>
-#include <asm/code-patching.h>
-#include <asm/hugetlb.h>
-
-#include "mmu_decl.h"
-
-/*
- * This struct lists the sw-supported page sizes. The hardawre MMU may support
- * other sizes not listed here. The .ind field is only used on MMUs that have
- * indirect page table entries.
- */
-#ifdef CONFIG_PPC_BOOK3E_MMU
-#ifdef CONFIG_PPC_FSL_BOOK3E
-struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
- [MMU_PAGE_4K] = {
- .shift = 12,
- .enc = BOOK3E_PAGESZ_4K,
- },
- [MMU_PAGE_4M] = {
- .shift = 22,
- .enc = BOOK3E_PAGESZ_4M,
- },
- [MMU_PAGE_16M] = {
- .shift = 24,
- .enc = BOOK3E_PAGESZ_16M,
- },
- [MMU_PAGE_64M] = {
- .shift = 26,
- .enc = BOOK3E_PAGESZ_64M,
- },
- [MMU_PAGE_256M] = {
- .shift = 28,
- .enc = BOOK3E_PAGESZ_256M,
- },
- [MMU_PAGE_1G] = {
- .shift = 30,
- .enc = BOOK3E_PAGESZ_1GB,
- },
-};
-#else
-struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
- [MMU_PAGE_4K] = {
- .shift = 12,
- .ind = 20,
- .enc = BOOK3E_PAGESZ_4K,
- },
- [MMU_PAGE_16K] = {
- .shift = 14,
- .enc = BOOK3E_PAGESZ_16K,
- },
- [MMU_PAGE_64K] = {
- .shift = 16,
- .ind = 28,
- .enc = BOOK3E_PAGESZ_64K,
- },
- [MMU_PAGE_1M] = {
- .shift = 20,
- .enc = BOOK3E_PAGESZ_1M,
- },
- [MMU_PAGE_16M] = {
- .shift = 24,
- .ind = 36,
- .enc = BOOK3E_PAGESZ_16M,
- },
- [MMU_PAGE_256M] = {
- .shift = 28,
- .enc = BOOK3E_PAGESZ_256M,
- },
- [MMU_PAGE_1G] = {
- .shift = 30,
- .enc = BOOK3E_PAGESZ_1GB,
- },
-};
-#endif /* CONFIG_FSL_BOOKE */
-
-static inline int mmu_get_tsize(int psize)
-{
- return mmu_psize_defs[psize].enc;
-}
-#else
-static inline int mmu_get_tsize(int psize)
-{
- /* This isn't used on !Book3E for now */
- return 0;
-}
-#endif /* CONFIG_PPC_BOOK3E_MMU */
-
-/* The variables below are currently only used on 64-bit Book3E
- * though this will probably be made common with other nohash
- * implementations at some point
- */
-#ifdef CONFIG_PPC64
-
-int mmu_linear_psize; /* Page size used for the linear mapping */
-int mmu_pte_psize; /* Page size used for PTE pages */
-int mmu_vmemmap_psize; /* Page size used for the virtual mem map */
-int book3e_htw_enabled; /* Is HW tablewalk enabled ? */
-unsigned long linear_map_top; /* Top of linear mapping */
-
-#endif /* CONFIG_PPC64 */
-
-#ifdef CONFIG_PPC_FSL_BOOK3E
-/* next_tlbcam_idx is used to round-robin tlbcam entry assignment */
-DEFINE_PER_CPU(int, next_tlbcam_idx);
-EXPORT_PER_CPU_SYMBOL(next_tlbcam_idx);
-#endif
-
-/*
- * Base TLB flushing operations:
- *
- * - flush_tlb_mm(mm) flushes the specified mm context TLB's
- * - flush_tlb_page(vma, vmaddr) flushes one page
- * - flush_tlb_range(vma, start, end) flushes a range of pages
- * - flush_tlb_kernel_range(start, end) flushes kernel pages
- *
- * - local_* variants of page and mm only apply to the current
- * processor
- */
-
-/*
- * These are the base non-SMP variants of page and mm flushing
- */
-void local_flush_tlb_mm(struct mm_struct *mm)
-{
- unsigned int pid;
-
- preempt_disable();
- pid = mm->context.id;
- if (pid != MMU_NO_CONTEXT)
- _tlbil_pid(pid);
- preempt_enable();
-}
-EXPORT_SYMBOL(local_flush_tlb_mm);
-
-void __local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
- int tsize, int ind)
-{
- unsigned int pid;
-
- preempt_disable();
- pid = mm ? mm->context.id : 0;
- if (pid != MMU_NO_CONTEXT)
- _tlbil_va(vmaddr, pid, tsize, ind);
- preempt_enable();
-}
-
-void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
-{
- __local_flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr,
- mmu_get_tsize(mmu_virtual_psize), 0);
-}
-EXPORT_SYMBOL(local_flush_tlb_page);
-
-/*
- * And here are the SMP non-local implementations
- */
-#ifdef CONFIG_SMP
-
-static DEFINE_RAW_SPINLOCK(tlbivax_lock);
-
-static int mm_is_core_local(struct mm_struct *mm)
-{
- return cpumask_subset(mm_cpumask(mm),
- topology_thread_cpumask(smp_processor_id()));
-}
-
-struct tlb_flush_param {
- unsigned long addr;
- unsigned int pid;
- unsigned int tsize;
- unsigned int ind;
-};
-
-static void do_flush_tlb_mm_ipi(void *param)
-{
- struct tlb_flush_param *p = param;
-
- _tlbil_pid(p ? p->pid : 0);
-}
-
-static void do_flush_tlb_page_ipi(void *param)
-{
- struct tlb_flush_param *p = param;
-
- _tlbil_va(p->addr, p->pid, p->tsize, p->ind);
-}
-
-
-/* Note on invalidations and PID:
- *
- * We snapshot the PID with preempt disabled. At this point, it can still
- * change either because:
- * - our context is being stolen (PID -> NO_CONTEXT) on another CPU
- * - we are invaliating some target that isn't currently running here
- * and is concurrently acquiring a new PID on another CPU
- * - some other CPU is re-acquiring a lost PID for this mm
- * etc...
- *
- * However, this shouldn't be a problem as we only guarantee
- * invalidation of TLB entries present prior to this call, so we
- * don't care about the PID changing, and invalidating a stale PID
- * is generally harmless.
- */
-
-void flush_tlb_mm(struct mm_struct *mm)
-{
- unsigned int pid;
-
- preempt_disable();
- pid = mm->context.id;
- if (unlikely(pid == MMU_NO_CONTEXT))
- goto no_context;
- if (!mm_is_core_local(mm)) {
- struct tlb_flush_param p = { .pid = pid };
- /* Ignores smp_processor_id() even if set. */
- smp_call_function_many(mm_cpumask(mm),
- do_flush_tlb_mm_ipi, &p, 1);
- }
- _tlbil_pid(pid);
- no_context:
- preempt_enable();
-}
-EXPORT_SYMBOL(flush_tlb_mm);
-
-void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
- int tsize, int ind)
-{
- struct cpumask *cpu_mask;
- unsigned int pid;
-
- preempt_disable();
- pid = mm ? mm->context.id : 0;
- if (unlikely(pid == MMU_NO_CONTEXT))
- goto bail;
- cpu_mask = mm_cpumask(mm);
- if (!mm_is_core_local(mm)) {
- /* If broadcast tlbivax is supported, use it */
- if (mmu_has_feature(MMU_FTR_USE_TLBIVAX_BCAST)) {
- int lock = mmu_has_feature(MMU_FTR_LOCK_BCAST_INVAL);
- if (lock)
- raw_spin_lock(&tlbivax_lock);
- _tlbivax_bcast(vmaddr, pid, tsize, ind);
- if (lock)
- raw_spin_unlock(&tlbivax_lock);
- goto bail;
- } else {
- struct tlb_flush_param p = {
- .pid = pid,
- .addr = vmaddr,
- .tsize = tsize,
- .ind = ind,
- };
- /* Ignores smp_processor_id() even if set in cpu_mask */
- smp_call_function_many(cpu_mask,
- do_flush_tlb_page_ipi, &p, 1);
- }
- }
- _tlbil_va(vmaddr, pid, tsize, ind);
- bail:
- preempt_enable();
-}
-
-void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
-{
-#ifdef CONFIG_HUGETLB_PAGE
- if (is_vm_hugetlb_page(vma))
- flush_hugetlb_page(vma, vmaddr);
-#endif
-
- __flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr,
- mmu_get_tsize(mmu_virtual_psize), 0);
-}
-EXPORT_SYMBOL(flush_tlb_page);
-
-#endif /* CONFIG_SMP */
-
-#ifdef CONFIG_PPC_47x
-void __init early_init_mmu_47x(void)
-{
-#ifdef CONFIG_SMP
- unsigned long root = of_get_flat_dt_root();
- if (of_get_flat_dt_prop(root, "cooperative-partition", NULL))
- mmu_clear_feature(MMU_FTR_USE_TLBIVAX_BCAST);
-#endif /* CONFIG_SMP */
-}
-#endif /* CONFIG_PPC_47x */
-
-/*
- * Flush kernel TLB entries in the given range
- */
-void flush_tlb_kernel_range(unsigned long start, unsigned long end)
-{
-#ifdef CONFIG_SMP
- preempt_disable();
- smp_call_function(do_flush_tlb_mm_ipi, NULL, 1);
- _tlbil_pid(0);
- preempt_enable();
-#else
- _tlbil_pid(0);
-#endif
-}
-EXPORT_SYMBOL(flush_tlb_kernel_range);
-
-/*
- * Currently, for range flushing, we just do a full mm flush. This should
- * be optimized based on a threshold on the size of the range, since
- * some implementation can stack multiple tlbivax before a tlbsync but
- * for now, we keep it that way
- */
-void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
- unsigned long end)
-
-{
- flush_tlb_mm(vma->vm_mm);
-}
-EXPORT_SYMBOL(flush_tlb_range);
-
-void tlb_flush(struct mmu_gather *tlb)
-{
- flush_tlb_mm(tlb->mm);
-}
-
-/*
- * Below are functions specific to the 64-bit variant of Book3E though that
- * may change in the future
- */
-
-#ifdef CONFIG_PPC64
-
-/*
- * Handling of virtual linear page tables or indirect TLB entries
- * flushing when PTE pages are freed
- */
-void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address)
-{
- int tsize = mmu_psize_defs[mmu_pte_psize].enc;
-
- if (book3e_htw_enabled) {
- unsigned long start = address & PMD_MASK;
- unsigned long end = address + PMD_SIZE;
- unsigned long size = 1UL << mmu_psize_defs[mmu_pte_psize].shift;
-
- /* This isn't the most optimal, ideally we would factor out the
- * while preempt & CPU mask mucking around, or even the IPI but
- * it will do for now
- */
- while (start < end) {
- __flush_tlb_page(tlb->mm, start, tsize, 1);
- start += size;
- }
- } else {
- unsigned long rmask = 0xf000000000000000ul;
- unsigned long rid = (address & rmask) | 0x1000000000000000ul;
- unsigned long vpte = address & ~rmask;
-
-#ifdef CONFIG_PPC_64K_PAGES
- vpte = (vpte >> (PAGE_SHIFT - 4)) & ~0xfffful;
-#else
- vpte = (vpte >> (PAGE_SHIFT - 3)) & ~0xffful;
-#endif
- vpte |= rid;
- __flush_tlb_page(tlb->mm, vpte, tsize, 0);
- }
-}
-
-static void setup_page_sizes(void)
-{
- unsigned int tlb0cfg;
- unsigned int tlb0ps;
- unsigned int eptcfg;
- int i, psize;
-
-#ifdef CONFIG_PPC_FSL_BOOK3E
- unsigned int mmucfg = mfspr(SPRN_MMUCFG);
- int fsl_mmu = mmu_has_feature(MMU_FTR_TYPE_FSL_E);
-
- if (fsl_mmu && (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V1) {
- unsigned int tlb1cfg = mfspr(SPRN_TLB1CFG);
- unsigned int min_pg, max_pg;
-
- min_pg = (tlb1cfg & TLBnCFG_MINSIZE) >> TLBnCFG_MINSIZE_SHIFT;
- max_pg = (tlb1cfg & TLBnCFG_MAXSIZE) >> TLBnCFG_MAXSIZE_SHIFT;
-
- for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
- struct mmu_psize_def *def;
- unsigned int shift;
-
- def = &mmu_psize_defs[psize];
- shift = def->shift;
-
- if (shift == 0)
- continue;
-
- /* adjust to be in terms of 4^shift Kb */
- shift = (shift - 10) >> 1;
-
- if ((shift >= min_pg) && (shift <= max_pg))
- def->flags |= MMU_PAGE_SIZE_DIRECT;
- }
-
- goto no_indirect;
- }
-
- if (fsl_mmu && (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2) {
- u32 tlb1ps = mfspr(SPRN_TLB1PS);
-
- for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
- struct mmu_psize_def *def = &mmu_psize_defs[psize];
-
- if (tlb1ps & (1U << (def->shift - 10))) {
- def->flags |= MMU_PAGE_SIZE_DIRECT;
- }
- }
-
- goto no_indirect;
- }
-#endif
-
- tlb0cfg = mfspr(SPRN_TLB0CFG);
- tlb0ps = mfspr(SPRN_TLB0PS);
- eptcfg = mfspr(SPRN_EPTCFG);
-
- /* Look for supported direct sizes */
- for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
- struct mmu_psize_def *def = &mmu_psize_defs[psize];
-
- if (tlb0ps & (1U << (def->shift - 10)))
- def->flags |= MMU_PAGE_SIZE_DIRECT;
- }
-
- /* Indirect page sizes supported ? */
- if ((tlb0cfg & TLBnCFG_IND) == 0)
- goto no_indirect;
-
- /* Now, we only deal with one IND page size for each
- * direct size. Hopefully all implementations today are
- * unambiguous, but we might want to be careful in the
- * future.
- */
- for (i = 0; i < 3; i++) {
- unsigned int ps, sps;
-
- sps = eptcfg & 0x1f;
- eptcfg >>= 5;
- ps = eptcfg & 0x1f;
- eptcfg >>= 5;
- if (!ps || !sps)
- continue;
- for (psize = 0; psize < MMU_PAGE_COUNT; psize++) {
- struct mmu_psize_def *def = &mmu_psize_defs[psize];
-
- if (ps == (def->shift - 10))
- def->flags |= MMU_PAGE_SIZE_INDIRECT;
- if (sps == (def->shift - 10))
- def->ind = ps + 10;
- }
- }
- no_indirect:
-
- /* Cleanup array and print summary */
- pr_info("MMU: Supported page sizes\n");
- for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
- struct mmu_psize_def *def = &mmu_psize_defs[psize];
- const char *__page_type_names[] = {
- "unsupported",
- "direct",
- "indirect",
- "direct & indirect"
- };
- if (def->flags == 0) {
- def->shift = 0;
- continue;
- }
- pr_info(" %8ld KB as %s\n", 1ul << (def->shift - 10),
- __page_type_names[def->flags & 0x3]);
- }
-}
-
-static void __patch_exception(int exc, unsigned long addr)
-{
- extern unsigned int interrupt_base_book3e;
- unsigned int *ibase = &interrupt_base_book3e;
-
- /* Our exceptions vectors start with a NOP and -then- a branch
- * to deal with single stepping from userspace which stops on
- * the second instruction. Thus we need to patch the second
- * instruction of the exception, not the first one
- */
-
- patch_branch(ibase + (exc / 4) + 1, addr, 0);
-}
-
-#define patch_exception(exc, name) do { \
- extern unsigned int name; \
- __patch_exception((exc), (unsigned long)&name); \
-} while (0)
-
-static void setup_mmu_htw(void)
-{
- /* Check if HW tablewalk is present, and if yes, enable it by:
- *
- * - patching the TLB miss handlers to branch to the
- * one dedicates to it
- *
- * - setting the global book3e_htw_enabled
- */
- unsigned int tlb0cfg = mfspr(SPRN_TLB0CFG);
-
- if ((tlb0cfg & TLBnCFG_IND) &&
- (tlb0cfg & TLBnCFG_PT)) {
- patch_exception(0x1c0, exc_data_tlb_miss_htw_book3e);
- patch_exception(0x1e0, exc_instruction_tlb_miss_htw_book3e);
- book3e_htw_enabled = 1;
- }
- pr_info("MMU: Book3E HW tablewalk %s\n",
- book3e_htw_enabled ? "enabled" : "not supported");
-}
-
-/*
- * Early initialization of the MMU TLB code
- */
-static void __early_init_mmu(int boot_cpu)
-{
- unsigned int mas4;
-
- /* XXX This will have to be decided at runtime, but right
- * now our boot and TLB miss code hard wires it. Ideally
- * we should find out a suitable page size and patch the
- * TLB miss code (either that or use the PACA to store
- * the value we want)
- */
- mmu_linear_psize = MMU_PAGE_1G;
-
- /* XXX This should be decided at runtime based on supported
- * page sizes in the TLB, but for now let's assume 16M is
- * always there and a good fit (which it probably is)
- */
- mmu_vmemmap_psize = MMU_PAGE_16M;
-
- /* XXX This code only checks for TLB 0 capabilities and doesn't
- * check what page size combos are supported by the HW. It
- * also doesn't handle the case where a separate array holds
- * the IND entries from the array loaded by the PT.
- */
- if (boot_cpu) {
- /* Look for supported page sizes */
- setup_page_sizes();
-
- /* Look for HW tablewalk support */
- setup_mmu_htw();
- }
-
- /* Set MAS4 based on page table setting */
-
- mas4 = 0x4 << MAS4_WIMGED_SHIFT;
- if (book3e_htw_enabled) {
- mas4 |= mas4 | MAS4_INDD;
-#ifdef CONFIG_PPC_64K_PAGES
- mas4 |= BOOK3E_PAGESZ_256M << MAS4_TSIZED_SHIFT;
- mmu_pte_psize = MMU_PAGE_256M;
-#else
- mas4 |= BOOK3E_PAGESZ_1M << MAS4_TSIZED_SHIFT;
- mmu_pte_psize = MMU_PAGE_1M;
-#endif
- } else {
-#ifdef CONFIG_PPC_64K_PAGES
- mas4 |= BOOK3E_PAGESZ_64K << MAS4_TSIZED_SHIFT;
-#else
- mas4 |= BOOK3E_PAGESZ_4K << MAS4_TSIZED_SHIFT;
-#endif
- mmu_pte_psize = mmu_virtual_psize;
- }
- mtspr(SPRN_MAS4, mas4);
-
- /* Set the global containing the top of the linear mapping
- * for use by the TLB miss code
- */
- linear_map_top = memblock_end_of_DRAM();
-
-#ifdef CONFIG_PPC_FSL_BOOK3E
- if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
- unsigned int num_cams;
-
- /* use a quarter of the TLBCAM for bolted linear map */
- num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4;
- linear_map_top = map_mem_in_cams(linear_map_top, num_cams);
-
- /* limit memory so we dont have linear faults */
- memblock_enforce_memory_limit(linear_map_top);
-
- patch_exception(0x1c0, exc_data_tlb_miss_bolted_book3e);
- patch_exception(0x1e0, exc_instruction_tlb_miss_bolted_book3e);
- }
-#endif
-
- /* A sync won't hurt us after mucking around with
- * the MMU configuration
- */
- mb();
-
- memblock_set_current_limit(linear_map_top);
-}
-
-void __init early_init_mmu(void)
-{
- __early_init_mmu(1);
-}
-
-void early_init_mmu_secondary(void)
-{
- __early_init_mmu(0);
-}
-
-void setup_initial_memory_limit(phys_addr_t first_memblock_base,
- phys_addr_t first_memblock_size)
-{
- /* On non-FSL Embedded 64-bit, we adjust the RMA size to match
- * the bolted TLB entry. We know for now that only 1G
- * entries are supported though that may eventually
- * change.
- *
- * on FSL Embedded 64-bit, we adjust the RMA size to match the
- * first bolted TLB entry size. We still limit max to 1G even if
- * the TLB could cover more. This is due to what the early init
- * code is setup to do.
- *
- * We crop it to the size of the first MEMBLOCK to
- * avoid going over total available memory just in case...
- */
-#ifdef CONFIG_PPC_FSL_BOOK3E
- if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
- unsigned long linear_sz;
- linear_sz = calc_cam_sz(first_memblock_size, PAGE_OFFSET,
- first_memblock_base);
- ppc64_rma_size = min_t(u64, linear_sz, 0x40000000);
- } else
-#endif
- ppc64_rma_size = min_t(u64, first_memblock_size, 0x40000000);
-
- /* Finally limit subsequent allocations */
- memblock_set_current_limit(first_memblock_base + ppc64_rma_size);
-}
-#else /* ! CONFIG_PPC64 */
-void __init early_init_mmu(void)
-{
-#ifdef CONFIG_PPC_47x
- early_init_mmu_47x();
-#endif
-}
-#endif /* CONFIG_PPC64 */