summaryrefslogtreecommitdiff
path: root/arch/powerpc/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r--arch/powerpc/mm/Makefile7
-rw-r--r--arch/powerpc/mm/book3s32/Makefile1
-rw-r--r--arch/powerpc/mm/book3s32/hash_low.S42
-rw-r--r--arch/powerpc/mm/book3s32/kuap.c25
-rw-r--r--arch/powerpc/mm/book3s32/kuep.c20
-rw-r--r--arch/powerpc/mm/book3s32/mmu.c64
-rw-r--r--arch/powerpc/mm/book3s32/mmu_context.c15
-rw-r--r--arch/powerpc/mm/book3s32/tlb.c20
-rw-r--r--arch/powerpc/mm/book3s64/Makefile30
-rw-r--r--arch/powerpc/mm/book3s64/hash_4k.c5
-rw-r--r--arch/powerpc/mm/book3s64/hash_64k.c10
-rw-r--r--arch/powerpc/mm/book3s64/hash_hugepage.c17
-rw-r--r--arch/powerpc/mm/book3s64/hash_native.c179
-rw-r--r--arch/powerpc/mm/book3s64/hash_pgtable.c21
-rw-r--r--arch/powerpc/mm/book3s64/hash_tlb.c6
-rw-r--r--arch/powerpc/mm/book3s64/hash_utils.c788
-rw-r--r--arch/powerpc/mm/book3s64/hugetlbpage.c (renamed from arch/powerpc/mm/book3s64/hash_hugetlbpage.c)21
-rw-r--r--arch/powerpc/mm/book3s64/internal.h18
-rw-r--r--arch/powerpc/mm/book3s64/iommu_api.c72
-rw-r--r--arch/powerpc/mm/book3s64/mmu_context.c46
-rw-r--r--arch/powerpc/mm/book3s64/pgtable.c267
-rw-r--r--arch/powerpc/mm/book3s64/pkeys.c8
-rw-r--r--arch/powerpc/mm/book3s64/radix_hugetlbpage.c69
-rw-r--r--arch/powerpc/mm/book3s64/radix_pgtable.c882
-rw-r--r--arch/powerpc/mm/book3s64/radix_tlb.c395
-rw-r--r--arch/powerpc/mm/book3s64/slb.c133
-rw-r--r--arch/powerpc/mm/book3s64/slice.c (renamed from arch/powerpc/mm/slice.c)101
-rw-r--r--arch/powerpc/mm/book3s64/subpage_prot.c18
-rw-r--r--arch/powerpc/mm/book3s64/trace.c7
-rw-r--r--arch/powerpc/mm/cacheflush.c45
-rw-r--r--arch/powerpc/mm/copro_fault.c30
-rw-r--r--arch/powerpc/mm/drmem.c64
-rw-r--r--arch/powerpc/mm/fault.c194
-rw-r--r--arch/powerpc/mm/hugetlbpage.c542
-rw-r--r--arch/powerpc/mm/init-common.c42
-rw-r--r--arch/powerpc/mm/init_32.c58
-rw-r--r--arch/powerpc/mm/init_64.c243
-rw-r--r--arch/powerpc/mm/ioremap.c63
-rw-r--r--arch/powerpc/mm/ioremap_32.c19
-rw-r--r--arch/powerpc/mm/ioremap_64.c12
-rw-r--r--arch/powerpc/mm/kasan/8xx.c21
-rw-r--r--arch/powerpc/mm/kasan/Makefile5
-rw-r--r--arch/powerpc/mm/kasan/book3s_32.c58
-rw-r--r--arch/powerpc/mm/kasan/init_32.c (renamed from arch/powerpc/mm/kasan/kasan_init_32.c)10
-rw-r--r--arch/powerpc/mm/kasan/init_book3e_64.c133
-rw-r--r--arch/powerpc/mm/kasan/init_book3s_64.c100
-rw-r--r--arch/powerpc/mm/maccess.c17
-rw-r--r--arch/powerpc/mm/mem.c193
-rw-r--r--arch/powerpc/mm/mmap.c228
-rw-r--r--arch/powerpc/mm/mmu_context.c19
-rw-r--r--arch/powerpc/mm/mmu_decl.h66
-rw-r--r--arch/powerpc/mm/nohash/40x.c152
-rw-r--r--arch/powerpc/mm/nohash/44x.c22
-rw-r--r--arch/powerpc/mm/nohash/8xx.c142
-rw-r--r--arch/powerpc/mm/nohash/Makefile13
-rw-r--r--arch/powerpc/mm/nohash/book3e_pgtable.c27
-rw-r--r--arch/powerpc/mm/nohash/e500.c (renamed from arch/powerpc/mm/nohash/fsl_booke.c)102
-rw-r--r--arch/powerpc/mm/nohash/e500_hugetlbpage.c (renamed from arch/powerpc/mm/nohash/book3e_hugetlbpage.c)39
-rw-r--r--arch/powerpc/mm/nohash/kaslr_booke.c28
-rw-r--r--arch/powerpc/mm/nohash/kup.c27
-rw-r--r--arch/powerpc/mm/nohash/mmu_context.c44
-rw-r--r--arch/powerpc/mm/nohash/tlb.c517
-rw-r--r--arch/powerpc/mm/nohash/tlb_64e.c314
-rw-r--r--arch/powerpc/mm/nohash/tlb_low.S49
-rw-r--r--arch/powerpc/mm/nohash/tlb_low_64e.S525
-rw-r--r--arch/powerpc/mm/numa.c604
-rw-r--r--arch/powerpc/mm/pageattr.c99
-rw-r--r--arch/powerpc/mm/pgtable-frag.c83
-rw-r--r--arch/powerpc/mm/pgtable.c257
-rw-r--r--arch/powerpc/mm/pgtable_32.c87
-rw-r--r--arch/powerpc/mm/pgtable_64.c24
-rw-r--r--arch/powerpc/mm/ptdump/8xx.c18
-rw-r--r--arch/powerpc/mm/ptdump/Makefile13
-rw-r--r--arch/powerpc/mm/ptdump/bats.c18
-rw-r--r--arch/powerpc/mm/ptdump/book3s64.c13
-rw-r--r--arch/powerpc/mm/ptdump/hashpagetable.c25
-rw-r--r--arch/powerpc/mm/ptdump/ptdump.c247
-rw-r--r--arch/powerpc/mm/ptdump/ptdump.h5
-rw-r--r--arch/powerpc/mm/ptdump/segment_regs.c16
-rw-r--r--arch/powerpc/mm/ptdump/shared.c29
80 files changed, 4638 insertions, 4350 deletions
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index eae4ec2988fc..8c1582b2987d 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -3,9 +3,7 @@
# Makefile for the linux ppc-specific parts of the memory manager.
#
-ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC)
-
-obj-y := fault.o mem.o pgtable.o mmap.o maccess.o pageattr.o \
+obj-y := fault.o mem.o pgtable.o maccess.o pageattr.o \
init_$(BITS).o pgtable_$(BITS).o \
pgtable-frag.o ioremap.o ioremap_$(BITS).o \
init-common.o mmu_context.o drmem.o \
@@ -14,9 +12,8 @@ obj-$(CONFIG_PPC_MMU_NOHASH) += nohash/
obj-$(CONFIG_PPC_BOOK3S_32) += book3s32/
obj-$(CONFIG_PPC_BOOK3S_64) += book3s64/
obj-$(CONFIG_NUMA) += numa.o
-obj-$(CONFIG_PPC_MM_SLICES) += slice.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o
-obj-$(CONFIG_PPC_PTDUMP) += ptdump/
+obj-$(CONFIG_PTDUMP) += ptdump/
obj-$(CONFIG_KASAN) += kasan/
diff --git a/arch/powerpc/mm/book3s32/Makefile b/arch/powerpc/mm/book3s32/Makefile
index 15f4773643d2..50dd8f6bdf46 100644
--- a/arch/powerpc/mm/book3s32/Makefile
+++ b/arch/powerpc/mm/book3s32/Makefile
@@ -9,5 +9,4 @@ endif
obj-y += mmu.o mmu_context.o
obj-$(CONFIG_PPC_BOOK3S_603) += nohash_low.o
obj-$(CONFIG_PPC_BOOK3S_604) += hash_low.o tlb.o
-obj-$(CONFIG_PPC_KUEP) += kuep.o
obj-$(CONFIG_PPC_KUAP) += kuap.o
diff --git a/arch/powerpc/mm/book3s32/hash_low.S b/arch/powerpc/mm/book3s32/hash_low.S
index 6925ce998557..4ed0efd03db5 100644
--- a/arch/powerpc/mm/book3s32/hash_low.S
+++ b/arch/powerpc/mm/book3s32/hash_low.S
@@ -14,6 +14,7 @@
* hash table, so this file is not used on them.)
*/
+#include <linux/export.h>
#include <linux/pgtable.h>
#include <linux/init.h>
#include <asm/reg.h>
@@ -22,7 +23,6 @@
#include <asm/ppc_asm.h>
#include <asm/thread_info.h>
#include <asm/asm-offsets.h>
-#include <asm/export.h>
#include <asm/feature-fixups.h>
#include <asm/code-patching-asm.h>
@@ -36,8 +36,9 @@
/*
* Load a PTE into the hash table, if possible.
- * The address is in r4, and r3 contains an access flag:
- * _PAGE_RW (0x400) if a write.
+ * The address is in r4, and r3 contains required access flags:
+ * - For ISI: _PAGE_PRESENT | _PAGE_EXEC
+ * - For DSI: _PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE if a write.
* r9 contains the SRR1 value, from which we use the MSR_PR bit.
* SPRG_THREAD contains the physical address of the current task's thread.
*
@@ -67,12 +68,16 @@ _GLOBAL(hash_page)
lis r0, TASK_SIZE@h /* check if kernel address */
cmplw 0,r4,r0
mfspr r8,SPRN_SPRG_THREAD /* current task's THREAD (phys) */
- ori r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */
lwz r5,PGDIR(r8) /* virt page-table root */
blt+ 112f /* assume user more likely */
lis r5,swapper_pg_dir@ha /* if kernel address, use */
+ andi. r0,r9,MSR_PR /* Check usermode */
addi r5,r5,swapper_pg_dir@l /* kernel page table */
- rlwimi r3,r9,32-12,29,29 /* MSR_PR -> _PAGE_USER */
+#ifdef CONFIG_SMP
+ bne- .Lhash_page_out /* return if usermode */
+#else
+ bnelr-
+#endif
112: tophys(r5, r5)
#ifndef CONFIG_PTE_64BIT
rlwimi r5,r4,12,20,29 /* insert top 10 bits of address */
@@ -113,15 +118,15 @@ _GLOBAL(hash_page)
lwarx r6,0,r8 /* get linux-style pte, flag word */
#ifdef CONFIG_PPC_KUAP
mfsrin r5,r4
- rlwinm r0,r9,28,_PAGE_RW /* MSR[PR] => _PAGE_RW */
- rlwinm r5,r5,12,_PAGE_RW /* Ks => _PAGE_RW */
+ rlwinm r0,r9,28,_PAGE_WRITE /* MSR[PR] => _PAGE_WRITE */
+ rlwinm r5,r5,12,_PAGE_WRITE /* Ks => _PAGE_WRITE */
andc r5,r5,r0 /* Ks & ~MSR[PR] */
- andc r5,r6,r5 /* Clear _PAGE_RW when Ks = 1 && MSR[PR] = 0 */
+ andc r5,r6,r5 /* Clear _PAGE_WRITE when Ks = 1 && MSR[PR] = 0 */
andc. r5,r3,r5 /* check access & ~permission */
#else
andc. r5,r3,r6 /* check access & ~permission */
#endif
- rlwinm r0,r3,32-3,24,24 /* _PAGE_RW access -> _PAGE_DIRTY */
+ rlwinm r0,r3,32-3,24,24 /* _PAGE_WRITE access -> _PAGE_DIRTY */
ori r0,r0,_PAGE_ACCESSED|_PAGE_HASHPTE
#ifdef CONFIG_SMP
bne- .Lhash_page_out /* return if access not permitted */
@@ -199,12 +204,12 @@ _GLOBAL(add_hash_page)
lis r6, (mmu_hash_lock - PAGE_OFFSET)@ha
addi r6, r6, (mmu_hash_lock - PAGE_OFFSET)@l
10: lwarx r0,0,r6 /* take the mmu_hash_lock */
- cmpi 0,r0,0
+ cmpwi 0,r0,0
bne- 11f
stwcx. r8,0,r6
beq+ 12f
11: lwz r0,0(r6)
- cmpi 0,r0,0
+ cmpwi 0,r0,0
beq 10b
b 11b
12: isync
@@ -307,12 +312,15 @@ Hash_msk = (((1 << Hash_bits) - 1) * 64)
__REF
_GLOBAL(create_hpte)
/* Convert linux-style PTE (r5) to low word of PPC-style PTE (r8) */
- rlwinm r8,r5,32-9,30,30 /* _PAGE_RW -> PP msb */
+ lis r0, TASK_SIZE@h
+ rlwinm r5,r5,0,~3 /* Clear PP bits */
+ cmplw r4,r0
+ rlwinm r8,r5,32-9,30,30 /* _PAGE_WRITE -> PP msb */
rlwinm r0,r5,32-6,30,30 /* _PAGE_DIRTY -> PP msb */
and r8,r8,r0 /* writable if _RW & _DIRTY */
- rlwimi r5,r5,32-1,30,30 /* _PAGE_USER -> PP msb */
- rlwimi r5,r5,32-2,31,31 /* _PAGE_USER -> PP lsb */
- ori r8,r8,0xe04 /* clear out reserved bits */
+ bge- 1f /* Kernelspace ? Skip */
+ ori r5,r5,3 /* Userspace ? PP = 3 */
+1: ori r8,r8,0xe04 /* clear out reserved bits */
andc r8,r5,r8 /* PP = user? (rw&dirty? 1: 3): 0 */
BEGIN_FTR_SECTION
rlwinm r8,r8,0,~_PAGE_COHERENT /* clear M (coherence not required) */
@@ -512,12 +520,12 @@ _GLOBAL(flush_hash_pages)
lwz r8, TASK_CPU(r8)
oris r8,r8,9
10: lwarx r0,0,r9
- cmpi 0,r0,0
+ cmpwi 0,r0,0
bne- 11f
stwcx. r8,0,r9
beq+ 12f
11: lwz r0,0(r9)
- cmpi 0,r0,0
+ cmpwi 0,r0,0
beq 10b
b 11b
12: isync
diff --git a/arch/powerpc/mm/book3s32/kuap.c b/arch/powerpc/mm/book3s32/kuap.c
index 0f920f09af57..3a8815555a48 100644
--- a/arch/powerpc/mm/book3s32/kuap.c
+++ b/arch/powerpc/mm/book3s32/kuap.c
@@ -3,31 +3,20 @@
#include <asm/kup.h>
#include <asm/smp.h>
-struct static_key_false disable_kuap_key;
-EXPORT_SYMBOL(disable_kuap_key);
-
-void kuap_lock_all_ool(void)
-{
- kuap_lock_all();
-}
-EXPORT_SYMBOL(kuap_lock_all_ool);
-
-void kuap_unlock_all_ool(void)
-{
- kuap_unlock_all();
-}
-EXPORT_SYMBOL(kuap_unlock_all_ool);
-
void setup_kuap(bool disabled)
{
- if (!disabled)
- kuap_lock_all_ool();
+ if (!disabled) {
+ update_user_segments(mfsr(0) | SR_KS);
+ isync(); /* Context sync required after mtsr() */
+ init_mm.context.sr0 |= SR_KS;
+ current->thread.sr0 |= SR_KS;
+ }
if (smp_processor_id() != boot_cpuid)
return;
if (disabled)
- static_branch_enable(&disable_kuap_key);
+ cur_cpu_spec->mmu_features &= ~MMU_FTR_KUAP;
else
pr_info("Activating Kernel Userspace Access Protection\n");
}
diff --git a/arch/powerpc/mm/book3s32/kuep.c b/arch/powerpc/mm/book3s32/kuep.c
deleted file mode 100644
index c20733d6e02c..000000000000
--- a/arch/powerpc/mm/book3s32/kuep.c
+++ /dev/null
@@ -1,20 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-#include <asm/kup.h>
-#include <asm/smp.h>
-
-struct static_key_false disable_kuep_key;
-
-void setup_kuep(bool disabled)
-{
- if (!disabled)
- kuep_lock();
-
- if (smp_processor_id() != boot_cpuid)
- return;
-
- if (disabled)
- static_branch_enable(&disable_kuep_key);
- else
- pr_info("Activating Kernel Userspace Execution Prevention\n");
-}
diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
index 27061583a010..c42ecdf94e48 100644
--- a/arch/powerpc/mm/book3s32/mmu.c
+++ b/arch/powerpc/mm/book3s32/mmu.c
@@ -23,10 +23,9 @@
#include <linux/highmem.h>
#include <linux/memblock.h>
-#include <asm/prom.h>
#include <asm/mmu.h>
#include <asm/machdep.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <asm/sections.h>
#include <mm/mmu_decl.h>
@@ -76,7 +75,7 @@ unsigned long p_block_mapped(phys_addr_t pa)
return 0;
}
-static int find_free_bat(void)
+int __init find_free_bat(void)
{
int b;
int n = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4;
@@ -100,7 +99,7 @@ static int find_free_bat(void)
* - block size has to be a power of two. This is calculated by finding the
* highest bit set to 1.
*/
-static unsigned int block_size(unsigned long base, unsigned long top)
+unsigned int bat_block_size(unsigned long base, unsigned long top)
{
unsigned int max_size = SZ_256M;
unsigned int base_shift = (ffs(base) - 1) & 31;
@@ -128,7 +127,7 @@ static void setibat(int index, unsigned long virt, phys_addr_t phys,
wimgxpp = (flags & _PAGE_COHERENT) | (_PAGE_EXEC ? BPP_RX : BPP_XX);
bat[0].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */
bat[0].batl = BAT_PHYS_ADDR(phys) | wimgxpp;
- if (flags & _PAGE_USER)
+ if (!is_kernel_addr(virt))
bat[0].batu |= 1; /* Vp = 1 */
}
@@ -145,7 +144,7 @@ static unsigned long __init __mmu_mapin_ram(unsigned long base, unsigned long to
int idx;
while ((idx = find_free_bat()) != -1 && base != top) {
- unsigned int size = block_size(base, top);
+ unsigned int size = bat_block_size(base, top);
if (size < 128 << 10)
break;
@@ -159,10 +158,13 @@ static unsigned long __init __mmu_mapin_ram(unsigned long base, unsigned long to
unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top)
{
unsigned long done;
- unsigned long border = (unsigned long)__init_begin - PAGE_OFFSET;
+ unsigned long border = (unsigned long)__srwx_boundary - PAGE_OFFSET;
+ unsigned long size;
+ size = roundup_pow_of_two((unsigned long)_einittext - PAGE_OFFSET);
+ setibat(0, PAGE_OFFSET, 0, size, PAGE_KERNEL_X);
- if (debug_pagealloc_enabled_or_kfence() || __map_without_bats) {
+ if (debug_pagealloc_enabled_or_kfence()) {
pr_debug_once("Read-Write memory mapped without BATs\n");
if (base >= border)
return base;
@@ -182,7 +184,7 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top)
static bool is_module_segment(unsigned long addr)
{
- if (!IS_ENABLED(CONFIG_MODULES))
+ if (!IS_ENABLED(CONFIG_EXECMEM))
return false;
if (addr < ALIGN_DOWN(MODULES_VADDR, SZ_256M))
return false;
@@ -191,30 +193,29 @@ static bool is_module_segment(unsigned long addr)
return true;
}
-void mmu_mark_initmem_nx(void)
+int mmu_mark_initmem_nx(void)
{
int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4;
int i;
unsigned long base = (unsigned long)_stext - PAGE_OFFSET;
- unsigned long top = (unsigned long)_etext - PAGE_OFFSET;
+ unsigned long top = ALIGN((unsigned long)_etext - PAGE_OFFSET, SZ_128K);
unsigned long border = (unsigned long)__init_begin - PAGE_OFFSET;
unsigned long size;
- for (i = 0; i < nb - 1 && base < top && top - base > (128 << 10);) {
- size = block_size(base, top);
- setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT);
+ for (i = 0; i < nb - 1 && base < top;) {
+ size = bat_block_size(base, top);
+ setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_X);
base += size;
}
if (base < top) {
- size = block_size(base, top);
- size = max(size, 128UL << 10);
+ size = bat_block_size(base, top);
if ((top - base) > size) {
size <<= 1;
if (strict_kernel_rwx_enabled() && base + size > border)
pr_warn("Some RW data is getting mapped X. "
"Adjust CONFIG_DATA_SHIFT to avoid that.\n");
}
- setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT);
+ setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_X);
base += size;
}
for (; i < nb; i++)
@@ -222,6 +223,8 @@ void mmu_mark_initmem_nx(void)
update_bats();
+ BUILD_BUG_ON(ALIGN_DOWN(MODULES_VADDR, SZ_256M) < TASK_SIZE);
+
for (i = TASK_SIZE >> 28; i < 16; i++) {
/* Do not set NX on VM space for modules */
if (is_module_segment(i << 28))
@@ -229,9 +232,10 @@ void mmu_mark_initmem_nx(void)
mtsr(mfsr(i << 28) | 0x10000000, i << 28);
}
+ return 0;
}
-void mmu_mark_rodata_ro(void)
+int mmu_mark_rodata_ro(void)
{
int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4;
int i;
@@ -239,18 +243,19 @@ void mmu_mark_rodata_ro(void)
for (i = 0; i < nb; i++) {
struct ppc_bat *bat = BATS[i];
- if (bat_addrs[i].start < (unsigned long)__init_begin)
+ if (bat_addrs[i].start < (unsigned long)__end_rodata)
bat[1].batl = (bat[1].batl & ~BPP_RW) | BPP_RX;
}
update_bats();
+
+ return 0;
}
/*
- * Set up one of the I/D BAT (block address translation) register pairs.
+ * Set up one of the D BAT (block address translation) register pairs.
* The parameters are not checked; in particular size must be a power
* of 2 between 128k and 256M.
- * On 603+, only set IBAT when _PAGE_EXEC is set
*/
void __init setbat(int index, unsigned long virt, phys_addr_t phys,
unsigned int size, pgprot_t prot)
@@ -277,19 +282,15 @@ void __init setbat(int index, unsigned long virt, phys_addr_t phys,
/* Do DBAT first */
wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE
| _PAGE_COHERENT | _PAGE_GUARDED);
- wimgxpp |= (flags & _PAGE_RW)? BPP_RW: BPP_RX;
+ wimgxpp |= (flags & _PAGE_WRITE) ? BPP_RW : BPP_RX;
bat[1].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */
bat[1].batl = BAT_PHYS_ADDR(phys) | wimgxpp;
- if (flags & _PAGE_USER)
+ if (!is_kernel_addr(virt))
bat[1].batu |= 1; /* Vp = 1 */
if (flags & _PAGE_GUARDED) {
/* G bit must be zero in IBATs */
flags &= ~_PAGE_EXEC;
}
- if (flags & _PAGE_EXEC)
- bat[0] = bat[1];
- else
- bat[0].batu = bat[0].batl = 0;
bat_addrs[index].start = virt;
bat_addrs[index].limit = virt + ((bl + 1) << 17) - 1;
@@ -318,11 +319,9 @@ static void hash_preload(struct mm_struct *mm, unsigned long ea)
*
* This must always be called with the pte lock held.
*/
-void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
+void __update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
pte_t *ptep)
{
- if (!mmu_has_feature(MMU_FTR_HPTE_TABLE))
- return;
/*
* We don't need to worry about _PAGE_PRESENT here because we are
* called with either mm->page_table_lock held or ptl lock held
@@ -378,10 +377,7 @@ void __init MMU_init_hw(void)
* Find some memory for the hash table.
*/
if ( ppc_md.progress ) ppc_md.progress("hash:find piece", 0x322);
- Hash = memblock_alloc(Hash_size, Hash_size);
- if (!Hash)
- panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
- __func__, Hash_size, Hash_size);
+ Hash = memblock_alloc_or_panic(Hash_size, Hash_size);
_SDR1 = __pa(Hash) | SDR1_LOW_BITS;
pr_info("Total memory = %lldMB; using %ldkB for hash table\n",
diff --git a/arch/powerpc/mm/book3s32/mmu_context.c b/arch/powerpc/mm/book3s32/mmu_context.c
index e2708e387dc3..1922f9a6b058 100644
--- a/arch/powerpc/mm/book3s32/mmu_context.c
+++ b/arch/powerpc/mm/book3s32/mmu_context.c
@@ -69,6 +69,12 @@ EXPORT_SYMBOL_GPL(__init_new_context);
int init_new_context(struct task_struct *t, struct mm_struct *mm)
{
mm->context.id = __init_new_context();
+ mm->context.sr0 = CTX_TO_VSID(mm->context.id, 0);
+
+ if (IS_ENABLED(CONFIG_PPC_KUEP))
+ mm->context.sr0 |= SR_NX;
+ if (!kuap_is_disabled())
+ mm->context.sr0 |= SR_KS;
return 0;
}
@@ -108,20 +114,13 @@ void __init mmu_context_init(void)
void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk)
{
long id = next->context.id;
- unsigned long val;
if (id < 0)
panic("mm_struct %p has no context ID", next);
isync();
- val = CTX_TO_VSID(id, 0);
- if (!kuep_is_disabled())
- val |= SR_NX;
- if (!kuap_is_disabled())
- val |= SR_KS;
-
- update_user_segments(val);
+ update_user_segments(next->context.sr0);
if (IS_ENABLED(CONFIG_BDI_SWITCH))
abatron_pteptrs[1] = next->pgd;
diff --git a/arch/powerpc/mm/book3s32/tlb.c b/arch/powerpc/mm/book3s32/tlb.c
index 19f0ef950d77..e54a7b011232 100644
--- a/arch/powerpc/mm/book3s32/tlb.c
+++ b/arch/powerpc/mm/book3s32/tlb.c
@@ -81,14 +81,15 @@ EXPORT_SYMBOL(hash__flush_range);
void hash__flush_tlb_mm(struct mm_struct *mm)
{
struct vm_area_struct *mp;
+ VMA_ITERATOR(vmi, mm, 0);
/*
- * It is safe to go down the mm's list of vmas when called
- * from dup_mmap, holding mmap_lock. It would also be safe from
- * unmap_region or exit_mmap, but not from vmtruncate on SMP -
- * but it seems dup_mmap is the only SMP case which gets here.
+ * It is safe to iterate the vmas when called from dup_mmap,
+ * holding mmap_lock. It would also be safe from unmap_region
+ * or exit_mmap, but not from vmtruncate on SMP - but it seems
+ * dup_mmap is the only SMP case which gets here.
*/
- for (mp = mm->mmap; mp != NULL; mp = mp->vm_next)
+ for_each_vma(vmi, mp)
hash__flush_range(mp->vm_mm, mp->vm_start, mp->vm_end);
}
EXPORT_SYMBOL(hash__flush_tlb_mm);
@@ -104,3 +105,12 @@ void hash__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
flush_hash_pages(mm->context.id, vmaddr, pmd_val(*pmd), 1);
}
EXPORT_SYMBOL(hash__flush_tlb_page);
+
+void hash__flush_gather(struct mmu_gather *tlb)
+{
+ if (tlb->fullmm || tlb->need_flush_all)
+ hash__flush_tlb_mm(tlb->mm);
+ else
+ hash__flush_range(tlb->mm, tlb->start, tlb->end);
+}
+EXPORT_SYMBOL(hash__flush_gather);
diff --git a/arch/powerpc/mm/book3s64/Makefile b/arch/powerpc/mm/book3s64/Makefile
index 1b56d3af47d4..33af5795856a 100644
--- a/arch/powerpc/mm/book3s64/Makefile
+++ b/arch/powerpc/mm/book3s64/Makefile
@@ -1,23 +1,33 @@
# SPDX-License-Identifier: GPL-2.0
-ccflags-y := $(NO_MINIMAL_TOC)
-
+obj-y += mmu_context.o pgtable.o trace.o
+ifdef CONFIG_PPC_64S_HASH_MMU
CFLAGS_REMOVE_slb.o = $(CC_FLAGS_FTRACE)
-
-obj-y += hash_pgtable.o hash_utils.o slb.o \
- mmu_context.o pgtable.o hash_tlb.o
-obj-$(CONFIG_PPC_NATIVE) += hash_native.o
-obj-$(CONFIG_PPC_RADIX_MMU) += radix_pgtable.o radix_tlb.o
+obj-y += hash_pgtable.o hash_utils.o hash_tlb.o slb.o slice.o
+obj-$(CONFIG_PPC_HASH_MMU_NATIVE) += hash_native.o
obj-$(CONFIG_PPC_4K_PAGES) += hash_4k.o
obj-$(CONFIG_PPC_64K_PAGES) += hash_64k.o
-obj-$(CONFIG_HUGETLB_PAGE) += hash_hugetlbpage.o
+obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hash_hugepage.o
+obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage_prot.o
+endif
+
+obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
+
+obj-$(CONFIG_PPC_RADIX_MMU) += radix_pgtable.o radix_tlb.o
ifdef CONFIG_HUGETLB_PAGE
obj-$(CONFIG_PPC_RADIX_MMU) += radix_hugetlbpage.o
endif
-obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hash_hugepage.o
-obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage_prot.o
obj-$(CONFIG_SPAPR_TCE_IOMMU) += iommu_api.o
obj-$(CONFIG_PPC_PKEY) += pkeys.o
# Instrumenting the SLB fault path can lead to duplicate SLB entries
KCOV_INSTRUMENT_slb.o := n
+
+# Parts of these can run in real mode and therefore are
+# not safe with the current outline KASAN implementation
+KASAN_SANITIZE_mmu_context.o := n
+KASAN_SANITIZE_pgtable.o := n
+KASAN_SANITIZE_radix_pgtable.o := n
+KASAN_SANITIZE_radix_tlb.o := n
+KASAN_SANITIZE_slb.o := n
+KASAN_SANITIZE_pkeys.o := n
diff --git a/arch/powerpc/mm/book3s64/hash_4k.c b/arch/powerpc/mm/book3s64/hash_4k.c
index 7de1a8a0c62a..02acbfd05b46 100644
--- a/arch/powerpc/mm/book3s64/hash_4k.c
+++ b/arch/powerpc/mm/book3s64/hash_4k.c
@@ -16,6 +16,8 @@
#include <asm/machdep.h>
#include <asm/mmu.h>
+#include "internal.h"
+
int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
pte_t *ptep, unsigned long trap, unsigned long flags,
int ssize, int subpg_prot)
@@ -118,6 +120,9 @@ repeat:
}
new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE);
+
+ if (stress_hpt())
+ hpt_do_stress(ea, hpte_group);
}
*ptep = __pte(new_pte & ~H_PAGE_BUSY);
return 0;
diff --git a/arch/powerpc/mm/book3s64/hash_64k.c b/arch/powerpc/mm/book3s64/hash_64k.c
index 998c6817ed47..954af420f358 100644
--- a/arch/powerpc/mm/book3s64/hash_64k.c
+++ b/arch/powerpc/mm/book3s64/hash_64k.c
@@ -16,6 +16,8 @@
#include <asm/machdep.h>
#include <asm/mmu.h>
+#include "internal.h"
+
/*
* Return true, if the entry has a slot value which
* the software considers as invalid.
@@ -216,6 +218,9 @@ repeat:
new_pte |= pte_set_hidx(ptep, rpte, subpg_index, slot, PTRS_PER_PTE);
new_pte |= H_PAGE_HASHPTE;
+ if (stress_hpt())
+ hpt_do_stress(ea, hpte_group);
+
*ptep = __pte(new_pte & ~H_PAGE_BUSY);
return 0;
}
@@ -327,7 +332,12 @@ repeat:
new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE);
+
+ if (stress_hpt())
+ hpt_do_stress(ea, hpte_group);
}
+
*ptep = __pte(new_pte & ~H_PAGE_BUSY);
+
return 0;
}
diff --git a/arch/powerpc/mm/book3s64/hash_hugepage.c b/arch/powerpc/mm/book3s64/hash_hugepage.c
index c0fabe6c5a12..cdfd4fe75edb 100644
--- a/arch/powerpc/mm/book3s64/hash_hugepage.c
+++ b/arch/powerpc/mm/book3s64/hash_hugepage.c
@@ -54,21 +54,18 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
/*
* Make sure this is thp or devmap entry
*/
- if (!(old_pmd & (H_PAGE_THP_HUGE | _PAGE_DEVMAP)))
+ if (!(old_pmd & H_PAGE_THP_HUGE))
return 0;
rflags = htab_convert_pte_flags(new_pmd, flags);
-#if 0
- if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) {
+ /*
+ * THPs are only supported on platforms that can do mixed page size
+ * segments (MPSS) and all such platforms have coherent icache. Hence we
+ * don't need to do lazy icache flush (hash_page_do_lazy_icache()) on
+ * noexecute fault.
+ */
- /*
- * No CPU has hugepages but lacks no execute, so we
- * don't need to worry about that case
- */
- rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
- }
-#endif
/*
* Find the slot index details for this ea, using base page size.
*/
diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c
index 52e170bd95ae..e9e2dd70c060 100644
--- a/arch/powerpc/mm/book3s64/hash_native.c
+++ b/arch/powerpc/mm/book3s64/hash_native.c
@@ -27,8 +27,6 @@
#include <asm/ppc-opcode.h>
#include <asm/feature-fixups.h>
-#include <misc/cxl-base.h>
-
#ifdef DEBUG_LOW
#define DBG_LOW(fmt...) udbg_printf(fmt)
#else
@@ -43,109 +41,28 @@
static DEFINE_RAW_SPINLOCK(native_tlbie_lock);
-static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is)
-{
- unsigned long rb;
+#ifdef CONFIG_LOCKDEP
+static struct lockdep_map hpte_lock_map =
+ STATIC_LOCKDEP_MAP_INIT("hpte_lock", &hpte_lock_map);
- rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
-
- asm volatile("tlbiel %0" : : "r" (rb));
-}
-
-/*
- * tlbiel instruction for hash, set invalidation
- * i.e., r=1 and is=01 or is=10 or is=11
- */
-static __always_inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is,
- unsigned int pid,
- unsigned int ric, unsigned int prs)
+static void acquire_hpte_lock(void)
{
- unsigned long rb;
- unsigned long rs;
- unsigned int r = 0; /* hash format */
-
- rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
- rs = ((unsigned long)pid << PPC_BITLSHIFT(31));
-
- asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4)
- : : "r"(rb), "r"(rs), "i"(ric), "i"(prs), "i"(r)
- : "memory");
+ lock_map_acquire(&hpte_lock_map);
}
-
-static void tlbiel_all_isa206(unsigned int num_sets, unsigned int is)
+static void release_hpte_lock(void)
{
- unsigned int set;
-
- asm volatile("ptesync": : :"memory");
-
- for (set = 0; set < num_sets; set++)
- tlbiel_hash_set_isa206(set, is);
-
- ppc_after_tlbiel_barrier();
+ lock_map_release(&hpte_lock_map);
}
-
-static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is)
+#else
+static void acquire_hpte_lock(void)
{
- unsigned int set;
-
- asm volatile("ptesync": : :"memory");
-
- /*
- * Flush the partition table cache if this is HV mode.
- */
- if (early_cpu_has_feature(CPU_FTR_HVMODE))
- tlbiel_hash_set_isa300(0, is, 0, 2, 0);
-
- /*
- * Now invalidate the process table cache. UPRT=0 HPT modes (what
- * current hardware implements) do not use the process table, but
- * add the flushes anyway.
- *
- * From ISA v3.0B p. 1078:
- * The following forms are invalid.
- * * PRS=1, R=0, and RIC!=2 (The only process-scoped
- * HPT caching is of the Process Table.)
- */
- tlbiel_hash_set_isa300(0, is, 0, 2, 1);
-
- /*
- * Then flush the sets of the TLB proper. Hash mode uses
- * partition scoped TLB translations, which may be flushed
- * in !HV mode.
- */
- for (set = 0; set < num_sets; set++)
- tlbiel_hash_set_isa300(set, is, 0, 0, 0);
-
- ppc_after_tlbiel_barrier();
-
- asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT "; isync" : : :"memory");
}
-void hash__tlbiel_all(unsigned int action)
+static void release_hpte_lock(void)
{
- unsigned int is;
-
- switch (action) {
- case TLB_INVAL_SCOPE_GLOBAL:
- is = 3;
- break;
- case TLB_INVAL_SCOPE_LPID:
- is = 2;
- break;
- default:
- BUG();
- }
-
- if (early_cpu_has_feature(CPU_FTR_ARCH_300))
- tlbiel_all_isa300(POWER9_TLB_SETS_HASH, is);
- else if (early_cpu_has_feature(CPU_FTR_ARCH_207S))
- tlbiel_all_isa206(POWER8_TLB_SETS, is);
- else if (early_cpu_has_feature(CPU_FTR_ARCH_206))
- tlbiel_all_isa206(POWER7_TLB_SETS, is);
- else
- WARN(1, "%s called on pre-POWER7 CPU\n", __func__);
}
+#endif
static inline unsigned long ___tlbie(unsigned long vpn, int psize,
int apsize, int ssize)
@@ -267,7 +184,7 @@ static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize)
va |= ssize << 8;
sllp = get_sllp_encoding(apsize);
va |= sllp << 5;
- asm volatile(ASM_FTR_IFSET("tlbiel %0", "tlbiel %0,0", %1)
+ asm volatile(ASM_FTR_IFSET("tlbiel %0", PPC_TLBIEL_v205(%0, 0), %1)
: : "r" (va), "i" (CPU_FTR_ARCH_206)
: "memory");
break;
@@ -286,7 +203,7 @@ static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize)
*/
va |= (vpn & 0xfe);
va |= 1; /* L */
- asm volatile(ASM_FTR_IFSET("tlbiel %0", "tlbiel %0,1", %1)
+ asm volatile(ASM_FTR_IFSET("tlbiel %0", PPC_TLBIEL_v205(%0, 1), %1)
: : "r" (va), "i" (CPU_FTR_ARCH_206)
: "memory");
break;
@@ -298,11 +215,9 @@ static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize)
static inline void tlbie(unsigned long vpn, int psize, int apsize,
int ssize, int local)
{
- unsigned int use_local;
+ unsigned int use_local = local && mmu_has_feature(MMU_FTR_TLBIEL);
int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
- use_local = local && mmu_has_feature(MMU_FTR_TLBIEL) && !cxl_ctx_in_use();
-
if (use_local)
use_local = mmu_psize_defs[psize].tlbiel;
if (lock_tlbie && !use_local)
@@ -324,6 +239,7 @@ static inline void native_lock_hpte(struct hash_pte *hptep)
{
unsigned long *word = (unsigned long *)&hptep->v;
+ acquire_hpte_lock();
while (1) {
if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word))
break;
@@ -338,6 +254,7 @@ static inline void native_unlock_hpte(struct hash_pte *hptep)
{
unsigned long *word = (unsigned long *)&hptep->v;
+ release_hpte_lock();
clear_bit_unlock(HPTE_LOCK_BIT, word);
}
@@ -347,8 +264,11 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
{
struct hash_pte *hptep = htab_address + hpte_group;
unsigned long hpte_v, hpte_r;
+ unsigned long flags;
int i;
+ local_irq_save(flags);
+
if (!(vflags & HPTE_V_BOLTED)) {
DBG_LOW(" insert(group=%lx, vpn=%016lx, pa=%016lx,"
" rflags=%lx, vflags=%lx, psize=%d)\n",
@@ -367,8 +287,10 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
hptep++;
}
- if (i == HPTES_PER_GROUP)
+ if (i == HPTES_PER_GROUP) {
+ local_irq_restore(flags);
return -1;
+ }
hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
@@ -390,19 +312,24 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
* Now set the first dword including the valid bit
* NOTE: this also unlocks the hpte
*/
+ release_hpte_lock();
hptep->v = cpu_to_be64(hpte_v);
__asm__ __volatile__ ("ptesync" : : : "memory");
+ local_irq_restore(flags);
+
return i | (!!(vflags & HPTE_V_SECONDARY) << 3);
}
static long native_hpte_remove(unsigned long hpte_group)
{
+ unsigned long hpte_v, flags;
struct hash_pte *hptep;
int i;
int slot_offset;
- unsigned long hpte_v;
+
+ local_irq_save(flags);
DBG_LOW(" remove(group=%lx)\n", hpte_group);
@@ -427,12 +354,16 @@ static long native_hpte_remove(unsigned long hpte_group)
slot_offset &= 0x7;
}
- if (i == HPTES_PER_GROUP)
- return -1;
+ if (i == HPTES_PER_GROUP) {
+ i = -1;
+ goto out;
+ }
/* Invalidate the hpte. NOTE: this also unlocks it */
+ release_hpte_lock();
hptep->v = 0;
-
+out:
+ local_irq_restore(flags);
return i;
}
@@ -443,6 +374,9 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
struct hash_pte *hptep = htab_address + slot;
unsigned long hpte_v, want_v;
int ret = 0, local = 0;
+ unsigned long irqflags;
+
+ local_irq_save(irqflags);
want_v = hpte_encode_avpn(vpn, bpsize, ssize);
@@ -486,6 +420,8 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
if (!(flags & HPTE_NOHPTE_UPDATE))
tlbie(vpn, bpsize, apsize, ssize, local);
+ local_irq_restore(irqflags);
+
return ret;
}
@@ -549,6 +485,9 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
unsigned long vsid;
long slot;
struct hash_pte *hptep;
+ unsigned long flags;
+
+ local_irq_save(flags);
vsid = get_kernel_vsid(ea, ssize);
vpn = hpt_vpn(ea, vsid, ssize);
@@ -567,6 +506,8 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
* actual page size will be same.
*/
tlbie(vpn, psize, psize, ssize, 0);
+
+ local_irq_restore(flags);
}
/*
@@ -580,6 +521,9 @@ static int native_hpte_removebolted(unsigned long ea, int psize, int ssize)
unsigned long vsid;
long slot;
struct hash_pte *hptep;
+ unsigned long flags;
+
+ local_irq_save(flags);
vsid = get_kernel_vsid(ea, ssize);
vpn = hpt_vpn(ea, vsid, ssize);
@@ -597,6 +541,9 @@ static int native_hpte_removebolted(unsigned long ea, int psize, int ssize)
/* Invalidate the TLB */
tlbie(vpn, psize, psize, ssize, 0);
+
+ local_irq_restore(flags);
+
return 0;
}
@@ -621,10 +568,11 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
/* recheck with locks held */
hpte_v = hpte_get_old_v(hptep);
- if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID))
+ if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) {
/* Invalidate the hpte. NOTE: this also unlocks it */
+ release_hpte_lock();
hptep->v = 0;
- else
+ } else
native_unlock_hpte(hptep);
}
/*
@@ -684,10 +632,8 @@ static void native_hugepage_invalidate(unsigned long vsid,
hpte_v = hpte_get_old_v(hptep);
if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) {
- /*
- * Invalidate the hpte. NOTE: this also unlocks it
- */
-
+ /* Invalidate the hpte. NOTE: this also unlocks it */
+ release_hpte_lock();
hptep->v = 0;
} else
native_unlock_hpte(hptep);
@@ -787,7 +733,7 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
* TODO: add batching support when enabled. remember, no dynamic memory here,
* although there is the control page available...
*/
-static void native_hpte_clear(void)
+static notrace void native_hpte_clear(void)
{
unsigned long vpn = 0;
unsigned long slot, slots;
@@ -839,10 +785,6 @@ static void native_flush_hash_range(unsigned long number, int local)
unsigned long psize = batch->psize;
int ssize = batch->ssize;
int i;
- unsigned int use_local;
-
- use_local = local && mmu_has_feature(MMU_FTR_TLBIEL) &&
- mmu_psize_defs[psize].tlbiel && !cxl_ctx_in_use();
local_irq_save(flags);
@@ -869,13 +811,16 @@ static void native_flush_hash_range(unsigned long number, int local)
if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
native_unlock_hpte(hptep);
- else
+ else {
+ release_hpte_lock();
hptep->v = 0;
+ }
} pte_iterate_hashed_end();
}
- if (use_local) {
+ if (mmu_has_feature(MMU_FTR_TLBIEL) &&
+ mmu_psize_defs[psize].tlbiel && local) {
asm volatile("ptesync":::"memory");
for (i = 0; i < number; i++) {
vpn = batch->vpn[i];
diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c
index ad5eff097d31..82d31177630b 100644
--- a/arch/powerpc/mm/book3s64/hash_pgtable.c
+++ b/arch/powerpc/mm/book3s64/hash_pgtable.c
@@ -13,10 +13,10 @@
#include <asm/sections.h>
#include <asm/mmu.h>
#include <asm/tlb.h>
+#include <asm/firmware.h>
#include <mm/mmu_decl.h>
-#define CREATE_TRACE_POINTS
#include <trace/events/thp.h>
#if H_PGTABLE_RANGE > (USER_VSID_RANGE * (TASK_SIZE_USER64 / TASK_CONTEXT_SIZE))
@@ -195,7 +195,7 @@ unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr
unsigned long old;
#ifdef CONFIG_DEBUG_VM
- WARN_ON(!hash__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
+ WARN_ON(!hash__pmd_trans_huge(*pmdp));
assert_spin_locked(pmd_lockptr(mm, pmdp));
#endif
@@ -214,7 +214,7 @@ unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr
old = be64_to_cpu(old_be);
- trace_hugepage_update(addr, old, clr, set);
+ trace_hugepage_update_pmd(addr, old, clr, set);
if (old & H_PAGE_HASHPTE)
hpte_do_hugepage_flush(mm, addr, pmdp, old);
return old;
@@ -227,7 +227,6 @@ pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addres
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
VM_BUG_ON(pmd_trans_huge(*pmdp));
- VM_BUG_ON(pmd_devmap(*pmdp));
pmd = *pmdp;
pmd_clear(pmdp);
@@ -256,7 +255,7 @@ pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addres
* the __collapse_huge_page_copy can result in copying
* the old content.
*/
- flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
+ flush_hash_table_pmd_range(vma->vm_mm, &pmd, address);
return pmd;
}
@@ -378,7 +377,7 @@ int hash__has_transparent_hugepage(void)
if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
return 0;
/*
- * We need to make sure that we support 16MB hugepage in a segement
+ * We need to make sure that we support 16MB hugepage in a segment
* with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
* of 64K.
*/
@@ -404,7 +403,8 @@ EXPORT_SYMBOL_GPL(hash__has_transparent_hugepage);
struct change_memory_parms {
unsigned long start, end, newpp;
- unsigned int step, nr_cpus, master_cpu;
+ unsigned int step, nr_cpus;
+ atomic_t master_cpu;
atomic_t cpu_counter;
};
@@ -478,7 +478,8 @@ static int change_memory_range_fn(void *data)
{
struct change_memory_parms *parms = data;
- if (parms->master_cpu != smp_processor_id())
+ // First CPU goes through, all others wait.
+ if (atomic_xchg(&parms->master_cpu, 1) == 1)
return chmem_secondary_loop(parms);
// Wait for all but one CPU (this one) to call-in
@@ -516,7 +517,7 @@ static bool hash__change_memory_range(unsigned long start, unsigned long end,
chmem_parms.end = end;
chmem_parms.step = step;
chmem_parms.newpp = newpp;
- chmem_parms.master_cpu = smp_processor_id();
+ atomic_set(&chmem_parms.master_cpu, 0);
cpus_read_lock();
@@ -541,7 +542,7 @@ void hash__mark_rodata_ro(void)
unsigned long start, end, pp;
start = (unsigned long)_stext;
- end = (unsigned long)__init_begin;
+ end = (unsigned long)__end_rodata;
pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL_ROX), HPTE_USE_KERNEL_KEY);
diff --git a/arch/powerpc/mm/book3s64/hash_tlb.c b/arch/powerpc/mm/book3s64/hash_tlb.c
index eb0bccaf221e..21fcad97ae80 100644
--- a/arch/powerpc/mm/book3s64/hash_tlb.c
+++ b/arch/powerpc/mm/book3s64/hash_tlb.c
@@ -221,7 +221,7 @@ void __flush_hash_table_range(unsigned long start, unsigned long end)
local_irq_restore(flags);
}
-void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
+void flush_hash_table_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
{
pte_t *pte;
pte_t *start_pte;
@@ -239,12 +239,16 @@ void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
local_irq_save(flags);
arch_enter_lazy_mmu_mode();
start_pte = pte_offset_map(pmd, addr);
+ if (!start_pte)
+ goto out;
for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) {
unsigned long pteval = pte_val(*pte);
if (pteval & H_PAGE_HASHPTE)
hpte_need_flush(mm, addr, pte, pteval, 0);
addr += PAGE_SIZE;
}
+ pte_unmap(start_pte);
+out:
arch_leave_lazy_mmu_mode();
local_irq_restore(flags);
}
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index ac5720371c0d..9dc5889d6ecb 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -36,26 +36,30 @@
#include <linux/hugetlb.h>
#include <linux/cpu.h>
#include <linux/pgtable.h>
+#include <linux/debugfs.h>
+#include <linux/random.h>
+#include <linux/elf-randomize.h>
+#include <linux/of_fdt.h>
+#include <linux/kfence.h>
-#include <asm/debugfs.h>
#include <asm/interrupt.h>
#include <asm/processor.h>
#include <asm/mmu.h>
#include <asm/mmu_context.h>
#include <asm/page.h>
+#include <asm/pgalloc.h>
#include <asm/types.h>
#include <linux/uaccess.h>
#include <asm/machdep.h>
-#include <asm/prom.h>
#include <asm/io.h>
#include <asm/eeh.h>
#include <asm/tlb.h>
#include <asm/cacheflush.h>
#include <asm/cputable.h>
#include <asm/sections.h>
-#include <asm/copro.h>
+#include <asm/spu.h>
#include <asm/udbg.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <asm/fadump.h>
#include <asm/firmware.h>
#include <asm/tm.h>
@@ -64,6 +68,7 @@
#include <asm/pte-walk.h>
#include <asm/asm-prototypes.h>
#include <asm/ultravisor.h>
+#include <asm/kfence.h>
#include <mm/mmu_decl.h>
@@ -99,8 +104,6 @@
*/
static unsigned long _SDR1;
-struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
-EXPORT_SYMBOL_GPL(mmu_psize_defs);
u8 hpte_page_sizes[1 << LP_BITS];
EXPORT_SYMBOL_GPL(hpte_page_sizes);
@@ -114,9 +117,6 @@ EXPORT_SYMBOL_GPL(mmu_linear_psize);
int mmu_virtual_psize = MMU_PAGE_4K;
int mmu_vmalloc_psize = MMU_PAGE_4K;
EXPORT_SYMBOL_GPL(mmu_vmalloc_psize);
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-int mmu_vmemmap_psize = MMU_PAGE_4K;
-#endif
int mmu_io_psize = MMU_PAGE_4K;
int mmu_kernel_ssize = MMU_SEGSIZE_256M;
EXPORT_SYMBOL_GPL(mmu_kernel_ssize);
@@ -126,12 +126,7 @@ EXPORT_SYMBOL_GPL(mmu_slb_size);
#ifdef CONFIG_PPC_64K_PAGES
int mmu_ci_restrictions;
#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
-static u8 *linear_map_hash_slots;
-static unsigned long linear_map_hash_count;
-static DEFINE_SPINLOCK(linear_map_hash_lock);
-#endif /* CONFIG_DEBUG_PAGEALLOC */
-struct mmu_hash_ops mmu_hash_ops;
+struct mmu_hash_ops mmu_hash_ops __ro_after_init;
EXPORT_SYMBOL(mmu_hash_ops);
/*
@@ -175,6 +170,376 @@ static struct mmu_psize_def mmu_psize_defaults_gp[] = {
},
};
+static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is)
+{
+ unsigned long rb;
+
+ rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
+
+ asm volatile("tlbiel %0" : : "r" (rb));
+}
+
+/*
+ * tlbiel instruction for hash, set invalidation
+ * i.e., r=1 and is=01 or is=10 or is=11
+ */
+static __always_inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is,
+ unsigned int pid,
+ unsigned int ric, unsigned int prs)
+{
+ unsigned long rb;
+ unsigned long rs;
+ unsigned int r = 0; /* hash format */
+
+ rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
+ rs = ((unsigned long)pid << PPC_BITLSHIFT(31));
+
+ asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4)
+ : : "r"(rb), "r"(rs), "i"(ric), "i"(prs), "i"(r)
+ : "memory");
+}
+
+
+static void tlbiel_all_isa206(unsigned int num_sets, unsigned int is)
+{
+ unsigned int set;
+
+ asm volatile("ptesync": : :"memory");
+
+ for (set = 0; set < num_sets; set++)
+ tlbiel_hash_set_isa206(set, is);
+
+ ppc_after_tlbiel_barrier();
+}
+
+static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is)
+{
+ unsigned int set;
+
+ asm volatile("ptesync": : :"memory");
+
+ /*
+ * Flush the partition table cache if this is HV mode.
+ */
+ if (early_cpu_has_feature(CPU_FTR_HVMODE))
+ tlbiel_hash_set_isa300(0, is, 0, 2, 0);
+
+ /*
+ * Now invalidate the process table cache. UPRT=0 HPT modes (what
+ * current hardware implements) do not use the process table, but
+ * add the flushes anyway.
+ *
+ * From ISA v3.0B p. 1078:
+ * The following forms are invalid.
+ * * PRS=1, R=0, and RIC!=2 (The only process-scoped
+ * HPT caching is of the Process Table.)
+ */
+ tlbiel_hash_set_isa300(0, is, 0, 2, 1);
+
+ /*
+ * Then flush the sets of the TLB proper. Hash mode uses
+ * partition scoped TLB translations, which may be flushed
+ * in !HV mode.
+ */
+ for (set = 0; set < num_sets; set++)
+ tlbiel_hash_set_isa300(set, is, 0, 0, 0);
+
+ ppc_after_tlbiel_barrier();
+
+ asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT "; isync" : : :"memory");
+}
+
+void hash__tlbiel_all(unsigned int action)
+{
+ unsigned int is;
+
+ switch (action) {
+ case TLB_INVAL_SCOPE_GLOBAL:
+ is = 3;
+ break;
+ case TLB_INVAL_SCOPE_LPID:
+ is = 2;
+ break;
+ default:
+ BUG();
+ }
+
+ if (early_cpu_has_feature(CPU_FTR_ARCH_300))
+ tlbiel_all_isa300(POWER9_TLB_SETS_HASH, is);
+ else if (early_cpu_has_feature(CPU_FTR_ARCH_207S))
+ tlbiel_all_isa206(POWER8_TLB_SETS, is);
+ else if (early_cpu_has_feature(CPU_FTR_ARCH_206))
+ tlbiel_all_isa206(POWER7_TLB_SETS, is);
+ else
+ WARN(1, "%s called on pre-POWER7 CPU\n", __func__);
+}
+
+#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE)
+static void kernel_map_linear_page(unsigned long vaddr, unsigned long idx,
+ u8 *slots, raw_spinlock_t *lock)
+{
+ unsigned long hash;
+ unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
+ unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
+ unsigned long mode = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL), HPTE_USE_KERNEL_KEY);
+ long ret;
+
+ hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
+
+ /* Don't create HPTE entries for bad address */
+ if (!vsid)
+ return;
+
+ if (slots[idx] & 0x80)
+ return;
+
+ ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode,
+ HPTE_V_BOLTED,
+ mmu_linear_psize, mmu_kernel_ssize);
+
+ BUG_ON (ret < 0);
+ raw_spin_lock(lock);
+ BUG_ON(slots[idx] & 0x80);
+ slots[idx] = ret | 0x80;
+ raw_spin_unlock(lock);
+}
+
+static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long idx,
+ u8 *slots, raw_spinlock_t *lock)
+{
+ unsigned long hash, hslot, slot;
+ unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
+ unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
+
+ hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
+ raw_spin_lock(lock);
+ if (!(slots[idx] & 0x80)) {
+ raw_spin_unlock(lock);
+ return;
+ }
+ hslot = slots[idx] & 0x7f;
+ slots[idx] = 0;
+ raw_spin_unlock(lock);
+ if (hslot & _PTEIDX_SECONDARY)
+ hash = ~hash;
+ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot += hslot & _PTEIDX_GROUP_IX;
+ mmu_hash_ops.hpte_invalidate(slot, vpn, mmu_linear_psize,
+ mmu_linear_psize,
+ mmu_kernel_ssize, 0);
+}
+#endif
+
+static inline bool hash_supports_debug_pagealloc(void)
+{
+ unsigned long max_hash_count = ppc64_rma_size / 4;
+ unsigned long linear_map_count = memblock_end_of_DRAM() >> PAGE_SHIFT;
+
+ if (!debug_pagealloc_enabled() || linear_map_count > max_hash_count)
+ return false;
+ return true;
+}
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+static u8 *linear_map_hash_slots;
+static unsigned long linear_map_hash_count;
+static DEFINE_RAW_SPINLOCK(linear_map_hash_lock);
+static __init void hash_debug_pagealloc_alloc_slots(void)
+{
+ if (!hash_supports_debug_pagealloc())
+ return;
+
+ linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT;
+ linear_map_hash_slots = memblock_alloc_try_nid(
+ linear_map_hash_count, 1, MEMBLOCK_LOW_LIMIT,
+ ppc64_rma_size, NUMA_NO_NODE);
+ if (!linear_map_hash_slots)
+ panic("%s: Failed to allocate %lu bytes max_addr=%pa\n",
+ __func__, linear_map_hash_count, &ppc64_rma_size);
+}
+
+static inline void hash_debug_pagealloc_add_slot(phys_addr_t paddr,
+ int slot)
+{
+ if (!debug_pagealloc_enabled() || !linear_map_hash_count)
+ return;
+ if ((paddr >> PAGE_SHIFT) < linear_map_hash_count)
+ linear_map_hash_slots[paddr >> PAGE_SHIFT] = slot | 0x80;
+}
+
+static int hash_debug_pagealloc_map_pages(struct page *page, int numpages,
+ int enable)
+{
+ unsigned long flags, vaddr, lmi;
+ int i;
+
+ if (!debug_pagealloc_enabled() || !linear_map_hash_count)
+ return 0;
+
+ local_irq_save(flags);
+ for (i = 0; i < numpages; i++, page++) {
+ vaddr = (unsigned long)page_address(page);
+ lmi = __pa(vaddr) >> PAGE_SHIFT;
+ if (lmi >= linear_map_hash_count)
+ continue;
+ if (enable)
+ kernel_map_linear_page(vaddr, lmi,
+ linear_map_hash_slots, &linear_map_hash_lock);
+ else
+ kernel_unmap_linear_page(vaddr, lmi,
+ linear_map_hash_slots, &linear_map_hash_lock);
+ }
+ local_irq_restore(flags);
+ return 0;
+}
+
+#else /* CONFIG_DEBUG_PAGEALLOC */
+static inline void hash_debug_pagealloc_alloc_slots(void) {}
+static inline void hash_debug_pagealloc_add_slot(phys_addr_t paddr, int slot) {}
+static int __maybe_unused
+hash_debug_pagealloc_map_pages(struct page *page, int numpages, int enable)
+{
+ return 0;
+}
+#endif /* CONFIG_DEBUG_PAGEALLOC */
+
+#ifdef CONFIG_KFENCE
+static u8 *linear_map_kf_hash_slots;
+static unsigned long linear_map_kf_hash_count;
+static DEFINE_RAW_SPINLOCK(linear_map_kf_hash_lock);
+
+static phys_addr_t kfence_pool;
+
+static __init void hash_kfence_alloc_pool(void)
+{
+ if (!kfence_early_init_enabled())
+ goto err;
+
+ /* allocate linear map for kfence within RMA region */
+ linear_map_kf_hash_count = KFENCE_POOL_SIZE >> PAGE_SHIFT;
+ linear_map_kf_hash_slots = memblock_alloc_try_nid(
+ linear_map_kf_hash_count, 1,
+ MEMBLOCK_LOW_LIMIT, ppc64_rma_size,
+ NUMA_NO_NODE);
+ if (!linear_map_kf_hash_slots) {
+ pr_err("%s: memblock for linear map (%lu) failed\n", __func__,
+ linear_map_kf_hash_count);
+ goto err;
+ }
+
+ /* allocate kfence pool early */
+ kfence_pool = memblock_phys_alloc_range(KFENCE_POOL_SIZE, PAGE_SIZE,
+ MEMBLOCK_LOW_LIMIT, MEMBLOCK_ALLOC_ANYWHERE);
+ if (!kfence_pool) {
+ pr_err("%s: memblock for kfence pool (%lu) failed\n", __func__,
+ KFENCE_POOL_SIZE);
+ memblock_free(linear_map_kf_hash_slots,
+ linear_map_kf_hash_count);
+ linear_map_kf_hash_count = 0;
+ goto err;
+ }
+ memblock_mark_nomap(kfence_pool, KFENCE_POOL_SIZE);
+
+ return;
+err:
+ pr_info("Disabling kfence\n");
+ disable_kfence();
+}
+
+static __init void hash_kfence_map_pool(void)
+{
+ unsigned long kfence_pool_start, kfence_pool_end;
+ unsigned long prot = pgprot_val(PAGE_KERNEL);
+ unsigned int pshift = mmu_psize_defs[mmu_linear_psize].shift;
+
+ if (!kfence_pool)
+ return;
+
+ kfence_pool_start = (unsigned long) __va(kfence_pool);
+ kfence_pool_end = kfence_pool_start + KFENCE_POOL_SIZE;
+ __kfence_pool = (char *) kfence_pool_start;
+ BUG_ON(htab_bolt_mapping(kfence_pool_start, kfence_pool_end,
+ kfence_pool, prot, mmu_linear_psize,
+ mmu_kernel_ssize));
+ update_page_count(mmu_linear_psize, KFENCE_POOL_SIZE >> pshift);
+ memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE);
+}
+
+static inline void hash_kfence_add_slot(phys_addr_t paddr, int slot)
+{
+ unsigned long vaddr = (unsigned long) __va(paddr);
+ unsigned long lmi = (vaddr - (unsigned long)__kfence_pool)
+ >> PAGE_SHIFT;
+
+ if (!kfence_pool)
+ return;
+ BUG_ON(!is_kfence_address((void *)vaddr));
+ BUG_ON(lmi >= linear_map_kf_hash_count);
+ linear_map_kf_hash_slots[lmi] = slot | 0x80;
+}
+
+static int hash_kfence_map_pages(struct page *page, int numpages, int enable)
+{
+ unsigned long flags, vaddr, lmi;
+ int i;
+
+ WARN_ON_ONCE(!linear_map_kf_hash_count);
+ local_irq_save(flags);
+ for (i = 0; i < numpages; i++, page++) {
+ vaddr = (unsigned long)page_address(page);
+ lmi = (vaddr - (unsigned long)__kfence_pool) >> PAGE_SHIFT;
+
+ /* Ideally this should never happen */
+ if (lmi >= linear_map_kf_hash_count) {
+ WARN_ON_ONCE(1);
+ continue;
+ }
+
+ if (enable)
+ kernel_map_linear_page(vaddr, lmi,
+ linear_map_kf_hash_slots,
+ &linear_map_kf_hash_lock);
+ else
+ kernel_unmap_linear_page(vaddr, lmi,
+ linear_map_kf_hash_slots,
+ &linear_map_kf_hash_lock);
+ }
+ local_irq_restore(flags);
+ return 0;
+}
+#else
+static inline void hash_kfence_alloc_pool(void) {}
+static inline void hash_kfence_map_pool(void) {}
+static inline void hash_kfence_add_slot(phys_addr_t paddr, int slot) {}
+static int __maybe_unused
+hash_kfence_map_pages(struct page *page, int numpages, int enable)
+{
+ return 0;
+}
+#endif
+
+#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE)
+int hash__kernel_map_pages(struct page *page, int numpages, int enable)
+{
+ void *vaddr = page_address(page);
+
+ if (is_kfence_address(vaddr))
+ return hash_kfence_map_pages(page, numpages, enable);
+ else
+ return hash_debug_pagealloc_map_pages(page, numpages, enable);
+}
+
+static void hash_linear_map_add_slot(phys_addr_t paddr, int slot)
+{
+ if (is_kfence_address(__va(paddr)))
+ hash_kfence_add_slot(paddr, slot);
+ else
+ hash_debug_pagealloc_add_slot(paddr, slot);
+}
+#else
+static void hash_linear_map_add_slot(phys_addr_t paddr, int slot) {}
+#endif
+
/*
* 'R' and 'C' update notes:
* - Under pHyp or KVM, the updatepp path will not set C, thus it *will*
@@ -212,9 +577,16 @@ unsigned long htab_convert_pte_flags(unsigned long pteflags, unsigned long flags
else
rflags |= 0x3;
}
+ VM_WARN_ONCE(!(pteflags & _PAGE_RWX), "no-access mapping request");
} else {
if (pteflags & _PAGE_RWX)
rflags |= 0x2;
+ /*
+ * We should never hit this in normal fault handling because
+ * a permission check (check_pte_access()) will bubble this
+ * to higher level linux handler even for PAGE_NONE.
+ */
+ VM_WARN_ONCE(!(pteflags & _PAGE_RWX), "no-access mapping request");
if (!((pteflags & _PAGE_WRITE) && (pteflags & _PAGE_DIRTY)))
rflags |= 0x1;
}
@@ -307,7 +679,7 @@ repeat:
ssize);
if (ret == -1) {
/*
- * Try to to keep bolted entries in primary.
+ * Try to keep bolted entries in primary.
* Remove non bolted entries and try insert again
*/
ret = mmu_hash_ops.hpte_remove(hpteg);
@@ -326,11 +698,8 @@ repeat:
break;
cond_resched();
-#ifdef CONFIG_DEBUG_PAGEALLOC
- if (debug_pagealloc_enabled() &&
- (paddr >> PAGE_SHIFT) < linear_map_hash_count)
- linear_map_hash_slots[paddr >> PAGE_SHIFT] = ret | 0x80;
-#endif /* CONFIG_DEBUG_PAGEALLOC */
+ /* add slot info in debug_pagealloc / kfence linear map */
+ hash_linear_map_add_slot(paddr, ret);
}
return ret < 0 ? ret : 0;
}
@@ -375,7 +744,7 @@ int htab_remove_mapping(unsigned long vstart, unsigned long vend,
return ret;
}
-static bool disable_1tb_segments = false;
+static bool disable_1tb_segments __ro_after_init;
static int __init parse_disable_1tb_segments(char *p)
{
@@ -384,6 +753,40 @@ static int __init parse_disable_1tb_segments(char *p)
}
early_param("disable_1tb_segments", parse_disable_1tb_segments);
+bool stress_hpt_enabled __initdata;
+
+static int __init parse_stress_hpt(char *p)
+{
+ stress_hpt_enabled = true;
+ return 0;
+}
+early_param("stress_hpt", parse_stress_hpt);
+
+__ro_after_init DEFINE_STATIC_KEY_FALSE(stress_hpt_key);
+
+/*
+ * per-CPU array allocated if we enable stress_hpt.
+ */
+#define STRESS_MAX_GROUPS 16
+struct stress_hpt_struct {
+ unsigned long last_group[STRESS_MAX_GROUPS];
+};
+
+static inline int stress_nr_groups(void)
+{
+ /*
+ * LPAR H_REMOVE flushes TLB, so need some number > 1 of entries
+ * to allow practical forward progress. Bare metal returns 1, which
+ * seems to help uncover more bugs.
+ */
+ if (firmware_has_feature(FW_FEATURE_LPAR))
+ return STRESS_MAX_GROUPS;
+ else
+ return 1;
+}
+
+static struct stress_hpt_struct *stress_hpt_struct;
+
static int __init htab_dt_scan_seg_sizes(unsigned long node,
const char *uname, int depth,
void *data)
@@ -552,7 +955,7 @@ static int __init htab_dt_scan_hugepage_blocks(unsigned long node,
block_size = be64_to_cpu(addr_prop[1]);
if (block_size != (16 * GB))
return 0;
- printk(KERN_INFO "Huge page(16GB) memory: "
+ pr_info("Huge page(16GB) memory: "
"addr = 0x%lX size = 0x%lX pages = %d\n",
phys_addr, block_size, expected_pages);
if (phys_addr + block_size * expected_pages <= memblock_end_of_DRAM()) {
@@ -563,7 +966,7 @@ static int __init htab_dt_scan_hugepage_blocks(unsigned long node,
}
#endif /* CONFIG_HUGETLB_PAGE */
-static void mmu_psize_set_default_penc(void)
+static void __init mmu_psize_set_default_penc(void)
{
int bpsize, apsize;
for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++)
@@ -573,7 +976,7 @@ static void mmu_psize_set_default_penc(void)
#ifdef CONFIG_PPC_64K_PAGES
-static bool might_have_hea(void)
+static bool __init might_have_hea(void)
{
/*
* The HEA ethernet adapter requires awareness of the
@@ -644,7 +1047,7 @@ static void __init htab_scan_page_sizes(void)
* low-order N bits as the encoding for the 2^(12+N) byte page size
* (if it exists).
*/
-static void init_hpte_page_sizes(void)
+static void __init init_hpte_page_sizes(void)
{
long int ap, bp;
long int shift, penc;
@@ -677,7 +1080,7 @@ static void __init htab_init_page_sizes(void)
bool aligned = true;
init_hpte_page_sizes();
- if (!debug_pagealloc_enabled()) {
+ if (!hash_supports_debug_pagealloc() && !kfence_early_init_enabled()) {
/*
* Pick a size for the linear mapping. Currently, we only
* support 16M, 1M and 4K which is the default
@@ -735,7 +1138,7 @@ static void __init htab_init_page_sizes(void)
mmu_vmemmap_psize = mmu_virtual_psize;
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
- printk(KERN_DEBUG "Page orders: linear mapping = %d, "
+ pr_info("Page orders: linear mapping = %d, "
"virtual = %d, io = %d"
#ifdef CONFIG_SPARSEMEM_VMEMMAP
", vmemmap = %d"
@@ -834,6 +1237,7 @@ int hash__create_section_mapping(unsigned long start, unsigned long end,
int nid, pgprot_t prot)
{
int rc;
+ unsigned int pshift = mmu_psize_defs[mmu_linear_psize].shift;
if (end >= H_VMALLOC_START) {
pr_warn("Outside the supported range\n");
@@ -851,17 +1255,22 @@ int hash__create_section_mapping(unsigned long start, unsigned long end,
mmu_kernel_ssize);
BUG_ON(rc2 && (rc2 != -ENOENT));
}
+ update_page_count(mmu_linear_psize, (end - start) >> pshift);
return rc;
}
int hash__remove_section_mapping(unsigned long start, unsigned long end)
{
+ unsigned int pshift = mmu_psize_defs[mmu_linear_psize].shift;
+
int rc = htab_remove_mapping(start, end, mmu_linear_psize,
mmu_kernel_ssize);
if (resize_hpt_for_hotplug(memblock_phys_mem_size()) == -ENOSPC)
pr_warn("Hash collision while resizing HPT\n");
+ if (!rc)
+ update_page_count(mmu_linear_psize, -((end - start) >> pshift));
return rc;
}
#endif /* CONFIG_MEMORY_HOTPLUG */
@@ -880,25 +1289,64 @@ static void __init hash_init_partition_table(phys_addr_t hash_table,
pr_info("Partition table %p\n", partition_tb);
}
+void hpt_clear_stress(void);
+static struct timer_list stress_hpt_timer;
+static void stress_hpt_timer_fn(struct timer_list *timer)
+{
+ int next_cpu;
+
+ hpt_clear_stress();
+ if (!firmware_has_feature(FW_FEATURE_LPAR))
+ tlbiel_all();
+
+ next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
+ if (next_cpu >= nr_cpu_ids)
+ next_cpu = cpumask_first(cpu_online_mask);
+ stress_hpt_timer.expires = jiffies + msecs_to_jiffies(10);
+ add_timer_on(&stress_hpt_timer, next_cpu);
+}
+
static void __init htab_initialize(void)
{
unsigned long table;
unsigned long pteg_count;
unsigned long prot;
- phys_addr_t base = 0, size = 0, end;
+ phys_addr_t base = 0, size = 0, end, limit = MEMBLOCK_ALLOC_ANYWHERE;
u64 i;
+ unsigned int pshift = mmu_psize_defs[mmu_linear_psize].shift;
DBG(" -> htab_initialize()\n");
+ if (firmware_has_feature(FW_FEATURE_LPAR))
+ limit = ppc64_rma_size;
+
if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) {
mmu_kernel_ssize = MMU_SEGSIZE_1T;
mmu_highuser_ssize = MMU_SEGSIZE_1T;
- printk(KERN_INFO "Using 1TB segments\n");
+ pr_info("Using 1TB segments\n");
}
if (stress_slb_enabled)
static_branch_enable(&stress_slb_key);
+ if (no_slb_preload)
+ static_branch_enable(&no_slb_preload_key);
+
+ if (stress_hpt_enabled) {
+ unsigned long tmp;
+ static_branch_enable(&stress_hpt_key);
+ // Too early to use nr_cpu_ids, so use NR_CPUS
+ tmp = memblock_phys_alloc_range(sizeof(struct stress_hpt_struct) * NR_CPUS,
+ __alignof__(struct stress_hpt_struct),
+ MEMBLOCK_LOW_LIMIT, limit);
+ memset((void *)tmp, 0xff, sizeof(struct stress_hpt_struct) * NR_CPUS);
+ stress_hpt_struct = __va(tmp);
+
+ timer_setup(&stress_hpt_timer, stress_hpt_timer_fn, 0);
+ stress_hpt_timer.expires = jiffies + msecs_to_jiffies(10);
+ add_timer(&stress_hpt_timer);
+ }
+
/*
* Calculate the required size of the htab. We want the number of
* PTEGs to equal one half the number of real pages.
@@ -924,23 +1372,10 @@ static void __init htab_initialize(void)
mmu_hash_ops.hpte_clear_all();
#endif
} else {
- unsigned long limit = MEMBLOCK_ALLOC_ANYWHERE;
-
-#ifdef CONFIG_PPC_CELL
- /*
- * Cell may require the hash table down low when using the
- * Axon IOMMU in order to fit the dynamic region over it, see
- * comments in cell/iommu.c
- */
- if (fdt_subnode_offset(initial_boot_params, 0, "axon") > 0) {
- limit = 0x80000000;
- pr_info("Hash table forced below 2G for Axon IOMMU\n");
- }
-#endif /* CONFIG_PPC_CELL */
table = memblock_phys_alloc_range(htab_size_bytes,
htab_size_bytes,
- 0, limit);
+ MEMBLOCK_LOW_LIMIT, limit);
if (!table)
panic("ERROR: Failed to allocate %pa bytes below %pa\n",
&htab_size_bytes, &limit);
@@ -965,25 +1400,15 @@ static void __init htab_initialize(void)
prot = pgprot_val(PAGE_KERNEL);
-#ifdef CONFIG_DEBUG_PAGEALLOC
- if (debug_pagealloc_enabled()) {
- linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT;
- linear_map_hash_slots = memblock_alloc_try_nid(
- linear_map_hash_count, 1, MEMBLOCK_LOW_LIMIT,
- ppc64_rma_size, NUMA_NO_NODE);
- if (!linear_map_hash_slots)
- panic("%s: Failed to allocate %lu bytes max_addr=%pa\n",
- __func__, linear_map_hash_count, &ppc64_rma_size);
- }
-#endif /* CONFIG_DEBUG_PAGEALLOC */
-
+ hash_debug_pagealloc_alloc_slots();
+ hash_kfence_alloc_pool();
/* create bolted the linear mapping in the hash table */
for_each_mem_range(i, &base, &end) {
size = end - base;
base = (unsigned long)__va(base);
- DBG("creating mapping for region: %lx..%lx (prot: %lx)\n",
- base, size, prot);
+ pr_debug("creating mapping for region: 0x%pa..0x%pa (prot: %lx)\n",
+ &base, &size, prot);
if ((base + size) >= H_VMALLOC_START) {
pr_warn("Outside the supported range\n");
@@ -992,7 +1417,10 @@ static void __init htab_initialize(void)
BUG_ON(htab_bolt_mapping(base, base + size, __pa(base),
prot, mmu_linear_psize, mmu_kernel_ssize));
+
+ update_page_count(mmu_linear_psize, size >> pshift);
}
+ hash_kfence_map_pool();
memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
/*
@@ -1012,6 +1440,8 @@ static void __init htab_initialize(void)
BUG_ON(htab_bolt_mapping(tce_alloc_start, tce_alloc_end,
__pa(tce_alloc_start), prot,
mmu_linear_psize, mmu_kernel_ssize));
+ update_page_count(mmu_linear_psize,
+ (tce_alloc_end - tce_alloc_start) >> pshift);
}
@@ -1066,10 +1496,6 @@ void __init hash__early_init_mmu(void)
__pmd_table_size = H_PMD_TABLE_SIZE;
__pud_table_size = H_PUD_TABLE_SIZE;
__pgd_table_size = H_PGD_TABLE_SIZE;
- /*
- * 4k use hugepd format, so for hash set then to
- * zero
- */
__pmd_val_bits = HASH_PMD_VAL_BITS;
__pud_val_bits = HASH_PUD_VAL_BITS;
__pgd_val_bits = HASH_PGD_VAL_BITS;
@@ -1091,7 +1517,7 @@ void __init hash__early_init_mmu(void)
ps3_early_mm_init();
else if (firmware_has_feature(FW_FEATURE_LPAR))
hpte_init_pseries();
- else if (IS_ENABLED(CONFIG_PPC_NATIVE))
+ else if (IS_ENABLED(CONFIG_PPC_HASH_MMU_NATIVE))
hpte_init_native();
if (!mmu_hash_ops.hpte_insert)
@@ -1147,25 +1573,25 @@ void hash__early_init_mmu_secondary(void)
*/
unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
{
- struct page *page;
+ struct folio *folio;
if (!pfn_valid(pte_pfn(pte)))
return pp;
- page = pte_page(pte);
+ folio = page_folio(pte_page(pte));
/* page is dirty */
- if (!test_bit(PG_dcache_clean, &page->flags) && !PageReserved(page)) {
+ if (!test_bit(PG_dcache_clean, &folio->flags.f) &&
+ !folio_test_reserved(folio)) {
if (trap == INTERRUPT_INST_STORAGE) {
- flush_dcache_icache_page(page);
- set_bit(PG_dcache_clean, &page->flags);
+ flush_dcache_icache_folio(folio);
+ set_bit(PG_dcache_clean, &folio->flags.f);
} else
pp |= HPTE_R_N;
}
return pp;
}
-#ifdef CONFIG_PPC_MM_SLICES
static unsigned int get_paca_psize(unsigned long addr)
{
unsigned char *psizes;
@@ -1182,12 +1608,6 @@ static unsigned int get_paca_psize(unsigned long addr)
return (psizes[index >> 1] >> (mask_index * 4)) & 0xF;
}
-#else
-unsigned int get_paca_psize(unsigned long addr)
-{
- return get_paca()->mm_ctx_user_psize;
-}
-#endif
/*
* Demote a segment to using 4k pages.
@@ -1199,7 +1619,9 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr)
if (get_slice_psize(mm, addr) == MMU_PAGE_4K)
return;
slice_set_range_psize(mm, addr, 1, MMU_PAGE_4K);
- copro_flush_all_slbs(mm);
+#ifdef CONFIG_SPU_BASE
+ spu_flush_all_slbs(mm);
+#endif
if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) {
copy_mm_to_paca(mm);
@@ -1244,7 +1666,7 @@ static int subpage_protection(struct mm_struct *mm, unsigned long ea)
spp >>= 30 - 2 * ((ea >> 12) & 0xf);
/*
- * 0 -> full premission
+ * 0 -> full permission
* 1 -> Read only
* 2 -> no access.
* We return the flag that need to be cleared.
@@ -1385,6 +1807,13 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
goto bail;
}
+ if (IS_ENABLED(CONFIG_PPC_4K_PAGES) && !radix_enabled()) {
+ if (hugeshift == PMD_SHIFT && psize == MMU_PAGE_16M)
+ hugeshift = mmu_psize_defs[MMU_PAGE_16M].shift;
+ if (hugeshift == PUD_SHIFT && psize == MMU_PAGE_16G)
+ hugeshift = mmu_psize_defs[MMU_PAGE_16G].shift;
+ }
+
/*
* Add _PAGE_PRESENT to the required access perm. If there are parallel
* updates to the pte that can possibly clear _PAGE_PTE, catch that too.
@@ -1457,11 +1886,13 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
* in vmalloc space, so switch vmalloc
* to 4k pages
*/
- printk(KERN_ALERT "Reducing vmalloc segment "
+ pr_alert("Reducing vmalloc segment "
"to 4kB pages because of "
"non-cacheable mapping\n");
psize = mmu_vmalloc_psize = MMU_PAGE_4K;
- copro_flush_all_slbs(mm);
+#ifdef CONFIG_SPU_BASE
+ spu_flush_all_slbs(mm);
+#endif
}
}
@@ -1522,8 +1953,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap,
}
EXPORT_SYMBOL_GPL(hash_page);
-DECLARE_INTERRUPT_HANDLER(__do_hash_fault);
-DEFINE_INTERRUPT_HANDLER(__do_hash_fault)
+DEFINE_INTERRUPT_HANDLER(do_hash_fault)
{
unsigned long ea = regs->dar;
unsigned long dsisr = regs->dsisr;
@@ -1566,7 +1996,7 @@ DEFINE_INTERRUPT_HANDLER(__do_hash_fault)
err = hash_page_mm(mm, ea, access, TRAP(regs), flags);
if (unlikely(err < 0)) {
- // failed to instert a hash PTE due to an hypervisor error
+ // failed to insert a hash PTE due to an hypervisor error
if (user_mode(regs)) {
if (IS_ENABLED(CONFIG_PPC_SUBPAGE_PROT) && err == -2)
_exception(SIGSEGV, regs, SEGV_ACCERR, ea);
@@ -1582,36 +2012,6 @@ DEFINE_INTERRUPT_HANDLER(__do_hash_fault)
}
}
-/*
- * The _RAW interrupt entry checks for the in_nmi() case before
- * running the full handler.
- */
-DEFINE_INTERRUPT_HANDLER_RAW(do_hash_fault)
-{
- /*
- * If we are in an "NMI" (e.g., an interrupt when soft-disabled), then
- * don't call hash_page, just fail the fault. This is required to
- * prevent re-entrancy problems in the hash code, namely perf
- * interrupts hitting while something holds H_PAGE_BUSY, and taking a
- * hash fault. See the comment in hash_preload().
- *
- * We come here as a result of a DSI at a point where we don't want
- * to call hash_page, such as when we are accessing memory (possibly
- * user memory) inside a PMU interrupt that occurred while interrupts
- * were soft-disabled. We want to invoke the exception handler for
- * the access, or panic if there isn't a handler.
- */
- if (unlikely(in_nmi())) {
- do_bad_page_fault_segv(regs);
- return 0;
- }
-
- __do_hash_fault(regs);
-
- return 0;
-}
-
-#ifdef CONFIG_PPC_MM_SLICES
static bool should_hash_preload(struct mm_struct *mm, unsigned long ea)
{
int psize = get_slice_psize(mm, ea);
@@ -1628,12 +2028,6 @@ static bool should_hash_preload(struct mm_struct *mm, unsigned long ea)
return true;
}
-#else
-static bool should_hash_preload(struct mm_struct *mm, unsigned long ea)
-{
- return true;
-}
-#endif
static void hash_preload(struct mm_struct *mm, pte_t *ptep, unsigned long ea,
bool is_exec, unsigned long trap)
@@ -1677,26 +2071,18 @@ static void hash_preload(struct mm_struct *mm, pte_t *ptep, unsigned long ea,
#endif /* CONFIG_PPC_64K_PAGES */
/*
- * __hash_page_* must run with interrupts off, as it sets the
- * H_PAGE_BUSY bit. It's possible for perf interrupts to hit at any
- * time and may take a hash fault reading the user stack, see
- * read_user_stack_slow() in the powerpc/perf code.
+ * __hash_page_* must run with interrupts off, including PMI interrupts
+ * off, as it sets the H_PAGE_BUSY bit.
*
- * If that takes a hash fault on the same page as we lock here, it
- * will bail out when seeing H_PAGE_BUSY set, and retry the access
- * leading to an infinite loop.
- *
- * Disabling interrupts here does not prevent perf interrupts, but it
- * will prevent them taking hash faults (see the NMI test in
- * do_hash_page), then read_user_stack's copy_from_user_nofault will
- * fail and perf will fall back to read_user_stack_slow(), which
- * walks the Linux page tables.
+ * It's otherwise possible for perf interrupts to hit at any time and
+ * may take a hash fault reading the user stack, which could take a
+ * hash miss and deadlock on the same H_PAGE_BUSY bit.
*
* Interrupts must also be off for the duration of the
* mm_is_thread_local test and update, to prevent preempt running the
* mm on another CPU (XXX: this may be racy vs kthread_use_mm).
*/
- local_irq_save(flags);
+ powerpc_local_irq_pmu_save(flags);
/* Is that local to this CPU ? */
if (mm_is_thread_local(mm))
@@ -1721,7 +2107,7 @@ static void hash_preload(struct mm_struct *mm, pte_t *ptep, unsigned long ea,
mm_ctx_user_psize(&mm->context),
pte_val(*ptep));
- local_irq_restore(flags);
+ powerpc_local_irq_pmu_restore(flags);
}
/*
@@ -1732,7 +2118,7 @@ static void hash_preload(struct mm_struct *mm, pte_t *ptep, unsigned long ea,
*
* This must always be called with the pte lock held.
*/
-void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
+void __update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
pte_t *ptep)
{
/*
@@ -1742,9 +2128,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
unsigned long trap;
bool is_exec;
- if (radix_enabled())
- return;
-
/* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
if (!pte_young(*ptep) || address >= TASK_SIZE)
return;
@@ -1941,72 +2324,68 @@ repeat:
return slot;
}
-#ifdef CONFIG_DEBUG_PAGEALLOC
-static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
+void hpt_clear_stress(void)
{
- unsigned long hash;
- unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
- unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
- unsigned long mode = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL), HPTE_USE_KERNEL_KEY);
- long ret;
-
- hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
-
- /* Don't create HPTE entries for bad address */
- if (!vsid)
- return;
-
- ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode,
- HPTE_V_BOLTED,
- mmu_linear_psize, mmu_kernel_ssize);
-
- BUG_ON (ret < 0);
- spin_lock(&linear_map_hash_lock);
- BUG_ON(linear_map_hash_slots[lmi] & 0x80);
- linear_map_hash_slots[lmi] = ret | 0x80;
- spin_unlock(&linear_map_hash_lock);
+ int cpu = raw_smp_processor_id();
+ int g;
+
+ for (g = 0; g < stress_nr_groups(); g++) {
+ unsigned long last_group;
+ last_group = stress_hpt_struct[cpu].last_group[g];
+
+ if (last_group != -1UL) {
+ int i;
+ for (i = 0; i < HPTES_PER_GROUP; i++) {
+ if (mmu_hash_ops.hpte_remove(last_group) == -1)
+ break;
+ }
+ stress_hpt_struct[cpu].last_group[g] = -1;
+ }
+ }
}
-static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
+void hpt_do_stress(unsigned long ea, unsigned long hpte_group)
{
- unsigned long hash, hidx, slot;
- unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
- unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
+ unsigned long last_group;
+ int cpu = raw_smp_processor_id();
- hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
- spin_lock(&linear_map_hash_lock);
- BUG_ON(!(linear_map_hash_slots[lmi] & 0x80));
- hidx = linear_map_hash_slots[lmi] & 0x7f;
- linear_map_hash_slots[lmi] = 0;
- spin_unlock(&linear_map_hash_lock);
- if (hidx & _PTEIDX_SECONDARY)
- hash = ~hash;
- slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
- slot += hidx & _PTEIDX_GROUP_IX;
- mmu_hash_ops.hpte_invalidate(slot, vpn, mmu_linear_psize,
- mmu_linear_psize,
- mmu_kernel_ssize, 0);
-}
+ last_group = stress_hpt_struct[cpu].last_group[stress_nr_groups() - 1];
+ if (hpte_group == last_group)
+ return;
-void __kernel_map_pages(struct page *page, int numpages, int enable)
-{
- unsigned long flags, vaddr, lmi;
- int i;
+ if (last_group != -1UL) {
+ int i;
+ /*
+ * Concurrent CPUs might be inserting into this group, so
+ * give up after a number of iterations, to prevent a live
+ * lock.
+ */
+ for (i = 0; i < HPTES_PER_GROUP; i++) {
+ if (mmu_hash_ops.hpte_remove(last_group) == -1)
+ break;
+ }
+ stress_hpt_struct[cpu].last_group[stress_nr_groups() - 1] = -1;
+ }
- local_irq_save(flags);
- for (i = 0; i < numpages; i++, page++) {
- vaddr = (unsigned long)page_address(page);
- lmi = __pa(vaddr) >> PAGE_SHIFT;
- if (lmi >= linear_map_hash_count)
- continue;
- if (enable)
- kernel_map_linear_page(vaddr, lmi);
- else
- kernel_unmap_linear_page(vaddr, lmi);
+ if (ea >= PAGE_OFFSET) {
+ /*
+ * We would really like to prefetch to get the TLB loaded, then
+ * remove the PTE before returning from fault interrupt, to
+ * increase the hash fault rate.
+ *
+ * Unfortunately QEMU TCG does not model the TLB in a way that
+ * makes this possible, and systemsim (mambo) emulator does not
+ * bring in TLBs with prefetches (although loads/stores do
+ * work for non-CI PTEs).
+ *
+ * So remember this PTE and clear it on the next hash fault.
+ */
+ memmove(&stress_hpt_struct[cpu].last_group[1],
+ &stress_hpt_struct[cpu].last_group[0],
+ (stress_nr_groups() - 1) * sizeof(unsigned long));
+ stress_hpt_struct[cpu].last_group[0] = hpte_group;
}
- local_irq_restore(flags);
}
-#endif /* CONFIG_DEBUG_PAGEALLOC */
void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base,
phys_addr_t first_memblock_size)
@@ -2072,7 +2451,9 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n")
static int __init hash64_debugfs(void)
{
- debugfs_create_file("hpt_order", 0600, powerpc_debugfs_root, NULL,
+ if (radix_enabled())
+ return 0;
+ debugfs_create_file("hpt_order", 0600, arch_debugfs_dir, NULL,
&fops_hpt_order);
return 0;
}
@@ -2086,3 +2467,20 @@ void __init print_system_hash_info(void)
if (htab_hash_mask)
pr_info("htab_hash_mask = 0x%lx\n", htab_hash_mask);
}
+
+unsigned long arch_randomize_brk(struct mm_struct *mm)
+{
+ /*
+ * If we are using 1TB segments and we are allowed to randomise
+ * the heap, we can put it above 1TB so it is backed by a 1TB
+ * segment. Otherwise the heap will be in the bottom 1TB
+ * which always uses 256MB segments and this may result in a
+ * performance penalty.
+ */
+ if (is_32bit_task())
+ return randomize_page(mm->brk, SZ_32M);
+ else if (!radix_enabled() && mmu_highuser_ssize == MMU_SEGSIZE_1T)
+ return randomize_page(max_t(unsigned long, mm->brk, SZ_1T), SZ_1G);
+ else
+ return randomize_page(mm->brk, SZ_1G);
+}
diff --git a/arch/powerpc/mm/book3s64/hash_hugetlbpage.c b/arch/powerpc/mm/book3s64/hugetlbpage.c
index a688e1324ae5..2bcbbf9d85ac 100644
--- a/arch/powerpc/mm/book3s64/hash_hugetlbpage.c
+++ b/arch/powerpc/mm/book3s64/hugetlbpage.c
@@ -16,6 +16,7 @@
unsigned int hpage_shift;
EXPORT_SYMBOL(hpage_shift);
+#ifdef CONFIG_PPC_64S_HASH_MMU
int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
pte_t *ptep, unsigned long trap, unsigned long flags,
int ssize, unsigned int shift, unsigned int mmu_psize)
@@ -52,6 +53,16 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
/* If PTE permissions don't match, take page fault */
if (unlikely(!check_pte_access(access, old_pte)))
return 1;
+ /*
+ * If hash-4k, hugepages use seeral contiguous PxD entries
+ * so bail out and let mm make the page young or dirty
+ */
+ if (IS_ENABLED(CONFIG_PPC_4K_PAGES)) {
+ if (!(old_pte & _PAGE_ACCESSED))
+ return 1;
+ if ((access & _PAGE_WRITE) && !(old_pte & _PAGE_DIRTY))
+ return 1;
+ }
/*
* Try to lock the PTE, add ACCESSED and DIRTY if it was
@@ -63,7 +74,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
} while(!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
/* Make sure this is a hugetlb entry */
- if (old_pte & (H_PAGE_THP_HUGE | _PAGE_DEVMAP))
+ if (old_pte & H_PAGE_THP_HUGE)
return 0;
rflags = htab_convert_pte_flags(new_pte, flags);
@@ -122,6 +133,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
*ptep = __pte(new_pte & ~H_PAGE_BUSY);
return 0;
}
+#endif
pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
@@ -141,14 +153,17 @@ pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma,
void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
pte_t *ptep, pte_t old_pte, pte_t pte)
{
+ unsigned long psize;
if (radix_enabled())
return radix__huge_ptep_modify_prot_commit(vma, addr, ptep,
old_pte, pte);
- set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
+
+ psize = huge_page_size(hstate_vma(vma));
+ set_huge_pte_at(vma->vm_mm, addr, ptep, pte, psize);
}
-void hugetlbpage_init_default(void)
+void __init hugetlbpage_init_defaultsize(void)
{
/* Set default large page size. Currently, we pick 16M or 1M
* depending on what is available
diff --git a/arch/powerpc/mm/book3s64/internal.h b/arch/powerpc/mm/book3s64/internal.h
index 5045048ce244..cad08d83369c 100644
--- a/arch/powerpc/mm/book3s64/internal.h
+++ b/arch/powerpc/mm/book3s64/internal.h
@@ -13,7 +13,23 @@ static inline bool stress_slb(void)
return static_branch_unlikely(&stress_slb_key);
}
-void slb_setup_new_exec(void);
+extern bool stress_hpt_enabled;
+
+DECLARE_STATIC_KEY_FALSE(stress_hpt_key);
+
+static inline bool stress_hpt(void)
+{
+ return static_branch_unlikely(&stress_hpt_key);
+}
+
+extern bool no_slb_preload;
+DECLARE_STATIC_KEY_FALSE(no_slb_preload_key);
+static inline bool slb_preload_disabled(void)
+{
+ return static_branch_unlikely(&no_slb_preload_key);
+}
+
+void hpt_do_stress(unsigned long ea, unsigned long hpte_group);
void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush);
diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c
index cd18e94d0843..c0e8d597e4cb 100644
--- a/arch/powerpc/mm/book3s64/iommu_api.c
+++ b/arch/powerpc/mm/book3s64/iommu_api.c
@@ -97,7 +97,7 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
}
mmap_read_lock(mm);
- chunk = (1UL << (PAGE_SHIFT + MAX_ORDER - 1)) /
+ chunk = (1UL << (PAGE_SHIFT + MAX_PAGE_ORDER)) /
sizeof(struct vm_area_struct *);
chunk = min(chunk, entries);
for (entry = 0; entry < entries; entry += chunk) {
@@ -105,7 +105,7 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
ret = pin_user_pages(ua + (entry << PAGE_SHIFT), n,
FOLL_WRITE | FOLL_LONGTERM,
- mem->hpages + entry, NULL);
+ mem->hpages + entry);
if (ret == n) {
pinned += n;
continue;
@@ -305,24 +305,6 @@ struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm,
}
EXPORT_SYMBOL_GPL(mm_iommu_lookup);
-struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(struct mm_struct *mm,
- unsigned long ua, unsigned long size)
-{
- struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
-
- list_for_each_entry_lockless(mem, &mm->context.iommu_group_mem_list,
- next) {
- if ((mem->ua <= ua) &&
- (ua + size <= mem->ua +
- (mem->entries << PAGE_SHIFT))) {
- ret = mem;
- break;
- }
- }
-
- return ret;
-}
-
struct mm_iommu_table_group_mem_t *mm_iommu_get(struct mm_struct *mm,
unsigned long ua, unsigned long entries)
{
@@ -369,56 +351,6 @@ long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
}
EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa);
-long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
- unsigned long ua, unsigned int pageshift, unsigned long *hpa)
-{
- const long entry = (ua - mem->ua) >> PAGE_SHIFT;
- unsigned long *pa;
-
- if (entry >= mem->entries)
- return -EFAULT;
-
- if (pageshift > mem->pageshift)
- return -EFAULT;
-
- if (!mem->hpas) {
- *hpa = mem->dev_hpa + (ua - mem->ua);
- return 0;
- }
-
- pa = (void *) vmalloc_to_phys(&mem->hpas[entry]);
- if (!pa)
- return -EFAULT;
-
- *hpa = (*pa & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK);
-
- return 0;
-}
-
-extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua)
-{
- struct mm_iommu_table_group_mem_t *mem;
- long entry;
- void *va;
- unsigned long *pa;
-
- mem = mm_iommu_lookup_rm(mm, ua, PAGE_SIZE);
- if (!mem)
- return;
-
- if (mem->dev_hpa != MM_IOMMU_TABLE_INVALID_HPA)
- return;
-
- entry = (ua - mem->ua) >> PAGE_SHIFT;
- va = &mem->hpas[entry];
-
- pa = (void *) vmalloc_to_phys(va);
- if (!pa)
- return;
-
- *pa |= MM_IOMMU_TABLE_GROUP_PAGE_DIRTY;
-}
-
bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa,
unsigned int pageshift, unsigned long *size)
{
diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c
index c10fc8a72fb3..fb9dcf9ca599 100644
--- a/arch/powerpc/mm/book3s64/mmu_context.c
+++ b/arch/powerpc/mm/book3s64/mmu_context.c
@@ -31,7 +31,8 @@ static int alloc_context_id(int min_id, int max_id)
return ida_alloc_range(&mmu_context_ida, min_id, max_id, GFP_KERNEL);
}
-void hash__reserve_context_id(int id)
+#ifdef CONFIG_PPC_64S_HASH_MMU
+void __init hash__reserve_context_id(int id)
{
int result = ida_alloc_range(&mmu_context_ida, id, id, GFP_KERNEL);
@@ -50,7 +51,9 @@ int hash__alloc_context_id(void)
return alloc_context_id(MIN_USER_CONTEXT, max);
}
EXPORT_SYMBOL_GPL(hash__alloc_context_id);
+#endif
+#ifdef CONFIG_PPC_64S_HASH_MMU
static int realloc_context_ids(mm_context_t *ctx)
{
int i, id;
@@ -147,9 +150,14 @@ static int hash__init_new_context(struct mm_struct *mm)
void hash__setup_new_exec(void)
{
slice_setup_new_exec();
-
- slb_setup_new_exec();
}
+#else
+static inline int hash__init_new_context(struct mm_struct *mm)
+{
+ BUILD_BUG();
+ return 0;
+}
+#endif
static int radix__init_new_context(struct mm_struct *mm)
{
@@ -175,7 +183,9 @@ static int radix__init_new_context(struct mm_struct *mm)
*/
asm volatile("ptesync;isync" : : : "memory");
+#ifdef CONFIG_PPC_64S_HASH_MMU
mm->context.hash_context = NULL;
+#endif
return index;
}
@@ -213,28 +223,36 @@ EXPORT_SYMBOL_GPL(__destroy_context);
static void destroy_contexts(mm_context_t *ctx)
{
- int index, context_id;
+ if (radix_enabled()) {
+ ida_free(&mmu_context_ida, ctx->id);
+ } else {
+#ifdef CONFIG_PPC_64S_HASH_MMU
+ int index, context_id;
- for (index = 0; index < ARRAY_SIZE(ctx->extended_id); index++) {
- context_id = ctx->extended_id[index];
- if (context_id)
- ida_free(&mmu_context_ida, context_id);
+ for (index = 0; index < ARRAY_SIZE(ctx->extended_id); index++) {
+ context_id = ctx->extended_id[index];
+ if (context_id)
+ ida_free(&mmu_context_ida, context_id);
+ }
+ kfree(ctx->hash_context);
+#else
+ BUILD_BUG(); // radix_enabled() should be constant true
+#endif
}
- kfree(ctx->hash_context);
}
static void pmd_frag_destroy(void *pmd_frag)
{
int count;
- struct page *page;
+ struct ptdesc *ptdesc;
- page = virt_to_page(pmd_frag);
+ ptdesc = virt_to_ptdesc(pmd_frag);
/* drop all the pending references */
count = ((unsigned long)pmd_frag & ~PAGE_MASK) >> PMD_FRAG_SIZE_SHIFT;
/* We allow PTE_FRAG_NR fragments from a PTE page */
- if (atomic_sub_and_test(PMD_FRAG_NR - count, &page->pt_frag_refcount)) {
- pgtable_pmd_page_dtor(page);
- __free_page(page);
+ if (atomic_sub_and_test(PMD_FRAG_NR - count, &ptdesc->pt_frag_refcount)) {
+ pagetable_dtor(ptdesc);
+ pagetable_free(ptdesc);
}
}
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index 9ffa65074cb0..e3485db7de02 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -6,9 +6,11 @@
#include <linux/sched.h>
#include <linux/mm_types.h>
#include <linux/memblock.h>
-#include <misc/cxl-base.h>
+#include <linux/memremap.h>
+#include <linux/pkeys.h>
+#include <linux/debugfs.h>
+#include <linux/proc_fs.h>
-#include <asm/debugfs.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/trace.h>
@@ -22,11 +24,31 @@
#include "internal.h"
+struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
+EXPORT_SYMBOL_GPL(mmu_psize_defs);
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+int mmu_vmemmap_psize = MMU_PAGE_4K;
+#endif
+
unsigned long __pmd_frag_nr;
EXPORT_SYMBOL(__pmd_frag_nr);
unsigned long __pmd_frag_size_shift;
EXPORT_SYMBOL(__pmd_frag_size_shift);
+#ifdef CONFIG_KFENCE
+extern bool kfence_early_init;
+static int __init parse_kfence_early_init(char *arg)
+{
+ int val;
+
+ if (get_option(&arg, &val))
+ kfence_early_init = !!val;
+ return 0;
+}
+early_param("kfence.sample_interval", parse_kfence_early_init);
+#endif
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
* This is called when relaxing access to a hugepage. It's also called in the page
@@ -40,7 +62,7 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
{
int changed;
#ifdef CONFIG_DEBUG_VM
- WARN_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
+ WARN_ON(!pmd_trans_huge(*pmdp));
assert_spin_locked(pmd_lockptr(vma->vm_mm, pmdp));
#endif
changed = !pmd_same(*(pmdp), entry);
@@ -55,11 +77,38 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
return changed;
}
+int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
+ pud_t *pudp, pud_t entry, int dirty)
+{
+ int changed;
+#ifdef CONFIG_DEBUG_VM
+ assert_spin_locked(pud_lockptr(vma->vm_mm, pudp));
+#endif
+ changed = !pud_same(*(pudp), entry);
+ if (changed) {
+ /*
+ * We can use MMU_PAGE_1G here, because only radix
+ * path look at the psize.
+ */
+ __ptep_set_access_flags(vma, pudp_ptep(pudp),
+ pud_pte(entry), address, MMU_PAGE_1G);
+ }
+ return changed;
+}
+
+
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp)
{
return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
}
+
+int pudp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long address, pud_t *pudp)
+{
+ return __pudp_test_and_clear_young(vma->vm_mm, address, pudp);
+}
+
/*
* set a new huge pmd. We should not be called for updating
* an existing pmd entry. That should go via pmd_hugepage_update.
@@ -75,12 +124,29 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
WARN_ON(pte_hw_valid(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp)));
assert_spin_locked(pmd_lockptr(mm, pmdp));
- WARN_ON(!(pmd_large(pmd)));
+ WARN_ON(!(pmd_leaf(pmd)));
#endif
trace_hugepage_set_pmd(addr, pmd_val(pmd));
return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
}
+void set_pud_at(struct mm_struct *mm, unsigned long addr,
+ pud_t *pudp, pud_t pud)
+{
+#ifdef CONFIG_DEBUG_VM
+ /*
+ * Make sure hardware valid bit is not set. We don't do
+ * tlb flush for this update.
+ */
+
+ WARN_ON(pte_hw_valid(pud_pte(*pudp)));
+ assert_spin_locked(pud_lockptr(mm, pudp));
+ WARN_ON(!(pud_leaf(pud)));
+#endif
+ trace_hugepage_set_pud(addr, pud_val(pud));
+ return set_pte_at(mm, addr, pudp_ptep(pudp), pud_pte(pud));
+}
+
static void do_serialize(void *arg)
{
/* We've taken the IPI, so try to trim the mask while here */
@@ -91,14 +157,14 @@ static void do_serialize(void *arg)
}
/*
- * Serialize against find_current_mm_pte which does lock-less
+ * Serialize against __find_linux_pte() which does lock-less
* lookup in page tables with local interrupts disabled. For huge pages
* it casts pmd_t to pte_t. Since format of pte_t is different from
* pmd_t we want to prevent transit from pmd pointing to page table
* to pmd pointing to huge page (and back) while interrupts are disabled.
* We clear pmd to possibly replace it with page table pointer in
* different code paths. So make sure we wait for the parallel
- * find_current_mm_pte to finish.
+ * __find_linux_pte() to finish.
*/
void serialize_against_pte_lookup(struct mm_struct *mm)
{
@@ -115,18 +181,30 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
{
unsigned long old_pmd;
+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID);
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return __pmd(old_pmd);
}
+pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address,
+ pud_t *pudp)
+{
+ unsigned long old_pud;
+
+ VM_WARN_ON_ONCE(!pud_present(*pudp));
+ old_pud = pud_hugepage_update(vma->vm_mm, address, pudp, _PAGE_PRESENT, _PAGE_INVALID);
+ flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE);
+ return __pud(old_pud);
+}
+
pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp, int full)
{
pmd_t pmd;
VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
- VM_BUG_ON((pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
- !pmd_devmap(*pmdp)) || !pmd_present(*pmdp));
+ VM_BUG_ON((pmd_present(*pmdp) && !pmd_trans_huge(*pmdp)) ||
+ !pmd_present(*pmdp));
pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp);
/*
* if it not a fullmm flush, then we can possibly end up converting
@@ -138,11 +216,34 @@ pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
return pmd;
}
+pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma,
+ unsigned long addr, pud_t *pudp, int full)
+{
+ pud_t pud;
+
+ VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
+ VM_BUG_ON(!pud_present(*pudp));
+ pud = pudp_huge_get_and_clear(vma->vm_mm, addr, pudp);
+ /*
+ * if it not a fullmm flush, then we can possibly end up converting
+ * this PMD pte entry to a regular level 0 PTE by a parallel page fault.
+ * Make sure we flush the tlb in this case.
+ */
+ if (!full)
+ flush_pud_tlb_range(vma, addr, addr + HPAGE_PUD_SIZE);
+ return pud;
+}
+
static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
{
return __pmd(pmd_val(pmd) | pgprot_val(pgprot));
}
+static pud_t pud_set_protbits(pud_t pud, pgprot_t pgprot)
+{
+ return __pud(pud_val(pud) | pgprot_val(pgprot));
+}
+
/*
* At some point we should be able to get rid of
* pmd_mkhuge() and mk_huge_pmd() when we update all the
@@ -157,9 +258,13 @@ pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
return __pmd_mkhuge(pmd_set_protbits(__pmd(pmdv), pgprot));
}
-pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
+pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot)
{
- return pfn_pmd(page_to_pfn(page), pgprot);
+ unsigned long pudv;
+
+ pudv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK;
+
+ return __pud_mkhuge(pud_set_protbits(__pud(pudv), pgprot));
}
pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
@@ -170,10 +275,19 @@ pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
pmdv &= _HPAGE_CHG_MASK;
return pmd_set_protbits(__pmd(pmdv), newprot);
}
+
+pud_t pud_modify(pud_t pud, pgprot_t newprot)
+{
+ unsigned long pudv;
+
+ pudv = pud_val(pud);
+ pudv &= _HPAGE_CHG_MASK;
+ return pud_set_protbits(__pud(pudv), newprot);
+}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-/* For use by kexec */
-void mmu_cleanup_all(void)
+/* For use by kexec, called with MMU off */
+notrace void mmu_cleanup_all(void)
{
if (radix_enabled())
radix__mmu_cleanup_all();
@@ -207,17 +321,8 @@ void __init mmu_partition_table_init(void)
unsigned long patb_size = 1UL << PATB_SIZE_SHIFT;
unsigned long ptcr;
- BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 36), "Partition table size too large.");
/* Initialize the Partition Table with no entries */
- partition_tb = memblock_alloc(patb_size, patb_size);
- if (!partition_tb)
- panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
- __func__, patb_size, patb_size);
-
- /*
- * update partition table control register,
- * 64 K size.
- */
+ partition_tb = memblock_alloc_or_panic(patb_size, patb_size);
ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12);
set_ptcr_when_no_uv(ptcr);
powernv_set_nmmu_ptcr(ptcr);
@@ -302,22 +407,22 @@ static pmd_t *get_pmd_from_cache(struct mm_struct *mm)
static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm)
{
void *ret = NULL;
- struct page *page;
+ struct ptdesc *ptdesc;
gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO;
if (mm == &init_mm)
gfp &= ~__GFP_ACCOUNT;
- page = alloc_page(gfp);
- if (!page)
+ ptdesc = pagetable_alloc(gfp, 0);
+ if (!ptdesc)
return NULL;
- if (!pgtable_pmd_page_ctor(page)) {
- __free_pages(page, 0);
+ if (!pagetable_pmd_ctor(mm, ptdesc)) {
+ pagetable_free(ptdesc);
return NULL;
}
- atomic_set(&page->pt_frag_refcount, 1);
+ atomic_set(&ptdesc->pt_frag_refcount, 1);
- ret = page_address(page);
+ ret = ptdesc_address(ptdesc);
/*
* if we support only one fragment just return the
* allocated page.
@@ -327,12 +432,12 @@ static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm)
spin_lock(&mm->page_table_lock);
/*
- * If we find pgtable_page set, we return
- * the allocated page with single fragement
+ * If we find ptdesc_page set, we return
+ * the allocated page with single fragment
* count.
*/
if (likely(!mm->context.pmd_frag)) {
- atomic_set(&page->pt_frag_refcount, PMD_FRAG_NR);
+ atomic_set(&ptdesc->pt_frag_refcount, PMD_FRAG_NR);
mm->context.pmd_frag = ret + PMD_FRAG_SIZE;
}
spin_unlock(&mm->page_table_lock);
@@ -353,15 +458,15 @@ pmd_t *pmd_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr)
void pmd_fragment_free(unsigned long *pmd)
{
- struct page *page = virt_to_page(pmd);
+ struct ptdesc *ptdesc = virt_to_ptdesc(pmd);
- if (PageReserved(page))
- return free_reserved_page(page);
+ if (pagetable_is_reserved(ptdesc))
+ return free_reserved_ptdesc(ptdesc);
- BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
- if (atomic_dec_and_test(&page->pt_frag_refcount)) {
- pgtable_pmd_page_dtor(page);
- __free_page(page);
+ BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0);
+ if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) {
+ pagetable_dtor(ptdesc);
+ pagetable_free(ptdesc);
}
}
@@ -377,18 +482,6 @@ static inline void pgtable_free(void *table, int index)
case PUD_INDEX:
__pud_free(table);
break;
-#if defined(CONFIG_PPC_4K_PAGES) && defined(CONFIG_HUGETLB_PAGE)
- /* 16M hugepd directory at pud level */
- case HTLB_16M_INDEX:
- BUILD_BUG_ON(H_16M_CACHE_INDEX <= 0);
- kmem_cache_free(PGT_CACHE(H_16M_CACHE_INDEX), table);
- break;
- /* 16G hugepd directory at the pgd level */
- case HTLB_16G_INDEX:
- BUILD_BUG_ON(H_16G_CACHE_INDEX <= 0);
- kmem_cache_free(PGT_CACHE(H_16G_CACHE_INDEX), table);
- break;
-#endif
/* We don't free pgd table via RCU callback */
default:
BUG();
@@ -417,20 +510,21 @@ atomic_long_t direct_pages_count[MMU_PAGE_COUNT];
void arch_report_meminfo(struct seq_file *m)
{
- /*
- * Hash maps the memory with one size mmu_linear_psize.
- * So don't bother to print these on hash
- */
- if (!radix_enabled())
- return;
seq_printf(m, "DirectMap4k: %8lu kB\n",
atomic_long_read(&direct_pages_count[MMU_PAGE_4K]) << 2);
- seq_printf(m, "DirectMap64k: %8lu kB\n",
+ seq_printf(m, "DirectMap64k: %8lu kB\n",
atomic_long_read(&direct_pages_count[MMU_PAGE_64K]) << 6);
- seq_printf(m, "DirectMap2M: %8lu kB\n",
- atomic_long_read(&direct_pages_count[MMU_PAGE_2M]) << 11);
- seq_printf(m, "DirectMap1G: %8lu kB\n",
- atomic_long_read(&direct_pages_count[MMU_PAGE_1G]) << 20);
+ if (radix_enabled()) {
+ seq_printf(m, "DirectMap2M: %8lu kB\n",
+ atomic_long_read(&direct_pages_count[MMU_PAGE_2M]) << 11);
+ seq_printf(m, "DirectMap1G: %8lu kB\n",
+ atomic_long_read(&direct_pages_count[MMU_PAGE_1G]) << 20);
+ } else {
+ seq_printf(m, "DirectMap16M: %8lu kB\n",
+ atomic_long_read(&direct_pages_count[MMU_PAGE_16M]) << 14);
+ seq_printf(m, "DirectMap16G: %8lu kB\n",
+ atomic_long_read(&direct_pages_count[MMU_PAGE_16G]) << 24);
+ }
}
#endif /* CONFIG_PROC_FS */
@@ -459,6 +553,7 @@ void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
set_pte_at(vma->vm_mm, addr, ptep, pte);
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
* For hash translation mode, we use the deposited table to store hash slot
* information and they are stored at PTRS_PER_PMD offset from related pmd
@@ -480,11 +575,12 @@ int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
return true;
}
+#endif
/*
* Does the CPU support tlbie?
*/
-bool tlbie_capable __read_mostly = true;
+bool tlbie_capable __read_mostly = IS_ENABLED(CONFIG_PPC_RADIX_BROADCAST_TLBIE);
EXPORT_SYMBOL(tlbie_capable);
/*
@@ -492,7 +588,7 @@ EXPORT_SYMBOL(tlbie_capable);
* address spaces? tlbie may still be used for nMMU accelerators, and for KVM
* guest address spaces.
*/
-bool tlbie_enabled __read_mostly = true;
+bool tlbie_enabled __read_mostly = IS_ENABLED(CONFIG_PPC_RADIX_BROADCAST_TLBIE);
static int __init setup_disable_tlbie(char *str)
{
@@ -520,9 +616,50 @@ static int __init pgtable_debugfs_setup(void)
* invalidated as expected.
*/
debugfs_create_bool("tlbie_enabled", 0600,
- powerpc_debugfs_root,
+ arch_debugfs_dir,
&tlbie_enabled);
return 0;
}
arch_initcall(pgtable_debugfs_setup);
+
+#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_HAS_MEMREMAP_COMPAT_ALIGN)
+/*
+ * Override the generic version in mm/memremap.c.
+ *
+ * With hash translation, the direct-map range is mapped with just one
+ * page size selected by htab_init_page_sizes(). Consult
+ * mmu_psize_defs[] to determine the minimum page size alignment.
+*/
+unsigned long memremap_compat_align(void)
+{
+ if (!radix_enabled()) {
+ unsigned int shift = mmu_psize_defs[mmu_linear_psize].shift;
+ return max(SUBSECTION_SIZE, 1UL << shift);
+ }
+
+ return SUBSECTION_SIZE;
+}
+EXPORT_SYMBOL_GPL(memremap_compat_align);
+#endif
+
+pgprot_t vm_get_page_prot(vm_flags_t vm_flags)
+{
+ unsigned long prot;
+
+ /* Radix supports execute-only, but protection_map maps X -> RX */
+ if (!radix_enabled() && ((vm_flags & VM_ACCESS_FLAGS) == VM_EXEC))
+ vm_flags |= VM_READ;
+
+ prot = pgprot_val(protection_map[vm_flags & (VM_ACCESS_FLAGS | VM_SHARED)]);
+
+ if (vm_flags & VM_SAO)
+ prot |= _PAGE_SAO;
+
+#ifdef CONFIG_PPC_MEM_KEYS
+ prot |= vmflag_to_pte_pkey_bits(vm_flags);
+#endif
+
+ return __pgprot(prot);
+}
+EXPORT_SYMBOL(vm_get_page_prot);
diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c
index a2d9ad138709..a974baf8f327 100644
--- a/arch/powerpc/mm/book3s64/pkeys.c
+++ b/arch/powerpc/mm/book3s64/pkeys.c
@@ -10,6 +10,7 @@
#include <asm/mmu.h>
#include <asm/setup.h>
#include <asm/smp.h>
+#include <asm/firmware.h>
#include <linux/pkeys.h>
#include <linux/of_fdt.h>
@@ -66,7 +67,7 @@ static int __init dt_scan_storage_keys(unsigned long node,
return 1;
}
-static int scan_pkey_feature(void)
+static int __init scan_pkey_feature(void)
{
int ret;
int pkeys_total = 0;
@@ -88,7 +89,8 @@ static int scan_pkey_feature(void)
unsigned long pvr = mfspr(SPRN_PVR);
if (PVR_VER(pvr) == PVR_POWER8 || PVR_VER(pvr) == PVR_POWER8E ||
- PVR_VER(pvr) == PVR_POWER8NVL || PVR_VER(pvr) == PVR_POWER9)
+ PVR_VER(pvr) == PVR_POWER8NVL || PVR_VER(pvr) == PVR_POWER9 ||
+ PVR_VER(pvr) == PVR_HX_C2000)
pkeys_total = 32;
}
}
@@ -290,7 +292,7 @@ void setup_kuap(bool disabled)
if (smp_processor_id() == boot_cpuid) {
pr_info("Activating Kernel Userspace Access Prevention\n");
- cur_cpu_spec->mmu_features |= MMU_FTR_BOOK3S_KUAP;
+ cur_cpu_spec->mmu_features |= MMU_FTR_KUAP;
}
/*
diff --git a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
index 23d3e08911d3..35fd2a95be24 100644
--- a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
+++ b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
@@ -39,61 +39,7 @@ void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long st
radix__flush_tlb_pwc_range_psize(vma->vm_mm, start, end, psize);
else
radix__flush_tlb_range_psize(vma->vm_mm, start, end, psize);
-}
-
-/*
- * A vairant of hugetlb_get_unmapped_area doing topdown search
- * FIXME!! should we do as x86 does or non hugetlb area does ?
- * ie, use topdown or not based on mmap_is_legacy check ?
- */
-unsigned long
-radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
- unsigned long len, unsigned long pgoff,
- unsigned long flags)
-{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- struct hstate *h = hstate_file(file);
- int fixed = (flags & MAP_FIXED);
- unsigned long high_limit;
- struct vm_unmapped_area_info info;
-
- high_limit = DEFAULT_MAP_WINDOW;
- if (addr >= high_limit || (fixed && (addr + len > high_limit)))
- high_limit = TASK_SIZE;
-
- if (len & ~huge_page_mask(h))
- return -EINVAL;
- if (len > high_limit)
- return -ENOMEM;
-
- if (fixed) {
- if (addr > high_limit - len)
- return -ENOMEM;
- if (prepare_hugepage_range(file, addr, len))
- return -EINVAL;
- return addr;
- }
-
- if (addr) {
- addr = ALIGN(addr, huge_page_size(h));
- vma = find_vma(mm, addr);
- if (high_limit - len >= addr && addr >= mmap_min_addr &&
- (!vma || addr + len <= vm_start_gap(vma)))
- return addr;
- }
- /*
- * We are always doing an topdown search here. Slice code
- * does that too.
- */
- info.flags = VM_UNMAPPED_AREA_TOPDOWN;
- info.length = len;
- info.low_limit = max(PAGE_SIZE, mmap_min_addr);
- info.high_limit = mm->mmap_base + (high_limit - DEFAULT_MAP_WINDOW);
- info.align_mask = PAGE_MASK & ~huge_page_mask(h);
- info.align_offset = 0;
-
- return vm_unmapped_area(&info);
+ mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end);
}
void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
@@ -101,14 +47,17 @@ void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
pte_t old_pte, pte_t pte)
{
struct mm_struct *mm = vma->vm_mm;
+ unsigned long psize = huge_page_size(hstate_vma(vma));
/*
- * To avoid NMMU hang while relaxing access we need to flush the tlb before
- * we set the new value.
+ * POWER9 NMMU must flush the TLB after clearing the PTE before
+ * installing a PTE with more relaxed access permissions, see
+ * radix__ptep_set_access_flags.
*/
- if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
- (atomic_read(&mm->context.copros) > 0))
+ if (!cpu_has_feature(CPU_FTR_ARCH_31) &&
+ is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
+ atomic_read(&mm->context.copros) > 0)
radix__flush_hugetlb_page(vma, addr);
- set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
+ set_huge_pte_at(vma->vm_mm, addr, ptep, pte, psize);
}
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index e50ddf129c15..73977dbabcf2 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -17,6 +17,7 @@
#include <linux/hugetlb.h>
#include <linux/string_helpers.h>
#include <linux/memory.h>
+#include <linux/kfence.h>
#include <asm/pgalloc.h>
#include <asm/mmu_context.h>
@@ -30,12 +31,14 @@
#include <asm/trace.h>
#include <asm/uaccess.h>
#include <asm/ultravisor.h>
+#include <asm/set_memory.h>
+#include <asm/kfence.h>
#include <trace/events/thp.h>
-unsigned int mmu_pid_bits;
+#include <mm/mmu_decl.h>
+
unsigned int mmu_base_pid;
-unsigned long radix_mem_block_size __ro_after_init;
static __ref void *early_alloc_pgtable(unsigned long size, int nid,
unsigned long region_start, unsigned long region_end)
@@ -203,14 +206,14 @@ static void radix__change_memory_range(unsigned long start, unsigned long end,
pudp = pud_alloc(&init_mm, p4dp, idx);
if (!pudp)
continue;
- if (pud_is_leaf(*pudp)) {
+ if (pud_leaf(*pudp)) {
ptep = (pte_t *)pudp;
goto update_the_pte;
}
pmdp = pmd_alloc(&init_mm, pudp, idx);
if (!pmdp)
continue;
- if (pmd_is_leaf(*pmdp)) {
+ if (pmd_leaf(*pmdp)) {
ptep = pmdp_ptep(pmdp);
goto update_the_pte;
}
@@ -229,9 +232,17 @@ void radix__mark_rodata_ro(void)
unsigned long start, end;
start = (unsigned long)_stext;
- end = (unsigned long)__init_begin;
+ end = (unsigned long)__end_rodata;
radix__change_memory_range(start, end, _PAGE_WRITE);
+
+ for (start = PAGE_OFFSET; start < (unsigned long)_stext; start += PAGE_SIZE) {
+ end = start + PAGE_SIZE;
+ if (overlaps_interrupt_vector_text(start, end))
+ radix__change_memory_range(start, end, _PAGE_WRITE);
+ else
+ break;
+ }
}
void radix__mark_initmem_nx(void)
@@ -260,21 +271,44 @@ print_mapping(unsigned long start, unsigned long end, unsigned long size, bool e
static unsigned long next_boundary(unsigned long addr, unsigned long end)
{
#ifdef CONFIG_STRICT_KERNEL_RWX
- if (addr < __pa_symbol(__init_begin))
- return __pa_symbol(__init_begin);
+ unsigned long stext_phys;
+
+ stext_phys = __pa_symbol(_stext);
+
+ // Relocatable kernel running at non-zero real address
+ if (stext_phys != 0) {
+ // The end of interrupts code at zero is a rodata boundary
+ unsigned long end_intr = __pa_symbol(__end_interrupts) - stext_phys;
+ if (addr < end_intr)
+ return end_intr;
+
+ // Start of relocated kernel text is a rodata boundary
+ if (addr < stext_phys)
+ return stext_phys;
+ }
+
+ if (addr < __pa_symbol(__srwx_boundary))
+ return __pa_symbol(__srwx_boundary);
#endif
return end;
}
static int __meminit create_physical_mapping(unsigned long start,
unsigned long end,
- unsigned long max_mapping_size,
- int nid, pgprot_t _prot)
+ int nid, pgprot_t _prot,
+ unsigned long mapping_sz_limit)
{
unsigned long vaddr, addr, mapping_size = 0;
bool prev_exec, exec = false;
pgprot_t prot;
int psize;
+ unsigned long max_mapping_size = memory_block_size;
+
+ if (mapping_sz_limit < max_mapping_size)
+ max_mapping_size = mapping_sz_limit;
+
+ if (debug_pagealloc_enabled())
+ max_mapping_size = PAGE_SIZE;
start = ALIGN(start, PAGE_SIZE);
end = ALIGN_DOWN(end, PAGE_SIZE);
@@ -328,14 +362,70 @@ static int __meminit create_physical_mapping(unsigned long start,
return 0;
}
+#ifdef CONFIG_KFENCE
+static __init phys_addr_t alloc_kfence_pool(void)
+{
+ phys_addr_t kfence_pool;
+
+ /*
+ * TODO: Support to enable KFENCE after bootup depends on the ability to
+ * split page table mappings. As such support is not currently
+ * implemented for radix pagetables, support enabling KFENCE
+ * only at system startup for now.
+ *
+ * After support for splitting mappings is available on radix,
+ * alloc_kfence_pool() & map_kfence_pool() can be dropped and
+ * mapping for __kfence_pool memory can be
+ * split during arch_kfence_init_pool().
+ */
+ if (!kfence_early_init)
+ goto no_kfence;
+
+ kfence_pool = memblock_phys_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);
+ if (!kfence_pool)
+ goto no_kfence;
+
+ memblock_mark_nomap(kfence_pool, KFENCE_POOL_SIZE);
+ return kfence_pool;
+
+no_kfence:
+ disable_kfence();
+ return 0;
+}
+
+static __init void map_kfence_pool(phys_addr_t kfence_pool)
+{
+ if (!kfence_pool)
+ return;
+
+ if (create_physical_mapping(kfence_pool, kfence_pool + KFENCE_POOL_SIZE,
+ -1, PAGE_KERNEL, PAGE_SIZE))
+ goto err;
+
+ memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE);
+ __kfence_pool = __va(kfence_pool);
+ return;
+
+err:
+ memblock_phys_free(kfence_pool, KFENCE_POOL_SIZE);
+ disable_kfence();
+}
+#else
+static inline phys_addr_t alloc_kfence_pool(void) { return 0; }
+static inline void map_kfence_pool(phys_addr_t kfence_pool) { }
+#endif
+
static void __init radix_init_pgtable(void)
{
+ phys_addr_t kfence_pool;
unsigned long rts_field;
phys_addr_t start, end;
u64 i;
/* We don't support slb for radix */
- mmu_slb_size = 0;
+ slb_set_size(0);
+
+ kfence_pool = alloc_kfence_pool();
/*
* Create the linear mapping
@@ -353,22 +443,18 @@ static void __init radix_init_pgtable(void)
}
WARN_ON(create_physical_mapping(start, end,
- radix_mem_block_size,
- -1, PAGE_KERNEL));
+ -1, PAGE_KERNEL, ~0UL));
}
- /* Find out how many PID bits are supported */
+ map_kfence_pool(kfence_pool);
+
if (!cpu_has_feature(CPU_FTR_HVMODE) &&
cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) {
/*
- * Older versions of KVM on these machines perfer if the
+ * Older versions of KVM on these machines prefer if the
* guest only uses the low 19 PID bits.
*/
- if (!mmu_pid_bits)
- mmu_pid_bits = 19;
- } else {
- if (!mmu_pid_bits)
- mmu_pid_bits = 20;
+ mmu_pid_bits = 19;
}
mmu_base_pid = 1;
@@ -449,11 +535,6 @@ static int __init radix_dt_scan_page_sizes(unsigned long node,
if (type == NULL || strcmp(type, "cpu") != 0)
return 0;
- /* Find MMU PID size */
- prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size);
- if (prop && size == 4)
- mmu_pid_bits = be32_to_cpup(prop);
-
/* Grab page size encodings */
prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
if (!prop)
@@ -484,58 +565,6 @@ static int __init radix_dt_scan_page_sizes(unsigned long node,
return 1;
}
-#ifdef CONFIG_MEMORY_HOTPLUG
-static int __init probe_memory_block_size(unsigned long node, const char *uname, int
- depth, void *data)
-{
- unsigned long *mem_block_size = (unsigned long *)data;
- const __be32 *prop;
- int len;
-
- if (depth != 1)
- return 0;
-
- if (strcmp(uname, "ibm,dynamic-reconfiguration-memory"))
- return 0;
-
- prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &len);
-
- if (!prop || len < dt_root_size_cells * sizeof(__be32))
- /*
- * Nothing in the device tree
- */
- *mem_block_size = MIN_MEMORY_BLOCK_SIZE;
- else
- *mem_block_size = of_read_number(prop, dt_root_size_cells);
- return 1;
-}
-
-static unsigned long radix_memory_block_size(void)
-{
- unsigned long mem_block_size = MIN_MEMORY_BLOCK_SIZE;
-
- /*
- * OPAL firmware feature is set by now. Hence we are ok
- * to test OPAL feature.
- */
- if (firmware_has_feature(FW_FEATURE_OPAL))
- mem_block_size = 1UL * 1024 * 1024 * 1024;
- else
- of_scan_flat_dt(probe_memory_block_size, &mem_block_size);
-
- return mem_block_size;
-}
-
-#else /* CONFIG_MEMORY_HOTPLUG */
-
-static unsigned long radix_memory_block_size(void)
-{
- return 1UL * 1024 * 1024 * 1024;
-}
-
-#endif /* CONFIG_MEMORY_HOTPLUG */
-
-
void __init radix__early_init_devtree(void)
{
int rc;
@@ -559,51 +588,20 @@ void __init radix__early_init_devtree(void)
mmu_psize_defs[MMU_PAGE_64K].h_rpt_pgsize =
psize_to_rpti_pgsize(MMU_PAGE_64K);
}
-
- /*
- * Max mapping size used when mapping pages. We don't use
- * ppc_md.memory_block_size() here because this get called
- * early and we don't have machine probe called yet. Also
- * the pseries implementation only check for ibm,lmb-size.
- * All hypervisor supporting radix do expose that device
- * tree node.
- */
- radix_mem_block_size = radix_memory_block_size();
return;
}
-static void radix_init_amor(void)
-{
- /*
- * In HV mode, we init AMOR (Authority Mask Override Register) so that
- * the hypervisor and guest can setup IAMR (Instruction Authority Mask
- * Register), enable key 0 and set it to 1.
- *
- * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11)
- */
- mtspr(SPRN_AMOR, (3ul << 62));
-}
-
void __init radix__early_init_mmu(void)
{
unsigned long lpcr;
+#ifdef CONFIG_PPC_64S_HASH_MMU
#ifdef CONFIG_PPC_64K_PAGES
/* PAGE_SIZE mappings */
mmu_virtual_psize = MMU_PAGE_64K;
#else
mmu_virtual_psize = MMU_PAGE_4K;
#endif
-
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
- /* vmemmap mapping */
- if (mmu_psize_defs[MMU_PAGE_2M].shift) {
- /*
- * map vmemmap using 2M if available
- */
- mmu_vmemmap_psize = MMU_PAGE_2M;
- } else
- mmu_vmemmap_psize = mmu_virtual_psize;
#endif
/*
* initialize page table size
@@ -644,7 +642,6 @@ void __init radix__early_init_mmu(void)
lpcr = mfspr(SPRN_LPCR);
mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
radix_init_partition_table();
- radix_init_amor();
} else {
radix_init_pseries();
}
@@ -668,8 +665,6 @@ void radix__early_init_mmu_secondary(void)
set_ptcr_when_no_uv(__pa(partition_tb) |
(PATB_SIZE_SHIFT - 12));
-
- radix_init_amor();
}
radix__switch_mmu_context(NULL, &init_mm);
@@ -679,7 +674,8 @@ void radix__early_init_mmu_secondary(void)
mtspr(SPRN_UAMOR, 0);
}
-void radix__mmu_cleanup_all(void)
+/* Called during kexec sequence with MMU off */
+notrace void radix__mmu_cleanup_all(void)
{
unsigned long lpcr;
@@ -738,10 +734,60 @@ static void free_pud_table(pud_t *pud_start, p4d_t *p4d)
p4d_clear(p4d);
}
-static void remove_pte_table(pte_t *pte_start, unsigned long addr,
- unsigned long end)
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end)
{
- unsigned long next;
+ unsigned long start = ALIGN_DOWN(addr, PMD_SIZE);
+
+ return !vmemmap_populated(start, PMD_SIZE);
+}
+
+static bool __meminit vmemmap_page_is_unused(unsigned long addr, unsigned long end)
+{
+ unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE);
+
+ return !vmemmap_populated(start, PAGE_SIZE);
+
+}
+#endif
+
+static void __meminit free_vmemmap_pages(struct page *page,
+ struct vmem_altmap *altmap,
+ int order)
+{
+ unsigned int nr_pages = 1 << order;
+
+ if (altmap) {
+ unsigned long alt_start, alt_end;
+ unsigned long base_pfn = page_to_pfn(page);
+
+ /*
+ * with 2M vmemmap mmaping we can have things setup
+ * such that even though atlmap is specified we never
+ * used altmap.
+ */
+ alt_start = altmap->base_pfn;
+ alt_end = altmap->base_pfn + altmap->reserve + altmap->free;
+
+ if (base_pfn >= alt_start && base_pfn < alt_end) {
+ vmem_altmap_free(altmap, nr_pages);
+ return;
+ }
+ }
+
+ if (PageReserved(page)) {
+ /* allocated from memblock */
+ while (nr_pages--)
+ free_reserved_page(page++);
+ } else
+ __free_pages(page, order);
+}
+
+static void __meminit remove_pte_table(pte_t *pte_start, unsigned long addr,
+ unsigned long end, bool direct,
+ struct vmem_altmap *altmap)
+{
+ unsigned long next, pages = 0;
pte_t *pte;
pte = pte_start + pte_index(addr);
@@ -753,23 +799,28 @@ static void remove_pte_table(pte_t *pte_start, unsigned long addr,
if (!pte_present(*pte))
continue;
- if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) {
- /*
- * The vmemmap_free() and remove_section_mapping()
- * codepaths call us with aligned addresses.
- */
- WARN_ONCE(1, "%s: unaligned range\n", __func__);
- continue;
+ if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
+ if (!direct)
+ free_vmemmap_pages(pte_page(*pte), altmap, 0);
+ pte_clear(&init_mm, addr, pte);
+ pages++;
}
-
- pte_clear(&init_mm, addr, pte);
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ else if (!direct && vmemmap_page_is_unused(addr, next)) {
+ free_vmemmap_pages(pte_page(*pte), altmap, 0);
+ pte_clear(&init_mm, addr, pte);
+ }
+#endif
}
+ if (direct)
+ update_page_count(mmu_virtual_psize, -pages);
}
static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
- unsigned long end)
+ unsigned long end, bool direct,
+ struct vmem_altmap *altmap)
{
- unsigned long next;
+ unsigned long next, pages = 0;
pte_t *pte_base;
pmd_t *pmd;
@@ -780,26 +831,36 @@ static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
if (!pmd_present(*pmd))
continue;
- if (pmd_is_leaf(*pmd)) {
- if (!IS_ALIGNED(addr, PMD_SIZE) ||
- !IS_ALIGNED(next, PMD_SIZE)) {
- WARN_ONCE(1, "%s: unaligned range\n", __func__);
- continue;
+ if (pmd_leaf(*pmd)) {
+ if (IS_ALIGNED(addr, PMD_SIZE) &&
+ IS_ALIGNED(next, PMD_SIZE)) {
+ if (!direct)
+ free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
+ pte_clear(&init_mm, addr, (pte_t *)pmd);
+ pages++;
}
- pte_clear(&init_mm, addr, (pte_t *)pmd);
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ else if (!direct && vmemmap_pmd_is_unused(addr, next)) {
+ free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
+ pte_clear(&init_mm, addr, (pte_t *)pmd);
+ }
+#endif
continue;
}
pte_base = (pte_t *)pmd_page_vaddr(*pmd);
- remove_pte_table(pte_base, addr, next);
+ remove_pte_table(pte_base, addr, next, direct, altmap);
free_pte_table(pte_base, pmd);
}
+ if (direct)
+ update_page_count(MMU_PAGE_2M, -pages);
}
static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr,
- unsigned long end)
+ unsigned long end, bool direct,
+ struct vmem_altmap *altmap)
{
- unsigned long next;
+ unsigned long next, pages = 0;
pmd_t *pmd_base;
pud_t *pud;
@@ -810,23 +871,28 @@ static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr,
if (!pud_present(*pud))
continue;
- if (pud_is_leaf(*pud)) {
+ if (pud_leaf(*pud)) {
if (!IS_ALIGNED(addr, PUD_SIZE) ||
!IS_ALIGNED(next, PUD_SIZE)) {
WARN_ONCE(1, "%s: unaligned range\n", __func__);
continue;
}
pte_clear(&init_mm, addr, (pte_t *)pud);
+ pages++;
continue;
}
pmd_base = pud_pgtable(*pud);
- remove_pmd_table(pmd_base, addr, next);
+ remove_pmd_table(pmd_base, addr, next, direct, altmap);
free_pmd_table(pmd_base, pud);
}
+ if (direct)
+ update_page_count(MMU_PAGE_1G, -pages);
}
-static void __meminit remove_pagetable(unsigned long start, unsigned long end)
+static void __meminit
+remove_pagetable(unsigned long start, unsigned long end, bool direct,
+ struct vmem_altmap *altmap)
{
unsigned long addr, next;
pud_t *pud_base;
@@ -843,7 +909,7 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end)
if (!p4d_present(*p4d))
continue;
- if (p4d_is_leaf(*p4d)) {
+ if (p4d_leaf(*p4d)) {
if (!IS_ALIGNED(addr, P4D_SIZE) ||
!IS_ALIGNED(next, P4D_SIZE)) {
WARN_ONCE(1, "%s: unaligned range\n", __func__);
@@ -855,7 +921,7 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end)
}
pud_base = p4d_pgtable(*p4d);
- remove_pud_table(pud_base, addr, next);
+ remove_pud_table(pud_base, addr, next, direct, altmap);
free_pud_table(pud_base, p4d);
}
@@ -873,12 +939,12 @@ int __meminit radix__create_section_mapping(unsigned long start,
}
return create_physical_mapping(__pa(start), __pa(end),
- radix_mem_block_size, nid, prot);
+ nid, prot, ~0UL);
}
int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
{
- remove_pagetable(start, end);
+ remove_pagetable(start, end, true, NULL);
return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG */
@@ -896,7 +962,6 @@ int __meminit radix__vmemmap_create_mapping(unsigned long start,
unsigned long phys)
{
/* Create a PTE encoding */
- unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW;
int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
int ret;
@@ -905,16 +970,456 @@ int __meminit radix__vmemmap_create_mapping(unsigned long start,
return -1;
}
- ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid);
+ ret = __map_kernel_page_nid(start, phys, PAGE_KERNEL, page_size, nid);
BUG_ON(ret);
return 0;
}
+#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
+bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
+{
+ if (radix_enabled())
+ return __vmemmap_can_optimize(altmap, pgmap);
+
+ return false;
+}
+#endif
+
+int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
+ unsigned long addr, unsigned long next)
+{
+ int large = pmd_leaf(*pmdp);
+
+ if (large)
+ vmemmap_verify(pmdp_ptep(pmdp), node, addr, next);
+
+ return large;
+}
+
+void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node,
+ unsigned long addr, unsigned long next)
+{
+ pte_t entry;
+ pte_t *ptep = pmdp_ptep(pmdp);
+
+ VM_BUG_ON(!IS_ALIGNED(addr, PMD_SIZE));
+ entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
+ set_pte_at(&init_mm, addr, ptep, entry);
+ asm volatile("ptesync": : :"memory");
+
+ vmemmap_verify(ptep, node, addr, next);
+}
+
+static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long addr,
+ int node,
+ struct vmem_altmap *altmap,
+ struct page *reuse)
+{
+ pte_t *pte = pte_offset_kernel(pmdp, addr);
+
+ if (pte_none(*pte)) {
+ pte_t entry;
+ void *p;
+
+ if (!reuse) {
+ /*
+ * make sure we don't create altmap mappings
+ * covering things outside the device.
+ */
+ if (altmap && altmap_cross_boundary(altmap, addr, PAGE_SIZE))
+ altmap = NULL;
+
+ p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
+ if (!p && altmap)
+ p = vmemmap_alloc_block_buf(PAGE_SIZE, node, NULL);
+ if (!p)
+ return NULL;
+ pr_debug("PAGE_SIZE vmemmap mapping\n");
+ } else {
+ /*
+ * When a PTE/PMD entry is freed from the init_mm
+ * there's a free_pages() call to this page allocated
+ * above. Thus this get_page() is paired with the
+ * put_page_testzero() on the freeing path.
+ * This can only called by certain ZONE_DEVICE path,
+ * and through vmemmap_populate_compound_pages() when
+ * slab is available.
+ */
+ get_page(reuse);
+ p = page_to_virt(reuse);
+ pr_debug("Tail page reuse vmemmap mapping\n");
+ }
+
+ VM_BUG_ON(!PAGE_ALIGNED(addr));
+ entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
+ set_pte_at(&init_mm, addr, pte, entry);
+ asm volatile("ptesync": : :"memory");
+ }
+ return pte;
+}
+
+static inline pud_t *vmemmap_pud_alloc(p4d_t *p4dp, int node,
+ unsigned long address)
+{
+ pud_t *pud;
+
+ /* All early vmemmap mapping to keep simple do it at PAGE_SIZE */
+ if (unlikely(p4d_none(*p4dp))) {
+ if (unlikely(!slab_is_available())) {
+ pud = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
+ p4d_populate(&init_mm, p4dp, pud);
+ /* go to the pud_offset */
+ } else
+ return pud_alloc(&init_mm, p4dp, address);
+ }
+ return pud_offset(p4dp, address);
+}
+
+static inline pmd_t *vmemmap_pmd_alloc(pud_t *pudp, int node,
+ unsigned long address)
+{
+ pmd_t *pmd;
+
+ /* All early vmemmap mapping to keep simple do it at PAGE_SIZE */
+ if (unlikely(pud_none(*pudp))) {
+ if (unlikely(!slab_is_available())) {
+ pmd = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
+ pud_populate(&init_mm, pudp, pmd);
+ } else
+ return pmd_alloc(&init_mm, pudp, address);
+ }
+ return pmd_offset(pudp, address);
+}
+
+static inline pte_t *vmemmap_pte_alloc(pmd_t *pmdp, int node,
+ unsigned long address)
+{
+ pte_t *pte;
+
+ /* All early vmemmap mapping to keep simple do it at PAGE_SIZE */
+ if (unlikely(pmd_none(*pmdp))) {
+ if (unlikely(!slab_is_available())) {
+ pte = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
+ pmd_populate(&init_mm, pmdp, pte);
+ } else
+ return pte_alloc_kernel(pmdp, address);
+ }
+ return pte_offset_kernel(pmdp, address);
+}
+
+
+
+int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, int node,
+ struct vmem_altmap *altmap)
+{
+ unsigned long addr;
+ unsigned long next;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ /*
+ * If altmap is present, Make sure we align the start vmemmap addr
+ * to PAGE_SIZE so that we calculate the correct start_pfn in
+ * altmap boundary check to decide whether we should use altmap or
+ * RAM based backing memory allocation. Also the address need to be
+ * aligned for set_pte operation. If the start addr is already
+ * PMD_SIZE aligned and with in the altmap boundary then we will
+ * try to use a pmd size altmap mapping else we go for page size
+ * mapping.
+ *
+ * If altmap is not present, align the vmemmap addr to PMD_SIZE and
+ * always allocate a PMD size page for vmemmap backing.
+ *
+ */
+
+ if (altmap)
+ start = ALIGN_DOWN(start, PAGE_SIZE);
+ else
+ start = ALIGN_DOWN(start, PMD_SIZE);
+
+ for (addr = start; addr < end; addr = next) {
+ next = pmd_addr_end(addr, end);
+
+ pgd = pgd_offset_k(addr);
+ p4d = p4d_offset(pgd, addr);
+ pud = vmemmap_pud_alloc(p4d, node, addr);
+ if (!pud)
+ return -ENOMEM;
+ pmd = vmemmap_pmd_alloc(pud, node, addr);
+ if (!pmd)
+ return -ENOMEM;
+
+ if (pmd_none(READ_ONCE(*pmd))) {
+ void *p;
+
+ /*
+ * keep it simple by checking addr PMD_SIZE alignment
+ * and verifying the device boundary condition.
+ * For us to use a pmd mapping, both addr and pfn should
+ * be aligned. We skip if addr is not aligned and for
+ * pfn we hope we have extra area in the altmap that
+ * can help to find an aligned block. This can result
+ * in altmap block allocation failures, in which case
+ * we fallback to RAM for vmemmap allocation.
+ */
+ if (altmap && (!IS_ALIGNED(addr, PMD_SIZE) ||
+ altmap_cross_boundary(altmap, addr, PMD_SIZE))) {
+ /*
+ * make sure we don't create altmap mappings
+ * covering things outside the device.
+ */
+ goto base_mapping;
+ }
+
+ p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
+ if (p) {
+ vmemmap_set_pmd(pmd, p, node, addr, next);
+ pr_debug("PMD_SIZE vmemmap mapping\n");
+ continue;
+ } else {
+ /*
+ * A vmemmap block allocation can fail due to
+ * alignment requirements and we trying to align
+ * things aggressively there by running out of
+ * space. Try base mapping on failure.
+ */
+ goto base_mapping;
+ }
+ } else if (vmemmap_check_pmd(pmd, node, addr, next)) {
+ /*
+ * If a huge mapping exist due to early call to
+ * vmemmap_populate, let's try to use that.
+ */
+ continue;
+ }
+base_mapping:
+ /*
+ * Not able allocate higher order memory to back memmap
+ * or we found a pointer to pte page. Allocate base page
+ * size vmemmap
+ */
+ pte = vmemmap_pte_alloc(pmd, node, addr);
+ if (!pte)
+ return -ENOMEM;
+
+ pte = radix__vmemmap_pte_populate(pmd, addr, node, altmap, NULL);
+ if (!pte)
+ return -ENOMEM;
+
+ vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+ next = addr + PAGE_SIZE;
+ }
+ return 0;
+}
+
+static pte_t * __meminit radix__vmemmap_populate_address(unsigned long addr, int node,
+ struct vmem_altmap *altmap,
+ struct page *reuse)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = pgd_offset_k(addr);
+ p4d = p4d_offset(pgd, addr);
+ pud = vmemmap_pud_alloc(p4d, node, addr);
+ if (!pud)
+ return NULL;
+ pmd = vmemmap_pmd_alloc(pud, node, addr);
+ if (!pmd)
+ return NULL;
+ if (pmd_leaf(*pmd))
+ /*
+ * The second page is mapped as a hugepage due to a nearby request.
+ * Force our mapping to page size without deduplication
+ */
+ return NULL;
+ pte = vmemmap_pte_alloc(pmd, node, addr);
+ if (!pte)
+ return NULL;
+ radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
+ vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+
+ return pte;
+}
+
+static pte_t * __meminit vmemmap_compound_tail_page(unsigned long addr,
+ unsigned long pfn_offset, int node)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ unsigned long map_addr;
+
+ /* the second vmemmap page which we use for duplication */
+ map_addr = addr - pfn_offset * sizeof(struct page) + PAGE_SIZE;
+ pgd = pgd_offset_k(map_addr);
+ p4d = p4d_offset(pgd, map_addr);
+ pud = vmemmap_pud_alloc(p4d, node, map_addr);
+ if (!pud)
+ return NULL;
+ pmd = vmemmap_pmd_alloc(pud, node, map_addr);
+ if (!pmd)
+ return NULL;
+ if (pmd_leaf(*pmd))
+ /*
+ * The second page is mapped as a hugepage due to a nearby request.
+ * Force our mapping to page size without deduplication
+ */
+ return NULL;
+ pte = vmemmap_pte_alloc(pmd, node, map_addr);
+ if (!pte)
+ return NULL;
+ /*
+ * Check if there exist a mapping to the left
+ */
+ if (pte_none(*pte)) {
+ /*
+ * Populate the head page vmemmap page.
+ * It can fall in different pmd, hence
+ * vmemmap_populate_address()
+ */
+ pte = radix__vmemmap_populate_address(map_addr - PAGE_SIZE, node, NULL, NULL);
+ if (!pte)
+ return NULL;
+ /*
+ * Populate the tail pages vmemmap page
+ */
+ pte = radix__vmemmap_pte_populate(pmd, map_addr, node, NULL, NULL);
+ if (!pte)
+ return NULL;
+ vmemmap_verify(pte, node, map_addr, map_addr + PAGE_SIZE);
+ return pte;
+ }
+ return pte;
+}
+
+int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
+ unsigned long start,
+ unsigned long end, int node,
+ struct dev_pagemap *pgmap)
+{
+ /*
+ * we want to map things as base page size mapping so that
+ * we can save space in vmemmap. We could have huge mapping
+ * covering out both edges.
+ */
+ unsigned long addr;
+ unsigned long addr_pfn = start_pfn;
+ unsigned long next;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ for (addr = start; addr < end; addr = next) {
+
+ pgd = pgd_offset_k(addr);
+ p4d = p4d_offset(pgd, addr);
+ pud = vmemmap_pud_alloc(p4d, node, addr);
+ if (!pud)
+ return -ENOMEM;
+ pmd = vmemmap_pmd_alloc(pud, node, addr);
+ if (!pmd)
+ return -ENOMEM;
+
+ if (pmd_leaf(READ_ONCE(*pmd))) {
+ /* existing huge mapping. Skip the range */
+ addr_pfn += (PMD_SIZE >> PAGE_SHIFT);
+ next = pmd_addr_end(addr, end);
+ continue;
+ }
+ pte = vmemmap_pte_alloc(pmd, node, addr);
+ if (!pte)
+ return -ENOMEM;
+ if (!pte_none(*pte)) {
+ /*
+ * This could be because we already have a compound
+ * page whose VMEMMAP_RESERVE_NR pages were mapped and
+ * this request fall in those pages.
+ */
+ addr_pfn += 1;
+ next = addr + PAGE_SIZE;
+ continue;
+ } else {
+ unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
+ unsigned long pfn_offset = addr_pfn - ALIGN_DOWN(addr_pfn, nr_pages);
+ pte_t *tail_page_pte;
+
+ /*
+ * if the address is aligned to huge page size it is the
+ * head mapping.
+ */
+ if (pfn_offset == 0) {
+ /* Populate the head page vmemmap page */
+ pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
+ if (!pte)
+ return -ENOMEM;
+ vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+
+ /*
+ * Populate the tail pages vmemmap page
+ * It can fall in different pmd, hence
+ * vmemmap_populate_address()
+ */
+ pte = radix__vmemmap_populate_address(addr + PAGE_SIZE, node, NULL, NULL);
+ if (!pte)
+ return -ENOMEM;
+
+ addr_pfn += 2;
+ next = addr + 2 * PAGE_SIZE;
+ continue;
+ }
+ /*
+ * get the 2nd mapping details
+ * Also create it if that doesn't exist
+ */
+ tail_page_pte = vmemmap_compound_tail_page(addr, pfn_offset, node);
+ if (!tail_page_pte) {
+
+ pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
+ if (!pte)
+ return -ENOMEM;
+ vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+
+ addr_pfn += 1;
+ next = addr + PAGE_SIZE;
+ continue;
+ }
+
+ pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, pte_page(*tail_page_pte));
+ if (!pte)
+ return -ENOMEM;
+ vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+
+ addr_pfn += 1;
+ next = addr + PAGE_SIZE;
+ continue;
+ }
+ }
+ return 0;
+}
+
+
#ifdef CONFIG_MEMORY_HOTPLUG
void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
{
- remove_pagetable(start, start + page_size);
+ remove_pagetable(start, start + page_size, true, NULL);
+}
+
+void __ref radix__vmemmap_free(unsigned long start, unsigned long end,
+ struct vmem_altmap *altmap)
+{
+ remove_pagetable(start, end, false, altmap);
}
#endif
#endif
@@ -928,12 +1433,29 @@ unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long add
unsigned long old;
#ifdef CONFIG_DEBUG_VM
- WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
+ WARN_ON(!radix__pmd_trans_huge(*pmdp));
assert_spin_locked(pmd_lockptr(mm, pmdp));
#endif
- old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1);
- trace_hugepage_update(addr, old, clr, set);
+ old = radix__pte_update(mm, addr, pmdp_ptep(pmdp), clr, set, 1);
+ trace_hugepage_update_pmd(addr, old, clr, set);
+
+ return old;
+}
+
+unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr,
+ pud_t *pudp, unsigned long clr,
+ unsigned long set)
+{
+ unsigned long old;
+
+#ifdef CONFIG_DEBUG_VM
+ WARN_ON(!pud_trans_huge(*pudp));
+ assert_spin_locked(pud_lockptr(mm, pudp));
+#endif
+
+ old = radix__pte_update(mm, addr, pudp_ptep(pudp), clr, set, 1);
+ trace_hugepage_update_pud(addr, old, clr, set);
return old;
}
@@ -946,22 +1468,12 @@ pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addre
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
- VM_BUG_ON(pmd_devmap(*pmdp));
/*
* khugepaged calls this for normal pmd
*/
pmd = *pmdp;
pmd_clear(pmdp);
- /*
- * pmdp collapse_flush need to ensure that there are no parallel gup
- * walk after this call. This is needed so that we can have stable
- * page ref count when collapsing a page. We don't allow a collapse page
- * if we have gup taken on the page. We can ensure that by sending IPI
- * because gup walk happens with IRQ disabled.
- */
- serialize_against_pte_lookup(vma->vm_mm);
-
radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);
return pmd;
@@ -1023,27 +1535,43 @@ pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
return old_pmd;
}
+pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pud_t *pudp)
+{
+ pud_t old_pud;
+ unsigned long old;
+
+ old = radix__pud_hugepage_update(mm, addr, pudp, ~0UL, 0);
+ old_pud = __pud(old);
+ return old_pud;
+}
+
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
pte_t entry, unsigned long address, int psize)
{
struct mm_struct *mm = vma->vm_mm;
- unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
- _PAGE_RW | _PAGE_EXEC);
+ unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_SOFT_DIRTY |
+ _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC);
unsigned long change = pte_val(entry) ^ pte_val(*ptep);
/*
- * To avoid NMMU hang while relaxing access, we need mark
- * the pte invalid in between.
+ * On POWER9, the NMMU is not able to relax PTE access permissions
+ * for a translation with a TLB. The PTE must be invalidated, TLB
+ * flushed before the new PTE is installed.
+ *
+ * This only needs to be done for radix, because hash translation does
+ * flush when updating the linux pte (and we don't support NMMU
+ * accelerators on HPT on POWER9 anyway XXX: do we?).
+ *
+ * POWER10 (and P9P) NMMU does behave as per ISA.
*/
- if ((change & _PAGE_RW) && atomic_read(&mm->context.copros) > 0) {
+ if (!cpu_has_feature(CPU_FTR_ARCH_31) && (change & _PAGE_RW) &&
+ atomic_read(&mm->context.copros) > 0) {
unsigned long old_pte, new_pte;
old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
- /*
- * new value of pte
- */
new_pte = old_pte | set;
radix__flush_tlb_page_psize(mm, address, psize);
__radix_pte_update(ptep, _PAGE_INVALID, new_pte);
@@ -1051,9 +1579,12 @@ void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
__radix_pte_update(ptep, 0, set);
/*
* Book3S does not require a TLB flush when relaxing access
- * restrictions when the address space is not attached to a
- * NMMU, because the core MMU will reload the pte after taking
- * an access fault, which is defined by the architecture.
+ * restrictions when the address space (modulo the POWER9 nest
+ * MMU issue above) because the MMU will reload the PTE after
+ * taking an access fault, as defined by the architecture. See
+ * "Setting a Reference or Change Bit or Upgrading Access
+ * Authority (PTE Subject to Atomic Hardware Updates)" in
+ * Power ISA Version 3.1B.
*/
}
/* See ptesync comment in radix__set_pte_at */
@@ -1066,11 +1597,12 @@ void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
struct mm_struct *mm = vma->vm_mm;
/*
- * To avoid NMMU hang while relaxing access we need to flush the tlb before
- * we set the new value. We need to do this only for radix, because hash
- * translation does flush when updating the linux pte.
+ * POWER9 NMMU must flush the TLB after clearing the PTE before
+ * installing a PTE with more relaxed access permissions, see
+ * radix__ptep_set_access_flags.
*/
- if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
+ if (!cpu_has_feature(CPU_FTR_ARCH_31) &&
+ is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
(atomic_read(&mm->context.copros) > 0))
radix__flush_tlb_page(vma, addr);
@@ -1092,7 +1624,7 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
int pud_clear_huge(pud_t *pud)
{
- if (pud_huge(*pud)) {
+ if (pud_leaf(*pud)) {
pud_clear(pud);
return 1;
}
@@ -1139,7 +1671,7 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
int pmd_clear_huge(pmd_t *pmd)
{
- if (pmd_huge(*pmd)) {
+ if (pmd_leaf(*pmd)) {
pmd_clear(pmd);
return 1;
}
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index aefc100d79a7..9e1f6558d026 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -10,6 +10,7 @@
#include <linux/memblock.h>
#include <linux/mmu_context.h>
#include <linux/sched/mm.h>
+#include <linux/debugfs.h>
#include <asm/ppc-opcode.h>
#include <asm/tlb.h>
@@ -126,21 +127,6 @@ static __always_inline void __tlbie_pid(unsigned long pid, unsigned long ric)
trace_tlbie(0, 0, rb, rs, ric, prs, r);
}
-static __always_inline void __tlbie_pid_lpid(unsigned long pid,
- unsigned long lpid,
- unsigned long ric)
-{
- unsigned long rb, rs, prs, r;
-
- rb = PPC_BIT(53); /* IS = 1 */
- rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31)));
- prs = 1; /* process scoped */
- r = 1; /* radix format */
-
- asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
- : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
- trace_tlbie(0, 0, rb, rs, ric, prs, r);
-}
static __always_inline void __tlbie_lpid(unsigned long lpid, unsigned long ric)
{
unsigned long rb,rs,prs,r;
@@ -201,23 +187,6 @@ static __always_inline void __tlbie_va(unsigned long va, unsigned long pid,
trace_tlbie(0, 0, rb, rs, ric, prs, r);
}
-static __always_inline void __tlbie_va_lpid(unsigned long va, unsigned long pid,
- unsigned long lpid,
- unsigned long ap, unsigned long ric)
-{
- unsigned long rb, rs, prs, r;
-
- rb = va & ~(PPC_BITMASK(52, 63));
- rb |= ap << PPC_BITLSHIFT(58);
- rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31)));
- prs = 1; /* process scoped */
- r = 1; /* radix format */
-
- asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
- : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
- trace_tlbie(0, 0, rb, rs, ric, prs, r);
-}
-
static __always_inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid,
unsigned long ap, unsigned long ric)
{
@@ -263,22 +232,6 @@ static inline void fixup_tlbie_va_range(unsigned long va, unsigned long pid,
}
}
-static inline void fixup_tlbie_va_range_lpid(unsigned long va,
- unsigned long pid,
- unsigned long lpid,
- unsigned long ap)
-{
- if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
- asm volatile("ptesync" : : : "memory");
- __tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB);
- }
-
- if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
- asm volatile("ptesync" : : : "memory");
- __tlbie_va_lpid(va, pid, lpid, ap, RIC_FLUSH_TLB);
- }
-}
-
static inline void fixup_tlbie_pid(unsigned long pid)
{
/*
@@ -298,26 +251,6 @@ static inline void fixup_tlbie_pid(unsigned long pid)
}
}
-static inline void fixup_tlbie_pid_lpid(unsigned long pid, unsigned long lpid)
-{
- /*
- * We can use any address for the invalidation, pick one which is
- * probably unused as an optimisation.
- */
- unsigned long va = ((1UL << 52) - 1);
-
- if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
- asm volatile("ptesync" : : : "memory");
- __tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB);
- }
-
- if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
- asm volatile("ptesync" : : : "memory");
- __tlbie_va_lpid(va, pid, lpid, mmu_get_ap(MMU_PAGE_64K),
- RIC_FLUSH_TLB);
- }
-}
-
static inline void fixup_tlbie_lpid_va(unsigned long va, unsigned long lpid,
unsigned long ap)
{
@@ -396,7 +329,7 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
/*
* Workaround the fact that the "ric" argument to __tlbie_pid
- * must be a compile-time contraint to match the "i" constraint
+ * must be a compile-time constraint to match the "i" constraint
* in the asm statement.
*/
switch (ric) {
@@ -415,31 +348,6 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
-static inline void _tlbie_pid_lpid(unsigned long pid, unsigned long lpid,
- unsigned long ric)
-{
- asm volatile("ptesync" : : : "memory");
-
- /*
- * Workaround the fact that the "ric" argument to __tlbie_pid
- * must be a compile-time contraint to match the "i" constraint
- * in the asm statement.
- */
- switch (ric) {
- case RIC_FLUSH_TLB:
- __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB);
- fixup_tlbie_pid_lpid(pid, lpid);
- break;
- case RIC_FLUSH_PWC:
- __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
- break;
- case RIC_FLUSH_ALL:
- default:
- __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_ALL);
- fixup_tlbie_pid_lpid(pid, lpid);
- }
- asm volatile("eieio; tlbsync; ptesync" : : : "memory");
-}
struct tlbiel_pid {
unsigned long pid;
unsigned long ric;
@@ -565,20 +473,6 @@ static inline void __tlbie_va_range(unsigned long start, unsigned long end,
fixup_tlbie_va_range(addr - page_size, pid, ap);
}
-static inline void __tlbie_va_range_lpid(unsigned long start, unsigned long end,
- unsigned long pid, unsigned long lpid,
- unsigned long page_size,
- unsigned long psize)
-{
- unsigned long addr;
- unsigned long ap = mmu_get_ap(psize);
-
- for (addr = start; addr < end; addr += page_size)
- __tlbie_va_lpid(addr, pid, lpid, ap, RIC_FLUSH_TLB);
-
- fixup_tlbie_va_range_lpid(addr - page_size, pid, lpid, ap);
-}
-
static __always_inline void _tlbie_va(unsigned long va, unsigned long pid,
unsigned long psize, unsigned long ric)
{
@@ -659,18 +553,6 @@ static inline void _tlbie_va_range(unsigned long start, unsigned long end,
asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
-static inline void _tlbie_va_range_lpid(unsigned long start, unsigned long end,
- unsigned long pid, unsigned long lpid,
- unsigned long page_size,
- unsigned long psize, bool also_pwc)
-{
- asm volatile("ptesync" : : : "memory");
- if (also_pwc)
- __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
- __tlbie_va_range_lpid(start, end, pid, lpid, page_size, psize);
- asm volatile("eieio; tlbsync; ptesync" : : : "memory");
-}
-
static inline void _tlbiel_va_range_multicast(struct mm_struct *mm,
unsigned long start, unsigned long end,
unsigned long pid, unsigned long page_size,
@@ -699,12 +581,13 @@ static inline void _tlbiel_va_range_multicast(struct mm_struct *mm,
*/
void radix__local_flush_tlb_mm(struct mm_struct *mm)
{
- unsigned long pid;
+ unsigned long pid = mm->context.id;
+
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
+ return;
preempt_disable();
- pid = mm->context.id;
- if (pid != MMU_NO_CONTEXT)
- _tlbiel_pid(pid, RIC_FLUSH_TLB);
+ _tlbiel_pid(pid, RIC_FLUSH_TLB);
preempt_enable();
}
EXPORT_SYMBOL(radix__local_flush_tlb_mm);
@@ -712,12 +595,13 @@ EXPORT_SYMBOL(radix__local_flush_tlb_mm);
#ifndef CONFIG_SMP
void radix__local_flush_all_mm(struct mm_struct *mm)
{
- unsigned long pid;
+ unsigned long pid = mm->context.id;
+
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
+ return;
preempt_disable();
- pid = mm->context.id;
- if (pid != MMU_NO_CONTEXT)
- _tlbiel_pid(pid, RIC_FLUSH_ALL);
+ _tlbiel_pid(pid, RIC_FLUSH_ALL);
preempt_enable();
}
EXPORT_SYMBOL(radix__local_flush_all_mm);
@@ -731,12 +615,13 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm)
void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
int psize)
{
- unsigned long pid;
+ unsigned long pid = mm->context.id;
+
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
+ return;
preempt_disable();
- pid = mm->context.id;
- if (pid != MMU_NO_CONTEXT)
- _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
+ _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
preempt_enable();
}
@@ -754,10 +639,18 @@ EXPORT_SYMBOL(radix__local_flush_tlb_page);
static bool mm_needs_flush_escalation(struct mm_struct *mm)
{
/*
- * P9 nest MMU has issues with the page walk cache
- * caching PTEs and not flushing them properly when
- * RIC = 0 for a PID/LPID invalidate
+ * The P9 nest MMU has issues with the page walk cache caching PTEs
+ * and not flushing them when RIC = 0 for a PID/LPID invalidate.
+ *
+ * This may have been fixed in shipping firmware (by disabling PWC
+ * or preventing it from caching PTEs), but until that is confirmed,
+ * this workaround is required - escalate all RIC=0 IS=1/2/3 flushes
+ * to RIC=2.
+ *
+ * POWER10 (and P9P) does not have this problem.
*/
+ if (cpu_has_feature(CPU_FTR_ARCH_31))
+ return false;
if (atomic_read(&mm->context.copros) > 0)
return true;
return false;
@@ -783,12 +676,20 @@ void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush)
goto out;
if (current->active_mm == mm) {
+ unsigned long flags;
+
WARN_ON_ONCE(current->mm != NULL);
- /* Is a kernel thread and is using mm as the lazy tlb */
- mmgrab(&init_mm);
+ /*
+ * It is a kernel thread and is using mm as the lazy tlb, so
+ * switch it to init_mm. This is not always called from IPI
+ * (e.g., flush_type_needed), so must disable irqs.
+ */
+ local_irq_save(flags);
+ mmgrab_lazy_tlb(&init_mm);
current->active_mm = &init_mm;
switch_mm_irqs_off(mm, &init_mm, current);
- mmdrop(mm);
+ mmdrop_lazy_tlb(mm);
+ local_irq_restore(flags);
}
/*
@@ -800,7 +701,7 @@ void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush)
* that's what the caller expects.
*/
if (cpumask_test_cpu(cpu, mm_cpumask(mm))) {
- atomic_dec(&mm->context.active_cpus);
+ dec_mm_active_cpus(mm);
cpumask_clear_cpu(cpu, mm_cpumask(mm));
always_flush = true;
}
@@ -936,7 +837,7 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
enum tlb_flush_type type;
pid = mm->context.id;
- if (unlikely(pid == MMU_NO_CONTEXT))
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
return;
preempt_disable();
@@ -967,6 +868,7 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
}
}
preempt_enable();
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
}
EXPORT_SYMBOL(radix__flush_tlb_mm);
@@ -976,7 +878,7 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm)
enum tlb_flush_type type;
pid = mm->context.id;
- if (unlikely(pid == MMU_NO_CONTEXT))
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
return;
preempt_disable();
@@ -1000,6 +902,7 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm)
_tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL);
}
preempt_enable();
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
}
void radix__flush_all_mm(struct mm_struct *mm)
@@ -1015,7 +918,7 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
enum tlb_flush_type type;
pid = mm->context.id;
- if (unlikely(pid == MMU_NO_CONTEXT))
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
return;
preempt_disable();
@@ -1095,6 +998,9 @@ void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end)
}
EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
+/*
+ * Doesn't appear to be used anywhere. Remove.
+ */
#define TLB_FLUSH_ALL -1UL
/*
@@ -1106,8 +1012,8 @@ EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
* invalidating a full PID, so it has a far lower threshold to change from
* individual page flushes to full-pid flushes.
*/
-static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
-static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2;
+static u32 tlb_single_page_flush_ceiling __read_mostly = 33;
+static u32 tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2;
static inline void __radix__flush_tlb_range(struct mm_struct *mm,
unsigned long start, unsigned long end)
@@ -1116,23 +1022,22 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm,
unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift;
unsigned long page_size = 1UL << page_shift;
unsigned long nr_pages = (end - start) >> page_shift;
- bool fullmm = (end == TLB_FLUSH_ALL);
bool flush_pid, flush_pwc = false;
enum tlb_flush_type type;
pid = mm->context.id;
- if (unlikely(pid == MMU_NO_CONTEXT))
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
return;
+ WARN_ON_ONCE(end == TLB_FLUSH_ALL);
+
preempt_disable();
smp_mb(); /* see radix__flush_tlb_mm */
- type = flush_type_needed(mm, fullmm);
+ type = flush_type_needed(mm, false);
if (type == FLUSH_TYPE_NONE)
goto out;
- if (fullmm)
- flush_pid = true;
- else if (type == FLUSH_TYPE_GLOBAL)
+ if (type == FLUSH_TYPE_GLOBAL)
flush_pid = nr_pages > tlb_single_page_flush_ceiling;
else
flush_pid = nr_pages > tlb_local_single_page_flush_ceiling;
@@ -1170,15 +1075,12 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm,
}
}
} else {
- bool hflush = false;
+ bool hflush;
unsigned long hstart, hend;
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
- hstart = (start + PMD_SIZE - 1) & PMD_MASK;
- hend = end & PMD_MASK;
- if (hstart < hend)
- hflush = true;
- }
+ hstart = (start + PMD_SIZE - 1) & PMD_MASK;
+ hend = end & PMD_MASK;
+ hflush = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hstart < hend;
if (type == FLUSH_TYPE_LOCAL) {
asm volatile("ptesync": : :"memory");
@@ -1209,6 +1111,7 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm,
}
out:
preempt_enable();
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
}
void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
@@ -1293,8 +1196,29 @@ void radix__tlb_flush(struct mmu_gather *tlb)
* that flushes the process table entry cache upon process teardown.
* See the comment for radix in arch_exit_mmap().
*/
- if (tlb->fullmm || tlb->need_flush_all) {
- __flush_all_mm(mm, true);
+ if (tlb->fullmm) {
+ if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
+ /*
+ * Shootdown based lazy tlb mm refcounting means we
+ * have to IPI everyone in the mm_cpumask anyway soon
+ * when the mm goes away, so might as well do it as
+ * part of the final flush now.
+ *
+ * If lazy shootdown was improved to reduce IPIs (e.g.,
+ * by batching), then it may end up being better to use
+ * tlbies here instead.
+ */
+ preempt_disable();
+
+ smp_mb(); /* see radix__flush_tlb_mm */
+ exit_flush_lazy_tlbs(mm);
+ __flush_all_mm(mm, true);
+
+ preempt_enable();
+ } else {
+ __flush_all_mm(mm, true);
+ }
+
} else if ( (psize = radix_get_mmu_psize(page_size)) == -1) {
if (!tlb->freed_tables)
radix__flush_tlb_mm(mm);
@@ -1316,25 +1240,22 @@ static void __radix__flush_tlb_range_psize(struct mm_struct *mm,
unsigned int page_shift = mmu_psize_defs[psize].shift;
unsigned long page_size = 1UL << page_shift;
unsigned long nr_pages = (end - start) >> page_shift;
- bool fullmm = (end == TLB_FLUSH_ALL);
bool flush_pid;
enum tlb_flush_type type;
pid = mm->context.id;
- if (unlikely(pid == MMU_NO_CONTEXT))
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
return;
- fullmm = (end == TLB_FLUSH_ALL);
+ WARN_ON_ONCE(end == TLB_FLUSH_ALL);
preempt_disable();
smp_mb(); /* see radix__flush_tlb_mm */
- type = flush_type_needed(mm, fullmm);
+ type = flush_type_needed(mm, false);
if (type == FLUSH_TYPE_NONE)
goto out;
- if (fullmm)
- flush_pid = true;
- else if (type == FLUSH_TYPE_GLOBAL)
+ if (type == FLUSH_TYPE_GLOBAL)
flush_pid = nr_pages > tlb_single_page_flush_ceiling;
else
flush_pid = nr_pages > tlb_local_single_page_flush_ceiling;
@@ -1376,6 +1297,7 @@ static void __radix__flush_tlb_range_psize(struct mm_struct *mm,
}
out:
preempt_enable();
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
}
void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
@@ -1397,7 +1319,7 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
enum tlb_flush_type type;
pid = mm->context.id;
- if (unlikely(pid == MMU_NO_CONTEXT))
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
return;
/* 4k page size, just blow the world */
@@ -1445,6 +1367,13 @@ void radix__flush_pmd_tlb_range(struct vm_area_struct *vma,
}
EXPORT_SYMBOL(radix__flush_pmd_tlb_range);
+void radix__flush_pud_tlb_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_1G);
+}
+EXPORT_SYMBOL(radix__flush_pud_tlb_range);
+
void radix__flush_tlb_all(void)
{
unsigned long rb,prs,r,rs;
@@ -1470,6 +1399,127 @@ void radix__flush_tlb_all(void)
}
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+static __always_inline void __tlbie_pid_lpid(unsigned long pid,
+ unsigned long lpid,
+ unsigned long ric)
+{
+ unsigned long rb, rs, prs, r;
+
+ rb = PPC_BIT(53); /* IS = 1 */
+ rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31)));
+ prs = 1; /* process scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(0, 0, rb, rs, ric, prs, r);
+}
+
+static __always_inline void __tlbie_va_lpid(unsigned long va, unsigned long pid,
+ unsigned long lpid,
+ unsigned long ap, unsigned long ric)
+{
+ unsigned long rb, rs, prs, r;
+
+ rb = va & ~(PPC_BITMASK(52, 63));
+ rb |= ap << PPC_BITLSHIFT(58);
+ rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31)));
+ prs = 1; /* process scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(0, 0, rb, rs, ric, prs, r);
+}
+
+static inline void fixup_tlbie_pid_lpid(unsigned long pid, unsigned long lpid)
+{
+ /*
+ * We can use any address for the invalidation, pick one which is
+ * probably unused as an optimisation.
+ */
+ unsigned long va = ((1UL << 52) - 1);
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+ asm volatile("ptesync" : : : "memory");
+ __tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB);
+ }
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+ asm volatile("ptesync" : : : "memory");
+ __tlbie_va_lpid(va, pid, lpid, mmu_get_ap(MMU_PAGE_64K),
+ RIC_FLUSH_TLB);
+ }
+}
+
+static inline void _tlbie_pid_lpid(unsigned long pid, unsigned long lpid,
+ unsigned long ric)
+{
+ asm volatile("ptesync" : : : "memory");
+
+ /*
+ * Workaround the fact that the "ric" argument to __tlbie_pid
+ * must be a compile-time contraint to match the "i" constraint
+ * in the asm statement.
+ */
+ switch (ric) {
+ case RIC_FLUSH_TLB:
+ __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB);
+ fixup_tlbie_pid_lpid(pid, lpid);
+ break;
+ case RIC_FLUSH_PWC:
+ __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
+ break;
+ case RIC_FLUSH_ALL:
+ default:
+ __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_ALL);
+ fixup_tlbie_pid_lpid(pid, lpid);
+ }
+ asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+}
+
+static inline void fixup_tlbie_va_range_lpid(unsigned long va,
+ unsigned long pid,
+ unsigned long lpid,
+ unsigned long ap)
+{
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+ asm volatile("ptesync" : : : "memory");
+ __tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB);
+ }
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+ asm volatile("ptesync" : : : "memory");
+ __tlbie_va_lpid(va, pid, lpid, ap, RIC_FLUSH_TLB);
+ }
+}
+
+static inline void __tlbie_va_range_lpid(unsigned long start, unsigned long end,
+ unsigned long pid, unsigned long lpid,
+ unsigned long page_size,
+ unsigned long psize)
+{
+ unsigned long addr;
+ unsigned long ap = mmu_get_ap(psize);
+
+ for (addr = start; addr < end; addr += page_size)
+ __tlbie_va_lpid(addr, pid, lpid, ap, RIC_FLUSH_TLB);
+
+ fixup_tlbie_va_range_lpid(addr - page_size, pid, lpid, ap);
+}
+
+static inline void _tlbie_va_range_lpid(unsigned long start, unsigned long end,
+ unsigned long pid, unsigned long lpid,
+ unsigned long page_size,
+ unsigned long psize, bool also_pwc)
+{
+ asm volatile("ptesync" : : : "memory");
+ if (also_pwc)
+ __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
+ __tlbie_va_range_lpid(start, end, pid, lpid, page_size, psize);
+ asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+}
+
/*
* Performs process-scoped invalidations for a given LPID
* as part of H_RPT_INVALIDATE hcall.
@@ -1524,3 +1574,14 @@ void do_h_rpt_invalidate_prt(unsigned long pid, unsigned long lpid,
EXPORT_SYMBOL_GPL(do_h_rpt_invalidate_prt);
#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
+
+static int __init create_tlb_single_page_flush_ceiling(void)
+{
+ debugfs_create_u32("tlb_single_page_flush_ceiling", 0600,
+ arch_debugfs_dir, &tlb_single_page_flush_ceiling);
+ debugfs_create_u32("tlb_local_single_page_flush_ceiling", 0600,
+ arch_debugfs_dir, &tlb_local_single_page_flush_ceiling);
+ return 0;
+}
+late_initcall(create_tlb_single_page_flush_ceiling);
+
diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c
index c91bd85eb90e..15f73abd1506 100644
--- a/arch/powerpc/mm/book3s64/slb.c
+++ b/arch/powerpc/mm/book3s64/slb.c
@@ -9,11 +9,11 @@
* Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
*/
-#include <asm/asm-prototypes.h>
#include <asm/interrupt.h>
#include <asm/mmu.h>
#include <asm/mmu_context.h>
#include <asm/paca.h>
+#include <asm/lppaca.h>
#include <asm/ppc-opcode.h>
#include <asm/cputable.h>
#include <asm/cacheflush.h>
@@ -24,7 +24,7 @@
#include <linux/pgtable.h>
#include <asm/udbg.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include "internal.h"
@@ -42,6 +42,15 @@ early_param("stress_slb", parse_stress_slb);
__ro_after_init DEFINE_STATIC_KEY_FALSE(stress_slb_key);
+bool no_slb_preload __initdata;
+static int __init parse_no_slb_preload(char *p)
+{
+ no_slb_preload = true;
+ return 0;
+}
+early_param("no_slb_preload", parse_no_slb_preload);
+__ro_after_init DEFINE_STATIC_KEY_FALSE(no_slb_preload_key);
+
static void assert_slb_presence(bool present, unsigned long ea)
{
#ifdef CONFIG_DEBUG_VM
@@ -294,11 +303,14 @@ static bool preload_hit(struct thread_info *ti, unsigned long esid)
return false;
}
-static bool preload_add(struct thread_info *ti, unsigned long ea)
+static void preload_add(struct thread_info *ti, unsigned long ea)
{
unsigned char idx;
unsigned long esid;
+ if (slb_preload_disabled())
+ return;
+
if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) {
/* EAs are stored >> 28 so 256MB segments don't need clearing */
if (ea & ESID_MASK_1T)
@@ -308,7 +320,7 @@ static bool preload_add(struct thread_info *ti, unsigned long ea)
esid = ea >> SID_SHIFT;
if (preload_hit(ti, esid))
- return false;
+ return;
idx = (ti->slb_preload_tail + ti->slb_preload_nr) % SLB_PRELOAD_NR;
ti->slb_preload_esid[idx] = esid;
@@ -316,8 +328,6 @@ static bool preload_add(struct thread_info *ti, unsigned long ea)
ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR;
else
ti->slb_preload_nr++;
-
- return true;
}
static void preload_age(struct thread_info *ti)
@@ -328,94 +338,6 @@ static void preload_age(struct thread_info *ti)
ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR;
}
-void slb_setup_new_exec(void)
-{
- struct thread_info *ti = current_thread_info();
- struct mm_struct *mm = current->mm;
- unsigned long exec = 0x10000000;
-
- WARN_ON(irqs_disabled());
-
- /*
- * preload cache can only be used to determine whether a SLB
- * entry exists if it does not start to overflow.
- */
- if (ti->slb_preload_nr + 2 > SLB_PRELOAD_NR)
- return;
-
- hard_irq_disable();
-
- /*
- * We have no good place to clear the slb preload cache on exec,
- * flush_thread is about the earliest arch hook but that happens
- * after we switch to the mm and have aleady preloaded the SLBEs.
- *
- * For the most part that's probably okay to use entries from the
- * previous exec, they will age out if unused. It may turn out to
- * be an advantage to clear the cache before switching to it,
- * however.
- */
-
- /*
- * preload some userspace segments into the SLB.
- * Almost all 32 and 64bit PowerPC executables are linked at
- * 0x10000000 so it makes sense to preload this segment.
- */
- if (!is_kernel_addr(exec)) {
- if (preload_add(ti, exec))
- slb_allocate_user(mm, exec);
- }
-
- /* Libraries and mmaps. */
- if (!is_kernel_addr(mm->mmap_base)) {
- if (preload_add(ti, mm->mmap_base))
- slb_allocate_user(mm, mm->mmap_base);
- }
-
- /* see switch_slb */
- asm volatile("isync" : : : "memory");
-
- local_irq_enable();
-}
-
-void preload_new_slb_context(unsigned long start, unsigned long sp)
-{
- struct thread_info *ti = current_thread_info();
- struct mm_struct *mm = current->mm;
- unsigned long heap = mm->start_brk;
-
- WARN_ON(irqs_disabled());
-
- /* see above */
- if (ti->slb_preload_nr + 3 > SLB_PRELOAD_NR)
- return;
-
- hard_irq_disable();
-
- /* Userspace entry address. */
- if (!is_kernel_addr(start)) {
- if (preload_add(ti, start))
- slb_allocate_user(mm, start);
- }
-
- /* Top of stack, grows down. */
- if (!is_kernel_addr(sp)) {
- if (preload_add(ti, sp))
- slb_allocate_user(mm, sp);
- }
-
- /* Bottom of heap, grows up. */
- if (heap && !is_kernel_addr(heap)) {
- if (preload_add(ti, heap))
- slb_allocate_user(mm, heap);
- }
-
- /* see switch_slb */
- asm volatile("isync" : : : "memory");
-
- local_irq_enable();
-}
-
static void slb_cache_slbie_kernel(unsigned int index)
{
unsigned long slbie_data = get_paca()->slb_cache[index];
@@ -502,6 +424,9 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
copy_mm_to_paca(mm);
+ if (slb_preload_disabled())
+ return;
+
/*
* We gradually age out SLBs after a number of context switches to
* reduce reload overhead of unused entries (like we do with FP/VEC
@@ -616,7 +541,7 @@ static void slb_cache_update(unsigned long esid_data)
} else {
/*
* Our cache is full and the current cache content strictly
- * doesn't indicate the active SLB conents. Bump the ptr
+ * doesn't indicate the active SLB contents. Bump the ptr
* so that switch_slb() will ignore the cache.
*/
local_paca->slb_cache_ptr = SLB_CACHE_ENTRIES + 1;
@@ -822,7 +747,7 @@ DEFINE_INTERRUPT_HANDLER_RAW(do_slb_fault)
/* IRQs are not reconciled here, so can't check irqs_disabled */
VM_WARN_ON(mfmsr() & MSR_EE);
- if (unlikely(!(regs->msr & MSR_RI)))
+ if (regs_is_unrecoverable(regs))
return -EINVAL;
/*
@@ -868,19 +793,3 @@ DEFINE_INTERRUPT_HANDLER_RAW(do_slb_fault)
return err;
}
}
-
-DEFINE_INTERRUPT_HANDLER(do_bad_slb_fault)
-{
- int err = regs->result;
-
- if (err == -EFAULT) {
- if (user_mode(regs))
- _exception(SIGSEGV, regs, SEGV_BNDERR, regs->dar);
- else
- bad_page_fault(regs, SIGSEGV);
- } else if (err == -EINVAL) {
- unrecoverable_exception(regs);
- } else {
- BUG();
- }
-}
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/book3s64/slice.c
index 82b45b1cb973..28bec5bc7879 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/book3s64/slice.c
@@ -22,7 +22,7 @@
#include <linux/security.h>
#include <asm/mman.h>
#include <asm/mmu.h>
-#include <asm/copro.h>
+#include <asm/spu.h>
#include <asm/hugetlb.h>
#include <asm/mmu_context.h>
@@ -248,7 +248,9 @@ static void slice_convert(struct mm_struct *mm,
spin_unlock_irqrestore(&slice_convert_lock, flags);
- copro_flush_all_slbs(mm);
+#ifdef CONFIG_SPU_BASE
+ spu_flush_all_slbs(mm);
+#endif
}
/*
@@ -276,20 +278,16 @@ static bool slice_scan_available(unsigned long addr,
}
static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
- unsigned long len,
+ unsigned long addr, unsigned long len,
const struct slice_mask *available,
int psize, unsigned long high_limit)
{
int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
- unsigned long addr, found, next_end;
- struct vm_unmapped_area_info info;
-
- info.flags = 0;
- info.length = len;
- info.align_mask = PAGE_MASK & ((1ul << pshift) - 1);
- info.align_offset = 0;
-
- addr = TASK_UNMAPPED_BASE;
+ unsigned long found, next_end;
+ struct vm_unmapped_area_info info = {
+ .length = len,
+ .align_mask = PAGE_MASK & ((1ul << pshift) - 1),
+ };
/*
* Check till the allow max value for this mmap request
*/
@@ -322,21 +320,19 @@ static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
}
static unsigned long slice_find_area_topdown(struct mm_struct *mm,
- unsigned long len,
+ unsigned long addr, unsigned long len,
const struct slice_mask *available,
int psize, unsigned long high_limit)
{
int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
- unsigned long addr, found, prev;
- struct vm_unmapped_area_info info;
+ unsigned long found, prev;
+ struct vm_unmapped_area_info info = {
+ .flags = VM_UNMAPPED_AREA_TOPDOWN,
+ .length = len,
+ .align_mask = PAGE_MASK & ((1ul << pshift) - 1),
+ };
unsigned long min_addr = max(PAGE_SIZE, mmap_min_addr);
- info.flags = VM_UNMAPPED_AREA_TOPDOWN;
- info.length = len;
- info.align_mask = PAGE_MASK & ((1ul << pshift) - 1);
- info.align_offset = 0;
-
- addr = mm->mmap_base;
/*
* If we are trying to allocate above DEFAULT_MAP_WINDOW
* Add the different to the mmap_base.
@@ -377,7 +373,7 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm,
* can happen with large stack limits and large mmap()
* allocations.
*/
- return slice_find_area_bottomup(mm, len, available, psize, high_limit);
+ return slice_find_area_bottomup(mm, TASK_UNMAPPED_BASE, len, available, psize, high_limit);
}
@@ -386,9 +382,9 @@ static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len,
int topdown, unsigned long high_limit)
{
if (topdown)
- return slice_find_area_topdown(mm, len, mask, psize, high_limit);
+ return slice_find_area_topdown(mm, mm->mmap_base, len, mask, psize, high_limit);
else
- return slice_find_area_bottomup(mm, len, mask, psize, high_limit);
+ return slice_find_area_bottomup(mm, mm->mmap_base, len, mask, psize, high_limit);
}
static inline void slice_copy_mask(struct slice_mask *dst,
@@ -639,24 +635,58 @@ return_addr:
}
EXPORT_SYMBOL_GPL(slice_get_unmapped_area);
+#ifdef CONFIG_HUGETLB_PAGE
+static int file_to_psize(struct file *file)
+{
+ struct hstate *hstate = hstate_file(file);
+
+ return shift_to_mmu_psize(huge_page_shift(hstate));
+}
+#else
+static int file_to_psize(struct file *file)
+{
+ return 0;
+}
+#endif
+
unsigned long arch_get_unmapped_area(struct file *filp,
unsigned long addr,
unsigned long len,
unsigned long pgoff,
- unsigned long flags)
+ unsigned long flags,
+ vm_flags_t vm_flags)
{
- return slice_get_unmapped_area(addr, len, flags,
- mm_ctx_user_psize(&current->mm->context), 0);
+ unsigned int psize;
+
+ if (radix_enabled())
+ return generic_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags);
+
+ if (filp && is_file_hugepages(filp))
+ psize = file_to_psize(filp);
+ else
+ psize = mm_ctx_user_psize(&current->mm->context);
+
+ return slice_get_unmapped_area(addr, len, flags, psize, 0);
}
unsigned long arch_get_unmapped_area_topdown(struct file *filp,
const unsigned long addr0,
const unsigned long len,
const unsigned long pgoff,
- const unsigned long flags)
+ const unsigned long flags,
+ vm_flags_t vm_flags)
{
- return slice_get_unmapped_area(addr0, len, flags,
- mm_ctx_user_psize(&current->mm->context), 1);
+ unsigned int psize;
+
+ if (radix_enabled())
+ return generic_get_unmapped_area_topdown(filp, addr0, len, pgoff, flags, vm_flags);
+
+ if (filp && is_file_hugepages(filp))
+ psize = file_to_psize(filp);
+ else
+ psize = mm_ctx_user_psize(&current->mm->context);
+
+ return slice_get_unmapped_area(addr0, len, flags, psize, 1);
}
unsigned int notrace get_slice_psize(struct mm_struct *mm, unsigned long addr)
@@ -712,7 +742,6 @@ void slice_init_new_context_exec(struct mm_struct *mm)
bitmap_fill(mask->high_slices, SLICE_NUM_HIGH);
}
-#ifdef CONFIG_PPC_BOOK3S_64
void slice_setup_new_exec(void)
{
struct mm_struct *mm = current->mm;
@@ -724,7 +753,6 @@ void slice_setup_new_exec(void)
mm_ctx_set_slb_addr_limit(&mm->context, DEFAULT_MAP_WINDOW);
}
-#endif
void slice_set_range_psize(struct mm_struct *mm, unsigned long start,
unsigned long len, unsigned int psize)
@@ -779,4 +807,13 @@ int slice_is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
return !slice_check_range_fits(mm, maskp, addr, len);
}
+
+unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
+{
+ /* With radix we don't use slice, so derive it from vma*/
+ if (radix_enabled())
+ return vma_kernel_pagesize(vma);
+
+ return 1UL << mmu_psize_to_shift(get_slice_psize(vma->vm_mm, vma->vm_start));
+}
#endif
diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c
index 60c6ea16a972..ec98e526167e 100644
--- a/arch/powerpc/mm/book3s64/subpage_prot.c
+++ b/arch/powerpc/mm/book3s64/subpage_prot.c
@@ -71,6 +71,8 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
if (pmd_none(*pmd))
return;
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (!pte)
+ return;
arch_enter_lazy_mmu_mode();
for (; npages > 0; --npages) {
pte_update(mm, addr, pte, 0, 0, 0);
@@ -143,30 +145,22 @@ static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
static const struct mm_walk_ops subpage_walk_ops = {
.pmd_entry = subpage_walk_pmd_entry,
+ .walk_lock = PGWALK_WRLOCK_VERIFY,
};
static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
unsigned long len)
{
struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, addr);
/*
* We don't try too hard, we just mark all the vma in that range
* VM_NOHUGEPAGE and split them.
*/
- vma = find_vma(mm, addr);
- /*
- * If the range is in unmapped range, just return
- */
- if (vma && ((addr + len) <= vma->vm_start))
- return;
-
- while (vma) {
- if (vma->vm_start >= (addr + len))
- break;
- vma->vm_flags |= VM_NOHUGEPAGE;
+ for_each_vma_range(vmi, vma, addr + len) {
+ vm_flags_set(vma, VM_NOHUGEPAGE);
walk_page_vma(vma, &subpage_walk_ops, NULL);
- vma = vma->vm_next;
}
}
#else
diff --git a/arch/powerpc/mm/book3s64/trace.c b/arch/powerpc/mm/book3s64/trace.c
new file mode 100644
index 000000000000..ccd64b5e6cac
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/trace.c
@@ -0,0 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * This file is for defining trace points and trace related helpers.
+ */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#include <trace/events/thp.h>
+#endif
diff --git a/arch/powerpc/mm/cacheflush.c b/arch/powerpc/mm/cacheflush.c
index 63363787e000..7186516eca52 100644
--- a/arch/powerpc/mm/cacheflush.c
+++ b/arch/powerpc/mm/cacheflush.c
@@ -12,7 +12,7 @@ static inline bool flush_coherent_icache(void)
/*
* For a snooping icache, we still need a dummy icbi to purge all the
* prefetched instructions from the ifetch buffers. We also need a sync
- * before the icbi to order the the actual stores to memory that might
+ * before the icbi to order the actual stores to memory that might
* have modified instructions with the icbi.
*/
if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) {
@@ -78,7 +78,7 @@ EXPORT_SYMBOL(flush_icache_range);
#ifdef CONFIG_HIGHMEM
/**
- * flush_dcache_icache_phys() - Flush a page by it's physical address
+ * flush_dcache_icache_phys() - Flush a page by its physical address
* @physaddr: the physical address of the page
*/
static void flush_dcache_icache_phys(unsigned long physaddr)
@@ -148,44 +148,31 @@ static void __flush_dcache_icache(void *p)
invalidate_icache_range(addr, addr + PAGE_SIZE);
}
-static void flush_dcache_icache_hugepage(struct page *page)
+void flush_dcache_icache_folio(struct folio *folio)
{
- int i;
- int nr = compound_nr(page);
+ unsigned int i, nr = folio_nr_pages(folio);
- if (!PageHighMem(page)) {
+ if (flush_coherent_icache())
+ return;
+
+ if (!folio_test_highmem(folio)) {
+ void *addr = folio_address(folio);
for (i = 0; i < nr; i++)
- __flush_dcache_icache(lowmem_page_address(page + i));
- } else {
+ __flush_dcache_icache(addr + i * PAGE_SIZE);
+ } else if (IS_ENABLED(CONFIG_BOOKE) || sizeof(phys_addr_t) > sizeof(void *)) {
for (i = 0; i < nr; i++) {
- void *start = kmap_local_page(page + i);
+ void *start = kmap_local_folio(folio, i * PAGE_SIZE);
__flush_dcache_icache(start);
kunmap_local(start);
}
- }
-}
-
-void flush_dcache_icache_page(struct page *page)
-{
- if (flush_coherent_icache())
- return;
-
- if (PageCompound(page))
- return flush_dcache_icache_hugepage(page);
-
- if (!PageHighMem(page)) {
- __flush_dcache_icache(lowmem_page_address(page));
- } else if (IS_ENABLED(CONFIG_BOOKE) || sizeof(phys_addr_t) > sizeof(void *)) {
- void *start = kmap_local_page(page);
-
- __flush_dcache_icache(start);
- kunmap_local(start);
} else {
- flush_dcache_icache_phys(page_to_phys(page));
+ unsigned long pfn = folio_pfn(folio);
+ for (i = 0; i < nr; i++)
+ flush_dcache_icache_phys((pfn + i) * PAGE_SIZE);
}
}
-EXPORT_SYMBOL(flush_dcache_icache_page);
+EXPORT_SYMBOL(flush_dcache_icache_folio);
void clear_user_page(void *page, unsigned long vaddr, struct page *pg)
{
diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c
index 8acd00178956..f5f8692e2c69 100644
--- a/arch/powerpc/mm/copro_fault.c
+++ b/arch/powerpc/mm/copro_fault.c
@@ -12,8 +12,6 @@
#include <linux/export.h>
#include <asm/reg.h>
#include <asm/copro.h>
-#include <asm/spu.h>
-#include <misc/cxl-base.h>
/*
* This ought to be kept in sync with the powerpc specific do_page_fault
@@ -33,19 +31,11 @@ int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea,
if (mm->pgd == NULL)
return -EFAULT;
- mmap_read_lock(mm);
- ret = -EFAULT;
- vma = find_vma(mm, ea);
+ vma = lock_mm_and_find_vma(mm, ea, NULL);
if (!vma)
- goto out_unlock;
-
- if (ea < vma->vm_start) {
- if (!(vma->vm_flags & VM_GROWSDOWN))
- goto out_unlock;
- if (expand_stack(vma, ea))
- goto out_unlock;
- }
+ return -EFAULT;
+ ret = -EFAULT;
is_write = dsisr & DSISR_ISSTORE;
if (is_write) {
if (!(vma->vm_flags & VM_WRITE))
@@ -65,6 +55,11 @@ int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea,
ret = 0;
*flt = handle_mm_fault(vma, ea, is_write ? FAULT_FLAG_WRITE : 0, NULL);
+
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (*flt & VM_FAULT_COMPLETED)
+ return 0;
+
if (unlikely(*flt & VM_FAULT_ERROR)) {
if (*flt & VM_FAULT_OOM) {
ret = -ENOMEM;
@@ -82,6 +77,7 @@ out_unlock:
}
EXPORT_SYMBOL_GPL(copro_handle_mm_fault);
+#ifdef CONFIG_PPC_64S_HASH_MMU
int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb)
{
u64 vsid, vsidkey;
@@ -137,12 +133,4 @@ int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb)
return 0;
}
EXPORT_SYMBOL_GPL(copro_calculate_slb);
-
-void copro_flush_all_slbs(struct mm_struct *mm)
-{
-#ifdef CONFIG_SPU_BASE
- spu_flush_all_slbs(mm);
#endif
- cxl_slbia(mm);
-}
-EXPORT_SYMBOL_GPL(copro_flush_all_slbs);
diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c
index 9af3832c9d8d..8dd7b340d51f 100644
--- a/arch/powerpc/mm/drmem.c
+++ b/arch/powerpc/mm/drmem.c
@@ -11,13 +11,14 @@
#include <linux/of.h>
#include <linux/of_fdt.h>
#include <linux/memblock.h>
-#include <asm/prom.h>
+#include <linux/slab.h>
#include <asm/drmem.h>
static int n_root_addr_cells, n_root_size_cells;
static struct drmem_lmb_info __drmem_info;
struct drmem_lmb_info *drmem_info = &__drmem_info;
+static bool in_drmem_update;
u64 drmem_lmb_memory_max(void)
{
@@ -66,7 +67,7 @@ static int drmem_update_dt_v1(struct device_node *memory,
struct property *new_prop;
struct of_drconf_cell_v1 *dr_cell;
struct drmem_lmb *lmb;
- u32 *p;
+ __be32 *p;
new_prop = clone_property(prop, prop->length);
if (!new_prop)
@@ -178,6 +179,11 @@ int drmem_update_dt(void)
if (!memory)
return -1;
+ /*
+ * Set in_drmem_update to prevent the notifier callback to process the
+ * DT property back since the change is coming from the LMB tree.
+ */
+ in_drmem_update = true;
prop = of_find_property(memory, "ibm,dynamic-memory", NULL);
if (prop) {
rc = drmem_update_dt_v1(memory, prop);
@@ -186,6 +192,7 @@ int drmem_update_dt(void)
if (prop)
rc = drmem_update_dt_v2(memory, prop);
}
+ in_drmem_update = false;
of_node_put(memory);
return rc;
@@ -307,6 +314,45 @@ int __init walk_drmem_lmbs_early(unsigned long node, void *data,
return ret;
}
+/*
+ * Update the LMB associativity index.
+ */
+static int update_lmb(struct drmem_lmb *updated_lmb,
+ __maybe_unused const __be32 **usm,
+ __maybe_unused void *data)
+{
+ struct drmem_lmb *lmb;
+
+ for_each_drmem_lmb(lmb) {
+ if (lmb->drc_index != updated_lmb->drc_index)
+ continue;
+
+ lmb->aa_index = updated_lmb->aa_index;
+ break;
+ }
+ return 0;
+}
+
+/*
+ * Update the LMB associativity index.
+ *
+ * This needs to be called when the hypervisor is updating the
+ * dynamic-reconfiguration-memory node property.
+ */
+void drmem_update_lmbs(struct property *prop)
+{
+ /*
+ * Don't update the LMBs if triggered by the update done in
+ * drmem_update_dt(), the LMB values have been used to the update the DT
+ * property in that case.
+ */
+ if (in_drmem_update)
+ return;
+ if (!strcmp(prop->name, "ibm,dynamic-memory"))
+ __walk_drmem_v1_lmbs(prop->value, NULL, NULL, update_lmb);
+ else if (!strcmp(prop->name, "ibm,dynamic-memory-v2"))
+ __walk_drmem_v2_lmbs(prop->value, NULL, NULL, update_lmb);
+}
#endif
static int init_drmem_lmb_size(struct device_node *dn)
@@ -347,17 +393,17 @@ static const __be32 *of_get_usable_memory(struct device_node *dn)
int walk_drmem_lmbs(struct device_node *dn, void *data,
int (*func)(struct drmem_lmb *, const __be32 **, void *))
{
+ struct device_node *root = of_find_node_by_path("/");
const __be32 *prop, *usm;
int ret = -ENODEV;
- if (!of_root)
+ if (!root)
return ret;
/* Get the address & size cells */
- of_node_get(of_root);
- n_root_addr_cells = of_n_addr_cells(of_root);
- n_root_size_cells = of_n_size_cells(of_root);
- of_node_put(of_root);
+ n_root_addr_cells = of_n_addr_cells(root);
+ n_root_size_cells = of_n_size_cells(root);
+ of_node_put(root);
if (init_drmem_lmb_size(dn))
return ret;
@@ -445,10 +491,8 @@ static int __init drmem_init(void)
const __be32 *prop;
dn = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
- if (!dn) {
- pr_info("No dynamic reconfiguration memory found\n");
+ if (!dn)
return 0;
- }
if (init_drmem_lmb_size(dn)) {
of_node_put(dn);
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index a8d0ce85d39a..806c74e0d5ab 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -17,6 +17,7 @@
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
+#include <linux/string_choices.h>
#include <linux/types.h>
#include <linux/pagemap.h>
#include <linux/ptrace.h>
@@ -71,28 +72,26 @@ static noinline int bad_area_nosemaphore(struct pt_regs *regs, unsigned long add
return __bad_area_nosemaphore(regs, address, SEGV_MAPERR);
}
-static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code)
+static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code,
+ struct mm_struct *mm, struct vm_area_struct *vma)
{
- struct mm_struct *mm = current->mm;
/*
* Something tried to access memory that isn't in our memory map..
* Fix it, but check if it's kernel or user first..
*/
- mmap_read_unlock(mm);
+ if (mm)
+ mmap_read_unlock(mm);
+ else
+ vma_end_read(vma);
return __bad_area_nosemaphore(regs, address, si_code);
}
-static noinline int bad_area(struct pt_regs *regs, unsigned long address)
-{
- return __bad_area(regs, address, SEGV_MAPERR);
-}
-
static noinline int bad_access_pkey(struct pt_regs *regs, unsigned long address,
+ struct mm_struct *mm,
struct vm_area_struct *vma)
{
- struct mm_struct *mm = current->mm;
int pkey;
/*
@@ -114,7 +113,10 @@ static noinline int bad_access_pkey(struct pt_regs *regs, unsigned long address,
*/
pkey = vma_pkey(vma);
- mmap_read_unlock(mm);
+ if (mm)
+ mmap_read_unlock(mm);
+ else
+ vma_end_read(vma);
/*
* If we are in kernel mode, bail out with a SEGV, this will
@@ -129,9 +131,10 @@ static noinline int bad_access_pkey(struct pt_regs *regs, unsigned long address,
return 0;
}
-static noinline int bad_access(struct pt_regs *regs, unsigned long address)
+static noinline int bad_access(struct pt_regs *regs, unsigned long address,
+ struct mm_struct *mm, struct vm_area_struct *vma)
{
- return __bad_area(regs, address, SEGV_ACCERR);
+ return __bad_area(regs, address, SEGV_ACCERR, mm, vma);
}
static int do_sigbus(struct pt_regs *regs, unsigned long address,
@@ -216,7 +219,7 @@ static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code,
// Read/write fault blocked by KUAP is bad, it can never succeed.
if (bad_kuap_fault(regs, address, is_write)) {
pr_crit_ratelimited("Kernel attempted to %s user page (%lx) - exploit attempt? (uid: %d)\n",
- is_write ? "write" : "read", address,
+ str_write_read(is_write), address,
from_kuid(&init_user_ns, current_uid()));
// Fault on user outside of certain regions (eg. copy_tofrom_user()) is bad
@@ -270,8 +273,18 @@ static bool access_error(bool is_write, bool is_exec, struct vm_area_struct *vma
return false;
}
+ /*
+ * VM_READ, VM_WRITE and VM_EXEC may imply read permissions, as
+ * defined in protection_map[]. In that case Read faults can only be
+ * caused by a PROT_NONE mapping. However a non exec access on a
+ * VM_EXEC only mapping is invalid anyway, so report it as such.
+ */
if (unlikely(!vma_is_accessible(vma)))
return true;
+
+ if ((vma->vm_flags & VM_ACCESS_FLAGS) == VM_EXEC)
+ return true;
+
/*
* We should ideally do the vma pkey access check here. But in the
* fault path, handle_mm_fault() also does the same check. To avoid
@@ -356,18 +369,33 @@ static void sanity_check_fault(bool is_write, bool is_user,
* Define the correct "is_write" bit in error_code based
* on the processor family
*/
-#if (defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
+#ifdef CONFIG_BOOKE
#define page_fault_is_write(__err) ((__err) & ESR_DST)
#else
#define page_fault_is_write(__err) ((__err) & DSISR_ISSTORE)
#endif
-#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
+#ifdef CONFIG_BOOKE
#define page_fault_is_bad(__err) (0)
#elif defined(CONFIG_PPC_8xx)
#define page_fault_is_bad(__err) ((__err) & DSISR_NOEXEC_OR_G)
#elif defined(CONFIG_PPC64)
-#define page_fault_is_bad(__err) ((__err) & DSISR_BAD_FAULT_64S)
+static int page_fault_is_bad(unsigned long err)
+{
+ unsigned long flag = DSISR_BAD_FAULT_64S;
+
+ /*
+ * PAPR+ v2.11 § 14.15.3.4.1 (unreleased)
+ * If byte 0, bit 3 of pi-attribute-specifier-type in
+ * ibm,pi-features property is defined, ignore the DSI error
+ * which is caused by the paste instruction on the
+ * suspended NX window.
+ */
+ if (mmu_has_feature(MMU_FTR_NX_DSI))
+ flag &= ~DSISR_BAD_COPYPASTE;
+
+ return err & flag;
+}
#else
#define page_fault_is_bad(__err) ((__err) & DSISR_BAD_FAULT_32S)
#endif
@@ -412,10 +440,16 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
/*
* The kernel should never take an execute fault nor should it
* take a page fault to a kernel address or a page fault to a user
- * address outside of dedicated places
+ * address outside of dedicated places.
+ *
+ * Rather than kfence directly reporting false negatives, search whether
+ * the NIP belongs to the fixup table for cases where fault could come
+ * from functions like copy_from_kernel_nofault().
*/
if (unlikely(!is_user && bad_kernel_fault(regs, error_code, address, is_write))) {
- if (kfence_handle_page_fault(address, is_write, regs))
+ if (is_kfence_address((void *)address) &&
+ !search_exception_tables(instruction_pointer(regs)) &&
+ kfence_handle_page_fault(address, is_write, regs))
return 0;
return SIGSEGV;
@@ -450,6 +484,41 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
if (is_exec)
flags |= FAULT_FLAG_INSTRUCTION;
+ if (!(flags & FAULT_FLAG_USER))
+ goto lock_mmap;
+
+ vma = lock_vma_under_rcu(mm, address);
+ if (!vma)
+ goto lock_mmap;
+
+ if (unlikely(access_pkey_error(is_write, is_exec,
+ (error_code & DSISR_KEYFAULT), vma))) {
+ count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+ return bad_access_pkey(regs, address, NULL, vma);
+ }
+
+ if (unlikely(access_error(is_write, is_exec, vma))) {
+ count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+ return bad_access(regs, address, NULL, vma);
+ }
+
+ fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
+ if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
+ vma_end_read(vma);
+
+ if (!(fault & VM_FAULT_RETRY)) {
+ count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+ goto done;
+ }
+ count_vm_vma_lock_event(VMA_LOCK_RETRY);
+ if (fault & VM_FAULT_MAJOR)
+ flags |= FAULT_FLAG_TRIED;
+
+ if (fault_signal_pending(fault, regs))
+ return user_mode(regs) ? 0 : SIGBUS;
+
+lock_mmap:
+
/* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in the
* kernel and should generate an OOPS. Unfortunately, in the case of an
@@ -457,47 +526,19 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
* we will deadlock attempting to validate the fault against the
* address space. Luckily the kernel only validly references user
* space from well defined areas of code, which are listed in the
- * exceptions table.
- *
- * As the vast majority of faults will be valid we will only perform
- * the source reference check when there is a possibility of a deadlock.
- * Attempt to lock the address space, if we cannot we then validate the
- * source. If this is invalid we can skip the address space check,
- * thus avoiding the deadlock.
+ * exceptions table. lock_mm_and_find_vma() handles that logic.
*/
- if (unlikely(!mmap_read_trylock(mm))) {
- if (!is_user && !search_exception_tables(regs->nip))
- return bad_area_nosemaphore(regs, address);
-
retry:
- mmap_read_lock(mm);
- } else {
- /*
- * The above down_read_trylock() might have succeeded in
- * which case we'll have missed the might_sleep() from
- * down_read():
- */
- might_sleep();
- }
-
- vma = find_vma(mm, address);
+ vma = lock_mm_and_find_vma(mm, address, regs);
if (unlikely(!vma))
- return bad_area(regs, address);
-
- if (unlikely(vma->vm_start > address)) {
- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
- return bad_area(regs, address);
-
- if (unlikely(expand_stack(vma, address)))
- return bad_area(regs, address);
- }
+ return bad_area_nosemaphore(regs, address);
if (unlikely(access_pkey_error(is_write, is_exec,
(error_code & DSISR_KEYFAULT), vma)))
- return bad_access_pkey(regs, address, vma);
+ return bad_access_pkey(regs, address, mm, vma);
if (unlikely(access_error(is_write, is_exec, vma)))
- return bad_access(regs, address);
+ return bad_access(regs, address, mm, vma);
/*
* If for any reason at all we couldn't handle the fault,
@@ -511,22 +552,26 @@ retry:
if (fault_signal_pending(fault, regs))
return user_mode(regs) ? 0 : SIGBUS;
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ goto out;
+
/*
* Handle the retry right now, the mmap_lock has been released in that
* case.
*/
if (unlikely(fault & VM_FAULT_RETRY)) {
- if (flags & FAULT_FLAG_ALLOW_RETRY) {
- flags |= FAULT_FLAG_TRIED;
- goto retry;
- }
+ flags |= FAULT_FLAG_TRIED;
+ goto retry;
}
mmap_read_unlock(current->mm);
+done:
if (unlikely(fault & VM_FAULT_ERROR))
return mm_fault_error(regs, address, fault);
+out:
/*
* Major/minor page fault accounting.
*/
@@ -568,17 +613,23 @@ NOKPROBE_SYMBOL(hash__do_page_fault);
static void __bad_page_fault(struct pt_regs *regs, int sig)
{
int is_write = page_fault_is_write(regs->dsisr);
+ const char *msg;
/* kernel has accessed a bad area */
+ if (regs->dar < PAGE_SIZE)
+ msg = "Kernel NULL pointer dereference";
+ else
+ msg = "Unable to handle kernel data access";
+
switch (TRAP(regs)) {
case INTERRUPT_DATA_STORAGE:
- case INTERRUPT_DATA_SEGMENT:
case INTERRUPT_H_DATA_STORAGE:
- pr_alert("BUG: %s on %s at 0x%08lx\n",
- regs->dar < PAGE_SIZE ? "Kernel NULL pointer dereference" :
- "Unable to handle kernel data access",
- is_write ? "write" : "read", regs->dar);
+ pr_alert("BUG: %s on %s at 0x%08lx\n", msg,
+ str_write_read(is_write), regs->dar);
+ break;
+ case INTERRUPT_DATA_SEGMENT:
+ pr_alert("BUG: %s at 0x%08lx\n", msg, regs->dar);
break;
case INTERRUPT_INST_STORAGE:
case INTERRUPT_INST_SEGMENT:
@@ -620,4 +671,27 @@ DEFINE_INTERRUPT_HANDLER(do_bad_page_fault_segv)
{
bad_page_fault(regs, SIGSEGV);
}
+
+/*
+ * In radix, segment interrupts indicate the EA is not addressable by the
+ * page table geometry, so they are always sent here.
+ *
+ * In hash, this is called if do_slb_fault returns error. Typically it is
+ * because the EA was outside the region allowed by software.
+ */
+DEFINE_INTERRUPT_HANDLER(do_bad_segment_interrupt)
+{
+ int err = regs->result;
+
+ if (err == -EFAULT) {
+ if (user_mode(regs))
+ _exception(SIGSEGV, regs, SEGV_BNDERR, regs->dar);
+ else
+ bad_page_fault(regs, SIGSEGV);
+ } else if (err == -EINVAL) {
+ unrecoverable_exception(regs);
+ } else {
+ BUG();
+ }
+}
#endif
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 9a75ba078e1b..d3c1b749dcfc 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -24,11 +24,10 @@
#include <asm/setup.h>
#include <asm/hugetlb.h>
#include <asm/pte-walk.h>
+#include <asm/firmware.h>
bool hugetlb_disabled = false;
-#define hugepd_none(hpd) (hpd_val(hpd) == 0)
-
#define PTE_T_ORDER (__builtin_ffs(sizeof(pte_basic_t)) - \
__builtin_ffs(sizeof(void *)))
@@ -41,156 +40,43 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long s
return __find_linux_pte(mm->pgd, addr, NULL, NULL);
}
-static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
- unsigned long address, unsigned int pdshift,
- unsigned int pshift, spinlock_t *ptl)
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, unsigned long sz)
{
- struct kmem_cache *cachep;
- pte_t *new;
- int i;
- int num_hugepd;
-
- if (pshift >= pdshift) {
- cachep = PGT_CACHE(PTE_T_ORDER);
- num_hugepd = 1 << (pshift - pdshift);
- } else {
- cachep = PGT_CACHE(pdshift - pshift);
- num_hugepd = 1;
- }
-
- if (!cachep) {
- WARN_ONCE(1, "No page table cache created for hugetlb tables");
- return -ENOMEM;
- }
-
- new = kmem_cache_alloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL));
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
- BUG_ON(pshift > HUGEPD_SHIFT_MASK);
- BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
+ addr &= ~(sz - 1);
- if (!new)
- return -ENOMEM;
+ p4d = p4d_offset(pgd_offset(mm, addr), addr);
+ if (!mm_pud_folded(mm) && sz >= P4D_SIZE)
+ return (pte_t *)p4d;
- /*
- * Make sure other cpus find the hugepd set only after a
- * properly initialized page table is visible to them.
- * For more details look for comment in __pte_alloc().
- */
- smp_wmb();
+ pud = pud_alloc(mm, p4d, addr);
+ if (!pud)
+ return NULL;
+ if (!mm_pmd_folded(mm) && sz >= PUD_SIZE)
+ return (pte_t *)pud;
- spin_lock(ptl);
- /*
- * We have multiple higher-level entries that point to the same
- * actual pte location. Fill in each as we go and backtrack on error.
- * We need all of these so the DTLB pgtable walk code can find the
- * right higher-level entry without knowing if it's a hugepage or not.
- */
- for (i = 0; i < num_hugepd; i++, hpdp++) {
- if (unlikely(!hugepd_none(*hpdp)))
- break;
- hugepd_populate(hpdp, new, pshift);
- }
- /* If we bailed from the for loop early, an error occurred, clean up */
- if (i < num_hugepd) {
- for (i = i - 1 ; i >= 0; i--, hpdp--)
- *hpdp = __hugepd(0);
- kmem_cache_free(cachep, new);
- } else {
- kmemleak_ignore(new);
- }
- spin_unlock(ptl);
- return 0;
-}
+ pmd = pmd_alloc(mm, pud, addr);
+ if (!pmd)
+ return NULL;
-/*
- * At this point we do the placement change only for BOOK3S 64. This would
- * possibly work on other subarchs.
- */
-pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, unsigned long sz)
-{
- pgd_t *pg;
- p4d_t *p4;
- pud_t *pu;
- pmd_t *pm;
- hugepd_t *hpdp = NULL;
- unsigned pshift = __ffs(sz);
- unsigned pdshift = PGDIR_SHIFT;
- spinlock_t *ptl;
-
- addr &= ~(sz-1);
- pg = pgd_offset(mm, addr);
- p4 = p4d_offset(pg, addr);
+ if (sz >= PMD_SIZE) {
+ /* On 8xx, all hugepages are handled as contiguous PTEs */
+ if (IS_ENABLED(CONFIG_PPC_8xx)) {
+ int i;
-#ifdef CONFIG_PPC_BOOK3S_64
- if (pshift == PGDIR_SHIFT)
- /* 16GB huge page */
- return (pte_t *) p4;
- else if (pshift > PUD_SHIFT) {
- /*
- * We need to use hugepd table
- */
- ptl = &mm->page_table_lock;
- hpdp = (hugepd_t *)p4;
- } else {
- pdshift = PUD_SHIFT;
- pu = pud_alloc(mm, p4, addr);
- if (!pu)
- return NULL;
- if (pshift == PUD_SHIFT)
- return (pte_t *)pu;
- else if (pshift > PMD_SHIFT) {
- ptl = pud_lockptr(mm, pu);
- hpdp = (hugepd_t *)pu;
- } else {
- pdshift = PMD_SHIFT;
- pm = pmd_alloc(mm, pu, addr);
- if (!pm)
- return NULL;
- if (pshift == PMD_SHIFT)
- /* 16MB hugepage */
- return (pte_t *)pm;
- else {
- ptl = pmd_lockptr(mm, pm);
- hpdp = (hugepd_t *)pm;
+ for (i = 0; i < sz / PMD_SIZE; i++) {
+ if (!pte_alloc_huge(mm, pmd + i, addr))
+ return NULL;
}
}
+ return (pte_t *)pmd;
}
-#else
- if (pshift >= PGDIR_SHIFT) {
- ptl = &mm->page_table_lock;
- hpdp = (hugepd_t *)p4;
- } else {
- pdshift = PUD_SHIFT;
- pu = pud_alloc(mm, p4, addr);
- if (!pu)
- return NULL;
- if (pshift >= PUD_SHIFT) {
- ptl = pud_lockptr(mm, pu);
- hpdp = (hugepd_t *)pu;
- } else {
- pdshift = PMD_SHIFT;
- pm = pmd_alloc(mm, pu, addr);
- if (!pm)
- return NULL;
- ptl = pmd_lockptr(mm, pm);
- hpdp = (hugepd_t *)pm;
- }
- }
-#endif
- if (!hpdp)
- return NULL;
-
- if (IS_ENABLED(CONFIG_PPC_8xx) && pshift < PMD_SHIFT)
- return pte_alloc_map(mm, (pmd_t *)hpdp, addr);
-
- BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
-
- if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr,
- pdshift, pshift, ptl))
- return NULL;
- return hugepte_offset(*hpdp, addr, pdshift);
+ return pte_alloc_huge(mm, pmd, addr);
}
#ifdef CONFIG_PPC_BOOK3S_64
@@ -225,344 +111,27 @@ static int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate)
return 0;
m = phys_to_virt(gpage_freearray[--nr_gpages]);
gpage_freearray[nr_gpages] = 0;
- list_add(&m->list, &huge_boot_pages);
+ list_add(&m->list, &huge_boot_pages[0]);
m->hstate = hstate;
+ m->flags = 0;
return 1;
}
-#endif
-
-
-int __init alloc_bootmem_huge_page(struct hstate *h)
-{
-
-#ifdef CONFIG_PPC_BOOK3S_64
- if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled())
- return pseries_alloc_bootmem_huge_page(h);
-#endif
- return __alloc_bootmem_huge_page(h);
-}
-
-#ifndef CONFIG_PPC_BOOK3S_64
-#define HUGEPD_FREELIST_SIZE \
- ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))
-
-struct hugepd_freelist {
- struct rcu_head rcu;
- unsigned int index;
- void *ptes[];
-};
-static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);
-
-static void hugepd_free_rcu_callback(struct rcu_head *head)
+bool __init hugetlb_node_alloc_supported(void)
{
- struct hugepd_freelist *batch =
- container_of(head, struct hugepd_freelist, rcu);
- unsigned int i;
-
- for (i = 0; i < batch->index; i++)
- kmem_cache_free(PGT_CACHE(PTE_T_ORDER), batch->ptes[i]);
-
- free_page((unsigned long)batch);
+ return false;
}
-
-static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
-{
- struct hugepd_freelist **batchp;
-
- batchp = &get_cpu_var(hugepd_freelist_cur);
-
- if (atomic_read(&tlb->mm->mm_users) < 2 ||
- mm_is_thread_local(tlb->mm)) {
- kmem_cache_free(PGT_CACHE(PTE_T_ORDER), hugepte);
- put_cpu_var(hugepd_freelist_cur);
- return;
- }
-
- if (*batchp == NULL) {
- *batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
- (*batchp)->index = 0;
- }
-
- (*batchp)->ptes[(*batchp)->index++] = hugepte;
- if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
- call_rcu(&(*batchp)->rcu, hugepd_free_rcu_callback);
- *batchp = NULL;
- }
- put_cpu_var(hugepd_freelist_cur);
-}
-#else
-static inline void hugepd_free(struct mmu_gather *tlb, void *hugepte) {}
#endif
-/* Return true when the entry to be freed maps more than the area being freed */
-static bool range_is_outside_limits(unsigned long start, unsigned long end,
- unsigned long floor, unsigned long ceiling,
- unsigned long mask)
-{
- if ((start & mask) < floor)
- return true;
- if (ceiling) {
- ceiling &= mask;
- if (!ceiling)
- return true;
- }
- return end - 1 > ceiling - 1;
-}
-static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
- unsigned long start, unsigned long end,
- unsigned long floor, unsigned long ceiling)
+int __init alloc_bootmem_huge_page(struct hstate *h, int nid)
{
- pte_t *hugepte = hugepd_page(*hpdp);
- int i;
-
- unsigned long pdmask = ~((1UL << pdshift) - 1);
- unsigned int num_hugepd = 1;
- unsigned int shift = hugepd_shift(*hpdp);
-
- /* Note: On fsl the hpdp may be the first of several */
- if (shift > pdshift)
- num_hugepd = 1 << (shift - pdshift);
- if (range_is_outside_limits(start, end, floor, ceiling, pdmask))
- return;
-
- for (i = 0; i < num_hugepd; i++, hpdp++)
- *hpdp = __hugepd(0);
-
- if (shift >= pdshift)
- hugepd_free(tlb, hugepte);
- else
- pgtable_free_tlb(tlb, hugepte,
- get_hugepd_cache_index(pdshift - shift));
-}
-
-static void hugetlb_free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
- unsigned long addr, unsigned long end,
- unsigned long floor, unsigned long ceiling)
-{
- pgtable_t token = pmd_pgtable(*pmd);
-
- if (range_is_outside_limits(addr, end, floor, ceiling, PMD_MASK))
- return;
-
- pmd_clear(pmd);
- pte_free_tlb(tlb, token, addr);
- mm_dec_nr_ptes(tlb->mm);
-}
-
-static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
- unsigned long addr, unsigned long end,
- unsigned long floor, unsigned long ceiling)
-{
- pmd_t *pmd;
- unsigned long next;
- unsigned long start;
-
- start = addr;
- do {
- unsigned long more;
-
- pmd = pmd_offset(pud, addr);
- next = pmd_addr_end(addr, end);
- if (!is_hugepd(__hugepd(pmd_val(*pmd)))) {
- if (pmd_none_or_clear_bad(pmd))
- continue;
-
- /*
- * if it is not hugepd pointer, we should already find
- * it cleared.
- */
- WARN_ON(!IS_ENABLED(CONFIG_PPC_8xx));
-
- hugetlb_free_pte_range(tlb, pmd, addr, end, floor, ceiling);
-
- continue;
- }
- /*
- * Increment next by the size of the huge mapping since
- * there may be more than one entry at this level for a
- * single hugepage, but all of them point to
- * the same kmem cache that holds the hugepte.
- */
- more = addr + (1 << hugepd_shift(*(hugepd_t *)pmd));
- if (more > next)
- next = more;
-
- free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
- addr, next, floor, ceiling);
- } while (addr = next, addr != end);
-
- if (range_is_outside_limits(start, end, floor, ceiling, PUD_MASK))
- return;
-
- pmd = pmd_offset(pud, start & PUD_MASK);
- pud_clear(pud);
- pmd_free_tlb(tlb, pmd, start & PUD_MASK);
- mm_dec_nr_pmds(tlb->mm);
-}
-
-static void hugetlb_free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
- unsigned long addr, unsigned long end,
- unsigned long floor, unsigned long ceiling)
-{
- pud_t *pud;
- unsigned long next;
- unsigned long start;
-
- start = addr;
- do {
- pud = pud_offset(p4d, addr);
- next = pud_addr_end(addr, end);
- if (!is_hugepd(__hugepd(pud_val(*pud)))) {
- if (pud_none_or_clear_bad(pud))
- continue;
- hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
- ceiling);
- } else {
- unsigned long more;
- /*
- * Increment next by the size of the huge mapping since
- * there may be more than one entry at this level for a
- * single hugepage, but all of them point to
- * the same kmem cache that holds the hugepte.
- */
- more = addr + (1 << hugepd_shift(*(hugepd_t *)pud));
- if (more > next)
- next = more;
-
- free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
- addr, next, floor, ceiling);
- }
- } while (addr = next, addr != end);
-
- if (range_is_outside_limits(start, end, floor, ceiling, PGDIR_MASK))
- return;
-
- pud = pud_offset(p4d, start & PGDIR_MASK);
- p4d_clear(p4d);
- pud_free_tlb(tlb, pud, start & PGDIR_MASK);
- mm_dec_nr_puds(tlb->mm);
-}
-
-/*
- * This function frees user-level page tables of a process.
- */
-void hugetlb_free_pgd_range(struct mmu_gather *tlb,
- unsigned long addr, unsigned long end,
- unsigned long floor, unsigned long ceiling)
-{
- pgd_t *pgd;
- p4d_t *p4d;
- unsigned long next;
-
- /*
- * Because there are a number of different possible pagetable
- * layouts for hugepage ranges, we limit knowledge of how
- * things should be laid out to the allocation path
- * (huge_pte_alloc(), above). Everything else works out the
- * structure as it goes from information in the hugepd
- * pointers. That means that we can't here use the
- * optimization used in the normal page free_pgd_range(), of
- * checking whether we're actually covering a large enough
- * range to have to do anything at the top level of the walk
- * instead of at the bottom.
- *
- * To make sense of this, you should probably go read the big
- * block comment at the top of the normal free_pgd_range(),
- * too.
- */
-
- do {
- next = pgd_addr_end(addr, end);
- pgd = pgd_offset(tlb->mm, addr);
- p4d = p4d_offset(pgd, addr);
- if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
- if (p4d_none_or_clear_bad(p4d))
- continue;
- hugetlb_free_pud_range(tlb, p4d, addr, next, floor, ceiling);
- } else {
- unsigned long more;
- /*
- * Increment next by the size of the huge mapping since
- * there may be more than one entry at the pgd level
- * for a single hugepage, but all of them point to the
- * same kmem cache that holds the hugepte.
- */
- more = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
- if (more > next)
- next = more;
-
- free_hugepd_range(tlb, (hugepd_t *)p4d, PGDIR_SHIFT,
- addr, next, floor, ceiling);
- }
- } while (addr = next, addr != end);
-}
-
-struct page *follow_huge_pd(struct vm_area_struct *vma,
- unsigned long address, hugepd_t hpd,
- int flags, int pdshift)
-{
- pte_t *ptep;
- spinlock_t *ptl;
- struct page *page = NULL;
- unsigned long mask;
- int shift = hugepd_shift(hpd);
- struct mm_struct *mm = vma->vm_mm;
-
-retry:
- /*
- * hugepage directory entries are protected by mm->page_table_lock
- * Use this instead of huge_pte_lockptr
- */
- ptl = &mm->page_table_lock;
- spin_lock(ptl);
-
- ptep = hugepte_offset(hpd, address, pdshift);
- if (pte_present(*ptep)) {
- mask = (1UL << shift) - 1;
- page = pte_page(*ptep);
- page += ((address & mask) >> PAGE_SHIFT);
- if (flags & FOLL_GET)
- get_page(page);
- } else {
- if (is_hugetlb_entry_migration(*ptep)) {
- spin_unlock(ptl);
- __migration_entry_wait(mm, ptep, ptl);
- goto retry;
- }
- }
- spin_unlock(ptl);
- return page;
-}
-
-#ifdef CONFIG_PPC_MM_SLICES
-unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
- unsigned long len, unsigned long pgoff,
- unsigned long flags)
-{
- struct hstate *hstate = hstate_file(file);
- int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
-
-#ifdef CONFIG_PPC_RADIX_MMU
- if (radix_enabled())
- return radix__hugetlb_get_unmapped_area(file, addr, len,
- pgoff, flags);
-#endif
- return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1);
-}
+#ifdef CONFIG_PPC_BOOK3S_64
+ if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled())
+ return pseries_alloc_bootmem_huge_page(h);
#endif
-
-unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
-{
- /* With radix we don't use slice, so derive it from vma*/
- if (IS_ENABLED(CONFIG_PPC_MM_SLICES) && !radix_enabled()) {
- unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
-
- return 1UL << mmu_psize_to_shift(psize);
- }
- return vma_kernel_pagesize(vma);
+ return __alloc_bootmem_huge_page(h, nid);
}
bool __init arch_hugetlb_valid_size(unsigned long size)
@@ -611,52 +180,19 @@ static int __init hugetlbpage_init(void)
for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
unsigned shift;
- unsigned pdshift;
if (!mmu_psize_defs[psize].shift)
continue;
shift = mmu_psize_to_shift(psize);
-#ifdef CONFIG_PPC_BOOK3S_64
- if (shift > PGDIR_SHIFT)
- continue;
- else if (shift > PUD_SHIFT)
- pdshift = PGDIR_SHIFT;
- else if (shift > PMD_SHIFT)
- pdshift = PUD_SHIFT;
- else
- pdshift = PMD_SHIFT;
-#else
- if (shift < PUD_SHIFT)
- pdshift = PMD_SHIFT;
- else if (shift < PGDIR_SHIFT)
- pdshift = PUD_SHIFT;
- else
- pdshift = PGDIR_SHIFT;
-#endif
-
if (add_huge_page_size(1ULL << shift) < 0)
continue;
- /*
- * if we have pdshift and shift value same, we don't
- * use pgt cache for hugepd.
- */
- if (pdshift > shift) {
- if (!IS_ENABLED(CONFIG_PPC_8xx))
- pgtable_cache_add(pdshift - shift);
- } else if (IS_ENABLED(CONFIG_PPC_FSL_BOOK3E) ||
- IS_ENABLED(CONFIG_PPC_8xx)) {
- pgtable_cache_add(PTE_T_ORDER);
- }
configured = true;
}
- if (configured) {
- if (IS_ENABLED(CONFIG_HUGETLB_PAGE_SIZE_VARIABLE))
- hugetlbpage_init_default();
- } else
+ if (!configured)
pr_info("Failed to initialize. Disabling HugeTLB");
return 0;
@@ -676,8 +212,6 @@ void __init gigantic_hugetlb_cma_reserve(void)
*/
order = mmu_psize_to_shift(MMU_PAGE_16G) - PAGE_SHIFT;
- if (order) {
- VM_WARN_ON(order < MAX_ORDER);
+ if (order)
hugetlb_cma_reserve(order);
- }
}
diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c
index 3a82f89827a5..745097554bea 100644
--- a/arch/powerpc/mm/init-common.c
+++ b/arch/powerpc/mm/init-common.c
@@ -20,6 +20,7 @@
#include <linux/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/kup.h>
+#include <asm/smp.h>
phys_addr_t memstart_addr __ro_after_init = (phys_addr_t)~0ull;
EXPORT_SYMBOL_GPL(memstart_addr);
@@ -30,9 +31,16 @@ EXPORT_SYMBOL_GPL(kernstart_virt_addr);
bool disable_kuep = !IS_ENABLED(CONFIG_PPC_KUEP);
bool disable_kuap = !IS_ENABLED(CONFIG_PPC_KUAP);
+#ifdef CONFIG_KFENCE
+bool __ro_after_init kfence_disabled;
+bool __ro_after_init kfence_early_init = !!CONFIG_KFENCE_SAMPLE_INTERVAL;
+#endif
static int __init parse_nosmep(char *p)
{
+ if (!IS_ENABLED(CONFIG_PPC_BOOK3S_64))
+ return 0;
+
disable_kuep = true;
pr_warn("Disabling Kernel Userspace Execution Prevention\n");
return 0;
@@ -47,9 +55,26 @@ static int __init parse_nosmap(char *p)
}
early_param("nosmap", parse_nosmap);
+void __weak setup_kuep(bool disabled)
+{
+ if (!IS_ENABLED(CONFIG_PPC_KUEP) || disabled)
+ return;
+
+ if (smp_processor_id() != boot_cpuid)
+ return;
+
+ pr_info("Activating Kernel Userspace Execution Prevention\n");
+}
+
+void setup_kup(void)
+{
+ setup_kuap(disable_kuap);
+ setup_kuep(disable_kuep);
+}
+
#define CTOR(shift) static void ctor_##shift(void *addr) \
{ \
- memset(addr, 0, sizeof(void *) << (shift)); \
+ memset(addr, 0, sizeof(pgd_t) << (shift)); \
}
CTOR(0); CTOR(1); CTOR(2); CTOR(3); CTOR(4); CTOR(5); CTOR(6); CTOR(7);
@@ -93,19 +118,15 @@ EXPORT_SYMBOL_GPL(pgtable_cache); /* used by kvm_hv module */
void pgtable_cache_add(unsigned int shift)
{
char *name;
- unsigned long table_size = sizeof(void *) << shift;
+ unsigned long table_size = sizeof(pgd_t) << shift;
unsigned long align = table_size;
/* When batching pgtable pointers for RCU freeing, we store
* the index size in the low bits. Table alignment must be
* big enough to fit it.
- *
- * Likewise, hugeapge pagetable pointers contain a (different)
- * shift value in the low bits. All tables must be aligned so
- * as to leave enough 0 bits in the address to contain it. */
- unsigned long minalign = max(MAX_PGTABLE_INDEX_SIZE + 1,
- HUGEPD_SHIFT_MASK + 1);
- struct kmem_cache *new;
+ */
+ unsigned long minalign = MAX_PGTABLE_INDEX_SIZE + 1;
+ struct kmem_cache *new = NULL;
/* It would be nice if this was a BUILD_BUG_ON(), but at the
* moment, gcc doesn't seem to recognize is_power_of_2 as a
@@ -118,7 +139,8 @@ void pgtable_cache_add(unsigned int shift)
align = max_t(unsigned long, align, minalign);
name = kasprintf(GFP_KERNEL, "pgtable-2^%d", shift);
- new = kmem_cache_create(name, table_size, align, 0, ctor(shift));
+ if (name)
+ new = kmem_cache_create(name, table_size, align, 0, ctor(shift));
if (!new)
panic("Could not allocate pgtable cache for order %d", shift);
diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index 3d690be48e84..4e71dfe7d026 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -29,7 +29,6 @@
#include <linux/slab.h>
#include <linux/hugetlb.h>
-#include <asm/prom.h>
#include <asm/io.h>
#include <asm/mmu.h>
#include <asm/smp.h>
@@ -40,6 +39,7 @@
#include <asm/hugetlb.h>
#include <asm/kup.h>
#include <asm/kasan.h>
+#include <asm/fixmap.h>
#include <mm/mmu_decl.h>
@@ -70,44 +70,10 @@ EXPORT_SYMBOL(agp_special_page);
void MMU_init(void);
-/*
- * this tells the system to map all of ram with the segregs
- * (i.e. page tables) instead of the bats.
- * -- Cort
- */
-int __map_without_bats;
-int __map_without_ltlbs;
-
/* max amount of low RAM to map in */
unsigned long __max_low_memory = MAX_LOW_MEM;
/*
- * Check for command-line options that affect what MMU_init will do.
- */
-static void __init MMU_setup(void)
-{
- /* Check for nobats option (used in mapin_ram). */
- if (strstr(boot_command_line, "nobats")) {
- __map_without_bats = 1;
- }
-
- if (strstr(boot_command_line, "noltlbs")) {
- __map_without_ltlbs = 1;
- }
- if (IS_ENABLED(CONFIG_PPC_8xx))
- return;
-
- if (IS_ENABLED(CONFIG_KFENCE))
- __map_without_ltlbs = 1;
-
- if (debug_pagealloc_enabled())
- __map_without_ltlbs = 1;
-
- if (strict_kernel_rwx_enabled())
- __map_without_ltlbs = 1;
-}
-
-/*
* MMU_init sets up the basic memory mappings for the kernel,
* including both RAM and possibly some I/O regions,
* and sets up the page tables and the MMU hardware ready to go.
@@ -117,31 +83,15 @@ void __init MMU_init(void)
if (ppc_md.progress)
ppc_md.progress("MMU:enter", 0x111);
- /* parse args from command line */
- MMU_setup();
-
- /*
- * Reserve gigantic pages for hugetlb. This MUST occur before
- * lowmem_end_addr is initialized below.
- */
- if (memblock.memory.cnt > 1) {
-#ifndef CONFIG_WII
- memblock_enforce_memory_limit(memblock.memory.regions[0].size);
- pr_warn("Only using first contiguous memory region\n");
-#else
- wii_memory_fixups();
-#endif
- }
-
total_lowmem = total_memory = memblock_end_of_DRAM() - memstart_addr;
lowmem_end_addr = memstart_addr + total_lowmem;
-#ifdef CONFIG_FSL_BOOKE
+#ifdef CONFIG_PPC_85xx
/* Freescale Book-E parts expect lowmem to be mapped by fixed TLB
* entries, so we need to adjust lowmem to match the amount we can map
* in the fixed entries */
adjust_total_lowmem();
-#endif /* CONFIG_FSL_BOOKE */
+#endif /* CONFIG_PPC_85xx */
if (total_lowmem > __max_low_memory) {
total_lowmem = __max_low_memory;
@@ -177,6 +127,8 @@ void __init MMU_init(void)
setup_kup();
+ update_mmu_feature_fixups(MMU_FTR_KUAP);
+
/* Shortly after that, the entire linear mapping will be available */
memblock_set_current_limit(lowmem_end_addr);
}
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 386be136026e..b6f3ae03ca9e 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -40,6 +40,8 @@
#include <linux/of_fdt.h>
#include <linux/libfdt.h>
#include <linux/memremap.h>
+#include <linux/memory.h>
+#include <linux/bootmem_info.h>
#include <asm/pgalloc.h>
#include <asm/page.h>
@@ -59,6 +61,7 @@
#include <asm/sections.h>
#include <asm/iommu.h>
#include <asm/vdso.h>
+#include <asm/hugetlb.h>
#include <mm/mmu_decl.h>
@@ -91,7 +94,7 @@ static struct page * __meminit vmemmap_subsection_start(unsigned long vmemmap_ad
* a page table lookup here because with the hash translation we don't keep
* vmemmap details in linux page table.
*/
-static int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size)
+int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size)
{
struct page *start;
unsigned long vmemmap_end = vmemmap_addr + vmemmap_map_size;
@@ -110,7 +113,7 @@ static int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_m
}
/*
- * vmemmap virtual address space management does not have a traditonal page
+ * vmemmap virtual address space management does not have a traditional page
* table to track which virtual struct pages are backed by physical mapping.
* The virtual to physical mappings are tracked in a simple linked list
* format. 'vmemmap_list' maintains the entire vmemmap physical mapping at
@@ -127,7 +130,7 @@ static struct vmemmap_backing *next;
/*
* The same pointer 'next' tracks individual chunks inside the allocated
- * full page during the boot time and again tracks the freeed nodes during
+ * full page during the boot time and again tracks the freed nodes during
* runtime. It is racy but it does not happen as they are separated by the
* boot process. Will create problem if some how we have memory hotplug
* operation during boot !!
@@ -182,13 +185,13 @@ static __meminit int vmemmap_list_populate(unsigned long phys,
return 0;
}
-static bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start,
- unsigned long page_size)
+bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start,
+ unsigned long page_size)
{
unsigned long nr_pfn = page_size / sizeof(struct page);
unsigned long start_pfn = page_to_pfn((struct page *)start);
- if ((start_pfn + nr_pfn) > altmap->end_pfn)
+ if ((start_pfn + nr_pfn - 1) > altmap->end_pfn)
return true;
if (start_pfn < altmap->base_pfn)
@@ -197,8 +200,8 @@ static bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long star
return false;
}
-int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
- struct vmem_altmap *altmap)
+static int __meminit __vmemmap_populate(unsigned long start, unsigned long end, int node,
+ struct vmem_altmap *altmap)
{
bool altmap_alloc;
unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
@@ -271,6 +274,18 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
return 0;
}
+int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
+ struct vmem_altmap *altmap)
+{
+
+#ifdef CONFIG_PPC_BOOK3S_64
+ if (radix_enabled())
+ return radix__vmemmap_populate(start, end, node, altmap);
+#endif
+
+ return __vmemmap_populate(start, end, node, altmap);
+}
+
#ifdef CONFIG_MEMORY_HOTPLUG
static unsigned long vmemmap_list_free(unsigned long start)
{
@@ -302,8 +317,8 @@ static unsigned long vmemmap_list_free(unsigned long start)
return vmem_back->phys;
}
-void __ref vmemmap_free(unsigned long start, unsigned long end,
- struct vmem_altmap *altmap)
+static void __ref __vmemmap_free(unsigned long start, unsigned long end,
+ struct vmem_altmap *altmap)
{
unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
unsigned long page_order = get_order(page_size);
@@ -313,8 +328,7 @@ void __ref vmemmap_free(unsigned long start, unsigned long end,
start = ALIGN_DOWN(start, page_size);
if (altmap) {
alt_start = altmap->base_pfn;
- alt_end = altmap->base_pfn + altmap->reserve +
- altmap->free + altmap->alloc + altmap->align;
+ alt_end = altmap->base_pfn + altmap->reserve + altmap->free;
}
pr_debug("vmemmap_free %lx...%lx\n", start, end);
@@ -361,15 +375,35 @@ void __ref vmemmap_free(unsigned long start, unsigned long end,
vmemmap_remove_mapping(start, page_size);
}
}
+
+void __ref vmemmap_free(unsigned long start, unsigned long end,
+ struct vmem_altmap *altmap)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+ if (radix_enabled())
+ return radix__vmemmap_free(start, end, altmap);
+#endif
+ return __vmemmap_free(start, end, altmap);
+}
+
#endif
+
+#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
void register_page_bootmem_memmap(unsigned long section_nr,
struct page *start_page, unsigned long size)
{
}
+#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
#ifdef CONFIG_PPC_BOOK3S_64
+unsigned int mmu_lpid_bits;
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+EXPORT_SYMBOL_GPL(mmu_lpid_bits);
+#endif
+unsigned int mmu_pid_bits;
+
static bool disable_radix = !IS_ENABLED(CONFIG_PPC_RADIX_MMU_DEFAULT);
static int __init parse_disable_radix(char *p)
@@ -437,11 +471,180 @@ static void __init early_check_vec5(void)
}
}
+static int __init dt_scan_mmu_pid_width(unsigned long node,
+ const char *uname, int depth,
+ void *data)
+{
+ int size = 0;
+ const __be32 *prop;
+ const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+
+ /* We are scanning "cpu" nodes only */
+ if (type == NULL || strcmp(type, "cpu") != 0)
+ return 0;
+
+ /* Find MMU LPID, PID register size */
+ prop = of_get_flat_dt_prop(node, "ibm,mmu-lpid-bits", &size);
+ if (prop && size == 4)
+ mmu_lpid_bits = be32_to_cpup(prop);
+
+ prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size);
+ if (prop && size == 4)
+ mmu_pid_bits = be32_to_cpup(prop);
+
+ if (!mmu_pid_bits && !mmu_lpid_bits)
+ return 0;
+
+ return 1;
+}
+
+/*
+ * Outside hotplug the kernel uses this value to map the kernel direct map
+ * with radix. To be compatible with older kernels, let's keep this value
+ * as 16M which is also SECTION_SIZE with SPARSEMEM. We can ideally map
+ * things with 1GB size in the case where we don't support hotplug.
+ */
+#ifndef CONFIG_MEMORY_HOTPLUG
+#define DEFAULT_MEMORY_BLOCK_SIZE SZ_16M
+#else
+#define DEFAULT_MEMORY_BLOCK_SIZE MIN_MEMORY_BLOCK_SIZE
+#endif
+
+static void update_memory_block_size(unsigned long *block_size, unsigned long mem_size)
+{
+ unsigned long min_memory_block_size = DEFAULT_MEMORY_BLOCK_SIZE;
+
+ for (; *block_size > min_memory_block_size; *block_size >>= 2) {
+ if ((mem_size & *block_size) == 0)
+ break;
+ }
+}
+
+static int __init probe_memory_block_size(unsigned long node, const char *uname, int
+ depth, void *data)
+{
+ const char *type;
+ unsigned long *block_size = (unsigned long *)data;
+ const __be32 *reg, *endp;
+ int l;
+
+ if (depth != 1)
+ return 0;
+ /*
+ * If we have dynamic-reconfiguration-memory node, use the
+ * lmb value.
+ */
+ if (strcmp(uname, "ibm,dynamic-reconfiguration-memory") == 0) {
+
+ const __be32 *prop;
+
+ prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &l);
+
+ if (!prop || l < dt_root_size_cells * sizeof(__be32))
+ /*
+ * Nothing in the device tree
+ */
+ *block_size = DEFAULT_MEMORY_BLOCK_SIZE;
+ else
+ *block_size = of_read_number(prop, dt_root_size_cells);
+ /*
+ * We have found the final value. Don't probe further.
+ */
+ return 1;
+ }
+ /*
+ * Find all the device tree nodes of memory type and make sure
+ * the area can be mapped using the memory block size value
+ * we end up using. We start with 1G value and keep reducing
+ * it such that we can map the entire area using memory_block_size.
+ * This will be used on powernv and older pseries that don't
+ * have ibm,lmb-size node.
+ * For ex: with P5 we can end up with
+ * memory@0 -> 128MB
+ * memory@128M -> 64M
+ * This will end up using 64MB memory block size value.
+ */
+ type = of_get_flat_dt_prop(node, "device_type", NULL);
+ if (type == NULL || strcmp(type, "memory") != 0)
+ return 0;
+
+ reg = of_get_flat_dt_prop(node, "linux,usable-memory", &l);
+ if (!reg)
+ reg = of_get_flat_dt_prop(node, "reg", &l);
+ if (!reg)
+ return 0;
+
+ endp = reg + (l / sizeof(__be32));
+ while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) {
+ const char *compatible;
+ u64 size;
+
+ dt_mem_next_cell(dt_root_addr_cells, &reg);
+ size = dt_mem_next_cell(dt_root_size_cells, &reg);
+
+ if (size) {
+ update_memory_block_size(block_size, size);
+ continue;
+ }
+ /*
+ * ibm,coherent-device-memory with linux,usable-memory = 0
+ * Force 256MiB block size. Work around for GPUs on P9 PowerNV
+ * linux,usable-memory == 0 implies driver managed memory and
+ * we can't use large memory block size due to hotplug/unplug
+ * limitations.
+ */
+ compatible = of_get_flat_dt_prop(node, "compatible", NULL);
+ if (compatible && !strcmp(compatible, "ibm,coherent-device-memory")) {
+ if (*block_size > SZ_256M)
+ *block_size = SZ_256M;
+ /*
+ * We keep 256M as the upper limit with GPU present.
+ */
+ return 0;
+ }
+ }
+ /* continue looking for other memory device types */
+ return 0;
+}
+
+/*
+ * start with 1G memory block size. Early init will
+ * fix this with correct value.
+ */
+unsigned long memory_block_size __ro_after_init = 1UL << 30;
+static void __init early_init_memory_block_size(void)
+{
+ /*
+ * We need to do memory_block_size probe early so that
+ * radix__early_init_mmu() can use this as limit for
+ * mapping page size.
+ */
+ of_scan_flat_dt(probe_memory_block_size, &memory_block_size);
+}
+
void __init mmu_early_init_devtree(void)
{
+ bool hvmode = !!(mfmsr() & MSR_HV);
+
/* Disable radix mode based on kernel command line. */
- if (disable_radix)
- cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
+ if (disable_radix) {
+ if (IS_ENABLED(CONFIG_PPC_64S_HASH_MMU))
+ cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
+ else
+ pr_warn("WARNING: Ignoring cmdline option disable_radix\n");
+ }
+
+ of_scan_flat_dt(dt_scan_mmu_pid_width, NULL);
+ if (hvmode && !mmu_lpid_bits) {
+ if (early_cpu_has_feature(CPU_FTR_ARCH_207S))
+ mmu_lpid_bits = 12; /* POWER8-10 */
+ else
+ mmu_lpid_bits = 10; /* POWER7 */
+ }
+ if (!mmu_pid_bits) {
+ if (early_cpu_has_feature(CPU_FTR_ARCH_300))
+ mmu_pid_bits = 20; /* POWER9-10 */
+ }
/*
* Check /chosen/ibm,architecture-vec-5 if running as a guest.
@@ -449,11 +652,14 @@ void __init mmu_early_init_devtree(void)
* even though the ibm,architecture-vec-5 property created by
* skiboot doesn't have the necessary bits set.
*/
- if (!(mfmsr() & MSR_HV))
+ if (!hvmode)
early_check_vec5();
+ early_init_memory_block_size();
+
if (early_radix_enabled()) {
radix__early_init_devtree();
+
/*
* We have finalized the translation we are going to use by now.
* Radix mode is not limited by RMA / VRMA addressing.
@@ -463,5 +669,12 @@ void __init mmu_early_init_devtree(void)
memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
} else
hash__early_init_devtree();
+
+ if (IS_ENABLED(CONFIG_HUGETLB_PAGE_SIZE_VARIABLE))
+ hugetlbpage_init_defaultsize();
+
+ if (!(cur_cpu_spec->mmu_features & MMU_FTR_HPTE_TABLE) &&
+ !(cur_cpu_spec->mmu_features & MMU_FTR_TYPE_RADIX))
+ panic("kernel does not support any MMU type offered by platform");
}
#endif /* CONFIG_PPC_BOOK3S_64 */
diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c
index 57342154d2b0..4b4feba9873b 100644
--- a/arch/powerpc/mm/ioremap.c
+++ b/arch/powerpc/mm/ioremap.c
@@ -4,7 +4,6 @@
#include <linux/slab.h>
#include <linux/mmzone.h>
#include <linux/vmalloc.h>
-#include <asm/io-workarounds.h>
unsigned long ioremap_bot;
EXPORT_SYMBOL(ioremap_bot);
@@ -14,8 +13,6 @@ void __iomem *ioremap(phys_addr_t addr, unsigned long size)
pgprot_t prot = pgprot_noncached(PAGE_KERNEL);
void *caller = __builtin_return_address(0);
- if (iowa_is_active())
- return iowa_ioremap(addr, size, prot, caller);
return __ioremap_caller(addr, size, prot, caller);
}
EXPORT_SYMBOL(ioremap);
@@ -25,8 +22,6 @@ void __iomem *ioremap_wc(phys_addr_t addr, unsigned long size)
pgprot_t prot = pgprot_noncached_wc(PAGE_KERNEL);
void *caller = __builtin_return_address(0);
- if (iowa_is_active())
- return iowa_ioremap(addr, size, prot, caller);
return __ioremap_caller(addr, size, prot, caller);
}
EXPORT_SYMBOL(ioremap_wc);
@@ -36,26 +31,18 @@ void __iomem *ioremap_coherent(phys_addr_t addr, unsigned long size)
pgprot_t prot = pgprot_cached(PAGE_KERNEL);
void *caller = __builtin_return_address(0);
- if (iowa_is_active())
- return iowa_ioremap(addr, size, prot, caller);
return __ioremap_caller(addr, size, prot, caller);
}
-void __iomem *ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags)
+void __iomem *ioremap_prot(phys_addr_t addr, size_t size, pgprot_t prot)
{
- pte_t pte = __pte(flags);
+ pte_t pte = __pte(pgprot_val(prot));
void *caller = __builtin_return_address(0);
/* writeable implies dirty for kernel addresses */
if (pte_write(pte))
pte = pte_mkdirty(pte);
- /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
- pte = pte_exprotect(pte);
- pte = pte_mkprivileged(pte);
-
- if (iowa_is_active())
- return iowa_ioremap(addr, size, pte_pgprot(pte), caller);
return __ioremap_caller(addr, size, pte_pgprot(pte), caller);
}
EXPORT_SYMBOL(ioremap_prot);
@@ -66,7 +53,7 @@ int early_ioremap_range(unsigned long ea, phys_addr_t pa,
unsigned long i;
for (i = 0; i < size; i += PAGE_SIZE) {
- int err = map_kernel_page(ea + i, pa + i, prot);
+ int err = map_kernel_page(ea + i, pa + i, pgprot_nx(prot));
if (WARN_ON_ONCE(err)) /* Should clean up */
return err;
@@ -74,47 +61,3 @@ int early_ioremap_range(unsigned long ea, phys_addr_t pa,
return 0;
}
-
-void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size,
- pgprot_t prot, void *caller)
-{
- struct vm_struct *area;
- int ret;
- unsigned long va;
-
- area = __get_vm_area_caller(size, VM_IOREMAP, IOREMAP_START, IOREMAP_END, caller);
- if (area == NULL)
- return NULL;
-
- area->phys_addr = pa;
- va = (unsigned long)area->addr;
-
- ret = ioremap_page_range(va, va + size, pa, prot);
- if (!ret)
- return (void __iomem *)area->addr + offset;
-
- vunmap_range(va, va + size);
- free_vm_area(area);
-
- return NULL;
-}
-
-#ifdef CONFIG_ZONE_DEVICE
-/*
- * Override the generic version in mm/memremap.c.
- *
- * With hash translation, the direct-map range is mapped with just one
- * page size selected by htab_init_page_sizes(). Consult
- * mmu_psize_defs[] to determine the minimum page size alignment.
-*/
-unsigned long memremap_compat_align(void)
-{
- unsigned int shift = mmu_psize_defs[mmu_linear_psize].shift;
-
- if (radix_enabled())
- return SUBSECTION_SIZE;
- return max(SUBSECTION_SIZE, 1UL << shift);
-
-}
-EXPORT_SYMBOL_GPL(memremap_compat_align);
-#endif
diff --git a/arch/powerpc/mm/ioremap_32.c b/arch/powerpc/mm/ioremap_32.c
index 9d13143b8be4..ca5bc6be3e6f 100644
--- a/arch/powerpc/mm/ioremap_32.c
+++ b/arch/powerpc/mm/ioremap_32.c
@@ -22,6 +22,13 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call
int err;
/*
+ * If the address lies within the first 16 MB, assume it's in ISA
+ * memory space
+ */
+ if (addr < SZ_16M)
+ addr += _ISA_MEM_BASE;
+
+ /*
* Choose an address to map it to.
* Once the vmalloc system is running, we use it.
* Before then, we use space going down from IOREMAP_TOP
@@ -31,13 +38,6 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call
offset = addr & ~PAGE_MASK;
size = PAGE_ALIGN(addr + size) - p;
- /*
- * If the address lies within the first 16 MB, assume it's in ISA
- * memory space
- */
- if (p < 16 * 1024 * 1024)
- p += _ISA_MEM_BASE;
-
#ifndef CONFIG_CRASH_DUMP
/*
* Don't allow anybody to remap normal RAM that we're using.
@@ -63,7 +63,7 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call
return (void __iomem *)v + offset;
if (slab_is_available())
- return do_ioremap(p, offset, size, prot, caller);
+ return generic_ioremap_prot(addr, size, prot);
/*
* Should check if it is a candidate for a BAT mapping
@@ -87,7 +87,6 @@ void iounmap(volatile void __iomem *addr)
if (v_block_mapped((unsigned long)addr))
return;
- if (addr > high_memory && (unsigned long)addr < ioremap_bot)
- vunmap((void *)(PAGE_MASK & (unsigned long)addr));
+ generic_iounmap(addr);
}
EXPORT_SYMBOL(iounmap);
diff --git a/arch/powerpc/mm/ioremap_64.c b/arch/powerpc/mm/ioremap_64.c
index 3acece00b33e..fb8b55bd2cd5 100644
--- a/arch/powerpc/mm/ioremap_64.c
+++ b/arch/powerpc/mm/ioremap_64.c
@@ -29,7 +29,7 @@ void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size,
return NULL;
if (slab_is_available())
- return do_ioremap(paligned, offset, size, prot, caller);
+ return generic_ioremap_prot(addr, size, prot);
pr_warn("ioremap() called early from %pS. Use early_ioremap() instead\n", caller);
@@ -49,17 +49,9 @@ void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size,
*/
void iounmap(volatile void __iomem *token)
{
- void *addr;
-
if (!slab_is_available())
return;
- addr = (void *)((unsigned long __force)PCI_FIX_ADDR(token) & PAGE_MASK);
-
- if ((unsigned long)addr < ioremap_bot) {
- pr_warn("Attempt to iounmap early bolted mapping at 0x%p\n", addr);
- return;
- }
- vunmap(addr);
+ generic_iounmap(token);
}
EXPORT_SYMBOL(iounmap);
diff --git a/arch/powerpc/mm/kasan/8xx.c b/arch/powerpc/mm/kasan/8xx.c
index 2784224054f8..989d6cdf4141 100644
--- a/arch/powerpc/mm/kasan/8xx.c
+++ b/arch/powerpc/mm/kasan/8xx.c
@@ -6,28 +6,33 @@
#include <linux/memblock.h>
#include <linux/hugetlb.h>
+#include <asm/pgalloc.h>
+
static int __init
kasan_init_shadow_8M(unsigned long k_start, unsigned long k_end, void *block)
{
pmd_t *pmd = pmd_off_k(k_start);
unsigned long k_cur, k_next;
- for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pmd += 2, block += SZ_8M) {
- pte_basic_t *new;
+ for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pmd++, block += SZ_4M) {
+ pte_t *ptep;
+ int i;
k_next = pgd_addr_end(k_cur, k_end);
- k_next = pgd_addr_end(k_next, k_end);
if ((void *)pmd_page_vaddr(*pmd) != kasan_early_shadow_pte)
continue;
- new = memblock_alloc(sizeof(pte_basic_t), SZ_4K);
- if (!new)
+ ptep = memblock_alloc(PTE_FRAG_SIZE, PTE_FRAG_SIZE);
+ if (!ptep)
return -ENOMEM;
- *new = pte_val(pte_mkhuge(pfn_pte(PHYS_PFN(__pa(block)), PAGE_KERNEL)));
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ pte_t pte = pte_mkhuge(pfn_pte(PHYS_PFN(__pa(block + i * PAGE_SIZE)), PAGE_KERNEL));
- hugepd_populate_kernel((hugepd_t *)pmd, (pte_t *)new, PAGE_SHIFT_8M);
- hugepd_populate_kernel((hugepd_t *)pmd + 1, (pte_t *)new, PAGE_SHIFT_8M);
+ __set_pte_at(&init_mm, k_cur, ptep + i, pte, 1);
+ }
+ pmd_populate_kernel(&init_mm, pmd, ptep);
+ *pmd = __pmd(pmd_val(*pmd) | _PMD_PAGE_8M);
}
return 0;
}
diff --git a/arch/powerpc/mm/kasan/Makefile b/arch/powerpc/mm/kasan/Makefile
index bb1a5408b86b..f9522fd70b2f 100644
--- a/arch/powerpc/mm/kasan/Makefile
+++ b/arch/powerpc/mm/kasan/Makefile
@@ -1,7 +1,10 @@
# SPDX-License-Identifier: GPL-2.0
KASAN_SANITIZE := n
+KCOV_INSTRUMENT := n
-obj-$(CONFIG_PPC32) += kasan_init_32.o
+obj-$(CONFIG_PPC32) += init_32.o
obj-$(CONFIG_PPC_8xx) += 8xx.o
obj-$(CONFIG_PPC_BOOK3S_32) += book3s_32.o
+obj-$(CONFIG_PPC_BOOK3S_64) += init_book3s_64.o
+obj-$(CONFIG_PPC_BOOK3E_64) += init_book3e_64.o
diff --git a/arch/powerpc/mm/kasan/book3s_32.c b/arch/powerpc/mm/kasan/book3s_32.c
index 202bd260a009..450a67ef0bbe 100644
--- a/arch/powerpc/mm/kasan/book3s_32.c
+++ b/arch/powerpc/mm/kasan/book3s_32.c
@@ -10,47 +10,51 @@ int __init kasan_init_region(void *start, size_t size)
{
unsigned long k_start = (unsigned long)kasan_mem_to_shadow(start);
unsigned long k_end = (unsigned long)kasan_mem_to_shadow(start + size);
- unsigned long k_cur = k_start;
- int k_size = k_end - k_start;
- int k_size_base = 1 << (ffs(k_size) - 1);
+ unsigned long k_nobat = k_start;
+ unsigned long k_cur;
+ phys_addr_t phys;
int ret;
- void *block;
- block = memblock_alloc(k_size, k_size_base);
-
- if (block && k_size_base >= SZ_128K && k_start == ALIGN(k_start, k_size_base)) {
- int k_size_more = 1 << (ffs(k_size - k_size_base) - 1);
-
- setbat(-1, k_start, __pa(block), k_size_base, PAGE_KERNEL);
- if (k_size_more >= SZ_128K)
- setbat(-1, k_start + k_size_base, __pa(block) + k_size_base,
- k_size_more, PAGE_KERNEL);
- if (v_block_mapped(k_start))
- k_cur = k_start + k_size_base;
- if (v_block_mapped(k_start + k_size_base))
- k_cur = k_start + k_size_base + k_size_more;
-
- update_bats();
+ while (k_nobat < k_end) {
+ unsigned int k_size = bat_block_size(k_nobat, k_end);
+ int idx = find_free_bat();
+
+ if (idx == -1)
+ break;
+ if (k_size < SZ_128K)
+ break;
+ phys = memblock_phys_alloc_range(k_size, k_size, 0,
+ MEMBLOCK_ALLOC_ANYWHERE);
+ if (!phys)
+ break;
+
+ setbat(idx, k_nobat, phys, k_size, PAGE_KERNEL);
+ k_nobat += k_size;
}
+ if (k_nobat != k_start)
+ update_bats();
- if (!block)
- block = memblock_alloc(k_size, PAGE_SIZE);
- if (!block)
- return -ENOMEM;
+ if (k_nobat < k_end) {
+ phys = memblock_phys_alloc_range(k_end - k_nobat, PAGE_SIZE, 0,
+ MEMBLOCK_ALLOC_ANYWHERE);
+ if (!phys)
+ return -ENOMEM;
+ }
ret = kasan_init_shadow_page_tables(k_start, k_end);
if (ret)
return ret;
- kasan_update_early_region(k_start, k_cur, __pte(0));
+ kasan_update_early_region(k_start, k_nobat, __pte(0));
- for (; k_cur < k_end; k_cur += PAGE_SIZE) {
+ for (k_cur = k_nobat; k_cur < k_end; k_cur += PAGE_SIZE) {
pmd_t *pmd = pmd_off_k(k_cur);
- void *va = block + k_cur - k_start;
- pte_t pte = pfn_pte(PHYS_PFN(__pa(va)), PAGE_KERNEL);
+ pte_t pte = pfn_pte(PHYS_PFN(phys + k_cur - k_nobat), PAGE_KERNEL);
__set_pte_at(&init_mm, k_cur, pte_offset_kernel(pmd, k_cur), pte, 0);
}
flush_tlb_kernel_range(k_start, k_end);
+ memset(kasan_mem_to_shadow(start), 0, k_end - k_start);
+
return 0;
}
diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/init_32.c
index cf8770b1a692..1d083597464f 100644
--- a/arch/powerpc/mm/kasan/kasan_init_32.c
+++ b/arch/powerpc/mm/kasan/init_32.c
@@ -7,7 +7,7 @@
#include <linux/memblock.h>
#include <linux/sched/task.h>
#include <asm/pgalloc.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <mm/mmu_decl.h>
static pgprot_t __init kasan_prot_ro(void)
@@ -25,7 +25,7 @@ static void __init kasan_populate_pte(pte_t *ptep, pgprot_t prot)
int i;
for (i = 0; i < PTRS_PER_PTE; i++, ptep++)
- __set_pte_at(&init_mm, va, ptep, pfn_pte(PHYS_PFN(pa), prot), 0);
+ __set_pte_at(&init_mm, va, ptep, pfn_pte(PHYS_PFN(pa), prot), 1);
}
int __init kasan_init_shadow_page_tables(unsigned long k_start, unsigned long k_end)
@@ -64,6 +64,7 @@ int __init __weak kasan_init_region(void *start, size_t size)
if (ret)
return ret;
+ k_start = k_start & PAGE_MASK;
block = memblock_alloc(k_end - k_start, PAGE_SIZE);
if (!block)
return -ENOMEM;
@@ -83,13 +84,12 @@ void __init
kasan_update_early_region(unsigned long k_start, unsigned long k_end, pte_t pte)
{
unsigned long k_cur;
- phys_addr_t pa = __pa(kasan_early_shadow_page);
for (k_cur = k_start; k_cur != k_end; k_cur += PAGE_SIZE) {
pmd_t *pmd = pmd_off_k(k_cur);
pte_t *ptep = pte_offset_kernel(pmd, k_cur);
- if ((pte_val(*ptep) & PTE_RPN_MASK) != pa)
+ if (pte_page(*ptep) != virt_to_page(lm_alias(kasan_early_shadow_page)))
continue;
__set_pte_at(&init_mm, k_cur, ptep, pte, 0);
@@ -165,7 +165,7 @@ void __init kasan_init(void)
/* At this point kasan is fully initialized. Enable error messages */
init_task.kasan_depth = 0;
- pr_info("KASAN init done\n");
+ kasan_init_generic();
}
void __init kasan_late_init(void)
diff --git a/arch/powerpc/mm/kasan/init_book3e_64.c b/arch/powerpc/mm/kasan/init_book3e_64.c
new file mode 100644
index 000000000000..0d3a73d6d4b0
--- /dev/null
+++ b/arch/powerpc/mm/kasan/init_book3e_64.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KASAN for 64-bit Book3e powerpc
+ *
+ * Copyright 2022, Christophe Leroy, CS GROUP France
+ */
+
+#define DISABLE_BRANCH_PROFILING
+
+#include <linux/kasan.h>
+#include <linux/printk.h>
+#include <linux/memblock.h>
+#include <linux/set_memory.h>
+
+#include <asm/pgalloc.h>
+
+static inline bool kasan_pud_table(p4d_t p4d)
+{
+ return p4d_page(p4d) == virt_to_page(lm_alias(kasan_early_shadow_pud));
+}
+
+static inline bool kasan_pmd_table(pud_t pud)
+{
+ return pud_page(pud) == virt_to_page(lm_alias(kasan_early_shadow_pmd));
+}
+
+static inline bool kasan_pte_table(pmd_t pmd)
+{
+ return pmd_page(pmd) == virt_to_page(lm_alias(kasan_early_shadow_pte));
+}
+
+static int __init kasan_map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
+{
+ pgd_t *pgdp;
+ p4d_t *p4dp;
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t *ptep;
+
+ pgdp = pgd_offset_k(ea);
+ p4dp = p4d_offset(pgdp, ea);
+ if (kasan_pud_table(*p4dp)) {
+ pudp = memblock_alloc_or_panic(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
+ memcpy(pudp, kasan_early_shadow_pud, PUD_TABLE_SIZE);
+ p4d_populate(&init_mm, p4dp, pudp);
+ }
+ pudp = pud_offset(p4dp, ea);
+ if (kasan_pmd_table(*pudp)) {
+ pmdp = memblock_alloc_or_panic(PMD_TABLE_SIZE, PMD_TABLE_SIZE);
+ memcpy(pmdp, kasan_early_shadow_pmd, PMD_TABLE_SIZE);
+ pud_populate(&init_mm, pudp, pmdp);
+ }
+ pmdp = pmd_offset(pudp, ea);
+ if (kasan_pte_table(*pmdp)) {
+ ptep = memblock_alloc_or_panic(PTE_TABLE_SIZE, PTE_TABLE_SIZE);
+ memcpy(ptep, kasan_early_shadow_pte, PTE_TABLE_SIZE);
+ pmd_populate_kernel(&init_mm, pmdp, ptep);
+ }
+ ptep = pte_offset_kernel(pmdp, ea);
+
+ __set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot), 0);
+
+ return 0;
+}
+
+static void __init kasan_init_phys_region(void *start, void *end)
+{
+ unsigned long k_start, k_end, k_cur;
+ void *va;
+
+ if (start >= end)
+ return;
+
+ k_start = ALIGN_DOWN((unsigned long)kasan_mem_to_shadow(start), PAGE_SIZE);
+ k_end = ALIGN((unsigned long)kasan_mem_to_shadow(end), PAGE_SIZE);
+
+ va = memblock_alloc_or_panic(k_end - k_start, PAGE_SIZE);
+ for (k_cur = k_start; k_cur < k_end; k_cur += PAGE_SIZE, va += PAGE_SIZE)
+ kasan_map_kernel_page(k_cur, __pa(va), PAGE_KERNEL);
+}
+
+void __init kasan_early_init(void)
+{
+ int i;
+ unsigned long addr;
+ pgd_t *pgd = pgd_offset_k(KASAN_SHADOW_START);
+ pte_t zero_pte = pfn_pte(virt_to_pfn(kasan_early_shadow_page), PAGE_KERNEL);
+
+ BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, PGDIR_SIZE));
+ BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, PGDIR_SIZE));
+
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ __set_pte_at(&init_mm, (unsigned long)kasan_early_shadow_page,
+ &kasan_early_shadow_pte[i], zero_pte, 0);
+
+ for (i = 0; i < PTRS_PER_PMD; i++)
+ pmd_populate_kernel(&init_mm, &kasan_early_shadow_pmd[i],
+ kasan_early_shadow_pte);
+
+ for (i = 0; i < PTRS_PER_PUD; i++)
+ pud_populate(&init_mm, &kasan_early_shadow_pud[i],
+ kasan_early_shadow_pmd);
+
+ for (addr = KASAN_SHADOW_START; addr != KASAN_SHADOW_END; addr += PGDIR_SIZE)
+ p4d_populate(&init_mm, p4d_offset(pgd++, addr), kasan_early_shadow_pud);
+}
+
+void __init kasan_init(void)
+{
+ phys_addr_t start, end;
+ u64 i;
+ pte_t zero_pte = pfn_pte(virt_to_pfn(kasan_early_shadow_page), PAGE_KERNEL_RO);
+
+ for_each_mem_range(i, &start, &end)
+ kasan_init_phys_region(phys_to_virt(start), phys_to_virt(end));
+
+ if (IS_ENABLED(CONFIG_KASAN_VMALLOC))
+ kasan_remove_zero_shadow((void *)VMALLOC_START, VMALLOC_SIZE);
+
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ __set_pte_at(&init_mm, (unsigned long)kasan_early_shadow_page,
+ &kasan_early_shadow_pte[i], zero_pte, 0);
+
+ flush_tlb_kernel_range(KASAN_SHADOW_START, KASAN_SHADOW_END);
+
+ memset(kasan_early_shadow_page, 0, PAGE_SIZE);
+
+ /* Enable error messages */
+ init_task.kasan_depth = 0;
+ kasan_init_generic();
+}
+
+void __init kasan_late_init(void) { }
diff --git a/arch/powerpc/mm/kasan/init_book3s_64.c b/arch/powerpc/mm/kasan/init_book3s_64.c
new file mode 100644
index 000000000000..dcafa641804c
--- /dev/null
+++ b/arch/powerpc/mm/kasan/init_book3s_64.c
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KASAN for 64-bit Book3S powerpc
+ *
+ * Copyright 2019-2022, Daniel Axtens, IBM Corporation.
+ */
+
+/*
+ * ppc64 turns on virtual memory late in boot, after calling into generic code
+ * like the device-tree parser, so it uses this in conjunction with a hook in
+ * outline mode to avoid invalid access early in boot.
+ */
+
+#define DISABLE_BRANCH_PROFILING
+
+#include <linux/kasan.h>
+#include <linux/printk.h>
+#include <linux/sched/task.h>
+#include <linux/memblock.h>
+#include <asm/pgalloc.h>
+
+static void __init kasan_init_phys_region(void *start, void *end)
+{
+ unsigned long k_start, k_end, k_cur;
+ void *va;
+
+ if (start >= end)
+ return;
+
+ k_start = ALIGN_DOWN((unsigned long)kasan_mem_to_shadow(start), PAGE_SIZE);
+ k_end = ALIGN((unsigned long)kasan_mem_to_shadow(end), PAGE_SIZE);
+
+ va = memblock_alloc_or_panic(k_end - k_start, PAGE_SIZE);
+ for (k_cur = k_start; k_cur < k_end; k_cur += PAGE_SIZE, va += PAGE_SIZE)
+ map_kernel_page(k_cur, __pa(va), PAGE_KERNEL);
+}
+
+void __init kasan_init(void)
+{
+ /*
+ * We want to do the following things:
+ * 1) Map real memory into the shadow for all physical memblocks
+ * This takes us from c000... to c008...
+ * 2) Leave a hole over the shadow of vmalloc space. KASAN_VMALLOC
+ * will manage this for us.
+ * This takes us from c008... to c00a...
+ * 3) Map the 'early shadow'/zero page over iomap and vmemmap space.
+ * This takes us up to where we start at c00e...
+ */
+
+ void *k_start = kasan_mem_to_shadow((void *)RADIX_VMALLOC_END);
+ void *k_end = kasan_mem_to_shadow((void *)RADIX_VMEMMAP_END);
+ phys_addr_t start, end;
+ u64 i;
+ pte_t zero_pte = pfn_pte(virt_to_pfn(kasan_early_shadow_page), PAGE_KERNEL);
+
+ if (!early_radix_enabled()) {
+ pr_warn("KASAN not enabled as it requires radix!");
+ return;
+ }
+
+ for_each_mem_range(i, &start, &end)
+ kasan_init_phys_region(phys_to_virt(start), phys_to_virt(end));
+
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ __set_pte_at(&init_mm, (unsigned long)kasan_early_shadow_page,
+ &kasan_early_shadow_pte[i], zero_pte, 0);
+
+ for (i = 0; i < PTRS_PER_PMD; i++)
+ pmd_populate_kernel(&init_mm, &kasan_early_shadow_pmd[i],
+ kasan_early_shadow_pte);
+
+ for (i = 0; i < PTRS_PER_PUD; i++)
+ pud_populate(&init_mm, &kasan_early_shadow_pud[i],
+ kasan_early_shadow_pmd);
+
+ /* map the early shadow over the iomap and vmemmap space */
+ kasan_populate_early_shadow(k_start, k_end);
+
+ /* mark early shadow region as RO and wipe it */
+ zero_pte = pfn_pte(virt_to_pfn(kasan_early_shadow_page), PAGE_KERNEL_RO);
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ __set_pte_at(&init_mm, (unsigned long)kasan_early_shadow_page,
+ &kasan_early_shadow_pte[i], zero_pte, 0);
+
+ /*
+ * clear_page relies on some cache info that hasn't been set up yet.
+ * It ends up looping ~forever and blows up other data.
+ * Use memset instead.
+ */
+ memset(kasan_early_shadow_page, 0, PAGE_SIZE);
+
+ /* Enable error messages */
+ init_task.kasan_depth = 0;
+ kasan_init_generic();
+}
+
+void __init kasan_early_init(void) { }
+
+void __init kasan_late_init(void) { }
diff --git a/arch/powerpc/mm/maccess.c b/arch/powerpc/mm/maccess.c
index aad7c47e0030..ea821d0ffe16 100644
--- a/arch/powerpc/mm/maccess.c
+++ b/arch/powerpc/mm/maccess.c
@@ -11,20 +11,3 @@ bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size)
{
return is_kernel_addr((unsigned long)unsafe_src);
}
-
-int copy_inst_from_kernel_nofault(struct ppc_inst *inst, u32 *src)
-{
- unsigned int val, suffix;
- int err;
-
- err = copy_from_kernel_nofault(&val, src, sizeof(val));
- if (err)
- return err;
- if (IS_ENABLED(CONFIG_PPC64) && get_op(val) == OP_PREFIX) {
- err = copy_from_kernel_nofault(&suffix, src + 1, sizeof(suffix));
- *inst = ppc_inst_prefix(val, suffix);
- } else {
- *inst = ppc_inst(val);
- }
- return err;
-}
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index ad198b439222..3ddbfdbfa941 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -16,33 +16,39 @@
#include <linux/highmem.h>
#include <linux/suspend.h>
#include <linux/dma-direct.h>
+#include <linux/execmem.h>
+#include <linux/vmalloc.h>
+#include <asm/swiotlb.h>
#include <asm/machdep.h>
#include <asm/rtas.h>
#include <asm/kasan.h>
-#include <asm/sparsemem.h>
#include <asm/svm.h>
+#include <asm/mmzone.h>
+#include <asm/ftrace.h>
+#include <asm/text-patching.h>
+#include <asm/setup.h>
+#include <asm/fixmap.h>
#include <mm/mmu_decl.h>
-unsigned long long memory_limit;
-bool init_mem_is_free;
+unsigned long long memory_limit __initdata;
unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss;
EXPORT_SYMBOL(empty_zero_page);
-pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
- unsigned long size, pgprot_t vma_prot)
+pgprot_t __phys_mem_access_prot(unsigned long pfn, unsigned long size,
+ pgprot_t vma_prot)
{
if (ppc_md.phys_mem_access_prot)
- return ppc_md.phys_mem_access_prot(file, pfn, size, vma_prot);
+ return ppc_md.phys_mem_access_prot(pfn, size, vma_prot);
if (!page_is_ram(pfn))
vma_prot = pgprot_noncached(vma_prot);
return vma_prot;
}
-EXPORT_SYMBOL(phys_mem_access_prot);
+EXPORT_SYMBOL(__phys_mem_access_prot);
#ifdef CONFIG_MEMORY_HOTPLUG
static DEFINE_MUTEX(linear_mapping_mutex);
@@ -52,6 +58,7 @@ int memory_add_physaddr_to_nid(u64 start)
{
return hot_add_scn_to_nid(start);
}
+EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif
int __weak create_section_mapping(unsigned long start, unsigned long end,
@@ -103,6 +110,37 @@ void __ref arch_remove_linear_mapping(u64 start, u64 size)
vm_unmap_aliases();
}
+/*
+ * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need
+ * updating.
+ */
+static void update_end_of_memory_vars(u64 start, u64 size)
+{
+ unsigned long end_pfn = PFN_UP(start + size);
+
+ if (end_pfn > max_pfn) {
+ max_pfn = end_pfn;
+ max_low_pfn = end_pfn;
+ high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
+ }
+}
+
+int __ref add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
+ struct mhp_params *params)
+{
+ int ret;
+
+ ret = __add_pages(nid, start_pfn, nr_pages, params);
+ if (ret)
+ return ret;
+
+ /* update max_pfn, max_low_pfn and high_memory */
+ update_end_of_memory_vars(start_pfn << PAGE_SHIFT,
+ nr_pages << PAGE_SHIFT);
+
+ return ret;
+}
+
int __ref arch_add_memory(int nid, u64 start, u64 size,
struct mhp_params *params)
{
@@ -113,14 +151,13 @@ int __ref arch_add_memory(int nid, u64 start, u64 size,
rc = arch_create_linear_mapping(nid, start, size, params);
if (rc)
return rc;
- rc = __add_pages(nid, start_pfn, nr_pages, params);
+ rc = add_pages(nid, start_pfn, nr_pages, params);
if (rc)
arch_remove_linear_mapping(start, size);
return rc;
}
-void __ref arch_remove_memory(int nid, u64 start, u64 size,
- struct vmem_altmap *altmap)
+void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
@@ -179,7 +216,7 @@ static int __init mark_nonram_nosave(void)
* everything else. GFP_DMA32 page allocations automatically fall back to
* ZONE_DMA.
*
- * By using 31-bit unconditionally, we can exploit zone_dma_bits to inform the
+ * By using 31-bit unconditionally, we can exploit zone_dma_limit to inform the
* generic DMA mapping code. 32-bit only devices (if not handled by an IOMMU
* anyway) will take a first dip into ZONE_NORMAL and get otherwise served by
* ZONE_DMA.
@@ -193,6 +230,7 @@ void __init paging_init(void)
{
unsigned long long total_ram = memblock_phys_mem_size();
phys_addr_t top_of_ram = memblock_end_of_DRAM();
+ int zone_dma_bits;
#ifdef CONFIG_HIGHMEM
unsigned long v = __fix_to_virt(FIX_KMAP_END);
@@ -219,6 +257,8 @@ void __init paging_init(void)
else
zone_dma_bits = 31;
+ zone_dma_limit = DMA_BIT_MASK(zone_dma_bits);
+
#ifdef CONFIG_ZONE_DMA
max_zone_pfns[ZONE_DMA] = min(max_low_pfn,
1UL << (zone_dma_bits - PAGE_SHIFT));
@@ -233,7 +273,7 @@ void __init paging_init(void)
mark_nonram_nosave();
}
-void __init mem_init(void)
+void __init arch_mm_preinit(void)
{
/*
* book3s is limited to 16 page sizes due to encoding this in
@@ -250,34 +290,12 @@ void __init mem_init(void)
* back to to-down.
*/
memblock_set_bottom_up(true);
- if (is_secure_guest())
- svm_swiotlb_init();
- else
- swiotlb_init(0);
+ swiotlb_init(ppc_swiotlb_enable, ppc_swiotlb_flags);
#endif
- high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
- set_max_mapnr(max_pfn);
-
kasan_late_init();
- memblock_free_all();
-
-#ifdef CONFIG_HIGHMEM
- {
- unsigned long pfn, highmem_mapnr;
-
- highmem_mapnr = lowmem_end_addr >> PAGE_SHIFT;
- for (pfn = highmem_mapnr; pfn < max_mapnr; ++pfn) {
- phys_addr_t paddr = (phys_addr_t)pfn << PAGE_SHIFT;
- struct page *page = pfn_to_page(pfn);
- if (!memblock_is_reserved(paddr))
- free_highmem_page(page);
- }
- }
-#endif /* CONFIG_HIGHMEM */
-
-#if defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_SMP)
+#if defined(CONFIG_PPC_E500) && !defined(CONFIG_SMP)
/*
* If smp is enabled, next_tlbcam_idx is initialized in the cpu up
* functions.... do it here for the non-smp case.
@@ -285,36 +303,14 @@ void __init mem_init(void)
per_cpu(next_tlbcam_idx, smp_processor_id()) =
(mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) - 1;
#endif
-
-#ifdef CONFIG_PPC32
- pr_info("Kernel virtual memory layout:\n");
-#ifdef CONFIG_KASAN
- pr_info(" * 0x%08lx..0x%08lx : kasan shadow mem\n",
- KASAN_SHADOW_START, KASAN_SHADOW_END);
-#endif
- pr_info(" * 0x%08lx..0x%08lx : fixmap\n", FIXADDR_START, FIXADDR_TOP);
-#ifdef CONFIG_HIGHMEM
- pr_info(" * 0x%08lx..0x%08lx : highmem PTEs\n",
- PKMAP_BASE, PKMAP_ADDR(LAST_PKMAP));
-#endif /* CONFIG_HIGHMEM */
- if (ioremap_bot != IOREMAP_TOP)
- pr_info(" * 0x%08lx..0x%08lx : early ioremap\n",
- ioremap_bot, IOREMAP_TOP);
- pr_info(" * 0x%08lx..0x%08lx : vmalloc & ioremap\n",
- VMALLOC_START, VMALLOC_END);
-#ifdef MODULES_VADDR
- pr_info(" * 0x%08lx..0x%08lx : modules\n",
- MODULES_VADDR, MODULES_END);
-#endif
-#endif /* CONFIG_PPC32 */
}
void free_initmem(void)
{
ppc_md.progress = ppc_printk_progress;
mark_initmem_nx();
- init_mem_is_free = true;
free_initmem_default(POISON_FREE_INITMEM);
+ ftrace_free_init_tramp();
}
/*
@@ -342,7 +338,7 @@ static int __init add_system_ram_resources(void)
*/
res->end = end - 1;
res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
- WARN_ON(request_resource(&iomem_resource, res) < 0);
+ WARN_ON(insert_resource(&iomem_resource, res) < 0);
}
}
@@ -375,3 +371,80 @@ int devmem_is_allowed(unsigned long pfn)
* the EHEA driver. Drop this when drivers/net/ethernet/ibm/ehea is removed.
*/
EXPORT_SYMBOL_GPL(walk_system_ram_range);
+
+#ifdef CONFIG_EXECMEM
+static struct execmem_info execmem_info __ro_after_init;
+
+#if defined(CONFIG_PPC_8xx) || defined(CONFIG_PPC_BOOK3S_603)
+static void prealloc_execmem_pgtable(void)
+{
+ unsigned long va;
+
+ for (va = ALIGN_DOWN(MODULES_VADDR, PGDIR_SIZE); va < MODULES_END; va += PGDIR_SIZE)
+ pte_alloc_kernel(pmd_off_k(va), va);
+}
+#else
+static void prealloc_execmem_pgtable(void) { }
+#endif
+
+struct execmem_info __init *execmem_arch_setup(void)
+{
+ pgprot_t kprobes_prot = strict_module_rwx_enabled() ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC;
+ pgprot_t prot = strict_module_rwx_enabled() ? PAGE_KERNEL : PAGE_KERNEL_EXEC;
+ unsigned long fallback_start = 0, fallback_end = 0;
+ unsigned long start, end;
+
+ /*
+ * BOOK3S_32 and 8xx define MODULES_VADDR for text allocations and
+ * allow allocating data in the entire vmalloc space
+ */
+#ifdef MODULES_VADDR
+ unsigned long limit = (unsigned long)_etext - SZ_32M;
+
+ BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR);
+
+ /* First try within 32M limit from _etext to avoid branch trampolines */
+ if (MODULES_VADDR < PAGE_OFFSET && MODULES_END > limit) {
+ start = limit;
+ fallback_start = MODULES_VADDR;
+ fallback_end = MODULES_END;
+ } else {
+ start = MODULES_VADDR;
+ }
+
+ end = MODULES_END;
+#else
+ start = VMALLOC_START;
+ end = VMALLOC_END;
+#endif
+
+ prealloc_execmem_pgtable();
+
+ execmem_info = (struct execmem_info){
+ .ranges = {
+ [EXECMEM_DEFAULT] = {
+ .start = start,
+ .end = end,
+ .pgprot = prot,
+ .alignment = 1,
+ .fallback_start = fallback_start,
+ .fallback_end = fallback_end,
+ },
+ [EXECMEM_KPROBES] = {
+ .start = VMALLOC_START,
+ .end = VMALLOC_END,
+ .pgprot = kprobes_prot,
+ .alignment = 1,
+ },
+ [EXECMEM_MODULE_DATA] = {
+ .start = VMALLOC_START,
+ .end = VMALLOC_END,
+ .pgprot = PAGE_KERNEL,
+ .alignment = 1,
+ },
+ },
+ };
+
+ return &execmem_info;
+}
+#endif /* CONFIG_EXECMEM */
diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c
deleted file mode 100644
index ae683fdc716c..000000000000
--- a/arch/powerpc/mm/mmap.c
+++ /dev/null
@@ -1,228 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * flexible mmap layout support
- *
- * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
- * All Rights Reserved.
- *
- * Started by Ingo Molnar <mingo@elte.hu>
- */
-
-#include <linux/personality.h>
-#include <linux/mm.h>
-#include <linux/random.h>
-#include <linux/sched/signal.h>
-#include <linux/sched/mm.h>
-#include <linux/elf-randomize.h>
-#include <linux/security.h>
-#include <linux/mman.h>
-
-/*
- * Top of mmap area (just below the process stack).
- *
- * Leave at least a ~128 MB hole.
- */
-#define MIN_GAP (128*1024*1024)
-#define MAX_GAP (TASK_SIZE/6*5)
-
-static inline int mmap_is_legacy(struct rlimit *rlim_stack)
-{
- if (current->personality & ADDR_COMPAT_LAYOUT)
- return 1;
-
- if (rlim_stack->rlim_cur == RLIM_INFINITY)
- return 1;
-
- return sysctl_legacy_va_layout;
-}
-
-unsigned long arch_mmap_rnd(void)
-{
- unsigned long shift, rnd;
-
- shift = mmap_rnd_bits;
-#ifdef CONFIG_COMPAT
- if (is_32bit_task())
- shift = mmap_rnd_compat_bits;
-#endif
- rnd = get_random_long() % (1ul << shift);
-
- return rnd << PAGE_SHIFT;
-}
-
-static inline unsigned long stack_maxrandom_size(void)
-{
- if (!(current->flags & PF_RANDOMIZE))
- return 0;
-
- /* 8MB for 32bit, 1GB for 64bit */
- if (is_32bit_task())
- return (1<<23);
- else
- return (1<<30);
-}
-
-static inline unsigned long mmap_base(unsigned long rnd,
- struct rlimit *rlim_stack)
-{
- unsigned long gap = rlim_stack->rlim_cur;
- unsigned long pad = stack_maxrandom_size() + stack_guard_gap;
-
- /* Values close to RLIM_INFINITY can overflow. */
- if (gap + pad > gap)
- gap += pad;
-
- if (gap < MIN_GAP)
- gap = MIN_GAP;
- else if (gap > MAX_GAP)
- gap = MAX_GAP;
-
- return PAGE_ALIGN(DEFAULT_MAP_WINDOW - gap - rnd);
-}
-
-#ifdef CONFIG_PPC_RADIX_MMU
-/*
- * Same function as generic code used only for radix, because we don't need to overload
- * the generic one. But we will have to duplicate, because hash select
- * HAVE_ARCH_UNMAPPED_AREA
- */
-static unsigned long
-radix__arch_get_unmapped_area(struct file *filp, unsigned long addr,
- unsigned long len, unsigned long pgoff,
- unsigned long flags)
-{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- int fixed = (flags & MAP_FIXED);
- unsigned long high_limit;
- struct vm_unmapped_area_info info;
-
- high_limit = DEFAULT_MAP_WINDOW;
- if (addr >= high_limit || (fixed && (addr + len > high_limit)))
- high_limit = TASK_SIZE;
-
- if (len > high_limit)
- return -ENOMEM;
-
- if (fixed) {
- if (addr > high_limit - len)
- return -ENOMEM;
- return addr;
- }
-
- if (addr) {
- addr = PAGE_ALIGN(addr);
- vma = find_vma(mm, addr);
- if (high_limit - len >= addr && addr >= mmap_min_addr &&
- (!vma || addr + len <= vm_start_gap(vma)))
- return addr;
- }
-
- info.flags = 0;
- info.length = len;
- info.low_limit = mm->mmap_base;
- info.high_limit = high_limit;
- info.align_mask = 0;
-
- return vm_unmapped_area(&info);
-}
-
-static unsigned long
-radix__arch_get_unmapped_area_topdown(struct file *filp,
- const unsigned long addr0,
- const unsigned long len,
- const unsigned long pgoff,
- const unsigned long flags)
-{
- struct vm_area_struct *vma;
- struct mm_struct *mm = current->mm;
- unsigned long addr = addr0;
- int fixed = (flags & MAP_FIXED);
- unsigned long high_limit;
- struct vm_unmapped_area_info info;
-
- high_limit = DEFAULT_MAP_WINDOW;
- if (addr >= high_limit || (fixed && (addr + len > high_limit)))
- high_limit = TASK_SIZE;
-
- if (len > high_limit)
- return -ENOMEM;
-
- if (fixed) {
- if (addr > high_limit - len)
- return -ENOMEM;
- return addr;
- }
-
- if (addr) {
- addr = PAGE_ALIGN(addr);
- vma = find_vma(mm, addr);
- if (high_limit - len >= addr && addr >= mmap_min_addr &&
- (!vma || addr + len <= vm_start_gap(vma)))
- return addr;
- }
-
- info.flags = VM_UNMAPPED_AREA_TOPDOWN;
- info.length = len;
- info.low_limit = max(PAGE_SIZE, mmap_min_addr);
- info.high_limit = mm->mmap_base + (high_limit - DEFAULT_MAP_WINDOW);
- info.align_mask = 0;
-
- addr = vm_unmapped_area(&info);
- if (!(addr & ~PAGE_MASK))
- return addr;
- VM_BUG_ON(addr != -ENOMEM);
-
- /*
- * A failed mmap() very likely causes application failure,
- * so fall back to the bottom-up function here. This scenario
- * can happen with large stack limits and large mmap()
- * allocations.
- */
- return radix__arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
-}
-
-static void radix__arch_pick_mmap_layout(struct mm_struct *mm,
- unsigned long random_factor,
- struct rlimit *rlim_stack)
-{
- if (mmap_is_legacy(rlim_stack)) {
- mm->mmap_base = TASK_UNMAPPED_BASE;
- mm->get_unmapped_area = radix__arch_get_unmapped_area;
- } else {
- mm->mmap_base = mmap_base(random_factor, rlim_stack);
- mm->get_unmapped_area = radix__arch_get_unmapped_area_topdown;
- }
-}
-#else
-/* dummy */
-extern void radix__arch_pick_mmap_layout(struct mm_struct *mm,
- unsigned long random_factor,
- struct rlimit *rlim_stack);
-#endif
-/*
- * This function, called very early during the creation of a new
- * process VM image, sets up which VM layout function to use:
- */
-void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
-{
- unsigned long random_factor = 0UL;
-
- if (current->flags & PF_RANDOMIZE)
- random_factor = arch_mmap_rnd();
-
- if (radix_enabled())
- return radix__arch_pick_mmap_layout(mm, random_factor,
- rlim_stack);
- /*
- * Fall back to the standard layout if the personality
- * bit is set, or if the expected stack growth is unlimited:
- */
- if (mmap_is_legacy(rlim_stack)) {
- mm->mmap_base = TASK_UNMAPPED_BASE;
- mm->get_unmapped_area = arch_get_unmapped_area;
- } else {
- mm->mmap_base = mmap_base(random_factor, rlim_stack);
- mm->get_unmapped_area = arch_get_unmapped_area_topdown;
- }
-}
diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c
index 74246536b832..3e3af29b4523 100644
--- a/arch/powerpc/mm/mmu_context.c
+++ b/arch/powerpc/mm/mmu_context.c
@@ -18,6 +18,12 @@ static inline void switch_mm_pgdir(struct task_struct *tsk,
{
/* 32-bit keeps track of the current PGDIR in the thread struct */
tsk->thread.pgdir = mm->pgd;
+#ifdef CONFIG_PPC_BOOK3S_32
+ tsk->thread.sr0 = mm->context.sr0;
+#endif
+#if defined(CONFIG_BOOKE) && defined(CONFIG_PPC_KUAP)
+ tsk->thread.pid = mm->context.id;
+#endif
}
#elif defined(CONFIG_PPC_BOOK3E_64)
static inline void switch_mm_pgdir(struct task_struct *tsk,
@@ -25,6 +31,9 @@ static inline void switch_mm_pgdir(struct task_struct *tsk,
{
/* 64-bit Book3E keeps track of current PGD in the PACA */
get_paca()->pgd = mm->pgd;
+#ifdef CONFIG_PPC_KUAP
+ tsk->thread.pid = mm->context.id;
+#endif
}
#else
static inline void switch_mm_pgdir(struct task_struct *tsk,
@@ -34,11 +43,13 @@ static inline void switch_mm_pgdir(struct task_struct *tsk,
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
+ int cpu = smp_processor_id();
bool new_on_cpu = false;
/* Mark this context has been used on the new CPU */
- if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(next))) {
- cpumask_set_cpu(smp_processor_id(), mm_cpumask(next));
+ if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
+ VM_WARN_ON_ONCE(next == &init_mm);
+ cpumask_set_cpu(cpu, mm_cpumask(next));
inc_mm_active_cpus(next);
/*
@@ -81,7 +92,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* context
*/
if (cpu_has_feature(CPU_FTR_ALTIVEC))
- asm volatile ("dssall");
+ asm volatile (PPC_DSSALL);
if (!new_on_cpu)
membarrier_arch_switch_mm(prev, next, tsk);
@@ -91,6 +102,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* sub architectures. Out of line for now
*/
switch_mmu_context(prev, next, tsk);
+
+ VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(prev)));
}
#ifndef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index 7dac910c0b21..b2d1eea09761 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -20,9 +20,9 @@
#include <asm/trace.h>
/*
- * On 40x and 8xx, we directly inline tlbia and tlbivax
+ * On 8xx, we directly inline tlbia
*/
-#if defined(CONFIG_40x) || defined(CONFIG_PPC_8xx)
+#ifdef CONFIG_PPC_8xx
static inline void _tlbil_all(void)
{
asm volatile ("sync; tlbia; isync" : : : "memory");
@@ -35,15 +35,15 @@ static inline void _tlbil_pid(unsigned int pid)
}
#define _tlbil_pid_noind(pid) _tlbil_pid(pid)
-#else /* CONFIG_40x || CONFIG_PPC_8xx */
+#else /* CONFIG_PPC_8xx */
extern void _tlbil_all(void);
extern void _tlbil_pid(unsigned int pid);
-#ifdef CONFIG_PPC_BOOK3E
+#ifdef CONFIG_PPC_BOOK3E_64
extern void _tlbil_pid_noind(unsigned int pid);
#else
#define _tlbil_pid_noind(pid) _tlbil_pid(pid)
#endif
-#endif /* !(CONFIG_40x || CONFIG_PPC_8xx) */
+#endif /* !CONFIG_PPC_8xx */
/*
* On 8xx, we directly inline tlbie, on others, it's extern
@@ -55,7 +55,7 @@ static inline void _tlbil_va(unsigned long address, unsigned int pid,
asm volatile ("tlbie %0; sync" : : "r" (address) : "memory");
trace_tlbie(0, 0, address, pid, 0, 0, 0);
}
-#elif defined(CONFIG_PPC_BOOK3E)
+#elif defined(CONFIG_PPC_BOOK3E_64)
extern void _tlbil_va(unsigned long address, unsigned int pid,
unsigned int tsize, unsigned int ind);
#else
@@ -67,7 +67,7 @@ static inline void _tlbil_va(unsigned long address, unsigned int pid,
}
#endif /* CONFIG_PPC_8xx */
-#if defined(CONFIG_PPC_BOOK3E) || defined(CONFIG_PPC_47x)
+#if defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_47x)
extern void _tlbivax_bcast(unsigned long address, unsigned int pid,
unsigned int tsize, unsigned int ind);
#else
@@ -92,29 +92,16 @@ extern void mapin_ram(void);
extern void setbat(int index, unsigned long virt, phys_addr_t phys,
unsigned int size, pgprot_t prot);
-extern int __map_without_bats;
-extern unsigned int rtas_data, rtas_size;
-
-struct hash_pte;
extern u8 early_hash[];
#endif /* CONFIG_PPC32 */
extern unsigned long __max_low_memory;
-extern phys_addr_t __initial_memory_limit_addr;
extern phys_addr_t total_memory;
extern phys_addr_t total_lowmem;
extern phys_addr_t memstart_addr;
extern phys_addr_t lowmem_end_addr;
-#ifdef CONFIG_WII
-extern unsigned long wii_hole_start;
-extern unsigned long wii_hole_size;
-
-extern unsigned long wii_mmu_mapin_mem2(unsigned long top);
-extern void wii_memory_fixups(void);
-#endif
-
/* ...and now those things that may be slightly different between processor
* architectures. -- Dan
*/
@@ -123,18 +110,18 @@ extern void MMU_init_hw(void);
void MMU_init_hw_patch(void);
unsigned long mmu_mapin_ram(unsigned long base, unsigned long top);
#endif
+void mmu_init_secondary(int cpu);
-#ifdef CONFIG_PPC_FSL_BOOK3E
+#ifdef CONFIG_PPC_E500
extern unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx,
- bool dryrun);
-extern unsigned long calc_cam_sz(unsigned long ram, unsigned long virt,
- phys_addr_t phys);
+ bool dryrun, bool init);
#ifdef CONFIG_PPC32
extern void adjust_total_lowmem(void);
extern int switch_to_as1(void);
extern void restore_to_as0(int esel, int offset, void *dt_ptr, int bootcpu);
void create_kaslr_tlb_entry(int entry, unsigned long virt, phys_addr_t phys);
void reloc_kernel_entry(void *fdt, int addr);
+void relocate_init(u64 dt_ptr, phys_addr_t start);
extern int is_second_reloc;
#endif
extern void loadcam_entry(unsigned int index);
@@ -155,11 +142,15 @@ struct tlbcam {
u32 MAS3;
u32 MAS7;
};
+
+#define NUM_TLBCAMS 64
+
+extern struct tlbcam TLBCAM[NUM_TLBCAMS];
#endif
-#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_FSL_BOOKE) || defined(CONFIG_PPC_8xx)
+#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_PPC_85xx) || defined(CONFIG_PPC_8xx)
/* 6xx have BATS */
-/* FSL_BOOKE have TLBCAM */
+/* PPC_85xx have TLBCAM */
/* 8xx have LTLB */
phys_addr_t v_block_mapped(unsigned long va);
unsigned long p_block_mapped(phys_addr_t pa);
@@ -168,25 +159,26 @@ static inline phys_addr_t v_block_mapped(unsigned long va) { return 0; }
static inline unsigned long p_block_mapped(phys_addr_t pa) { return 0; }
#endif
-#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_PPC_8xx)
-void mmu_mark_initmem_nx(void);
-void mmu_mark_rodata_ro(void);
+#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_PPC_8xx) || defined(CONFIG_PPC_E500)
+int mmu_mark_initmem_nx(void);
+int mmu_mark_rodata_ro(void);
#else
-static inline void mmu_mark_initmem_nx(void) { }
-static inline void mmu_mark_rodata_ro(void) { }
+static inline int mmu_mark_initmem_nx(void) { return 0; }
+static inline int mmu_mark_rodata_ro(void) { return 0; }
#endif
#ifdef CONFIG_PPC_8xx
void __init mmu_mapin_immr(void);
#endif
-#ifdef CONFIG_PPC_DEBUG_WX
-void ptdump_check_wx(void);
-#else
-static inline void ptdump_check_wx(void) { }
-#endif
-
static inline bool debug_pagealloc_enabled_or_kfence(void)
{
return IS_ENABLED(CONFIG_KFENCE) || debug_pagealloc_enabled();
}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int create_section_mapping(unsigned long start, unsigned long end,
+ int nid, pgprot_t prot);
+#endif
+
+int hash__kernel_map_pages(struct page *page, int numpages, int enable);
diff --git a/arch/powerpc/mm/nohash/40x.c b/arch/powerpc/mm/nohash/40x.c
deleted file mode 100644
index 95751c322f6c..000000000000
--- a/arch/powerpc/mm/nohash/40x.c
+++ /dev/null
@@ -1,152 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * This file contains the routines for initializing the MMU
- * on the 4xx series of chips.
- * -- paulus
- *
- * Derived from arch/ppc/mm/init.c:
- * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
- *
- * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
- * and Cort Dougan (PReP) (cort@cs.nmt.edu)
- * Copyright (C) 1996 Paul Mackerras
- *
- * Derived from "arch/i386/mm/init.c"
- * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
- */
-
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/ptrace.h>
-#include <linux/mman.h>
-#include <linux/mm.h>
-#include <linux/swap.h>
-#include <linux/stddef.h>
-#include <linux/vmalloc.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/highmem.h>
-#include <linux/memblock.h>
-
-#include <asm/prom.h>
-#include <asm/io.h>
-#include <asm/mmu_context.h>
-#include <asm/mmu.h>
-#include <linux/uaccess.h>
-#include <asm/smp.h>
-#include <asm/bootx.h>
-#include <asm/machdep.h>
-#include <asm/setup.h>
-
-#include <mm/mmu_decl.h>
-
-extern int __map_without_ltlbs;
-/*
- * MMU_init_hw does the chip-specific initialization of the MMU hardware.
- */
-void __init MMU_init_hw(void)
-{
- /*
- * The Zone Protection Register (ZPR) defines how protection will
- * be applied to every page which is a member of a given zone. At
- * present, we utilize only two of the 4xx's zones.
- * The zone index bits (of ZSEL) in the PTE are used for software
- * indicators, except the LSB. For user access, zone 1 is used,
- * for kernel access, zone 0 is used. We set all but zone 1
- * to zero, allowing only kernel access as indicated in the PTE.
- * For zone 1, we set a 01 binary (a value of 10 will not work)
- * to allow user access as indicated in the PTE. This also allows
- * kernel access as indicated in the PTE.
- */
-
- mtspr(SPRN_ZPR, 0x10000000);
-
- flush_instruction_cache();
-
- /*
- * Set up the real-mode cache parameters for the exception vector
- * handlers (which are run in real-mode).
- */
-
- mtspr(SPRN_DCWR, 0x00000000); /* All caching is write-back */
-
- /*
- * Cache instruction and data space where the exception
- * vectors and the kernel live in real-mode.
- */
-
- mtspr(SPRN_DCCR, 0xFFFF0000); /* 2GByte of data space at 0x0. */
- mtspr(SPRN_ICCR, 0xFFFF0000); /* 2GByte of instr. space at 0x0. */
-}
-
-#define LARGE_PAGE_SIZE_16M (1<<24)
-#define LARGE_PAGE_SIZE_4M (1<<22)
-
-unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top)
-{
- unsigned long v, s, mapped;
- phys_addr_t p;
-
- v = KERNELBASE;
- p = 0;
- s = total_lowmem;
-
- if (__map_without_ltlbs)
- return 0;
-
- while (s >= LARGE_PAGE_SIZE_16M) {
- pmd_t *pmdp;
- unsigned long val = p | _PMD_SIZE_16M | _PAGE_EXEC | _PAGE_RW;
-
- pmdp = pmd_off_k(v);
- *pmdp++ = __pmd(val);
- *pmdp++ = __pmd(val);
- *pmdp++ = __pmd(val);
- *pmdp++ = __pmd(val);
-
- v += LARGE_PAGE_SIZE_16M;
- p += LARGE_PAGE_SIZE_16M;
- s -= LARGE_PAGE_SIZE_16M;
- }
-
- while (s >= LARGE_PAGE_SIZE_4M) {
- pmd_t *pmdp;
- unsigned long val = p | _PMD_SIZE_4M | _PAGE_EXEC | _PAGE_RW;
-
- pmdp = pmd_off_k(v);
- *pmdp = __pmd(val);
-
- v += LARGE_PAGE_SIZE_4M;
- p += LARGE_PAGE_SIZE_4M;
- s -= LARGE_PAGE_SIZE_4M;
- }
-
- mapped = total_lowmem - s;
-
- /* If the size of RAM is not an exact power of two, we may not
- * have covered RAM in its entirety with 16 and 4 MiB
- * pages. Consequently, restrict the top end of RAM currently
- * allocable so that calls to the MEMBLOCK to allocate PTEs for "tail"
- * coverage with normal-sized pages (or other reasons) do not
- * attempt to allocate outside the allowed range.
- */
- memblock_set_current_limit(mapped);
-
- return mapped;
-}
-
-void setup_initial_memory_limit(phys_addr_t first_memblock_base,
- phys_addr_t first_memblock_size)
-{
- /* We don't currently support the first MEMBLOCK not mapping 0
- * physical on those processors
- */
- BUG_ON(first_memblock_base != 0);
-
- /* 40x can only access 16MB at the moment (see head_40x.S) */
- memblock_set_current_limit(min_t(u64, first_memblock_size, 0x00800000));
-}
diff --git a/arch/powerpc/mm/nohash/44x.c b/arch/powerpc/mm/nohash/44x.c
index e079f26b267e..6d10c6d8be71 100644
--- a/arch/powerpc/mm/nohash/44x.c
+++ b/arch/powerpc/mm/nohash/44x.c
@@ -24,7 +24,7 @@
#include <asm/mmu.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <asm/smp.h>
#include <mm/mmu_decl.h>
@@ -38,7 +38,7 @@ int icache_44x_need_flush;
unsigned long tlb_47x_boltmap[1024/8];
-static void ppc44x_update_tlb_hwater(void)
+static void __init ppc44x_update_tlb_hwater(void)
{
/* The TLB miss handlers hard codes the watermark in a cmpli
* instruction to improve performances rather than loading it
@@ -122,7 +122,7 @@ static void __init ppc47x_update_boltmap(void)
/*
* "Pins" a 256MB TLB entry in AS0 for kernel lowmem for 47x type MMU
*/
-static void ppc47x_pin_tlb(unsigned int virt, unsigned int phys)
+static void __init ppc47x_pin_tlb(unsigned int virt, unsigned int phys)
{
unsigned int rA;
int bolted;
@@ -240,19 +240,3 @@ void __init mmu_init_secondary(int cpu)
}
}
#endif /* CONFIG_SMP */
-
-#ifdef CONFIG_PPC_KUEP
-void setup_kuep(bool disabled)
-{
- if (smp_processor_id() != boot_cpuid)
- return;
-
- if (disabled)
- patch_instruction_site(&patch__tlb_44x_kuep, ppc_inst(PPC_RAW_NOP()));
- else
- pr_info("Activating Kernel Userspace Execution Prevention\n");
-
- if (IS_ENABLED(CONFIG_PPC_47x) && disabled)
- patch_instruction_site(&patch__tlb_47x_kuep, ppc_inst(PPC_RAW_NOP()));
-}
-#endif
diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c
index 60780e089118..ab1505cf42bf 100644
--- a/arch/powerpc/mm/nohash/8xx.c
+++ b/arch/powerpc/mm/nohash/8xx.c
@@ -8,18 +8,15 @@
*/
#include <linux/memblock.h>
-#include <linux/mmu_context.h>
#include <linux/hugetlb.h>
+
#include <asm/fixmap.h>
-#include <asm/code-patching.h>
-#include <asm/inst.h>
+#include <asm/pgalloc.h>
#include <mm/mmu_decl.h>
#define IMMR_SIZE (FIX_IMMR_SIZE << PAGE_SHIFT)
-extern int __map_without_ltlbs;
-
static unsigned long block_mapped_ram;
/*
@@ -32,8 +29,6 @@ phys_addr_t v_block_mapped(unsigned long va)
if (va >= VIRT_IMMR_BASE && va < VIRT_IMMR_BASE + IMMR_SIZE)
return p + va - VIRT_IMMR_BASE;
- if (__map_without_ltlbs)
- return 0;
if (va >= PAGE_OFFSET && va < PAGE_OFFSET + block_mapped_ram)
return __pa(va);
return 0;
@@ -49,59 +44,52 @@ unsigned long p_block_mapped(phys_addr_t pa)
if (pa >= p && pa < p + IMMR_SIZE)
return VIRT_IMMR_BASE + pa - p;
- if (__map_without_ltlbs)
- return 0;
if (pa < block_mapped_ram)
return (unsigned long)__va(pa);
return 0;
}
-static pte_t __init *early_hugepd_alloc_kernel(hugepd_t *pmdp, unsigned long va)
-{
- if (hpd_val(*pmdp) == 0) {
- pte_t *ptep = memblock_alloc(sizeof(pte_basic_t), SZ_4K);
-
- if (!ptep)
- return NULL;
-
- hugepd_populate_kernel((hugepd_t *)pmdp, ptep, PAGE_SHIFT_8M);
- hugepd_populate_kernel((hugepd_t *)pmdp + 1, ptep, PAGE_SHIFT_8M);
- }
- return hugepte_offset(*(hugepd_t *)pmdp, va, PGDIR_SHIFT);
-}
-
static int __ref __early_map_kernel_hugepage(unsigned long va, phys_addr_t pa,
pgprot_t prot, int psize, bool new)
{
pmd_t *pmdp = pmd_off_k(va);
pte_t *ptep;
-
- if (WARN_ON(psize != MMU_PAGE_512K && psize != MMU_PAGE_8M))
- return -EINVAL;
+ unsigned int shift = mmu_psize_to_shift(psize);
if (new) {
if (WARN_ON(slab_is_available()))
return -EINVAL;
- if (psize == MMU_PAGE_512K)
+ if (psize == MMU_PAGE_8M) {
+ if (WARN_ON(!pmd_none(*pmdp) || !pmd_none(*(pmdp + 1))))
+ return -EINVAL;
+
+ ptep = early_alloc_pgtable(PTE_FRAG_SIZE);
+ pmd_populate_kernel(&init_mm, pmdp, ptep);
+
+ ptep = early_alloc_pgtable(PTE_FRAG_SIZE);
+ pmd_populate_kernel(&init_mm, pmdp + 1, ptep);
+
+ ptep = (pte_t *)pmdp;
+ } else {
ptep = early_pte_alloc_kernel(pmdp, va);
- else
- ptep = early_hugepd_alloc_kernel((hugepd_t *)pmdp, va);
+ /* The PTE should never be already present */
+ if (WARN_ON(pte_present(*ptep) && pgprot_val(prot)))
+ return -EINVAL;
+ }
} else {
- if (psize == MMU_PAGE_512K)
- ptep = pte_offset_kernel(pmdp, va);
+ if (psize == MMU_PAGE_8M)
+ ptep = (pte_t *)pmdp;
else
- ptep = hugepte_offset(*(hugepd_t *)pmdp, va, PGDIR_SHIFT);
+ ptep = pte_offset_kernel(pmdp, va);
}
if (WARN_ON(!ptep))
return -ENOMEM;
- /* The PTE should never be already present */
- if (new && WARN_ON(pte_present(*ptep) && pgprot_val(prot)))
- return -EINVAL;
-
- set_huge_pte_at(&init_mm, va, ptep, pte_mkhuge(pfn_pte(pa >> PAGE_SHIFT, prot)));
+ set_huge_pte_at(&init_mm, va, ptep,
+ arch_make_huge_pte(pfn_pte(pa >> PAGE_SHIFT, prot), shift, 0),
+ 1UL << shift);
return 0;
}
@@ -126,23 +114,30 @@ void __init mmu_mapin_immr(void)
PAGE_KERNEL_NCG, MMU_PAGE_512K, true);
}
-static void mmu_mapin_ram_chunk(unsigned long offset, unsigned long top,
- pgprot_t prot, bool new)
+static int mmu_mapin_ram_chunk(unsigned long offset, unsigned long top,
+ pgprot_t prot, bool new)
{
unsigned long v = PAGE_OFFSET + offset;
unsigned long p = offset;
+ int err = 0;
- WARN_ON(!IS_ALIGNED(offset, SZ_512K) || !IS_ALIGNED(top, SZ_512K));
+ WARN_ON(!IS_ALIGNED(offset, SZ_16K) || !IS_ALIGNED(top, SZ_16K));
- for (; p < ALIGN(p, SZ_8M) && p < top; p += SZ_512K, v += SZ_512K)
- __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_512K, new);
- for (; p < ALIGN_DOWN(top, SZ_8M) && p < top; p += SZ_8M, v += SZ_8M)
- __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_8M, new);
- for (; p < ALIGN_DOWN(top, SZ_512K) && p < top; p += SZ_512K, v += SZ_512K)
- __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_512K, new);
+ for (; p < ALIGN(p, SZ_512K) && p < top && !err; p += SZ_16K, v += SZ_16K)
+ err = __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_16K, new);
+ for (; p < ALIGN(p, SZ_8M) && p < top && !err; p += SZ_512K, v += SZ_512K)
+ err = __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_512K, new);
+ for (; p < ALIGN_DOWN(top, SZ_8M) && p < top && !err; p += SZ_8M, v += SZ_8M)
+ err = __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_8M, new);
+ for (; p < ALIGN_DOWN(top, SZ_512K) && p < top && !err; p += SZ_512K, v += SZ_512K)
+ err = __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_512K, new);
+ for (; p < ALIGN_DOWN(top, SZ_16K) && p < top && !err; p += SZ_16K, v += SZ_16K)
+ err = __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_16K, new);
if (!new)
flush_tlb_kernel_range(PAGE_OFFSET + v, PAGE_OFFSET + top);
+
+ return err;
}
unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top)
@@ -157,14 +152,11 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top)
mmu_mapin_immr();
- if (__map_without_ltlbs)
- return 0;
-
- mmu_mapin_ram_chunk(0, boundary, PAGE_KERNEL_TEXT, true);
+ mmu_mapin_ram_chunk(0, boundary, PAGE_KERNEL_X, true);
if (debug_pagealloc_enabled_or_kfence()) {
top = boundary;
} else {
- mmu_mapin_ram_chunk(boundary, einittext8, PAGE_KERNEL_TEXT, true);
+ mmu_mapin_ram_chunk(boundary, einittext8, PAGE_KERNEL_X, true);
mmu_mapin_ram_chunk(einittext8, top, PAGE_KERNEL, true);
}
@@ -176,27 +168,34 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top)
return top;
}
-void mmu_mark_initmem_nx(void)
+int mmu_mark_initmem_nx(void)
{
unsigned long etext8 = ALIGN(__pa(_etext), SZ_8M);
unsigned long sinittext = __pa(_sinittext);
unsigned long boundary = strict_kernel_rwx_enabled() ? sinittext : etext8;
unsigned long einittext8 = ALIGN(__pa(_einittext), SZ_8M);
+ int err = 0;
- mmu_mapin_ram_chunk(0, boundary, PAGE_KERNEL_TEXT, false);
- mmu_mapin_ram_chunk(boundary, einittext8, PAGE_KERNEL, false);
+ if (!debug_pagealloc_enabled_or_kfence())
+ err = mmu_mapin_ram_chunk(boundary, einittext8, PAGE_KERNEL, false);
- mmu_pin_tlb(block_mapped_ram, false);
+ if (IS_ENABLED(CONFIG_PIN_TLB_TEXT))
+ mmu_pin_tlb(block_mapped_ram, false);
+
+ return err;
}
#ifdef CONFIG_STRICT_KERNEL_RWX
-void mmu_mark_rodata_ro(void)
+int mmu_mark_rodata_ro(void)
{
unsigned long sinittext = __pa(_sinittext);
+ int err;
- mmu_mapin_ram_chunk(0, sinittext, PAGE_KERNEL_ROX, false);
+ err = mmu_mapin_ram_chunk(0, sinittext, PAGE_KERNEL_ROX, false);
if (IS_ENABLED(CONFIG_PIN_TLB_DATA))
mmu_pin_tlb(block_mapped_ram, true);
+
+ return err;
}
#endif
@@ -210,33 +209,16 @@ void __init setup_initial_memory_limit(phys_addr_t first_memblock_base,
/* 8xx can only access 32MB at the moment */
memblock_set_current_limit(min_t(u64, first_memblock_size, SZ_32M));
+
+ BUILD_BUG_ON(ALIGN_DOWN(MODULES_VADDR, PGDIR_SIZE) < TASK_SIZE);
}
-#ifdef CONFIG_PPC_KUEP
-void __init setup_kuep(bool disabled)
+int pud_clear_huge(pud_t *pud)
{
- if (disabled)
- return;
-
- pr_info("Activating Kernel Userspace Execution Prevention\n");
-
- mtspr(SPRN_MI_AP, MI_APG_KUEP);
+ return 0;
}
-#endif
-
-#ifdef CONFIG_PPC_KUAP
-struct static_key_false disable_kuap_key;
-EXPORT_SYMBOL(disable_kuap_key);
-void __init setup_kuap(bool disabled)
+int pmd_clear_huge(pmd_t *pmd)
{
- if (disabled) {
- static_branch_enable(&disable_kuap_key);
- return;
- }
-
- pr_info("Activating Kernel Userspace Access Protection\n");
-
- mtspr(SPRN_MD_AP, MD_APG_KUAP);
+ return 0;
}
-#endif
diff --git a/arch/powerpc/mm/nohash/Makefile b/arch/powerpc/mm/nohash/Makefile
index 0424f6ce5bd8..cf60c776c883 100644
--- a/arch/powerpc/mm/nohash/Makefile
+++ b/arch/powerpc/mm/nohash/Makefile
@@ -1,19 +1,16 @@
# SPDX-License-Identifier: GPL-2.0
-ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC)
-
-obj-y += mmu_context.o tlb.o tlb_low.o
-obj-$(CONFIG_PPC_BOOK3E_64) += tlb_low_64e.o book3e_pgtable.o
-obj-$(CONFIG_40x) += 40x.o
+obj-y += mmu_context.o tlb.o tlb_low.o kup.o
+obj-$(CONFIG_PPC_BOOK3E_64) += tlb_64e.o tlb_low_64e.o book3e_pgtable.o
obj-$(CONFIG_44x) += 44x.o
obj-$(CONFIG_PPC_8xx) += 8xx.o
-obj-$(CONFIG_PPC_FSL_BOOK3E) += fsl_booke.o
+obj-$(CONFIG_PPC_E500) += e500.o
obj-$(CONFIG_RANDOMIZE_BASE) += kaslr_booke.o
ifdef CONFIG_HUGETLB_PAGE
-obj-$(CONFIG_PPC_FSL_BOOK3E) += book3e_hugetlbpage.o
+obj-$(CONFIG_PPC_E500) += e500_hugetlbpage.o
endif
# Disable kcov instrumentation on sensitive code
# This is necessary for booting with kcov enabled on book3e machines
KCOV_INSTRUMENT_tlb.o := n
-KCOV_INSTRUMENT_fsl_booke.o := n
+KCOV_INSTRUMENT_e500.o := n
diff --git a/arch/powerpc/mm/nohash/book3e_pgtable.c b/arch/powerpc/mm/nohash/book3e_pgtable.c
index 77884e24281d..062e8785c1bb 100644
--- a/arch/powerpc/mm/nohash/book3e_pgtable.c
+++ b/arch/powerpc/mm/nohash/book3e_pgtable.c
@@ -10,6 +10,7 @@
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/dma.h>
+#include <asm/text-patching.h>
#include <mm/mmu_decl.h>
@@ -28,10 +29,10 @@ int __meminit vmemmap_create_mapping(unsigned long start,
_PAGE_KERNEL_RW;
/* PTEs only contain page size encodings up to 32M */
- BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf);
+ BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].shift - 10 > 0xf);
/* Encode the size in the PTE */
- flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8;
+ flags |= (mmu_psize_defs[mmu_vmemmap_psize].shift - 10) << 8;
/* For each PTE for that area, map things. Note that we don't
* increment phys because all PTEs are of the large size and
@@ -70,7 +71,7 @@ static void __init *early_alloc_pgtable(unsigned long size)
* map_kernel_page adds an entry to the ioremap page table
* and adds an entry to the HPT, possibly bolting it
*/
-int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
+int __ref map_kernel_page(unsigned long ea, phys_addr_t pa, pgprot_t prot)
{
pgd_t *pgdp;
p4d_t *p4dp;
@@ -95,8 +96,8 @@ int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
pgdp = pgd_offset_k(ea);
p4dp = p4d_offset(pgdp, ea);
if (p4d_none(*p4dp)) {
- pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
- p4d_populate(&init_mm, p4dp, pmdp);
+ pudp = early_alloc_pgtable(PUD_TABLE_SIZE);
+ p4d_populate(&init_mm, p4dp, pudp);
}
pudp = pud_offset(p4dp, ea);
if (pud_none(*pudp)) {
@@ -105,7 +106,7 @@ int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
}
pmdp = pmd_offset(pudp, ea);
if (!pmd_present(*pmdp)) {
- ptep = early_alloc_pgtable(PAGE_SIZE);
+ ptep = early_alloc_pgtable(PTE_TABLE_SIZE);
pmd_populate_kernel(&init_mm, pmdp, ptep);
}
ptep = pte_offset_kernel(pmdp, ea);
@@ -115,3 +116,17 @@ int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
smp_wmb();
return 0;
}
+
+void __patch_exception(int exc, unsigned long addr)
+{
+ unsigned int *ibase = &interrupt_base_book3e;
+
+ /*
+ * Our exceptions vectors start with a NOP and -then- a branch
+ * to deal with single stepping from userspace which stops on
+ * the second instruction. Thus we need to patch the second
+ * instruction of the exception, not the first one.
+ */
+
+ patch_branch(ibase + (exc / 4) + 1, addr, 0);
+}
diff --git a/arch/powerpc/mm/nohash/fsl_booke.c b/arch/powerpc/mm/nohash/e500.c
index 03dacbe940e5..266fb22131fc 100644
--- a/arch/powerpc/mm/nohash/fsl_booke.c
+++ b/arch/powerpc/mm/nohash/e500.c
@@ -36,8 +36,8 @@
#include <linux/delay.h>
#include <linux/highmem.h>
#include <linux/memblock.h>
+#include <linux/of_fdt.h>
-#include <asm/prom.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/mmu.h>
@@ -51,21 +51,15 @@
unsigned int tlbcam_index;
-#define NUM_TLBCAMS (64)
struct tlbcam TLBCAM[NUM_TLBCAMS];
-struct tlbcamrange {
+static struct {
unsigned long start;
unsigned long limit;
phys_addr_t phys;
} tlbcam_addrs[NUM_TLBCAMS];
-unsigned long tlbcam_sz(int idx)
-{
- return tlbcam_addrs[idx].limit - tlbcam_addrs[idx].start + 1;
-}
-
-#ifdef CONFIG_FSL_BOOKE
+#ifdef CONFIG_PPC_85xx
/*
* Return PA for this VA if it is mapped by a CAM, or 0
*/
@@ -122,15 +116,18 @@ static void settlbcam(int index, unsigned long virt, phys_addr_t phys,
TLBCAM[index].MAS2 |= (flags & _PAGE_GUARDED) ? MAS2_G : 0;
TLBCAM[index].MAS2 |= (flags & _PAGE_ENDIAN) ? MAS2_E : 0;
- TLBCAM[index].MAS3 = (phys & MAS3_RPN) | MAS3_SX | MAS3_SR;
- TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_SW : 0);
+ TLBCAM[index].MAS3 = (phys & MAS3_RPN) | MAS3_SR;
+ TLBCAM[index].MAS3 |= (flags & _PAGE_WRITE) ? MAS3_SW : 0;
if (mmu_has_feature(MMU_FTR_BIG_PHYS))
TLBCAM[index].MAS7 = (u64)phys >> 32;
/* Below is unlikely -- only for large user pages or similar */
- if (pte_user(__pte(flags))) {
- TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR;
- TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_UW : 0);
+ if (!is_kernel_addr(virt)) {
+ TLBCAM[index].MAS3 |= MAS3_UR;
+ TLBCAM[index].MAS3 |= (flags & _PAGE_EXEC) ? MAS3_UX : 0;
+ TLBCAM[index].MAS3 |= (flags & _PAGE_WRITE) ? MAS3_UW : 0;
+ } else {
+ TLBCAM[index].MAS3 |= (flags & _PAGE_EXEC) ? MAS3_SX : 0;
}
tlbcam_addrs[index].start = virt;
@@ -138,8 +135,8 @@ static void settlbcam(int index, unsigned long virt, phys_addr_t phys,
tlbcam_addrs[index].phys = phys;
}
-unsigned long calc_cam_sz(unsigned long ram, unsigned long virt,
- phys_addr_t phys)
+static unsigned long calc_cam_sz(unsigned long ram, unsigned long virt,
+ phys_addr_t phys)
{
unsigned int camsize = __ilog2(ram);
unsigned int align = __ffs(virt | phys);
@@ -165,19 +162,38 @@ unsigned long calc_cam_sz(unsigned long ram, unsigned long virt,
static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt,
unsigned long ram, int max_cam_idx,
- bool dryrun)
+ bool dryrun, bool init)
{
int i;
unsigned long amount_mapped = 0;
+ unsigned long boundary;
+
+ if (strict_kernel_rwx_enabled())
+ boundary = (unsigned long)(_sinittext - _stext);
+ else
+ boundary = ram;
/* Calculate CAM values */
- for (i = 0; ram && i < max_cam_idx; i++) {
+ for (i = 0; boundary && i < max_cam_idx; i++) {
+ unsigned long cam_sz;
+ pgprot_t prot = init ? PAGE_KERNEL_X : PAGE_KERNEL_ROX;
+
+ cam_sz = calc_cam_sz(boundary, virt, phys);
+ if (!dryrun)
+ settlbcam(i, virt, phys, cam_sz, pgprot_val(prot), 0);
+
+ boundary -= cam_sz;
+ amount_mapped += cam_sz;
+ virt += cam_sz;
+ phys += cam_sz;
+ }
+ for (ram -= amount_mapped; ram && i < max_cam_idx; i++) {
unsigned long cam_sz;
+ pgprot_t prot = init ? PAGE_KERNEL_X : PAGE_KERNEL;
cam_sz = calc_cam_sz(ram, virt, phys);
if (!dryrun)
- settlbcam(i, virt, phys, cam_sz,
- pgprot_val(PAGE_KERNEL_X), 0);
+ settlbcam(i, virt, phys, cam_sz, pgprot_val(prot), 0);
ram -= cam_sz;
amount_mapped += cam_sz;
@@ -188,8 +204,13 @@ static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt,
if (dryrun)
return amount_mapped;
- loadcam_multi(0, i, max_cam_idx);
- tlbcam_index = i;
+ if (init) {
+ loadcam_multi(0, i, max_cam_idx);
+ tlbcam_index = i;
+ } else {
+ loadcam_multi(0, i, 0);
+ WARN_ON(i > tlbcam_index);
+ }
#ifdef CONFIG_PPC64
get_paca()->tcd.esel_next = i;
@@ -200,12 +221,12 @@ static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt,
return amount_mapped;
}
-unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx, bool dryrun)
+unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx, bool dryrun, bool init)
{
unsigned long virt = PAGE_OFFSET;
phys_addr_t phys = memstart_addr;
- return map_mem_in_cams_addr(phys, virt, ram, max_cam_idx, dryrun);
+ return map_mem_in_cams_addr(phys, virt, ram, max_cam_idx, dryrun, init);
}
#ifdef CONFIG_PPC32
@@ -237,6 +258,11 @@ void __init MMU_init_hw(void)
flush_instruction_cache();
}
+static unsigned long __init tlbcam_sz(int idx)
+{
+ return tlbcam_addrs[idx].limit - tlbcam_addrs[idx].start + 1;
+}
+
void __init adjust_total_lowmem(void)
{
unsigned long ram;
@@ -246,8 +272,8 @@ void __init adjust_total_lowmem(void)
ram = min((phys_addr_t)__max_low_memory, (phys_addr_t)total_lowmem);
i = switch_to_as1();
- __max_low_memory = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, false);
- restore_to_as0(i, 0, 0, 1);
+ __max_low_memory = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, false, true);
+ restore_to_as0(i, 0, NULL, 1);
pr_info("Memory CAM mapping: ");
for (i = 0; i < tlbcam_index - 1; i++)
@@ -258,6 +284,26 @@ void __init adjust_total_lowmem(void)
memblock_set_current_limit(memstart_addr + __max_low_memory);
}
+#ifdef CONFIG_STRICT_KERNEL_RWX
+int mmu_mark_rodata_ro(void)
+{
+ unsigned long remapped;
+
+ remapped = map_mem_in_cams(__max_low_memory, CONFIG_LOWMEM_CAM_NUM, false, false);
+
+ if (WARN_ON(__max_low_memory != remapped))
+ return -EINVAL;
+
+ return 0;
+}
+#endif
+
+int mmu_mark_initmem_nx(void)
+{
+ /* Everything is done in mmu_mark_rodata_ro() */
+ return 0;
+}
+
void setup_initial_memory_limit(phys_addr_t first_memblock_base,
phys_addr_t first_memblock_size)
{
@@ -317,11 +363,11 @@ notrace void __init relocate_init(u64 dt_ptr, phys_addr_t start)
/* map a 64M area for the second relocation */
if (memstart_addr > start)
map_mem_in_cams(0x4000000, CONFIG_LOWMEM_CAM_NUM,
- false);
+ false, true);
else
map_mem_in_cams_addr(start, PAGE_OFFSET + offset,
0x4000000, CONFIG_LOWMEM_CAM_NUM,
- false);
+ false, true);
restore_to_as0(n, offset, __va(dt_ptr), 1);
/* We should never reach here */
panic("Relocation error");
diff --git a/arch/powerpc/mm/nohash/book3e_hugetlbpage.c b/arch/powerpc/mm/nohash/e500_hugetlbpage.c
index 8b88be91b622..a134d28a0e4d 100644
--- a/arch/powerpc/mm/nohash/book3e_hugetlbpage.c
+++ b/arch/powerpc/mm/nohash/e500_hugetlbpage.c
@@ -45,7 +45,9 @@ static inline void book3e_tlb_lock(void)
if (!cpu_has_feature(CPU_FTR_SMT))
return;
- asm volatile("1: lbarx %0, 0, %1;"
+ asm volatile(".machine push;"
+ ".machine e6500;"
+ "1: lbarx %0, 0, %1;"
"cmpwi %0, 0;"
"bne 2f;"
"stbcx. %2, 0, %1;"
@@ -56,6 +58,7 @@ static inline void book3e_tlb_lock(void)
"bne 2b;"
"b 1b;"
"3:"
+ ".machine pop;"
: "=&r" (tmp)
: "r" (&paca->tcd_ptr->lock), "r" (token)
: "memory");
@@ -103,21 +106,11 @@ static inline int book3e_tlb_exists(unsigned long ea, unsigned long pid)
int found = 0;
mtspr(SPRN_MAS6, pid << 16);
- if (mmu_has_feature(MMU_FTR_USE_TLBRSRV)) {
- asm volatile(
- "li %0,0\n"
- "tlbsx. 0,%1\n"
- "bne 1f\n"
- "li %0,1\n"
- "1:\n"
- : "=&r"(found) : "r"(ea));
- } else {
- asm volatile(
- "tlbsx 0,%1\n"
- "mfspr %0,0x271\n"
- "srwi %0,%0,31\n"
- : "=&r"(found) : "r"(ea));
- }
+ asm volatile(
+ "tlbsx 0,%1\n"
+ "mfspr %0,0x271\n"
+ "srwi %0,%0,31\n"
+ : "=&r"(found) : "r"(ea));
return found;
}
@@ -142,7 +135,7 @@ book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, pte_t pte)
tsize = shift - 10;
/*
* We can't be interrupted while we're setting up the MAS
- * regusters or after we've confirmed that no tlb exists.
+ * registers or after we've confirmed that no tlb exists.
*/
local_irq_save(flags);
@@ -169,13 +162,9 @@ book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, pte_t pte)
mtspr(SPRN_MAS1, mas1);
mtspr(SPRN_MAS2, mas2);
- if (mmu_has_feature(MMU_FTR_USE_PAIRED_MAS)) {
- mtspr(SPRN_MAS7_MAS3, mas7_3);
- } else {
- if (mmu_has_feature(MMU_FTR_BIG_PHYS))
- mtspr(SPRN_MAS7, upper_32_bits(mas7_3));
- mtspr(SPRN_MAS3, lower_32_bits(mas7_3));
- }
+ if (mmu_has_feature(MMU_FTR_BIG_PHYS))
+ mtspr(SPRN_MAS7, upper_32_bits(mas7_3));
+ mtspr(SPRN_MAS3, lower_32_bits(mas7_3));
asm volatile ("tlbwe");
@@ -189,7 +178,7 @@ book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, pte_t pte)
*
* This must always be called with the pte lock held.
*/
-void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep)
+void __update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep)
{
if (is_vm_hugetlb_page(vma))
book3e_hugetlb_preload(vma, address, *ptep);
diff --git a/arch/powerpc/mm/nohash/kaslr_booke.c b/arch/powerpc/mm/nohash/kaslr_booke.c
index 4c74e8a5482b..5e4897daaaea 100644
--- a/arch/powerpc/mm/nohash/kaslr_booke.c
+++ b/arch/powerpc/mm/nohash/kaslr_booke.c
@@ -13,13 +13,12 @@
#include <linux/delay.h>
#include <linux/memblock.h>
#include <linux/libfdt.h>
-#include <linux/crash_core.h>
+#include <linux/crash_reserve.h>
+#include <linux/of.h>
+#include <linux/of_fdt.h>
#include <asm/cacheflush.h>
-#include <asm/prom.h>
#include <asm/kdump.h>
#include <mm/mmu_decl.h>
-#include <generated/compile.h>
-#include <generated/utsrelease.h>
struct regions {
unsigned long pa_start;
@@ -36,17 +35,11 @@ struct regions {
int reserved_mem_size_cells;
};
-/* Simplified build-specific string for starting entropy. */
-static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@"
- LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION;
-
struct regions __initdata regions;
static __init void kaslr_get_cmdline(void *fdt)
{
- int node = fdt_path_offset(fdt, "/chosen");
-
- early_init_dt_scan_chosen(node, "chosen", 1, boot_command_line);
+ early_init_dt_scan_chosen(boot_command_line);
}
static unsigned long __init rotate_xor(unsigned long hash, const void *area,
@@ -72,7 +65,8 @@ static unsigned long __init get_boot_seed(void *fdt)
{
unsigned long hash = 0;
- hash = rotate_xor(hash, build_str, sizeof(build_str));
+ /* build-specific string for starting entropy. */
+ hash = rotate_xor(hash, linux_banner, strlen(linux_banner));
hash = rotate_xor(hash, fdt, fdt_totalsize(fdt));
return hash;
@@ -179,12 +173,12 @@ static __init bool overlaps_region(const void *fdt, u32 start,
static void __init get_crash_kernel(void *fdt, unsigned long size)
{
-#ifdef CONFIG_CRASH_CORE
+#ifdef CONFIG_CRASH_RESERVE
unsigned long long crash_size, crash_base;
int ret;
ret = parse_crashkernel(boot_command_line, size, &crash_size,
- &crash_base);
+ &crash_base, NULL, NULL, NULL);
if (ret != 0 || crash_size == 0)
return;
if (crash_base == 0)
@@ -314,10 +308,10 @@ static unsigned long __init kaslr_choose_location(void *dt_ptr, phys_addr_t size
pr_warn("KASLR: No safe seed for randomizing the kernel base.\n");
ram = min_t(phys_addr_t, __max_low_memory, size);
- ram = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, true);
+ ram = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, true, true);
linear_sz = min_t(unsigned long, ram, SZ_512M);
- /* If the linear size is smaller than 64M, do not randmize */
+ /* If the linear size is smaller than 64M, do not randomize */
if (linear_sz < SZ_64M)
return 0;
@@ -382,7 +376,7 @@ notrace void __init kaslr_early_init(void *dt_ptr, phys_addr_t size)
create_kaslr_tlb_entry(1, tlb_virt, tlb_phys);
}
- /* Copy the kernel to it's new location and run */
+ /* Copy the kernel to its new location and run */
memcpy((void *)kernstart_virt_addr, (void *)_stext, kernel_sz);
flush_icache_range(kernstart_virt_addr, kernstart_virt_addr + kernel_sz);
diff --git a/arch/powerpc/mm/nohash/kup.c b/arch/powerpc/mm/nohash/kup.c
new file mode 100644
index 000000000000..c20c4f357fbf
--- /dev/null
+++ b/arch/powerpc/mm/nohash/kup.c
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * This file contains the routines for initializing kernel userspace protection
+ */
+
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/printk.h>
+#include <linux/smp.h>
+
+#include <asm/kup.h>
+#include <asm/smp.h>
+
+#ifdef CONFIG_PPC_KUAP
+void setup_kuap(bool disabled)
+{
+ if (disabled) {
+ if (smp_processor_id() == boot_cpuid)
+ cur_cpu_spec->mmu_features &= ~MMU_FTR_KUAP;
+ return;
+ }
+
+ pr_info("Activating Kernel Userspace Access Protection\n");
+
+ prevent_user_access(KUAP_READ_WRITE);
+}
+#endif
diff --git a/arch/powerpc/mm/nohash/mmu_context.c b/arch/powerpc/mm/nohash/mmu_context.c
index 44b2b5e7cabe..28a96a10c907 100644
--- a/arch/powerpc/mm/nohash/mmu_context.c
+++ b/arch/powerpc/mm/nohash/mmu_context.c
@@ -33,6 +33,7 @@
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
#include <asm/smp.h>
+#include <asm/kup.h>
#include <mm/mmu_decl.h>
@@ -202,25 +203,14 @@ static unsigned int steal_context_up(unsigned int id)
static void set_context(unsigned long id, pgd_t *pgd)
{
if (IS_ENABLED(CONFIG_PPC_8xx)) {
- s16 offset = (s16)(__pa(swapper_pg_dir));
-
- /*
- * Register M_TWB will contain base address of level 1 table minus the
- * lower part of the kernel PGDIR base address, so that all accesses to
- * level 1 table are done relative to lower part of kernel PGDIR base
- * address.
- */
- mtspr(SPRN_M_TWB, __pa(pgd) - offset);
+ mtspr(SPRN_M_TWB, __pa(pgd));
/* Update context */
mtspr(SPRN_M_CASID, id - 1);
/* sync */
mb();
- } else {
- if (IS_ENABLED(CONFIG_40x))
- mb(); /* sync */
-
+ } else if (kuap_is_disabled()) {
mtspr(SPRN_PID, id);
isync();
}
@@ -305,6 +295,9 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next,
if (IS_ENABLED(CONFIG_BDI_SWITCH))
abatron_pteptrs[1] = next->pgd;
set_context(id, next->pgd);
+#if defined(CONFIG_BOOKE) && defined(CONFIG_PPC_KUAP)
+ tsk->thread.pid = id;
+#endif
raw_spin_unlock(&context_lock);
}
@@ -313,15 +306,6 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next,
*/
int init_new_context(struct task_struct *t, struct mm_struct *mm)
{
- /*
- * We have MMU_NO_CONTEXT set to be ~0. Hence check
- * explicitly against context.id == 0. This ensures that we properly
- * initialize context slice details for newly allocated mm's (which will
- * have id == 0) and don't alter context slice inherited via fork (which
- * will have id != 0).
- */
- if (mm->context.id == 0)
- slice_init_new_context_exec(mm);
mm->context.id = MMU_NO_CONTEXT;
mm->context.active = 0;
pte_frag_set(&mm->context, NULL);
@@ -393,21 +377,11 @@ void __init mmu_context_init(void)
/*
* Allocate the maps used by context management
*/
- context_map = memblock_alloc(CTX_MAP_SIZE, SMP_CACHE_BYTES);
- if (!context_map)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- CTX_MAP_SIZE);
- context_mm = memblock_alloc(sizeof(void *) * (LAST_CONTEXT + 1),
+ context_map = memblock_alloc_or_panic(CTX_MAP_SIZE, SMP_CACHE_BYTES);
+ context_mm = memblock_alloc_or_panic(sizeof(void *) * (LAST_CONTEXT + 1),
SMP_CACHE_BYTES);
- if (!context_mm)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- sizeof(void *) * (LAST_CONTEXT + 1));
if (IS_ENABLED(CONFIG_SMP)) {
- stale_map[boot_cpuid] = memblock_alloc(CTX_MAP_SIZE, SMP_CACHE_BYTES);
- if (!stale_map[boot_cpuid])
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- CTX_MAP_SIZE);
-
+ stale_map[boot_cpuid] = memblock_alloc_or_panic(CTX_MAP_SIZE, SMP_CACHE_BYTES);
cpuhp_setup_state_nocalls(CPUHP_POWERPC_MMU_CTX_PREPARE,
"powerpc/mmu/ctx:prepare",
mmu_ctx_cpu_prepare, mmu_ctx_cpu_dead);
diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c
index 5872f69141d5..0a650742f3a0 100644
--- a/arch/powerpc/mm/nohash/tlb.c
+++ b/arch/powerpc/mm/nohash/tlb.c
@@ -37,7 +37,7 @@
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <asm/cputhreads.h>
#include <asm/hugetlb.h>
#include <asm/paca.h>
@@ -49,39 +49,44 @@
* other sizes not listed here. The .ind field is only used on MMUs that have
* indirect page table entries.
*/
-#if defined(CONFIG_PPC_BOOK3E_MMU) || defined(CONFIG_PPC_8xx)
-#ifdef CONFIG_PPC_FSL_BOOK3E
+#ifdef CONFIG_PPC_E500
struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
[MMU_PAGE_4K] = {
.shift = 12,
- .enc = BOOK3E_PAGESZ_4K,
},
[MMU_PAGE_2M] = {
.shift = 21,
- .enc = BOOK3E_PAGESZ_2M,
},
[MMU_PAGE_4M] = {
.shift = 22,
- .enc = BOOK3E_PAGESZ_4M,
},
[MMU_PAGE_16M] = {
.shift = 24,
- .enc = BOOK3E_PAGESZ_16M,
},
[MMU_PAGE_64M] = {
.shift = 26,
- .enc = BOOK3E_PAGESZ_64M,
},
[MMU_PAGE_256M] = {
.shift = 28,
- .enc = BOOK3E_PAGESZ_256M,
},
[MMU_PAGE_1G] = {
.shift = 30,
- .enc = BOOK3E_PAGESZ_1GB,
},
};
-#elif defined(CONFIG_PPC_8xx)
+
+static inline int mmu_get_tsize(int psize)
+{
+ return mmu_psize_defs[psize].shift - 10;
+}
+#else
+static inline int mmu_get_tsize(int psize)
+{
+ /* This isn't used on !Book3E for now */
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_PPC_8xx
struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
[MMU_PAGE_4K] = {
.shift = 12,
@@ -96,78 +101,9 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
.shift = 23,
},
};
-#else
-struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
- [MMU_PAGE_4K] = {
- .shift = 12,
- .ind = 20,
- .enc = BOOK3E_PAGESZ_4K,
- },
- [MMU_PAGE_16K] = {
- .shift = 14,
- .enc = BOOK3E_PAGESZ_16K,
- },
- [MMU_PAGE_64K] = {
- .shift = 16,
- .ind = 28,
- .enc = BOOK3E_PAGESZ_64K,
- },
- [MMU_PAGE_1M] = {
- .shift = 20,
- .enc = BOOK3E_PAGESZ_1M,
- },
- [MMU_PAGE_16M] = {
- .shift = 24,
- .ind = 36,
- .enc = BOOK3E_PAGESZ_16M,
- },
- [MMU_PAGE_256M] = {
- .shift = 28,
- .enc = BOOK3E_PAGESZ_256M,
- },
- [MMU_PAGE_1G] = {
- .shift = 30,
- .enc = BOOK3E_PAGESZ_1GB,
- },
-};
-#endif /* CONFIG_FSL_BOOKE */
-
-static inline int mmu_get_tsize(int psize)
-{
- return mmu_psize_defs[psize].enc;
-}
-#else
-static inline int mmu_get_tsize(int psize)
-{
- /* This isn't used on !Book3E for now */
- return 0;
-}
-#endif /* CONFIG_PPC_BOOK3E_MMU */
-
-/* The variables below are currently only used on 64-bit Book3E
- * though this will probably be made common with other nohash
- * implementations at some point
- */
-#ifdef CONFIG_PPC64
-
-int mmu_linear_psize; /* Page size used for the linear mapping */
-int mmu_pte_psize; /* Page size used for PTE pages */
-int mmu_vmemmap_psize; /* Page size used for the virtual mem map */
-int book3e_htw_mode; /* HW tablewalk? Value is PPC_HTW_* */
-unsigned long linear_map_top; /* Top of linear mapping */
-
-
-/*
- * Number of bytes to add to SPRN_SPRG_TLB_EXFRAME on crit/mcheck/debug
- * exceptions. This is used for bolted and e6500 TLB miss handlers which
- * do not modify this SPRG in the TLB miss code; for other TLB miss handlers,
- * this is set to zero.
- */
-int extlb_level_exc;
-
-#endif /* CONFIG_PPC64 */
+#endif
-#ifdef CONFIG_PPC_FSL_BOOK3E
+#ifdef CONFIG_PPC_E500
/* next_tlbcam_idx is used to round-robin tlbcam entry assignment */
DEFINE_PER_CPU(int, next_tlbcam_idx);
EXPORT_PER_CPU_SYMBOL(next_tlbcam_idx);
@@ -185,6 +121,7 @@ EXPORT_PER_CPU_SYMBOL(next_tlbcam_idx);
* processor
*/
+#ifndef CONFIG_PPC_8xx
/*
* These are the base non-SMP variants of page and mm flushing
*/
@@ -219,6 +156,15 @@ void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
}
EXPORT_SYMBOL(local_flush_tlb_page);
+void local_flush_tlb_page_psize(struct mm_struct *mm,
+ unsigned long vmaddr, int psize)
+{
+ __local_flush_tlb_page(mm, vmaddr, mmu_get_tsize(psize), 0);
+}
+EXPORT_SYMBOL(local_flush_tlb_page_psize);
+
+#endif
+
/*
* And here are the SMP non-local implementations
*/
@@ -343,20 +289,10 @@ EXPORT_SYMBOL(flush_tlb_page);
#endif /* CONFIG_SMP */
-#ifdef CONFIG_PPC_47x
-void __init early_init_mmu_47x(void)
-{
-#ifdef CONFIG_SMP
- unsigned long root = of_get_flat_dt_root();
- if (of_get_flat_dt_prop(root, "cooperative-partition", NULL))
- mmu_clear_feature(MMU_FTR_USE_TLBIVAX_BCAST);
-#endif /* CONFIG_SMP */
-}
-#endif /* CONFIG_PPC_47x */
-
/*
* Flush kernel TLB entries in the given range
*/
+#ifndef CONFIG_PPC_8xx
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
#ifdef CONFIG_SMP
@@ -369,6 +305,7 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
#endif
}
EXPORT_SYMBOL(flush_tlb_kernel_range);
+#endif
/*
* Currently, for range flushing, we just do a full mm flush. This should
@@ -392,397 +329,13 @@ void tlb_flush(struct mmu_gather *tlb)
flush_tlb_mm(tlb->mm);
}
-/*
- * Below are functions specific to the 64-bit variant of Book3E though that
- * may change in the future
- */
-
-#ifdef CONFIG_PPC64
-
-/*
- * Handling of virtual linear page tables or indirect TLB entries
- * flushing when PTE pages are freed
- */
-void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address)
-{
- int tsize = mmu_psize_defs[mmu_pte_psize].enc;
-
- if (book3e_htw_mode != PPC_HTW_NONE) {
- unsigned long start = address & PMD_MASK;
- unsigned long end = address + PMD_SIZE;
- unsigned long size = 1UL << mmu_psize_defs[mmu_pte_psize].shift;
-
- /* This isn't the most optimal, ideally we would factor out the
- * while preempt & CPU mask mucking around, or even the IPI but
- * it will do for now
- */
- while (start < end) {
- __flush_tlb_page(tlb->mm, start, tsize, 1);
- start += size;
- }
- } else {
- unsigned long rmask = 0xf000000000000000ul;
- unsigned long rid = (address & rmask) | 0x1000000000000000ul;
- unsigned long vpte = address & ~rmask;
-
- vpte = (vpte >> (PAGE_SHIFT - 3)) & ~0xffful;
- vpte |= rid;
- __flush_tlb_page(tlb->mm, vpte, tsize, 0);
- }
-}
-
-static void setup_page_sizes(void)
-{
- unsigned int tlb0cfg;
- unsigned int tlb0ps;
- unsigned int eptcfg;
- int i, psize;
-
-#ifdef CONFIG_PPC_FSL_BOOK3E
- unsigned int mmucfg = mfspr(SPRN_MMUCFG);
- int fsl_mmu = mmu_has_feature(MMU_FTR_TYPE_FSL_E);
-
- if (fsl_mmu && (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V1) {
- unsigned int tlb1cfg = mfspr(SPRN_TLB1CFG);
- unsigned int min_pg, max_pg;
-
- min_pg = (tlb1cfg & TLBnCFG_MINSIZE) >> TLBnCFG_MINSIZE_SHIFT;
- max_pg = (tlb1cfg & TLBnCFG_MAXSIZE) >> TLBnCFG_MAXSIZE_SHIFT;
-
- for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
- struct mmu_psize_def *def;
- unsigned int shift;
-
- def = &mmu_psize_defs[psize];
- shift = def->shift;
-
- if (shift == 0 || shift & 1)
- continue;
-
- /* adjust to be in terms of 4^shift Kb */
- shift = (shift - 10) >> 1;
-
- if ((shift >= min_pg) && (shift <= max_pg))
- def->flags |= MMU_PAGE_SIZE_DIRECT;
- }
-
- goto out;
- }
-
- if (fsl_mmu && (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2) {
- u32 tlb1cfg, tlb1ps;
-
- tlb0cfg = mfspr(SPRN_TLB0CFG);
- tlb1cfg = mfspr(SPRN_TLB1CFG);
- tlb1ps = mfspr(SPRN_TLB1PS);
- eptcfg = mfspr(SPRN_EPTCFG);
-
- if ((tlb1cfg & TLBnCFG_IND) && (tlb0cfg & TLBnCFG_PT))
- book3e_htw_mode = PPC_HTW_E6500;
-
- /*
- * We expect 4K subpage size and unrestricted indirect size.
- * The lack of a restriction on indirect size is a Freescale
- * extension, indicated by PSn = 0 but SPSn != 0.
- */
- if (eptcfg != 2)
- book3e_htw_mode = PPC_HTW_NONE;
-
- for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
- struct mmu_psize_def *def = &mmu_psize_defs[psize];
-
- if (!def->shift)
- continue;
-
- if (tlb1ps & (1U << (def->shift - 10))) {
- def->flags |= MMU_PAGE_SIZE_DIRECT;
-
- if (book3e_htw_mode && psize == MMU_PAGE_2M)
- def->flags |= MMU_PAGE_SIZE_INDIRECT;
- }
- }
-
- goto out;
- }
-#endif
-
- tlb0cfg = mfspr(SPRN_TLB0CFG);
- tlb0ps = mfspr(SPRN_TLB0PS);
- eptcfg = mfspr(SPRN_EPTCFG);
-
- /* Look for supported direct sizes */
- for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
- struct mmu_psize_def *def = &mmu_psize_defs[psize];
-
- if (tlb0ps & (1U << (def->shift - 10)))
- def->flags |= MMU_PAGE_SIZE_DIRECT;
- }
-
- /* Indirect page sizes supported ? */
- if ((tlb0cfg & TLBnCFG_IND) == 0 ||
- (tlb0cfg & TLBnCFG_PT) == 0)
- goto out;
-
- book3e_htw_mode = PPC_HTW_IBM;
-
- /* Now, we only deal with one IND page size for each
- * direct size. Hopefully all implementations today are
- * unambiguous, but we might want to be careful in the
- * future.
- */
- for (i = 0; i < 3; i++) {
- unsigned int ps, sps;
-
- sps = eptcfg & 0x1f;
- eptcfg >>= 5;
- ps = eptcfg & 0x1f;
- eptcfg >>= 5;
- if (!ps || !sps)
- continue;
- for (psize = 0; psize < MMU_PAGE_COUNT; psize++) {
- struct mmu_psize_def *def = &mmu_psize_defs[psize];
-
- if (ps == (def->shift - 10))
- def->flags |= MMU_PAGE_SIZE_INDIRECT;
- if (sps == (def->shift - 10))
- def->ind = ps + 10;
- }
- }
-
-out:
- /* Cleanup array and print summary */
- pr_info("MMU: Supported page sizes\n");
- for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
- struct mmu_psize_def *def = &mmu_psize_defs[psize];
- const char *__page_type_names[] = {
- "unsupported",
- "direct",
- "indirect",
- "direct & indirect"
- };
- if (def->flags == 0) {
- def->shift = 0;
- continue;
- }
- pr_info(" %8ld KB as %s\n", 1ul << (def->shift - 10),
- __page_type_names[def->flags & 0x3]);
- }
-}
-
-static void setup_mmu_htw(void)
-{
- /*
- * If we want to use HW tablewalk, enable it by patching the TLB miss
- * handlers to branch to the one dedicated to it.
- */
-
- switch (book3e_htw_mode) {
- case PPC_HTW_IBM:
- patch_exception(0x1c0, exc_data_tlb_miss_htw_book3e);
- patch_exception(0x1e0, exc_instruction_tlb_miss_htw_book3e);
- break;
-#ifdef CONFIG_PPC_FSL_BOOK3E
- case PPC_HTW_E6500:
- extlb_level_exc = EX_TLB_SIZE;
- patch_exception(0x1c0, exc_data_tlb_miss_e6500_book3e);
- patch_exception(0x1e0, exc_instruction_tlb_miss_e6500_book3e);
- break;
-#endif
- }
- pr_info("MMU: Book3E HW tablewalk %s\n",
- book3e_htw_mode != PPC_HTW_NONE ? "enabled" : "not supported");
-}
-
-/*
- * Early initialization of the MMU TLB code
- */
-static void early_init_this_mmu(void)
-{
- unsigned int mas4;
-
- /* Set MAS4 based on page table setting */
-
- mas4 = 0x4 << MAS4_WIMGED_SHIFT;
- switch (book3e_htw_mode) {
- case PPC_HTW_E6500:
- mas4 |= MAS4_INDD;
- mas4 |= BOOK3E_PAGESZ_2M << MAS4_TSIZED_SHIFT;
- mas4 |= MAS4_TLBSELD(1);
- mmu_pte_psize = MMU_PAGE_2M;
- break;
-
- case PPC_HTW_IBM:
- mas4 |= MAS4_INDD;
- mas4 |= BOOK3E_PAGESZ_1M << MAS4_TSIZED_SHIFT;
- mmu_pte_psize = MMU_PAGE_1M;
- break;
-
- case PPC_HTW_NONE:
- mas4 |= BOOK3E_PAGESZ_4K << MAS4_TSIZED_SHIFT;
- mmu_pte_psize = mmu_virtual_psize;
- break;
- }
- mtspr(SPRN_MAS4, mas4);
-
-#ifdef CONFIG_PPC_FSL_BOOK3E
- if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
- unsigned int num_cams;
- bool map = true;
-
- /* use a quarter of the TLBCAM for bolted linear map */
- num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4;
-
- /*
- * Only do the mapping once per core, or else the
- * transient mapping would cause problems.
- */
-#ifdef CONFIG_SMP
- if (hweight32(get_tensr()) > 1)
- map = false;
-#endif
-
- if (map)
- linear_map_top = map_mem_in_cams(linear_map_top,
- num_cams, false);
- }
-#endif
-
- /* A sync won't hurt us after mucking around with
- * the MMU configuration
- */
- mb();
-}
-
-static void __init early_init_mmu_global(void)
-{
- /* XXX This will have to be decided at runtime, but right
- * now our boot and TLB miss code hard wires it. Ideally
- * we should find out a suitable page size and patch the
- * TLB miss code (either that or use the PACA to store
- * the value we want)
- */
- mmu_linear_psize = MMU_PAGE_1G;
-
- /* XXX This should be decided at runtime based on supported
- * page sizes in the TLB, but for now let's assume 16M is
- * always there and a good fit (which it probably is)
- *
- * Freescale booke only supports 4K pages in TLB0, so use that.
- */
- if (mmu_has_feature(MMU_FTR_TYPE_FSL_E))
- mmu_vmemmap_psize = MMU_PAGE_4K;
- else
- mmu_vmemmap_psize = MMU_PAGE_16M;
-
- /* XXX This code only checks for TLB 0 capabilities and doesn't
- * check what page size combos are supported by the HW. It
- * also doesn't handle the case where a separate array holds
- * the IND entries from the array loaded by the PT.
- */
- /* Look for supported page sizes */
- setup_page_sizes();
-
- /* Look for HW tablewalk support */
- setup_mmu_htw();
-
-#ifdef CONFIG_PPC_FSL_BOOK3E
- if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
- if (book3e_htw_mode == PPC_HTW_NONE) {
- extlb_level_exc = EX_TLB_SIZE;
- patch_exception(0x1c0, exc_data_tlb_miss_bolted_book3e);
- patch_exception(0x1e0,
- exc_instruction_tlb_miss_bolted_book3e);
- }
- }
-#endif
-
- /* Set the global containing the top of the linear mapping
- * for use by the TLB miss code
- */
- linear_map_top = memblock_end_of_DRAM();
-
- ioremap_bot = IOREMAP_BASE;
-}
-
-static void __init early_mmu_set_memory_limit(void)
-{
-#ifdef CONFIG_PPC_FSL_BOOK3E
- if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
- /*
- * Limit memory so we dont have linear faults.
- * Unlike memblock_set_current_limit, which limits
- * memory available during early boot, this permanently
- * reduces the memory available to Linux. We need to
- * do this because highmem is not supported on 64-bit.
- */
- memblock_enforce_memory_limit(linear_map_top);
- }
-#endif
-
- memblock_set_current_limit(linear_map_top);
-}
-
-/* boot cpu only */
-void __init early_init_mmu(void)
-{
- early_init_mmu_global();
- early_init_this_mmu();
- early_mmu_set_memory_limit();
-}
-
-void early_init_mmu_secondary(void)
-{
- early_init_this_mmu();
-}
-
-void setup_initial_memory_limit(phys_addr_t first_memblock_base,
- phys_addr_t first_memblock_size)
-{
- /* On non-FSL Embedded 64-bit, we adjust the RMA size to match
- * the bolted TLB entry. We know for now that only 1G
- * entries are supported though that may eventually
- * change.
- *
- * on FSL Embedded 64-bit, usually all RAM is bolted, but with
- * unusual memory sizes it's possible for some RAM to not be mapped
- * (such RAM is not used at all by Linux, since we don't support
- * highmem on 64-bit). We limit ppc64_rma_size to what would be
- * mappable if this memblock is the only one. Additional memblocks
- * can only increase, not decrease, the amount that ends up getting
- * mapped. We still limit max to 1G even if we'll eventually map
- * more. This is due to what the early init code is set up to do.
- *
- * We crop it to the size of the first MEMBLOCK to
- * avoid going over total available memory just in case...
- */
-#ifdef CONFIG_PPC_FSL_BOOK3E
- if (early_mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
- unsigned long linear_sz;
- unsigned int num_cams;
-
- /* use a quarter of the TLBCAM for bolted linear map */
- num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4;
-
- linear_sz = map_mem_in_cams(first_memblock_size, num_cams,
- true);
-
- ppc64_rma_size = min_t(u64, linear_sz, 0x40000000);
- } else
-#endif
- ppc64_rma_size = min_t(u64, first_memblock_size, 0x40000000);
-
- /* Finally limit subsequent allocations */
- memblock_set_current_limit(first_memblock_base + ppc64_rma_size);
-}
-#else /* ! CONFIG_PPC64 */
+#ifndef CONFIG_PPC64
void __init early_init_mmu(void)
{
-#ifdef CONFIG_PPC_47x
- early_init_mmu_47x();
-#endif
+ unsigned long root = of_get_flat_dt_root();
-#ifdef CONFIG_PPC_MM_SLICES
- mm_ctx_set_slb_addr_limit(&init_mm.context, SLB_ADDR_LIMIT_DEFAULT);
-#endif
+ if (IS_ENABLED(CONFIG_PPC_47x) && IS_ENABLED(CONFIG_SMP) &&
+ of_get_flat_dt_prop(root, "cooperative-partition", NULL))
+ mmu_clear_feature(MMU_FTR_USE_TLBIVAX_BCAST);
}
#endif /* CONFIG_PPC64 */
diff --git a/arch/powerpc/mm/nohash/tlb_64e.c b/arch/powerpc/mm/nohash/tlb_64e.c
new file mode 100644
index 000000000000..4f925adf2695
--- /dev/null
+++ b/arch/powerpc/mm/nohash/tlb_64e.c
@@ -0,0 +1,314 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2008,2009 Ben Herrenschmidt <benh@kernel.crashing.org>
+ * IBM Corp.
+ *
+ * Derived from arch/ppc/mm/init.c:
+ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ * and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ * Copyright (C) 1996 Paul Mackerras
+ *
+ * Derived from "arch/i386/mm/init.c"
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ */
+
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/pagemap.h>
+#include <linux/memblock.h>
+
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+#include <asm/text-patching.h>
+#include <asm/cputhreads.h>
+
+#include <mm/mmu_decl.h>
+
+/* The variables below are currently only used on 64-bit Book3E
+ * though this will probably be made common with other nohash
+ * implementations at some point
+ */
+static int mmu_pte_psize; /* Page size used for PTE pages */
+int mmu_vmemmap_psize; /* Page size used for the virtual mem map */
+int book3e_htw_mode; /* HW tablewalk? Value is PPC_HTW_* */
+unsigned long linear_map_top; /* Top of linear mapping */
+
+
+/*
+ * Number of bytes to add to SPRN_SPRG_TLB_EXFRAME on crit/mcheck/debug
+ * exceptions. This is used for bolted and e6500 TLB miss handlers which
+ * do not modify this SPRG in the TLB miss code; for other TLB miss handlers,
+ * this is set to zero.
+ */
+int extlb_level_exc;
+
+/*
+ * Handling of virtual linear page tables or indirect TLB entries
+ * flushing when PTE pages are freed
+ */
+void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address)
+{
+ int tsize = mmu_psize_defs[mmu_pte_psize].shift - 10;
+
+ if (book3e_htw_mode != PPC_HTW_NONE) {
+ unsigned long start = address & PMD_MASK;
+ unsigned long end = address + PMD_SIZE;
+ unsigned long size = 1UL << mmu_psize_defs[mmu_pte_psize].shift;
+
+ /* This isn't the most optimal, ideally we would factor out the
+ * while preempt & CPU mask mucking around, or even the IPI but
+ * it will do for now
+ */
+ while (start < end) {
+ __flush_tlb_page(tlb->mm, start, tsize, 1);
+ start += size;
+ }
+ } else {
+ unsigned long rmask = 0xf000000000000000ul;
+ unsigned long rid = (address & rmask) | 0x1000000000000000ul;
+ unsigned long vpte = address & ~rmask;
+
+ vpte = (vpte >> (PAGE_SHIFT - 3)) & ~0xffful;
+ vpte |= rid;
+ __flush_tlb_page(tlb->mm, vpte, tsize, 0);
+ }
+}
+
+static void __init setup_page_sizes(void)
+{
+ unsigned int tlb0cfg;
+ unsigned int eptcfg;
+ int psize;
+
+ unsigned int mmucfg = mfspr(SPRN_MMUCFG);
+
+ if ((mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V1) {
+ unsigned int tlb1cfg = mfspr(SPRN_TLB1CFG);
+ unsigned int min_pg, max_pg;
+
+ min_pg = (tlb1cfg & TLBnCFG_MINSIZE) >> TLBnCFG_MINSIZE_SHIFT;
+ max_pg = (tlb1cfg & TLBnCFG_MAXSIZE) >> TLBnCFG_MAXSIZE_SHIFT;
+
+ for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+ struct mmu_psize_def *def;
+ unsigned int shift;
+
+ def = &mmu_psize_defs[psize];
+ shift = def->shift;
+
+ if (shift == 0 || shift & 1)
+ continue;
+
+ /* adjust to be in terms of 4^shift Kb */
+ shift = (shift - 10) >> 1;
+
+ if ((shift >= min_pg) && (shift <= max_pg))
+ def->flags |= MMU_PAGE_SIZE_DIRECT;
+ }
+
+ goto out;
+ }
+
+ if ((mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2) {
+ u32 tlb1cfg, tlb1ps;
+
+ tlb0cfg = mfspr(SPRN_TLB0CFG);
+ tlb1cfg = mfspr(SPRN_TLB1CFG);
+ tlb1ps = mfspr(SPRN_TLB1PS);
+ eptcfg = mfspr(SPRN_EPTCFG);
+
+ if ((tlb1cfg & TLBnCFG_IND) && (tlb0cfg & TLBnCFG_PT))
+ book3e_htw_mode = PPC_HTW_E6500;
+
+ /*
+ * We expect 4K subpage size and unrestricted indirect size.
+ * The lack of a restriction on indirect size is a Freescale
+ * extension, indicated by PSn = 0 but SPSn != 0.
+ */
+ if (eptcfg != 2)
+ book3e_htw_mode = PPC_HTW_NONE;
+
+ for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+ struct mmu_psize_def *def = &mmu_psize_defs[psize];
+
+ if (!def->shift)
+ continue;
+
+ if (tlb1ps & (1U << (def->shift - 10))) {
+ def->flags |= MMU_PAGE_SIZE_DIRECT;
+
+ if (book3e_htw_mode && psize == MMU_PAGE_2M)
+ def->flags |= MMU_PAGE_SIZE_INDIRECT;
+ }
+ }
+
+ goto out;
+ }
+out:
+ /* Cleanup array and print summary */
+ pr_info("MMU: Supported page sizes\n");
+ for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+ struct mmu_psize_def *def = &mmu_psize_defs[psize];
+ const char *__page_type_names[] = {
+ "unsupported",
+ "direct",
+ "indirect",
+ "direct & indirect"
+ };
+ if (def->flags == 0) {
+ def->shift = 0;
+ continue;
+ }
+ pr_info(" %8ld KB as %s\n", 1ul << (def->shift - 10),
+ __page_type_names[def->flags & 0x3]);
+ }
+}
+
+/*
+ * Early initialization of the MMU TLB code
+ */
+static void early_init_this_mmu(void)
+{
+ unsigned int mas4;
+
+ /* Set MAS4 based on page table setting */
+
+ mas4 = 0x4 << MAS4_WIMGED_SHIFT;
+ switch (book3e_htw_mode) {
+ case PPC_HTW_E6500:
+ mas4 |= MAS4_INDD;
+ mas4 |= BOOK3E_PAGESZ_2M << MAS4_TSIZED_SHIFT;
+ mas4 |= MAS4_TLBSELD(1);
+ mmu_pte_psize = MMU_PAGE_2M;
+ break;
+
+ case PPC_HTW_NONE:
+ mas4 |= BOOK3E_PAGESZ_4K << MAS4_TSIZED_SHIFT;
+ mmu_pte_psize = mmu_virtual_psize;
+ break;
+ }
+ mtspr(SPRN_MAS4, mas4);
+
+ unsigned int num_cams;
+ bool map = true;
+
+ /* use a quarter of the TLBCAM for bolted linear map */
+ num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4;
+
+ /*
+ * Only do the mapping once per core, or else the
+ * transient mapping would cause problems.
+ */
+#ifdef CONFIG_SMP
+ if (hweight32(get_tensr()) > 1)
+ map = false;
+#endif
+
+ if (map)
+ linear_map_top = map_mem_in_cams(linear_map_top,
+ num_cams, false, true);
+
+ /* A sync won't hurt us after mucking around with
+ * the MMU configuration
+ */
+ mb();
+}
+
+static void __init early_init_mmu_global(void)
+{
+ /*
+ * Freescale booke only supports 4K pages in TLB0, so use that.
+ */
+ mmu_vmemmap_psize = MMU_PAGE_4K;
+
+ /* XXX This code only checks for TLB 0 capabilities and doesn't
+ * check what page size combos are supported by the HW. It
+ * also doesn't handle the case where a separate array holds
+ * the IND entries from the array loaded by the PT.
+ */
+ /* Look for supported page sizes */
+ setup_page_sizes();
+
+ /*
+ * If we want to use HW tablewalk, enable it by patching the TLB miss
+ * handlers to branch to the one dedicated to it.
+ */
+ extlb_level_exc = EX_TLB_SIZE;
+ switch (book3e_htw_mode) {
+ case PPC_HTW_E6500:
+ patch_exception(0x1c0, exc_data_tlb_miss_e6500_book3e);
+ patch_exception(0x1e0, exc_instruction_tlb_miss_e6500_book3e);
+ break;
+ }
+
+ pr_info("MMU: Book3E HW tablewalk %s\n",
+ book3e_htw_mode != PPC_HTW_NONE ? "enabled" : "not supported");
+
+ /* Set the global containing the top of the linear mapping
+ * for use by the TLB miss code
+ */
+ linear_map_top = memblock_end_of_DRAM();
+
+ ioremap_bot = IOREMAP_BASE;
+}
+
+static void __init early_mmu_set_memory_limit(void)
+{
+ /*
+ * Limit memory so we dont have linear faults.
+ * Unlike memblock_set_current_limit, which limits
+ * memory available during early boot, this permanently
+ * reduces the memory available to Linux. We need to
+ * do this because highmem is not supported on 64-bit.
+ */
+ memblock_enforce_memory_limit(linear_map_top);
+
+ memblock_set_current_limit(linear_map_top);
+}
+
+/* boot cpu only */
+void __init early_init_mmu(void)
+{
+ early_init_mmu_global();
+ early_init_this_mmu();
+ early_mmu_set_memory_limit();
+}
+
+void early_init_mmu_secondary(void)
+{
+ early_init_this_mmu();
+}
+
+void setup_initial_memory_limit(phys_addr_t first_memblock_base,
+ phys_addr_t first_memblock_size)
+{
+ /*
+ * On FSL Embedded 64-bit, usually all RAM is bolted, but with
+ * unusual memory sizes it's possible for some RAM to not be mapped
+ * (such RAM is not used at all by Linux, since we don't support
+ * highmem on 64-bit). We limit ppc64_rma_size to what would be
+ * mappable if this memblock is the only one. Additional memblocks
+ * can only increase, not decrease, the amount that ends up getting
+ * mapped. We still limit max to 1G even if we'll eventually map
+ * more. This is due to what the early init code is set up to do.
+ *
+ * We crop it to the size of the first MEMBLOCK to
+ * avoid going over total available memory just in case...
+ */
+ unsigned long linear_sz;
+ unsigned int num_cams;
+
+ /* use a quarter of the TLBCAM for bolted linear map */
+ num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4;
+
+ linear_sz = map_mem_in_cams(first_memblock_size, num_cams, true, true);
+ ppc64_rma_size = min_t(u64, linear_sz, 0x40000000);
+
+ /* Finally limit subsequent allocations */
+ memblock_set_current_limit(first_memblock_base + ppc64_rma_size);
+}
diff --git a/arch/powerpc/mm/nohash/tlb_low.S b/arch/powerpc/mm/nohash/tlb_low.S
index 4613bf8e9aae..c4d296e73731 100644
--- a/arch/powerpc/mm/nohash/tlb_low.S
+++ b/arch/powerpc/mm/nohash/tlb_low.S
@@ -32,32 +32,7 @@
#include <asm/asm-compat.h>
#include <asm/feature-fixups.h>
-#if defined(CONFIG_40x)
-
-/*
- * 40x implementation needs only tlbil_va
- */
-_GLOBAL(__tlbil_va)
- /* We run the search with interrupts disabled because we have to change
- * the PID and I don't want to preempt when that happens.
- */
- mfmsr r5
- mfspr r6,SPRN_PID
- wrteei 0
- mtspr SPRN_PID,r4
- tlbsx. r3, 0, r3
- mtspr SPRN_PID,r6
- wrtee r5
- bne 1f
- sync
- /* There are only 64 TLB entries, so r3 < 64, which means bit 25 is
- * clear. Since 25 is the V bit in the TLB_TAG, loading this value
- * will invalidate the TLB entry. */
- tlbwe r3, r3, TLB_TAG
- isync
-1: blr
-
-#elif defined(CONFIG_PPC_8xx)
+#if defined(CONFIG_PPC_8xx)
/*
* Nothing to do for 8xx, everything is inline
@@ -186,7 +161,7 @@ _GLOBAL(_tlbivax_bcast)
isync
PPC_TLBIVAX(0, R3)
isync
- eieio
+ mbar
tlbsync
BEGIN_FTR_SECTION
b 1f
@@ -199,7 +174,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_476_DD2)
* Touch enough instruction cache lines to ensure cache hits
*/
1: mflr r9
- bl 2f
+ bcl 20,31,$+4
2: mflr r6
li r7,32
PPC_ICBT(0,R6,R7) /* touch next cache line */
@@ -221,7 +196,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_476_DD2)
blr
#endif /* CONFIG_PPC_47x */
-#elif defined(CONFIG_FSL_BOOKE)
+#elif defined(CONFIG_PPC_85xx)
/*
* FSL BookE implementations.
*
@@ -294,7 +269,7 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_TLBILX)
isync
1: wrtee r10
blr
-#elif defined(CONFIG_PPC_BOOK3E)
+#elif defined(CONFIG_PPC_BOOK3E_64)
/*
* New Book3E (>= 2.06) implementation
*
@@ -355,7 +330,7 @@ _GLOBAL(_tlbivax_bcast)
rlwimi r4,r6,MAS6_SIND_SHIFT,MAS6_SIND
1: mtspr SPRN_MAS6,r4 /* assume AS=0 for now */
PPC_TLBIVAX(0,R3)
- eieio
+ mbar
tlbsync
sync
wrtee r10
@@ -364,12 +339,12 @@ _GLOBAL(_tlbivax_bcast)
#error Unsupported processor type !
#endif
-#if defined(CONFIG_PPC_FSL_BOOK3E)
+#if defined(CONFIG_PPC_E500)
/*
* extern void loadcam_entry(unsigned int index)
*
* Load TLBCAM[index] entry in to the L2 CAM MMU
- * Must preserve r7, r8, r9, r10 and r11
+ * Must preserve r7, r8, r9, r10, r11, r12
*/
_GLOBAL(loadcam_entry)
mflr r5
@@ -401,7 +376,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS)
*
* r3 = first entry to write
* r4 = number of entries to write
- * r5 = temporary tlb entry
+ * r5 = temporary tlb entry (0 means no switch to AS1)
*/
_GLOBAL(loadcam_multi)
mflr r8
@@ -409,12 +384,14 @@ _GLOBAL(loadcam_multi)
mfmsr r11
andi. r11,r11,MSR_IS
bne 10f
+ mr. r12, r5
+ beq 10f
/*
* Set up temporary TLB entry that is the same as what we're
* running from, but in AS=1.
*/
- bl 1f
+ bcl 20,31,$+4
1: mflr r6
tlbsx 0,r8
mfspr r6,SPRN_MAS1
@@ -446,6 +423,8 @@ _GLOBAL(loadcam_multi)
/* Don't return to AS=0 if we were in AS=1 at function start */
andi. r11,r11,MSR_IS
bne 3f
+ cmpwi r12, 0
+ beq 3f
/* Return to AS=0 and clear the temporary entry */
mfmsr r6
diff --git a/arch/powerpc/mm/nohash/tlb_low_64e.S b/arch/powerpc/mm/nohash/tlb_low_64e.S
index bf24451f3e71..de568297d5c5 100644
--- a/arch/powerpc/mm/nohash/tlb_low_64e.S
+++ b/arch/powerpc/mm/nohash/tlb_low_64e.S
@@ -61,7 +61,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)
ld r14,PACAPGD(r13)
std r15,EX_TLB_R15(r12)
std r10,EX_TLB_CR(r12)
-#ifdef CONFIG_PPC_FSL_BOOK3E
START_BTB_FLUSH_SECTION
mfspr r11, SPRN_SRR1
andi. r10,r11,MSR_PR
@@ -70,14 +69,11 @@ START_BTB_FLUSH_SECTION
1:
END_BTB_FLUSH_SECTION
std r7,EX_TLB_R7(r12)
-#endif
.endm
.macro tlb_epilog_bolted
ld r14,EX_TLB_CR(r12)
-#ifdef CONFIG_PPC_FSL_BOOK3E
ld r7,EX_TLB_R7(r12)
-#endif
ld r10,EX_TLB_R10(r12)
ld r11,EX_TLB_R11(r12)
ld r13,EX_TLB_R13(r12)
@@ -128,6 +124,13 @@ END_BTB_FLUSH_SECTION
bne tlb_miss_kernel_bolted
+tlb_miss_user_bolted:
+#ifdef CONFIG_PPC_KUAP
+ mfspr r10,SPRN_MAS1
+ rlwinm. r10,r10,0,0x3fff0000
+ beq- tlb_miss_fault_bolted /* KUAP fault */
+#endif
+
tlb_miss_common_bolted:
/*
* This is the guts of the TLB miss handler for bolted-linear.
@@ -145,16 +148,7 @@ tlb_miss_common_bolted:
clrrdi r15,r15,3
beq tlb_miss_fault_bolted /* No PGDIR, bail */
-BEGIN_MMU_FTR_SECTION
- /* Set the TLB reservation and search for existing entry. Then load
- * the entry.
- */
- PPC_TLBSRX_DOT(0,R16)
- ldx r14,r14,r15 /* grab pgd entry */
- beq tlb_miss_done_bolted /* tlb exists already, bail */
-MMU_FTR_SECTION_ELSE
ldx r14,r14,r15 /* grab pgd entry */
-ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV)
rldicl r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3
clrrdi r15,r15,3
@@ -215,14 +209,15 @@ itlb_miss_kernel_bolted:
tlb_miss_kernel_bolted:
mfspr r10,SPRN_MAS1
ld r14,PACA_KERNELPGD(r13)
- cmpldi cr0,r15,8 /* Check for vmalloc region */
+ srdi r15,r16,44 /* get kernel region */
+ andi. r15,r15,1 /* Check for vmalloc region */
rlwinm r10,r10,0,16,1 /* Clear TID */
mtspr SPRN_MAS1,r10
- beq+ tlb_miss_common_bolted
+ bne+ tlb_miss_common_bolted
tlb_miss_fault_bolted:
/* We need to check if it was an instruction miss */
- andi. r10,r11,_PAGE_EXEC|_PAGE_BAP_SX
+ andi. r10,r11,_PAGE_BAP_UX|_PAGE_BAP_SX
bne itlb_miss_fault_bolted
dtlb_miss_fault_bolted:
tlb_epilog_bolted
@@ -239,17 +234,16 @@ itlb_miss_fault_bolted:
srdi r15,r16,60 /* get region */
bne- itlb_miss_fault_bolted
- li r11,_PAGE_PRESENT|_PAGE_EXEC /* Base perm */
+ li r11,_PAGE_PRESENT|_PAGE_BAP_UX /* Base perm */
/* We do the user/kernel test for the PID here along with the RW test
*/
cmpldi cr0,r15,0 /* Check for user region */
oris r11,r11,_PAGE_ACCESSED@h
- beq tlb_miss_common_bolted
+ beq tlb_miss_user_bolted
b itlb_miss_kernel_bolted
-#ifdef CONFIG_PPC_FSL_BOOK3E
/*
* TLB miss handling for e6500 and derivatives, using hardware tablewalk.
*
@@ -357,7 +351,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_EMB_HV,CPU_FTR_EMB_HV,532)
mfspr r15,SPRN_MAS2
isync
- tlbilxva 0,r15
+ PPC_TLBILX_VA(0,R15)
isync
mtspr SPRN_MAS6,r10
@@ -456,11 +450,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_SMT)
tlb_miss_huge_e6500:
beq tlb_miss_fault_e6500
- li r10,1
- andi. r15,r14,HUGEPD_SHIFT_MASK@l /* r15 = psize */
- rldimi r14,r10,63,0 /* Set PD_HUGE */
- xor r14,r14,r15 /* Clear size bits */
- ldx r14,0,r14
+ rlwinm r15,r14,32-_PAGE_PSIZE_SHIFT,0x1e
/*
* Now we build the MAS for a huge page.
@@ -471,7 +461,6 @@ tlb_miss_huge_e6500:
* MAS 2,3+7: Needs to be redone similar to non-tablewalk handler
*/
- subi r15,r15,10 /* Convert psize to tsize */
mfspr r10,SPRN_MAS1
rlwinm r10,r10,0,~MAS1_IND
rlwimi r10,r15,MAS1_TSIZE_SHIFT,MAS1_TSIZE_MASK
@@ -500,7 +489,9 @@ tlb_miss_huge_e6500:
tlb_miss_kernel_e6500:
ld r14,PACA_KERNELPGD(r13)
- cmpldi cr1,r15,8 /* Check for vmalloc region */
+ srdi r15,r16,44 /* get kernel region */
+ xoris r15,r15,0xc /* Check for vmalloc region */
+ cmplwi cr1,r15,1
beq+ cr1,tlb_miss_common_e6500
tlb_miss_fault_e6500:
@@ -514,237 +505,6 @@ dtlb_miss_fault_e6500:
itlb_miss_fault_e6500:
tlb_epilog_bolted
b exc_instruction_storage_book3e
-#endif /* CONFIG_PPC_FSL_BOOK3E */
-
-/**********************************************************************
- * *
- * TLB miss handling for Book3E with TLB reservation and HES support *
- * *
- **********************************************************************/
-
-
-/* Data TLB miss */
- START_EXCEPTION(data_tlb_miss)
- TLB_MISS_PROLOG
-
- /* Now we handle the fault proper. We only save DEAR in normal
- * fault case since that's the only interesting values here.
- * We could probably also optimize by not saving SRR0/1 in the
- * linear mapping case but I'll leave that for later
- */
- mfspr r14,SPRN_ESR
- mfspr r16,SPRN_DEAR /* get faulting address */
- srdi r15,r16,60 /* get region */
- cmpldi cr0,r15,0xc /* linear mapping ? */
- beq tlb_load_linear /* yes -> go to linear map load */
-
- /* The page tables are mapped virtually linear. At this point, though,
- * we don't know whether we are trying to fault in a first level
- * virtual address or a virtual page table address. We can get that
- * from bit 0x1 of the region ID which we have set for a page table
- */
- andi. r10,r15,0x1
- bne- virt_page_table_tlb_miss
-
- std r14,EX_TLB_ESR(r12); /* save ESR */
- std r16,EX_TLB_DEAR(r12); /* save DEAR */
-
- /* We need _PAGE_PRESENT and _PAGE_ACCESSED set */
- li r11,_PAGE_PRESENT
- oris r11,r11,_PAGE_ACCESSED@h
-
- /* We do the user/kernel test for the PID here along with the RW test
- */
- cmpldi cr0,r15,0 /* Check for user region */
-
- /* We pre-test some combination of permissions to avoid double
- * faults:
- *
- * We move the ESR:ST bit into the position of _PAGE_BAP_SW in the PTE
- * ESR_ST is 0x00800000
- * _PAGE_BAP_SW is 0x00000010
- * So the shift is >> 19. This tests for supervisor writeability.
- * If the page happens to be supervisor writeable and not user
- * writeable, we will take a new fault later, but that should be
- * a rare enough case.
- *
- * We also move ESR_ST in _PAGE_DIRTY position
- * _PAGE_DIRTY is 0x00001000 so the shift is >> 11
- *
- * MAS1 is preset for all we need except for TID that needs to
- * be cleared for kernel translations
- */
- rlwimi r11,r14,32-19,27,27
- rlwimi r11,r14,32-16,19,19
- beq normal_tlb_miss
- /* XXX replace the RMW cycles with immediate loads + writes */
-1: mfspr r10,SPRN_MAS1
- cmpldi cr0,r15,8 /* Check for vmalloc region */
- rlwinm r10,r10,0,16,1 /* Clear TID */
- mtspr SPRN_MAS1,r10
- beq+ normal_tlb_miss
-
- /* We got a crappy address, just fault with whatever DEAR and ESR
- * are here
- */
- TLB_MISS_EPILOG_ERROR
- b exc_data_storage_book3e
-
-/* Instruction TLB miss */
- START_EXCEPTION(instruction_tlb_miss)
- TLB_MISS_PROLOG
-
- /* If we take a recursive fault, the second level handler may need
- * to know whether we are handling a data or instruction fault in
- * order to get to the right store fault handler. We provide that
- * info by writing a crazy value in ESR in our exception frame
- */
- li r14,-1 /* store to exception frame is done later */
-
- /* Now we handle the fault proper. We only save DEAR in the non
- * linear mapping case since we know the linear mapping case will
- * not re-enter. We could indeed optimize and also not save SRR0/1
- * in the linear mapping case but I'll leave that for later
- *
- * Faulting address is SRR0 which is already in r16
- */
- srdi r15,r16,60 /* get region */
- cmpldi cr0,r15,0xc /* linear mapping ? */
- beq tlb_load_linear /* yes -> go to linear map load */
-
- /* We do the user/kernel test for the PID here along with the RW test
- */
- li r11,_PAGE_PRESENT|_PAGE_EXEC /* Base perm */
- oris r11,r11,_PAGE_ACCESSED@h
-
- cmpldi cr0,r15,0 /* Check for user region */
- std r14,EX_TLB_ESR(r12) /* write crazy -1 to frame */
- beq normal_tlb_miss
-
- li r11,_PAGE_PRESENT|_PAGE_BAP_SX /* Base perm */
- oris r11,r11,_PAGE_ACCESSED@h
- /* XXX replace the RMW cycles with immediate loads + writes */
- mfspr r10,SPRN_MAS1
- cmpldi cr0,r15,8 /* Check for vmalloc region */
- rlwinm r10,r10,0,16,1 /* Clear TID */
- mtspr SPRN_MAS1,r10
- beq+ normal_tlb_miss
-
- /* We got a crappy address, just fault */
- TLB_MISS_EPILOG_ERROR
- b exc_instruction_storage_book3e
-
-/*
- * This is the guts of the first-level TLB miss handler for direct
- * misses. We are entered with:
- *
- * r16 = faulting address
- * r15 = region ID
- * r14 = crap (free to use)
- * r13 = PACA
- * r12 = TLB exception frame in PACA
- * r11 = PTE permission mask
- * r10 = crap (free to use)
- */
-normal_tlb_miss:
- /* So we first construct the page table address. We do that by
- * shifting the bottom of the address (not the region ID) by
- * PAGE_SHIFT-3, clearing the bottom 3 bits (get a PTE ptr) and
- * or'ing the fourth high bit.
- *
- * NOTE: For 64K pages, we do things slightly differently in
- * order to handle the weird page table format used by linux
- */
- ori r10,r15,0x1
- rldicl r14,r16,64-(PAGE_SHIFT-3),PAGE_SHIFT-3+4
- sldi r15,r10,60
- clrrdi r14,r14,3
- or r10,r15,r14
-
-BEGIN_MMU_FTR_SECTION
- /* Set the TLB reservation and search for existing entry. Then load
- * the entry.
- */
- PPC_TLBSRX_DOT(0,R16)
- ld r14,0(r10)
- beq normal_tlb_miss_done
-MMU_FTR_SECTION_ELSE
- ld r14,0(r10)
-ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV)
-
-finish_normal_tlb_miss:
- /* Check if required permissions are met */
- andc. r15,r11,r14
- bne- normal_tlb_miss_access_fault
-
- /* Now we build the MAS:
- *
- * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG
- * MAS 1 : Almost fully setup
- * - PID already updated by caller if necessary
- * - TSIZE need change if !base page size, not
- * yet implemented for now
- * MAS 2 : Defaults not useful, need to be redone
- * MAS 3+7 : Needs to be done
- *
- * TODO: mix up code below for better scheduling
- */
- clrrdi r11,r16,12 /* Clear low crap in EA */
- rlwimi r11,r14,32-19,27,31 /* Insert WIMGE */
- mtspr SPRN_MAS2,r11
-
- /* Check page size, if not standard, update MAS1 */
- rldicl r11,r14,64-8,64-8
- cmpldi cr0,r11,BOOK3E_PAGESZ_4K
- beq- 1f
- mfspr r11,SPRN_MAS1
- rlwimi r11,r14,31,21,24
- rlwinm r11,r11,0,21,19
- mtspr SPRN_MAS1,r11
-1:
- /* Move RPN in position */
- rldicr r11,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT
- clrldi r15,r11,12 /* Clear crap at the top */
- rlwimi r15,r14,32-8,22,25 /* Move in U bits */
- rlwimi r15,r14,32-2,26,31 /* Move in BAP bits */
-
- /* Mask out SW and UW if !DIRTY (XXX optimize this !) */
- andi. r11,r14,_PAGE_DIRTY
- bne 1f
- li r11,MAS3_SW|MAS3_UW
- andc r15,r15,r11
-1:
-BEGIN_MMU_FTR_SECTION
- srdi r16,r15,32
- mtspr SPRN_MAS3,r15
- mtspr SPRN_MAS7,r16
-MMU_FTR_SECTION_ELSE
- mtspr SPRN_MAS7_MAS3,r15
-ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS)
-
- tlbwe
-
-normal_tlb_miss_done:
- /* We don't bother with restoring DEAR or ESR since we know we are
- * level 0 and just going back to userland. They are only needed
- * if you are going to take an access fault
- */
- TLB_MISS_EPILOG_SUCCESS
- rfi
-
-normal_tlb_miss_access_fault:
- /* We need to check if it was an instruction miss */
- andi. r10,r11,_PAGE_EXEC
- bne 1f
- ld r14,EX_TLB_DEAR(r12)
- ld r15,EX_TLB_ESR(r12)
- mtspr SPRN_DEAR,r14
- mtspr SPRN_ESR,r15
- TLB_MISS_EPILOG_ERROR
- b exc_data_storage_book3e
-1: TLB_MISS_EPILOG_ERROR
- b exc_instruction_storage_book3e
-
/*
* This is the guts of the second-level TLB miss handler for direct
@@ -772,6 +532,7 @@ normal_tlb_miss_access_fault:
*/
virt_page_table_tlb_miss:
/* Are we hitting a kernel page table ? */
+ srdi r15,r16,60
andi. r10,r15,0x8
/* The cool thing now is that r10 contains 0 for user and 8 for kernel,
@@ -786,19 +547,22 @@ virt_page_table_tlb_miss:
mfspr r10,SPRN_MAS1
rlwinm r10,r10,0,16,1 /* Clear TID */
mtspr SPRN_MAS1,r10
+#ifdef CONFIG_PPC_KUAP
+ b 2f
1:
-BEGIN_MMU_FTR_SECTION
- /* Search if we already have a TLB entry for that virtual address, and
- * if we do, bail out.
- */
- PPC_TLBSRX_DOT(0,R16)
- beq virt_page_table_tlb_miss_done
-END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV)
+ mfspr r10,SPRN_MAS1
+ rlwinm. r10,r10,0,0x3fff0000
+ beq- virt_page_table_tlb_miss_fault /* KUAP fault */
+2:
+#else
+1:
+#endif
/* Now, we need to walk the page tables. First check if we are in
* range.
*/
- rldicl. r10,r16,64-(VPTE_INDEX_SIZE+3),VPTE_INDEX_SIZE+3+4
+ rldicl r10,r16,64-(VPTE_INDEX_SIZE+3),VPTE_INDEX_SIZE+3+4
+ cmpldi r10,0x80
bne- virt_page_table_tlb_miss_fault
/* Get the PGD pointer */
@@ -844,41 +608,12 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV)
clrldi r11,r15,4 /* remove region ID from RPN */
ori r10,r11,1 /* Or-in SR */
-BEGIN_MMU_FTR_SECTION
srdi r16,r10,32
mtspr SPRN_MAS3,r10
mtspr SPRN_MAS7,r16
-MMU_FTR_SECTION_ELSE
- mtspr SPRN_MAS7_MAS3,r10
-ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS)
tlbwe
-BEGIN_MMU_FTR_SECTION
-virt_page_table_tlb_miss_done:
-
- /* We have overridden MAS2:EPN but currently our primary TLB miss
- * handler will always restore it so that should not be an issue,
- * if we ever optimize the primary handler to not write MAS2 on
- * some cases, we'll have to restore MAS2:EPN here based on the
- * original fault's DEAR. If we do that we have to modify the
- * ITLB miss handler to also store SRR0 in the exception frame
- * as DEAR.
- *
- * However, one nasty thing we did is we cleared the reservation
- * (well, potentially we did). We do a trick here thus if we
- * are not a level 0 exception (we interrupted the TLB miss) we
- * offset the return address by -4 in order to replay the tlbsrx
- * instruction there
- */
- subf r10,r13,r12
- cmpldi cr0,r10,PACA_EXTLB+EX_TLB_SIZE
- bne- 1f
- ld r11,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13)
- addi r10,r11,-4
- std r10,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13)
-1:
-END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV)
/* Return to caller, normal case */
TLB_MISS_EPILOG_SUCCESS
rfi
@@ -927,198 +662,6 @@ virt_page_table_tlb_miss_whacko_fault:
TLB_MISS_EPILOG_ERROR
b exc_data_storage_book3e
-
-/**************************************************************
- * *
- * TLB miss handling for Book3E with hw page table support *
- * *
- **************************************************************/
-
-
-/* Data TLB miss */
- START_EXCEPTION(data_tlb_miss_htw)
- TLB_MISS_PROLOG
-
- /* Now we handle the fault proper. We only save DEAR in normal
- * fault case since that's the only interesting values here.
- * We could probably also optimize by not saving SRR0/1 in the
- * linear mapping case but I'll leave that for later
- */
- mfspr r14,SPRN_ESR
- mfspr r16,SPRN_DEAR /* get faulting address */
- srdi r11,r16,60 /* get region */
- cmpldi cr0,r11,0xc /* linear mapping ? */
- beq tlb_load_linear /* yes -> go to linear map load */
-
- /* We do the user/kernel test for the PID here along with the RW test
- */
- cmpldi cr0,r11,0 /* Check for user region */
- ld r15,PACAPGD(r13) /* Load user pgdir */
- beq htw_tlb_miss
-
- /* XXX replace the RMW cycles with immediate loads + writes */
-1: mfspr r10,SPRN_MAS1
- cmpldi cr0,r11,8 /* Check for vmalloc region */
- rlwinm r10,r10,0,16,1 /* Clear TID */
- mtspr SPRN_MAS1,r10
- ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */
- beq+ htw_tlb_miss
-
- /* We got a crappy address, just fault with whatever DEAR and ESR
- * are here
- */
- TLB_MISS_EPILOG_ERROR
- b exc_data_storage_book3e
-
-/* Instruction TLB miss */
- START_EXCEPTION(instruction_tlb_miss_htw)
- TLB_MISS_PROLOG
-
- /* If we take a recursive fault, the second level handler may need
- * to know whether we are handling a data or instruction fault in
- * order to get to the right store fault handler. We provide that
- * info by keeping a crazy value for ESR in r14
- */
- li r14,-1 /* store to exception frame is done later */
-
- /* Now we handle the fault proper. We only save DEAR in the non
- * linear mapping case since we know the linear mapping case will
- * not re-enter. We could indeed optimize and also not save SRR0/1
- * in the linear mapping case but I'll leave that for later
- *
- * Faulting address is SRR0 which is already in r16
- */
- srdi r11,r16,60 /* get region */
- cmpldi cr0,r11,0xc /* linear mapping ? */
- beq tlb_load_linear /* yes -> go to linear map load */
-
- /* We do the user/kernel test for the PID here along with the RW test
- */
- cmpldi cr0,r11,0 /* Check for user region */
- ld r15,PACAPGD(r13) /* Load user pgdir */
- beq htw_tlb_miss
-
- /* XXX replace the RMW cycles with immediate loads + writes */
-1: mfspr r10,SPRN_MAS1
- cmpldi cr0,r11,8 /* Check for vmalloc region */
- rlwinm r10,r10,0,16,1 /* Clear TID */
- mtspr SPRN_MAS1,r10
- ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */
- beq+ htw_tlb_miss
-
- /* We got a crappy address, just fault */
- TLB_MISS_EPILOG_ERROR
- b exc_instruction_storage_book3e
-
-
-/*
- * This is the guts of the second-level TLB miss handler for direct
- * misses. We are entered with:
- *
- * r16 = virtual page table faulting address
- * r15 = PGD pointer
- * r14 = ESR
- * r13 = PACA
- * r12 = TLB exception frame in PACA
- * r11 = crap (free to use)
- * r10 = crap (free to use)
- *
- * It can be re-entered by the linear mapping miss handler. However, to
- * avoid too much complication, it will save/restore things for us
- */
-htw_tlb_miss:
- /* Search if we already have a TLB entry for that virtual address, and
- * if we do, bail out.
- *
- * MAS1:IND should be already set based on MAS4
- */
- PPC_TLBSRX_DOT(0,R16)
- beq htw_tlb_miss_done
-
- /* Now, we need to walk the page tables. First check if we are in
- * range.
- */
- rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4
- bne- htw_tlb_miss_fault
-
- /* Get the PGD pointer */
- cmpldi cr0,r15,0
- beq- htw_tlb_miss_fault
-
- /* Get to PGD entry */
- rldicl r11,r16,64-(PGDIR_SHIFT-3),64-PGD_INDEX_SIZE-3
- clrrdi r10,r11,3
- ldx r15,r10,r15
- cmpdi cr0,r15,0
- bge htw_tlb_miss_fault
-
- /* Get to PUD entry */
- rldicl r11,r16,64-(PUD_SHIFT-3),64-PUD_INDEX_SIZE-3
- clrrdi r10,r11,3
- ldx r15,r10,r15
- cmpdi cr0,r15,0
- bge htw_tlb_miss_fault
-
- /* Get to PMD entry */
- rldicl r11,r16,64-(PMD_SHIFT-3),64-PMD_INDEX_SIZE-3
- clrrdi r10,r11,3
- ldx r15,r10,r15
- cmpdi cr0,r15,0
- bge htw_tlb_miss_fault
-
- /* Ok, we're all right, we can now create an indirect entry for
- * a 1M or 256M page.
- *
- * The last trick is now that because we use "half" pages for
- * the HTW (1M IND is 2K and 256M IND is 32K) we need to account
- * for an added LSB bit to the RPN. For 64K pages, there is no
- * problem as we already use 32K arrays (half PTE pages), but for
- * 4K page we need to extract a bit from the virtual address and
- * insert it into the "PA52" bit of the RPN.
- */
- rlwimi r15,r16,32-9,20,20
- /* Now we build the MAS:
- *
- * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG
- * MAS 1 : Almost fully setup
- * - PID already updated by caller if necessary
- * - TSIZE for now is base ind page size always
- * MAS 2 : Use defaults
- * MAS 3+7 : Needs to be done
- */
- ori r10,r15,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT)
-
-BEGIN_MMU_FTR_SECTION
- srdi r16,r10,32
- mtspr SPRN_MAS3,r10
- mtspr SPRN_MAS7,r16
-MMU_FTR_SECTION_ELSE
- mtspr SPRN_MAS7_MAS3,r10
-ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS)
-
- tlbwe
-
-htw_tlb_miss_done:
- /* We don't bother with restoring DEAR or ESR since we know we are
- * level 0 and just going back to userland. They are only needed
- * if you are going to take an access fault
- */
- TLB_MISS_EPILOG_SUCCESS
- rfi
-
-htw_tlb_miss_fault:
- /* We need to check if it was an instruction miss. We know this
- * though because r14 would contain -1
- */
- cmpdi cr0,r14,-1
- beq 1f
- mtspr SPRN_DEAR,r16
- mtspr SPRN_ESR,r14
- TLB_MISS_EPILOG_ERROR
- b exc_data_storage_book3e
-1: TLB_MISS_EPILOG_ERROR
- b exc_instruction_storage_book3e
-
/*
* This is the guts of "any" level TLB miss handler for kernel linear
* mapping misses. We are entered with:
@@ -1149,8 +692,8 @@ tlb_load_linear:
* we only use 1G pages for now. That might have to be changed in a
* final implementation, especially when dealing with hypervisors
*/
- ld r11,PACATOC(r13)
- ld r11,linear_map_top@got(r11)
+ __LOAD_PACA_TOC(r11)
+ LOAD_REG_ADDR_ALTTOC(r11, r11, linear_map_top)
ld r10,0(r11)
tovirt(10,10)
cmpld cr0,r16,r10
@@ -1175,13 +718,9 @@ tlb_load_linear:
clrldi r10,r10,4 /* clear region bits */
ori r10,r10,MAS3_SR|MAS3_SW|MAS3_SX
-BEGIN_MMU_FTR_SECTION
srdi r16,r10,32
mtspr SPRN_MAS3,r10
mtspr SPRN_MAS7,r16
-MMU_FTR_SECTION_ELSE
- mtspr SPRN_MAS7_MAS3,r10
-ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS)
tlbwe
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index f2bf98bdcea2..603a0f652ba6 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -16,6 +16,7 @@
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/of.h>
+#include <linux/of_address.h>
#include <linux/pfn.h>
#include <linux/cpuset.h>
#include <linux/node.h>
@@ -26,7 +27,6 @@
#include <linux/slab.h>
#include <asm/cputhreads.h>
#include <asm/sparsemem.h>
-#include <asm/prom.h>
#include <asm/smp.h>
#include <asm/topology.h>
#include <asm/firmware.h>
@@ -34,31 +34,35 @@
#include <asm/hvcall.h>
#include <asm/setup.h>
#include <asm/vdso.h>
+#include <asm/vphn.h>
#include <asm/drmem.h>
static int numa_enabled = 1;
static char *cmdline __initdata;
-static int numa_debug;
-#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }
-
int numa_cpu_lookup_table[NR_CPUS];
cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
-struct pglist_data *node_data[MAX_NUMNODES];
EXPORT_SYMBOL(numa_cpu_lookup_table);
EXPORT_SYMBOL(node_to_cpumask_map);
-EXPORT_SYMBOL(node_data);
-static int min_common_depth;
+static int primary_domain_index;
static int n_mem_addr_cells, n_mem_size_cells;
-static int form1_affinity;
+
+#define FORM0_AFFINITY 0
+#define FORM1_AFFINITY 1
+#define FORM2_AFFINITY 2
+static int affinity_form;
#define MAX_DISTANCE_REF_POINTS 4
static int distance_ref_points_depth;
static const __be32 *distance_ref_points;
static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
+static int numa_distance_table[MAX_NUMNODES][MAX_NUMNODES] = {
+ [0 ... MAX_NUMNODES - 1] = { [0 ... MAX_NUMNODES - 1] = -1 }
+};
+static int numa_id_index_table[MAX_NUMNODES] = { [0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE };
/*
* Allocate node_to_cpumask_map based on number of available nodes
@@ -79,7 +83,7 @@ static void __init setup_node_to_cpumask_map(void)
alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
/* cpumask_of_node() will now work */
- dbg("Node to cpumask map for %u nodes\n", nr_node_ids);
+ pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
}
static int __init fake_numa_create_new_node(unsigned long end_pfn,
@@ -123,13 +127,13 @@ static int __init fake_numa_create_new_node(unsigned long end_pfn,
cmdline = p;
fake_nid++;
*nid = fake_nid;
- dbg("created new fake_node with id %d\n", fake_nid);
+ pr_debug("created new fake_node with id %d\n", fake_nid);
return 1;
}
return 0;
}
-static void reset_numa_cpu_lookup_table(void)
+static void __init reset_numa_cpu_lookup_table(void)
{
unsigned int cpu;
@@ -137,33 +141,79 @@ static void reset_numa_cpu_lookup_table(void)
numa_cpu_lookup_table[cpu] = -1;
}
-static void map_cpu_to_node(int cpu, int node)
+void map_cpu_to_node(int cpu, int node)
{
update_numa_cpu_lookup_table(cpu, node);
- dbg("adding cpu %d to node %d\n", cpu, node);
-
- if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node])))
+ if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node]))) {
+ pr_debug("adding cpu %d to node %d\n", cpu, node);
cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
+ }
}
#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR)
-static void unmap_cpu_from_node(unsigned long cpu)
+void unmap_cpu_from_node(unsigned long cpu)
{
int node = numa_cpu_lookup_table[cpu];
- dbg("removing cpu %lu from node %d\n", cpu, node);
-
if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) {
cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
+ pr_debug("removing cpu %lu from node %d\n", cpu, node);
} else {
- printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
- cpu, node);
+ pr_warn("Warning: cpu %lu not found in node %d\n", cpu, node);
}
}
#endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
-int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
+static int __associativity_to_nid(const __be32 *associativity,
+ int max_array_sz)
+{
+ int nid;
+ /*
+ * primary_domain_index is 1 based array index.
+ */
+ int index = primary_domain_index - 1;
+
+ if (!numa_enabled || index >= max_array_sz)
+ return NUMA_NO_NODE;
+
+ nid = of_read_number(&associativity[index], 1);
+
+ /* POWER4 LPAR uses 0xffff as invalid node */
+ if (nid == 0xffff || nid >= nr_node_ids)
+ nid = NUMA_NO_NODE;
+ return nid;
+}
+/*
+ * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
+ * info is found.
+ */
+static int associativity_to_nid(const __be32 *associativity)
+{
+ int array_sz = of_read_number(associativity, 1);
+
+ /* Skip the first element in the associativity array */
+ return __associativity_to_nid((associativity + 1), array_sz);
+}
+
+static int __cpu_form2_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
+{
+ int dist;
+ int node1, node2;
+
+ node1 = associativity_to_nid(cpu1_assoc);
+ node2 = associativity_to_nid(cpu2_assoc);
+
+ dist = numa_distance_table[node1][node2];
+ if (dist <= LOCAL_DISTANCE)
+ return 0;
+ else if (dist <= REMOTE_DISTANCE)
+ return 1;
+ else
+ return 2;
+}
+
+static int __cpu_form1_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
{
int dist = 0;
@@ -179,6 +229,15 @@ int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
return dist;
}
+int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
+{
+ /* We should not get called with FORM0 */
+ VM_WARN_ON(affinity_form == FORM0_AFFINITY);
+ if (affinity_form == FORM1_AFFINITY)
+ return __cpu_form1_relative_distance(cpu1_assoc, cpu2_assoc);
+ return __cpu_form2_relative_distance(cpu1_assoc, cpu2_assoc);
+}
+
/* must hold reference to node during call */
static const __be32 *of_get_associativity(struct device_node *dev)
{
@@ -190,7 +249,9 @@ int __node_distance(int a, int b)
int i;
int distance = LOCAL_DISTANCE;
- if (!form1_affinity)
+ if (affinity_form == FORM2_AFFINITY)
+ return numa_distance_table[a][b];
+ else if (affinity_form == FORM0_AFFINITY)
return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
for (i = 0; i < distance_ref_points_depth; i++) {
@@ -205,52 +266,6 @@ int __node_distance(int a, int b)
}
EXPORT_SYMBOL(__node_distance);
-static void initialize_distance_lookup_table(int nid,
- const __be32 *associativity)
-{
- int i;
-
- if (!form1_affinity)
- return;
-
- for (i = 0; i < distance_ref_points_depth; i++) {
- const __be32 *entry;
-
- entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
- distance_lookup_table[nid][i] = of_read_number(entry, 1);
- }
-}
-
-/*
- * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
- * info is found.
- */
-static int associativity_to_nid(const __be32 *associativity)
-{
- int nid = NUMA_NO_NODE;
-
- if (!numa_enabled)
- goto out;
-
- if (of_read_number(associativity, 1) >= min_common_depth)
- nid = of_read_number(&associativity[min_common_depth], 1);
-
- /* POWER4 LPAR uses 0xffff as invalid node */
- if (nid == 0xffff || nid >= nr_node_ids)
- nid = NUMA_NO_NODE;
-
- if (nid > 0 &&
- of_read_number(associativity, 1) >= distance_ref_points_depth) {
- /*
- * Skip the length field and send start of associativity array
- */
- initialize_distance_lookup_table(nid, associativity + 1);
- }
-
-out:
- return nid;
-}
-
/* Returns the nid associated with the given device tree node,
* or -1 if not found.
*/
@@ -284,10 +299,155 @@ int of_node_to_nid(struct device_node *device)
}
EXPORT_SYMBOL(of_node_to_nid);
-static int __init find_min_common_depth(void)
+static void __initialize_form1_numa_distance(const __be32 *associativity,
+ int max_array_sz)
+{
+ int i, nid;
+
+ if (affinity_form != FORM1_AFFINITY)
+ return;
+
+ nid = __associativity_to_nid(associativity, max_array_sz);
+ if (nid != NUMA_NO_NODE) {
+ for (i = 0; i < distance_ref_points_depth; i++) {
+ const __be32 *entry;
+ int index = be32_to_cpu(distance_ref_points[i]) - 1;
+
+ /*
+ * broken hierarchy, return with broken distance table
+ */
+ if (WARN(index >= max_array_sz, "Broken ibm,associativity property"))
+ return;
+
+ entry = &associativity[index];
+ distance_lookup_table[nid][i] = of_read_number(entry, 1);
+ }
+ }
+}
+
+static void initialize_form1_numa_distance(const __be32 *associativity)
+{
+ int array_sz;
+
+ array_sz = of_read_number(associativity, 1);
+ /* Skip the first element in the associativity array */
+ __initialize_form1_numa_distance(associativity + 1, array_sz);
+}
+
+/*
+ * Used to update distance information w.r.t newly added node.
+ */
+void update_numa_distance(struct device_node *node)
+{
+ int nid;
+
+ if (affinity_form == FORM0_AFFINITY)
+ return;
+ else if (affinity_form == FORM1_AFFINITY) {
+ const __be32 *associativity;
+
+ associativity = of_get_associativity(node);
+ if (!associativity)
+ return;
+
+ initialize_form1_numa_distance(associativity);
+ return;
+ }
+
+ /* FORM2 affinity */
+ nid = of_node_to_nid_single(node);
+ if (nid == NUMA_NO_NODE)
+ return;
+
+ /*
+ * With FORM2 we expect NUMA distance of all possible NUMA
+ * nodes to be provided during boot.
+ */
+ WARN(numa_distance_table[nid][nid] == -1,
+ "NUMA distance details for node %d not provided\n", nid);
+}
+EXPORT_SYMBOL_GPL(update_numa_distance);
+
+/*
+ * ibm,numa-lookup-index-table= {N, domainid1, domainid2, ..... domainidN}
+ * ibm,numa-distance-table = { N, 1, 2, 4, 5, 1, 6, .... N elements}
+ */
+static void __init initialize_form2_numa_distance_lookup_table(void)
{
- int depth;
+ int i, j;
struct device_node *root;
+ const __u8 *form2_distances;
+ const __be32 *numa_lookup_index;
+ int form2_distances_length;
+ int max_numa_index, distance_index;
+
+ if (firmware_has_feature(FW_FEATURE_OPAL))
+ root = of_find_node_by_path("/ibm,opal");
+ else
+ root = of_find_node_by_path("/rtas");
+ if (!root)
+ root = of_find_node_by_path("/");
+
+ numa_lookup_index = of_get_property(root, "ibm,numa-lookup-index-table", NULL);
+ max_numa_index = of_read_number(&numa_lookup_index[0], 1);
+
+ /* first element of the array is the size and is encode-int */
+ form2_distances = of_get_property(root, "ibm,numa-distance-table", NULL);
+ form2_distances_length = of_read_number((const __be32 *)&form2_distances[0], 1);
+ /* Skip the size which is encoded int */
+ form2_distances += sizeof(__be32);
+
+ pr_debug("form2_distances_len = %d, numa_dist_indexes_len = %d\n",
+ form2_distances_length, max_numa_index);
+
+ for (i = 0; i < max_numa_index; i++)
+ /* +1 skip the max_numa_index in the property */
+ numa_id_index_table[i] = of_read_number(&numa_lookup_index[i + 1], 1);
+
+
+ if (form2_distances_length != max_numa_index * max_numa_index) {
+ WARN(1, "Wrong NUMA distance information\n");
+ form2_distances = NULL; // don't use it
+ }
+ distance_index = 0;
+ for (i = 0; i < max_numa_index; i++) {
+ for (j = 0; j < max_numa_index; j++) {
+ int nodeA = numa_id_index_table[i];
+ int nodeB = numa_id_index_table[j];
+ int dist;
+
+ if (form2_distances)
+ dist = form2_distances[distance_index++];
+ else if (nodeA == nodeB)
+ dist = LOCAL_DISTANCE;
+ else
+ dist = REMOTE_DISTANCE;
+ numa_distance_table[nodeA][nodeB] = dist;
+ pr_debug("dist[%d][%d]=%d ", nodeA, nodeB, dist);
+ }
+ }
+
+ of_node_put(root);
+}
+
+static int __init find_primary_domain_index(void)
+{
+ int index;
+ struct device_node *root;
+
+ /*
+ * Check for which form of affinity.
+ */
+ if (firmware_has_feature(FW_FEATURE_OPAL)) {
+ affinity_form = FORM1_AFFINITY;
+ } else if (firmware_has_feature(FW_FEATURE_FORM2_AFFINITY)) {
+ pr_debug("Using form 2 affinity\n");
+ affinity_form = FORM2_AFFINITY;
+ } else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) {
+ pr_debug("Using form 1 affinity\n");
+ affinity_form = FORM1_AFFINITY;
+ } else
+ affinity_form = FORM0_AFFINITY;
if (firmware_has_feature(FW_FEATURE_OPAL))
root = of_find_node_by_path("/ibm,opal");
@@ -313,42 +473,37 @@ static int __init find_min_common_depth(void)
&distance_ref_points_depth);
if (!distance_ref_points) {
- dbg("NUMA: ibm,associativity-reference-points not found.\n");
+ pr_debug("ibm,associativity-reference-points not found.\n");
goto err;
}
distance_ref_points_depth /= sizeof(int);
-
- if (firmware_has_feature(FW_FEATURE_OPAL) ||
- firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) {
- dbg("Using form 1 affinity\n");
- form1_affinity = 1;
- }
-
- if (form1_affinity) {
- depth = of_read_number(distance_ref_points, 1);
- } else {
+ if (affinity_form == FORM0_AFFINITY) {
if (distance_ref_points_depth < 2) {
- printk(KERN_WARNING "NUMA: "
- "short ibm,associativity-reference-points\n");
+ pr_warn("short ibm,associativity-reference-points\n");
goto err;
}
- depth = of_read_number(&distance_ref_points[1], 1);
+ index = of_read_number(&distance_ref_points[1], 1);
+ } else {
+ /*
+ * Both FORM1 and FORM2 affinity find the primary domain details
+ * at the same offset.
+ */
+ index = of_read_number(distance_ref_points, 1);
}
-
/*
* Warn and cap if the hardware supports more than
* MAX_DISTANCE_REF_POINTS domains.
*/
if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
- printk(KERN_WARNING "NUMA: distance array capped at "
- "%d entries\n", MAX_DISTANCE_REF_POINTS);
+ pr_warn("distance array capped at %d entries\n",
+ MAX_DISTANCE_REF_POINTS);
distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
}
of_node_put(root);
- return depth;
+ return index;
err:
of_node_put(root);
@@ -426,6 +581,38 @@ static int of_get_assoc_arrays(struct assoc_arrays *aa)
return 0;
}
+static int __init get_nid_and_numa_distance(struct drmem_lmb *lmb)
+{
+ struct assoc_arrays aa = { .arrays = NULL };
+ int default_nid = NUMA_NO_NODE;
+ int nid = default_nid;
+ int rc, index;
+
+ if ((primary_domain_index < 0) || !numa_enabled)
+ return default_nid;
+
+ rc = of_get_assoc_arrays(&aa);
+ if (rc)
+ return default_nid;
+
+ if (primary_domain_index <= aa.array_sz &&
+ !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
+ const __be32 *associativity;
+
+ index = lmb->aa_index * aa.array_sz;
+ associativity = &aa.arrays[index];
+ nid = __associativity_to_nid(associativity, aa.array_sz);
+ if (nid > 0 && affinity_form == FORM1_AFFINITY) {
+ /*
+ * lookup array associativity entries have
+ * no length of the array as the first element.
+ */
+ __initialize_form1_numa_distance(associativity, aa.array_sz);
+ }
+ }
+ return nid;
+}
+
/*
* This is like of_node_to_nid_single() for memory represented in the
* ibm,dynamic-reconfiguration-memory node.
@@ -437,35 +624,28 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb)
int nid = default_nid;
int rc, index;
- if ((min_common_depth < 0) || !numa_enabled)
+ if ((primary_domain_index < 0) || !numa_enabled)
return default_nid;
rc = of_get_assoc_arrays(&aa);
if (rc)
return default_nid;
- if (min_common_depth <= aa.array_sz &&
+ if (primary_domain_index <= aa.array_sz &&
!(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
- index = lmb->aa_index * aa.array_sz + min_common_depth - 1;
- nid = of_read_number(&aa.arrays[index], 1);
-
- if (nid == 0xffff || nid >= nr_node_ids)
- nid = default_nid;
+ const __be32 *associativity;
- if (nid > 0) {
- index = lmb->aa_index * aa.array_sz;
- initialize_distance_lookup_table(nid,
- &aa.arrays[index]);
- }
+ index = lmb->aa_index * aa.array_sz;
+ associativity = &aa.arrays[index];
+ nid = __associativity_to_nid(associativity, aa.array_sz);
}
-
return nid;
}
#ifdef CONFIG_PPC_SPLPAR
-static int vphn_get_nid(long lcpu)
+
+static int __vphn_get_associativity(long lcpu, __be32 *associativity)
{
- __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
long rc, hwid;
/*
@@ -485,12 +665,30 @@ static int vphn_get_nid(long lcpu)
rc = hcall_vphn(hwid, VPHN_FLAG_VCPU, associativity);
if (rc == H_SUCCESS)
- return associativity_to_nid(associativity);
+ return 0;
}
+ return -1;
+}
+
+static int vphn_get_nid(long lcpu)
+{
+ __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
+
+
+ if (!__vphn_get_associativity(lcpu, associativity))
+ return associativity_to_nid(associativity);
+
return NUMA_NO_NODE;
+
}
#else
+
+static int __vphn_get_associativity(long lcpu, __be32 *associativity)
+{
+ return -1;
+}
+
static int vphn_get_nid(long unused)
{
return NUMA_NO_NODE;
@@ -598,9 +796,6 @@ static int ppc_numa_cpu_prepare(unsigned int cpu)
static int ppc_numa_cpu_dead(unsigned int cpu)
{
-#ifdef CONFIG_HOTPLUG_CPU
- unmap_cpu_from_node(cpu);
-#endif
return 0;
}
@@ -685,7 +880,7 @@ static int __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
size = read_n_cells(n_mem_size_cells, usm);
}
- nid = of_drconf_to_nid_single(lmb);
+ nid = get_nid_and_numa_distance(lmb);
fake_numa_create_new_node(((base + size) >> PAGE_SHIFT),
&nid);
node_set_online(nid);
@@ -699,27 +894,34 @@ static int __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
static int __init parse_numa_properties(void)
{
- struct device_node *memory;
+ struct device_node *memory, *pci;
int default_nid = 0;
unsigned long i;
+ const __be32 *associativity;
if (numa_enabled == 0) {
- printk(KERN_WARNING "NUMA disabled by user\n");
+ pr_warn("disabled by user\n");
return -1;
}
- min_common_depth = find_min_common_depth();
+ primary_domain_index = find_primary_domain_index();
- if (min_common_depth < 0) {
+ if (primary_domain_index < 0) {
/*
- * if we fail to parse min_common_depth from device tree
+ * if we fail to parse primary_domain_index from device tree
* mark the numa disabled, boot with numa disabled.
*/
numa_enabled = false;
- return min_common_depth;
+ return primary_domain_index;
}
- dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
+ pr_debug("associativity depth for CPU/Memory: %d\n", primary_domain_index);
+
+ /*
+ * If it is FORM2 initialize the distance table here.
+ */
+ if (affinity_form == FORM2_AFFINITY)
+ initialize_form2_numa_distance_lookup_table();
/*
* Even though we connect cpus to numa domains later in SMP
@@ -727,22 +929,36 @@ static int __init parse_numa_properties(void)
* each node to be onlined must have NODE_DATA etc backing it.
*/
for_each_present_cpu(i) {
+ __be32 vphn_assoc[VPHN_ASSOC_BUFSIZE];
struct device_node *cpu;
- int nid = vphn_get_nid(i);
+ int nid = NUMA_NO_NODE;
- /*
- * Don't fall back to default_nid yet -- we will plug
- * cpus into nodes once the memory scan has discovered
- * the topology.
- */
- if (nid == NUMA_NO_NODE) {
+ memset(vphn_assoc, 0, VPHN_ASSOC_BUFSIZE * sizeof(__be32));
+
+ if (__vphn_get_associativity(i, vphn_assoc) == 0) {
+ nid = associativity_to_nid(vphn_assoc);
+ initialize_form1_numa_distance(vphn_assoc);
+ } else {
+
+ /*
+ * Don't fall back to default_nid yet -- we will plug
+ * cpus into nodes once the memory scan has discovered
+ * the topology.
+ */
cpu = of_get_cpu_node(i, NULL);
BUG_ON(!cpu);
- nid = of_node_to_nid_single(cpu);
+
+ associativity = of_get_associativity(cpu);
+ if (associativity) {
+ nid = associativity_to_nid(associativity);
+ initialize_form1_numa_distance(associativity);
+ }
of_node_put(cpu);
}
- node_set_online(nid);
+ /* node_set_online() is an UB if 'nid' is negative */
+ if (likely(nid >= 0))
+ node_set_online(nid);
}
get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);
@@ -774,8 +990,11 @@ new_range:
* have associativity properties. If none, then
* everything goes to default_nid.
*/
- nid = of_node_to_nid_single(memory);
- if (nid < 0)
+ associativity = of_get_associativity(memory);
+ if (associativity) {
+ nid = associativity_to_nid(associativity);
+ initialize_form1_numa_distance(associativity);
+ } else
nid = default_nid;
fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
@@ -789,6 +1008,18 @@ new_range:
goto new_range;
}
+ for_each_node_by_name(pci, "pci") {
+ int nid = NUMA_NO_NODE;
+
+ associativity = of_get_associativity(pci);
+ if (associativity) {
+ nid = associativity_to_nid(associativity);
+ initialize_form1_numa_distance(associativity);
+ }
+ if (likely(nid >= 0) && !node_online(nid))
+ node_set_online(nid);
+ }
+
/*
* Now do the same thing for each MEMBLOCK listed in the
* ibm,dynamic-memory property in the
@@ -811,10 +1042,8 @@ static void __init setup_nonnuma(void)
unsigned int nid = 0;
int i;
- printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
- top_of_ram, total_ram);
- printk(KERN_DEBUG "Memory hole size: %ldMB\n",
- (top_of_ram - total_ram) >> 20);
+ pr_debug("Top of RAM: 0x%lx, Total RAM: 0x%lx\n", top_of_ram, total_ram);
+ pr_debug("Memory hole size: %ldMB\n", (top_of_ram - total_ram) >> 20);
for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
fake_numa_create_new_node(end_pfn, &nid);
@@ -864,27 +1093,9 @@ void __init dump_numa_cpu_topology(void)
static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
{
u64 spanned_pages = end_pfn - start_pfn;
- const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);
- u64 nd_pa;
- void *nd;
- int tnid;
-
- nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
- if (!nd_pa)
- panic("Cannot allocate %zu bytes for node %d data\n",
- nd_size, nid);
-
- nd = __va(nd_pa);
-
- /* report and initialize */
- pr_info(" NODE_DATA [mem %#010Lx-%#010Lx]\n",
- nd_pa, nd_pa + nd_size - 1);
- tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
- if (tnid != nid)
- pr_info(" NODE_DATA(%d) on node %d\n", nid, tnid);
-
- node_data[nid] = nd;
- memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
+
+ alloc_node_data(nid);
+
NODE_DATA(nid)->node_id = nid;
NODE_DATA(nid)->node_start_pfn = start_pfn;
NODE_DATA(nid)->node_spanned_pages = spanned_pages;
@@ -892,8 +1103,8 @@ static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
static void __init find_possible_nodes(void)
{
- struct device_node *rtas;
- const __be32 *domains;
+ struct device_node *rtas, *root;
+ const __be32 *domains = NULL;
int prop_length, max_nodes;
u32 i;
@@ -909,9 +1120,16 @@ static void __init find_possible_nodes(void)
* it doesn't exist, then fallback on ibm,max-associativity-domains.
* Current denotes what the platform can support compared to max
* which denotes what the Hypervisor can support.
+ *
+ * If the LPAR is migratable, new nodes might be activated after a LPM,
+ * so we should consider the max number in that case.
*/
- domains = of_get_property(rtas, "ibm,current-associativity-domains",
- &prop_length);
+ root = of_find_node_by_path("/");
+ if (!of_get_property(root, "ibm,migratable-partition", NULL))
+ domains = of_get_property(rtas,
+ "ibm,current-associativity-domains",
+ &prop_length);
+ of_node_put(root);
if (!domains) {
domains = of_get_property(rtas, "ibm,max-associativity-domains",
&prop_length);
@@ -919,14 +1137,16 @@ static void __init find_possible_nodes(void)
goto out;
}
- max_nodes = of_read_number(&domains[min_common_depth], 1);
+ max_nodes = of_read_number(&domains[primary_domain_index], 1);
+ pr_info("Partition configured for %d NUMA nodes.\n", max_nodes);
+
for (i = 0; i < max_nodes; i++) {
if (!node_possible(i))
node_set(i, node_possible_map);
}
prop_length /= sizeof(int);
- if (prop_length > min_common_depth + 2)
+ if (prop_length > primary_domain_index + 2)
coregroup_enabled = 1;
out:
@@ -937,6 +1157,9 @@ void __init mem_topology_setup(void)
{
int cpu;
+ max_low_pfn = max_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
+ min_low_pfn = MEMORY_START >> PAGE_SHIFT;
+
/*
* Linux/mm assumes node 0 to be online at boot. However this is not
* true on PowerPC, where node 0 is similar to any other node, it
@@ -981,9 +1204,6 @@ void __init initmem_init(void)
{
int nid;
- max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
- max_pfn = max_low_pfn;
-
memblock_dump_all();
for_each_online_node(nid) {
@@ -1014,9 +1234,6 @@ static int __init early_numa(char *p)
if (strstr(p, "off"))
numa_enabled = 0;
- if (strstr(p, "debug"))
- numa_debug = 1;
-
p = strstr(p, "fake=");
if (p)
cmdline = p + strlen("fake=");
@@ -1068,23 +1285,15 @@ static int hot_add_node_scn_to_nid(unsigned long scn_addr)
int nid = NUMA_NO_NODE;
for_each_node_by_type(memory, "memory") {
- unsigned long start, size;
- int ranges;
- const __be32 *memcell_buf;
- unsigned int len;
-
- memcell_buf = of_get_property(memory, "reg", &len);
- if (!memcell_buf || len <= 0)
- continue;
+ int i = 0;
- /* ranges in cell */
- ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
+ while (1) {
+ struct resource res;
- while (ranges--) {
- start = read_n_cells(n_mem_addr_cells, &memcell_buf);
- size = read_n_cells(n_mem_size_cells, &memcell_buf);
+ if (of_address_to_resource(memory, i++, &res))
+ break;
- if ((scn_addr < start) || (scn_addr >= (start + size)))
+ if ((scn_addr < res.start) || (scn_addr > res.end))
continue;
nid = of_node_to_nid_single(memory);
@@ -1127,7 +1336,7 @@ int hot_add_scn_to_nid(unsigned long scn_addr)
return nid;
}
-static u64 hot_add_drconf_memory_max(void)
+u64 hot_add_drconf_memory_max(void)
{
struct device_node *memory = NULL;
struct device_node *dn = NULL;
@@ -1179,7 +1388,7 @@ static long vphn_get_associativity(unsigned long cpu,
switch (rc) {
case H_SUCCESS:
- dbg("VPHN hcall succeeded. Reset polling...\n");
+ pr_debug("VPHN hcall succeeded. Reset polling...\n");
goto out;
case H_FUNCTION:
@@ -1202,43 +1411,26 @@ out:
return rc;
}
-int find_and_online_cpu_nid(int cpu)
+void find_and_update_cpu_nid(int cpu)
{
__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
int new_nid;
/* Use associativity from first thread for all siblings */
if (vphn_get_associativity(cpu, associativity))
- return cpu_to_node(cpu);
+ return;
+ /* Do not have previous associativity, so find it now. */
new_nid = associativity_to_nid(associativity);
- if (new_nid < 0 || !node_possible(new_nid))
- new_nid = first_online_node;
- if (NODE_DATA(new_nid) == NULL) {
-#ifdef CONFIG_MEMORY_HOTPLUG
- /*
- * Need to ensure that NODE_DATA is initialized for a node from
- * available memory (see memblock_alloc_try_nid). If unable to
- * init the node, then default to nearest node that has memory
- * installed. Skip onlining a node if the subsystems are not
- * yet initialized.
- */
- if (!topology_inited || try_online_node(new_nid))
- new_nid = first_online_node;
-#else
- /*
- * Default to using the nearest node that has memory installed.
- * Otherwise, it would be necessary to patch the kernel MM code
- * to deal with more memoryless-node error conditions.
- */
+ if (new_nid < 0 || !node_possible(new_nid))
new_nid = first_online_node;
-#endif
- }
+ else
+ // Associate node <-> cpu, so cpu_up() calls
+ // try_online_node() on the right node.
+ set_cpu_numa_node(cpu, new_nid);
- pr_debug("%s:%d cpu %d nid %d\n", __FUNCTION__, __LINE__,
- cpu, new_nid);
- return new_nid;
+ pr_debug("%s:%d cpu %d nid %d\n", __func__, __LINE__, cpu, new_nid);
}
int cpu_to_coregroup_id(int cpu)
@@ -1259,7 +1451,7 @@ int cpu_to_coregroup_id(int cpu)
goto out;
index = of_read_number(associativity, 1);
- if (index > min_common_depth + 1)
+ if (index > primary_domain_index + 1)
return of_read_number(&associativity[index - 1], 1);
out:
diff --git a/arch/powerpc/mm/pageattr.c b/arch/powerpc/mm/pageattr.c
index 0876216ceee6..ac22bf28086f 100644
--- a/arch/powerpc/mm/pageattr.c
+++ b/arch/powerpc/mm/pageattr.c
@@ -14,57 +14,60 @@
#include <asm/page.h>
#include <asm/pgtable.h>
+#include <mm/mmu_decl.h>
+
+static pte_basic_t pte_update_delta(pte_t *ptep, unsigned long addr,
+ unsigned long old, unsigned long new)
+{
+ return pte_update(&init_mm, addr, ptep, old & ~new, new & ~old, 0);
+}
/*
- * Updates the attributes of a page in three steps:
- *
- * 1. invalidate the page table entry
- * 2. flush the TLB
- * 3. install the new entry with the updated attributes
- *
- * Invalidating the pte means there are situations where this will not work
- * when in theory it should.
- * For example:
- * - removing write from page whilst it is being executed
- * - setting a page read-only whilst it is being read by another CPU
+ * Updates the attributes of a page atomically.
*
+ * This sequence is safe against concurrent updates, and also allows updating the
+ * attributes of a page currently being executed or accessed.
*/
static int change_page_attr(pte_t *ptep, unsigned long addr, void *data)
{
long action = (long)data;
- pte_t pte;
-
- spin_lock(&init_mm.page_table_lock);
-
- /* invalidate the PTE so it's safe to modify */
- pte = ptep_get_and_clear(&init_mm, addr, ptep);
- flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
- /* modify the PTE bits as desired, then apply */
+ addr &= PAGE_MASK;
+ /* modify the PTE bits as desired */
switch (action) {
case SET_MEMORY_RO:
- pte = pte_wrprotect(pte);
+ /* Don't clear DIRTY bit */
+ pte_update_delta(ptep, addr, _PAGE_KERNEL_RW & ~_PAGE_DIRTY, _PAGE_KERNEL_RO);
+ break;
+ case SET_MEMORY_ROX:
+ /* Don't clear DIRTY bit */
+ pte_update_delta(ptep, addr, _PAGE_KERNEL_RW & ~_PAGE_DIRTY, _PAGE_KERNEL_ROX);
break;
case SET_MEMORY_RW:
- pte = pte_mkwrite(pte_mkdirty(pte));
+ pte_update_delta(ptep, addr, _PAGE_KERNEL_RO, _PAGE_KERNEL_RW);
break;
case SET_MEMORY_NX:
- pte = pte_exprotect(pte);
+ pte_update_delta(ptep, addr, _PAGE_KERNEL_ROX, _PAGE_KERNEL_RO);
break;
case SET_MEMORY_X:
- pte = pte_mkexec(pte);
+ pte_update_delta(ptep, addr, _PAGE_KERNEL_RO, _PAGE_KERNEL_ROX);
+ break;
+ case SET_MEMORY_NP:
+ pte_update(&init_mm, addr, ptep, _PAGE_PRESENT, 0, 0);
+ break;
+ case SET_MEMORY_P:
+ pte_update(&init_mm, addr, ptep, 0, _PAGE_PRESENT, 0);
break;
default:
WARN_ON_ONCE(1);
break;
}
- set_pte_at(&init_mm, addr, ptep, pte);
-
/* See ptesync comment in radix__set_pte_at() */
if (radix_enabled())
asm volatile("ptesync": : :"memory");
- spin_unlock(&init_mm.page_table_lock);
+
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
return 0;
}
@@ -100,35 +103,25 @@ int change_memory_attr(unsigned long addr, int numpages, long action)
change_page_attr, (void *)action);
}
-/*
- * Set the attributes of a page:
- *
- * This function is used by PPC32 at the end of init to set final kernel memory
- * protection. It includes changing the maping of the page it is executing from
- * and data pages it is using.
- */
-static int set_page_attr(pte_t *ptep, unsigned long addr, void *data)
+#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE)
+#ifdef CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC
+void __kernel_map_pages(struct page *page, int numpages, int enable)
{
- pgprot_t prot = __pgprot((unsigned long)data);
-
- spin_lock(&init_mm.page_table_lock);
-
- set_pte_at(&init_mm, addr, ptep, pte_modify(*ptep, prot));
- flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
-
- spin_unlock(&init_mm.page_table_lock);
-
- return 0;
-}
+ int err;
+ unsigned long addr = (unsigned long)page_address(page);
-int set_memory_attr(unsigned long addr, int numpages, pgprot_t prot)
-{
- unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE);
- unsigned long sz = numpages * PAGE_SIZE;
+ if (PageHighMem(page))
+ return;
- if (numpages <= 0)
- return 0;
+ if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !radix_enabled())
+ err = hash__kernel_map_pages(page, numpages, enable);
+ else if (enable)
+ err = set_memory_p(addr, numpages);
+ else
+ err = set_memory_np(addr, numpages);
- return apply_to_existing_page_range(&init_mm, start, sz, set_page_attr,
- (void *)pgprot_val(prot));
+ if (err)
+ panic("%s: changing memory protections failed\n", __func__);
}
+#endif
+#endif
diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c
index 97ae4935da79..77e55eac16e4 100644
--- a/arch/powerpc/mm/pgtable-frag.c
+++ b/arch/powerpc/mm/pgtable-frag.c
@@ -18,15 +18,15 @@
void pte_frag_destroy(void *pte_frag)
{
int count;
- struct page *page;
+ struct ptdesc *ptdesc;
- page = virt_to_page(pte_frag);
+ ptdesc = virt_to_ptdesc(pte_frag);
/* drop all the pending references */
count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT;
/* We allow PTE_FRAG_NR fragments from a PTE page */
- if (atomic_sub_and_test(PTE_FRAG_NR - count, &page->pt_frag_refcount)) {
- pgtable_pte_page_dtor(page);
- __free_page(page);
+ if (atomic_sub_and_test(PTE_FRAG_NR - count, &ptdesc->pt_frag_refcount)) {
+ pagetable_dtor(ptdesc);
+ pagetable_free(ptdesc);
}
}
@@ -55,25 +55,23 @@ static pte_t *get_pte_from_cache(struct mm_struct *mm)
static pte_t *__alloc_for_ptecache(struct mm_struct *mm, int kernel)
{
void *ret = NULL;
- struct page *page;
-
- if (!kernel) {
- page = alloc_page(PGALLOC_GFP | __GFP_ACCOUNT);
- if (!page)
- return NULL;
- if (!pgtable_pte_page_ctor(page)) {
- __free_page(page);
- return NULL;
- }
- } else {
- page = alloc_page(PGALLOC_GFP);
- if (!page)
- return NULL;
+ struct ptdesc *ptdesc;
+ gfp_t gfp = PGALLOC_GFP;
+
+ if (!kernel)
+ gfp |= __GFP_ACCOUNT;
+
+ ptdesc = pagetable_alloc(gfp, 0);
+ if (!ptdesc)
+ return NULL;
+ if (!pagetable_pte_ctor(mm, ptdesc)) {
+ pagetable_free(ptdesc);
+ return NULL;
}
- atomic_set(&page->pt_frag_refcount, 1);
+ atomic_set(&ptdesc->pt_frag_refcount, 1);
- ret = page_address(page);
+ ret = ptdesc_address(ptdesc);
/*
* if we support only one fragment just return the
* allocated page.
@@ -82,12 +80,12 @@ static pte_t *__alloc_for_ptecache(struct mm_struct *mm, int kernel)
return ret;
spin_lock(&mm->page_table_lock);
/*
- * If we find pgtable_page set, we return
- * the allocated page with single fragement
+ * If we find ptdesc_page set, we return
+ * the allocated page with single fragment
* count.
*/
if (likely(!pte_frag_get(&mm->context))) {
- atomic_set(&page->pt_frag_refcount, PTE_FRAG_NR);
+ atomic_set(&ptdesc->pt_frag_refcount, PTE_FRAG_NR);
pte_frag_set(&mm->context, ret + PTE_FRAG_SIZE);
}
spin_unlock(&mm->page_table_lock);
@@ -106,17 +104,38 @@ pte_t *pte_fragment_alloc(struct mm_struct *mm, int kernel)
return __alloc_for_ptecache(mm, kernel);
}
+static void pte_free_now(struct rcu_head *head)
+{
+ struct ptdesc *ptdesc;
+
+ ptdesc = container_of(head, struct ptdesc, pt_rcu_head);
+ pagetable_dtor(ptdesc);
+ pagetable_free(ptdesc);
+}
+
void pte_fragment_free(unsigned long *table, int kernel)
{
- struct page *page = virt_to_page(table);
+ struct ptdesc *ptdesc = virt_to_ptdesc(table);
- if (PageReserved(page))
- return free_reserved_page(page);
+ if (pagetable_is_reserved(ptdesc))
+ return free_reserved_ptdesc(ptdesc);
- BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
- if (atomic_dec_and_test(&page->pt_frag_refcount)) {
- if (!kernel)
- pgtable_pte_page_dtor(page);
- __free_page(page);
+ BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0);
+ if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) {
+ if (kernel || !folio_test_clear_active(ptdesc_folio(ptdesc)))
+ pte_free_now(&ptdesc->pt_rcu_head);
+ else
+ call_rcu(&ptdesc->pt_rcu_head, pte_free_now);
}
}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
+{
+ struct folio *folio;
+
+ folio = virt_to_folio(pgtable);
+ folio_set_active(folio);
+ pte_fragment_free((unsigned long *)pgtable, 0);
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index cd16b407f47e..56d7e8960e77 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -46,19 +46,19 @@ static inline int is_exec_fault(void)
* and we avoid _PAGE_SPECIAL and cache inhibited pte. We also only do that
* on userspace PTEs
*/
-static inline int pte_looks_normal(pte_t pte)
+static inline int pte_looks_normal(pte_t pte, unsigned long addr)
{
if (pte_present(pte) && !pte_special(pte)) {
if (pte_ci(pte))
return 0;
- if (pte_user(pte))
+ if (!is_kernel_addr(addr))
return 1;
}
return 0;
}
-static struct page *maybe_pte_to_page(pte_t pte)
+static struct folio *maybe_pte_to_folio(pte_t pte)
{
unsigned long pfn = pte_pfn(pte);
struct page *page;
@@ -68,7 +68,7 @@ static struct page *maybe_pte_to_page(pte_t pte)
page = pfn_to_page(pfn);
if (PageReserved(page))
return NULL;
- return page;
+ return page_folio(page);
}
#ifdef CONFIG_PPC_BOOK3S
@@ -79,20 +79,17 @@ static struct page *maybe_pte_to_page(pte_t pte)
* support falls into the same category.
*/
-static pte_t set_pte_filter_hash(pte_t pte)
+static pte_t set_pte_filter_hash(pte_t pte, unsigned long addr)
{
- if (radix_enabled())
- return pte;
-
pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
- if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) ||
- cpu_has_feature(CPU_FTR_NOEXECUTE))) {
- struct page *pg = maybe_pte_to_page(pte);
- if (!pg)
+ if (pte_looks_normal(pte, addr) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) ||
+ cpu_has_feature(CPU_FTR_NOEXECUTE))) {
+ struct folio *folio = maybe_pte_to_folio(pte);
+ if (!folio)
return pte;
- if (!test_bit(PG_dcache_clean, &pg->flags)) {
- flush_dcache_icache_page(pg);
- set_bit(PG_dcache_clean, &pg->flags);
+ if (!test_bit(PG_dcache_clean, &folio->flags.f)) {
+ flush_dcache_icache_folio(folio);
+ set_bit(PG_dcache_clean, &folio->flags.f);
}
}
return pte;
@@ -100,38 +97,43 @@ static pte_t set_pte_filter_hash(pte_t pte)
#else /* CONFIG_PPC_BOOK3S */
-static pte_t set_pte_filter_hash(pte_t pte) { return pte; }
+static pte_t set_pte_filter_hash(pte_t pte, unsigned long addr) { return pte; }
#endif /* CONFIG_PPC_BOOK3S */
/* Embedded type MMU with HW exec support. This is a bit more complicated
* as we don't have two bits to spare for _PAGE_EXEC and _PAGE_HWEXEC so
* instead we "filter out" the exec permission for non clean pages.
+ *
+ * This is also called once for the folio. So only work with folio->flags here.
*/
-static inline pte_t set_pte_filter(pte_t pte)
+static inline pte_t set_pte_filter(pte_t pte, unsigned long addr)
{
- struct page *pg;
+ struct folio *folio;
+
+ if (radix_enabled())
+ return pte;
if (mmu_has_feature(MMU_FTR_HPTE_TABLE))
- return set_pte_filter_hash(pte);
+ return set_pte_filter_hash(pte, addr);
/* No exec permission in the first place, move on */
- if (!pte_exec(pte) || !pte_looks_normal(pte))
+ if (!pte_exec(pte) || !pte_looks_normal(pte, addr))
return pte;
/* If you set _PAGE_EXEC on weird pages you're on your own */
- pg = maybe_pte_to_page(pte);
- if (unlikely(!pg))
+ folio = maybe_pte_to_folio(pte);
+ if (unlikely(!folio))
return pte;
/* If the page clean, we move on */
- if (test_bit(PG_dcache_clean, &pg->flags))
+ if (test_bit(PG_dcache_clean, &folio->flags.f))
return pte;
/* If it's an exec fault, we flush the cache and make it clean */
if (is_exec_fault()) {
- flush_dcache_icache_page(pg);
- set_bit(PG_dcache_clean, &pg->flags);
+ flush_dcache_icache_folio(folio);
+ set_bit(PG_dcache_clean, &folio->flags.f);
return pte;
}
@@ -142,7 +144,10 @@ static inline pte_t set_pte_filter(pte_t pte)
static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
int dirty)
{
- struct page *pg;
+ struct folio *folio;
+
+ if (IS_ENABLED(CONFIG_PPC_BOOK3S_64))
+ return pte;
if (mmu_has_feature(MMU_FTR_HPTE_TABLE))
return pte;
@@ -165,17 +170,17 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
#endif /* CONFIG_DEBUG_VM */
/* If you set _PAGE_EXEC on weird pages you're on your own */
- pg = maybe_pte_to_page(pte);
- if (unlikely(!pg))
+ folio = maybe_pte_to_folio(pte);
+ if (unlikely(!folio))
goto bail;
/* If the page is already clean, we move on */
- if (test_bit(PG_dcache_clean, &pg->flags))
+ if (test_bit(PG_dcache_clean, &folio->flags.f))
goto bail;
/* Clean the page and set PG_dcache_clean */
- flush_dcache_icache_page(pg);
- set_bit(PG_dcache_clean, &pg->flags);
+ flush_dcache_icache_folio(folio);
+ set_bit(PG_dcache_clean, &folio->flags.f);
bail:
return pte_mkexec(pte);
@@ -184,23 +189,48 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
/*
* set_pte stores a linux PTE into the linux page table.
*/
-void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
- pte_t pte)
+void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
+ pte_t pte, unsigned int nr)
{
- /*
- * Make sure hardware valid bit is not set. We don't do
- * tlb flush for this update.
- */
- VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));
/* Note: mm->context.id might not yet have been assigned as
* this context might not have been activated yet when this
- * is called.
+ * is called. Filter the pte value and use the filtered value
+ * to setup all the ptes in the range.
+ */
+ pte = set_pte_filter(pte, addr);
+
+ /*
+ * We don't need to call arch_enter/leave_lazy_mmu_mode()
+ * because we expect set_ptes to be only be used on not present
+ * and not hw_valid ptes. Hence there is no translation cache flush
+ * involved that need to be batched.
*/
- pte = set_pte_filter(pte);
+ for (;;) {
- /* Perform the setting of the PTE */
- __set_pte_at(mm, addr, ptep, pte, 0);
+ /*
+ * Make sure hardware valid bit is not set. We don't do
+ * tlb flush for this update.
+ */
+ VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));
+
+ /* Perform the setting of the PTE */
+ __set_pte_at(mm, addr, ptep, pte, 0);
+ if (--nr == 0)
+ break;
+ ptep++;
+ addr += PAGE_SIZE;
+ pte = pte_next_pfn(pte);
+ }
+}
+
+void unmap_kernel_page(unsigned long va)
+{
+ pmd_t *pmdp = pmd_off_k(va);
+ pte_t *ptep = pte_offset_kernel(pmdp, va);
+
+ pte_clear(&init_mm, va, ptep);
+ flush_tlb_kernel_range(va, va + PAGE_SIZE);
}
/*
@@ -267,11 +297,15 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
}
#if defined(CONFIG_PPC_8xx)
-void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
+
+#if defined(CONFIG_SPLIT_PTE_PTLOCKS) || defined(CONFIG_SPLIT_PMD_PTLOCKS)
+/* We need the same lock to protect the PMD table and the two PTE tables. */
+#error "8M hugetlb folios are incompatible with split page table locks"
+#endif
+
+static void __set_huge_pte_at(pmd_t *pmd, pte_t *ptep, pte_basic_t val)
{
- pmd_t *pmd = pmd_off(mm, addr);
- pte_basic_t val;
- pte_basic_t *entry = &ptep->pte;
+ pte_basic_t *entry = (pte_basic_t *)ptep;
int num, i;
/*
@@ -280,15 +314,60 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_
*/
VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));
- pte = set_pte_filter(pte);
-
- val = pte_val(pte);
-
num = number_of_cells_per_pte(pmd, val, 1);
for (i = 0; i < num; i++, entry++, val += SZ_4K)
*entry = val;
}
+
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
+ pte_t pte, unsigned long sz)
+{
+ pmd_t *pmdp = pmd_off(mm, addr);
+
+ pte = set_pte_filter(pte, addr);
+
+ if (sz == SZ_8M) { /* Flag both PMD entries as 8M and fill both page tables */
+ *pmdp = __pmd(pmd_val(*pmdp) | _PMD_PAGE_8M);
+ *(pmdp + 1) = __pmd(pmd_val(*(pmdp + 1)) | _PMD_PAGE_8M);
+
+ __set_huge_pte_at(pmdp, pte_offset_kernel(pmdp, 0), pte_val(pte));
+ __set_huge_pte_at(pmdp, pte_offset_kernel(pmdp + 1, 0), pte_val(pte) + SZ_4M);
+ } else {
+ __set_huge_pte_at(pmdp, ptep, pte_val(pte));
+ }
+}
+#else
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
+ pte_t pte, unsigned long sz)
+{
+ unsigned long pdsize;
+ int i;
+
+ pte = set_pte_filter(pte, addr);
+
+ /*
+ * Make sure hardware valid bit is not set. We don't do
+ * tlb flush for this update.
+ */
+ VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));
+
+ if (sz < PMD_SIZE)
+ pdsize = PAGE_SIZE;
+ else if (sz < PUD_SIZE)
+ pdsize = PMD_SIZE;
+ else if (sz < P4D_SIZE)
+ pdsize = PUD_SIZE;
+ else if (sz < PGDIR_SIZE)
+ pdsize = P4D_SIZE;
+ else
+ pdsize = PGDIR_SIZE;
+
+ for (i = 0; i < sz / pdsize; i++, ptep++, addr += pdsize) {
+ __set_pte_at(mm, addr, ptep, pte, 0);
+ pte = __pte(pte_val(pte) + ((unsigned long long)pdsize / PAGE_SIZE << PFN_PTE_SHIFT));
+ }
+}
#endif
#endif /* CONFIG_HUGETLB_PAGE */
@@ -299,6 +378,8 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
+ pte_t *pte;
+ spinlock_t *ptl;
if (mm == &init_mm)
return;
@@ -317,8 +398,10 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
*/
if (pmd_none(*pmd))
return;
- BUG_ON(!pmd_present(*pmd));
- assert_spin_locked(pte_lockptr(mm, pmd));
+ pte = pte_offset_map_ro_nolock(mm, pmd, addr, &ptl);
+ BUG_ON(!pte);
+ assert_spin_locked(ptl);
+ pte_unmap(pte);
}
#endif /* CONFIG_DEBUG_VM */
@@ -332,14 +415,13 @@ unsigned long vmalloc_to_phys(void *va)
EXPORT_SYMBOL_GPL(vmalloc_to_phys);
/*
- * We have 4 cases for pgds and pmds:
+ * We have 3 cases for pgds and pmds:
* (1) invalid (all zeroes)
* (2) pointer to next table, as normal; bottom 6 bits == 0
* (3) leaf pte for huge page _PAGE_PTE set
- * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table
*
* So long as we atomically load page table pointers we are safe against teardown,
- * we can follow the address down to the the page and take a ref on it.
+ * we can follow the address down to the page and take a ref on it.
* This function need to be called with interrupts disabled. We use this variant
* when we have MSR[EE] = 0 but the paca->irq_soft_mask = IRQS_ENABLED
*/
@@ -347,11 +429,12 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
bool *is_thp, unsigned *hpage_shift)
{
pgd_t *pgdp;
+#ifdef CONFIG_PPC64
p4d_t p4d, *p4dp;
pud_t pud, *pudp;
+#endif
pmd_t pmd, *pmdp;
pte_t *ret_pte;
- hugepd_t *hpdp = NULL;
unsigned pdshift;
if (hpage_shift)
@@ -366,8 +449,12 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
* page fault or a page unmap. The return pte_t * is still not
* stable. So should be checked there for above conditions.
* Top level is an exception because it is folded into p4d.
+ *
+ * On PPC32, P4D/PUD/PMD are folded into PGD so go straight to
+ * PMD level.
*/
pgdp = pgdir + pgd_index(ea);
+#ifdef CONFIG_PPC64
p4dp = p4d_offset(pgdp, ea);
p4d = READ_ONCE(*p4dp);
pdshift = P4D_SHIFT;
@@ -375,16 +462,11 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
if (p4d_none(p4d))
return NULL;
- if (p4d_is_leaf(p4d)) {
+ if (p4d_leaf(p4d)) {
ret_pte = (pte_t *)p4dp;
goto out;
}
- if (is_hugepd(__hugepd(p4d_val(p4d)))) {
- hpdp = (hugepd_t *)&p4d;
- goto out_huge;
- }
-
/*
* Even if we end up with an unmap, the pgtable will not
* be freed, because we do an rcu free and here we are
@@ -397,18 +479,16 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
if (pud_none(pud))
return NULL;
- if (pud_is_leaf(pud)) {
+ if (pud_leaf(pud)) {
ret_pte = (pte_t *)pudp;
goto out;
}
- if (is_hugepd(__hugepd(pud_val(pud)))) {
- hpdp = (hugepd_t *)&pud;
- goto out_huge;
- }
-
- pdshift = PMD_SHIFT;
pmdp = pmd_offset(&pud, ea);
+#else
+ pmdp = pmd_offset(pud_offset(p4d_offset(pgdp, ea), ea), ea);
+#endif
+ pdshift = PMD_SHIFT;
pmd = READ_ONCE(*pmdp);
/*
@@ -429,34 +509,47 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
return NULL;
#endif
- if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) {
+ if (pmd_trans_huge(pmd)) {
if (is_thp)
*is_thp = true;
ret_pte = (pte_t *)pmdp;
goto out;
}
- if (pmd_is_leaf(pmd)) {
+ if (pmd_leaf(pmd)) {
ret_pte = (pte_t *)pmdp;
goto out;
}
- if (is_hugepd(__hugepd(pmd_val(pmd)))) {
- hpdp = (hugepd_t *)&pmd;
- goto out_huge;
- }
-
return pte_offset_kernel(&pmd, ea);
-out_huge:
- if (!hpdp)
- return NULL;
-
- ret_pte = hugepte_offset(*hpdp, ea, pdshift);
- pdshift = hugepd_shift(*hpdp);
out:
if (hpage_shift)
*hpage_shift = pdshift;
return ret_pte;
}
EXPORT_SYMBOL_GPL(__find_linux_pte);
+
+/* Note due to the way vm flags are laid out, the bits are XWR */
+const pgprot_t protection_map[16] = {
+ [VM_NONE] = PAGE_NONE,
+ [VM_READ] = PAGE_READONLY,
+ [VM_WRITE] = PAGE_COPY,
+ [VM_WRITE | VM_READ] = PAGE_COPY,
+ [VM_EXEC] = PAGE_EXECONLY_X,
+ [VM_EXEC | VM_READ] = PAGE_READONLY_X,
+ [VM_EXEC | VM_WRITE] = PAGE_COPY_X,
+ [VM_EXEC | VM_WRITE | VM_READ] = PAGE_COPY_X,
+ [VM_SHARED] = PAGE_NONE,
+ [VM_SHARED | VM_READ] = PAGE_READONLY,
+ [VM_SHARED | VM_WRITE] = PAGE_SHARED,
+ [VM_SHARED | VM_WRITE | VM_READ] = PAGE_SHARED,
+ [VM_SHARED | VM_EXEC] = PAGE_EXECONLY_X,
+ [VM_SHARED | VM_EXEC | VM_READ] = PAGE_READONLY_X,
+ [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_SHARED_X,
+ [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED_X
+};
+
+#ifndef CONFIG_PPC_BOOK3S_64
+DECLARE_VM_GET_PAGE_PROT
+#endif
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index dcf5ecca19d9..0c9ef705803e 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -33,8 +33,6 @@
#include <mm/mmu_decl.h>
-extern char etext[], _stext[], _sinittext[], _einittext[];
-
static u8 early_fixmap_pagetable[FIXMAP_PTE_SIZE] __page_aligned_data;
notrace void __init early_ioremap_init(void)
@@ -50,15 +48,10 @@ notrace void __init early_ioremap_init(void)
early_ioremap_setup();
}
-static void __init *early_alloc_pgtable(unsigned long size)
+void __init *early_alloc_pgtable(unsigned long size)
{
- void *ptr = memblock_alloc(size, size);
-
- if (!ptr)
- panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
- __func__, size, size);
+ return memblock_alloc_or_panic(size, size);
- return ptr;
}
pte_t __init *early_pte_alloc_kernel(pmd_t *pmdp, unsigned long va)
@@ -104,15 +97,14 @@ static void __init __mapin_ram_chunk(unsigned long offset, unsigned long top)
{
unsigned long v, s;
phys_addr_t p;
- int ktext;
+ bool ktext;
s = offset;
v = PAGE_OFFSET + s;
p = memstart_addr + s;
for (; s < top; s += PAGE_SIZE) {
- ktext = ((char *)v >= _stext && (char *)v < etext) ||
- ((char *)v >= _sinittext && (char *)v < _einittext);
- map_kernel_page(v, p, ktext ? PAGE_KERNEL_TEXT : PAGE_KERNEL);
+ ktext = core_kernel_text(v);
+ map_kernel_page(v, p, ktext ? PAGE_KERNEL_X : PAGE_KERNEL);
v += PAGE_SIZE;
p += PAGE_SIZE;
}
@@ -133,57 +125,58 @@ void __init mapin_ram(void)
}
}
-void mark_initmem_nx(void)
+static int __mark_initmem_nx(void)
{
unsigned long numpages = PFN_UP((unsigned long)_einittext) -
PFN_DOWN((unsigned long)_sinittext);
+ int err;
- if (v_block_mapped((unsigned long)_sinittext))
- mmu_mark_initmem_nx();
- else
- set_memory_attr((unsigned long)_sinittext, numpages, PAGE_KERNEL);
+ err = mmu_mark_initmem_nx();
+
+ if (!v_block_mapped((unsigned long)_sinittext)) {
+ err = set_memory_nx((unsigned long)_sinittext, numpages);
+ if (err)
+ return err;
+ err = set_memory_rw((unsigned long)_sinittext, numpages);
+ }
+ return err;
+}
+
+void mark_initmem_nx(void)
+{
+ int err = __mark_initmem_nx();
+
+ if (err)
+ panic("%s() failed, err = %d\n", __func__, err);
}
#ifdef CONFIG_STRICT_KERNEL_RWX
-void mark_rodata_ro(void)
+static int __mark_rodata_ro(void)
{
unsigned long numpages;
- if (v_block_mapped((unsigned long)_stext + 1)) {
- mmu_mark_rodata_ro();
- ptdump_check_wx();
- return;
- }
+ if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX) && mmu_has_feature(MMU_FTR_HPTE_TABLE))
+ pr_warn("This platform has HASH MMU, STRICT_MODULE_RWX won't work\n");
- numpages = PFN_UP((unsigned long)_etext) -
- PFN_DOWN((unsigned long)_stext);
+ if (v_block_mapped((unsigned long)_stext + 1))
+ return mmu_mark_rodata_ro();
- set_memory_attr((unsigned long)_stext, numpages, PAGE_KERNEL_ROX);
/*
- * mark .rodata as read only. Use __init_begin rather than __end_rodata
- * to cover NOTES and EXCEPTION_TABLE.
+ * mark text and rodata as read only. __end_rodata is set by
+ * powerpc's linker script and includes tables and data
+ * requiring relocation which are not put in RO_DATA.
*/
- numpages = PFN_UP((unsigned long)__init_begin) -
- PFN_DOWN((unsigned long)__start_rodata);
-
- set_memory_attr((unsigned long)__start_rodata, numpages, PAGE_KERNEL_RO);
+ numpages = PFN_UP((unsigned long)__end_rodata) -
+ PFN_DOWN((unsigned long)_stext);
- // mark_initmem_nx() should have already run by now
- ptdump_check_wx();
+ return set_memory_ro((unsigned long)_stext, numpages);
}
-#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
-void __kernel_map_pages(struct page *page, int numpages, int enable)
+void mark_rodata_ro(void)
{
- unsigned long addr = (unsigned long)page_address(page);
-
- if (PageHighMem(page))
- return;
+ int err = __mark_rodata_ro();
- if (enable)
- set_memory_attr(addr, numpages, PAGE_KERNEL);
- else
- set_memory_attr(addr, numpages, __pgprot(0));
+ if (err)
+ panic("%s() failed, err = %d\n", __func__, err);
}
-#endif /* CONFIG_DEBUG_PAGEALLOC */
+#endif
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 78c8cf01db5f..6621cfc3baf8 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -32,7 +32,6 @@
#include <linux/hugetlb.h>
#include <asm/page.h>
-#include <asm/prom.h>
#include <asm/mmu_context.h>
#include <asm/mmu.h>
#include <asm/smp.h>
@@ -101,8 +100,9 @@ EXPORT_SYMBOL(__pte_frag_size_shift);
/* 4 level page table */
struct page *p4d_page(p4d_t p4d)
{
- if (p4d_is_leaf(p4d)) {
- VM_WARN_ON(!p4d_huge(p4d));
+ if (p4d_leaf(p4d)) {
+ if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+ VM_WARN_ON(!p4d_leaf(p4d));
return pte_page(p4d_pte(p4d));
}
return virt_to_page(p4d_pgtable(p4d));
@@ -111,8 +111,9 @@ struct page *p4d_page(p4d_t p4d)
struct page *pud_page(pud_t pud)
{
- if (pud_is_leaf(pud)) {
- VM_WARN_ON(!pud_huge(pud));
+ if (pud_leaf(pud)) {
+ if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+ VM_WARN_ON(!pud_leaf(pud));
return pte_page(pud_pte(pud));
}
return virt_to_page(pud_pgtable(pud));
@@ -124,8 +125,14 @@ struct page *pud_page(pud_t pud)
*/
struct page *pmd_page(pmd_t pmd)
{
- if (pmd_is_leaf(pmd)) {
- VM_WARN_ON(!(pmd_large(pmd) || pmd_huge(pmd)));
+ if (pmd_leaf(pmd)) {
+ /*
+ * vmalloc_to_page may be called on any vmap address (not only
+ * vmalloc), and it uses pmd_page() etc., when huge vmap is
+ * enabled so these checks can't be used.
+ */
+ if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+ VM_WARN_ON(!pmd_leaf(pmd));
return pte_page(pmd_pte(pmd));
}
return virt_to_page(pmd_page_vaddr(pmd));
@@ -143,9 +150,6 @@ void mark_rodata_ro(void)
radix__mark_rodata_ro();
else
hash__mark_rodata_ro();
-
- // mark_initmem_nx() should have already run by now
- ptdump_check_wx();
}
void mark_initmem_nx(void)
diff --git a/arch/powerpc/mm/ptdump/8xx.c b/arch/powerpc/mm/ptdump/8xx.c
index 86da2a669680..ff845f251724 100644
--- a/arch/powerpc/mm/ptdump/8xx.c
+++ b/arch/powerpc/mm/ptdump/8xx.c
@@ -21,11 +21,6 @@ static const struct flag_info flag_array[] = {
.set = "huge",
.clear = " ",
}, {
- .mask = _PAGE_SH,
- .val = 0,
- .set = "user",
- .clear = " ",
- }, {
.mask = _PAGE_RO | _PAGE_NA,
.val = 0,
.set = "rw",
@@ -74,18 +69,25 @@ static const struct flag_info flag_array[] = {
}
};
-struct pgtable_level pg_level[5] = {
- {
- }, { /* pgd */
+struct ptdump_pg_level pg_level[5] = {
+ { /* pgd */
+ .name = "PGD",
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ }, { /* p4d */
+ .name = "P4D",
.flag = flag_array,
.num = ARRAY_SIZE(flag_array),
}, { /* pud */
+ .name = "PUD",
.flag = flag_array,
.num = ARRAY_SIZE(flag_array),
}, { /* pmd */
+ .name = "PMD",
.flag = flag_array,
.num = ARRAY_SIZE(flag_array),
}, { /* pte */
+ .name = "PTE",
.flag = flag_array,
.num = ARRAY_SIZE(flag_array),
},
diff --git a/arch/powerpc/mm/ptdump/Makefile b/arch/powerpc/mm/ptdump/Makefile
index 712762be3cb1..0f7a050f327e 100644
--- a/arch/powerpc/mm/ptdump/Makefile
+++ b/arch/powerpc/mm/ptdump/Makefile
@@ -2,8 +2,13 @@
obj-y += ptdump.o
-obj-$(CONFIG_4xx) += shared.o
+obj-$(CONFIG_44x) += shared.o
obj-$(CONFIG_PPC_8xx) += 8xx.o
-obj-$(CONFIG_PPC_BOOK3E_MMU) += shared.o
-obj-$(CONFIG_PPC_BOOK3S_32) += shared.o bats.o segment_regs.o
-obj-$(CONFIG_PPC_BOOK3S_64) += book3s64.o hashpagetable.o
+obj-$(CONFIG_PPC_E500) += shared.o
+obj-$(CONFIG_PPC_BOOK3S_32) += shared.o
+obj-$(CONFIG_PPC_BOOK3S_64) += book3s64.o
+
+ifdef CONFIG_PTDUMP_DEBUGFS
+obj-$(CONFIG_PPC_BOOK3S_32) += bats.o segment_regs.o
+obj-$(CONFIG_PPC_64S_HASH_MMU) += hashpagetable.o
+endif
diff --git a/arch/powerpc/mm/ptdump/bats.c b/arch/powerpc/mm/ptdump/bats.c
index c4c628b03cf8..820c119013e4 100644
--- a/arch/powerpc/mm/ptdump/bats.c
+++ b/arch/powerpc/mm/ptdump/bats.c
@@ -7,7 +7,7 @@
*/
#include <linux/pgtable.h>
-#include <asm/debugfs.h>
+#include <linux/debugfs.h>
#include <asm/cpu_has_feature.h>
#include "ptdump.h"
@@ -57,7 +57,7 @@ static void bat_show_603(struct seq_file *m, int idx, u32 lower, u32 upper, bool
#define BAT_SHOW_603(_m, _n, _l, _u, _d) bat_show_603(_m, _n, mfspr(_l), mfspr(_u), _d)
-static int bats_show_603(struct seq_file *m, void *v)
+static int bats_show(struct seq_file *m, void *v)
{
seq_puts(m, "---[ Instruction Block Address Translation ]---\n");
@@ -88,22 +88,12 @@ static int bats_show_603(struct seq_file *m, void *v)
return 0;
}
-static int bats_open(struct inode *inode, struct file *file)
-{
- return single_open(file, bats_show_603, NULL);
-}
-
-static const struct file_operations bats_fops = {
- .open = bats_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(bats);
static int __init bats_init(void)
{
debugfs_create_file("block_address_translation", 0400,
- powerpc_debugfs_root, NULL, &bats_fops);
+ arch_debugfs_dir, NULL, &bats_fops);
return 0;
}
device_initcall(bats_init);
diff --git a/arch/powerpc/mm/ptdump/book3s64.c b/arch/powerpc/mm/ptdump/book3s64.c
index 14f73868db66..e8a21c6dc32e 100644
--- a/arch/powerpc/mm/ptdump/book3s64.c
+++ b/arch/powerpc/mm/ptdump/book3s64.c
@@ -102,18 +102,25 @@ static const struct flag_info flag_array[] = {
}
};
-struct pgtable_level pg_level[5] = {
- {
- }, { /* pgd */
+struct ptdump_pg_level pg_level[5] = {
+ { /* pgd */
+ .name = "PGD",
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ }, { /* p4d */
+ .name = "P4D",
.flag = flag_array,
.num = ARRAY_SIZE(flag_array),
}, { /* pud */
+ .name = "PUD",
.flag = flag_array,
.num = ARRAY_SIZE(flag_array),
}, { /* pmd */
+ .name = "PMD",
.flag = flag_array,
.num = ARRAY_SIZE(flag_array),
}, { /* pte */
+ .name = "PTE",
.flag = flag_array,
.num = ARRAY_SIZE(flag_array),
},
diff --git a/arch/powerpc/mm/ptdump/hashpagetable.c b/arch/powerpc/mm/ptdump/hashpagetable.c
index ad6df9a2e7c8..671d0dc00c6d 100644
--- a/arch/powerpc/mm/ptdump/hashpagetable.c
+++ b/arch/powerpc/mm/ptdump/hashpagetable.c
@@ -216,6 +216,8 @@ static int native_find(unsigned long ea, int psize, bool primary, u64 *v, u64
vpn = hpt_vpn(ea, vsid, ssize);
hash = hpt_hash(vpn, shift, ssize);
want_v = hpte_encode_avpn(vpn, psize, ssize);
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ want_v = hpte_old_to_new_v(want_v);
/* to check in the secondary hash table, we invert the hash */
if (!primary)
@@ -229,6 +231,10 @@ static int native_find(unsigned long ea, int psize, bool primary, u64 *v, u64
/* HPTE matches */
*v = be64_to_cpu(hptep->v);
*r = be64_to_cpu(hptep->r);
+ if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+ *v = hpte_new_to_old_v(*v, *r);
+ *r = hpte_new_to_old_r(*r);
+ }
return 0;
}
++hpte_group;
@@ -238,7 +244,10 @@ static int native_find(unsigned long ea, int psize, bool primary, u64 *v, u64
static int pseries_find(unsigned long ea, int psize, bool primary, u64 *v, u64 *r)
{
- struct hash_pte ptes[4];
+ struct {
+ unsigned long v;
+ unsigned long r;
+ } ptes[4];
unsigned long vsid, vpn, hash, hpte_group, want_v;
int i, j, ssize = mmu_kernel_ssize;
long lpar_rc = 0;
@@ -488,7 +497,7 @@ static void walk_vmemmap(struct pg_state *st)
* Traverse the vmemmaped memory and dump pages that are in the hash
* pagetable.
*/
- while (ptr->list) {
+ while (ptr) {
hpte_find(st, ptr->virt_addr, mmu_vmemmap_psize);
ptr = ptr->list;
}
@@ -526,17 +535,7 @@ static int ptdump_show(struct seq_file *m, void *v)
return 0;
}
-static int ptdump_open(struct inode *inode, struct file *file)
-{
- return single_open(file, ptdump_show, NULL);
-}
-
-static const struct file_operations ptdump_fops = {
- .open = ptdump_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(ptdump);
static int ptdump_init(void)
{
diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index 5062c58b1e5b..0d499aebee72 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -16,10 +16,12 @@
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/highmem.h>
+#include <linux/ptdump.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <asm/fixmap.h>
#include <linux/const.h>
+#include <linux/kasan.h>
#include <asm/page.h>
#include <asm/hugetlb.h>
@@ -54,11 +56,12 @@
*
*/
struct pg_state {
+ struct ptdump_state ptdump;
struct seq_file *seq;
const struct addr_marker *marker;
unsigned long start_address;
unsigned long start_pa;
- unsigned int level;
+ int level;
u64 current_flags;
bool check_wx;
unsigned long wx_pages;
@@ -102,6 +105,11 @@ static struct addr_marker address_markers[] = {
{ -1, NULL },
};
+static struct ptdump_range ptdump_range[] __ro_after_init = {
+ {TASK_SIZE_MAX, ~0UL},
+ {0, 0}
+};
+
#define pt_dump_seq_printf(m, fmt, args...) \
({ \
if (m) \
@@ -116,7 +124,7 @@ static struct addr_marker address_markers[] = {
void pt_dump_size(struct seq_file *m, unsigned long size)
{
- static const char units[] = "KMGTPE";
+ static const char units[] = " KMGTPE";
const char *unit = units;
/* Work out what appropriate unit to use */
@@ -169,29 +177,30 @@ static void dump_addr(struct pg_state *st, unsigned long addr)
pt_dump_seq_printf(st->seq, REG "-" REG " ", st->start_address, addr - 1);
pt_dump_seq_printf(st->seq, " " REG " ", st->start_pa);
- pt_dump_size(st->seq, (addr - st->start_address) >> 10);
+ pt_dump_size(st->seq, addr - st->start_address);
+ pt_dump_seq_printf(st->seq, "%s ", pg_level[st->level].name);
}
static void note_prot_wx(struct pg_state *st, unsigned long addr)
{
pte_t pte = __pte(st->current_flags);
- if (!IS_ENABLED(CONFIG_PPC_DEBUG_WX) || !st->check_wx)
+ if (!st->check_wx)
return;
if (!pte_write(pte) || !pte_exec(pte))
return;
- WARN_ONCE(1, "powerpc/mm: Found insecure W+X mapping at address %p/%pS\n",
+ WARN_ONCE(IS_ENABLED(CONFIG_DEBUG_WX),
+ "powerpc/mm: Found insecure W+X mapping at address %p/%pS\n",
(void *)st->start_address, (void *)st->start_address);
st->wx_pages += (addr - st->start_address) / PAGE_SIZE;
}
-static void note_page_update_state(struct pg_state *st, unsigned long addr,
- unsigned int level, u64 val, unsigned long page_size)
+static void note_page_update_state(struct pg_state *st, unsigned long addr, int level, u64 val)
{
- u64 flag = val & pg_level[level].mask;
+ u64 flag = level >= 0 ? val & pg_level[level].mask : 0;
u64 pa = val & PTE_RPN_MASK;
st->level = level;
@@ -205,15 +214,15 @@ static void note_page_update_state(struct pg_state *st, unsigned long addr,
}
}
-static void note_page(struct pg_state *st, unsigned long addr,
- unsigned int level, u64 val, unsigned long page_size)
+static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, u64 val)
{
- u64 flag = val & pg_level[level].mask;
+ u64 flag = level >= 0 ? val & pg_level[level].mask : 0;
+ struct pg_state *st = container_of(pt_st, struct pg_state, ptdump);
/* At first no level is set */
- if (!st->level) {
+ if (st->level == -1) {
pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
- note_page_update_state(st, addr, level, val, page_size);
+ note_page_update_state(st, addr, level, val);
/*
* Dump the section of virtual memory when:
* - the PTE flags from one entry to the next differs.
@@ -242,95 +251,7 @@ static void note_page(struct pg_state *st, unsigned long addr,
* Address indicates we have passed the end of the
* current section of virtual memory
*/
- note_page_update_state(st, addr, level, val, page_size);
- }
-}
-
-static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start)
-{
- pte_t *pte = pte_offset_kernel(pmd, 0);
- unsigned long addr;
- unsigned int i;
-
- for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
- addr = start + i * PAGE_SIZE;
- note_page(st, addr, 4, pte_val(*pte), PAGE_SIZE);
-
- }
-}
-
-static void walk_hugepd(struct pg_state *st, hugepd_t *phpd, unsigned long start,
- int pdshift, int level)
-{
-#ifdef CONFIG_ARCH_HAS_HUGEPD
- unsigned int i;
- int shift = hugepd_shift(*phpd);
- int ptrs_per_hpd = pdshift - shift > 0 ? 1 << (pdshift - shift) : 1;
-
- if (start & ((1 << shift) - 1))
- return;
-
- for (i = 0; i < ptrs_per_hpd; i++) {
- unsigned long addr = start + (i << shift);
- pte_t *pte = hugepte_offset(*phpd, addr, pdshift);
-
- note_page(st, addr, level + 1, pte_val(*pte), 1 << shift);
- }
-#endif
-}
-
-static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start)
-{
- pmd_t *pmd = pmd_offset(pud, 0);
- unsigned long addr;
- unsigned int i;
-
- for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
- addr = start + i * PMD_SIZE;
- if (!pmd_none(*pmd) && !pmd_is_leaf(*pmd))
- /* pmd exists */
- walk_pte(st, pmd, addr);
- else
- note_page(st, addr, 3, pmd_val(*pmd), PMD_SIZE);
- }
-}
-
-static void walk_pud(struct pg_state *st, p4d_t *p4d, unsigned long start)
-{
- pud_t *pud = pud_offset(p4d, 0);
- unsigned long addr;
- unsigned int i;
-
- for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
- addr = start + i * PUD_SIZE;
- if (!pud_none(*pud) && !pud_is_leaf(*pud))
- /* pud exists */
- walk_pmd(st, pud, addr);
- else
- note_page(st, addr, 2, pud_val(*pud), PUD_SIZE);
- }
-}
-
-static void walk_pagetables(struct pg_state *st)
-{
- unsigned int i;
- unsigned long addr = st->start_address & PGDIR_MASK;
- pgd_t *pgd = pgd_offset_k(addr);
-
- /*
- * Traverse the linux pagetable structure and dump pages that are in
- * the hash pagetable.
- */
- for (i = pgd_index(addr); i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) {
- p4d_t *p4d = p4d_offset(pgd, 0);
-
- if (p4d_none(*p4d) || p4d_is_leaf(*p4d))
- note_page(st, addr, 1, p4d_val(*p4d), PGDIR_SIZE);
- else if (is_hugepd(__hugepd(p4d_val(*p4d))))
- walk_hugepd(st, (hugepd_t *)p4d, addr, PGDIR_SHIFT, 1);
- else
- /* p4d exists */
- walk_pud(st, p4d, addr);
+ note_page_update_state(st, addr, level, val);
}
}
@@ -371,11 +292,43 @@ static void populate_markers(void)
#endif
address_markers[i++].start_address = FIXADDR_START;
address_markers[i++].start_address = FIXADDR_TOP;
+#endif /* CONFIG_PPC64 */
#ifdef CONFIG_KASAN
address_markers[i++].start_address = KASAN_SHADOW_START;
address_markers[i++].start_address = KASAN_SHADOW_END;
#endif
-#endif /* CONFIG_PPC64 */
+}
+
+static void note_page_pte(struct ptdump_state *pt_st, unsigned long addr, pte_t pte)
+{
+ note_page(pt_st, addr, 4, pte_val(pte));
+}
+
+static void note_page_pmd(struct ptdump_state *pt_st, unsigned long addr, pmd_t pmd)
+{
+ note_page(pt_st, addr, 3, pmd_val(pmd));
+}
+
+static void note_page_pud(struct ptdump_state *pt_st, unsigned long addr, pud_t pud)
+{
+ note_page(pt_st, addr, 2, pud_val(pud));
+}
+
+static void note_page_p4d(struct ptdump_state *pt_st, unsigned long addr, p4d_t p4d)
+{
+ note_page(pt_st, addr, 1, p4d_val(p4d));
+}
+
+static void note_page_pgd(struct ptdump_state *pt_st, unsigned long addr, pgd_t pgd)
+{
+ note_page(pt_st, addr, 0, pgd_val(pgd));
+}
+
+static void note_page_flush(struct ptdump_state *pt_st)
+{
+ pte_t pte_zero = {0};
+
+ note_page(pt_st, 0, -1, pte_val(pte_zero));
}
static int ptdump_show(struct seq_file *m, void *v)
@@ -383,34 +336,26 @@ static int ptdump_show(struct seq_file *m, void *v)
struct pg_state st = {
.seq = m,
.marker = address_markers,
- .start_address = IS_ENABLED(CONFIG_PPC64) ? PAGE_OFFSET : TASK_SIZE,
+ .level = -1,
+ .ptdump = {
+ .note_page_pte = note_page_pte,
+ .note_page_pmd = note_page_pmd,
+ .note_page_pud = note_page_pud,
+ .note_page_p4d = note_page_p4d,
+ .note_page_pgd = note_page_pgd,
+ .note_page_flush = note_page_flush,
+ .range = ptdump_range,
+ }
};
-#ifdef CONFIG_PPC64
- if (!radix_enabled())
- st.start_address = KERN_VIRT_START;
-#endif
-
/* Traverse kernel page tables */
- walk_pagetables(&st);
- note_page(&st, 0, 0, 0, 0);
+ ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
return 0;
}
+DEFINE_SHOW_ATTRIBUTE(ptdump);
-static int ptdump_open(struct inode *inode, struct file *file)
-{
- return single_open(file, ptdump_show, NULL);
-}
-
-static const struct file_operations ptdump_fops = {
- .open = ptdump_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static void build_pgtable_complete_mask(void)
+static void __init build_pgtable_complete_mask(void)
{
unsigned int i, j;
@@ -420,37 +365,61 @@ static void build_pgtable_complete_mask(void)
pg_level[i].mask |= pg_level[i].flag[j].mask;
}
-#ifdef CONFIG_PPC_DEBUG_WX
-void ptdump_check_wx(void)
+bool ptdump_check_wx(void)
{
struct pg_state st = {
.seq = NULL,
- .marker = address_markers,
+ .marker = (struct addr_marker[]) {
+ { 0, NULL},
+ { -1, NULL},
+ },
+ .level = -1,
.check_wx = true,
- .start_address = IS_ENABLED(CONFIG_PPC64) ? PAGE_OFFSET : TASK_SIZE,
+ .ptdump = {
+ .note_page_pte = note_page_pte,
+ .note_page_pmd = note_page_pmd,
+ .note_page_pud = note_page_pud,
+ .note_page_p4d = note_page_p4d,
+ .note_page_pgd = note_page_pgd,
+ .note_page_flush = note_page_flush,
+ .range = ptdump_range,
+ }
};
-#ifdef CONFIG_PPC64
- if (!radix_enabled())
- st.start_address = KERN_VIRT_START;
-#endif
+ if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !mmu_has_feature(MMU_FTR_KERNEL_RO))
+ return true;
- walk_pagetables(&st);
+ ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
- if (st.wx_pages)
+ if (st.wx_pages) {
pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n",
st.wx_pages);
- else
+
+ return false;
+ } else {
pr_info("Checked W+X mappings: passed, no W+X pages found\n");
+
+ return true;
+ }
}
-#endif
-static int ptdump_init(void)
+static int __init ptdump_init(void)
{
+#ifdef CONFIG_PPC64
+ if (!radix_enabled())
+ ptdump_range[0].start = KERN_VIRT_START;
+ else
+ ptdump_range[0].start = PAGE_OFFSET;
+
+ ptdump_range[0].end = PAGE_OFFSET + (PGDIR_SIZE * PTRS_PER_PGD);
+#endif
+
populate_markers();
build_pgtable_complete_mask();
- debugfs_create_file("kernel_page_tables", 0400, NULL, NULL,
- &ptdump_fops);
+
+ if (IS_ENABLED(CONFIG_PTDUMP_DEBUGFS))
+ debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, &ptdump_fops);
+
return 0;
}
device_initcall(ptdump_init);
diff --git a/arch/powerpc/mm/ptdump/ptdump.h b/arch/powerpc/mm/ptdump/ptdump.h
index 154efae96ae0..12aa9eca8b0c 100644
--- a/arch/powerpc/mm/ptdump/ptdump.h
+++ b/arch/powerpc/mm/ptdump/ptdump.h
@@ -11,12 +11,13 @@ struct flag_info {
int shift;
};
-struct pgtable_level {
+struct ptdump_pg_level {
const struct flag_info *flag;
+ char name[4];
size_t num;
u64 mask;
};
-extern struct pgtable_level pg_level[5];
+extern struct ptdump_pg_level pg_level[5];
void pt_dump_size(struct seq_file *m, unsigned long delta);
diff --git a/arch/powerpc/mm/ptdump/segment_regs.c b/arch/powerpc/mm/ptdump/segment_regs.c
index 565048a0c9be..9df3af8d481f 100644
--- a/arch/powerpc/mm/ptdump/segment_regs.c
+++ b/arch/powerpc/mm/ptdump/segment_regs.c
@@ -6,7 +6,7 @@
* This dumps the content of Segment Registers
*/
-#include <asm/debugfs.h>
+#include <linux/debugfs.h>
static void seg_show(struct seq_file *m, int i)
{
@@ -41,21 +41,11 @@ static int sr_show(struct seq_file *m, void *v)
return 0;
}
-static int sr_open(struct inode *inode, struct file *file)
-{
- return single_open(file, sr_show, NULL);
-}
-
-static const struct file_operations sr_fops = {
- .open = sr_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(sr);
static int __init sr_init(void)
{
- debugfs_create_file("segment_registers", 0400, powerpc_debugfs_root,
+ debugfs_create_file("segment_registers", 0400, arch_debugfs_dir,
NULL, &sr_fops);
return 0;
}
diff --git a/arch/powerpc/mm/ptdump/shared.c b/arch/powerpc/mm/ptdump/shared.c
index c005fe041c18..edc69da19b85 100644
--- a/arch/powerpc/mm/ptdump/shared.c
+++ b/arch/powerpc/mm/ptdump/shared.c
@@ -11,15 +11,15 @@
static const struct flag_info flag_array[] = {
{
- .mask = _PAGE_USER,
- .val = _PAGE_USER,
- .set = "user",
- .clear = " ",
+ .mask = _PAGE_READ,
+ .val = 0,
+ .set = " ",
+ .clear = "r",
}, {
- .mask = _PAGE_RW,
- .val = _PAGE_RW,
- .set = "rw",
- .clear = "r ",
+ .mask = _PAGE_WRITE,
+ .val = 0,
+ .set = " ",
+ .clear = "w",
}, {
.mask = _PAGE_EXEC,
.val = _PAGE_EXEC,
@@ -67,18 +67,25 @@ static const struct flag_info flag_array[] = {
}
};
-struct pgtable_level pg_level[5] = {
- {
- }, { /* pgd */
+struct ptdump_pg_level pg_level[5] = {
+ { /* pgd */
+ .name = "PGD",
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ }, { /* p4d */
+ .name = "P4D",
.flag = flag_array,
.num = ARRAY_SIZE(flag_array),
}, { /* pud */
+ .name = "PUD",
.flag = flag_array,
.num = ARRAY_SIZE(flag_array),
}, { /* pmd */
+ .name = "PMD",
.flag = flag_array,
.num = ARRAY_SIZE(flag_array),
}, { /* pte */
+ .name = "PTE",
.flag = flag_array,
.num = ARRAY_SIZE(flag_array),
},