path: root/arch/powerpc/mm/book3s64
Diffstat (limited to 'arch/powerpc/mm/book3s64')
-rw-r--r--  arch/powerpc/mm/book3s64/Makefile               |   30
-rw-r--r--  arch/powerpc/mm/book3s64/hash_4k.c              |    7
-rw-r--r--  arch/powerpc/mm/book3s64/hash_64k.c             |   14
-rw-r--r--  arch/powerpc/mm/book3s64/hash_hugepage.c        |    2
-rw-r--r--  arch/powerpc/mm/book3s64/hash_native.c          |  165
-rw-r--r--  arch/powerpc/mm/book3s64/hash_pgtable.c         |  165
-rw-r--r--  arch/powerpc/mm/book3s64/hash_tlb.c             |   29
-rw-r--r--  arch/powerpc/mm/book3s64/hash_utils.c           |  597
-rw-r--r--  arch/powerpc/mm/book3s64/hugetlbpage.c (renamed from arch/powerpc/mm/book3s64/hash_hugetlbpage.c) |   17
-rw-r--r--  arch/powerpc/mm/book3s64/internal.h             |   31
-rw-r--r--  arch/powerpc/mm/book3s64/iommu_api.c            |  137
-rw-r--r--  arch/powerpc/mm/book3s64/mmu_context.c          |   70
-rw-r--r--  arch/powerpc/mm/book3s64/pgtable.c              |  256
-rw-r--r--  arch/powerpc/mm/book3s64/pkeys.c                |  445
-rw-r--r--  arch/powerpc/mm/book3s64/radix_hugetlbpage.c    |   75
-rw-r--r--  arch/powerpc/mm/book3s64/radix_pgtable.c        | 1024
-rw-r--r--  arch/powerpc/mm/book3s64/radix_tlb.c            |  812
-rw-r--r--  arch/powerpc/mm/book3s64/slb.c                  |  280
-rw-r--r--  arch/powerpc/mm/book3s64/slice.c                |  807
-rw-r--r--  arch/powerpc/mm/book3s64/subpage_prot.c         |   40
-rw-r--r--  arch/powerpc/mm/book3s64/trace.c                |    7
21 files changed, 3592 insertions, 1418 deletions
diff --git a/arch/powerpc/mm/book3s64/Makefile b/arch/powerpc/mm/book3s64/Makefile
index fd393b8be14f..cad2abc1730f 100644
--- a/arch/powerpc/mm/book3s64/Makefile
+++ b/arch/powerpc/mm/book3s64/Makefile
@@ -2,22 +2,34 @@
ccflags-y := $(NO_MINIMAL_TOC)
+obj-y += mmu_context.o pgtable.o trace.o
+ifdef CONFIG_PPC_64S_HASH_MMU
CFLAGS_REMOVE_slb.o = $(CC_FLAGS_FTRACE)
-
-obj-y += hash_pgtable.o hash_utils.o slb.o \
- mmu_context.o pgtable.o hash_tlb.o
-obj-$(CONFIG_PPC_NATIVE) += hash_native.o
-obj-$(CONFIG_PPC_RADIX_MMU) += radix_pgtable.o radix_tlb.o
+obj-y += hash_pgtable.o hash_utils.o hash_tlb.o slb.o slice.o
+obj-$(CONFIG_PPC_HASH_MMU_NATIVE) += hash_native.o
obj-$(CONFIG_PPC_4K_PAGES) += hash_4k.o
obj-$(CONFIG_PPC_64K_PAGES) += hash_64k.o
-obj-$(CONFIG_HUGETLB_PAGE) += hash_hugetlbpage.o
+obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hash_hugepage.o
+obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage_prot.o
+endif
+
+obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
+
+obj-$(CONFIG_PPC_RADIX_MMU) += radix_pgtable.o radix_tlb.o
ifdef CONFIG_HUGETLB_PAGE
obj-$(CONFIG_PPC_RADIX_MMU) += radix_hugetlbpage.o
endif
-obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hash_hugepage.o
-obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage_prot.o
obj-$(CONFIG_SPAPR_TCE_IOMMU) += iommu_api.o
-obj-$(CONFIG_PPC_MEM_KEYS) += pkeys.o
+obj-$(CONFIG_PPC_PKEY) += pkeys.o
# Instrumenting the SLB fault path can lead to duplicate SLB entries
KCOV_INSTRUMENT_slb.o := n
+
+# Parts of these can run in real mode and therefore are
+# not safe with the current outline KASAN implementation
+KASAN_SANITIZE_mmu_context.o := n
+KASAN_SANITIZE_pgtable.o := n
+KASAN_SANITIZE_radix_pgtable.o := n
+KASAN_SANITIZE_radix_tlb.o := n
+KASAN_SANITIZE_slb.o := n
+KASAN_SANITIZE_pkeys.o := n
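The KASAN exclusions above are per object file, because parts of these files can run with translation off. For comparison only, a single function can opt out of KASAN with the generic __no_sanitize_address annotation; this is an illustrative sketch with a made-up function name, not something this patch adds:

#include <linux/compiler_types.h>

/*
 * Illustrative only: per-function KASAN opt-out, versus the per-file
 * KASAN_SANITIZE_foo.o := n rules used in the Makefile above.
 */
static __no_sanitize_address void real_mode_helper(void)
{
	/* body that may execute with data translation (MSR[DR]) off */
}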
diff --git a/arch/powerpc/mm/book3s64/hash_4k.c b/arch/powerpc/mm/book3s64/hash_4k.c
index 22e787123cdf..02acbfd05b46 100644
--- a/arch/powerpc/mm/book3s64/hash_4k.c
+++ b/arch/powerpc/mm/book3s64/hash_4k.c
@@ -16,6 +16,8 @@
#include <asm/machdep.h>
#include <asm/mmu.h>
+#include "internal.h"
+
int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
pte_t *ptep, unsigned long trap, unsigned long flags,
int ssize, int subpg_prot)
@@ -54,7 +56,7 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
* PP bits. _PAGE_USER is already PP bit 0x2, so we only
* need to add in 0x1 if it's a read-only user page
*/
- rflags = htab_convert_pte_flags(new_pte);
+ rflags = htab_convert_pte_flags(new_pte, flags);
rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE);
if (cpu_has_feature(CPU_FTR_NOEXECUTE) &&
@@ -118,6 +120,9 @@ repeat:
}
new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE);
+
+ if (stress_hpt())
+ hpt_do_stress(ea, hpte_group);
}
*ptep = __pte(new_pte & ~H_PAGE_BUSY);
return 0;
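The stress_hpt()/hpt_do_stress() hooks come from the new book3s64/internal.h, whose contents are not shown in this diff. A minimal sketch of the shape those declarations presumably take, assuming stress_hpt() simply wraps the stress_hpt_key static key defined later in hash_utils.c:

#include <linux/jump_label.h>

DECLARE_STATIC_KEY_FALSE(stress_hpt_key);

/*
 * Assumed helper: a cheap, patched-out test unless "stress_hpt" was
 * given on the kernel command line.
 */
static inline bool stress_hpt(void)
{
	return static_branch_unlikely(&stress_hpt_key);
}

void hpt_do_stress(unsigned long ea, unsigned long hpte_group);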
diff --git a/arch/powerpc/mm/book3s64/hash_64k.c b/arch/powerpc/mm/book3s64/hash_64k.c
index 7084ce2951e6..954af420f358 100644
--- a/arch/powerpc/mm/book3s64/hash_64k.c
+++ b/arch/powerpc/mm/book3s64/hash_64k.c
@@ -16,6 +16,8 @@
#include <asm/machdep.h>
#include <asm/mmu.h>
+#include "internal.h"
+
/*
* Return true, if the entry has a slot value which
* the software considers as invalid.
@@ -72,7 +74,7 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
* Handle the subpage protection bits
*/
subpg_pte = new_pte & ~subpg_prot;
- rflags = htab_convert_pte_flags(subpg_pte);
+ rflags = htab_convert_pte_flags(subpg_pte, flags);
if (cpu_has_feature(CPU_FTR_NOEXECUTE) &&
!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) {
@@ -216,6 +218,9 @@ repeat:
new_pte |= pte_set_hidx(ptep, rpte, subpg_index, slot, PTRS_PER_PTE);
new_pte |= H_PAGE_HASHPTE;
+ if (stress_hpt())
+ hpt_do_stress(ea, hpte_group);
+
*ptep = __pte(new_pte & ~H_PAGE_BUSY);
return 0;
}
@@ -260,7 +265,7 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
new_pte |= _PAGE_DIRTY;
} while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
- rflags = htab_convert_pte_flags(new_pte);
+ rflags = htab_convert_pte_flags(new_pte, flags);
rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE);
if (cpu_has_feature(CPU_FTR_NOEXECUTE) &&
@@ -327,7 +332,12 @@ repeat:
new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE);
+
+ if (stress_hpt())
+ hpt_do_stress(ea, hpte_group);
}
+
*ptep = __pte(new_pte & ~H_PAGE_BUSY);
+
return 0;
}
diff --git a/arch/powerpc/mm/book3s64/hash_hugepage.c b/arch/powerpc/mm/book3s64/hash_hugepage.c
index 440823797de7..c0fabe6c5a12 100644
--- a/arch/powerpc/mm/book3s64/hash_hugepage.c
+++ b/arch/powerpc/mm/book3s64/hash_hugepage.c
@@ -57,7 +57,7 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
if (!(old_pmd & (H_PAGE_THP_HUGE | _PAGE_DEVMAP)))
return 0;
- rflags = htab_convert_pte_flags(new_pmd);
+ rflags = htab_convert_pte_flags(new_pmd, flags);
#if 0
if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) {
diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c
index d2d8237ea9d5..430d1d935a7c 100644
--- a/arch/powerpc/mm/book3s64/hash_native.c
+++ b/arch/powerpc/mm/book3s64/hash_native.c
@@ -14,11 +14,11 @@
#include <linux/processor.h>
#include <linux/threads.h>
#include <linux/smp.h>
+#include <linux/pgtable.h>
#include <asm/machdep.h>
#include <asm/mmu.h>
#include <asm/mmu_context.h>
-#include <asm/pgtable.h>
#include <asm/trace.h>
#include <asm/tlb.h>
#include <asm/cputable.h>
@@ -43,102 +43,28 @@
static DEFINE_RAW_SPINLOCK(native_tlbie_lock);
-static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is)
-{
- unsigned long rb;
-
- rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
-
- asm volatile("tlbiel %0" : : "r" (rb));
-}
+#ifdef CONFIG_LOCKDEP
+static struct lockdep_map hpte_lock_map =
+ STATIC_LOCKDEP_MAP_INIT("hpte_lock", &hpte_lock_map);
-/*
- * tlbiel instruction for hash, set invalidation
- * i.e., r=1 and is=01 or is=10 or is=11
- */
-static __always_inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is,
- unsigned int pid,
- unsigned int ric, unsigned int prs)
+static void acquire_hpte_lock(void)
{
- unsigned long rb;
- unsigned long rs;
- unsigned int r = 0; /* hash format */
-
- rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
- rs = ((unsigned long)pid << PPC_BITLSHIFT(31));
-
- asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4)
- : : "r"(rb), "r"(rs), "i"(ric), "i"(prs), "r"(r)
- : "memory");
+ lock_map_acquire(&hpte_lock_map);
}
-
-static void tlbiel_all_isa206(unsigned int num_sets, unsigned int is)
+static void release_hpte_lock(void)
{
- unsigned int set;
-
- asm volatile("ptesync": : :"memory");
-
- for (set = 0; set < num_sets; set++)
- tlbiel_hash_set_isa206(set, is);
-
- asm volatile("ptesync": : :"memory");
+ lock_map_release(&hpte_lock_map);
}
-
-static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is)
+#else
+static void acquire_hpte_lock(void)
{
- unsigned int set;
-
- asm volatile("ptesync": : :"memory");
-
- /*
- * Flush the first set of the TLB, and any caching of partition table
- * entries. Then flush the remaining sets of the TLB. Hash mode uses
- * partition scoped TLB translations.
- */
- tlbiel_hash_set_isa300(0, is, 0, 2, 0);
- for (set = 1; set < num_sets; set++)
- tlbiel_hash_set_isa300(set, is, 0, 0, 0);
-
- /*
- * Now invalidate the process table cache.
- *
- * From ISA v3.0B p. 1078:
- * The following forms are invalid.
- * * PRS=1, R=0, and RIC!=2 (The only process-scoped
- * HPT caching is of the Process Table.)
- */
- tlbiel_hash_set_isa300(0, is, 0, 2, 1);
-
- asm volatile("ptesync": : :"memory");
-
- asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT "; isync" : : :"memory");
}
-void hash__tlbiel_all(unsigned int action)
+static void release_hpte_lock(void)
{
- unsigned int is;
-
- switch (action) {
- case TLB_INVAL_SCOPE_GLOBAL:
- is = 3;
- break;
- case TLB_INVAL_SCOPE_LPID:
- is = 2;
- break;
- default:
- BUG();
- }
-
- if (early_cpu_has_feature(CPU_FTR_ARCH_300))
- tlbiel_all_isa300(POWER9_TLB_SETS_HASH, is);
- else if (early_cpu_has_feature(CPU_FTR_ARCH_207S))
- tlbiel_all_isa206(POWER8_TLB_SETS, is);
- else if (early_cpu_has_feature(CPU_FTR_ARCH_206))
- tlbiel_all_isa206(POWER7_TLB_SETS, is);
- else
- WARN(1, "%s called on pre-POWER7 CPU\n", __func__);
}
+#endif
static inline unsigned long ___tlbie(unsigned long vpn, int psize,
int apsize, int ssize)
@@ -260,7 +186,7 @@ static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize)
va |= ssize << 8;
sllp = get_sllp_encoding(apsize);
va |= sllp << 5;
- asm volatile(ASM_FTR_IFSET("tlbiel %0", "tlbiel %0,0", %1)
+ asm volatile(ASM_FTR_IFSET("tlbiel %0", PPC_TLBIEL_v205(%0, 0), %1)
: : "r" (va), "i" (CPU_FTR_ARCH_206)
: "memory");
break;
@@ -279,7 +205,7 @@ static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize)
*/
va |= (vpn & 0xfe);
va |= 1; /* L */
- asm volatile(ASM_FTR_IFSET("tlbiel %0", "tlbiel %0,1", %1)
+ asm volatile(ASM_FTR_IFSET("tlbiel %0", PPC_TLBIEL_v205(%0, 1), %1)
: : "r" (va), "i" (CPU_FTR_ARCH_206)
: "memory");
break;
@@ -303,7 +229,7 @@ static inline void tlbie(unsigned long vpn, int psize, int apsize,
asm volatile("ptesync": : :"memory");
if (use_local) {
__tlbiel(vpn, psize, apsize, ssize);
- asm volatile("ptesync": : :"memory");
+ ppc_after_tlbiel_barrier();
} else {
__tlbie(vpn, psize, apsize, ssize);
fixup_tlbie_vpn(vpn, psize, apsize, ssize);
@@ -317,6 +243,7 @@ static inline void native_lock_hpte(struct hash_pte *hptep)
{
unsigned long *word = (unsigned long *)&hptep->v;
+ acquire_hpte_lock();
while (1) {
if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word))
break;
@@ -331,6 +258,7 @@ static inline void native_unlock_hpte(struct hash_pte *hptep)
{
unsigned long *word = (unsigned long *)&hptep->v;
+ release_hpte_lock();
clear_bit_unlock(HPTE_LOCK_BIT, word);
}
@@ -340,8 +268,11 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
{
struct hash_pte *hptep = htab_address + hpte_group;
unsigned long hpte_v, hpte_r;
+ unsigned long flags;
int i;
+ local_irq_save(flags);
+
if (!(vflags & HPTE_V_BOLTED)) {
DBG_LOW(" insert(group=%lx, vpn=%016lx, pa=%016lx,"
" rflags=%lx, vflags=%lx, psize=%d)\n",
@@ -360,8 +291,10 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
hptep++;
}
- if (i == HPTES_PER_GROUP)
+ if (i == HPTES_PER_GROUP) {
+ local_irq_restore(flags);
return -1;
+ }
hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
@@ -383,19 +316,24 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
* Now set the first dword including the valid bit
* NOTE: this also unlocks the hpte
*/
+ release_hpte_lock();
hptep->v = cpu_to_be64(hpte_v);
__asm__ __volatile__ ("ptesync" : : : "memory");
+ local_irq_restore(flags);
+
return i | (!!(vflags & HPTE_V_SECONDARY) << 3);
}
static long native_hpte_remove(unsigned long hpte_group)
{
+ unsigned long hpte_v, flags;
struct hash_pte *hptep;
int i;
int slot_offset;
- unsigned long hpte_v;
+
+ local_irq_save(flags);
DBG_LOW(" remove(group=%lx)\n", hpte_group);
@@ -420,12 +358,16 @@ static long native_hpte_remove(unsigned long hpte_group)
slot_offset &= 0x7;
}
- if (i == HPTES_PER_GROUP)
- return -1;
+ if (i == HPTES_PER_GROUP) {
+ i = -1;
+ goto out;
+ }
/* Invalidate the hpte. NOTE: this also unlocks it */
+ release_hpte_lock();
hptep->v = 0;
-
+out:
+ local_irq_restore(flags);
return i;
}
@@ -436,6 +378,9 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
struct hash_pte *hptep = htab_address + slot;
unsigned long hpte_v, want_v;
int ret = 0, local = 0;
+ unsigned long irqflags;
+
+ local_irq_save(irqflags);
want_v = hpte_encode_avpn(vpn, bpsize, ssize);
@@ -479,6 +424,8 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
if (!(flags & HPTE_NOHPTE_UPDATE))
tlbie(vpn, bpsize, apsize, ssize, local);
+ local_irq_restore(irqflags);
+
return ret;
}
@@ -542,6 +489,9 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
unsigned long vsid;
long slot;
struct hash_pte *hptep;
+ unsigned long flags;
+
+ local_irq_save(flags);
vsid = get_kernel_vsid(ea, ssize);
vpn = hpt_vpn(ea, vsid, ssize);
@@ -560,6 +510,8 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
* actual page size will be same.
*/
tlbie(vpn, psize, psize, ssize, 0);
+
+ local_irq_restore(flags);
}
/*
@@ -573,6 +525,9 @@ static int native_hpte_removebolted(unsigned long ea, int psize, int ssize)
unsigned long vsid;
long slot;
struct hash_pte *hptep;
+ unsigned long flags;
+
+ local_irq_save(flags);
vsid = get_kernel_vsid(ea, ssize);
vpn = hpt_vpn(ea, vsid, ssize);
@@ -590,6 +545,9 @@ static int native_hpte_removebolted(unsigned long ea, int psize, int ssize)
/* Invalidate the TLB */
tlbie(vpn, psize, psize, ssize, 0);
+
+ local_irq_restore(flags);
+
return 0;
}
@@ -614,10 +572,11 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
/* recheck with locks held */
hpte_v = hpte_get_old_v(hptep);
- if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID))
+ if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) {
/* Invalidate the hpte. NOTE: this also unlocks it */
+ release_hpte_lock();
hptep->v = 0;
- else
+ } else
native_unlock_hpte(hptep);
}
/*
@@ -677,10 +636,8 @@ static void native_hugepage_invalidate(unsigned long vsid,
hpte_v = hpte_get_old_v(hptep);
if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) {
- /*
- * Invalidate the hpte. NOTE: this also unlocks it
- */
-
+ /* Invalidate the hpte. NOTE: this also unlocks it */
+ release_hpte_lock();
hptep->v = 0;
} else
native_unlock_hpte(hptep);
@@ -780,7 +737,7 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
* TODO: add batching support when enabled. remember, no dynamic memory here,
* although there is the control page available...
*/
-static void native_hpte_clear(void)
+static notrace void native_hpte_clear(void)
{
unsigned long vpn = 0;
unsigned long slot, slots;
@@ -862,8 +819,10 @@ static void native_flush_hash_range(unsigned long number, int local)
if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
native_unlock_hpte(hptep);
- else
+ else {
+ release_hpte_lock();
hptep->v = 0;
+ }
} pte_iterate_hashed_end();
}
@@ -879,7 +838,7 @@ static void native_flush_hash_range(unsigned long number, int local)
__tlbiel(vpn, psize, psize, ssize);
} pte_iterate_hashed_end();
}
- asm volatile("ptesync":::"memory");
+ ppc_after_tlbiel_barrier();
} else {
int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
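The hpte_lock_map hunks above teach lockdep about the HPTE_LOCK_BIT bit-lock without changing the lock itself: the acquire is recorded before spinning on the bit, and the release is recorded just before the store that actually drops the lock (clear_bit_unlock(), or the hptep->v write that unlocks implicitly). A self-contained sketch of the same pattern on a generic bit-lock, using made-up demo_* names:

#include <linux/lockdep.h>
#include <linux/bitops.h>
#include <asm/processor.h>

#ifdef CONFIG_LOCKDEP
static struct lockdep_map demo_bitlock_map =
	STATIC_LOCKDEP_MAP_INIT("demo_bitlock", &demo_bitlock_map);
#define demo_lockdep_acquire()	lock_map_acquire(&demo_bitlock_map)
#define demo_lockdep_release()	lock_map_release(&demo_bitlock_map)
#else
#define demo_lockdep_acquire()	do { } while (0)
#define demo_lockdep_release()	do { } while (0)
#endif

static unsigned long demo_word;

static void demo_bit_lock(void)
{
	demo_lockdep_acquire();		/* record the acquisition for lockdep */
	while (test_and_set_bit_lock(0, &demo_word))
		cpu_relax();
}

static void demo_bit_unlock(void)
{
	demo_lockdep_release();		/* record release before the store that drops the bit */
	clear_bit_unlock(0, &demo_word);
}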
diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c
index 64733b9cb20a..988948d69bc1 100644
--- a/arch/powerpc/mm/book3s64/hash_pgtable.c
+++ b/arch/powerpc/mm/book3s64/hash_pgtable.c
@@ -8,16 +8,15 @@
#include <linux/sched.h>
#include <linux/mm_types.h>
#include <linux/mm.h>
+#include <linux/stop_machine.h>
-#include <asm/pgalloc.h>
-#include <asm/pgtable.h>
#include <asm/sections.h>
#include <asm/mmu.h>
#include <asm/tlb.h>
+#include <asm/firmware.h>
#include <mm/mmu_decl.h>
-#define CREATE_TRACE_POINTS
#include <trace/events/thp.h>
#if H_PGTABLE_RANGE > (USER_VSID_RANGE * (TASK_SIZE_USER64 / TASK_CONTEXT_SIZE))
@@ -148,6 +147,7 @@ void hash__vmemmap_remove_mapping(unsigned long start,
int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
{
pgd_t *pgdp;
+ p4d_t *p4dp;
pud_t *pudp;
pmd_t *pmdp;
pte_t *ptep;
@@ -155,7 +155,8 @@ int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE);
if (slab_is_available()) {
pgdp = pgd_offset_k(ea);
- pudp = pud_alloc(&init_mm, pgdp, ea);
+ p4dp = p4d_offset(pgdp, ea);
+ pudp = pud_alloc(&init_mm, p4dp, ea);
if (!pudp)
return -ENOMEM;
pmdp = pmd_alloc(&init_mm, pudp, ea);
@@ -213,7 +214,7 @@ unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr
old = be64_to_cpu(old_be);
- trace_hugepage_update(addr, old, clr, set);
+ trace_hugepage_update_pmd(addr, old, clr, set);
if (old & H_PAGE_HASHPTE)
hpte_do_hugepage_flush(mm, addr, pmdp, old);
return old;
@@ -236,7 +237,7 @@ pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addres
* to hugepage, we first clear the pmd, then invalidate all
* the PTE entries. The assumption here is that any low level
* page fault will see a none pmd and take the slow path that
- * will wait on mmap_sem. But we could very well be in a
+ * will wait on mmap_lock. But we could very well be in a
* hash_page with local ptep pointer value. Such a hash page
* can result in adding new HPTE entries for normal subpages.
* That means we could be modifying the page content as we
@@ -250,12 +251,12 @@ pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addres
* Now invalidate the hpte entries in the range
* covered by pmd. This make sure we take a
* fault and will find the pmd as none, which will
- * result in a major fault which takes mmap_sem and
+ * result in a major fault which takes mmap_lock and
* hence wait for collapse to complete. Without this
* the __collapse_huge_page_copy can result in copying
* the old content.
*/
- flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
+ flush_hash_table_pmd_range(vma->vm_mm, &pmd, address);
return pmd;
}
@@ -363,17 +364,6 @@ pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
* hash fault look at them.
*/
memset(pgtable, 0, PTE_FRAG_SIZE);
- /*
- * Serialize against find_current_mm_pte variants which does lock-less
- * lookup in page tables with local interrupts disabled. For huge pages
- * it casts pmd_t to pte_t. Since format of pte_t is different from
- * pmd_t we want to prevent transit from pmd pointing to page table
- * to pmd pointing to huge page (and back) while interrupts are disabled.
- * We clear pmd to possibly replace it with page table pointer in
- * different code paths. So make sure we wait for the parallel
- * find_curren_mm_pte to finish.
- */
- serialize_against_pte_lookup(mm);
return old_pmd;
}
@@ -388,7 +378,7 @@ int hash__has_transparent_hugepage(void)
if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
return 0;
/*
- * We need to make sure that we support 16MB hugepage in a segement
+ * We need to make sure that we support 16MB hugepage in a segment
* with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
* of 64K.
*/
@@ -411,10 +401,105 @@ EXPORT_SYMBOL_GPL(hash__has_transparent_hugepage);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#ifdef CONFIG_STRICT_KERNEL_RWX
+
+struct change_memory_parms {
+ unsigned long start, end, newpp;
+ unsigned int step, nr_cpus;
+ atomic_t master_cpu;
+ atomic_t cpu_counter;
+};
+
+// We'd rather this was on the stack but it has to be in the RMO
+static struct change_memory_parms chmem_parms;
+
+// And therefore we need a lock to protect it from concurrent use
+static DEFINE_MUTEX(chmem_lock);
+
+static void change_memory_range(unsigned long start, unsigned long end,
+ unsigned int step, unsigned long newpp)
+{
+ unsigned long idx;
+
+ pr_debug("Changing page protection on range 0x%lx-0x%lx, to 0x%lx, step 0x%x\n",
+ start, end, newpp, step);
+
+ for (idx = start; idx < end; idx += step)
+ /* Not sure if we can do much with the return value */
+ mmu_hash_ops.hpte_updateboltedpp(newpp, idx, mmu_linear_psize,
+ mmu_kernel_ssize);
+}
+
+static int notrace chmem_secondary_loop(struct change_memory_parms *parms)
+{
+ unsigned long msr, tmp, flags;
+ int *p;
+
+ p = &parms->cpu_counter.counter;
+
+ local_irq_save(flags);
+ hard_irq_disable();
+
+ asm volatile (
+ // Switch to real mode and leave interrupts off
+ "mfmsr %[msr] ;"
+ "li %[tmp], %[MSR_IR_DR] ;"
+ "andc %[tmp], %[msr], %[tmp] ;"
+ "mtmsrd %[tmp] ;"
+
+ // Tell the master we are in real mode
+ "1: "
+ "lwarx %[tmp], 0, %[p] ;"
+ "addic %[tmp], %[tmp], -1 ;"
+ "stwcx. %[tmp], 0, %[p] ;"
+ "bne- 1b ;"
+
+ // Spin until the counter goes to zero
+ "2: ;"
+ "lwz %[tmp], 0(%[p]) ;"
+ "cmpwi %[tmp], 0 ;"
+ "bne- 2b ;"
+
+ // Switch back to virtual mode
+ "mtmsrd %[msr] ;"
+
+ : // outputs
+ [msr] "=&r" (msr), [tmp] "=&b" (tmp), "+m" (*p)
+ : // inputs
+ [p] "b" (p), [MSR_IR_DR] "i" (MSR_IR | MSR_DR)
+ : // clobbers
+ "cc", "xer"
+ );
+
+ local_irq_restore(flags);
+
+ return 0;
+}
+
+static int change_memory_range_fn(void *data)
+{
+ struct change_memory_parms *parms = data;
+
+ // First CPU goes through, all others wait.
+ if (atomic_xchg(&parms->master_cpu, 1) == 1)
+ return chmem_secondary_loop(parms);
+
+ // Wait for all but one CPU (this one) to call-in
+ while (atomic_read(&parms->cpu_counter) > 1)
+ barrier();
+
+ change_memory_range(parms->start, parms->end, parms->step, parms->newpp);
+
+ mb();
+
+ // Signal the other CPUs that we're done
+ atomic_dec(&parms->cpu_counter);
+
+ return 0;
+}
+
static bool hash__change_memory_range(unsigned long start, unsigned long end,
unsigned long newpp)
{
- unsigned long idx;
unsigned int step, shift;
shift = mmu_psize_defs[mmu_linear_psize].shift;
@@ -426,25 +511,43 @@ static bool hash__change_memory_range(unsigned long start, unsigned long end,
if (start >= end)
return false;
- pr_debug("Changing page protection on range 0x%lx-0x%lx, to 0x%lx, step 0x%x\n",
- start, end, newpp, step);
+ if (firmware_has_feature(FW_FEATURE_LPAR)) {
+ mutex_lock(&chmem_lock);
- for (idx = start; idx < end; idx += step)
- /* Not sure if we can do much with the return value */
- mmu_hash_ops.hpte_updateboltedpp(newpp, idx, mmu_linear_psize,
- mmu_kernel_ssize);
+ chmem_parms.start = start;
+ chmem_parms.end = end;
+ chmem_parms.step = step;
+ chmem_parms.newpp = newpp;
+ atomic_set(&chmem_parms.master_cpu, 0);
+
+ cpus_read_lock();
+
+ atomic_set(&chmem_parms.cpu_counter, num_online_cpus());
+
+ // Ensure state is consistent before we call the other CPUs
+ mb();
+
+ stop_machine_cpuslocked(change_memory_range_fn, &chmem_parms,
+ cpu_online_mask);
+
+ cpus_read_unlock();
+ mutex_unlock(&chmem_lock);
+ } else
+ change_memory_range(start, end, step, newpp);
return true;
}
void hash__mark_rodata_ro(void)
{
- unsigned long start, end;
+ unsigned long start, end, pp;
start = (unsigned long)_stext;
- end = (unsigned long)__init_begin;
+ end = (unsigned long)__end_rodata;
+
+ pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL_ROX), HPTE_USE_KERNEL_KEY);
- WARN_ON(!hash__change_memory_range(start, end, PP_RXXX));
+ WARN_ON(!hash__change_memory_range(start, end, pp));
}
void hash__mark_initmem_nx(void)
@@ -454,7 +557,7 @@ void hash__mark_initmem_nx(void)
start = (unsigned long)__init_begin;
end = (unsigned long)__init_end;
- pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL));
+ pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL), HPTE_USE_KERNEL_KEY);
WARN_ON(!hash__change_memory_range(start, end, pp));
}
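On LPAR, the bolted-HPTE re-protection above parks every other CPU in real mode under stop_machine() while one CPU does the work. Stripped of the real-mode MSR switching done in assembly by chmem_secondary_loop(), the rendezvous protocol reduces to the following sketch; it reuses the change_memory_parms fields and change_memory_range() from the hunks above, and is illustrative only:

/*
 * Sketch of the stop_machine() callback's counter protocol. cpu_counter
 * starts at num_online_cpus(); each secondary decrements it and spins,
 * the master waits until only itself remains, does the work, then
 * decrements to zero to release the secondaries.
 */
static int demo_change_memory_fn(void *data)
{
	struct change_memory_parms *parms = data;

	if (atomic_xchg(&parms->master_cpu, 1) == 1) {
		/* Secondary: check in, then wait for the master to finish. */
		atomic_dec(&parms->cpu_counter);
		while (atomic_read(&parms->cpu_counter))
			cpu_relax();
		return 0;
	}

	/* Master: wait until every other CPU has checked in ... */
	while (atomic_read(&parms->cpu_counter) > 1)
		cpu_relax();

	/* ... update the bolted HPTEs, then release the secondaries. */
	change_memory_range(parms->start, parms->end, parms->step, parms->newpp);
	atomic_dec(&parms->cpu_counter);

	return 0;
}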
diff --git a/arch/powerpc/mm/book3s64/hash_tlb.c b/arch/powerpc/mm/book3s64/hash_tlb.c
index 4a70d8dd39cd..21fcad97ae80 100644
--- a/arch/powerpc/mm/book3s64/hash_tlb.c
+++ b/arch/powerpc/mm/book3s64/hash_tlb.c
@@ -21,7 +21,6 @@
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
-#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <asm/bug.h>
@@ -176,7 +175,6 @@ void hash__tlb_flush(struct mmu_gather *tlb)
* from the hash table (and the TLB). But keeps
* the linux PTEs intact.
*
- * @mm : mm_struct of the target address space (generally init_mm)
* @start : starting address
* @end : ending address (not included in the flush)
*
@@ -189,17 +187,14 @@ void hash__tlb_flush(struct mmu_gather *tlb)
* Because of that usage pattern, it is implemented for small size rather
* than speed.
*/
-void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
- unsigned long end)
+void __flush_hash_table_range(unsigned long start, unsigned long end)
{
- bool is_thp;
int hugepage_shift;
unsigned long flags;
- start = _ALIGN_DOWN(start, PAGE_SIZE);
- end = _ALIGN_UP(end, PAGE_SIZE);
+ start = ALIGN_DOWN(start, PAGE_SIZE);
+ end = ALIGN(end, PAGE_SIZE);
- BUG_ON(!mm->pgd);
/*
* Note: Normally, we should only ever use a batch within a
@@ -212,33 +207,27 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
local_irq_save(flags);
arch_enter_lazy_mmu_mode();
for (; start < end; start += PAGE_SIZE) {
- pte_t *ptep = find_current_mm_pte(mm->pgd, start, &is_thp,
- &hugepage_shift);
+ pte_t *ptep = find_init_mm_pte(start, &hugepage_shift);
unsigned long pte;
if (ptep == NULL)
continue;
pte = pte_val(*ptep);
- if (is_thp)
- trace_hugepage_invalidate(start, pte);
if (!(pte & H_PAGE_HASHPTE))
continue;
- if (unlikely(is_thp))
- hpte_do_hugepage_flush(mm, start, (pmd_t *)ptep, pte);
- else
- hpte_need_flush(mm, start, ptep, pte, hugepage_shift);
+ hpte_need_flush(&init_mm, start, ptep, pte, hugepage_shift);
}
arch_leave_lazy_mmu_mode();
local_irq_restore(flags);
}
-void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
+void flush_hash_table_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
{
pte_t *pte;
pte_t *start_pte;
unsigned long flags;
- addr = _ALIGN_DOWN(addr, PMD_SIZE);
+ addr = ALIGN_DOWN(addr, PMD_SIZE);
/*
* Note: Normally, we should only ever use a batch within a
* PTE locked section. This violates the rule, but will work
@@ -250,12 +239,16 @@ void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
local_irq_save(flags);
arch_enter_lazy_mmu_mode();
start_pte = pte_offset_map(pmd, addr);
+ if (!start_pte)
+ goto out;
for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) {
unsigned long pteval = pte_val(*pte);
if (pteval & H_PAGE_HASHPTE)
hpte_need_flush(mm, addr, pte, pteval, 0);
addr += PAGE_SIZE;
}
+ pte_unmap(start_pte);
+out:
arch_leave_lazy_mmu_mode();
local_irq_restore(flags);
}
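Note the new error handling in flush_hash_table_pmd_range(): pte_offset_map() may return NULL if the PMD changed underneath the walker, and every successful map must be balanced with pte_unmap(). The same calling pattern as a standalone sketch with generic names (locking and lazy-MMU setup omitted):

static void demo_walk_pmd(pmd_t *pmd, unsigned long addr)
{
	pte_t *start_pte, *pte;

	start_pte = pte_offset_map(pmd, addr);
	if (!start_pte)			/* PMD changed under us: nothing to walk */
		return;

	for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++, addr += PAGE_SIZE) {
		/* inspect pte_val(*pte) / flush as needed */
	}

	pte_unmap(start_pte);		/* balance pte_offset_map() */
}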
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index b30435c7d804..0626a25b0d72 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -7,7 +7,7 @@
*
* SMP scalability work:
* Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
- *
+ *
* Module name: htab.c
*
* Description:
@@ -35,17 +35,20 @@
#include <linux/pkeys.h>
#include <linux/hugetlb.h>
#include <linux/cpu.h>
+#include <linux/pgtable.h>
+#include <linux/debugfs.h>
+#include <linux/random.h>
+#include <linux/elf-randomize.h>
+#include <linux/of_fdt.h>
-#include <asm/debugfs.h>
+#include <asm/interrupt.h>
#include <asm/processor.h>
-#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/mmu_context.h>
#include <asm/page.h>
#include <asm/types.h>
#include <linux/uaccess.h>
#include <asm/machdep.h>
-#include <asm/prom.h>
#include <asm/io.h>
#include <asm/eeh.h>
#include <asm/tlb.h>
@@ -66,6 +69,9 @@
#include <mm/mmu_decl.h>
+#include "internal.h"
+
+
#ifdef DEBUG
#define DBG(fmt...) udbg_printf(fmt)
#else
@@ -95,8 +101,6 @@
*/
static unsigned long _SDR1;
-struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
-EXPORT_SYMBOL_GPL(mmu_psize_defs);
u8 hpte_page_sizes[1 << LP_BITS];
EXPORT_SYMBOL_GPL(hpte_page_sizes);
@@ -109,9 +113,7 @@ int mmu_linear_psize = MMU_PAGE_4K;
EXPORT_SYMBOL_GPL(mmu_linear_psize);
int mmu_virtual_psize = MMU_PAGE_4K;
int mmu_vmalloc_psize = MMU_PAGE_4K;
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-int mmu_vmemmap_psize = MMU_PAGE_4K;
-#endif
+EXPORT_SYMBOL_GPL(mmu_vmalloc_psize);
int mmu_io_psize = MMU_PAGE_4K;
int mmu_kernel_ssize = MMU_SEGSIZE_256M;
EXPORT_SYMBOL_GPL(mmu_kernel_ssize);
@@ -121,11 +123,8 @@ EXPORT_SYMBOL_GPL(mmu_slb_size);
#ifdef CONFIG_PPC_64K_PAGES
int mmu_ci_restrictions;
#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
static u8 *linear_map_hash_slots;
static unsigned long linear_map_hash_count;
-static DEFINE_SPINLOCK(linear_map_hash_lock);
-#endif /* CONFIG_DEBUG_PAGEALLOC */
struct mmu_hash_ops mmu_hash_ops;
EXPORT_SYMBOL(mmu_hash_ops);
@@ -170,6 +169,110 @@ static struct mmu_psize_def mmu_psize_defaults_gp[] = {
},
};
+static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is)
+{
+ unsigned long rb;
+
+ rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
+
+ asm volatile("tlbiel %0" : : "r" (rb));
+}
+
+/*
+ * tlbiel instruction for hash, set invalidation
+ * i.e., r=1 and is=01 or is=10 or is=11
+ */
+static __always_inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is,
+ unsigned int pid,
+ unsigned int ric, unsigned int prs)
+{
+ unsigned long rb;
+ unsigned long rs;
+ unsigned int r = 0; /* hash format */
+
+ rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
+ rs = ((unsigned long)pid << PPC_BITLSHIFT(31));
+
+ asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4)
+ : : "r"(rb), "r"(rs), "i"(ric), "i"(prs), "i"(r)
+ : "memory");
+}
+
+
+static void tlbiel_all_isa206(unsigned int num_sets, unsigned int is)
+{
+ unsigned int set;
+
+ asm volatile("ptesync": : :"memory");
+
+ for (set = 0; set < num_sets; set++)
+ tlbiel_hash_set_isa206(set, is);
+
+ ppc_after_tlbiel_barrier();
+}
+
+static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is)
+{
+ unsigned int set;
+
+ asm volatile("ptesync": : :"memory");
+
+ /*
+ * Flush the partition table cache if this is HV mode.
+ */
+ if (early_cpu_has_feature(CPU_FTR_HVMODE))
+ tlbiel_hash_set_isa300(0, is, 0, 2, 0);
+
+ /*
+ * Now invalidate the process table cache. UPRT=0 HPT modes (what
+ * current hardware implements) do not use the process table, but
+ * add the flushes anyway.
+ *
+ * From ISA v3.0B p. 1078:
+ * The following forms are invalid.
+ * * PRS=1, R=0, and RIC!=2 (The only process-scoped
+ * HPT caching is of the Process Table.)
+ */
+ tlbiel_hash_set_isa300(0, is, 0, 2, 1);
+
+ /*
+ * Then flush the sets of the TLB proper. Hash mode uses
+ * partition scoped TLB translations, which may be flushed
+ * in !HV mode.
+ */
+ for (set = 0; set < num_sets; set++)
+ tlbiel_hash_set_isa300(set, is, 0, 0, 0);
+
+ ppc_after_tlbiel_barrier();
+
+ asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT "; isync" : : :"memory");
+}
+
+void hash__tlbiel_all(unsigned int action)
+{
+ unsigned int is;
+
+ switch (action) {
+ case TLB_INVAL_SCOPE_GLOBAL:
+ is = 3;
+ break;
+ case TLB_INVAL_SCOPE_LPID:
+ is = 2;
+ break;
+ default:
+ BUG();
+ }
+
+ if (early_cpu_has_feature(CPU_FTR_ARCH_300))
+ tlbiel_all_isa300(POWER9_TLB_SETS_HASH, is);
+ else if (early_cpu_has_feature(CPU_FTR_ARCH_207S))
+ tlbiel_all_isa206(POWER8_TLB_SETS, is);
+ else if (early_cpu_has_feature(CPU_FTR_ARCH_206))
+ tlbiel_all_isa206(POWER7_TLB_SETS, is);
+ else
+ WARN(1, "%s called on pre-POWER7 CPU\n", __func__);
+}
+
/*
* 'R' and 'C' update notes:
* - Under pHyp or KVM, the updatepp path will not set C, thus it *will*
@@ -183,7 +286,7 @@ static struct mmu_psize_def mmu_psize_defaults_gp[] = {
* - We make sure R is always set and never lost
* - C is _PAGE_DIRTY, and *should* always be set for a writeable mapping
*/
-unsigned long htab_convert_pte_flags(unsigned long pteflags)
+unsigned long htab_convert_pte_flags(unsigned long pteflags, unsigned long flags)
{
unsigned long rflags = 0;
@@ -207,9 +310,16 @@ unsigned long htab_convert_pte_flags(unsigned long pteflags)
else
rflags |= 0x3;
}
+ VM_WARN_ONCE(!(pteflags & _PAGE_RWX), "no-access mapping request");
} else {
if (pteflags & _PAGE_RWX)
rflags |= 0x2;
+ /*
+ * We should never hit this in normal fault handling because
+ * a permission check (check_pte_access()) will bubble this
+ * to higher level linux handler even for PAGE_NONE.
+ */
+ VM_WARN_ONCE(!(pteflags & _PAGE_RWX), "no-access mapping request");
if (!((pteflags & _PAGE_WRITE) && (pteflags & _PAGE_DIRTY)))
rflags |= 0x1;
}
@@ -237,7 +347,7 @@ unsigned long htab_convert_pte_flags(unsigned long pteflags)
*/
rflags |= HPTE_R_M;
- rflags |= pte_to_hpte_pkey_bits(pteflags);
+ rflags |= pte_to_hpte_pkey_bits(pteflags, flags);
return rflags;
}
@@ -252,13 +362,17 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
shift = mmu_psize_defs[psize].shift;
step = 1 << shift;
- prot = htab_convert_pte_flags(prot);
+ prot = htab_convert_pte_flags(prot, HPTE_USE_KERNEL_KEY);
DBG("htab_bolt_mapping(%lx..%lx -> %lx (%lx,%d,%d)\n",
vstart, vend, pstart, prot, psize, ssize);
- for (vaddr = vstart, paddr = pstart; vaddr < vend;
- vaddr += step, paddr += step) {
+ /* Carefully map only the possible range */
+ vaddr = ALIGN(vstart, step);
+ paddr = ALIGN(pstart, step);
+ vend = ALIGN_DOWN(vend, step);
+
+ for (; vaddr < vend; vaddr += step, paddr += step) {
unsigned long hash, hpteg;
unsigned long vsid = get_kernel_vsid(vaddr, ssize);
unsigned long vpn = hpt_vpn(vaddr, vsid, ssize);
@@ -298,7 +412,7 @@ repeat:
ssize);
if (ret == -1) {
/*
- * Try to to keep bolted entries in primary.
+ * Try to keep bolted entries in primary.
* Remove non bolted entries and try insert again
*/
ret = mmu_hash_ops.hpte_remove(hpteg);
@@ -317,11 +431,9 @@ repeat:
break;
cond_resched();
-#ifdef CONFIG_DEBUG_PAGEALLOC
- if (debug_pagealloc_enabled() &&
+ if (debug_pagealloc_enabled_or_kfence() &&
(paddr >> PAGE_SHIFT) < linear_map_hash_count)
linear_map_hash_slots[paddr >> PAGE_SHIFT] = ret | 0x80;
-#endif /* CONFIG_DEBUG_PAGEALLOC */
}
return ret < 0 ? ret : 0;
}
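The "carefully map only the possible range" change clamps both ends of the request to whole linear-map pages rather than assuming aligned inputs. A small worked example with illustrative values, assuming a 16M linear page size:

/*
 * Worked example (illustrative values only), step = 16M = 0x1000000:
 *
 *   vstart = 0x1234000  ->  vaddr = ALIGN(vstart, step)     = 0x2000000
 *   pstart = 0x1234000  ->  paddr = ALIGN(pstart, step)     = 0x2000000
 *   vend   = 0x5678000  ->  vend  = ALIGN_DOWN(vend, step)  = 0x5000000
 *
 * so only the 16M blocks fully contained in [vstart, vend) are bolted;
 * the ragged edges are left to the caller (or to a smaller page size).
 */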
@@ -329,7 +441,7 @@ repeat:
int htab_remove_mapping(unsigned long vstart, unsigned long vend,
int psize, int ssize)
{
- unsigned long vaddr;
+ unsigned long vaddr, time_limit;
unsigned int step, shift;
int rc;
int ret = 0;
@@ -340,8 +452,21 @@ int htab_remove_mapping(unsigned long vstart, unsigned long vend,
if (!mmu_hash_ops.hpte_removebolted)
return -ENODEV;
- for (vaddr = vstart; vaddr < vend; vaddr += step) {
+ /* Unmap the full range specificied */
+ vaddr = ALIGN_DOWN(vstart, step);
+ time_limit = jiffies + HZ;
+
+ for (;vaddr < vend; vaddr += step) {
rc = mmu_hash_ops.hpte_removebolted(vaddr, psize, ssize);
+
+ /*
+ * For large number of mappings introduce a cond_resched()
+ * to prevent softlockup warnings.
+ */
+ if (time_after(jiffies, time_limit)) {
+ cond_resched();
+ time_limit = jiffies + HZ;
+ }
if (rc == -ENOENT) {
ret = -ENOENT;
continue;
@@ -353,7 +478,7 @@ int htab_remove_mapping(unsigned long vstart, unsigned long vend,
return ret;
}
-static bool disable_1tb_segments = false;
+static bool disable_1tb_segments __ro_after_init;
static int __init parse_disable_1tb_segments(char *p)
{
@@ -362,6 +487,40 @@ static int __init parse_disable_1tb_segments(char *p)
}
early_param("disable_1tb_segments", parse_disable_1tb_segments);
+bool stress_hpt_enabled __initdata;
+
+static int __init parse_stress_hpt(char *p)
+{
+ stress_hpt_enabled = true;
+ return 0;
+}
+early_param("stress_hpt", parse_stress_hpt);
+
+__ro_after_init DEFINE_STATIC_KEY_FALSE(stress_hpt_key);
+
+/*
+ * per-CPU array allocated if we enable stress_hpt.
+ */
+#define STRESS_MAX_GROUPS 16
+struct stress_hpt_struct {
+ unsigned long last_group[STRESS_MAX_GROUPS];
+};
+
+static inline int stress_nr_groups(void)
+{
+ /*
+ * LPAR H_REMOVE flushes TLB, so need some number > 1 of entries
+ * to allow practical forward progress. Bare metal returns 1, which
+ * seems to help uncover more bugs.
+ */
+ if (firmware_has_feature(FW_FEATURE_LPAR))
+ return STRESS_MAX_GROUPS;
+ else
+ return 1;
+}
+
+static struct stress_hpt_struct *stress_hpt_struct;
+
static int __init htab_dt_scan_seg_sizes(unsigned long node,
const char *uname, int depth,
void *data)
@@ -541,7 +700,7 @@ static int __init htab_dt_scan_hugepage_blocks(unsigned long node,
}
#endif /* CONFIG_HUGETLB_PAGE */
-static void mmu_psize_set_default_penc(void)
+static void __init mmu_psize_set_default_penc(void)
{
int bpsize, apsize;
for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++)
@@ -551,7 +710,7 @@ static void mmu_psize_set_default_penc(void)
#ifdef CONFIG_PPC_64K_PAGES
-static bool might_have_hea(void)
+static bool __init might_have_hea(void)
{
/*
* The HEA ethernet adapter requires awareness of the
@@ -593,7 +752,7 @@ static void __init htab_scan_page_sizes(void)
}
#ifdef CONFIG_HUGETLB_PAGE
- if (!hugetlb_disabled) {
+ if (!hugetlb_disabled && !early_radix_enabled() ) {
/* Reserve 16G huge page memory sections for huge pages */
of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL);
}
@@ -622,7 +781,7 @@ static void __init htab_scan_page_sizes(void)
* low-order N bits as the encoding for the 2^(12+N) byte page size
* (if it exists).
*/
-static void init_hpte_page_sizes(void)
+static void __init init_hpte_page_sizes(void)
{
long int ap, bp;
long int shift, penc;
@@ -652,14 +811,22 @@ static void init_hpte_page_sizes(void)
static void __init htab_init_page_sizes(void)
{
+ bool aligned = true;
init_hpte_page_sizes();
- if (!debug_pagealloc_enabled()) {
+ if (!debug_pagealloc_enabled_or_kfence()) {
/*
* Pick a size for the linear mapping. Currently, we only
* support 16M, 1M and 4K which is the default
*/
- if (mmu_psize_defs[MMU_PAGE_16M].shift)
+ if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) &&
+ (unsigned long)_stext % 0x1000000) {
+ if (mmu_psize_defs[MMU_PAGE_16M].shift)
+ pr_warn("Kernel not 16M aligned, disabling 16M linear map alignment\n");
+ aligned = false;
+ }
+
+ if (mmu_psize_defs[MMU_PAGE_16M].shift && aligned)
mmu_linear_psize = MMU_PAGE_16M;
else if (mmu_psize_defs[MMU_PAGE_1M].shift)
mmu_linear_psize = MMU_PAGE_1M;
@@ -776,7 +943,7 @@ static unsigned long __init htab_get_table_size(void)
}
#ifdef CONFIG_MEMORY_HOTPLUG
-int resize_hpt_for_hotplug(unsigned long new_mem_size)
+static int resize_hpt_for_hotplug(unsigned long new_mem_size)
{
unsigned target_hpt_shift;
@@ -800,7 +967,8 @@ int resize_hpt_for_hotplug(unsigned long new_mem_size)
return 0;
}
-int hash__create_section_mapping(unsigned long start, unsigned long end, int nid)
+int hash__create_section_mapping(unsigned long start, unsigned long end,
+ int nid, pgprot_t prot)
{
int rc;
@@ -809,8 +977,10 @@ int hash__create_section_mapping(unsigned long start, unsigned long end, int nid
return -1;
}
+ resize_hpt_for_hotplug(memblock_phys_mem_size());
+
rc = htab_bolt_mapping(start, end, __pa(start),
- pgprot_val(PAGE_KERNEL), mmu_linear_psize,
+ pgprot_val(prot), mmu_linear_psize,
mmu_kernel_ssize);
if (rc < 0) {
@@ -825,7 +995,10 @@ int hash__remove_section_mapping(unsigned long start, unsigned long end)
{
int rc = htab_remove_mapping(start, end, mmu_linear_psize,
mmu_kernel_ssize);
- WARN_ON(rc < 0);
+
+ if (resize_hpt_for_hotplug(memblock_phys_mem_size()) == -ENOSPC)
+ pr_warn("Hash collision while resizing HPT\n");
+
return rc;
}
#endif /* CONFIG_MEMORY_HOTPLUG */
@@ -844,13 +1017,30 @@ static void __init hash_init_partition_table(phys_addr_t hash_table,
pr_info("Partition table %p\n", partition_tb);
}
+void hpt_clear_stress(void);
+static struct timer_list stress_hpt_timer;
+static void stress_hpt_timer_fn(struct timer_list *timer)
+{
+ int next_cpu;
+
+ hpt_clear_stress();
+ if (!firmware_has_feature(FW_FEATURE_LPAR))
+ tlbiel_all();
+
+ next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
+ if (next_cpu >= nr_cpu_ids)
+ next_cpu = cpumask_first(cpu_online_mask);
+ stress_hpt_timer.expires = jiffies + msecs_to_jiffies(10);
+ add_timer_on(&stress_hpt_timer, next_cpu);
+}
+
static void __init htab_initialize(void)
{
unsigned long table;
unsigned long pteg_count;
unsigned long prot;
- unsigned long base = 0, size = 0;
- struct memblock_region *reg;
+ phys_addr_t base = 0, size = 0, end;
+ u64 i;
DBG(" -> htab_initialize()\n");
@@ -860,10 +1050,28 @@ static void __init htab_initialize(void)
printk(KERN_INFO "Using 1TB segments\n");
}
+ if (stress_slb_enabled)
+ static_branch_enable(&stress_slb_key);
+
+ if (stress_hpt_enabled) {
+ unsigned long tmp;
+ static_branch_enable(&stress_hpt_key);
+ // Too early to use nr_cpu_ids, so use NR_CPUS
+ tmp = memblock_phys_alloc_range(sizeof(struct stress_hpt_struct) * NR_CPUS,
+ __alignof__(struct stress_hpt_struct),
+ 0, MEMBLOCK_ALLOC_ANYWHERE);
+ memset((void *)tmp, 0xff, sizeof(struct stress_hpt_struct) * NR_CPUS);
+ stress_hpt_struct = __va(tmp);
+
+ timer_setup(&stress_hpt_timer, stress_hpt_timer_fn, 0);
+ stress_hpt_timer.expires = jiffies + msecs_to_jiffies(10);
+ add_timer(&stress_hpt_timer);
+ }
+
/*
* Calculate the required size of the htab. We want the number of
* PTEGs to equal one half the number of real pages.
- */
+ */
htab_size_bytes = htab_get_table_size();
pteg_count = htab_size_bytes >> 7;
@@ -873,7 +1081,7 @@ static void __init htab_initialize(void)
firmware_has_feature(FW_FEATURE_PS3_LV1)) {
/* Using a hypervisor which owns the htab */
htab_address = NULL;
- _SDR1 = 0;
+ _SDR1 = 0;
#ifdef CONFIG_FA_DUMP
/*
* If firmware assisted dump is active firmware preserves
@@ -926,8 +1134,7 @@ static void __init htab_initialize(void)
prot = pgprot_val(PAGE_KERNEL);
-#ifdef CONFIG_DEBUG_PAGEALLOC
- if (debug_pagealloc_enabled()) {
+ if (debug_pagealloc_enabled_or_kfence()) {
linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT;
linear_map_hash_slots = memblock_alloc_try_nid(
linear_map_hash_count, 1, MEMBLOCK_LOW_LIMIT,
@@ -936,12 +1143,11 @@ static void __init htab_initialize(void)
panic("%s: Failed to allocate %lu bytes max_addr=%pa\n",
__func__, linear_map_hash_count, &ppc64_rma_size);
}
-#endif /* CONFIG_DEBUG_PAGEALLOC */
/* create bolted the linear mapping in the hash table */
- for_each_memblock(memory, reg) {
- base = (unsigned long)__va(reg->base);
- size = reg->size;
+ for_each_mem_range(i, &base, &end) {
+ size = end - base;
+ base = (unsigned long)__va(base);
DBG("creating mapping for region: %lx..%lx (prot: %lx)\n",
base, size, prot);
@@ -1052,7 +1258,7 @@ void __init hash__early_init_mmu(void)
ps3_early_mm_init();
else if (firmware_has_feature(FW_FEATURE_LPAR))
hpte_init_pseries();
- else if (IS_ENABLED(CONFIG_PPC_NATIVE))
+ else if (IS_ENABLED(CONFIG_PPC_HASH_MMU_NATIVE))
hpte_init_native();
if (!mmu_hash_ops.hpte_insert)
@@ -1095,6 +1301,11 @@ void hash__early_init_mmu_secondary(void)
if (cpu_has_feature(CPU_FTR_ARCH_206)
&& cpu_has_feature(CPU_FTR_HVMODE))
tlbiel_all();
+
+#ifdef CONFIG_PPC_MEM_KEYS
+ if (mmu_has_feature(MMU_FTR_PKEY))
+ mtspr(SPRN_UAMOR, default_uamor);
+#endif
}
#endif /* CONFIG_SMP */
@@ -1103,25 +1314,25 @@ void hash__early_init_mmu_secondary(void)
*/
unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
{
- struct page *page;
+ struct folio *folio;
if (!pfn_valid(pte_pfn(pte)))
return pp;
- page = pte_page(pte);
+ folio = page_folio(pte_page(pte));
/* page is dirty */
- if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
- if (trap == 0x400) {
- flush_dcache_icache_page(page);
- set_bit(PG_arch_1, &page->flags);
+ if (!test_bit(PG_dcache_clean, &folio->flags) &&
+ !folio_test_reserved(folio)) {
+ if (trap == INTERRUPT_INST_STORAGE) {
+ flush_dcache_icache_folio(folio);
+ set_bit(PG_dcache_clean, &folio->flags);
} else
pp |= HPTE_R_N;
}
return pp;
}
-#ifdef CONFIG_PPC_MM_SLICES
static unsigned int get_paca_psize(unsigned long addr)
{
unsigned char *psizes;
@@ -1138,12 +1349,6 @@ static unsigned int get_paca_psize(unsigned long addr)
return (psizes[index >> 1] >> (mask_index * 4)) & 0xF;
}
-#else
-unsigned int get_paca_psize(unsigned long addr)
-{
- return get_paca()->mm_ctx_user_psize;
-}
-#endif
/*
* Demote a segment to using 4k pages.
@@ -1200,7 +1405,7 @@ static int subpage_protection(struct mm_struct *mm, unsigned long ea)
spp >>= 30 - 2 * ((ea >> 12) & 0xf);
/*
- * 0 -> full premission
+ * 0 -> full permission
* 1 -> Read only
* 2 -> no access.
* We return the flag that need to be cleared.
@@ -1256,7 +1461,6 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
unsigned long flags)
{
bool is_thp;
- enum ctx_state prev_state = exception_enter();
pgd_t *pgdir;
unsigned long vsid;
pte_t *ptep;
@@ -1285,12 +1489,14 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
psize = mmu_vmalloc_psize;
ssize = mmu_kernel_ssize;
+ flags |= HPTE_USE_KERNEL_KEY;
break;
case IO_REGION_ID:
vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
psize = mmu_io_psize;
ssize = mmu_kernel_ssize;
+ flags |= HPTE_USE_KERNEL_KEY;
break;
default:
/*
@@ -1340,8 +1546,15 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
goto bail;
}
- /* Add _PAGE_PRESENT to the required access perm */
- access |= _PAGE_PRESENT;
+ /*
+ * Add _PAGE_PRESENT to the required access perm. If there are parallel
+ * updates to the pte that can possibly clear _PAGE_PTE, catch that too.
+ *
+ * We can safely use the return pte address in rest of the function
+ * because we do set H_PAGE_BUSY which prevents further updates to pte
+ * from generic code.
+ */
+ access |= _PAGE_PRESENT | _PAGE_PTE;
/*
* Pre-check access permissions (will be re-checked atomically
@@ -1449,7 +1662,6 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
DBG_LOW(" -> rc=%d\n", rc);
bail:
- exception_exit(prev_state);
return rc;
}
EXPORT_SYMBOL_GPL(hash_page_mm);
@@ -1471,16 +1683,26 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap,
}
EXPORT_SYMBOL_GPL(hash_page);
-int __hash_page(unsigned long trap, unsigned long ea, unsigned long dsisr,
- unsigned long msr)
+DEFINE_INTERRUPT_HANDLER(do_hash_fault)
{
+ unsigned long ea = regs->dar;
+ unsigned long dsisr = regs->dsisr;
unsigned long access = _PAGE_PRESENT | _PAGE_READ;
unsigned long flags = 0;
- struct mm_struct *mm = current->mm;
- unsigned int region_id = get_region_id(ea);
+ struct mm_struct *mm;
+ unsigned int region_id;
+ long err;
+ if (unlikely(dsisr & (DSISR_BAD_FAULT_64S | DSISR_KEYFAULT))) {
+ hash__do_page_fault(regs);
+ return;
+ }
+
+ region_id = get_region_id(ea);
if ((region_id == VMALLOC_REGION_ID) || (region_id == IO_REGION_ID))
mm = &init_mm;
+ else
+ mm = current->mm;
if (dsisr & DSISR_NOHPTE)
flags |= HPTE_NOHPTE_UPDATE;
@@ -1496,16 +1718,30 @@ int __hash_page(unsigned long trap, unsigned long ea, unsigned long dsisr,
* 2) user space access kernel space.
*/
access |= _PAGE_PRIVILEGED;
- if ((msr & MSR_PR) || (region_id == USER_REGION_ID))
+ if (user_mode(regs) || (region_id == USER_REGION_ID))
access &= ~_PAGE_PRIVILEGED;
- if (trap == 0x400)
+ if (TRAP(regs) == INTERRUPT_INST_STORAGE)
access |= _PAGE_EXEC;
- return hash_page_mm(mm, ea, access, trap, flags);
+ err = hash_page_mm(mm, ea, access, TRAP(regs), flags);
+ if (unlikely(err < 0)) {
+ // failed to insert a hash PTE due to an hypervisor error
+ if (user_mode(regs)) {
+ if (IS_ENABLED(CONFIG_PPC_SUBPAGE_PROT) && err == -2)
+ _exception(SIGSEGV, regs, SEGV_ACCERR, ea);
+ else
+ _exception(SIGBUS, regs, BUS_ADRERR, ea);
+ } else {
+ bad_page_fault(regs, SIGBUS);
+ }
+ err = 0;
+
+ } else if (err) {
+ hash__do_page_fault(regs);
+ }
}
-#ifdef CONFIG_PPC_MM_SLICES
static bool should_hash_preload(struct mm_struct *mm, unsigned long ea)
{
int psize = get_slice_psize(mm, ea);
@@ -1522,23 +1758,15 @@ static bool should_hash_preload(struct mm_struct *mm, unsigned long ea)
return true;
}
-#else
-static bool should_hash_preload(struct mm_struct *mm, unsigned long ea)
-{
- return true;
-}
-#endif
-static void hash_preload(struct mm_struct *mm, unsigned long ea,
+static void hash_preload(struct mm_struct *mm, pte_t *ptep, unsigned long ea,
bool is_exec, unsigned long trap)
{
- int hugepage_shift;
unsigned long vsid;
pgd_t *pgdir;
- pte_t *ptep;
- unsigned long flags;
int rc, ssize, update_flags = 0;
unsigned long access = _PAGE_PRESENT | _PAGE_READ | (is_exec ? _PAGE_EXEC : 0);
+ unsigned long flags;
BUG_ON(get_region_id(ea) != USER_REGION_ID);
@@ -1558,32 +1786,34 @@ static void hash_preload(struct mm_struct *mm, unsigned long ea,
vsid = get_user_vsid(&mm->context, ea, ssize);
if (!vsid)
return;
- /*
- * Hash doesn't like irqs. Walking linux page table with irq disabled
- * saves us from holding multiple locks.
- */
- local_irq_save(flags);
- /*
- * THP pages use update_mmu_cache_pmd. We don't do
- * hash preload there. Hence can ignore THP here
- */
- ptep = find_current_mm_pte(pgdir, ea, NULL, &hugepage_shift);
- if (!ptep)
- goto out_exit;
-
- WARN_ON(hugepage_shift);
#ifdef CONFIG_PPC_64K_PAGES
/* If either H_PAGE_4K_PFN or cache inhibited is set (and we are on
* a 64K kernel), then we don't preload, hash_page() will take
* care of it once we actually try to access the page.
* That way we don't have to duplicate all of the logic for segment
* page size demotion here
+ * Called with PTL held, hence can be sure the value won't change in
+ * between.
*/
if ((pte_val(*ptep) & H_PAGE_4K_PFN) || pte_ci(*ptep))
- goto out_exit;
+ return;
#endif /* CONFIG_PPC_64K_PAGES */
+ /*
+ * __hash_page_* must run with interrupts off, including PMI interrupts
+ * off, as it sets the H_PAGE_BUSY bit.
+ *
+ * It's otherwise possible for perf interrupts to hit at any time and
+ * may take a hash fault reading the user stack, which could take a
+ * hash miss and deadlock on the same H_PAGE_BUSY bit.
+ *
+ * Interrupts must also be off for the duration of the
+ * mm_is_thread_local test and update, to prevent preempt running the
+ * mm on another CPU (XXX: this may be racy vs kthread_use_mm).
+ */
+ powerpc_local_irq_pmu_save(flags);
+
/* Is that local to this CPU ? */
if (mm_is_thread_local(mm))
update_flags |= HPTE_LOCAL_UPDATE;
@@ -1606,8 +1836,8 @@ static void hash_preload(struct mm_struct *mm, unsigned long ea,
mm_ctx_user_psize(&mm->context),
mm_ctx_user_psize(&mm->context),
pte_val(*ptep));
-out_exit:
- local_irq_restore(flags);
+
+ powerpc_local_irq_pmu_restore(flags);
}
/*
@@ -1618,7 +1848,7 @@ out_exit:
*
* This must always be called with the pte lock held.
*/
-void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
+void __update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
pte_t *ptep)
{
/*
@@ -1628,11 +1858,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
unsigned long trap;
bool is_exec;
- if (radix_enabled()) {
- prefetch((void *)address);
- return;
- }
-
/* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
if (!pte_young(*ptep) || address >= TASK_SIZE)
return;
@@ -1658,32 +1883,8 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
return;
}
- hash_preload(vma->vm_mm, address, is_exec, trap);
-}
-
-#ifdef CONFIG_PPC_MEM_KEYS
-/*
- * Return the protection key associated with the given address and the
- * mm_struct.
- */
-u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address)
-{
- pte_t *ptep;
- u16 pkey = 0;
- unsigned long flags;
-
- if (!mm || !mm->pgd)
- return 0;
-
- local_irq_save(flags);
- ptep = find_linux_pte(mm->pgd, address, NULL, NULL);
- if (ptep)
- pkey = pte_to_pkey_bits(pte_val(READ_ONCE(*ptep)));
- local_irq_restore(flags);
-
- return pkey;
+ hash_preload(vma->vm_mm, ptep, address, is_exec, trap);
}
-#endif /* CONFIG_PPC_MEM_KEYS */
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
static inline void tm_flush_hash_page(int local)
@@ -1726,10 +1927,6 @@ unsigned long pte_get_hash_gslot(unsigned long vpn, unsigned long shift,
return gslot;
}
-/*
- * WARNING: This is called from hash_low_64.S, if you change this prototype,
- * do not forget to update the assembly call site !
- */
void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize,
unsigned long flags)
{
@@ -1824,27 +2021,6 @@ void flush_hash_range(unsigned long number, int local)
}
}
-/*
- * low_hash_fault is called when we the low level hash code failed
- * to instert a PTE due to an hypervisor error
- */
-void low_hash_fault(struct pt_regs *regs, unsigned long address, int rc)
-{
- enum ctx_state prev_state = exception_enter();
-
- if (user_mode(regs)) {
-#ifdef CONFIG_PPC_SUBPAGE_PROT
- if (rc == -2)
- _exception(SIGSEGV, regs, SEGV_ACCERR, address);
- else
-#endif
- _exception(SIGBUS, regs, BUS_ADRERR, address);
- } else
- bad_page_fault(regs, address, SIGBUS);
-
- exception_exit(prev_state);
-}
-
long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
unsigned long pa, unsigned long rflags,
unsigned long vflags, int psize, int ssize)
@@ -1878,13 +2054,78 @@ repeat:
return slot;
}
-#ifdef CONFIG_DEBUG_PAGEALLOC
+void hpt_clear_stress(void)
+{
+ int cpu = raw_smp_processor_id();
+ int g;
+
+ for (g = 0; g < stress_nr_groups(); g++) {
+ unsigned long last_group;
+ last_group = stress_hpt_struct[cpu].last_group[g];
+
+ if (last_group != -1UL) {
+ int i;
+ for (i = 0; i < HPTES_PER_GROUP; i++) {
+ if (mmu_hash_ops.hpte_remove(last_group) == -1)
+ break;
+ }
+ stress_hpt_struct[cpu].last_group[g] = -1;
+ }
+ }
+}
+
+void hpt_do_stress(unsigned long ea, unsigned long hpte_group)
+{
+ unsigned long last_group;
+ int cpu = raw_smp_processor_id();
+
+ last_group = stress_hpt_struct[cpu].last_group[stress_nr_groups() - 1];
+ if (hpte_group == last_group)
+ return;
+
+ if (last_group != -1UL) {
+ int i;
+ /*
+ * Concurrent CPUs might be inserting into this group, so
+ * give up after a number of iterations, to prevent a live
+ * lock.
+ */
+ for (i = 0; i < HPTES_PER_GROUP; i++) {
+ if (mmu_hash_ops.hpte_remove(last_group) == -1)
+ break;
+ }
+ stress_hpt_struct[cpu].last_group[stress_nr_groups() - 1] = -1;
+ }
+
+ if (ea >= PAGE_OFFSET) {
+ /*
+ * We would really like to prefetch to get the TLB loaded, then
+ * remove the PTE before returning from fault interrupt, to
+ * increase the hash fault rate.
+ *
+ * Unfortunately QEMU TCG does not model the TLB in a way that
+ * makes this possible, and systemsim (mambo) emulator does not
+ * bring in TLBs with prefetches (although loads/stores do
+ * work for non-CI PTEs).
+ *
+ * So remember this PTE and clear it on the next hash fault.
+ */
+ memmove(&stress_hpt_struct[cpu].last_group[1],
+ &stress_hpt_struct[cpu].last_group[0],
+ (stress_nr_groups() - 1) * sizeof(unsigned long));
+ stress_hpt_struct[cpu].last_group[0] = hpte_group;
+ }
+}
+
+#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE)
+static DEFINE_RAW_SPINLOCK(linear_map_hash_lock);
+
static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
{
unsigned long hash;
unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
- unsigned long mode = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL));
+ unsigned long mode = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL), HPTE_USE_KERNEL_KEY);
long ret;
hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
@@ -1893,15 +2134,18 @@ static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
if (!vsid)
return;
+ if (linear_map_hash_slots[lmi] & 0x80)
+ return;
+
ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode,
HPTE_V_BOLTED,
mmu_linear_psize, mmu_kernel_ssize);
BUG_ON (ret < 0);
- spin_lock(&linear_map_hash_lock);
+ raw_spin_lock(&linear_map_hash_lock);
BUG_ON(linear_map_hash_slots[lmi] & 0x80);
linear_map_hash_slots[lmi] = ret | 0x80;
- spin_unlock(&linear_map_hash_lock);
+ raw_spin_unlock(&linear_map_hash_lock);
}
static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
@@ -1911,11 +2155,14 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
- spin_lock(&linear_map_hash_lock);
- BUG_ON(!(linear_map_hash_slots[lmi] & 0x80));
+ raw_spin_lock(&linear_map_hash_lock);
+ if (!(linear_map_hash_slots[lmi] & 0x80)) {
+ raw_spin_unlock(&linear_map_hash_lock);
+ return;
+ }
hidx = linear_map_hash_slots[lmi] & 0x7f;
linear_map_hash_slots[lmi] = 0;
- spin_unlock(&linear_map_hash_lock);
+ raw_spin_unlock(&linear_map_hash_lock);
if (hidx & _PTEIDX_SECONDARY)
hash = ~hash;
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
@@ -1925,7 +2172,7 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
mmu_kernel_ssize, 0);
}
-void __kernel_map_pages(struct page *page, int numpages, int enable)
+void hash__kernel_map_pages(struct page *page, int numpages, int enable)
{
unsigned long flags, vaddr, lmi;
int i;
@@ -1943,7 +2190,7 @@ void __kernel_map_pages(struct page *page, int numpages, int enable)
}
local_irq_restore(flags);
}
-#endif /* CONFIG_DEBUG_PAGEALLOC */
+#endif /* CONFIG_DEBUG_PAGEALLOC || CONFIG_KFENCE */
void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base,
phys_addr_t first_memblock_size)
@@ -2009,11 +2256,8 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n")
static int __init hash64_debugfs(void)
{
- if (!debugfs_create_file_unsafe("hpt_order", 0600, powerpc_debugfs_root,
- NULL, &fops_hpt_order)) {
- pr_err("lpar: unable to create hpt_order debugsfs file\n");
- }
-
+ debugfs_create_file("hpt_order", 0600, arch_debugfs_dir, NULL,
+ &fops_hpt_order);
return 0;
}
machine_device_initcall(pseries, hash64_debugfs);
@@ -2026,3 +2270,20 @@ void __init print_system_hash_info(void)
if (htab_hash_mask)
pr_info("htab_hash_mask = 0x%lx\n", htab_hash_mask);
}
+
+unsigned long arch_randomize_brk(struct mm_struct *mm)
+{
+ /*
+ * If we are using 1TB segments and we are allowed to randomise
+ * the heap, we can put it above 1TB so it is backed by a 1TB
+ * segment. Otherwise the heap will be in the bottom 1TB
+ * which always uses 256MB segments and this may result in a
+ * performance penalty.
+ */
+ if (is_32bit_task())
+ return randomize_page(mm->brk, SZ_32M);
+ else if (!radix_enabled() && mmu_highuser_ssize == MMU_SEGSIZE_1T)
+ return randomize_page(max_t(unsigned long, mm->brk, SZ_1T), SZ_1G);
+ else
+ return randomize_page(mm->brk, SZ_1G);
+}
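/*
 * Editor's note (not part of the patch): a worked example of the ranges
 * chosen by arch_randomize_brk() above, assuming randomize_page(start, range)
 * returns a page-aligned address in [start, start + range):
 *
 *   32-bit task:                      brk .. brk + 32M
 *   hash MMU with 1T segments:        max(brk, 1T) .. max(brk, 1T) + 1G,
 *                                     so the heap is backed by a 1T segment
 *   radix, or hash without 1T segs:   brk .. brk + 1G
 */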
diff --git a/arch/powerpc/mm/book3s64/hash_hugetlbpage.c b/arch/powerpc/mm/book3s64/hugetlbpage.c
index eefa89c6117b..5a2e512e96db 100644
--- a/arch/powerpc/mm/book3s64/hash_hugetlbpage.c
+++ b/arch/powerpc/mm/book3s64/hugetlbpage.c
@@ -10,18 +10,13 @@
#include <linux/mm.h>
#include <linux/hugetlb.h>
-#include <asm/pgtable.h>
-#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/machdep.h>
unsigned int hpage_shift;
EXPORT_SYMBOL(hpage_shift);
-extern long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
- unsigned long pa, unsigned long rlags,
- unsigned long vflags, int psize, int ssize);
-
+#ifdef CONFIG_PPC_64S_HASH_MMU
int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
pte_t *ptep, unsigned long trap, unsigned long flags,
int ssize, unsigned int shift, unsigned int mmu_psize)
@@ -72,7 +67,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
if (old_pte & (H_PAGE_THP_HUGE | _PAGE_DEVMAP))
return 0;
- rflags = htab_convert_pte_flags(new_pte);
+ rflags = htab_convert_pte_flags(new_pte, flags);
if (unlikely(mmu_psize == MMU_PAGE_16G))
offset = PTRS_PER_PUD;
else
@@ -128,6 +123,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
*ptep = __pte(new_pte & ~H_PAGE_BUSY);
return 0;
}
+#endif
pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
@@ -147,14 +143,17 @@ pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma,
void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
pte_t *ptep, pte_t old_pte, pte_t pte)
{
+ unsigned long psize;
if (radix_enabled())
return radix__huge_ptep_modify_prot_commit(vma, addr, ptep,
old_pte, pte);
- set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
+
+ psize = huge_page_size(hstate_vma(vma));
+ set_huge_pte_at(vma->vm_mm, addr, ptep, pte, psize);
}
-void hugetlbpage_init_default(void)
+void __init hugetlbpage_init_defaultsize(void)
{
/* Set default large page size. Currently, we pick 16M or 1M
* depending on what is available
diff --git a/arch/powerpc/mm/book3s64/internal.h b/arch/powerpc/mm/book3s64/internal.h
new file mode 100644
index 000000000000..a57a25f06a21
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/internal.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef ARCH_POWERPC_MM_BOOK3S64_INTERNAL_H
+#define ARCH_POWERPC_MM_BOOK3S64_INTERNAL_H
+
+#include <linux/jump_label.h>
+
+extern bool stress_slb_enabled;
+
+DECLARE_STATIC_KEY_FALSE(stress_slb_key);
+
+static inline bool stress_slb(void)
+{
+ return static_branch_unlikely(&stress_slb_key);
+}
+
+extern bool stress_hpt_enabled;
+
+DECLARE_STATIC_KEY_FALSE(stress_hpt_key);
+
+static inline bool stress_hpt(void)
+{
+ return static_branch_unlikely(&stress_hpt_key);
+}
+
+void hpt_do_stress(unsigned long ea, unsigned long hpte_group);
+
+void slb_setup_new_exec(void);
+
+void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush);
+
+#endif /* ARCH_POWERPC_MM_BOOK3S64_INTERNAL_H */
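/*
 * Editor's sketch (not part of the patch): the static keys above are meant to
 * be near-free gates on hot paths. A hash-fault call site would look roughly
 * like the helper below; the exact real call sites (hash_utils.c, slb.c) are
 * outside this hunk and are an assumption here.
 */
static inline void maybe_stress_hpt(unsigned long ea, unsigned long hpte_group)
{
	if (stress_hpt())	/* patched-out branch while the key is disabled */
		hpt_do_stress(ea, hpte_group);
}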
diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c
index 56cc84520577..c0e8d597e4cb 100644
--- a/arch/powerpc/mm/book3s64/iommu_api.c
+++ b/arch/powerpc/mm/book3s64/iommu_api.c
@@ -96,16 +96,16 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
goto unlock_exit;
}
- down_read(&mm->mmap_sem);
- chunk = (1UL << (PAGE_SHIFT + MAX_ORDER - 1)) /
+ mmap_read_lock(mm);
+ chunk = (1UL << (PAGE_SHIFT + MAX_PAGE_ORDER)) /
sizeof(struct vm_area_struct *);
chunk = min(chunk, entries);
for (entry = 0; entry < entries; entry += chunk) {
unsigned long n = min(entries - entry, chunk);
- ret = get_user_pages(ua + (entry << PAGE_SHIFT), n,
+ ret = pin_user_pages(ua + (entry << PAGE_SHIFT), n,
FOLL_WRITE | FOLL_LONGTERM,
- mem->hpages + entry, NULL);
+ mem->hpages + entry);
if (ret == n) {
pinned += n;
continue;
@@ -114,31 +114,13 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
pinned += ret;
break;
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
if (pinned != entries) {
if (!ret)
ret = -EFAULT;
goto free_exit;
}
- pageshift = PAGE_SHIFT;
- for (i = 0; i < entries; ++i) {
- struct page *page = mem->hpages[i];
-
- /*
- * Allow to use larger than 64k IOMMU pages. Only do that
- * if we are backed by hugetlb.
- */
- if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page))
- pageshift = page_shift(compound_head(page));
- mem->pageshift = min(mem->pageshift, pageshift);
- /*
- * We don't need struct page reference any more, switch
- * to physical address.
- */
- mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
- }
-
good_exit:
atomic64_set(&mem->mapped, 1);
mem->used = 1;
@@ -147,7 +129,8 @@ good_exit:
mutex_lock(&mem_list_mutex);
- list_for_each_entry_rcu(mem2, &mm->context.iommu_group_mem_list, next) {
+ list_for_each_entry_rcu(mem2, &mm->context.iommu_group_mem_list, next,
+ lockdep_is_held(&mem_list_mutex)) {
/* Overlap? */
if ((mem2->ua < (ua + (entries << PAGE_SHIFT))) &&
(ua < (mem2->ua +
@@ -158,6 +141,27 @@ good_exit:
}
}
+ if (mem->dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) {
+ /*
+		 * Allow using IOMMU pages larger than 64k. Only do that
+		 * if we are backed by hugetlb. Skip device memory as it is
+		 * not backed by page structs.
+ */
+ pageshift = PAGE_SHIFT;
+ for (i = 0; i < entries; ++i) {
+ struct page *page = mem->hpages[i];
+
+ if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page))
+ pageshift = page_shift(compound_head(page));
+ mem->pageshift = min(mem->pageshift, pageshift);
+ /*
+ * We don't need struct page reference any more, switch
+ * to physical address.
+ */
+ mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
+ }
+ }
+
list_add_rcu(&mem->next, &mm->context.iommu_group_mem_list);
mutex_unlock(&mem_list_mutex);
@@ -167,9 +171,8 @@ good_exit:
return 0;
free_exit:
- /* free the reference taken */
- for (i = 0; i < pinned; i++)
- put_page(mem->hpages[i]);
+ /* free the references taken */
+ unpin_user_pages(mem->hpages, pinned);
vfree(mem->hpas);
kfree(mem);
@@ -215,7 +218,8 @@ static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem)
if (mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY)
SetPageDirty(page);
- put_page(page);
+ unpin_user_page(page);
+
mem->hpas[i] = 0;
}
}
@@ -260,7 +264,7 @@ long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem)
goto unlock_exit;
/* Are there still mappings? */
- if (atomic_cmpxchg(&mem->mapped, 1, 0) != 1) {
+ if (atomic64_cmpxchg(&mem->mapped, 1, 0) != 1) {
++mem->used;
ret = -EBUSY;
goto unlock_exit;
@@ -286,6 +290,7 @@ struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm,
{
struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
+ rcu_read_lock();
list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) {
if ((mem->ua <= ua) &&
(ua + size <= mem->ua +
@@ -294,29 +299,12 @@ struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm,
break;
}
}
+ rcu_read_unlock();
return ret;
}
EXPORT_SYMBOL_GPL(mm_iommu_lookup);
-struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(struct mm_struct *mm,
- unsigned long ua, unsigned long size)
-{
- struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
-
- list_for_each_entry_lockless(mem, &mm->context.iommu_group_mem_list,
- next) {
- if ((mem->ua <= ua) &&
- (ua + size <= mem->ua +
- (mem->entries << PAGE_SHIFT))) {
- ret = mem;
- break;
- }
- }
-
- return ret;
-}
-
struct mm_iommu_table_group_mem_t *mm_iommu_get(struct mm_struct *mm,
unsigned long ua, unsigned long entries)
{
@@ -324,7 +312,8 @@ struct mm_iommu_table_group_mem_t *mm_iommu_get(struct mm_struct *mm,
mutex_lock(&mem_list_mutex);
- list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) {
+ list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next,
+ lockdep_is_held(&mem_list_mutex)) {
if ((mem->ua == ua) && (mem->entries == entries)) {
ret = mem;
++mem->used;
@@ -362,62 +351,13 @@ long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
}
EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa);
-long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
- unsigned long ua, unsigned int pageshift, unsigned long *hpa)
-{
- const long entry = (ua - mem->ua) >> PAGE_SHIFT;
- unsigned long *pa;
-
- if (entry >= mem->entries)
- return -EFAULT;
-
- if (pageshift > mem->pageshift)
- return -EFAULT;
-
- if (!mem->hpas) {
- *hpa = mem->dev_hpa + (ua - mem->ua);
- return 0;
- }
-
- pa = (void *) vmalloc_to_phys(&mem->hpas[entry]);
- if (!pa)
- return -EFAULT;
-
- *hpa = (*pa & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK);
-
- return 0;
-}
-
-extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua)
-{
- struct mm_iommu_table_group_mem_t *mem;
- long entry;
- void *va;
- unsigned long *pa;
-
- mem = mm_iommu_lookup_rm(mm, ua, PAGE_SIZE);
- if (!mem)
- return;
-
- if (mem->dev_hpa != MM_IOMMU_TABLE_INVALID_HPA)
- return;
-
- entry = (ua - mem->ua) >> PAGE_SHIFT;
- va = &mem->hpas[entry];
-
- pa = (void *) vmalloc_to_phys(va);
- if (!pa)
- return;
-
- *pa |= MM_IOMMU_TABLE_GROUP_PAGE_DIRTY;
-}
-
bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa,
unsigned int pageshift, unsigned long *size)
{
struct mm_iommu_table_group_mem_t *mem;
unsigned long end;
+ rcu_read_lock();
list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) {
if (mem->dev_hpa == MM_IOMMU_TABLE_INVALID_HPA)
continue;
@@ -434,6 +374,7 @@ bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa,
return true;
}
}
+ rcu_read_unlock();
return false;
}
diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c
index 0ba30b8b935b..1715b07c630c 100644
--- a/arch/powerpc/mm/book3s64/mmu_context.c
+++ b/arch/powerpc/mm/book3s64/mmu_context.c
@@ -17,10 +17,13 @@
#include <linux/export.h>
#include <linux/gfp.h>
#include <linux/slab.h>
+#include <linux/cpu.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
+#include "internal.h"
+
static DEFINE_IDA(mmu_context_ida);
static int alloc_context_id(int min_id, int max_id)
@@ -28,7 +31,8 @@ static int alloc_context_id(int min_id, int max_id)
return ida_alloc_range(&mmu_context_ida, min_id, max_id, GFP_KERNEL);
}
-void hash__reserve_context_id(int id)
+#ifdef CONFIG_PPC_64S_HASH_MMU
+void __init hash__reserve_context_id(int id)
{
int result = ida_alloc_range(&mmu_context_ida, id, id, GFP_KERNEL);
@@ -47,9 +51,9 @@ int hash__alloc_context_id(void)
return alloc_context_id(MIN_USER_CONTEXT, max);
}
EXPORT_SYMBOL_GPL(hash__alloc_context_id);
+#endif
-void slb_setup_new_exec(void);
-
+#ifdef CONFIG_PPC_64S_HASH_MMU
static int realloc_context_ids(mm_context_t *ctx)
{
int i, id;
@@ -118,7 +122,7 @@ static int hash__init_new_context(struct mm_struct *mm)
/* This is fork. Copy hash_context details from current->mm */
memcpy(mm->context.hash_context, current->mm->context.hash_context, sizeof(struct hash_mm_context));
#ifdef CONFIG_PPC_SUBPAGE_PROT
- /* inherit subpage prot detalis if we have one. */
+ /* inherit subpage prot details if we have one. */
if (current->mm->context.hash_context->spt) {
mm->context.hash_context->spt = kmalloc(sizeof(struct subpage_prot_table),
GFP_KERNEL);
@@ -149,6 +153,13 @@ void hash__setup_new_exec(void)
slb_setup_new_exec();
}
+#else
+static inline int hash__init_new_context(struct mm_struct *mm)
+{
+ BUILD_BUG();
+ return 0;
+}
+#endif
static int radix__init_new_context(struct mm_struct *mm)
{
@@ -174,7 +185,9 @@ static int radix__init_new_context(struct mm_struct *mm)
*/
asm volatile("ptesync;isync" : : : "memory");
+#ifdef CONFIG_PPC_64S_HASH_MMU
mm->context.hash_context = NULL;
+#endif
return index;
}
@@ -212,28 +225,36 @@ EXPORT_SYMBOL_GPL(__destroy_context);
static void destroy_contexts(mm_context_t *ctx)
{
- int index, context_id;
+ if (radix_enabled()) {
+ ida_free(&mmu_context_ida, ctx->id);
+ } else {
+#ifdef CONFIG_PPC_64S_HASH_MMU
+ int index, context_id;
- for (index = 0; index < ARRAY_SIZE(ctx->extended_id); index++) {
- context_id = ctx->extended_id[index];
- if (context_id)
- ida_free(&mmu_context_ida, context_id);
+ for (index = 0; index < ARRAY_SIZE(ctx->extended_id); index++) {
+ context_id = ctx->extended_id[index];
+ if (context_id)
+ ida_free(&mmu_context_ida, context_id);
+ }
+ kfree(ctx->hash_context);
+#else
+ BUILD_BUG(); // radix_enabled() should be constant true
+#endif
}
- kfree(ctx->hash_context);
}
static void pmd_frag_destroy(void *pmd_frag)
{
int count;
- struct page *page;
+ struct ptdesc *ptdesc;
- page = virt_to_page(pmd_frag);
+ ptdesc = virt_to_ptdesc(pmd_frag);
/* drop all the pending references */
count = ((unsigned long)pmd_frag & ~PAGE_MASK) >> PMD_FRAG_SIZE_SHIFT;
/* We allow PTE_FRAG_NR fragments from a PTE page */
- if (atomic_sub_and_test(PMD_FRAG_NR - count, &page->pt_frag_refcount)) {
- pgtable_pmd_page_dtor(page);
- __free_page(page);
+ if (atomic_sub_and_test(PMD_FRAG_NR - count, &ptdesc->pt_frag_refcount)) {
+ pagetable_pmd_dtor(ptdesc);
+ pagetable_free(ptdesc);
}
}
@@ -307,3 +328,22 @@ void radix__switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
isync();
}
#endif
+
+/**
+ * cleanup_cpu_mmu_context - Clean up MMU details for this CPU (newly offlined)
+ *
+ * This clears the CPU from mm_cpumask for all processes, and then flushes the
+ * local TLB to ensure TLB coherency in case the CPU is onlined again.
+ *
+ * KVM guest translations are not necessarily flushed here. If KVM started
+ * using mm_cpumask or the Linux APIs which do, this would have to be resolved.
+ */
+#ifdef CONFIG_HOTPLUG_CPU
+void cleanup_cpu_mmu_context(void)
+{
+ int cpu = smp_processor_id();
+
+ clear_tasks_mm_cpumask(cpu);
+ tlbiel_all();
+}
+#endif
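/*
 * Editor's sketch (not part of the patch): cleanup_cpu_mmu_context() is meant
 * to run late in the CPU-offline path, once the CPU no longer runs user
 * tasks. A hypothetical caller is shown below; the real callers live in the
 * platform hotplug code, outside this diff.
 */
#ifdef CONFIG_HOTPLUG_CPU
static void example_cpu_offline_mmu_teardown(void)
{
	/* drop this CPU from every mm_cpumask() and flush its local TLB */
	cleanup_cpu_mmu_context();
}
#endif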
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index 75483b40fcb1..3438ab72c346 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -6,19 +6,32 @@
#include <linux/sched.h>
#include <linux/mm_types.h>
#include <linux/memblock.h>
+#include <linux/memremap.h>
+#include <linux/pkeys.h>
+#include <linux/debugfs.h>
+#include <linux/proc_fs.h>
#include <misc/cxl-base.h>
-#include <asm/debugfs.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/trace.h>
#include <asm/powernv.h>
#include <asm/firmware.h>
#include <asm/ultravisor.h>
+#include <asm/kexec.h>
#include <mm/mmu_decl.h>
#include <trace/events/thp.h>
+#include "internal.h"
+
+struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
+EXPORT_SYMBOL_GPL(mmu_psize_defs);
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+int mmu_vmemmap_psize = MMU_PAGE_4K;
+#endif
+
unsigned long __pmd_frag_nr;
EXPORT_SYMBOL(__pmd_frag_nr);
unsigned long __pmd_frag_size_shift;
@@ -52,11 +65,39 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
return changed;
}
+int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
+ pud_t *pudp, pud_t entry, int dirty)
+{
+ int changed;
+#ifdef CONFIG_DEBUG_VM
+ WARN_ON(!pud_devmap(*pudp));
+ assert_spin_locked(pud_lockptr(vma->vm_mm, pudp));
+#endif
+ changed = !pud_same(*(pudp), entry);
+ if (changed) {
+ /*
+ * We can use MMU_PAGE_1G here, because only radix
+		 * We can use MMU_PAGE_1G here, because only the radix
+		 * path looks at the psize.
+ __ptep_set_access_flags(vma, pudp_ptep(pudp),
+ pud_pte(entry), address, MMU_PAGE_1G);
+ }
+ return changed;
+}
+
+
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp)
{
return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
}
+
+int pudp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long address, pud_t *pudp)
+{
+ return __pudp_test_and_clear_young(vma->vm_mm, address, pudp);
+}
+
/*
* set a new huge pmd. We should not be called for updating
* an existing pmd entry. That should go via pmd_hugepage_update.
@@ -78,24 +119,46 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
}
-static void do_nothing(void *unused)
+void set_pud_at(struct mm_struct *mm, unsigned long addr,
+ pud_t *pudp, pud_t pud)
{
+#ifdef CONFIG_DEBUG_VM
+ /*
+ * Make sure hardware valid bit is not set. We don't do
+ * tlb flush for this update.
+ */
+ WARN_ON(pte_hw_valid(pud_pte(*pudp)));
+ assert_spin_locked(pud_lockptr(mm, pudp));
+ WARN_ON(!(pud_large(pud)));
+#endif
+ trace_hugepage_set_pud(addr, pud_val(pud));
+ return set_pte_at(mm, addr, pudp_ptep(pudp), pud_pte(pud));
}
+
+static void do_serialize(void *arg)
+{
+ /* We've taken the IPI, so try to trim the mask while here */
+ if (radix_enabled()) {
+ struct mm_struct *mm = arg;
+ exit_lazy_flush_tlb(mm, false);
+ }
+}
+
/*
- * Serialize against find_current_mm_pte which does lock-less
+ * Serialize against __find_linux_pte() which does lock-less
* lookup in page tables with local interrupts disabled. For huge pages
* it casts pmd_t to pte_t. Since format of pte_t is different from
* pmd_t we want to prevent transit from pmd pointing to page table
* to pmd pointing to huge page (and back) while interrupts are disabled.
* We clear pmd to possibly replace it with page table pointer in
* different code paths. So make sure we wait for the parallel
- * find_current_mm_pte to finish.
+ * __find_linux_pte() to finish.
*/
void serialize_against_pte_lookup(struct mm_struct *mm)
{
smp_mb();
- smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1);
+ smp_call_function_many(mm_cpumask(mm), do_serialize, mm, 1);
}
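/*
 * Editor's sketch (not part of the patch): the pattern that
 * serialize_against_pte_lookup() guards, shown as a hypothetical helper.
 * Real callers (e.g. the hash pmd collapse path) differ in detail, but the
 * ordering is the point: clear the pmd, flush, wait out lockless walkers,
 * then install the new huge pmd.
 */
static void example_replace_with_huge_pmd(struct vm_area_struct *vma,
					  unsigned long addr, pmd_t *pmdp,
					  pmd_t huge_pmd)
{
	struct mm_struct *mm = vma->vm_mm;

	pmd_clear(pmdp);				/* 1. clear the old entry */
	flush_pmd_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE);
	serialize_against_pte_lookup(mm);		/* 2. wait for __find_linux_pte() */
	set_pmd_at(mm, addr, pmdp, huge_pmd);		/* 3. publish the huge pmd */
}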
/*
@@ -109,15 +172,44 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID);
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ return __pmd(old_pmd);
+}
+
+pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp, int full)
+{
+ pmd_t pmd;
+ VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
+ VM_BUG_ON((pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
+ !pmd_devmap(*pmdp)) || !pmd_present(*pmdp));
+ pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp);
/*
- * This ensures that generic code that rely on IRQ disabling
- * to prevent a parallel THP split work as expected.
- *
- * Marking the entry with _PAGE_INVALID && ~_PAGE_PRESENT requires
- * a special case check in pmd_access_permitted.
+	 * If this is not a fullmm flush, then we can possibly end up converting
+	 * this huge PMD entry to a regular level 0 PTE by a parallel page fault.
+ * Make sure we flush the tlb in this case.
*/
- serialize_against_pte_lookup(vma->vm_mm);
- return __pmd(old_pmd);
+ if (!full)
+ flush_pmd_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE);
+ return pmd;
+}
+
+pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma,
+ unsigned long addr, pud_t *pudp, int full)
+{
+ pud_t pud;
+
+	VM_BUG_ON(addr & ~HPAGE_PUD_MASK);
+ VM_BUG_ON((pud_present(*pudp) && !pud_devmap(*pudp)) ||
+ !pud_present(*pudp));
+ pud = pudp_huge_get_and_clear(vma->vm_mm, addr, pudp);
+ /*
+	 * If this is not a fullmm flush, then we can possibly end up converting
+	 * this huge PUD entry to a regular level 0 PTE by a parallel page fault.
+ * Make sure we flush the tlb in this case.
+ */
+ if (!full)
+ flush_pud_tlb_range(vma, addr, addr + HPAGE_PUD_SIZE);
+ return pud;
}
static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
@@ -125,12 +217,32 @@ static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
return __pmd(pmd_val(pmd) | pgprot_val(pgprot));
}
+static pud_t pud_set_protbits(pud_t pud, pgprot_t pgprot)
+{
+ return __pud(pud_val(pud) | pgprot_val(pgprot));
+}
+
+/*
+ * At some point we should be able to get rid of
+ * pmd_mkhuge() and mk_huge_pmd() when we update all the
+ * other archs to mark the pmd huge in pfn_pmd()
+ */
pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
{
unsigned long pmdv;
pmdv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK;
- return pmd_set_protbits(__pmd(pmdv), pgprot);
+
+ return __pmd_mkhuge(pmd_set_protbits(__pmd(pmdv), pgprot));
+}
+
+pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot)
+{
+ unsigned long pudv;
+
+ pudv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK;
+
+ return __pud_mkhuge(pud_set_protbits(__pud(pudv), pgprot));
}
pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
@@ -146,37 +258,27 @@ pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
pmdv &= _HPAGE_CHG_MASK;
return pmd_set_protbits(__pmd(pmdv), newprot);
}
-
-/*
- * This is called at the end of handling a user page fault, when the
- * fault has been handled by updating a HUGE PMD entry in the linux page tables.
- * We use it to preload an HPTE into the hash table corresponding to
- * the updated linux HUGE PMD entry.
- */
-void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd)
-{
- if (radix_enabled())
- prefetch((void *)addr);
-}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-/* For use by kexec */
-void mmu_cleanup_all(void)
+/* For use by kexec, called with MMU off */
+notrace void mmu_cleanup_all(void)
{
if (radix_enabled())
radix__mmu_cleanup_all();
else if (mmu_hash_ops.hpte_clear_all)
mmu_hash_ops.hpte_clear_all();
+
+ reset_sprs();
}
#ifdef CONFIG_MEMORY_HOTPLUG
-int __meminit create_section_mapping(unsigned long start, unsigned long end, int nid)
+int __meminit create_section_mapping(unsigned long start, unsigned long end,
+ int nid, pgprot_t prot)
{
if (radix_enabled())
- return radix__create_section_mapping(start, end, nid);
+ return radix__create_section_mapping(start, end, nid, prot);
- return hash__create_section_mapping(start, end, nid);
+ return hash__create_section_mapping(start, end, nid, prot);
}
int __meminit remove_section_mapping(unsigned long start, unsigned long end)
@@ -193,17 +295,12 @@ void __init mmu_partition_table_init(void)
unsigned long patb_size = 1UL << PATB_SIZE_SHIFT;
unsigned long ptcr;
- BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 36), "Partition table size too large.");
/* Initialize the Partition Table with no entries */
partition_tb = memblock_alloc(patb_size, patb_size);
if (!partition_tb)
panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
__func__, patb_size, patb_size);
- /*
- * update partition table control register,
- * 64 K size.
- */
ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12);
set_ptcr_when_no_uv(ptcr);
powernv_set_nmmu_ptcr(ptcr);
@@ -288,22 +385,22 @@ static pmd_t *get_pmd_from_cache(struct mm_struct *mm)
static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm)
{
void *ret = NULL;
- struct page *page;
+ struct ptdesc *ptdesc;
gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO;
if (mm == &init_mm)
gfp &= ~__GFP_ACCOUNT;
- page = alloc_page(gfp);
- if (!page)
+ ptdesc = pagetable_alloc(gfp, 0);
+ if (!ptdesc)
return NULL;
- if (!pgtable_pmd_page_ctor(page)) {
- __free_pages(page, 0);
+ if (!pagetable_pmd_ctor(ptdesc)) {
+ pagetable_free(ptdesc);
return NULL;
}
- atomic_set(&page->pt_frag_refcount, 1);
+ atomic_set(&ptdesc->pt_frag_refcount, 1);
- ret = page_address(page);
+ ret = ptdesc_address(ptdesc);
/*
* if we support only one fragment just return the
* allocated page.
@@ -313,12 +410,12 @@ static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm)
spin_lock(&mm->page_table_lock);
/*
- * If we find pgtable_page set, we return
- * the allocated page with single fragement
+ * If we find ptdesc_page set, we return
+ * the allocated page with single fragment
* count.
*/
if (likely(!mm->context.pmd_frag)) {
- atomic_set(&page->pt_frag_refcount, PMD_FRAG_NR);
+ atomic_set(&ptdesc->pt_frag_refcount, PMD_FRAG_NR);
mm->context.pmd_frag = ret + PMD_FRAG_SIZE;
}
spin_unlock(&mm->page_table_lock);
@@ -339,12 +436,15 @@ pmd_t *pmd_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr)
void pmd_fragment_free(unsigned long *pmd)
{
- struct page *page = virt_to_page(pmd);
+ struct ptdesc *ptdesc = virt_to_ptdesc(pmd);
+
+ if (pagetable_is_reserved(ptdesc))
+ return free_reserved_ptdesc(ptdesc);
- BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
- if (atomic_dec_and_test(&page->pt_frag_refcount)) {
- pgtable_pmd_page_dtor(page);
- __free_page(page);
+ BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0);
+ if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) {
+ pagetable_pmd_dtor(ptdesc);
+ pagetable_free(ptdesc);
}
}
@@ -358,7 +458,7 @@ static inline void pgtable_free(void *table, int index)
pmd_fragment_free(table);
break;
case PUD_INDEX:
- kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), table);
+ __pud_free(table);
break;
#if defined(CONFIG_PPC_4K_PAGES) && defined(CONFIG_HUGETLB_PAGE)
/* 16M hugepd directory at pud level */
@@ -378,7 +478,6 @@ static inline void pgtable_free(void *table, int index)
}
}
-#ifdef CONFIG_SMP
void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int index)
{
unsigned long pgf = (unsigned long)table;
@@ -395,12 +494,6 @@ void __tlb_remove_table(void *_table)
return pgtable_free(table, index);
}
-#else
-void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int index)
-{
- return pgtable_free(table, index);
-}
-#endif
#ifdef CONFIG_PROC_FS
atomic_long_t direct_pages_count[MMU_PAGE_COUNT];
@@ -449,6 +542,7 @@ void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
set_pte_at(vma->vm_mm, addr, ptep, pte);
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
* For hash translation mode, we use the deposited table to store hash slot
* information and they are stored at PTRS_PER_PMD offset from related pmd
@@ -470,6 +564,7 @@ int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
return true;
}
+#endif
/*
* Does the CPU support tlbie?
@@ -510,9 +605,50 @@ static int __init pgtable_debugfs_setup(void)
* invalidated as expected.
*/
debugfs_create_bool("tlbie_enabled", 0600,
- powerpc_debugfs_root,
+ arch_debugfs_dir,
&tlbie_enabled);
return 0;
}
arch_initcall(pgtable_debugfs_setup);
+
+#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_HAS_MEMREMAP_COMPAT_ALIGN)
+/*
+ * Override the generic version in mm/memremap.c.
+ *
+ * With hash translation, the direct-map range is mapped with just one
+ * page size selected by htab_init_page_sizes(). Consult
+ * mmu_psize_defs[] to determine the minimum page size alignment.
+ */
+unsigned long memremap_compat_align(void)
+{
+ if (!radix_enabled()) {
+ unsigned int shift = mmu_psize_defs[mmu_linear_psize].shift;
+ return max(SUBSECTION_SIZE, 1UL << shift);
+ }
+
+ return SUBSECTION_SIZE;
+}
+EXPORT_SYMBOL_GPL(memremap_compat_align);
+#endif
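/*
 * Editor's note (not part of the patch): a worked example of the hash case
 * above, assuming SUBSECTION_SIZE is 2M. With a 64K linear page size the
 * result is max(2M, 64K) = 2M; with the common 16M linear page size it is
 * max(2M, 16M) = 16M, so pmem namespaces must be 16M aligned. Under radix
 * the generic 2M subsection alignment is kept.
 */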
+
+pgprot_t vm_get_page_prot(unsigned long vm_flags)
+{
+ unsigned long prot;
+
+ /* Radix supports execute-only, but protection_map maps X -> RX */
+ if (!radix_enabled() && ((vm_flags & VM_ACCESS_FLAGS) == VM_EXEC))
+ vm_flags |= VM_READ;
+
+ prot = pgprot_val(protection_map[vm_flags & (VM_ACCESS_FLAGS | VM_SHARED)]);
+
+ if (vm_flags & VM_SAO)
+ prot |= _PAGE_SAO;
+
+#ifdef CONFIG_PPC_MEM_KEYS
+ prot |= vmflag_to_pte_pkey_bits(vm_flags);
+#endif
+
+ return __pgprot(prot);
+}
+EXPORT_SYMBOL(vm_get_page_prot);
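/*
 * Editor's note (not part of the patch): an illustrative walk through
 * vm_get_page_prot() for an mmap(PROT_EXEC) mapping, i.e. vm_flags carrying
 * VM_EXEC only. On hash, VM_READ is ORed in first, so the RX entry of
 * protection_map is used; on radix the X-only entry is kept, giving a true
 * execute-only mapping. VM_SAO then adds _PAGE_SAO, and with
 * CONFIG_PPC_MEM_KEYS the VMA's pkey bits are folded into the result.
 */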
diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c
index 59e0ebbd8036..a974baf8f327 100644
--- a/arch/powerpc/mm/book3s64/pkeys.c
+++ b/arch/powerpc/mm/book3s64/pkeys.c
@@ -9,60 +9,108 @@
#include <asm/mmu_context.h>
#include <asm/mmu.h>
#include <asm/setup.h>
+#include <asm/smp.h>
+#include <asm/firmware.h>
+
#include <linux/pkeys.h>
-#include <linux/of_device.h>
+#include <linux/of_fdt.h>
-DEFINE_STATIC_KEY_TRUE(pkey_disabled);
-int pkeys_total; /* Total pkeys as per device tree */
-u32 initial_allocation_mask; /* Bits set for the initially allocated keys */
-u32 reserved_allocation_mask; /* Bits set for reserved keys */
-static bool pkey_execute_disable_supported;
-static bool pkeys_devtree_defined; /* property exported by device tree */
-static u64 pkey_amr_mask; /* Bits in AMR not to be touched */
-static u64 pkey_iamr_mask; /* Bits in AMR not to be touched */
-static u64 pkey_uamor_mask; /* Bits in UMOR not to be touched */
+
+int num_pkey; /* Max number of pkeys supported */
+/*
+ * Keys marked in the reservation list cannot be allocated by userspace
+ */
+u32 reserved_allocation_mask __ro_after_init;
+
+/* Bits set for the initially allocated keys */
+static u32 initial_allocation_mask __ro_after_init;
+
+/*
+ * Even if we allocate keys with sys_pkey_alloc(), we need to make sure
+ * other thread still find the access denied using the same keys.
+ */
+u64 default_amr __ro_after_init = ~0x0UL;
+u64 default_iamr __ro_after_init = 0x5555555555555555UL;
+u64 default_uamor __ro_after_init;
+EXPORT_SYMBOL(default_amr);
+/*
+ * Key used to implement PROT_EXEC mmap. Denies READ/WRITE
+ * We pick key 2 because 0 is special key and 1 is reserved as per ISA.
+ */
static int execute_only_key = 2;
+static bool pkey_execute_disable_supported;
+
#define AMR_BITS_PER_PKEY 2
#define AMR_RD_BIT 0x1UL
#define AMR_WR_BIT 0x2UL
#define IAMR_EX_BIT 0x1UL
-#define PKEY_REG_BITS (sizeof(u64)*8)
+#define PKEY_REG_BITS (sizeof(u64) * 8)
#define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey+1) * AMR_BITS_PER_PKEY))
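/*
 * Editor's note (not part of the patch): worked examples of pkeyshift().
 * Each key owns AMR_BITS_PER_PKEY = 2 bits, counted from the most
 * significant end of the 64-bit register:
 *
 *   pkeyshift(0)  = 64 - (0 + 1) * 2  = 62   (key 0, AMR bits 63:62)
 *   pkeyshift(2)  = 64 - (2 + 1) * 2  = 58   (the execute-only key)
 *   pkeyshift(31) = 64 - (31 + 1) * 2 = 0    (last possible key)
 *
 * AMR_RD_BIT, AMR_WR_BIT and IAMR_EX_BIT are shifted by this amount to
 * address one key's read-, write- or execute-deny bit.
 */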
-static void scan_pkey_feature(void)
+static int __init dt_scan_storage_keys(unsigned long node,
+ const char *uname, int depth,
+ void *data)
{
- u32 vals[2];
- struct device_node *cpu;
+ const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+ const __be32 *prop;
+ int *pkeys_total = (int *) data;
- cpu = of_find_node_by_type(NULL, "cpu");
- if (!cpu)
- return;
+ /* We are scanning "cpu" nodes only */
+ if (type == NULL || strcmp(type, "cpu") != 0)
+ return 0;
- if (of_property_read_u32_array(cpu,
- "ibm,processor-storage-keys", vals, 2))
- return;
+ prop = of_get_flat_dt_prop(node, "ibm,processor-storage-keys", NULL);
+ if (!prop)
+ return 0;
+ *pkeys_total = be32_to_cpu(prop[0]);
+ return 1;
+}
+
+static int __init scan_pkey_feature(void)
+{
+ int ret;
+ int pkeys_total = 0;
/*
- * Since any pkey can be used for data or execute, we will just treat
- * all keys as equal and track them as one entity.
+ * Pkey is not supported with Radix translation.
*/
- pkeys_total = vals[0];
- pkeys_devtree_defined = true;
-}
+ if (early_radix_enabled())
+ return 0;
-static inline bool pkey_mmu_enabled(void)
-{
- if (firmware_has_feature(FW_FEATURE_LPAR))
- return pkeys_total;
- else
- return cpu_has_feature(CPU_FTR_PKEY);
+ ret = of_scan_flat_dt(dt_scan_storage_keys, &pkeys_total);
+ if (ret == 0) {
+ /*
+		 * Let's assume 32 pkeys on P8/P9 bare metal, if it is not defined by the
+		 * device tree. We make this exception since some versions of skiboot forgot
+		 * to expose this property on power8/9.
+ */
+ if (!firmware_has_feature(FW_FEATURE_LPAR)) {
+ unsigned long pvr = mfspr(SPRN_PVR);
+
+ if (PVR_VER(pvr) == PVR_POWER8 || PVR_VER(pvr) == PVR_POWER8E ||
+ PVR_VER(pvr) == PVR_POWER8NVL || PVR_VER(pvr) == PVR_POWER9 ||
+ PVR_VER(pvr) == PVR_HX_C2000)
+ pkeys_total = 32;
+ }
+ }
+
+#ifdef CONFIG_PPC_MEM_KEYS
+ /*
+ * Adjust the upper limit, based on the number of bits supported by
+ * arch-neutral code.
+ */
+ pkeys_total = min_t(int, pkeys_total,
+ ((ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT) + 1));
+#endif
+ return pkeys_total;
}
-static int pkey_initialize(void)
+void __init pkey_early_init_devtree(void)
{
- int os_reserved, i;
+ int pkeys_total, i;
+#ifdef CONFIG_PPC_MEM_KEYS
/*
* We define PKEY_DISABLE_EXECUTE in addition to the arch-neutral
* generic defines for PKEY_DISABLE_ACCESS and PKEY_DISABLE_WRITE.
@@ -78,33 +126,22 @@ static int pkey_initialize(void)
BUILD_BUG_ON(__builtin_clzl(ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT) +
__builtin_popcountl(ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT)
!= (sizeof(u64) * BITS_PER_BYTE));
-
- /* scan the device tree for pkey feature */
- scan_pkey_feature();
-
+#endif
/*
- * Let's assume 32 pkeys on P8 bare metal, if its not defined by device
- * tree. We make this exception since skiboot forgot to expose this
- * property on power8.
+ * Only P7 and above supports SPRN_AMR update with MSR[PR] = 1
*/
- if (!pkeys_devtree_defined && !firmware_has_feature(FW_FEATURE_LPAR) &&
- cpu_has_feature(CPU_FTRS_POWER8))
- pkeys_total = 32;
+ if (!early_cpu_has_feature(CPU_FTR_ARCH_206))
+ return;
- /*
- * Adjust the upper limit, based on the number of bits supported by
- * arch-neutral code.
- */
- pkeys_total = min_t(int, pkeys_total,
- ((ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT)+1));
+ /* scan the device tree for pkey feature */
+ pkeys_total = scan_pkey_feature();
+ if (!pkeys_total)
+ goto out;
- if (!pkey_mmu_enabled() || radix_enabled() || !pkeys_total)
- static_branch_enable(&pkey_disabled);
- else
- static_branch_disable(&pkey_disabled);
+ /* Allow all keys to be modified by default */
+ default_uamor = ~0x0UL;
- if (static_branch_likely(&pkey_disabled))
- return 0;
+ cur_cpu_spec->mmu_features |= MMU_FTR_PKEY;
/*
* The device tree cannot be relied to indicate support for
@@ -118,122 +155,180 @@ static int pkey_initialize(void)
#ifdef CONFIG_PPC_4K_PAGES
/*
* The OS can manage only 8 pkeys due to its inability to represent them
- * in the Linux 4K PTE.
+ * in the Linux 4K PTE. Mark all other keys reserved.
*/
- os_reserved = pkeys_total - 8;
+ num_pkey = min(8, pkeys_total);
#else
- os_reserved = 0;
+ num_pkey = pkeys_total;
#endif
- /* Bits are in LE format. */
- reserved_allocation_mask = (0x1 << 1) | (0x1 << execute_only_key);
-
- /* register mask is in BE format */
- pkey_amr_mask = ~0x0ul;
- pkey_amr_mask &= ~(0x3ul << pkeyshift(0));
- pkey_iamr_mask = ~0x0ul;
- pkey_iamr_mask &= ~(0x3ul << pkeyshift(0));
- pkey_iamr_mask &= ~(0x3ul << pkeyshift(execute_only_key));
+ if (unlikely(num_pkey <= execute_only_key) || !pkey_execute_disable_supported) {
+ /*
+ * Insufficient number of keys to support
+ * execute only key. Mark it unavailable.
+ */
+ execute_only_key = -1;
+ } else {
+ /*
+ * Mark the execute_only_pkey as not available for
+ * user allocation via pkey_alloc.
+ */
+ reserved_allocation_mask |= (0x1 << execute_only_key);
- pkey_uamor_mask = ~0x0ul;
- pkey_uamor_mask &= ~(0x3ul << pkeyshift(0));
- pkey_uamor_mask &= ~(0x3ul << pkeyshift(execute_only_key));
+ /*
+ * Deny READ/WRITE for execute_only_key.
+ * Allow execute in IAMR.
+ */
+ default_amr |= (0x3ul << pkeyshift(execute_only_key));
+ default_iamr &= ~(0x1ul << pkeyshift(execute_only_key));
- /* mark the rest of the keys as reserved and hence unavailable */
- for (i = (pkeys_total - os_reserved); i < pkeys_total; i++) {
- reserved_allocation_mask |= (0x1 << i);
- pkey_uamor_mask &= ~(0x3ul << pkeyshift(i));
+ /*
+ * Clear the uamor bits for this key.
+ */
+ default_uamor &= ~(0x3ul << pkeyshift(execute_only_key));
}
- initial_allocation_mask = reserved_allocation_mask | (0x1 << 0);
- if (unlikely((pkeys_total - os_reserved) <= execute_only_key)) {
+ if (unlikely(num_pkey <= 3)) {
/*
* Insufficient number of keys to support
- * execute only key. Mark it unavailable.
- * Any AMR, UAMOR, IAMR bit set for
- * this key is irrelevant since this key
- * can never be allocated.
+ * KUAP/KUEP feature.
*/
- execute_only_key = -1;
+ disable_kuep = true;
+ disable_kuap = true;
+ WARN(1, "Disabling kernel user protection due to low (%d) max supported keys\n", num_pkey);
+ } else {
+		/* handle the key which is used by the kernel for KUAP */
+ reserved_allocation_mask |= (0x1 << 3);
+ /*
+ * Mark access for kup_key in default amr so that
+ * we continue to operate with that AMR in
+ * copy_to/from_user().
+ */
+ default_amr &= ~(0x3ul << pkeyshift(3));
+ default_iamr &= ~(0x1ul << pkeyshift(3));
+ default_uamor &= ~(0x3ul << pkeyshift(3));
}
- return 0;
-}
-
-arch_initcall(pkey_initialize);
-
-void pkey_mm_init(struct mm_struct *mm)
-{
- if (static_branch_likely(&pkey_disabled))
- return;
- mm_pkey_allocation_map(mm) = initial_allocation_mask;
- mm->context.execute_only_pkey = execute_only_key;
-}
+ /*
+	 * Allow access only for key 0, and prevent any other modification.
+ */
+ default_amr &= ~(0x3ul << pkeyshift(0));
+ default_iamr &= ~(0x1ul << pkeyshift(0));
+ default_uamor &= ~(0x3ul << pkeyshift(0));
+ /*
+	 * Key 0 is special in that we treat it as always allocated
+	 * (preallocated). We don't allow changing AMR bits w.r.t. key 0,
+	 * but one can pkey_free(key0).
+ */
+ initial_allocation_mask |= (0x1 << 0);
-static inline u64 read_amr(void)
-{
- return mfspr(SPRN_AMR);
-}
+ /*
+ * key 1 is recommended not to be used. PowerISA(3.0) page 1015,
+ * programming note.
+ */
+ reserved_allocation_mask |= (0x1 << 1);
+ default_uamor &= ~(0x3ul << pkeyshift(1));
-static inline void write_amr(u64 value)
-{
- mtspr(SPRN_AMR, value);
-}
+ /*
+ * Prevent the usage of OS reserved keys. Update UAMOR
+ * for those keys. Also mark the rest of the bits in the
+ * 32 bit mask as reserved.
+ */
+ for (i = num_pkey; i < 32 ; i++) {
+ reserved_allocation_mask |= (0x1 << i);
+ default_uamor &= ~(0x3ul << pkeyshift(i));
+ }
+ /*
+ * Prevent the allocation of reserved keys too.
+ */
+ initial_allocation_mask |= reserved_allocation_mask;
-static inline u64 read_iamr(void)
-{
- if (!likely(pkey_execute_disable_supported))
- return 0x0UL;
+ pr_info("Enabling pkeys with max key count %d\n", num_pkey);
+out:
+ /*
+ * Setup uamor on boot cpu
+ */
+ mtspr(SPRN_UAMOR, default_uamor);
- return mfspr(SPRN_IAMR);
+ return;
}
-static inline void write_iamr(u64 value)
+#ifdef CONFIG_PPC_KUEP
+void setup_kuep(bool disabled)
{
- if (!likely(pkey_execute_disable_supported))
+ if (disabled)
+ return;
+ /*
+	 * On hash, if the PKEY feature is not enabled, disable KUEP too.
+ */
+ if (!early_radix_enabled() && !early_mmu_has_feature(MMU_FTR_PKEY))
return;
- mtspr(SPRN_IAMR, value);
-}
+ if (smp_processor_id() == boot_cpuid) {
+ pr_info("Activating Kernel Userspace Execution Prevention\n");
+ cur_cpu_spec->mmu_features |= MMU_FTR_BOOK3S_KUEP;
+ }
-static inline u64 read_uamor(void)
-{
- return mfspr(SPRN_UAMOR);
+ /*
+ * Radix always uses key0 of the IAMR to determine if an access is
+ * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction
+ * fetch.
+ */
+ mtspr(SPRN_IAMR, AMR_KUEP_BLOCKED);
+ isync();
}
+#endif
-static inline void write_uamor(u64 value)
+#ifdef CONFIG_PPC_KUAP
+void setup_kuap(bool disabled)
{
- mtspr(SPRN_UAMOR, value);
-}
+ if (disabled)
+ return;
+ /*
+	 * On hash, if the PKEY feature is not enabled, disable KUAP too.
+ */
+ if (!early_radix_enabled() && !early_mmu_has_feature(MMU_FTR_PKEY))
+ return;
-static bool is_pkey_enabled(int pkey)
-{
- u64 uamor = read_uamor();
- u64 pkey_bits = 0x3ul << pkeyshift(pkey);
- u64 uamor_pkey_bits = (uamor & pkey_bits);
+ if (smp_processor_id() == boot_cpuid) {
+ pr_info("Activating Kernel Userspace Access Prevention\n");
+ cur_cpu_spec->mmu_features |= MMU_FTR_KUAP;
+ }
/*
- * Both the bits in UAMOR corresponding to the key should be set or
- * reset.
+ * Set the default kernel AMR values on all cpus.
*/
- WARN_ON(uamor_pkey_bits && (uamor_pkey_bits != pkey_bits));
- return !!(uamor_pkey_bits);
+ mtspr(SPRN_AMR, AMR_KUAP_BLOCKED);
+ isync();
+}
+#endif
+
+#ifdef CONFIG_PPC_MEM_KEYS
+void pkey_mm_init(struct mm_struct *mm)
+{
+ if (!mmu_has_feature(MMU_FTR_PKEY))
+ return;
+ mm_pkey_allocation_map(mm) = initial_allocation_mask;
+ mm->context.execute_only_pkey = execute_only_key;
}
static inline void init_amr(int pkey, u8 init_bits)
{
u64 new_amr_bits = (((u64)init_bits & 0x3UL) << pkeyshift(pkey));
- u64 old_amr = read_amr() & ~((u64)(0x3ul) << pkeyshift(pkey));
+ u64 old_amr = current_thread_amr() & ~((u64)(0x3ul) << pkeyshift(pkey));
- write_amr(old_amr | new_amr_bits);
+ current->thread.regs->amr = old_amr | new_amr_bits;
}
static inline void init_iamr(int pkey, u8 init_bits)
{
u64 new_iamr_bits = (((u64)init_bits & 0x1UL) << pkeyshift(pkey));
- u64 old_iamr = read_iamr() & ~((u64)(0x1ul) << pkeyshift(pkey));
+ u64 old_iamr = current_thread_iamr() & ~((u64)(0x1ul) << pkeyshift(pkey));
- write_iamr(old_iamr | new_iamr_bits);
+ if (!likely(pkey_execute_disable_supported))
+ return;
+
+ current->thread.regs->iamr = old_iamr | new_iamr_bits;
}
/*
@@ -245,8 +340,18 @@ int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
{
u64 new_amr_bits = 0x0ul;
u64 new_iamr_bits = 0x0ul;
+ u64 pkey_bits, uamor_pkey_bits;
+
+ /*
+ * Check whether the key is disabled by UAMOR.
+ */
+ pkey_bits = 0x3ul << pkeyshift(pkey);
+ uamor_pkey_bits = (default_uamor & pkey_bits);
- if (!is_pkey_enabled(pkey))
+ /*
+ * Both the bits in UAMOR corresponding to the key should be set
+ */
+ if (uamor_pkey_bits != pkey_bits)
return -EINVAL;
if (init_val & PKEY_DISABLE_EXECUTE) {
@@ -266,48 +371,7 @@ int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
return 0;
}
-void thread_pkey_regs_save(struct thread_struct *thread)
-{
- if (static_branch_likely(&pkey_disabled))
- return;
-
- /*
- * TODO: Skip saving registers if @thread hasn't used any keys yet.
- */
- thread->amr = read_amr();
- thread->iamr = read_iamr();
- thread->uamor = read_uamor();
-}
-
-void thread_pkey_regs_restore(struct thread_struct *new_thread,
- struct thread_struct *old_thread)
-{
- if (static_branch_likely(&pkey_disabled))
- return;
-
- if (old_thread->amr != new_thread->amr)
- write_amr(new_thread->amr);
- if (old_thread->iamr != new_thread->iamr)
- write_iamr(new_thread->iamr);
- if (old_thread->uamor != new_thread->uamor)
- write_uamor(new_thread->uamor);
-}
-
-void thread_pkey_regs_init(struct thread_struct *thread)
-{
- if (static_branch_likely(&pkey_disabled))
- return;
-
- thread->amr = pkey_amr_mask;
- thread->iamr = pkey_iamr_mask;
- thread->uamor = pkey_uamor_mask;
-
- write_uamor(pkey_uamor_mask);
- write_amr(pkey_amr_mask);
- write_iamr(pkey_iamr_mask);
-}
-
-int __execute_only_pkey(struct mm_struct *mm)
+int execute_only_pkey(struct mm_struct *mm)
{
return mm->context.execute_only_pkey;
}
@@ -315,7 +379,7 @@ int __execute_only_pkey(struct mm_struct *mm)
static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma)
{
/* Do this check first since the vm_flags should be hot */
- if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC)
+ if ((vma->vm_flags & VM_ACCESS_FLAGS) != VM_EXEC)
return false;
return (vma_pkey(vma) == vma->vm_mm->context.execute_only_pkey);
@@ -353,21 +417,20 @@ static bool pkey_access_permitted(int pkey, bool write, bool execute)
int pkey_shift;
u64 amr;
- if (!is_pkey_enabled(pkey))
- return true;
-
pkey_shift = pkeyshift(pkey);
- if (execute && !(read_iamr() & (IAMR_EX_BIT << pkey_shift)))
- return true;
+ if (execute)
+ return !(current_thread_iamr() & (IAMR_EX_BIT << pkey_shift));
- amr = read_amr(); /* Delay reading amr until absolutely needed */
- return ((!write && !(amr & (AMR_RD_BIT << pkey_shift))) ||
- (write && !(amr & (AMR_WR_BIT << pkey_shift))));
+ amr = current_thread_amr();
+ if (write)
+ return !(amr & (AMR_WR_BIT << pkey_shift));
+
+ return !(amr & (AMR_RD_BIT << pkey_shift));
}
bool arch_pte_access_permitted(u64 pte, bool write, bool execute)
{
- if (static_branch_likely(&pkey_disabled))
+ if (!mmu_has_feature(MMU_FTR_PKEY))
return true;
return pkey_access_permitted(pte_to_pkey_bits(pte), write, execute);
@@ -381,22 +444,10 @@ bool arch_pte_access_permitted(u64 pte, bool write, bool execute)
* So do not enforce things if the VMA is not from the current mm, or if we are
* in a kernel thread.
*/
-static inline bool vma_is_foreign(struct vm_area_struct *vma)
-{
- if (!current->mm)
- return true;
-
- /* if it is not our ->mm, it has to be foreign */
- if (current->mm != vma->vm_mm)
- return true;
-
- return false;
-}
-
bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write,
bool execute, bool foreign)
{
- if (static_branch_likely(&pkey_disabled))
+ if (!mmu_has_feature(MMU_FTR_PKEY))
return true;
/*
* Do not enforce our key-permissions on a foreign vma.
@@ -409,10 +460,12 @@ bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write,
void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm)
{
- if (static_branch_likely(&pkey_disabled))
+ if (!mmu_has_feature(MMU_FTR_PKEY))
return;
/* Duplicate the oldmm pkey state in mm: */
mm_pkey_allocation_map(mm) = mm_pkey_allocation_map(oldmm);
mm->context.execute_only_pkey = oldmm->context.execute_only_pkey;
}
+
+#endif /* CONFIG_PPC_MEM_KEYS */
diff --git a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
index cab06331c0c0..35fd2a95be24 100644
--- a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
+++ b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
@@ -2,8 +2,6 @@
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/security.h>
-#include <asm/pgtable.h>
-#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/machdep.h>
#include <asm/mman.h>
@@ -34,62 +32,14 @@ void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long st
struct hstate *hstate = hstate_file(vma->vm_file);
psize = hstate_get_psize(hstate);
- radix__flush_tlb_range_psize(vma->vm_mm, start, end, psize);
-}
-
-/*
- * A vairant of hugetlb_get_unmapped_area doing topdown search
- * FIXME!! should we do as x86 does or non hugetlb area does ?
- * ie, use topdown or not based on mmap_is_legacy check ?
- */
-unsigned long
-radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
- unsigned long len, unsigned long pgoff,
- unsigned long flags)
-{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- struct hstate *h = hstate_file(file);
- int fixed = (flags & MAP_FIXED);
- unsigned long high_limit;
- struct vm_unmapped_area_info info;
-
- high_limit = DEFAULT_MAP_WINDOW;
- if (addr >= high_limit || (fixed && (addr + len > high_limit)))
- high_limit = TASK_SIZE;
-
- if (len & ~huge_page_mask(h))
- return -EINVAL;
- if (len > high_limit)
- return -ENOMEM;
-
- if (fixed) {
- if (addr > high_limit - len)
- return -ENOMEM;
- if (prepare_hugepage_range(file, addr, len))
- return -EINVAL;
- return addr;
- }
-
- if (addr) {
- addr = ALIGN(addr, huge_page_size(h));
- vma = find_vma(mm, addr);
- if (high_limit - len >= addr && addr >= mmap_min_addr &&
- (!vma || addr + len <= vm_start_gap(vma)))
- return addr;
- }
/*
- * We are always doing an topdown search here. Slice code
- * does that too.
+ * Flush PWC even if we get PUD_SIZE hugetlb invalidate to keep this simpler.
*/
- info.flags = VM_UNMAPPED_AREA_TOPDOWN;
- info.length = len;
- info.low_limit = max(PAGE_SIZE, mmap_min_addr);
- info.high_limit = mm->mmap_base + (high_limit - DEFAULT_MAP_WINDOW);
- info.align_mask = PAGE_MASK & ~huge_page_mask(h);
- info.align_offset = 0;
-
- return vm_unmapped_area(&info);
+ if (end - start >= PUD_SIZE)
+ radix__flush_tlb_pwc_range_psize(vma->vm_mm, start, end, psize);
+ else
+ radix__flush_tlb_range_psize(vma->vm_mm, start, end, psize);
+ mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end);
}
void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
@@ -97,14 +47,17 @@ void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
pte_t old_pte, pte_t pte)
{
struct mm_struct *mm = vma->vm_mm;
+ unsigned long psize = huge_page_size(hstate_vma(vma));
/*
- * To avoid NMMU hang while relaxing access we need to flush the tlb before
- * we set the new value.
+ * POWER9 NMMU must flush the TLB after clearing the PTE before
+ * installing a PTE with more relaxed access permissions, see
+ * radix__ptep_set_access_flags.
*/
- if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
- (atomic_read(&mm->context.copros) > 0))
+ if (!cpu_has_feature(CPU_FTR_ARCH_31) &&
+ is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
+ atomic_read(&mm->context.copros) > 0)
radix__flush_hugetlb_page(vma, addr);
- set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
+ set_huge_pte_at(vma->vm_mm, addr, ptep, pte, psize);
}
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 974109bb85db..c6a4ac766b2b 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -11,13 +11,13 @@
#include <linux/kernel.h>
#include <linux/sched/mm.h>
#include <linux/memblock.h>
+#include <linux/of.h>
#include <linux/of_fdt.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/string_helpers.h>
-#include <linux/stop_machine.h>
+#include <linux/memory.h>
-#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/mmu_context.h>
#include <asm/dma.h>
@@ -26,13 +26,16 @@
#include <asm/firmware.h>
#include <asm/powernv.h>
#include <asm/sections.h>
+#include <asm/smp.h>
#include <asm/trace.h>
#include <asm/uaccess.h>
#include <asm/ultravisor.h>
+#include <asm/set_memory.h>
#include <trace/events/thp.h>
-unsigned int mmu_pid_bits;
+#include <mm/mmu_decl.h>
+
unsigned int mmu_base_pid;
static __ref void *early_alloc_pgtable(unsigned long size, int nid,
@@ -56,6 +59,13 @@ static __ref void *early_alloc_pgtable(unsigned long size, int nid,
return ptr;
}
+/*
+ * When allocating pud or pmd pointers, we allocate a complete page
+ * of PAGE_SIZE rather than PUD_TABLE_SIZE or PMD_TABLE_SIZE. This
+ * is to ensure that the page obtained from the memblock allocator
+ * can be completely used as a page table page and can be freed
+ * correctly when the page table entries are removed.
+ */
static int early_map_kernel_page(unsigned long ea, unsigned long pa,
pgprot_t flags,
unsigned int map_page_size,
@@ -64,24 +74,26 @@ static int early_map_kernel_page(unsigned long ea, unsigned long pa,
{
unsigned long pfn = pa >> PAGE_SHIFT;
pgd_t *pgdp;
+ p4d_t *p4dp;
pud_t *pudp;
pmd_t *pmdp;
pte_t *ptep;
pgdp = pgd_offset_k(ea);
- if (pgd_none(*pgdp)) {
- pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
- region_start, region_end);
- pgd_populate(&init_mm, pgdp, pudp);
+ p4dp = p4d_offset(pgdp, ea);
+ if (p4d_none(*p4dp)) {
+ pudp = early_alloc_pgtable(PAGE_SIZE, nid,
+ region_start, region_end);
+ p4d_populate(&init_mm, p4dp, pudp);
}
- pudp = pud_offset(pgdp, ea);
+ pudp = pud_offset(p4dp, ea);
if (map_page_size == PUD_SIZE) {
ptep = (pte_t *)pudp;
goto set_the_pte;
}
if (pud_none(*pudp)) {
- pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid,
- region_start, region_end);
+ pmdp = early_alloc_pgtable(PAGE_SIZE, nid, region_start,
+ region_end);
pud_populate(&init_mm, pudp, pmdp);
}
pmdp = pmd_offset(pudp, ea);
@@ -98,7 +110,7 @@ static int early_map_kernel_page(unsigned long ea, unsigned long pa,
set_the_pte:
set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
- smp_wmb();
+ asm volatile("ptesync": : :"memory");
return 0;
}
@@ -114,6 +126,7 @@ static int __map_kernel_page(unsigned long ea, unsigned long pa,
{
unsigned long pfn = pa >> PAGE_SHIFT;
pgd_t *pgdp;
+ p4d_t *p4dp;
pud_t *pudp;
pmd_t *pmdp;
pte_t *ptep;
@@ -136,7 +149,8 @@ static int __map_kernel_page(unsigned long ea, unsigned long pa,
* boot.
*/
pgdp = pgd_offset_k(ea);
- pudp = pud_alloc(&init_mm, pgdp, ea);
+ p4dp = p4d_offset(pgdp, ea);
+ pudp = pud_alloc(&init_mm, p4dp, ea);
if (!pudp)
return -ENOMEM;
if (map_page_size == PUD_SIZE) {
@@ -156,7 +170,7 @@ static int __map_kernel_page(unsigned long ea, unsigned long pa,
set_the_pte:
set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
- smp_wmb();
+ asm volatile("ptesync": : :"memory");
return 0;
}
@@ -168,11 +182,12 @@ int radix__map_kernel_page(unsigned long ea, unsigned long pa,
}
#ifdef CONFIG_STRICT_KERNEL_RWX
-void radix__change_memory_range(unsigned long start, unsigned long end,
- unsigned long clear)
+static void radix__change_memory_range(unsigned long start, unsigned long end,
+ unsigned long clear)
{
unsigned long idx;
pgd_t *pgdp;
+ p4d_t *p4dp;
pud_t *pudp;
pmd_t *pmdp;
pte_t *ptep;
@@ -185,7 +200,8 @@ void radix__change_memory_range(unsigned long start, unsigned long end,
for (idx = start; idx < end; idx += PAGE_SIZE) {
pgdp = pgd_offset_k(idx);
- pudp = pud_alloc(&init_mm, pgdp, idx);
+ p4dp = p4d_offset(pgdp, idx);
+ pudp = pud_alloc(&init_mm, p4dp, idx);
if (!pudp)
continue;
if (pud_is_leaf(*pudp)) {
@@ -214,9 +230,17 @@ void radix__mark_rodata_ro(void)
unsigned long start, end;
start = (unsigned long)_stext;
- end = (unsigned long)__init_begin;
+ end = (unsigned long)__end_rodata;
radix__change_memory_range(start, end, _PAGE_WRITE);
+
+ for (start = PAGE_OFFSET; start < (unsigned long)_stext; start += PAGE_SIZE) {
+ end = start + PAGE_SIZE;
+ if (overlaps_interrupt_vector_text(start, end))
+ radix__change_memory_range(start, end, _PAGE_WRITE);
+ else
+ break;
+ }
}
void radix__mark_initmem_nx(void)
@@ -245,27 +269,50 @@ print_mapping(unsigned long start, unsigned long end, unsigned long size, bool e
static unsigned long next_boundary(unsigned long addr, unsigned long end)
{
#ifdef CONFIG_STRICT_KERNEL_RWX
- if (addr < __pa_symbol(__init_begin))
- return __pa_symbol(__init_begin);
+ unsigned long stext_phys;
+
+ stext_phys = __pa_symbol(_stext);
+
+ // Relocatable kernel running at non-zero real address
+ if (stext_phys != 0) {
+ // The end of interrupts code at zero is a rodata boundary
+ unsigned long end_intr = __pa_symbol(__end_interrupts) - stext_phys;
+ if (addr < end_intr)
+ return end_intr;
+
+ // Start of relocated kernel text is a rodata boundary
+ if (addr < stext_phys)
+ return stext_phys;
+ }
+
+ if (addr < __pa_symbol(__srwx_boundary))
+ return __pa_symbol(__srwx_boundary);
#endif
return end;
}
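/*
 * Editor's note (not part of the patch): a worked example of the boundary
 * logic above, with a hypothetical relocatable kernel whose _stext sits at
 * 64M while a copy of the interrupt vectors remains at real address 0.
 * Walking the linear map, next_boundary() then reports, in order:
 * __pa(__end_interrupts) - __pa(_stext) (end of the low copy), __pa(_stext)
 * (start of the relocated text), and __pa(__srwx_boundary) (end of
 * text/rodata), so create_physical_mapping() never spans a permission
 * change with a single large page.
 */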
static int __meminit create_physical_mapping(unsigned long start,
unsigned long end,
- int nid)
+ int nid, pgprot_t _prot)
{
unsigned long vaddr, addr, mapping_size = 0;
bool prev_exec, exec = false;
pgprot_t prot;
int psize;
+ unsigned long max_mapping_size = memory_block_size;
+
+ if (debug_pagealloc_enabled_or_kfence())
+ max_mapping_size = PAGE_SIZE;
- start = _ALIGN_UP(start, PAGE_SIZE);
+ start = ALIGN(start, PAGE_SIZE);
+ end = ALIGN_DOWN(end, PAGE_SIZE);
for (addr = start; addr < end; addr += mapping_size) {
unsigned long gap, previous_size;
int rc;
gap = next_boundary(addr, end) - addr;
+ if (gap > max_mapping_size)
+ gap = max_mapping_size;
previous_size = mapping_size;
prev_exec = exec;
@@ -289,7 +336,7 @@ static int __meminit create_physical_mapping(unsigned long start,
prot = PAGE_KERNEL_X;
exec = true;
} else {
- prot = PAGE_KERNEL;
+ prot = _prot;
exec = false;
}
@@ -312,51 +359,40 @@ static int __meminit create_physical_mapping(unsigned long start,
static void __init radix_init_pgtable(void)
{
unsigned long rts_field;
- struct memblock_region *reg;
+ phys_addr_t start, end;
+ u64 i;
/* We don't support slb for radix */
- mmu_slb_size = 0;
+ slb_set_size(0);
+
/*
- * Create the linear mapping, using standard page size for now
+ * Create the linear mapping
*/
- for_each_memblock(memory, reg) {
+ for_each_mem_range(i, &start, &end) {
/*
* The memblock allocator is up at this point, so the
* page tables will be allocated within the range. No
 		 * need for a node (which we don't have yet).
*/
- if ((reg->base + reg->size) >= RADIX_VMALLOC_START) {
+ if (end >= RADIX_VMALLOC_START) {
pr_warn("Outside the supported range\n");
continue;
}
- WARN_ON(create_physical_mapping(reg->base,
- reg->base + reg->size,
- -1));
+ WARN_ON(create_physical_mapping(start, end,
+ -1, PAGE_KERNEL));
}
- /* Find out how many PID bits are supported */
- if (cpu_has_feature(CPU_FTR_HVMODE)) {
- if (!mmu_pid_bits)
- mmu_pid_bits = 20;
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+ if (!cpu_has_feature(CPU_FTR_HVMODE) &&
+ cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) {
/*
- * When KVM is possible, we only use the top half of the
- * PID space to avoid collisions between host and guest PIDs
- * which can cause problems due to prefetch when exiting the
- * guest with AIL=3
+ * Older versions of KVM on these machines prefer if the
+ * guest only uses the low 19 PID bits.
*/
- mmu_base_pid = 1 << (mmu_pid_bits - 1);
-#else
- mmu_base_pid = 1;
-#endif
- } else {
- /* The guest uses the bottom half of the PID space */
- if (!mmu_pid_bits)
- mmu_pid_bits = 19;
- mmu_base_pid = 1;
+ mmu_pid_bits = 19;
}
+ mmu_base_pid = 1;
/*
* Allocate Partition table and process table for the
@@ -435,11 +471,6 @@ static int __init radix_dt_scan_page_sizes(unsigned long node,
if (type == NULL || strcmp(type, "cpu") != 0)
return 0;
- /* Find MMU PID size */
- prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size);
- if (prop && size == 4)
- mmu_pid_bits = be32_to_cpup(prop);
-
/* Grab page size encodings */
prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
if (!prop)
@@ -462,6 +493,7 @@ static int __init radix_dt_scan_page_sizes(unsigned long node,
def = &mmu_psize_defs[idx];
def->shift = shift;
def->ap = ap;
+ def->h_rpt_pgsize = psize_to_rpti_pgsize(idx);
}
/* needed ? */
@@ -477,88 +509,35 @@ void __init radix__early_init_devtree(void)
* Try to find the available page sizes in the device-tree
*/
rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
- if (rc != 0) /* Found */
- goto found;
- /*
- * let's assume we have page 4k and 64k support
- */
- mmu_psize_defs[MMU_PAGE_4K].shift = 12;
- mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;
-
- mmu_psize_defs[MMU_PAGE_64K].shift = 16;
- mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
-found:
- return;
-}
-
-static void radix_init_amor(void)
-{
- /*
- * In HV mode, we init AMOR (Authority Mask Override Register) so that
- * the hypervisor and guest can setup IAMR (Instruction Authority Mask
- * Register), enable key 0 and set it to 1.
- *
- * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11)
- */
- mtspr(SPRN_AMOR, (3ul << 62));
-}
-
-#ifdef CONFIG_PPC_KUEP
-void setup_kuep(bool disabled)
-{
- if (disabled || !early_radix_enabled())
- return;
-
- if (smp_processor_id() == boot_cpuid)
- pr_info("Activating Kernel Userspace Execution Prevention\n");
-
- /*
- * Radix always uses key0 of the IAMR to determine if an access is
- * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction
- * fetch.
- */
- mtspr(SPRN_IAMR, (1ul << 62));
-}
-#endif
-
-#ifdef CONFIG_PPC_KUAP
-void setup_kuap(bool disabled)
-{
- if (disabled || !early_radix_enabled())
- return;
-
- if (smp_processor_id() == boot_cpuid) {
- pr_info("Activating Kernel Userspace Access Prevention\n");
- cur_cpu_spec->mmu_features |= MMU_FTR_RADIX_KUAP;
+ if (!rc) {
+ /*
+ * No page size details found in device tree.
+ * Let's assume we have page 4k and 64k support
+ */
+ mmu_psize_defs[MMU_PAGE_4K].shift = 12;
+ mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;
+ mmu_psize_defs[MMU_PAGE_4K].h_rpt_pgsize =
+ psize_to_rpti_pgsize(MMU_PAGE_4K);
+
+ mmu_psize_defs[MMU_PAGE_64K].shift = 16;
+ mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
+ mmu_psize_defs[MMU_PAGE_64K].h_rpt_pgsize =
+ psize_to_rpti_pgsize(MMU_PAGE_64K);
}
-
- /* Make sure userspace can't change the AMR */
- mtspr(SPRN_UAMOR, 0);
- mtspr(SPRN_AMR, AMR_KUAP_BLOCKED);
- isync();
+ return;
}
-#endif
void __init radix__early_init_mmu(void)
{
unsigned long lpcr;
+#ifdef CONFIG_PPC_64S_HASH_MMU
#ifdef CONFIG_PPC_64K_PAGES
/* PAGE_SIZE mappings */
mmu_virtual_psize = MMU_PAGE_64K;
#else
mmu_virtual_psize = MMU_PAGE_4K;
#endif
-
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
- /* vmemmap mapping */
- if (mmu_psize_defs[MMU_PAGE_2M].shift) {
- /*
- * map vmemmap using 2M if available
- */
- mmu_vmemmap_psize = MMU_PAGE_2M;
- } else
- mmu_vmemmap_psize = mmu_virtual_psize;
#endif
/*
* initialize page table size
@@ -599,7 +578,6 @@ void __init radix__early_init_mmu(void)
lpcr = mfspr(SPRN_LPCR);
mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
radix_init_partition_table();
- radix_init_amor();
} else {
radix_init_pseries();
}
@@ -623,15 +601,17 @@ void radix__early_init_mmu_secondary(void)
set_ptcr_when_no_uv(__pa(partition_tb) |
(PATB_SIZE_SHIFT - 12));
-
- radix_init_amor();
}
radix__switch_mmu_context(NULL, &init_mm);
tlbiel_all();
+
+ /* Make sure userspace can't change the AMR */
+ mtspr(SPRN_UAMOR, 0);
}
-void radix__mmu_cleanup_all(void)
+/* Called during kexec sequence with MMU off */
+notrace void radix__mmu_cleanup_all(void)
{
unsigned long lpcr;
@@ -644,21 +624,6 @@ void radix__mmu_cleanup_all(void)
}
}
-void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
- phys_addr_t first_memblock_size)
-{
- /*
- * We don't currently support the first MEMBLOCK not mapping 0
- * physical on those processors
- */
- BUG_ON(first_memblock_base != 0);
-
- /*
- * Radix mode is not limited by RMA / VRMA addressing.
- */
- ppc64_rma_size = ULONG_MAX;
-}
-
#ifdef CONFIG_MEMORY_HOTPLUG
static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
{
@@ -690,108 +655,108 @@ static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
pud_clear(pud);
}
-struct change_mapping_params {
- pte_t *pte;
- unsigned long start;
- unsigned long end;
- unsigned long aligned_start;
- unsigned long aligned_end;
-};
-
-static int __meminit stop_machine_change_mapping(void *data)
+static void free_pud_table(pud_t *pud_start, p4d_t *p4d)
{
- struct change_mapping_params *params =
- (struct change_mapping_params *)data;
+ pud_t *pud;
+ int i;
- if (!data)
- return -1;
+ for (i = 0; i < PTRS_PER_PUD; i++) {
+ pud = pud_start + i;
+ if (!pud_none(*pud))
+ return;
+ }
- spin_unlock(&init_mm.page_table_lock);
- pte_clear(&init_mm, params->aligned_start, params->pte);
- create_physical_mapping(__pa(params->aligned_start), __pa(params->start), -1);
- create_physical_mapping(__pa(params->end), __pa(params->aligned_end), -1);
- spin_lock(&init_mm.page_table_lock);
- return 0;
+ pud_free(&init_mm, pud_start);
+ p4d_clear(p4d);
}
-static void remove_pte_table(pte_t *pte_start, unsigned long addr,
- unsigned long end)
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
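+/*
+ * A vmemmap backing page or PMD may only be freed once nothing in the region
+ * it covers is populated any more; vmemmap_populated() checks the covering
+ * range for the two helpers below.
+ */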
+static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end)
{
- unsigned long next;
- pte_t *pte;
+ unsigned long start = ALIGN_DOWN(addr, PMD_SIZE);
- pte = pte_start + pte_index(addr);
- for (; addr < end; addr = next, pte++) {
- next = (addr + PAGE_SIZE) & PAGE_MASK;
- if (next > end)
- next = end;
+ return !vmemmap_populated(start, PMD_SIZE);
+}
- if (!pte_present(*pte))
- continue;
+static bool __meminit vmemmap_page_is_unused(unsigned long addr, unsigned long end)
+{
+ unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE);
- if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) {
- /*
- * The vmemmap_free() and remove_section_mapping()
- * codepaths call us with aligned addresses.
- */
- WARN_ONCE(1, "%s: unaligned range\n", __func__);
- continue;
- }
+ return !vmemmap_populated(start, PAGE_SIZE);
- pte_clear(&init_mm, addr, pte);
- }
}
+#endif
-/*
- * clear the pte and potentially split the mapping helper
- */
-static void __meminit split_kernel_mapping(unsigned long addr, unsigned long end,
- unsigned long size, pte_t *pte)
+static void __meminit free_vmemmap_pages(struct page *page,
+ struct vmem_altmap *altmap,
+ int order)
{
- unsigned long mask = ~(size - 1);
- unsigned long aligned_start = addr & mask;
- unsigned long aligned_end = addr + size;
- struct change_mapping_params params;
- bool split_region = false;
+ unsigned int nr_pages = 1 << order;
+
+ if (altmap) {
+ unsigned long alt_start, alt_end;
+ unsigned long base_pfn = page_to_pfn(page);
- if ((end - addr) < size) {
/*
- * We're going to clear the PTE, but not flushed
- * the mapping, time to remap and flush. The
- * effects if visible outside the processor or
- * if we are running in code close to the
- * mapping we cleared, we are in trouble.
+			 * With 2M vmemmap mapping we can have things set up
+			 * such that even though an altmap is specified we
+			 * never actually use the altmap.
*/
- if (overlaps_kernel_text(aligned_start, addr) ||
- overlaps_kernel_text(end, aligned_end)) {
- /*
- * Hack, just return, don't pte_clear
- */
- WARN_ONCE(1, "Linear mapping %lx->%lx overlaps kernel "
- "text, not splitting\n", addr, end);
+ alt_start = altmap->base_pfn;
+ alt_end = altmap->base_pfn + altmap->reserve + altmap->free;
+
+ if (base_pfn >= alt_start && base_pfn < alt_end) {
+ vmem_altmap_free(altmap, nr_pages);
return;
}
- split_region = true;
}
- if (split_region) {
- params.pte = pte;
- params.start = addr;
- params.end = end;
- params.aligned_start = addr & ~(size - 1);
- params.aligned_end = min_t(unsigned long, aligned_end,
- (unsigned long)__va(memblock_end_of_DRAM()));
- stop_machine(stop_machine_change_mapping, &params, NULL);
- return;
- }
+ if (PageReserved(page)) {
+ /* allocated from memblock */
+ while (nr_pages--)
+ free_reserved_page(page++);
+ } else
+ free_pages((unsigned long)page_address(page), order);
+}
+
+static void __meminit remove_pte_table(pte_t *pte_start, unsigned long addr,
+ unsigned long end, bool direct,
+ struct vmem_altmap *altmap)
+{
+ unsigned long next, pages = 0;
+ pte_t *pte;
+
+ pte = pte_start + pte_index(addr);
+ for (; addr < end; addr = next, pte++) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ if (next > end)
+ next = end;
+
+ if (!pte_present(*pte))
+ continue;
- pte_clear(&init_mm, addr, pte);
+ if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
+ if (!direct)
+ free_vmemmap_pages(pte_page(*pte), altmap, 0);
+ pte_clear(&init_mm, addr, pte);
+ pages++;
+ }
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ else if (!direct && vmemmap_page_is_unused(addr, next)) {
+ free_vmemmap_pages(pte_page(*pte), altmap, 0);
+ pte_clear(&init_mm, addr, pte);
+ }
+#endif
+ }
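+	/* Keep the direct-map page counters in sync when tearing down the linear mapping */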
+ if (direct)
+ update_page_count(mmu_virtual_psize, -pages);
}
-static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
- unsigned long end)
+static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
+ unsigned long end, bool direct,
+ struct vmem_altmap *altmap)
{
- unsigned long next;
+ unsigned long next, pages = 0;
pte_t *pte_base;
pmd_t *pmd;
@@ -803,20 +768,35 @@ static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
continue;
if (pmd_is_leaf(*pmd)) {
- split_kernel_mapping(addr, end, PMD_SIZE, (pte_t *)pmd);
+ if (IS_ALIGNED(addr, PMD_SIZE) &&
+ IS_ALIGNED(next, PMD_SIZE)) {
+ if (!direct)
+ free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
+ pte_clear(&init_mm, addr, (pte_t *)pmd);
+ pages++;
+ }
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ else if (!direct && vmemmap_pmd_is_unused(addr, next)) {
+ free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
+ pte_clear(&init_mm, addr, (pte_t *)pmd);
+ }
+#endif
continue;
}
pte_base = (pte_t *)pmd_page_vaddr(*pmd);
- remove_pte_table(pte_base, addr, next);
+ remove_pte_table(pte_base, addr, next, direct, altmap);
free_pte_table(pte_base, pmd);
}
+ if (direct)
+ update_page_count(MMU_PAGE_2M, -pages);
}
-static void remove_pud_table(pud_t *pud_start, unsigned long addr,
- unsigned long end)
+static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr,
+ unsigned long end, bool direct,
+ struct vmem_altmap *altmap)
{
- unsigned long next;
+ unsigned long next, pages = 0;
pmd_t *pmd_base;
pud_t *pud;
@@ -828,21 +808,32 @@ static void remove_pud_table(pud_t *pud_start, unsigned long addr,
continue;
if (pud_is_leaf(*pud)) {
- split_kernel_mapping(addr, end, PUD_SIZE, (pte_t *)pud);
+ if (!IS_ALIGNED(addr, PUD_SIZE) ||
+ !IS_ALIGNED(next, PUD_SIZE)) {
+ WARN_ONCE(1, "%s: unaligned range\n", __func__);
+ continue;
+ }
+ pte_clear(&init_mm, addr, (pte_t *)pud);
+ pages++;
continue;
}
- pmd_base = (pmd_t *)pud_page_vaddr(*pud);
- remove_pmd_table(pmd_base, addr, next);
+ pmd_base = pud_pgtable(*pud);
+ remove_pmd_table(pmd_base, addr, next, direct, altmap);
free_pmd_table(pmd_base, pud);
}
+ if (direct)
+ update_page_count(MMU_PAGE_1G, -pages);
}
-static void __meminit remove_pagetable(unsigned long start, unsigned long end)
+static void __meminit
+remove_pagetable(unsigned long start, unsigned long end, bool direct,
+ struct vmem_altmap *altmap)
{
unsigned long addr, next;
pud_t *pud_base;
pgd_t *pgd;
+ p4d_t *p4d;
spin_lock(&init_mm.page_table_lock);
@@ -850,35 +841,46 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end)
next = pgd_addr_end(addr, end);
pgd = pgd_offset_k(addr);
- if (!pgd_present(*pgd))
+ p4d = p4d_offset(pgd, addr);
+ if (!p4d_present(*p4d))
continue;
- if (pgd_is_leaf(*pgd)) {
- split_kernel_mapping(addr, end, PGDIR_SIZE, (pte_t *)pgd);
+ if (p4d_is_leaf(*p4d)) {
+ if (!IS_ALIGNED(addr, P4D_SIZE) ||
+ !IS_ALIGNED(next, P4D_SIZE)) {
+ WARN_ONCE(1, "%s: unaligned range\n", __func__);
+ continue;
+ }
+
+ pte_clear(&init_mm, addr, (pte_t *)pgd);
continue;
}
- pud_base = (pud_t *)pgd_page_vaddr(*pgd);
- remove_pud_table(pud_base, addr, next);
+ pud_base = p4d_pgtable(*p4d);
+ remove_pud_table(pud_base, addr, next, direct, altmap);
+ free_pud_table(pud_base, p4d);
}
spin_unlock(&init_mm.page_table_lock);
radix__flush_tlb_kernel_range(start, end);
}
-int __meminit radix__create_section_mapping(unsigned long start, unsigned long end, int nid)
+int __meminit radix__create_section_mapping(unsigned long start,
+ unsigned long end, int nid,
+ pgprot_t prot)
{
if (end >= RADIX_VMALLOC_START) {
pr_warn("Outside the supported range\n");
return -1;
}
- return create_physical_mapping(__pa(start), __pa(end), nid);
+ return create_physical_mapping(__pa(start), __pa(end),
+ nid, prot);
}
int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
{
- remove_pagetable(start, end);
+ remove_pagetable(start, end, true, NULL);
return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG */
@@ -896,7 +898,6 @@ int __meminit radix__vmemmap_create_mapping(unsigned long start,
unsigned long phys)
{
/* Create a PTE encoding */
- unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW;
int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
int ret;
@@ -905,20 +906,453 @@ int __meminit radix__vmemmap_create_mapping(unsigned long start,
return -1;
}
- ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid);
+ ret = __map_kernel_page_nid(start, phys, PAGE_KERNEL, page_size, nid);
BUG_ON(ret);
return 0;
}
+
+bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
+{
+ if (radix_enabled())
+ return __vmemmap_can_optimize(altmap, pgmap);
+
+ return false;
+}
+
+int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
+ unsigned long addr, unsigned long next)
+{
+ int large = pmd_large(*pmdp);
+
+ if (large)
+ vmemmap_verify(pmdp_ptep(pmdp), node, addr, next);
+
+ return large;
+}
+
+void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node,
+ unsigned long addr, unsigned long next)
+{
+ pte_t entry;
+ pte_t *ptep = pmdp_ptep(pmdp);
+
+ VM_BUG_ON(!IS_ALIGNED(addr, PMD_SIZE));
+ entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
+ set_pte_at(&init_mm, addr, ptep, entry);
+ asm volatile("ptesync": : :"memory");
+
+ vmemmap_verify(ptep, node, addr, next);
+}
+
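+/*
+ * Install a base page size vmemmap PTE at @addr. The backing page comes from
+ * the altmap when possible (falling back to RAM), or from @reuse when tail
+ * struct pages are being deduplicated.
+ */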
+static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long addr,
+ int node,
+ struct vmem_altmap *altmap,
+ struct page *reuse)
+{
+ pte_t *pte = pte_offset_kernel(pmdp, addr);
+
+ if (pte_none(*pte)) {
+ pte_t entry;
+ void *p;
+
+ if (!reuse) {
+ /*
+ * make sure we don't create altmap mappings
+ * covering things outside the device.
+ */
+ if (altmap && altmap_cross_boundary(altmap, addr, PAGE_SIZE))
+ altmap = NULL;
+
+ p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
+ if (!p && altmap)
+ p = vmemmap_alloc_block_buf(PAGE_SIZE, node, NULL);
+ if (!p)
+ return NULL;
+ pr_debug("PAGE_SIZE vmemmap mapping\n");
+ } else {
+ /*
+ * When a PTE/PMD entry is freed from the init_mm
+ * there's a free_pages() call to this page allocated
+ * above. Thus this get_page() is paired with the
+ * put_page_testzero() on the freeing path.
+		 * This can only be called from certain ZONE_DEVICE paths,
+ * and through vmemmap_populate_compound_pages() when
+ * slab is available.
+ */
+ get_page(reuse);
+ p = page_to_virt(reuse);
+ pr_debug("Tail page reuse vmemmap mapping\n");
+ }
+
+ VM_BUG_ON(!PAGE_ALIGNED(addr));
+ entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
+ set_pte_at(&init_mm, addr, pte, entry);
+ asm volatile("ptesync": : :"memory");
+ }
+ return pte;
+}
+
+static inline pud_t *vmemmap_pud_alloc(p4d_t *p4dp, int node,
+ unsigned long address)
+{
+ pud_t *pud;
+
+	/* To keep it simple, all early vmemmap mappings are done at PAGE_SIZE */
+ if (unlikely(p4d_none(*p4dp))) {
+ if (unlikely(!slab_is_available())) {
+ pud = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
+ p4d_populate(&init_mm, p4dp, pud);
+ /* go to the pud_offset */
+ } else
+ return pud_alloc(&init_mm, p4dp, address);
+ }
+ return pud_offset(p4dp, address);
+}
+
+static inline pmd_t *vmemmap_pmd_alloc(pud_t *pudp, int node,
+ unsigned long address)
+{
+ pmd_t *pmd;
+
+	/* To keep it simple, all early vmemmap mappings are done at PAGE_SIZE */
+ if (unlikely(pud_none(*pudp))) {
+ if (unlikely(!slab_is_available())) {
+ pmd = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
+ pud_populate(&init_mm, pudp, pmd);
+ } else
+ return pmd_alloc(&init_mm, pudp, address);
+ }
+ return pmd_offset(pudp, address);
+}
+
+static inline pte_t *vmemmap_pte_alloc(pmd_t *pmdp, int node,
+ unsigned long address)
+{
+ pte_t *pte;
+
+	/* To keep it simple, all early vmemmap mappings are done at PAGE_SIZE */
+ if (unlikely(pmd_none(*pmdp))) {
+ if (unlikely(!slab_is_available())) {
+ pte = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
+ pmd_populate(&init_mm, pmdp, pte);
+ } else
+ return pte_alloc_kernel(pmdp, address);
+ }
+ return pte_offset_kernel(pmdp, address);
+}
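+
+/*
+ * The vmemmap_{pud,pmd,pte}_alloc() helpers above take their page table pages
+ * from memblock via early_alloc_pgtable() before slab is available, and from
+ * the generic pud_alloc()/pmd_alloc()/pte_alloc_kernel() paths afterwards.
+ */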
+
+
+
+int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, int node,
+ struct vmem_altmap *altmap)
+{
+ unsigned long addr;
+ unsigned long next;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ for (addr = start; addr < end; addr = next) {
+ next = pmd_addr_end(addr, end);
+
+ pgd = pgd_offset_k(addr);
+ p4d = p4d_offset(pgd, addr);
+ pud = vmemmap_pud_alloc(p4d, node, addr);
+ if (!pud)
+ return -ENOMEM;
+ pmd = vmemmap_pmd_alloc(pud, node, addr);
+ if (!pmd)
+ return -ENOMEM;
+
+ if (pmd_none(READ_ONCE(*pmd))) {
+ void *p;
+
+ /*
+			 * Keep it simple by checking addr PMD_SIZE alignment
+			 * and verifying the device boundary condition.
+			 * For us to use a pmd mapping, both addr and pfn should
+			 * be aligned. We skip if addr is not aligned and, for
+			 * the pfn, we hope the altmap has extra area that helps
+			 * us find an aligned block. This can result in altmap
+			 * block allocation failures, in which case we fall back
+			 * to RAM for the vmemmap allocation.
+ */
+ if (altmap && (!IS_ALIGNED(addr, PMD_SIZE) ||
+ altmap_cross_boundary(altmap, addr, PMD_SIZE))) {
+ /*
+ * make sure we don't create altmap mappings
+ * covering things outside the device.
+ */
+ goto base_mapping;
+ }
+
+ p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
+ if (p) {
+ vmemmap_set_pmd(pmd, p, node, addr, next);
+ pr_debug("PMD_SIZE vmemmap mapping\n");
+ continue;
+ } else if (altmap) {
+ /*
+				 * A vmemmap block allocation can fail due to
+				 * alignment requirements, since we try to align
+				 * things aggressively and thereby run out of
+				 * space. Try a base mapping on failure.
+ */
+ goto base_mapping;
+ }
+ } else if (vmemmap_check_pmd(pmd, node, addr, next)) {
+ /*
+			 * If a huge mapping exists due to an early call to
+			 * vmemmap_populate(), try to use that.
+ */
+ continue;
+ }
+base_mapping:
+ /*
+		 * Unable to allocate higher-order memory to back the memmap,
+		 * or we found a pointer to a pte page. Allocate base page
+		 * size vmemmap.
+ */
+ pte = vmemmap_pte_alloc(pmd, node, addr);
+ if (!pte)
+ return -ENOMEM;
+
+ pte = radix__vmemmap_pte_populate(pmd, addr, node, altmap, NULL);
+ if (!pte)
+ return -ENOMEM;
+
+ vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+ next = addr + PAGE_SIZE;
+ }
+ return 0;
+}
+
+static pte_t * __meminit radix__vmemmap_populate_address(unsigned long addr, int node,
+ struct vmem_altmap *altmap,
+ struct page *reuse)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = pgd_offset_k(addr);
+ p4d = p4d_offset(pgd, addr);
+ pud = vmemmap_pud_alloc(p4d, node, addr);
+ if (!pud)
+ return NULL;
+ pmd = vmemmap_pmd_alloc(pud, node, addr);
+ if (!pmd)
+ return NULL;
+ if (pmd_leaf(*pmd))
+ /*
+ * The second page is mapped as a hugepage due to a nearby request.
+ * Force our mapping to page size without deduplication
+ */
+ return NULL;
+ pte = vmemmap_pte_alloc(pmd, node, addr);
+ if (!pte)
+ return NULL;
+ radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
+ vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+
+ return pte;
+}
+
+static pte_t * __meminit vmemmap_compound_tail_page(unsigned long addr,
+ unsigned long pfn_offset, int node)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ unsigned long map_addr;
+
+ /* the second vmemmap page which we use for duplication */
+ map_addr = addr - pfn_offset * sizeof(struct page) + PAGE_SIZE;
+ pgd = pgd_offset_k(map_addr);
+ p4d = p4d_offset(pgd, map_addr);
+ pud = vmemmap_pud_alloc(p4d, node, map_addr);
+ if (!pud)
+ return NULL;
+ pmd = vmemmap_pmd_alloc(pud, node, map_addr);
+ if (!pmd)
+ return NULL;
+ if (pmd_leaf(*pmd))
+ /*
+ * The second page is mapped as a hugepage due to a nearby request.
+ * Force our mapping to page size without deduplication
+ */
+ return NULL;
+ pte = vmemmap_pte_alloc(pmd, node, map_addr);
+ if (!pte)
+ return NULL;
+ /*
+	 * Check if there exists a mapping to the left
+ */
+ if (pte_none(*pte)) {
+ /*
+ * Populate the head page vmemmap page.
+		 * It can fall in a different pmd, hence
+		 * vmemmap_populate_address().
+ */
+ pte = radix__vmemmap_populate_address(map_addr - PAGE_SIZE, node, NULL, NULL);
+ if (!pte)
+ return NULL;
+ /*
+		 * Populate the vmemmap page for the tail pages
+ */
+ pte = radix__vmemmap_pte_populate(pmd, map_addr, node, NULL, NULL);
+ if (!pte)
+ return NULL;
+ vmemmap_verify(pte, node, map_addr, map_addr + PAGE_SIZE);
+ return pte;
+ }
+ return pte;
+}
+
+int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
+ unsigned long start,
+ unsigned long end, int node,
+ struct dev_pagemap *pgmap)
+{
+ /*
+	 * We want to map things with base page size mappings so that
+	 * we can save space in the vmemmap. We could have a huge mapping
+	 * covering both edges.
+ */
+ unsigned long addr;
+ unsigned long addr_pfn = start_pfn;
+ unsigned long next;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ for (addr = start; addr < end; addr = next) {
+
+ pgd = pgd_offset_k(addr);
+ p4d = p4d_offset(pgd, addr);
+ pud = vmemmap_pud_alloc(p4d, node, addr);
+ if (!pud)
+ return -ENOMEM;
+ pmd = vmemmap_pmd_alloc(pud, node, addr);
+ if (!pmd)
+ return -ENOMEM;
+
+ if (pmd_leaf(READ_ONCE(*pmd))) {
+ /* existing huge mapping. Skip the range */
+ addr_pfn += (PMD_SIZE >> PAGE_SHIFT);
+ next = pmd_addr_end(addr, end);
+ continue;
+ }
+ pte = vmemmap_pte_alloc(pmd, node, addr);
+ if (!pte)
+ return -ENOMEM;
+ if (!pte_none(*pte)) {
+ /*
+ * This could be because we already have a compound
+ * page whose VMEMMAP_RESERVE_NR pages were mapped and
+			 * this request falls within those pages.
+ */
+ addr_pfn += 1;
+ next = addr + PAGE_SIZE;
+ continue;
+ } else {
+ unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
+ unsigned long pfn_offset = addr_pfn - ALIGN_DOWN(addr_pfn, nr_pages);
+ pte_t *tail_page_pte;
+
+ /*
+			 * If the address is aligned to the huge page size, it is the
+ * head mapping.
+ */
+ if (pfn_offset == 0) {
+ /* Populate the head page vmemmap page */
+ pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
+ if (!pte)
+ return -ENOMEM;
+ vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+
+ /*
+				 * Populate the vmemmap page for the tail pages.
+				 * It can fall in a different pmd, hence
+				 * vmemmap_populate_address().
+ */
+ pte = radix__vmemmap_populate_address(addr + PAGE_SIZE, node, NULL, NULL);
+ if (!pte)
+ return -ENOMEM;
+
+ addr_pfn += 2;
+ next = addr + 2 * PAGE_SIZE;
+ continue;
+ }
+ /*
+			 * Get the details of the 2nd mapping, and
+			 * create it if it doesn't exist.
+ */
+ tail_page_pte = vmemmap_compound_tail_page(addr, pfn_offset, node);
+ if (!tail_page_pte) {
+
+ pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
+ if (!pte)
+ return -ENOMEM;
+ vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+
+ addr_pfn += 1;
+ next = addr + PAGE_SIZE;
+ continue;
+ }
+
+ pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, pte_page(*tail_page_pte));
+ if (!pte)
+ return -ENOMEM;
+ vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+
+ addr_pfn += 1;
+ next = addr + PAGE_SIZE;
+ continue;
+ }
+ }
+ return 0;
+}
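+
+/*
+ * The layout produced above: for each compound page the head vmemmap page and
+ * the first tail vmemmap page get their own backing pages; every further tail
+ * vmemmap page is mapped to that same first tail page (with get_page() taking
+ * a reference per mapping), which is what saves memory for large ZONE_DEVICE
+ * folios.
+ */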
+
+
#ifdef CONFIG_MEMORY_HOTPLUG
void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
{
- remove_pagetable(start, start + page_size);
+ remove_pagetable(start, start + page_size, true, NULL);
+}
+
+void __ref radix__vmemmap_free(unsigned long start, unsigned long end,
+ struct vmem_altmap *altmap)
+{
+ remove_pagetable(start, end, false, altmap);
}
#endif
#endif
+#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE)
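+/*
+ * With debug_pagealloc/KFENCE the linear map is built from base page size
+ * mappings (see create_physical_mapping()), so individual pages can be mapped
+ * back in (set_memory_p()) or unmapped (set_memory_np()) here to make stray
+ * accesses fault.
+ */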
+void radix__kernel_map_pages(struct page *page, int numpages, int enable)
+{
+ unsigned long addr;
+
+ addr = (unsigned long)page_address(page);
+
+ if (enable)
+ set_memory_p(addr, numpages);
+ else
+ set_memory_np(addr, numpages);
+}
+#endif
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
@@ -932,8 +1366,25 @@ unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long add
assert_spin_locked(pmd_lockptr(mm, pmdp));
#endif
- old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1);
- trace_hugepage_update(addr, old, clr, set);
+ old = radix__pte_update(mm, addr, pmdp_ptep(pmdp), clr, set, 1);
+ trace_hugepage_update_pmd(addr, old, clr, set);
+
+ return old;
+}
+
+unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr,
+ pud_t *pudp, unsigned long clr,
+ unsigned long set)
+{
+ unsigned long old;
+
+#ifdef CONFIG_DEBUG_VM
+ WARN_ON(!pud_devmap(*pudp));
+ assert_spin_locked(pud_lockptr(mm, pudp));
+#endif
+
+ old = radix__pte_update(mm, addr, pudp_ptep(pudp), clr, set, 1);
+ trace_hugepage_update_pud(addr, old, clr, set);
return old;
}
@@ -953,9 +1404,6 @@ pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addre
pmd = *pmdp;
pmd_clear(pmdp);
- /*FIXME!! Verify whether we need this kick below */
- serialize_against_pte_lookup(vma->vm_mm);
-
radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);
return pmd;
@@ -1014,41 +1462,46 @@ pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
old_pmd = __pmd(old);
- /*
- * Serialize against find_current_mm_pte which does lock-less
- * lookup in page tables with local interrupts disabled. For huge pages
- * it casts pmd_t to pte_t. Since format of pte_t is different from
- * pmd_t we want to prevent transit from pmd pointing to page table
- * to pmd pointing to huge page (and back) while interrupts are disabled.
- * We clear pmd to possibly replace it with page table pointer in
- * different code paths. So make sure we wait for the parallel
- * find_current_mm_pte to finish.
- */
- serialize_against_pte_lookup(mm);
return old_pmd;
}
+pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pud_t *pudp)
+{
+ pud_t old_pud;
+ unsigned long old;
+
+ old = radix__pud_hugepage_update(mm, addr, pudp, ~0UL, 0);
+ old_pud = __pud(old);
+ return old_pud;
+}
+
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
pte_t entry, unsigned long address, int psize)
{
struct mm_struct *mm = vma->vm_mm;
- unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
- _PAGE_RW | _PAGE_EXEC);
+ unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_SOFT_DIRTY |
+ _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC);
unsigned long change = pte_val(entry) ^ pte_val(*ptep);
/*
- * To avoid NMMU hang while relaxing access, we need mark
- * the pte invalid in between.
+ * On POWER9, the NMMU is not able to relax PTE access permissions
+ * for a translation with a TLB. The PTE must be invalidated, TLB
+ * flushed before the new PTE is installed.
+ *
+ * This only needs to be done for radix, because hash translation does
+ * flush when updating the linux pte (and we don't support NMMU
+ * accelerators on HPT on POWER9 anyway XXX: do we?).
+ *
+ * POWER10 (and P9P) NMMU does behave as per ISA.
*/
- if ((change & _PAGE_RW) && atomic_read(&mm->context.copros) > 0) {
+ if (!cpu_has_feature(CPU_FTR_ARCH_31) && (change & _PAGE_RW) &&
+ atomic_read(&mm->context.copros) > 0) {
unsigned long old_pte, new_pte;
old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
- /*
- * new value of pte
- */
new_pte = old_pte | set;
radix__flush_tlb_page_psize(mm, address, psize);
__radix_pte_update(ptep, _PAGE_INVALID, new_pte);
@@ -1056,9 +1509,12 @@ void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
__radix_pte_update(ptep, 0, set);
/*
* Book3S does not require a TLB flush when relaxing access
- * restrictions when the address space is not attached to a
- * NMMU, because the core MMU will reload the pte after taking
- * an access fault, which is defined by the architectue.
+	 * restrictions in an address space (modulo the POWER9 nest
+ * MMU issue above) because the MMU will reload the PTE after
+ * taking an access fault, as defined by the architecture. See
+ * "Setting a Reference or Change Bit or Upgrading Access
+ * Authority (PTE Subject to Atomic Hardware Updates)" in
+ * Power ISA Version 3.1B.
*/
}
/* See ptesync comment in radix__set_pte_at */
@@ -1071,33 +1527,18 @@ void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
struct mm_struct *mm = vma->vm_mm;
/*
- * To avoid NMMU hang while relaxing access we need to flush the tlb before
- * we set the new value. We need to do this only for radix, because hash
- * translation does flush when updating the linux pte.
+ * POWER9 NMMU must flush the TLB after clearing the PTE before
+ * installing a PTE with more relaxed access permissions, see
+ * radix__ptep_set_access_flags.
*/
- if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
+ if (!cpu_has_feature(CPU_FTR_ARCH_31) &&
+ is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
(atomic_read(&mm->context.copros) > 0))
radix__flush_tlb_page(vma, addr);
set_pte_at(mm, addr, ptep, pte);
}
-int __init arch_ioremap_pud_supported(void)
-{
- /* HPT does not cope with large pages in the vmalloc area */
- return radix_enabled();
-}
-
-int __init arch_ioremap_pmd_supported(void)
-{
- return radix_enabled();
-}
-
-int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
-{
- return 0;
-}
-
int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
{
pte_t *ptep = (pte_t *)pud;
@@ -1113,7 +1554,7 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
int pud_clear_huge(pud_t *pud)
{
- if (pud_huge(*pud)) {
+ if (pud_is_leaf(*pud)) {
pud_clear(pud);
return 1;
}
@@ -1126,7 +1567,7 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr)
pmd_t *pmd;
int i;
- pmd = (pmd_t *)pud_page_vaddr(*pud);
+ pmd = pud_pgtable(*pud);
pud_clear(pud);
flush_tlb_kernel_range(addr, addr + PUD_SIZE);
@@ -1160,7 +1601,7 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
int pmd_clear_huge(pmd_t *pmd)
{
- if (pmd_huge(*pmd)) {
+ if (pmd_is_leaf(*pmd)) {
pmd_clear(pmd);
return 1;
}
@@ -1181,8 +1622,3 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
return 1;
}
-
-int __init arch_ioremap_p4d_supported(void)
-{
- return 0;
-}
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index a95175c0972b..9e1f6558d026 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -10,16 +10,16 @@
#include <linux/memblock.h>
#include <linux/mmu_context.h>
#include <linux/sched/mm.h>
+#include <linux/debugfs.h>
#include <asm/ppc-opcode.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/trace.h>
#include <asm/cputhreads.h>
+#include <asm/plpar_wrappers.h>
-#define RIC_FLUSH_TLB 0
-#define RIC_FLUSH_PWC 1
-#define RIC_FLUSH_ALL 2
+#include "internal.h"
/*
* tlbiel instruction for radix, set invalidation
@@ -55,16 +55,23 @@ static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is)
if (early_cpu_has_feature(CPU_FTR_HVMODE)) {
/* MSR[HV] should flush partition scope translations first. */
tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0);
- for (set = 1; set < num_sets; set++)
- tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0);
+
+ if (!early_cpu_has_feature(CPU_FTR_ARCH_31)) {
+ for (set = 1; set < num_sets; set++)
+ tlbiel_radix_set_isa300(set, is, 0,
+ RIC_FLUSH_TLB, 0);
+ }
}
/* Flush process scoped entries. */
tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1);
- for (set = 1; set < num_sets; set++)
- tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1);
- asm volatile("ptesync": : :"memory");
+ if (!early_cpu_has_feature(CPU_FTR_ARCH_31)) {
+ for (set = 1; set < num_sets; set++)
+ tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1);
+ }
+
+ ppc_after_tlbiel_barrier();
}
void radix__tlbiel_all(unsigned int action)
@@ -244,7 +251,6 @@ static inline void fixup_tlbie_pid(unsigned long pid)
}
}
-
static inline void fixup_tlbie_lpid_va(unsigned long va, unsigned long lpid,
unsigned long ap)
{
@@ -281,29 +287,39 @@ static inline void fixup_tlbie_lpid(unsigned long lpid)
/*
* We use 128 set in radix mode and 256 set in hpt mode.
*/
-static __always_inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
+static inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
{
int set;
asm volatile("ptesync": : :"memory");
- /*
- * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
- * also flush the entire Page Walk Cache.
- */
- __tlbiel_pid(pid, 0, ric);
+ switch (ric) {
+ case RIC_FLUSH_PWC:
- /* For PWC, only one flush is needed */
- if (ric == RIC_FLUSH_PWC) {
- asm volatile("ptesync": : :"memory");
+ /* For PWC, only one flush is needed */
+ __tlbiel_pid(pid, 0, RIC_FLUSH_PWC);
+ ppc_after_tlbiel_barrier();
return;
+ case RIC_FLUSH_TLB:
+ __tlbiel_pid(pid, 0, RIC_FLUSH_TLB);
+ break;
+ case RIC_FLUSH_ALL:
+ default:
+ /*
+ * Flush the first set of the TLB, and if
+ * we're doing a RIC_FLUSH_ALL, also flush
+ * the entire Page Walk Cache.
+ */
+ __tlbiel_pid(pid, 0, RIC_FLUSH_ALL);
}
- /* For the remaining sets, just flush the TLB */
- for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
- __tlbiel_pid(pid, set, RIC_FLUSH_TLB);
+ if (!cpu_has_feature(CPU_FTR_ARCH_31)) {
+ /* For the remaining sets, just flush the TLB */
+ for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
+ __tlbiel_pid(pid, set, RIC_FLUSH_TLB);
+ }
- asm volatile("ptesync": : :"memory");
+ ppc_after_tlbiel_barrier();
asm volatile(PPC_RADIX_INVALIDATE_ERAT_USER "; isync" : : :"memory");
}
@@ -313,7 +329,7 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
/*
* Workaround the fact that the "ric" argument to __tlbie_pid
- * must be a compile-time contraint to match the "i" constraint
+ * must be a compile-time constraint to match the "i" constraint
* in the asm statement.
*/
switch (ric) {
@@ -430,7 +446,7 @@ static __always_inline void _tlbiel_va(unsigned long va, unsigned long pid,
asm volatile("ptesync": : :"memory");
__tlbiel_va(va, pid, ap, ric);
- asm volatile("ptesync": : :"memory");
+ ppc_after_tlbiel_barrier();
}
static inline void _tlbiel_va_range(unsigned long start, unsigned long end,
@@ -441,7 +457,7 @@ static inline void _tlbiel_va_range(unsigned long start, unsigned long end,
if (also_pwc)
__tlbiel_pid(pid, 0, RIC_FLUSH_PWC);
__tlbiel_va_range(start, end, pid, page_size, psize);
- asm volatile("ptesync": : :"memory");
+ ppc_after_tlbiel_barrier();
}
static inline void __tlbie_va_range(unsigned long start, unsigned long end,
@@ -565,12 +581,13 @@ static inline void _tlbiel_va_range_multicast(struct mm_struct *mm,
*/
void radix__local_flush_tlb_mm(struct mm_struct *mm)
{
- unsigned long pid;
+ unsigned long pid = mm->context.id;
+
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
+ return;
preempt_disable();
- pid = mm->context.id;
- if (pid != MMU_NO_CONTEXT)
- _tlbiel_pid(pid, RIC_FLUSH_TLB);
+ _tlbiel_pid(pid, RIC_FLUSH_TLB);
preempt_enable();
}
EXPORT_SYMBOL(radix__local_flush_tlb_mm);
@@ -578,26 +595,33 @@ EXPORT_SYMBOL(radix__local_flush_tlb_mm);
#ifndef CONFIG_SMP
void radix__local_flush_all_mm(struct mm_struct *mm)
{
- unsigned long pid;
+ unsigned long pid = mm->context.id;
+
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
+ return;
preempt_disable();
- pid = mm->context.id;
- if (pid != MMU_NO_CONTEXT)
- _tlbiel_pid(pid, RIC_FLUSH_ALL);
+ _tlbiel_pid(pid, RIC_FLUSH_ALL);
preempt_enable();
}
EXPORT_SYMBOL(radix__local_flush_all_mm);
+
+static void __flush_all_mm(struct mm_struct *mm, bool fullmm)
+{
+ radix__local_flush_all_mm(mm);
+}
#endif /* CONFIG_SMP */
void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
int psize)
{
- unsigned long pid;
+ unsigned long pid = mm->context.id;
+
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
+ return;
preempt_disable();
- pid = mm->context.id;
- if (pid != MMU_NO_CONTEXT)
- _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
+ _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
preempt_enable();
}
@@ -612,47 +636,86 @@ void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmadd
}
EXPORT_SYMBOL(radix__local_flush_tlb_page);
-static bool mm_is_singlethreaded(struct mm_struct *mm)
-{
- if (atomic_read(&mm->context.copros) > 0)
- return false;
- if (atomic_read(&mm->mm_users) <= 1 && current->mm == mm)
- return true;
- return false;
-}
-
static bool mm_needs_flush_escalation(struct mm_struct *mm)
{
/*
- * P9 nest MMU has issues with the page walk cache
- * caching PTEs and not flushing them properly when
- * RIC = 0 for a PID/LPID invalidate
+ * The P9 nest MMU has issues with the page walk cache caching PTEs
+ * and not flushing them when RIC = 0 for a PID/LPID invalidate.
+ *
+ * This may have been fixed in shipping firmware (by disabling PWC
+ * or preventing it from caching PTEs), but until that is confirmed,
+ * this workaround is required - escalate all RIC=0 IS=1/2/3 flushes
+ * to RIC=2.
+ *
+ * POWER10 (and P9P) does not have this problem.
*/
+ if (cpu_has_feature(CPU_FTR_ARCH_31))
+ return false;
if (atomic_read(&mm->context.copros) > 0)
return true;
return false;
}
-#ifdef CONFIG_SMP
-static void do_exit_flush_lazy_tlb(void *arg)
+/*
+ * If always_flush is true, then flush even if this CPU can't be removed
+ * from mm_cpumask.
+ */
+void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush)
{
- struct mm_struct *mm = arg;
unsigned long pid = mm->context.id;
+ int cpu = smp_processor_id();
+ /*
+ * A kthread could have done a mmget_not_zero() after the flushing CPU
+ * checked mm_cpumask, and be in the process of kthread_use_mm when
+ * interrupted here. In that case, current->mm will be set to mm,
+ * because kthread_use_mm() setting ->mm and switching to the mm is
+ * done with interrupts off.
+ */
if (current->mm == mm)
- return; /* Local CPU */
+ goto out;
if (current->active_mm == mm) {
+ unsigned long flags;
+
+ WARN_ON_ONCE(current->mm != NULL);
/*
- * Must be a kernel thread because sender is single-threaded.
+ * It is a kernel thread and is using mm as the lazy tlb, so
+ * switch it to init_mm. This is not always called from IPI
+ * (e.g., flush_type_needed), so must disable irqs.
*/
- BUG_ON(current->mm);
- mmgrab(&init_mm);
- switch_mm(mm, &init_mm, current);
+ local_irq_save(flags);
+ mmgrab_lazy_tlb(&init_mm);
current->active_mm = &init_mm;
- mmdrop(mm);
+ switch_mm_irqs_off(mm, &init_mm, current);
+ mmdrop_lazy_tlb(mm);
+ local_irq_restore(flags);
}
- _tlbiel_pid(pid, RIC_FLUSH_ALL);
+
+ /*
+ * This IPI may be initiated from any source including those not
+ * running the mm, so there may be a racing IPI that comes after
+ * this one which finds the cpumask already clear. Check and avoid
+ * underflowing the active_cpus count in that case. The race should
+ * not otherwise be a problem, but the TLB must be flushed because
+ * that's what the caller expects.
+ */
+ if (cpumask_test_cpu(cpu, mm_cpumask(mm))) {
+ dec_mm_active_cpus(mm);
+ cpumask_clear_cpu(cpu, mm_cpumask(mm));
+ always_flush = true;
+ }
+
+out:
+ if (always_flush)
+ _tlbiel_pid(pid, RIC_FLUSH_ALL);
+}
+
+#ifdef CONFIG_SMP
+static void do_exit_flush_lazy_tlb(void *arg)
+{
+ struct mm_struct *mm = arg;
+ exit_lazy_flush_tlb(mm, true);
}
static void exit_flush_lazy_tlbs(struct mm_struct *mm)
@@ -666,30 +729,136 @@ static void exit_flush_lazy_tlbs(struct mm_struct *mm)
*/
smp_call_function_many(mm_cpumask(mm), do_exit_flush_lazy_tlb,
(void *)mm, 1);
- mm_reset_thread_local(mm);
}
+#else /* CONFIG_SMP */
+static inline void exit_flush_lazy_tlbs(struct mm_struct *mm) { }
+#endif /* CONFIG_SMP */
+
+static DEFINE_PER_CPU(unsigned int, mm_cpumask_trim_clock);
+
+/*
+ * Interval between flushes at which we send out IPIs to check whether the
+ * mm_cpumask can be trimmed for the case where it's not a single-threaded
+ * process flushing its own mm. The intent is to reduce the cost of later
+ * flushes. We don't want this to be so low that it adds noticeable cost to TLB
+ * flushing, or so high that it doesn't help reduce global TLBIEs.
+ */
+static unsigned long tlb_mm_cpumask_trim_timer = 1073;
+
+static bool tick_and_test_trim_clock(void)
+{
+ if (__this_cpu_inc_return(mm_cpumask_trim_clock) ==
+ tlb_mm_cpumask_trim_timer) {
+ __this_cpu_write(mm_cpumask_trim_clock, 0);
+ return true;
+ }
+ return false;
+}
+
+enum tlb_flush_type {
+ FLUSH_TYPE_NONE,
+ FLUSH_TYPE_LOCAL,
+ FLUSH_TYPE_GLOBAL,
+};
+
+static enum tlb_flush_type flush_type_needed(struct mm_struct *mm, bool fullmm)
+{
+ int active_cpus = atomic_read(&mm->context.active_cpus);
+ int cpu = smp_processor_id();
+
+ if (active_cpus == 0)
+ return FLUSH_TYPE_NONE;
+ if (active_cpus == 1 && cpumask_test_cpu(cpu, mm_cpumask(mm))) {
+ if (current->mm != mm) {
+ /*
+ * Asynchronous flush sources may trim down to nothing
+ * if the process is not running, so occasionally try
+ * to trim.
+ */
+ if (tick_and_test_trim_clock()) {
+ exit_lazy_flush_tlb(mm, true);
+ return FLUSH_TYPE_NONE;
+ }
+ }
+ return FLUSH_TYPE_LOCAL;
+ }
+
+ /* Coprocessors require TLBIE to invalidate nMMU. */
+ if (atomic_read(&mm->context.copros) > 0)
+ return FLUSH_TYPE_GLOBAL;
+
+ /*
+ * In the fullmm case there's no point doing the exit_flush_lazy_tlbs
+ * because the mm is being taken down anyway, and a TLBIE tends to
+ * be faster than an IPI+TLBIEL.
+ */
+ if (fullmm)
+ return FLUSH_TYPE_GLOBAL;
+
+ /*
+ * If we are running the only thread of a single-threaded process,
+ * then we should almost always be able to trim off the rest of the
+ * CPU mask (except in the case of use_mm() races), so always try
+ * trimming the mask.
+ */
+ if (atomic_read(&mm->mm_users) <= 1 && current->mm == mm) {
+ exit_flush_lazy_tlbs(mm);
+ /*
+ * use_mm() race could prevent IPIs from being able to clear
+ * the cpumask here, however those users are established
+ * after our first check (and so after the PTEs are removed),
+ * and the TLB still gets flushed by the IPI, so this CPU
+ * will only require a local flush.
+ */
+ return FLUSH_TYPE_LOCAL;
+ }
+
+ /*
+ * Occasionally try to trim down the cpumask. It's possible this can
+ * bring the mask to zero, which results in no flush.
+ */
+ if (tick_and_test_trim_clock()) {
+ exit_flush_lazy_tlbs(mm);
+ if (current->mm == mm)
+ return FLUSH_TYPE_LOCAL;
+ if (cpumask_test_cpu(cpu, mm_cpumask(mm)))
+ exit_lazy_flush_tlb(mm, true);
+ return FLUSH_TYPE_NONE;
+ }
+
+ return FLUSH_TYPE_GLOBAL;
+}
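+
+/*
+ * In short, the heuristic above chooses:
+ *  - FLUSH_TYPE_NONE   if no CPU is using the mm,
+ *  - FLUSH_TYPE_LOCAL  if only this CPU has it active (tlbiel is enough),
+ *  - FLUSH_TYPE_GLOBAL if coprocessors are attached, the mm is being torn
+ *    down, or other CPUs may hold translations, with occasional attempts to
+ *    trim mm_cpumask so later flushes can stay local.
+ */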
+
+#ifdef CONFIG_SMP
void radix__flush_tlb_mm(struct mm_struct *mm)
{
unsigned long pid;
+ enum tlb_flush_type type;
pid = mm->context.id;
- if (unlikely(pid == MMU_NO_CONTEXT))
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
return;
preempt_disable();
/*
- * Order loads of mm_cpumask vs previous stores to clear ptes before
- * the invalidate. See barrier in switch_mm_irqs_off
+ * Order loads of mm_cpumask (in flush_type_needed) vs previous
+ * stores to clear ptes before the invalidate. See barrier in
+ * switch_mm_irqs_off
*/
smp_mb();
- if (!mm_is_thread_local(mm)) {
- if (unlikely(mm_is_singlethreaded(mm))) {
- exit_flush_lazy_tlbs(mm);
- goto local;
- }
-
- if (cputlb_use_tlbie()) {
+ type = flush_type_needed(mm, false);
+ if (type == FLUSH_TYPE_LOCAL) {
+ _tlbiel_pid(pid, RIC_FLUSH_TLB);
+ } else if (type == FLUSH_TYPE_GLOBAL) {
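+		/*
+		 * Without GTSE the guest is not permitted to use tlbie, so
+		 * ask the hypervisor to do the invalidation with the
+		 * H_RPT_INVALIDATE hcall instead.
+		 */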
+ if (!mmu_has_feature(MMU_FTR_GTSE)) {
+ unsigned long tgt = H_RPTI_TARGET_CMMU;
+
+ if (atomic_read(&mm->context.copros) > 0)
+ tgt |= H_RPTI_TARGET_NMMU;
+ pseries_rpt_invalidate(pid, tgt, H_RPTI_TYPE_TLB,
+ H_RPTI_PAGE_ALL, 0, -1UL);
+ } else if (cputlb_use_tlbie()) {
if (mm_needs_flush_escalation(mm))
_tlbie_pid(pid, RIC_FLUSH_ALL);
else
@@ -697,40 +866,43 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
} else {
_tlbiel_pid_multicast(mm, pid, RIC_FLUSH_TLB);
}
- } else {
-local:
- _tlbiel_pid(pid, RIC_FLUSH_TLB);
}
preempt_enable();
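+	/* Tell secondary MMUs registered via mmu notifiers (e.g. IOMMU/nMMU users) to drop their translations too */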
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
}
EXPORT_SYMBOL(radix__flush_tlb_mm);
static void __flush_all_mm(struct mm_struct *mm, bool fullmm)
{
unsigned long pid;
+ enum tlb_flush_type type;
pid = mm->context.id;
- if (unlikely(pid == MMU_NO_CONTEXT))
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
return;
preempt_disable();
smp_mb(); /* see radix__flush_tlb_mm */
- if (!mm_is_thread_local(mm)) {
- if (unlikely(mm_is_singlethreaded(mm))) {
- if (!fullmm) {
- exit_flush_lazy_tlbs(mm);
- goto local;
- }
- }
- if (cputlb_use_tlbie())
+ type = flush_type_needed(mm, fullmm);
+ if (type == FLUSH_TYPE_LOCAL) {
+ _tlbiel_pid(pid, RIC_FLUSH_ALL);
+ } else if (type == FLUSH_TYPE_GLOBAL) {
+ if (!mmu_has_feature(MMU_FTR_GTSE)) {
+ unsigned long tgt = H_RPTI_TARGET_CMMU;
+ unsigned long type = H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC |
+ H_RPTI_TYPE_PRT;
+
+ if (atomic_read(&mm->context.copros) > 0)
+ tgt |= H_RPTI_TARGET_NMMU;
+ pseries_rpt_invalidate(pid, tgt, type,
+ H_RPTI_PAGE_ALL, 0, -1UL);
+ } else if (cputlb_use_tlbie())
_tlbie_pid(pid, RIC_FLUSH_ALL);
else
_tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL);
- } else {
-local:
- _tlbiel_pid(pid, RIC_FLUSH_ALL);
}
preempt_enable();
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
}
void radix__flush_all_mm(struct mm_struct *mm)
@@ -743,25 +915,34 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
int psize)
{
unsigned long pid;
+ enum tlb_flush_type type;
pid = mm->context.id;
- if (unlikely(pid == MMU_NO_CONTEXT))
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
return;
preempt_disable();
smp_mb(); /* see radix__flush_tlb_mm */
- if (!mm_is_thread_local(mm)) {
- if (unlikely(mm_is_singlethreaded(mm))) {
- exit_flush_lazy_tlbs(mm);
- goto local;
- }
- if (cputlb_use_tlbie())
+ type = flush_type_needed(mm, false);
+ if (type == FLUSH_TYPE_LOCAL) {
+ _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
+ } else if (type == FLUSH_TYPE_GLOBAL) {
+ if (!mmu_has_feature(MMU_FTR_GTSE)) {
+ unsigned long tgt, pg_sizes, size;
+
+ tgt = H_RPTI_TARGET_CMMU;
+ pg_sizes = psize_to_rpti_pgsize(psize);
+ size = 1UL << mmu_psize_to_shift(psize);
+
+ if (atomic_read(&mm->context.copros) > 0)
+ tgt |= H_RPTI_TARGET_NMMU;
+ pseries_rpt_invalidate(pid, tgt, H_RPTI_TYPE_TLB,
+ pg_sizes, vmaddr,
+ vmaddr + size);
+ } else if (cputlb_use_tlbie())
_tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
else
_tlbiel_va_multicast(mm, vmaddr, pid, psize, RIC_FLUSH_TLB);
- } else {
-local:
- _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
}
preempt_enable();
}
@@ -776,8 +957,6 @@ void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
}
EXPORT_SYMBOL(radix__flush_tlb_page);
-#else /* CONFIG_SMP */
-#define radix__flush_all_mm radix__local_flush_all_mm
#endif /* CONFIG_SMP */
static void do_tlbiel_kernel(void *info)
@@ -805,13 +984,23 @@ static inline void _tlbiel_kernel_broadcast(void)
*/
void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
- if (cputlb_use_tlbie())
+ if (!mmu_has_feature(MMU_FTR_GTSE)) {
+ unsigned long tgt = H_RPTI_TARGET_CMMU | H_RPTI_TARGET_NMMU;
+ unsigned long type = H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC |
+ H_RPTI_TYPE_PRT;
+
+ pseries_rpt_invalidate(0, tgt, type, H_RPTI_PAGE_ALL,
+ start, end);
+ } else if (cputlb_use_tlbie())
_tlbie_pid(0, RIC_FLUSH_ALL);
else
_tlbiel_kernel_broadcast();
}
EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
#define TLB_FLUSH_ALL -1UL
/*
@@ -823,77 +1012,90 @@ EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
* invalidating a full PID, so it has a far lower threshold to change from
* individual page flushes to full-pid flushes.
*/
-static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
-static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2;
+static u32 tlb_single_page_flush_ceiling __read_mostly = 33;
+static u32 tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2;
static inline void __radix__flush_tlb_range(struct mm_struct *mm,
unsigned long start, unsigned long end)
-
{
unsigned long pid;
unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift;
unsigned long page_size = 1UL << page_shift;
unsigned long nr_pages = (end - start) >> page_shift;
- bool local, full;
+ bool flush_pid, flush_pwc = false;
+ enum tlb_flush_type type;
pid = mm->context.id;
- if (unlikely(pid == MMU_NO_CONTEXT))
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
return;
+ WARN_ON_ONCE(end == TLB_FLUSH_ALL);
+
preempt_disable();
smp_mb(); /* see radix__flush_tlb_mm */
- if (!mm_is_thread_local(mm)) {
- if (unlikely(mm_is_singlethreaded(mm))) {
- if (end != TLB_FLUSH_ALL) {
- exit_flush_lazy_tlbs(mm);
- goto is_local;
- }
- }
- local = false;
- full = (end == TLB_FLUSH_ALL ||
- nr_pages > tlb_single_page_flush_ceiling);
- } else {
-is_local:
- local = true;
- full = (end == TLB_FLUSH_ALL ||
- nr_pages > tlb_local_single_page_flush_ceiling);
- }
+ type = flush_type_needed(mm, false);
+ if (type == FLUSH_TYPE_NONE)
+ goto out;
- if (full) {
- if (local) {
- _tlbiel_pid(pid, RIC_FLUSH_TLB);
+ if (type == FLUSH_TYPE_GLOBAL)
+ flush_pid = nr_pages > tlb_single_page_flush_ceiling;
+ else
+ flush_pid = nr_pages > tlb_local_single_page_flush_ceiling;
+ /*
+ * full pid flush already does the PWC flush. if it is not full pid
+ * flush check the range is more than PMD and force a pwc flush
+ * mremap() depends on this behaviour.
+ */
+ if (!flush_pid && (end - start) >= PMD_SIZE)
+ flush_pwc = true;
+
+ if (!mmu_has_feature(MMU_FTR_GTSE) && type == FLUSH_TYPE_GLOBAL) {
+ unsigned long type = H_RPTI_TYPE_TLB;
+ unsigned long tgt = H_RPTI_TARGET_CMMU;
+ unsigned long pg_sizes = psize_to_rpti_pgsize(mmu_virtual_psize);
+
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+ pg_sizes |= psize_to_rpti_pgsize(MMU_PAGE_2M);
+ if (atomic_read(&mm->context.copros) > 0)
+ tgt |= H_RPTI_TARGET_NMMU;
+ if (flush_pwc)
+ type |= H_RPTI_TYPE_PWC;
+ pseries_rpt_invalidate(pid, tgt, type, pg_sizes, start, end);
+ } else if (flush_pid) {
+ /*
+		 * We are now flushing a range larger than PMD size, so force a RIC_FLUSH_ALL
+ */
+ if (type == FLUSH_TYPE_LOCAL) {
+ _tlbiel_pid(pid, RIC_FLUSH_ALL);
} else {
if (cputlb_use_tlbie()) {
- if (mm_needs_flush_escalation(mm))
- _tlbie_pid(pid, RIC_FLUSH_ALL);
- else
- _tlbie_pid(pid, RIC_FLUSH_TLB);
+ _tlbie_pid(pid, RIC_FLUSH_ALL);
} else {
- _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_TLB);
+ _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL);
}
}
} else {
- bool hflush = false;
+ bool hflush;
unsigned long hstart, hend;
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
- hstart = (start + PMD_SIZE - 1) & PMD_MASK;
- hend = end & PMD_MASK;
- if (hstart == hend)
- hflush = false;
- else
- hflush = true;
- }
+ hstart = (start + PMD_SIZE - 1) & PMD_MASK;
+ hend = end & PMD_MASK;
+ hflush = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hstart < hend;
- if (local) {
+ if (type == FLUSH_TYPE_LOCAL) {
asm volatile("ptesync": : :"memory");
+ if (flush_pwc)
+ /* For PWC, only one flush is needed */
+ __tlbiel_pid(pid, 0, RIC_FLUSH_PWC);
__tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize);
if (hflush)
__tlbiel_va_range(hstart, hend, pid,
PMD_SIZE, MMU_PAGE_2M);
- asm volatile("ptesync": : :"memory");
+ ppc_after_tlbiel_barrier();
} else if (cputlb_use_tlbie()) {
asm volatile("ptesync": : :"memory");
+ if (flush_pwc)
+ __tlbie_pid(pid, RIC_FLUSH_PWC);
__tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize);
if (hflush)
__tlbie_va_range(hstart, hend, pid,
@@ -901,13 +1103,15 @@ is_local:
asm volatile("eieio; tlbsync; ptesync": : :"memory");
} else {
_tlbiel_va_range_multicast(mm,
- start, end, pid, page_size, mmu_virtual_psize, false);
+ start, end, pid, page_size, mmu_virtual_psize, flush_pwc);
if (hflush)
_tlbiel_va_range_multicast(mm,
- hstart, hend, pid, PMD_SIZE, MMU_PAGE_2M, false);
+ hstart, hend, pid, PMD_SIZE, MMU_PAGE_2M, flush_pwc);
}
}
+out:
preempt_enable();
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
}
void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
@@ -977,9 +1181,6 @@ void radix__flush_all_lpid_guest(unsigned int lpid)
_tlbie_lpid_guest(lpid, RIC_FLUSH_ALL);
}
-static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
- unsigned long end, int psize);
-
void radix__tlb_flush(struct mmu_gather *tlb)
{
int psize = 0;
@@ -995,8 +1196,29 @@ void radix__tlb_flush(struct mmu_gather *tlb)
* that flushes the process table entry cache upon process teardown.
* See the comment for radix in arch_exit_mmap().
*/
- if (tlb->fullmm || tlb->need_flush_all) {
- __flush_all_mm(mm, true);
+ if (tlb->fullmm) {
+ if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
+ /*
+ * Shootdown based lazy tlb mm refcounting means we
+ * have to IPI everyone in the mm_cpumask anyway soon
+ * when the mm goes away, so might as well do it as
+ * part of the final flush now.
+ *
+ * If lazy shootdown was improved to reduce IPIs (e.g.,
+ * by batching), then it may end up being better to use
+ * tlbies here instead.
+ */
+ preempt_disable();
+
+ smp_mb(); /* see radix__flush_tlb_mm */
+ exit_flush_lazy_tlbs(mm);
+ __flush_all_mm(mm, true);
+
+ preempt_enable();
+ } else {
+ __flush_all_mm(mm, true);
+ }
+
} else if ( (psize = radix_get_mmu_psize(page_size)) == -1) {
if (!tlb->freed_tables)
radix__flush_tlb_mm(mm);
@@ -1010,7 +1232,7 @@ void radix__tlb_flush(struct mmu_gather *tlb)
}
}
-static __always_inline void __radix__flush_tlb_range_psize(struct mm_struct *mm,
+static void __radix__flush_tlb_range_psize(struct mm_struct *mm,
unsigned long start, unsigned long end,
int psize, bool also_pwc)
{
@@ -1018,33 +1240,38 @@ static __always_inline void __radix__flush_tlb_range_psize(struct mm_struct *mm,
unsigned int page_shift = mmu_psize_defs[psize].shift;
unsigned long page_size = 1UL << page_shift;
unsigned long nr_pages = (end - start) >> page_shift;
- bool local, full;
+ bool flush_pid;
+ enum tlb_flush_type type;
pid = mm->context.id;
- if (unlikely(pid == MMU_NO_CONTEXT))
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
return;
+ WARN_ON_ONCE(end == TLB_FLUSH_ALL);
+
preempt_disable();
smp_mb(); /* see radix__flush_tlb_mm */
- if (!mm_is_thread_local(mm)) {
- if (unlikely(mm_is_singlethreaded(mm))) {
- if (end != TLB_FLUSH_ALL) {
- exit_flush_lazy_tlbs(mm);
- goto is_local;
- }
- }
- local = false;
- full = (end == TLB_FLUSH_ALL ||
- nr_pages > tlb_single_page_flush_ceiling);
- } else {
-is_local:
- local = true;
- full = (end == TLB_FLUSH_ALL ||
- nr_pages > tlb_local_single_page_flush_ceiling);
- }
+ type = flush_type_needed(mm, false);
+ if (type == FLUSH_TYPE_NONE)
+ goto out;
- if (full) {
- if (local) {
+ if (type == FLUSH_TYPE_GLOBAL)
+ flush_pid = nr_pages > tlb_single_page_flush_ceiling;
+ else
+ flush_pid = nr_pages > tlb_local_single_page_flush_ceiling;
+
+ if (!mmu_has_feature(MMU_FTR_GTSE) && type == FLUSH_TYPE_GLOBAL) {
+ unsigned long tgt = H_RPTI_TARGET_CMMU;
+ unsigned long type = H_RPTI_TYPE_TLB;
+ unsigned long pg_sizes = psize_to_rpti_pgsize(psize);
+
+ if (also_pwc)
+ type |= H_RPTI_TYPE_PWC;
+ if (atomic_read(&mm->context.copros) > 0)
+ tgt |= H_RPTI_TARGET_NMMU;
+ pseries_rpt_invalidate(pid, tgt, type, pg_sizes, start, end);
+ } else if (flush_pid) {
+ if (type == FLUSH_TYPE_LOCAL) {
_tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
} else {
if (cputlb_use_tlbie()) {
@@ -1060,7 +1287,7 @@ is_local:
}
} else {
- if (local)
+ if (type == FLUSH_TYPE_LOCAL)
_tlbiel_va_range(start, end, pid, page_size, psize, also_pwc);
else if (cputlb_use_tlbie())
_tlbie_va_range(start, end, pid, page_size, psize, also_pwc);
@@ -1068,7 +1295,9 @@ is_local:
_tlbiel_va_range_multicast(mm,
start, end, pid, page_size, psize, also_pwc);
}
+out:
preempt_enable();
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
}
void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
@@ -1077,8 +1306,8 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
return __radix__flush_tlb_range_psize(mm, start, end, psize, false);
}
-static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
- unsigned long end, int psize)
+void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
+ unsigned long end, int psize)
{
__radix__flush_tlb_range_psize(mm, start, end, psize, true);
}
@@ -1087,9 +1316,10 @@ static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long
void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
{
unsigned long pid, end;
+ enum tlb_flush_type type;
pid = mm->context.id;
- if (unlikely(pid == MMU_NO_CONTEXT))
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
return;
/* 4k page size, just blow the world */
@@ -1103,19 +1333,27 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
/* Otherwise first do the PWC, then iterate the pages. */
preempt_disable();
smp_mb(); /* see radix__flush_tlb_mm */
- if (!mm_is_thread_local(mm)) {
- if (unlikely(mm_is_singlethreaded(mm))) {
- exit_flush_lazy_tlbs(mm);
- goto local;
- }
- if (cputlb_use_tlbie())
+ type = flush_type_needed(mm, false);
+ if (type == FLUSH_TYPE_LOCAL) {
+ _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
+ } else if (type == FLUSH_TYPE_GLOBAL) {
+ if (!mmu_has_feature(MMU_FTR_GTSE)) {
+ unsigned long tgt, type, pg_sizes;
+
+ tgt = H_RPTI_TARGET_CMMU;
+ type = H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC |
+ H_RPTI_TYPE_PRT;
+ pg_sizes = psize_to_rpti_pgsize(mmu_virtual_psize);
+
+ if (atomic_read(&mm->context.copros) > 0)
+ tgt |= H_RPTI_TARGET_NMMU;
+ pseries_rpt_invalidate(pid, tgt, type, pg_sizes,
+ addr, end);
+ } else if (cputlb_use_tlbie())
_tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
else
_tlbiel_va_range_multicast(mm,
addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
- } else {
-local:
- _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
}
preempt_enable();
@@ -1129,6 +1367,13 @@ void radix__flush_pmd_tlb_range(struct vm_area_struct *vma,
}
EXPORT_SYMBOL(radix__flush_pmd_tlb_range);
+void radix__flush_pud_tlb_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_1G);
+}
+EXPORT_SYMBOL(radix__flush_pud_tlb_range);
+
void radix__flush_tlb_all(void)
{
unsigned long rb,prs,r,rs;
@@ -1154,44 +1399,189 @@ void radix__flush_tlb_all(void)
}
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-extern void radix_kvm_prefetch_workaround(struct mm_struct *mm)
+static __always_inline void __tlbie_pid_lpid(unsigned long pid,
+ unsigned long lpid,
+ unsigned long ric)
{
- unsigned long pid = mm->context.id;
+ unsigned long rb, rs, prs, r;
- if (unlikely(pid == MMU_NO_CONTEXT))
- return;
+ rb = PPC_BIT(53); /* IS = 1 */
+ rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31)));
+ prs = 1; /* process scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(0, 0, rb, rs, ric, prs, r);
+}
+
+static __always_inline void __tlbie_va_lpid(unsigned long va, unsigned long pid,
+ unsigned long lpid,
+ unsigned long ap, unsigned long ric)
+{
+ unsigned long rb, rs, prs, r;
+ rb = va & ~(PPC_BITMASK(52, 63));
+ rb |= ap << PPC_BITLSHIFT(58);
+ rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31)));
+ prs = 1; /* process scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(0, 0, rb, rs, ric, prs, r);
+}
+
+static inline void fixup_tlbie_pid_lpid(unsigned long pid, unsigned long lpid)
+{
/*
- * If this context hasn't run on that CPU before and KVM is
- * around, there's a slim chance that the guest on another
- * CPU just brought in obsolete translation into the TLB of
- * this CPU due to a bad prefetch using the guest PID on
- * the way into the hypervisor.
- *
- * We work around this here. If KVM is possible, we check if
- * any sibling thread is in KVM. If it is, the window may exist
- * and thus we flush that PID from the core.
- *
- * A potential future improvement would be to mark which PIDs
- * have never been used on the system and avoid it if the PID
- * is new and the process has no other cpumask bit set.
+ * We can use any address for the invalidation, pick one which is
+ * probably unused as an optimisation.
*/
- if (cpu_has_feature(CPU_FTR_HVMODE) && radix_enabled()) {
- int cpu = smp_processor_id();
- int sib = cpu_first_thread_sibling(cpu);
- bool flush = false;
-
- for (; sib <= cpu_last_thread_sibling(cpu) && !flush; sib++) {
- if (sib == cpu)
- continue;
- if (!cpu_possible(sib))
- continue;
- if (paca_ptrs[sib]->kvm_hstate.kvm_vcpu)
- flush = true;
+ unsigned long va = ((1UL << 52) - 1);
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+ asm volatile("ptesync" : : : "memory");
+ __tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB);
+ }
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+ asm volatile("ptesync" : : : "memory");
+ __tlbie_va_lpid(va, pid, lpid, mmu_get_ap(MMU_PAGE_64K),
+ RIC_FLUSH_TLB);
+ }
+}
+
+static inline void _tlbie_pid_lpid(unsigned long pid, unsigned long lpid,
+ unsigned long ric)
+{
+ asm volatile("ptesync" : : : "memory");
+
+ /*
+	 * Work around the fact that the "ric" argument to __tlbie_pid_lpid()
+	 * must be a compile-time constant to match the "i" constraint
+	 * in the asm statement.
+ */
+ switch (ric) {
+ case RIC_FLUSH_TLB:
+ __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB);
+ fixup_tlbie_pid_lpid(pid, lpid);
+ break;
+ case RIC_FLUSH_PWC:
+ __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
+ break;
+ case RIC_FLUSH_ALL:
+ default:
+ __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_ALL);
+ fixup_tlbie_pid_lpid(pid, lpid);
+ }
+ asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+}
+
+static inline void fixup_tlbie_va_range_lpid(unsigned long va,
+ unsigned long pid,
+ unsigned long lpid,
+ unsigned long ap)
+{
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+ asm volatile("ptesync" : : : "memory");
+ __tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB);
+ }
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+ asm volatile("ptesync" : : : "memory");
+ __tlbie_va_lpid(va, pid, lpid, ap, RIC_FLUSH_TLB);
+ }
+}
+
+static inline void __tlbie_va_range_lpid(unsigned long start, unsigned long end,
+ unsigned long pid, unsigned long lpid,
+ unsigned long page_size,
+ unsigned long psize)
+{
+ unsigned long addr;
+ unsigned long ap = mmu_get_ap(psize);
+
+ for (addr = start; addr < end; addr += page_size)
+ __tlbie_va_lpid(addr, pid, lpid, ap, RIC_FLUSH_TLB);
+
+ fixup_tlbie_va_range_lpid(addr - page_size, pid, lpid, ap);
+}
+
+static inline void _tlbie_va_range_lpid(unsigned long start, unsigned long end,
+ unsigned long pid, unsigned long lpid,
+ unsigned long page_size,
+ unsigned long psize, bool also_pwc)
+{
+ asm volatile("ptesync" : : : "memory");
+ if (also_pwc)
+ __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
+ __tlbie_va_range_lpid(start, end, pid, lpid, page_size, psize);
+ asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+}
+
+/*
+ * Performs process-scoped invalidations for a given LPID
+ * as part of the H_RPT_INVALIDATE hcall.
+ */
+void do_h_rpt_invalidate_prt(unsigned long pid, unsigned long lpid,
+ unsigned long type, unsigned long pg_sizes,
+ unsigned long start, unsigned long end)
+{
+ unsigned long psize, nr_pages;
+ struct mmu_psize_def *def;
+ bool flush_pid;
+
+ /*
+	 * An H_RPTI_TYPE_ALL request implies RIC=3, hence
+ * do a single IS=1 based flush.
+ */
+ if ((type & H_RPTI_TYPE_ALL) == H_RPTI_TYPE_ALL) {
+ _tlbie_pid_lpid(pid, lpid, RIC_FLUSH_ALL);
+ return;
+ }
+
+ if (type & H_RPTI_TYPE_PWC)
+ _tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
+
+ /* Full PID flush */
+ if (start == 0 && end == -1)
+ return _tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB);
+
+ /* Do range invalidation for all the valid page sizes */
+ for (psize = 0; psize < MMU_PAGE_COUNT; psize++) {
+ def = &mmu_psize_defs[psize];
+ if (!(pg_sizes & def->h_rpt_pgsize))
+ continue;
+
+ nr_pages = (end - start) >> def->shift;
+ flush_pid = nr_pages > tlb_single_page_flush_ceiling;
+
+ /*
+ * If the number of pages spanning the range is above
+ * the ceiling, convert the request into a full PID flush.
+		 * Since a PID flush takes out all the page sizes, there
+		 * is no need to consider the remaining page sizes.
+ */
+ if (flush_pid) {
+ _tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB);
+ return;
}
- if (flush)
- _tlbiel_pid(pid, RIC_FLUSH_ALL);
+ _tlbie_va_range_lpid(start, end, pid, lpid,
+ (1UL << def->shift), psize, false);
}
}
-EXPORT_SYMBOL_GPL(radix_kvm_prefetch_workaround);
+EXPORT_SYMBOL_GPL(do_h_rpt_invalidate_prt);
+
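
The decomposition performed by do_h_rpt_invalidate_prt() can be summarised with a small userspace model: an ALL request becomes one RIC=3 flush, PWC is handled up front, a full-range request becomes a PID flush, and per-page-size range flushes escalate to a PID flush once they exceed the ceiling. This is illustrative only; the page-size table, bit values and ceiling below are assumptions, not the kernel definitions.

#include <stdio.h>

#define RPTI_TYPE_TLB	0x1
#define RPTI_TYPE_PWC	0x2
#define RPTI_TYPE_PRT	0x4
#define RPTI_TYPE_ALL	(RPTI_TYPE_TLB | RPTI_TYPE_PWC | RPTI_TYPE_PRT)

struct pgsize { const char *name; unsigned int shift; unsigned long rpti_bit; };

static const struct pgsize sizes[] = {
	{ "4K", 12, 0x1 }, { "64K", 16, 0x2 }, { "2M", 21, 0x4 }, { "1G", 30, 0x8 },
};

static const unsigned long ceiling = 33;	/* assumed default */

static void model_rpt_invalidate(unsigned long type, unsigned long pg_sizes,
				 unsigned long start, unsigned long end)
{
	unsigned int i;

	if ((type & RPTI_TYPE_ALL) == RPTI_TYPE_ALL) {
		printf("one RIC=3 (FLUSH_ALL) PID flush\n");
		return;
	}
	if (type & RPTI_TYPE_PWC)
		printf("PWC flush\n");
	if (start == 0 && end == ~0UL) {
		printf("full PID TLB flush\n");
		return;
	}
	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
		unsigned long nr;

		if (!(pg_sizes & sizes[i].rpti_bit))
			continue;
		nr = (end - start) >> sizes[i].shift;
		if (nr > ceiling) {
			/* A PID flush covers every page size, so stop here. */
			printf("PID TLB flush (range too big at %s)\n", sizes[i].name);
			return;
		}
		printf("%lu %s page invalidations\n", nr, sizes[i].name);
	}
}

int main(void)
{
	model_rpt_invalidate(RPTI_TYPE_TLB, 0x1 | 0x2, 0x0, 0x200000);
	model_rpt_invalidate(RPTI_TYPE_TLB | RPTI_TYPE_PWC, 0x2, 0, ~0UL);
	return 0;
}
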
#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
+
+static int __init create_tlb_single_page_flush_ceiling(void)
+{
+ debugfs_create_u32("tlb_single_page_flush_ceiling", 0600,
+ arch_debugfs_dir, &tlb_single_page_flush_ceiling);
+ debugfs_create_u32("tlb_local_single_page_flush_ceiling", 0600,
+ arch_debugfs_dir, &tlb_local_single_page_flush_ceiling);
+ return 0;
+}
+late_initcall(create_tlb_single_page_flush_ceiling);
+
diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c
index 716204aee3da..f2708c8629a5 100644
--- a/arch/powerpc/mm/book3s64/slb.c
+++ b/arch/powerpc/mm/book3s64/slb.c
@@ -9,11 +9,11 @@
* Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
*/
-#include <asm/asm-prototypes.h>
-#include <asm/pgtable.h>
+#include <asm/interrupt.h>
#include <asm/mmu.h>
#include <asm/mmu_context.h>
#include <asm/paca.h>
+#include <asm/lppaca.h>
#include <asm/ppc-opcode.h>
#include <asm/cputable.h>
#include <asm/cacheflush.h>
@@ -21,38 +21,26 @@
#include <linux/compiler.h>
#include <linux/context_tracking.h>
#include <linux/mm_types.h>
+#include <linux/pgtable.h>
#include <asm/udbg.h>
#include <asm/code-patching.h>
-enum slb_index {
- LINEAR_INDEX = 0, /* Kernel linear map (0xc000000000000000) */
- KSTACK_INDEX = 1, /* Kernel stack map */
-};
+#include "internal.h"
-static long slb_allocate_user(struct mm_struct *mm, unsigned long ea);
-#define slb_esid_mask(ssize) \
- (((ssize) == MMU_SEGSIZE_256M)? ESID_MASK: ESID_MASK_1T)
+static long slb_allocate_user(struct mm_struct *mm, unsigned long ea);
-static inline unsigned long mk_esid_data(unsigned long ea, int ssize,
- enum slb_index index)
-{
- return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | index;
-}
+bool stress_slb_enabled __initdata;
-static inline unsigned long __mk_vsid_data(unsigned long vsid, int ssize,
- unsigned long flags)
+static int __init parse_stress_slb(char *p)
{
- return (vsid << slb_vsid_shift(ssize)) | flags |
- ((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT);
+ stress_slb_enabled = true;
+ return 0;
}
+early_param("stress_slb", parse_stress_slb);
-static inline unsigned long mk_vsid_data(unsigned long ea, int ssize,
- unsigned long flags)
-{
- return __mk_vsid_data(get_kernel_vsid(ea, ssize), ssize, flags);
-}
+__ro_after_init DEFINE_STATIC_KEY_FALSE(stress_slb_key);
static void assert_slb_presence(bool present, unsigned long ea)
{
@@ -68,7 +56,7 @@ static void assert_slb_presence(bool present, unsigned long ea)
* slbfee. requires bit 24 (PPC bit 39) be clear in RB. Hardware
* ignores all other bits from 0-27, so just clear them all.
*/
- ea &= ~((1UL << 28) - 1);
+ ea &= ~((1UL << SID_SHIFT) - 1);
asm volatile(__PPC_SLBFEE_DOT(%0, %1) : "=r"(tmp) : "r"(ea) : "cr0");
WARN_ON(present == (tmp == 0));
@@ -153,14 +141,42 @@ void slb_flush_all_realmode(void)
asm volatile("slbmte %0,%0; slbia" : : "r" (0));
}
+static __always_inline void __slb_flush_and_restore_bolted(bool preserve_kernel_lookaside)
+{
+ struct slb_shadow *p = get_slb_shadow();
+ unsigned long ksp_esid_data, ksp_vsid_data;
+ u32 ih;
+
+ /*
+ * SLBIA IH=1 on ISA v2.05 and newer processors may preserve lookaside
+ * information created with Class=0 entries, which we use for kernel
+ * SLB entries (the SLB entries themselves are still invalidated).
+ *
+ * Older processors will ignore this optimisation. Over-invalidation
+ * is fine because we never rely on lookaside information existing.
+ */
+ if (preserve_kernel_lookaside)
+ ih = 1;
+ else
+ ih = 0;
+
+ ksp_esid_data = be64_to_cpu(p->save_area[KSTACK_INDEX].esid);
+ ksp_vsid_data = be64_to_cpu(p->save_area[KSTACK_INDEX].vsid);
+
+ asm volatile(PPC_SLBIA(%0)" \n"
+ "slbmte %1, %2 \n"
+ :: "i" (ih),
+ "r" (ksp_vsid_data),
+ "r" (ksp_esid_data)
+ : "memory");
+}
+
/*
* This flushes non-bolted entries, it can be run in virtual mode. Must
* be called with interrupts disabled.
*/
void slb_flush_and_restore_bolted(void)
{
- struct slb_shadow *p = get_slb_shadow();
-
BUILD_BUG_ON(SLB_NUM_BOLTED != 2);
WARN_ON(!irqs_disabled());
@@ -171,13 +187,10 @@ void slb_flush_and_restore_bolted(void)
*/
hard_irq_disable();
- asm volatile("isync\n"
- "slbia\n"
- "slbmte %0, %1\n"
- "isync\n"
- :: "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].vsid)),
- "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].esid))
- : "memory");
+ isync();
+ __slb_flush_and_restore_bolted(false);
+ isync();
+
assert_slb_presence(true, get_paca()->kstack);
get_paca()->slb_cache_ptr = 0;
@@ -216,7 +229,6 @@ void slb_dump_contents(struct slb_entry *slb_ptr)
return;
pr_err("SLB contents of cpu 0x%x\n", smp_processor_id());
- pr_err("Last SLB entry inserted at slot %d\n", get_paca()->stab_rr);
for (i = 0; i < mmu_slb_size; i++) {
e = slb_ptr->esid;
@@ -226,34 +238,38 @@ void slb_dump_contents(struct slb_entry *slb_ptr)
if (!e && !v)
continue;
- pr_err("%02d %016lx %016lx\n", i, e, v);
+ pr_err("%02d %016lx %016lx %s\n", i, e, v,
+ (e & SLB_ESID_V) ? "VALID" : "NOT VALID");
- if (!(e & SLB_ESID_V)) {
- pr_err("\n");
+ if (!(e & SLB_ESID_V))
continue;
- }
+
llp = v & SLB_VSID_LLP;
if (v & SLB_VSID_B_1T) {
- pr_err(" 1T ESID=%9lx VSID=%13lx LLP:%3lx\n",
+ pr_err(" 1T ESID=%9lx VSID=%13lx LLP:%3lx\n",
GET_ESID_1T(e),
(v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T, llp);
} else {
- pr_err(" 256M ESID=%9lx VSID=%13lx LLP:%3lx\n",
+ pr_err(" 256M ESID=%9lx VSID=%13lx LLP:%3lx\n",
GET_ESID(e),
(v & ~SLB_VSID_B) >> SLB_VSID_SHIFT, llp);
}
}
- pr_err("----------------------------------\n");
-
- /* Dump slb cache entires as well. */
- pr_err("SLB cache ptr value = %d\n", get_paca()->slb_save_cache_ptr);
- pr_err("Valid SLB cache entries:\n");
- n = min_t(int, get_paca()->slb_save_cache_ptr, SLB_CACHE_ENTRIES);
- for (i = 0; i < n; i++)
- pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]);
- pr_err("Rest of SLB cache entries:\n");
- for (i = n; i < SLB_CACHE_ENTRIES; i++)
- pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]);
+
+ if (!early_cpu_has_feature(CPU_FTR_ARCH_300)) {
+ /* RR is not so useful as it's often not used for allocation */
+ pr_err("SLB RR allocator index %d\n", get_paca()->stab_rr);
+
+		/* Dump slb cache entries as well. */
+ pr_err("SLB cache ptr value = %d\n", get_paca()->slb_save_cache_ptr);
+ pr_err("Valid SLB cache entries:\n");
+ n = min_t(int, get_paca()->slb_save_cache_ptr, SLB_CACHE_ENTRIES);
+ for (i = 0; i < n; i++)
+ pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]);
+ pr_err("Rest of SLB cache entries:\n");
+ for (i = n; i < SLB_CACHE_ENTRIES; i++)
+ pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]);
+ }
}
void slb_vmalloc_update(void)
@@ -332,7 +348,7 @@ void slb_setup_new_exec(void)
/*
* We have no good place to clear the slb preload cache on exec,
* flush_thread is about the earliest arch hook but that happens
- * after we switch to the mm and have aleady preloaded the SLBEs.
+ * after we switch to the mm and have already preloaded the SLBEs.
*
* For the most part that's probably okay to use entries from the
* previous exec, they will age out if unused. It may turn out to
@@ -400,6 +416,30 @@ void preload_new_slb_context(unsigned long start, unsigned long sp)
local_irq_enable();
}
+static void slb_cache_slbie_kernel(unsigned int index)
+{
+ unsigned long slbie_data = get_paca()->slb_cache[index];
+ unsigned long ksp = get_paca()->kstack;
+
+ slbie_data <<= SID_SHIFT;
+ slbie_data |= 0xc000000000000000ULL;
+ if ((ksp & slb_esid_mask(mmu_kernel_ssize)) == slbie_data)
+ return;
+ slbie_data |= mmu_kernel_ssize << SLBIE_SSIZE_SHIFT;
+
+ asm volatile("slbie %0" : : "r" (slbie_data));
+}
+
+static void slb_cache_slbie_user(unsigned int index)
+{
+ unsigned long slbie_data = get_paca()->slb_cache[index];
+
+ slbie_data <<= SID_SHIFT;
+ slbie_data |= user_segment_size(slbie_data) << SLBIE_SSIZE_SHIFT;
+ slbie_data |= SLBIE_C; /* user slbs have C=1 */
+
+ asm volatile("slbie %0" : : "r" (slbie_data));
+}
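
Note that slb_cache[] stores only the upper 36 bits of the ESID data (the EA shifted down by SID_SHIFT), so both helpers above shift back up to rebuild the segment base before adding the segment-size and class bits, and the kernel variant skips the segment holding the current kernel stack because that entry is bolted. Below is a standalone illustration of that check, assuming 256MB kernel segments (the real code uses slb_esid_mask(mmu_kernel_ssize), which differs for 1T segments); it is a model, not kernel code.

#include <stdbool.h>
#include <stdio.h>

#define SID_SHIFT	28			/* 256MB segments */
#define ESID_MASK	0xfffffffff0000000UL	/* 256MB segment mask */

/* Model of the "don't slbie the bolted stack segment" check above. */
static bool skip_kernel_entry(unsigned long cached, unsigned long kstack)
{
	unsigned long slbie_data = cached << SID_SHIFT;	/* rebuild segment base */

	slbie_data |= 0xc000000000000000UL;
	return (kstack & ESID_MASK) == slbie_data;
}

int main(void)
{
	unsigned long kstack = 0xc00000001f230000UL;	/* made-up stack address */
	unsigned long cache[] = {
		0xc00000001f230000UL >> SID_SHIFT,	/* same segment as the stack */
		0xc000000074000000UL >> SID_SHIFT,	/* unrelated kernel segment */
	};
	unsigned int i;

	for (i = 0; i < 2; i++)
		printf("entry %u: %s\n", i,
		       skip_kernel_entry(cache[i], kstack) ? "skip (kstack)" : "slbie");
	return 0;
}
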
/* Flush all user entries from the segment table of the current processor. */
void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
@@ -414,8 +454,14 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
* which would update the slb_cache/slb_cache_ptr fields in the PACA.
*/
hard_irq_disable();
- asm volatile("isync" : : : "memory");
- if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+ isync();
+ if (stress_slb()) {
+ __slb_flush_and_restore_bolted(false);
+ isync();
+ get_paca()->slb_cache_ptr = 0;
+ get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
+
+ } else if (cpu_has_feature(CPU_FTR_ARCH_300)) {
/*
* SLBIA IH=3 invalidates all Class=1 SLBEs and their
* associated lookaside structures, which matches what
@@ -423,47 +469,29 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
* cache.
*/
asm volatile(PPC_SLBIA(3));
+
} else {
unsigned long offset = get_paca()->slb_cache_ptr;
if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) &&
offset <= SLB_CACHE_ENTRIES) {
- unsigned long slbie_data = 0;
-
- for (i = 0; i < offset; i++) {
- unsigned long ea;
-
- ea = (unsigned long)
- get_paca()->slb_cache[i] << SID_SHIFT;
- /*
- * Could assert_slb_presence(true) here, but
- * hypervisor or machine check could have come
- * in and removed the entry at this point.
- */
-
- slbie_data = ea;
- slbie_data |= user_segment_size(slbie_data)
- << SLBIE_SSIZE_SHIFT;
- slbie_data |= SLBIE_C; /* user slbs have C=1 */
- asm volatile("slbie %0" : : "r" (slbie_data));
- }
+ /*
+ * Could assert_slb_presence(true) here, but
+ * hypervisor or machine check could have come
+ * in and removed the entry at this point.
+ */
+
+ for (i = 0; i < offset; i++)
+ slb_cache_slbie_user(i);
/* Workaround POWER5 < DD2.1 issue */
if (!cpu_has_feature(CPU_FTR_ARCH_207S) && offset == 1)
- asm volatile("slbie %0" : : "r" (slbie_data));
+ slb_cache_slbie_user(0);
} else {
- struct slb_shadow *p = get_slb_shadow();
- unsigned long ksp_esid_data =
- be64_to_cpu(p->save_area[KSTACK_INDEX].esid);
- unsigned long ksp_vsid_data =
- be64_to_cpu(p->save_area[KSTACK_INDEX].vsid);
-
- asm volatile(PPC_SLBIA(1) "\n"
- "slbmte %0,%1\n"
- "isync"
- :: "r"(ksp_vsid_data),
- "r"(ksp_esid_data));
+ /* Flush but retain kernel lookaside information */
+ __slb_flush_and_restore_bolted(true);
+ isync();
get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
}
@@ -503,7 +531,7 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
* address accesses by the kernel (user mode won't happen until
* rfid, which is safe).
*/
- asm volatile("isync" : : : "memory");
+ isync();
}
void slb_set_size(u16 size)
@@ -571,6 +599,9 @@ static void slb_cache_update(unsigned long esid_data)
if (cpu_has_feature(CPU_FTR_ARCH_300))
return; /* ISAv3.0B and later does not use slb_cache */
+ if (stress_slb())
+ return;
+
/*
* Now update slb cache entries
*/
@@ -580,12 +611,12 @@ static void slb_cache_update(unsigned long esid_data)
* We have space in slb cache for optimized switch_slb().
* Top 36 bits from esid_data as per ISA
*/
- local_paca->slb_cache[slb_cache_index++] = esid_data >> 28;
+ local_paca->slb_cache[slb_cache_index++] = esid_data >> SID_SHIFT;
local_paca->slb_cache_ptr++;
} else {
/*
* Our cache is full and the current cache content strictly
- * doesn't indicate the active SLB conents. Bump the ptr
+ * doesn't indicate the active SLB contents. Bump the ptr
* so that switch_slb() will ignore the cache.
*/
local_paca->slb_cache_ptr = SLB_CACHE_ENTRIES + 1;
@@ -671,6 +702,28 @@ static long slb_insert_entry(unsigned long ea, unsigned long context,
* accesses user memory before it returns to userspace with rfid.
*/
assert_slb_presence(false, ea);
+ if (stress_slb()) {
+ int slb_cache_index = local_paca->slb_cache_ptr;
+
+ /*
+		 * stress_slb() does not use the slb cache, so repurpose it as
+		 * a cache of inserted (non-bolted) kernel SLB entries. All
+		 * non-bolted kernel entries are flushed on any user fault,
+		 * or if there are already 3 non-bolted kernel entries.
+ */
+ BUILD_BUG_ON(SLB_CACHE_ENTRIES < 3);
+ if (!kernel || slb_cache_index == 3) {
+ int i;
+
+ for (i = 0; i < slb_cache_index; i++)
+ slb_cache_slbie_kernel(i);
+ slb_cache_index = 0;
+ }
+
+ if (kernel)
+ local_paca->slb_cache[slb_cache_index++] = esid_data >> SID_SHIFT;
+ local_paca->slb_cache_ptr = slb_cache_index;
+ }
asm volatile("slbmte %0, %1" : : "r" (vsid_data), "r" (esid_data));
barrier();
@@ -689,8 +742,8 @@ static long slb_allocate_kernel(unsigned long ea, unsigned long id)
if (id == LINEAR_MAP_REGION_ID) {
- /* We only support upto MAX_PHYSMEM_BITS */
- if ((ea & EA_MASK) > (1UL << MAX_PHYSMEM_BITS))
+		/* We only support up to H_MAX_PHYSMEM_BITS */
+ if ((ea & EA_MASK) > (1UL << H_MAX_PHYSMEM_BITS))
return -EFAULT;
flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_linear_psize].sllp;
@@ -761,30 +814,33 @@ static long slb_allocate_user(struct mm_struct *mm, unsigned long ea)
return slb_insert_entry(ea, context, flags, ssize, false);
}
-long do_slb_fault(struct pt_regs *regs, unsigned long ea)
+DEFINE_INTERRUPT_HANDLER_RAW(do_slb_fault)
{
+ unsigned long ea = regs->dar;
unsigned long id = get_region_id(ea);
/* IRQs are not reconciled here, so can't check irqs_disabled */
VM_WARN_ON(mfmsr() & MSR_EE);
- if (unlikely(!(regs->msr & MSR_RI)))
+ if (regs_is_unrecoverable(regs))
return -EINVAL;
/*
- * SLB kernel faults must be very careful not to touch anything
- * that is not bolted. E.g., PACA and global variables are okay,
- * mm->context stuff is not.
- *
- * SLB user faults can access all of kernel memory, but must be
- * careful not to touch things like IRQ state because it is not
- * "reconciled" here. The difficulty is that we must use
- * fast_exception_return to return from kernel SLB faults without
- * looking at possible non-bolted memory. We could test user vs
- * kernel faults in the interrupt handler asm and do a full fault,
- * reconcile, ret_from_except for user faults which would make them
- * first class kernel code. But for performance it's probably nicer
- * if they go via fast_exception_return too.
+ * SLB kernel faults must be very careful not to touch anything that is
+ * not bolted. E.g., PACA and global variables are okay, mm->context
+ * stuff is not. SLB user faults may access all of memory (and induce
+ * one recursive SLB kernel fault), so the kernel fault must not
+ * trample on the user fault state at those points.
+ */
+
+ /*
+ * This is a raw interrupt handler, for performance, so that
+ * fast_interrupt_return can be used. The handler must not touch local
+ * irq state, or schedule. We could test for usermode and upgrade to a
+ * normal process context (synchronous) interrupt for those, which
+ * would make them first-class kernel code and able to be traced and
+	 * instrumented; although performance would suffer a bit, it would
+	 * probably be a good tradeoff.
*/
if (id >= LINEAR_MAP_REGION_ID) {
long err;
@@ -812,17 +868,3 @@ long do_slb_fault(struct pt_regs *regs, unsigned long ea)
return err;
}
}
-
-void do_bad_slb_fault(struct pt_regs *regs, unsigned long ea, long err)
-{
- if (err == -EFAULT) {
- if (user_mode(regs))
- _exception(SIGSEGV, regs, SEGV_BNDERR, ea);
- else
- bad_page_fault(regs, ea, SIGSEGV);
- } else if (err == -EINVAL) {
- unrecoverable_exception(regs);
- } else {
- BUG();
- }
-}
diff --git a/arch/powerpc/mm/book3s64/slice.c b/arch/powerpc/mm/book3s64/slice.c
new file mode 100644
index 000000000000..c0b58afb9a47
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/slice.c
@@ -0,0 +1,807 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * address space "slices" (meta-segments) support
+ *
+ * Copyright (C) 2007 Benjamin Herrenschmidt, IBM Corporation.
+ *
+ * Based on hugetlb implementation
+ *
+ * Copyright (C) 2003 David Gibson, IBM Corporation.
+ */
+
+#undef DEBUG
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/err.h>
+#include <linux/spinlock.h>
+#include <linux/export.h>
+#include <linux/hugetlb.h>
+#include <linux/sched/mm.h>
+#include <linux/security.h>
+#include <asm/mman.h>
+#include <asm/mmu.h>
+#include <asm/copro.h>
+#include <asm/hugetlb.h>
+#include <asm/mmu_context.h>
+
+static DEFINE_SPINLOCK(slice_convert_lock);
+
+#ifdef DEBUG
+int _slice_debug = 1;
+
+static void slice_print_mask(const char *label, const struct slice_mask *mask)
+{
+ if (!_slice_debug)
+ return;
+ pr_devel("%s low_slice: %*pbl\n", label,
+ (int)SLICE_NUM_LOW, &mask->low_slices);
+ pr_devel("%s high_slice: %*pbl\n", label,
+ (int)SLICE_NUM_HIGH, mask->high_slices);
+}
+
+#define slice_dbg(fmt...) do { if (_slice_debug) pr_devel(fmt); } while (0)
+
+#else
+
+static void slice_print_mask(const char *label, const struct slice_mask *mask) {}
+#define slice_dbg(fmt...)
+
+#endif
+
+static inline notrace bool slice_addr_is_low(unsigned long addr)
+{
+ u64 tmp = (u64)addr;
+
+ return tmp < SLICE_LOW_TOP;
+}
+
+static void slice_range_to_mask(unsigned long start, unsigned long len,
+ struct slice_mask *ret)
+{
+ unsigned long end = start + len - 1;
+
+ ret->low_slices = 0;
+ if (SLICE_NUM_HIGH)
+ bitmap_zero(ret->high_slices, SLICE_NUM_HIGH);
+
+ if (slice_addr_is_low(start)) {
+ unsigned long mend = min(end,
+ (unsigned long)(SLICE_LOW_TOP - 1));
+
+ ret->low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1))
+ - (1u << GET_LOW_SLICE_INDEX(start));
+ }
+
+ if (SLICE_NUM_HIGH && !slice_addr_is_low(end)) {
+ unsigned long start_index = GET_HIGH_SLICE_INDEX(start);
+ unsigned long align_end = ALIGN(end, (1UL << SLICE_HIGH_SHIFT));
+ unsigned long count = GET_HIGH_SLICE_INDEX(align_end) - start_index;
+
+ bitmap_set(ret->high_slices, start_index, count);
+ }
+}
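
The low_slices expression above builds a contiguous run of bits with one subtraction: (1 << (last + 1)) - (1 << first) sets exactly bits first..last. A standalone worked example, assuming the usual 256MB low slices (SLICE_LOW_SHIFT = 28):

#include <stdio.h>

#define SLICE_LOW_SHIFT			28	/* 256MB low slices */
#define GET_LOW_SLICE_INDEX(addr)	((addr) >> SLICE_LOW_SHIFT)

int main(void)
{
	unsigned long start = 0x30000000;	/* falls in slice 3 */
	unsigned long len   = 0x50000000;
	unsigned long end   = start + len - 1;	/* falls in slice 7 */

	/* Sets every bit from slice(start) up to and including slice(end). */
	unsigned int low_slices = (1u << (GET_LOW_SLICE_INDEX(end) + 1))
				- (1u << GET_LOW_SLICE_INDEX(start));

	printf("low_slices = 0x%04x\n", low_slices);	/* 0x00f8 -> slices 3..7 */
	return 0;
}
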
+
+static int slice_area_is_free(struct mm_struct *mm, unsigned long addr,
+ unsigned long len)
+{
+ struct vm_area_struct *vma;
+
+ if ((mm_ctx_slb_addr_limit(&mm->context) - len) < addr)
+ return 0;
+ vma = find_vma(mm, addr);
+ return (!vma || (addr + len) <= vm_start_gap(vma));
+}
+
+static int slice_low_has_vma(struct mm_struct *mm, unsigned long slice)
+{
+ return !slice_area_is_free(mm, slice << SLICE_LOW_SHIFT,
+ 1ul << SLICE_LOW_SHIFT);
+}
+
+static int slice_high_has_vma(struct mm_struct *mm, unsigned long slice)
+{
+ unsigned long start = slice << SLICE_HIGH_SHIFT;
+ unsigned long end = start + (1ul << SLICE_HIGH_SHIFT);
+
+	/* Hack, so that each address is controlled by exactly one
+	 * of the high or low area bitmaps; the first high area starts
+	 * at 4GB, not 0 */
+ if (start == 0)
+ start = (unsigned long)SLICE_LOW_TOP;
+
+ return !slice_area_is_free(mm, start, end - start);
+}
+
+static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret,
+ unsigned long high_limit)
+{
+ unsigned long i;
+
+ ret->low_slices = 0;
+ if (SLICE_NUM_HIGH)
+ bitmap_zero(ret->high_slices, SLICE_NUM_HIGH);
+
+ for (i = 0; i < SLICE_NUM_LOW; i++)
+ if (!slice_low_has_vma(mm, i))
+ ret->low_slices |= 1u << i;
+
+ if (slice_addr_is_low(high_limit - 1))
+ return;
+
+ for (i = 0; i < GET_HIGH_SLICE_INDEX(high_limit); i++)
+ if (!slice_high_has_vma(mm, i))
+ __set_bit(i, ret->high_slices);
+}
+
+static bool slice_check_range_fits(struct mm_struct *mm,
+ const struct slice_mask *available,
+ unsigned long start, unsigned long len)
+{
+ unsigned long end = start + len - 1;
+ u64 low_slices = 0;
+
+ if (slice_addr_is_low(start)) {
+ unsigned long mend = min(end,
+ (unsigned long)(SLICE_LOW_TOP - 1));
+
+ low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1))
+ - (1u << GET_LOW_SLICE_INDEX(start));
+ }
+ if ((low_slices & available->low_slices) != low_slices)
+ return false;
+
+ if (SLICE_NUM_HIGH && !slice_addr_is_low(end)) {
+ unsigned long start_index = GET_HIGH_SLICE_INDEX(start);
+ unsigned long align_end = ALIGN(end, (1UL << SLICE_HIGH_SHIFT));
+ unsigned long count = GET_HIGH_SLICE_INDEX(align_end) - start_index;
+ unsigned long i;
+
+ for (i = start_index; i < start_index + count; i++) {
+ if (!test_bit(i, available->high_slices))
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static void slice_flush_segments(void *parm)
+{
+#ifdef CONFIG_PPC64
+ struct mm_struct *mm = parm;
+ unsigned long flags;
+
+ if (mm != current->active_mm)
+ return;
+
+ copy_mm_to_paca(current->active_mm);
+
+ local_irq_save(flags);
+ slb_flush_and_restore_bolted();
+ local_irq_restore(flags);
+#endif
+}
+
+static void slice_convert(struct mm_struct *mm,
+ const struct slice_mask *mask, int psize)
+{
+ int index, mask_index;
+ /* Write the new slice psize bits */
+ unsigned char *hpsizes, *lpsizes;
+ struct slice_mask *psize_mask, *old_mask;
+ unsigned long i, flags;
+ int old_psize;
+
+ slice_dbg("slice_convert(mm=%p, psize=%d)\n", mm, psize);
+ slice_print_mask(" mask", mask);
+
+ psize_mask = slice_mask_for_size(&mm->context, psize);
+
+ /* We need to use a spinlock here to protect against
+ * concurrent 64k -> 4k demotion ...
+ */
+ spin_lock_irqsave(&slice_convert_lock, flags);
+
+ lpsizes = mm_ctx_low_slices(&mm->context);
+ for (i = 0; i < SLICE_NUM_LOW; i++) {
+ if (!(mask->low_slices & (1u << i)))
+ continue;
+
+ mask_index = i & 0x1;
+ index = i >> 1;
+
+ /* Update the slice_mask */
+ old_psize = (lpsizes[index] >> (mask_index * 4)) & 0xf;
+ old_mask = slice_mask_for_size(&mm->context, old_psize);
+ old_mask->low_slices &= ~(1u << i);
+ psize_mask->low_slices |= 1u << i;
+
+ /* Update the sizes array */
+ lpsizes[index] = (lpsizes[index] & ~(0xf << (mask_index * 4))) |
+ (((unsigned long)psize) << (mask_index * 4));
+ }
+
+ hpsizes = mm_ctx_high_slices(&mm->context);
+ for (i = 0; i < GET_HIGH_SLICE_INDEX(mm_ctx_slb_addr_limit(&mm->context)); i++) {
+ if (!test_bit(i, mask->high_slices))
+ continue;
+
+ mask_index = i & 0x1;
+ index = i >> 1;
+
+ /* Update the slice_mask */
+ old_psize = (hpsizes[index] >> (mask_index * 4)) & 0xf;
+ old_mask = slice_mask_for_size(&mm->context, old_psize);
+ __clear_bit(i, old_mask->high_slices);
+ __set_bit(i, psize_mask->high_slices);
+
+ /* Update the sizes array */
+ hpsizes[index] = (hpsizes[index] & ~(0xf << (mask_index * 4))) |
+ (((unsigned long)psize) << (mask_index * 4));
+ }
+
+ slice_dbg(" lsps=%lx, hsps=%lx\n",
+ (unsigned long)mm_ctx_low_slices(&mm->context),
+ (unsigned long)mm_ctx_high_slices(&mm->context));
+
+ spin_unlock_irqrestore(&slice_convert_lock, flags);
+
+ copro_flush_all_slbs(mm);
+}
+
+/*
+ * Compute which slice addr is part of;
+ * set *boundary_addr to the start or end boundary of that slice
+ * (depending on the 'end' parameter);
+ * return a boolean indicating whether the slice is marked as available in
+ * the 'available' slice_mask.
+ */
+static bool slice_scan_available(unsigned long addr,
+ const struct slice_mask *available,
+ int end, unsigned long *boundary_addr)
+{
+ unsigned long slice;
+ if (slice_addr_is_low(addr)) {
+ slice = GET_LOW_SLICE_INDEX(addr);
+ *boundary_addr = (slice + end) << SLICE_LOW_SHIFT;
+ return !!(available->low_slices & (1u << slice));
+ } else {
+ slice = GET_HIGH_SLICE_INDEX(addr);
+ *boundary_addr = (slice + end) ?
+ ((slice + end) << SLICE_HIGH_SHIFT) : SLICE_LOW_TOP;
+ return !!test_bit(slice, available->high_slices);
+ }
+}
+
+static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
+ unsigned long addr, unsigned long len,
+ const struct slice_mask *available,
+ int psize, unsigned long high_limit)
+{
+ int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
+ unsigned long found, next_end;
+ struct vm_unmapped_area_info info;
+
+ info.flags = 0;
+ info.length = len;
+ info.align_mask = PAGE_MASK & ((1ul << pshift) - 1);
+ info.align_offset = 0;
+ /*
+	 * Check up to the allowed max value for this mmap request
+ */
+ while (addr < high_limit) {
+ info.low_limit = addr;
+ if (!slice_scan_available(addr, available, 1, &addr))
+ continue;
+
+ next_slice:
+ /*
+ * At this point [info.low_limit; addr) covers
+ * available slices only and ends at a slice boundary.
+ * Check if we need to reduce the range, or if we can
+ * extend it to cover the next available slice.
+ */
+ if (addr >= high_limit)
+ addr = high_limit;
+ else if (slice_scan_available(addr, available, 1, &next_end)) {
+ addr = next_end;
+ goto next_slice;
+ }
+ info.high_limit = addr;
+
+ found = vm_unmapped_area(&info);
+ if (!(found & ~PAGE_MASK))
+ return found;
+ }
+
+ return -ENOMEM;
+}
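
The bottom-up search above repeatedly extends [info.low_limit, addr) across consecutive available slices before handing the window to vm_unmapped_area(). The sketch below shows the same scan-and-extend idea over a plain availability bitmap (userspace, purely illustrative; the geometry and bitmap contents are made up):

#include <stdio.h>

#define NUM_LOW_SLICES	16
#define SLICE_SHIFT	28	/* 256MB slices */

int main(void)
{
	unsigned int available = 0x0f38;	/* slices 3-5 and 8-11 are free */
	unsigned int i = 0;

	while (i < NUM_LOW_SLICES) {
		unsigned int run_start;

		if (!(available & (1u << i))) {
			i++;
			continue;
		}
		run_start = i;
		while (i < NUM_LOW_SLICES && (available & (1u << i)))
			i++;			/* extend across the run */
		printf("candidate window [0x%lx, 0x%lx)\n",
		       (unsigned long)run_start << SLICE_SHIFT,
		       (unsigned long)i << SLICE_SHIFT);
	}
	return 0;
}
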
+
+static unsigned long slice_find_area_topdown(struct mm_struct *mm,
+ unsigned long addr, unsigned long len,
+ const struct slice_mask *available,
+ int psize, unsigned long high_limit)
+{
+ int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
+ unsigned long found, prev;
+ struct vm_unmapped_area_info info;
+ unsigned long min_addr = max(PAGE_SIZE, mmap_min_addr);
+
+ info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+ info.length = len;
+ info.align_mask = PAGE_MASK & ((1ul << pshift) - 1);
+ info.align_offset = 0;
+ /*
+	 * If we are trying to allocate above DEFAULT_MAP_WINDOW,
+	 * add the difference to the mmap_base.
+	 * Only apply this for requests whose high_limit is above
+	 * DEFAULT_MAP_WINDOW.
+ */
+ if (high_limit > DEFAULT_MAP_WINDOW)
+ addr += mm_ctx_slb_addr_limit(&mm->context) - DEFAULT_MAP_WINDOW;
+
+ while (addr > min_addr) {
+ info.high_limit = addr;
+ if (!slice_scan_available(addr - 1, available, 0, &addr))
+ continue;
+
+ prev_slice:
+ /*
+ * At this point [addr; info.high_limit) covers
+ * available slices only and starts at a slice boundary.
+ * Check if we need to reduce the range, or if we can
+ * extend it to cover the previous available slice.
+ */
+ if (addr < min_addr)
+ addr = min_addr;
+ else if (slice_scan_available(addr - 1, available, 0, &prev)) {
+ addr = prev;
+ goto prev_slice;
+ }
+ info.low_limit = addr;
+
+ found = vm_unmapped_area(&info);
+ if (!(found & ~PAGE_MASK))
+ return found;
+ }
+
+ /*
+ * A failed mmap() very likely causes application failure,
+ * so fall back to the bottom-up function here. This scenario
+ * can happen with large stack limits and large mmap()
+ * allocations.
+ */
+ return slice_find_area_bottomup(mm, TASK_UNMAPPED_BASE, len, available, psize, high_limit);
+}
+
+
+static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len,
+ const struct slice_mask *mask, int psize,
+ int topdown, unsigned long high_limit)
+{
+ if (topdown)
+ return slice_find_area_topdown(mm, mm->mmap_base, len, mask, psize, high_limit);
+ else
+ return slice_find_area_bottomup(mm, mm->mmap_base, len, mask, psize, high_limit);
+}
+
+static inline void slice_copy_mask(struct slice_mask *dst,
+ const struct slice_mask *src)
+{
+ dst->low_slices = src->low_slices;
+ if (!SLICE_NUM_HIGH)
+ return;
+ bitmap_copy(dst->high_slices, src->high_slices, SLICE_NUM_HIGH);
+}
+
+static inline void slice_or_mask(struct slice_mask *dst,
+ const struct slice_mask *src1,
+ const struct slice_mask *src2)
+{
+ dst->low_slices = src1->low_slices | src2->low_slices;
+ if (!SLICE_NUM_HIGH)
+ return;
+ bitmap_or(dst->high_slices, src1->high_slices, src2->high_slices, SLICE_NUM_HIGH);
+}
+
+static inline void slice_andnot_mask(struct slice_mask *dst,
+ const struct slice_mask *src1,
+ const struct slice_mask *src2)
+{
+ dst->low_slices = src1->low_slices & ~src2->low_slices;
+ if (!SLICE_NUM_HIGH)
+ return;
+ bitmap_andnot(dst->high_slices, src1->high_slices, src2->high_slices, SLICE_NUM_HIGH);
+}
+
+#ifdef CONFIG_PPC_64K_PAGES
+#define MMU_PAGE_BASE MMU_PAGE_64K
+#else
+#define MMU_PAGE_BASE MMU_PAGE_4K
+#endif
+
+unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
+ unsigned long flags, unsigned int psize,
+ int topdown)
+{
+ struct slice_mask good_mask;
+ struct slice_mask potential_mask;
+ const struct slice_mask *maskp;
+ const struct slice_mask *compat_maskp = NULL;
+ int fixed = (flags & MAP_FIXED);
+ int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
+ unsigned long page_size = 1UL << pshift;
+ struct mm_struct *mm = current->mm;
+ unsigned long newaddr;
+ unsigned long high_limit;
+
+ high_limit = DEFAULT_MAP_WINDOW;
+ if (addr >= high_limit || (fixed && (addr + len > high_limit)))
+ high_limit = TASK_SIZE;
+
+ if (len > high_limit)
+ return -ENOMEM;
+ if (len & (page_size - 1))
+ return -EINVAL;
+ if (fixed) {
+ if (addr & (page_size - 1))
+ return -EINVAL;
+ if (addr > high_limit - len)
+ return -ENOMEM;
+ }
+
+ if (high_limit > mm_ctx_slb_addr_limit(&mm->context)) {
+ /*
+ * Increasing the slb_addr_limit does not require
+		 * the slice mask cache to be recalculated because it should
+		 * already be initialised beyond the old address limit.
+ */
+ mm_ctx_set_slb_addr_limit(&mm->context, high_limit);
+
+ on_each_cpu(slice_flush_segments, mm, 1);
+ }
+
+ /* Sanity checks */
+ BUG_ON(mm->task_size == 0);
+ BUG_ON(mm_ctx_slb_addr_limit(&mm->context) == 0);
+ VM_BUG_ON(radix_enabled());
+
+ slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize);
+ slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d\n",
+ addr, len, flags, topdown);
+
+ /* If hint, make sure it matches our alignment restrictions */
+ if (!fixed && addr) {
+ addr = ALIGN(addr, page_size);
+ slice_dbg(" aligned addr=%lx\n", addr);
+ /* Ignore hint if it's too large or overlaps a VMA */
+ if (addr > high_limit - len || addr < mmap_min_addr ||
+ !slice_area_is_free(mm, addr, len))
+ addr = 0;
+ }
+
+ /* First make up a "good" mask of slices that have the right size
+ * already
+ */
+ maskp = slice_mask_for_size(&mm->context, psize);
+
+ /*
+ * Here "good" means slices that are already the right page size,
+ * "compat" means slices that have a compatible page size (i.e.
+ * 4k in a 64k pagesize kernel), and "free" means slices without
+ * any VMAs.
+ *
+ * If MAP_FIXED:
+ * check if fits in good | compat => OK
+ * check if fits in good | compat | free => convert free
+ * else bad
+ * If have hint:
+ * check if hint fits in good => OK
+ * check if hint fits in good | free => convert free
+ * Otherwise:
+ * search in good, found => OK
+ * search in good | free, found => convert free
+ * search in good | compat | free, found => convert free.
+ */
+
+ /*
+	 * If we support combo pages, we can allow 64k pages in 4k slices.
+	 * The mask copies could be avoided in most cases here if we had
+	 * a pointer to the good mask for the next code to use.
+ */
+ if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && psize == MMU_PAGE_64K) {
+ compat_maskp = slice_mask_for_size(&mm->context, MMU_PAGE_4K);
+ if (fixed)
+ slice_or_mask(&good_mask, maskp, compat_maskp);
+ else
+ slice_copy_mask(&good_mask, maskp);
+ } else {
+ slice_copy_mask(&good_mask, maskp);
+ }
+
+ slice_print_mask(" good_mask", &good_mask);
+ if (compat_maskp)
+ slice_print_mask(" compat_mask", compat_maskp);
+
+ /* First check hint if it's valid or if we have MAP_FIXED */
+ if (addr != 0 || fixed) {
+ /* Check if we fit in the good mask. If we do, we just return,
+ * nothing else to do
+ */
+ if (slice_check_range_fits(mm, &good_mask, addr, len)) {
+ slice_dbg(" fits good !\n");
+ newaddr = addr;
+ goto return_addr;
+ }
+ } else {
+ /* Now let's see if we can find something in the existing
+ * slices for that size
+ */
+ newaddr = slice_find_area(mm, len, &good_mask,
+ psize, topdown, high_limit);
+ if (newaddr != -ENOMEM) {
+			/* Found within the good mask, we don't have to set
+			 * anything up, so we return directly
+ */
+ slice_dbg(" found area at 0x%lx\n", newaddr);
+ goto return_addr;
+ }
+ }
+ /*
+ * We don't fit in the good mask, check what other slices are
+ * empty and thus can be converted
+ */
+ slice_mask_for_free(mm, &potential_mask, high_limit);
+ slice_or_mask(&potential_mask, &potential_mask, &good_mask);
+ slice_print_mask(" potential", &potential_mask);
+
+ if (addr != 0 || fixed) {
+ if (slice_check_range_fits(mm, &potential_mask, addr, len)) {
+ slice_dbg(" fits potential !\n");
+ newaddr = addr;
+ goto convert;
+ }
+ }
+
+ /* If we have MAP_FIXED and failed the above steps, then error out */
+ if (fixed)
+ return -EBUSY;
+
+ slice_dbg(" search...\n");
+
+ /* If we had a hint that didn't work out, see if we can fit
+ * anywhere in the good area.
+ */
+ if (addr) {
+ newaddr = slice_find_area(mm, len, &good_mask,
+ psize, topdown, high_limit);
+ if (newaddr != -ENOMEM) {
+ slice_dbg(" found area at 0x%lx\n", newaddr);
+ goto return_addr;
+ }
+ }
+
+ /* Now let's see if we can find something in the existing slices
+ * for that size plus free slices
+ */
+ newaddr = slice_find_area(mm, len, &potential_mask,
+ psize, topdown, high_limit);
+
+ if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && newaddr == -ENOMEM &&
+ psize == MMU_PAGE_64K) {
+ /* retry the search with 4k-page slices included */
+ slice_or_mask(&potential_mask, &potential_mask, compat_maskp);
+ newaddr = slice_find_area(mm, len, &potential_mask,
+ psize, topdown, high_limit);
+ }
+
+ if (newaddr == -ENOMEM)
+ return -ENOMEM;
+
+ slice_range_to_mask(newaddr, len, &potential_mask);
+ slice_dbg(" found potential area at 0x%lx\n", newaddr);
+ slice_print_mask(" mask", &potential_mask);
+
+ convert:
+ /*
+ * Try to allocate the context before we do slice convert
+ * so that we handle the context allocation failure gracefully.
+ */
+ if (need_extra_context(mm, newaddr)) {
+ if (alloc_extended_context(mm, newaddr) < 0)
+ return -ENOMEM;
+ }
+
+ slice_andnot_mask(&potential_mask, &potential_mask, &good_mask);
+ if (compat_maskp && !fixed)
+ slice_andnot_mask(&potential_mask, &potential_mask, compat_maskp);
+ if (potential_mask.low_slices ||
+ (SLICE_NUM_HIGH &&
+ !bitmap_empty(potential_mask.high_slices, SLICE_NUM_HIGH))) {
+ slice_convert(mm, &potential_mask, psize);
+ if (psize > MMU_PAGE_BASE)
+ on_each_cpu(slice_flush_segments, mm, 1);
+ }
+ return newaddr;
+
+return_addr:
+ if (need_extra_context(mm, newaddr)) {
+ if (alloc_extended_context(mm, newaddr) < 0)
+ return -ENOMEM;
+ }
+ return newaddr;
+}
+EXPORT_SYMBOL_GPL(slice_get_unmapped_area);
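
The MAP_FIXED branch of the policy spelled out in the comment inside slice_get_unmapped_area() reduces to two mask checks. A compilable restatement of just that branch (the masks are plain bitmaps here, not the kernel's slice_mask type):

#include <stdio.h>

enum { SLICE_FIT_OK, SLICE_FIT_CONVERT, SLICE_FIT_BAD };

/*
 * MAP_FIXED policy from the comment in slice_get_unmapped_area():
 *   fits in good|compat        -> OK as is
 *   fits in good|compat|free   -> OK after converting the free slices
 *   otherwise                  -> bad
 */
static int fixed_policy(unsigned int good, unsigned int compat,
			unsigned int free, unsigned int wanted)
{
	if ((wanted & (good | compat)) == wanted)
		return SLICE_FIT_OK;
	if ((wanted & (good | compat | free)) == wanted)
		return SLICE_FIT_CONVERT;
	return SLICE_FIT_BAD;
}

int main(void)
{
	static const char * const name[] = { "ok", "convert", "bad" };

	/* request covers slices 2 and 3; 2 is right-sized, 3 is merely free */
	printf("%s\n", name[fixed_policy(0x4, 0x0, 0x8, 0xc)]);
	/* same request, but slice 3 is neither right-sized nor free */
	printf("%s\n", name[fixed_policy(0x4, 0x0, 0x0, 0xc)]);
	return 0;
}
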
+
+unsigned long arch_get_unmapped_area(struct file *filp,
+ unsigned long addr,
+ unsigned long len,
+ unsigned long pgoff,
+ unsigned long flags)
+{
+ if (radix_enabled())
+ return generic_get_unmapped_area(filp, addr, len, pgoff, flags);
+
+ return slice_get_unmapped_area(addr, len, flags,
+ mm_ctx_user_psize(&current->mm->context), 0);
+}
+
+unsigned long arch_get_unmapped_area_topdown(struct file *filp,
+ const unsigned long addr0,
+ const unsigned long len,
+ const unsigned long pgoff,
+ const unsigned long flags)
+{
+ if (radix_enabled())
+ return generic_get_unmapped_area_topdown(filp, addr0, len, pgoff, flags);
+
+ return slice_get_unmapped_area(addr0, len, flags,
+ mm_ctx_user_psize(&current->mm->context), 1);
+}
+
+unsigned int notrace get_slice_psize(struct mm_struct *mm, unsigned long addr)
+{
+ unsigned char *psizes;
+ int index, mask_index;
+
+ VM_BUG_ON(radix_enabled());
+
+ if (slice_addr_is_low(addr)) {
+ psizes = mm_ctx_low_slices(&mm->context);
+ index = GET_LOW_SLICE_INDEX(addr);
+ } else {
+ psizes = mm_ctx_high_slices(&mm->context);
+ index = GET_HIGH_SLICE_INDEX(addr);
+ }
+ mask_index = index & 0x1;
+ return (psizes[index >> 1] >> (mask_index * 4)) & 0xf;
+}
+EXPORT_SYMBOL_GPL(get_slice_psize);
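
get_slice_psize() above and the memset() initialisation in slice_init_new_context_exec() below both rely on packing two slice psizes per byte, four bits each. A standalone illustration of that encoding (the psize numbers are arbitrary stand-ins, not the real MMU_PAGE_* values):

#include <stdio.h>

int main(void)
{
	unsigned char psizes[8];	/* two slices per byte -> 16 slices */
	unsigned int def = 2;		/* stand-in for the default psize */
	unsigned int i, idx;

	/* (def << 4) | def packs the default psize into both nibbles */
	for (i = 0; i < sizeof(psizes); i++)
		psizes[i] = (def << 4) | def;

	/* change slice 5 to psize 4, the same way slice_convert() does */
	idx = 5;
	psizes[idx >> 1] = (psizes[idx >> 1] & ~(0xf << ((idx & 1) * 4))) |
			   (4 << ((idx & 1) * 4));

	/* decode slices 4..6 exactly as get_slice_psize() does */
	for (idx = 4; idx <= 6; idx++) {
		unsigned int mask_index = idx & 0x1;

		printf("slice %u -> psize %u\n", idx,
		       (psizes[idx >> 1] >> (mask_index * 4)) & 0xf);
	}
	return 0;
}
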
+
+void slice_init_new_context_exec(struct mm_struct *mm)
+{
+ unsigned char *hpsizes, *lpsizes;
+ struct slice_mask *mask;
+ unsigned int psize = mmu_virtual_psize;
+
+ slice_dbg("slice_init_new_context_exec(mm=%p)\n", mm);
+
+ /*
+ * In the case of exec, use the default limit. In the
+ * case of fork it is just inherited from the mm being
+ * duplicated.
+ */
+ mm_ctx_set_slb_addr_limit(&mm->context, SLB_ADDR_LIMIT_DEFAULT);
+ mm_ctx_set_user_psize(&mm->context, psize);
+
+ /*
+ * Set all slice psizes to the default.
+ */
+ lpsizes = mm_ctx_low_slices(&mm->context);
+ memset(lpsizes, (psize << 4) | psize, SLICE_NUM_LOW >> 1);
+
+ hpsizes = mm_ctx_high_slices(&mm->context);
+ memset(hpsizes, (psize << 4) | psize, SLICE_NUM_HIGH >> 1);
+
+ /*
+ * Slice mask cache starts zeroed, fill the default size cache.
+ */
+ mask = slice_mask_for_size(&mm->context, psize);
+ mask->low_slices = ~0UL;
+ if (SLICE_NUM_HIGH)
+ bitmap_fill(mask->high_slices, SLICE_NUM_HIGH);
+}
+
+void slice_setup_new_exec(void)
+{
+ struct mm_struct *mm = current->mm;
+
+ slice_dbg("slice_setup_new_exec(mm=%p)\n", mm);
+
+ if (!is_32bit_task())
+ return;
+
+ mm_ctx_set_slb_addr_limit(&mm->context, DEFAULT_MAP_WINDOW);
+}
+
+void slice_set_range_psize(struct mm_struct *mm, unsigned long start,
+ unsigned long len, unsigned int psize)
+{
+ struct slice_mask mask;
+
+ VM_BUG_ON(radix_enabled());
+
+ slice_range_to_mask(start, len, &mask);
+ slice_convert(mm, &mask, psize);
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * is_hugepage_only_range() is used by generic code to verify whether
+ * a normal mmap mapping (non hugetlbfs) is valid on a given area.
+ *
+ * Until the generic code provides a more generic hook and/or starts
+ * calling arch get_unmapped_area for MAP_FIXED (which our implementation
+ * here knows how to deal with), we hijack it to keep standard mappings
+ * away from us.
+ *
+ * Because of that generic code limitation, a MAP_FIXED mapping cannot
+ * "convert" back a slice with no VMAs to the standard page size, only
+ * get_unmapped_area() can. It would be possible to fix it here but I
+ * prefer working on fixing the generic code instead.
+ *
+ * WARNING: This will not work if hugetlbfs isn't enabled since the
+ * generic code will redefine that function as 0 in that case. This is ok
+ * for now as we only use slices with hugetlbfs enabled. This should
+ * be fixed as the generic code gets fixed.
+ */
+int slice_is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
+ unsigned long len)
+{
+ const struct slice_mask *maskp;
+ unsigned int psize = mm_ctx_user_psize(&mm->context);
+
+ VM_BUG_ON(radix_enabled());
+
+ maskp = slice_mask_for_size(&mm->context, psize);
+
+ /* We need to account for 4k slices too */
+ if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && psize == MMU_PAGE_64K) {
+ const struct slice_mask *compat_maskp;
+ struct slice_mask available;
+
+ compat_maskp = slice_mask_for_size(&mm->context, MMU_PAGE_4K);
+ slice_or_mask(&available, maskp, compat_maskp);
+ return !slice_check_range_fits(mm, &available, addr, len);
+ }
+
+ return !slice_check_range_fits(mm, maskp, addr, len);
+}
+
+unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
+{
+ /* With radix we don't use slice, so derive it from vma*/
+ if (radix_enabled())
+ return vma_kernel_pagesize(vma);
+
+ return 1UL << mmu_psize_to_shift(get_slice_psize(vma->vm_mm, vma->vm_start));
+}
+
+static int file_to_psize(struct file *file)
+{
+ struct hstate *hstate = hstate_file(file);
+ return shift_to_mmu_psize(huge_page_shift(hstate));
+}
+
+unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
+ if (radix_enabled())
+ return generic_hugetlb_get_unmapped_area(file, addr, len, pgoff, flags);
+
+ return slice_get_unmapped_area(addr, len, flags, file_to_psize(file), 1);
+}
+#endif
diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c
index 2ef24a53f4c9..ec98e526167e 100644
--- a/arch/powerpc/mm/book3s64/subpage_prot.c
+++ b/arch/powerpc/mm/book3s64/subpage_prot.c
@@ -11,7 +11,7 @@
#include <linux/hugetlb.h>
#include <linux/syscalls.h>
-#include <asm/pgtable.h>
+#include <linux/pgtable.h>
#include <linux/uaccess.h>
/*
@@ -54,21 +54,25 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
int npages)
{
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
spinlock_t *ptl;
pgd = pgd_offset(mm, addr);
- if (pgd_none(*pgd))
+ p4d = p4d_offset(pgd, addr);
+ if (p4d_none(*p4d))
return;
- pud = pud_offset(pgd, addr);
+ pud = pud_offset(p4d, addr);
if (pud_none(*pud))
return;
pmd = pmd_offset(pud, addr);
if (pmd_none(*pmd))
return;
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (!pte)
+ return;
arch_enter_lazy_mmu_mode();
for (; npages > 0; --npages) {
pte_update(mm, addr, pte, 0, 0, 0);
@@ -92,7 +96,7 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len)
size_t nw;
unsigned long next, limit;
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
spt = mm_ctx_subpage_prot(&mm->context);
if (!spt)
@@ -127,7 +131,7 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len)
}
err_out:
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -141,30 +145,22 @@ static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
static const struct mm_walk_ops subpage_walk_ops = {
.pmd_entry = subpage_walk_pmd_entry,
+ .walk_lock = PGWALK_WRLOCK_VERIFY,
};
static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
unsigned long len)
{
struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, addr);
/*
* We don't try too hard, we just mark all the vma in that range
* VM_NOHUGEPAGE and split them.
*/
- vma = find_vma(mm, addr);
- /*
- * If the range is in unmapped range, just return
- */
- if (vma && ((addr + len) <= vma->vm_start))
- return;
-
- while (vma) {
- if (vma->vm_start >= (addr + len))
- break;
- vma->vm_flags |= VM_NOHUGEPAGE;
+ for_each_vma_range(vmi, vma, addr + len) {
+ vm_flags_set(vma, VM_NOHUGEPAGE);
walk_page_vma(vma, &subpage_walk_ops, NULL);
- vma = vma->vm_next;
}
}
#else
@@ -217,13 +213,13 @@ SYSCALL_DEFINE3(subpage_prot, unsigned long, addr,
if (!access_ok(map, (len >> PAGE_SHIFT) * sizeof(u32)))
return -EFAULT;
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
spt = mm_ctx_subpage_prot(&mm->context);
if (!spt) {
/*
* Allocate subpage prot table if not already done.
- * Do this with mmap_sem held
+ * Do this with mmap_lock held
*/
spt = kzalloc(sizeof(struct subpage_prot_table), GFP_KERNEL);
if (!spt) {
@@ -267,11 +263,11 @@ SYSCALL_DEFINE3(subpage_prot, unsigned long, addr,
if (addr + (nw << PAGE_SHIFT) > next)
nw = (next - addr) >> PAGE_SHIFT;
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
if (__copy_from_user(spp, map, nw * sizeof(u32)))
return -EFAULT;
map += nw;
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
/* now flush any existing HPTEs for the range */
hpte_flush_range(mm, addr, nw);
@@ -280,6 +276,6 @@ SYSCALL_DEFINE3(subpage_prot, unsigned long, addr,
spt->maxaddr = limit;
err = 0;
out:
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
return err;
}
diff --git a/arch/powerpc/mm/book3s64/trace.c b/arch/powerpc/mm/book3s64/trace.c
new file mode 100644
index 000000000000..ccd64b5e6cac
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/trace.c
@@ -0,0 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * This file is for defining trace points and trace-related helpers.
+ */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#include <trace/events/thp.h>
+#endif