summaryrefslogtreecommitdiff
path: root/arch/powerpc/include/asm/book3s
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/include/asm/book3s')
-rw-r--r--arch/powerpc/include/asm/book3s/32/hash.h45
-rw-r--r--arch/powerpc/include/asm/book3s/32/kup.h235
-rw-r--r--arch/powerpc/include/asm/book3s/32/mmu-hash.h128
-rw-r--r--arch/powerpc/include/asm/book3s/32/pgalloc.h8
-rw-r--r--arch/powerpc/include/asm/book3s/32/pgtable.h406
-rw-r--r--arch/powerpc/include/asm/book3s/32/tlbflush.h80
-rw-r--r--arch/powerpc/include/asm/book3s/64/hash-4k.h51
-rw-r--r--arch/powerpc/include/asm/book3s/64/hash-64k.h38
-rw-r--r--arch/powerpc/include/asm/book3s/64/hash-pkey.h45
-rw-r--r--arch/powerpc/include/asm/book3s/64/hash.h33
-rw-r--r--arch/powerpc/include/asm/book3s/64/hugetlb.h7
-rw-r--r--arch/powerpc/include/asm/book3s/64/kexec.h33
-rw-r--r--arch/powerpc/include/asm/book3s/64/kup-radix.h108
-rw-r--r--arch/powerpc/include/asm/book3s/64/kup.h418
-rw-r--r--arch/powerpc/include/asm/book3s/64/mmu-hash.h63
-rw-r--r--arch/powerpc/include/asm/book3s/64/mmu.h89
-rw-r--r--arch/powerpc/include/asm/book3s/64/pgalloc.h24
-rw-r--r--arch/powerpc/include/asm/book3s/64/pgtable-4k.h10
-rw-r--r--arch/powerpc/include/asm/book3s/64/pgtable-64k.h9
-rw-r--r--arch/powerpc/include/asm/book3s/64/pgtable.h575
-rw-r--r--arch/powerpc/include/asm/book3s/64/pkeys.h25
-rw-r--r--arch/powerpc/include/asm/book3s/64/radix-4k.h2
-rw-r--r--arch/powerpc/include/asm/book3s/64/radix.h100
-rw-r--r--arch/powerpc/include/asm/book3s/64/slice.h26
-rw-r--r--arch/powerpc/include/asm/book3s/64/tlbflush-hash.h67
-rw-r--r--arch/powerpc/include/asm/book3s/64/tlbflush-radix.h25
-rw-r--r--arch/powerpc/include/asm/book3s/64/tlbflush.h141
-rw-r--r--arch/powerpc/include/asm/book3s/pgtable.h30
28 files changed, 1850 insertions, 971 deletions
diff --git a/arch/powerpc/include/asm/book3s/32/hash.h b/arch/powerpc/include/asm/book3s/32/hash.h
deleted file mode 100644
index 2a0a467d2985..000000000000
--- a/arch/powerpc/include/asm/book3s/32/hash.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_POWERPC_BOOK3S_32_HASH_H
-#define _ASM_POWERPC_BOOK3S_32_HASH_H
-#ifdef __KERNEL__
-
-/*
- * The "classic" 32-bit implementation of the PowerPC MMU uses a hash
- * table containing PTEs, together with a set of 16 segment registers,
- * to define the virtual to physical address mapping.
- *
- * We use the hash table as an extended TLB, i.e. a cache of currently
- * active mappings. We maintain a two-level page table tree, much
- * like that used by the i386, for the sake of the Linux memory
- * management code. Low-level assembler code in hash_low_32.S
- * (procedure hash_page) is responsible for extracting ptes from the
- * tree and putting them into the hash table when necessary, and
- * updating the accessed and modified bits in the page table tree.
- */
-
-#define _PAGE_PRESENT 0x001 /* software: pte contains a translation */
-#define _PAGE_HASHPTE 0x002 /* hash_page has made an HPTE for this pte */
-#define _PAGE_USER 0x004 /* usermode access allowed */
-#define _PAGE_GUARDED 0x008 /* G: prohibit speculative access */
-#define _PAGE_COHERENT 0x010 /* M: enforce memory coherence (SMP systems) */
-#define _PAGE_NO_CACHE 0x020 /* I: cache inhibit */
-#define _PAGE_WRITETHRU 0x040 /* W: cache write-through */
-#define _PAGE_DIRTY 0x080 /* C: page changed */
-#define _PAGE_ACCESSED 0x100 /* R: page referenced */
-#define _PAGE_EXEC 0x200 /* software: exec allowed */
-#define _PAGE_RW 0x400 /* software: user write access allowed */
-#define _PAGE_SPECIAL 0x800 /* software: Special page */
-
-#ifdef CONFIG_PTE_64BIT
-/* We never clear the high word of the pte */
-#define _PTE_NONE_MASK (0xffffffff00000000ULL | _PAGE_HASHPTE)
-#else
-#define _PTE_NONE_MASK _PAGE_HASHPTE
-#endif
-
-#define _PMD_PRESENT 0
-#define _PMD_PRESENT_MASK (PAGE_MASK)
-#define _PMD_BAD (~PAGE_MASK)
-
-#endif /* __KERNEL__ */
-#endif /* _ASM_POWERPC_BOOK3S_32_HASH_H */
diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h
index f9dc597b0b86..4e14a5427a63 100644
--- a/arch/powerpc/include/asm/book3s/32/kup.h
+++ b/arch/powerpc/include/asm/book3s/32/kup.h
@@ -2,141 +2,170 @@
#ifndef _ASM_POWERPC_BOOK3S_32_KUP_H
#define _ASM_POWERPC_BOOK3S_32_KUP_H
+#include <asm/bug.h>
#include <asm/book3s/32/mmu-hash.h>
+#include <asm/mmu.h>
+#include <asm/synch.h>
-#ifdef __ASSEMBLY__
-
-.macro kuep_update_sr gpr1, gpr2 /* NEVER use r0 as gpr2 due to addis */
-101: mtsrin \gpr1, \gpr2
- addi \gpr1, \gpr1, 0x111 /* next VSID */
- rlwinm \gpr1, \gpr1, 0, 0xf0ffffff /* clear VSID overflow */
- addis \gpr2, \gpr2, 0x1000 /* address of next segment */
- bdnz 101b
- isync
-.endm
-
-.macro kuep_lock gpr1, gpr2
-#ifdef CONFIG_PPC_KUEP
- li \gpr1, NUM_USER_SEGMENTS
- li \gpr2, 0
- mtctr \gpr1
- mfsrin \gpr1, \gpr2
- oris \gpr1, \gpr1, SR_NX@h /* set Nx */
- kuep_update_sr \gpr1, \gpr2
-#endif
-.endm
-
-.macro kuep_unlock gpr1, gpr2
-#ifdef CONFIG_PPC_KUEP
- li \gpr1, NUM_USER_SEGMENTS
- li \gpr2, 0
- mtctr \gpr1
- mfsrin \gpr1, \gpr2
- rlwinm \gpr1, \gpr1, 0, ~SR_NX /* Clear Nx */
- kuep_update_sr \gpr1, \gpr2
-#endif
-.endm
+#ifndef __ASSEMBLY__
#ifdef CONFIG_PPC_KUAP
-.macro kuap_update_sr gpr1, gpr2, gpr3 /* NEVER use r0 as gpr2 due to addis */
-101: mtsrin \gpr1, \gpr2
- addi \gpr1, \gpr1, 0x111 /* next VSID */
- rlwinm \gpr1, \gpr1, 0, 0xf0ffffff /* clear VSID overflow */
- addis \gpr2, \gpr2, 0x1000 /* address of next segment */
- cmplw \gpr2, \gpr3
- blt- 101b
- isync
-.endm
-
-.macro kuap_save_and_lock sp, thread, gpr1, gpr2, gpr3
- lwz \gpr2, KUAP(\thread)
- rlwinm. \gpr3, \gpr2, 28, 0xf0000000
- stw \gpr2, STACK_REGS_KUAP(\sp)
- beq+ 102f
- li \gpr1, 0
- stw \gpr1, KUAP(\thread)
- mfsrin \gpr1, \gpr2
- oris \gpr1, \gpr1, SR_KS@h /* set Ks */
- kuap_update_sr \gpr1, \gpr2, \gpr3
-102:
-.endm
-
-.macro kuap_restore sp, current, gpr1, gpr2, gpr3
- lwz \gpr2, STACK_REGS_KUAP(\sp)
- rlwinm. \gpr3, \gpr2, 28, 0xf0000000
- stw \gpr2, THREAD + KUAP(\current)
- beq+ 102f
- mfsrin \gpr1, \gpr2
- rlwinm \gpr1, \gpr1, 0, ~SR_KS /* Clear Ks */
- kuap_update_sr \gpr1, \gpr2, \gpr3
-102:
-.endm
-
-.macro kuap_check current, gpr
-#ifdef CONFIG_PPC_KUAP_DEBUG
- lwz \gpr2, KUAP(thread)
-999: twnei \gpr, 0
- EMIT_BUG_ENTRY 999b, __FILE__, __LINE__, (BUGFLAG_WARNING | BUGFLAG_ONCE)
-#endif
-.endm
+#include <linux/sched.h>
-#endif /* CONFIG_PPC_KUAP */
+#define KUAP_NONE (~0UL)
-#else /* !__ASSEMBLY__ */
+static __always_inline void kuap_lock_one(unsigned long addr)
+{
+ mtsr(mfsr(addr) | SR_KS, addr);
+ isync(); /* Context sync required after mtsr() */
+}
-#ifdef CONFIG_PPC_KUAP
+static __always_inline void kuap_unlock_one(unsigned long addr)
+{
+ mtsr(mfsr(addr) & ~SR_KS, addr);
+ isync(); /* Context sync required after mtsr() */
+}
-#include <linux/sched.h>
+static __always_inline void uaccess_begin_32s(unsigned long addr)
+{
+ unsigned long tmp;
+
+ asm volatile(ASM_MMU_FTR_IFSET(
+ "mfsrin %0, %1;"
+ "rlwinm %0, %0, 0, %2;"
+ "mtsrin %0, %1;"
+ "isync", "", %3)
+ : "=&r"(tmp)
+ : "r"(addr), "i"(~SR_KS), "i"(MMU_FTR_KUAP)
+ : "memory");
+}
-static inline void kuap_update_sr(u32 sr, u32 addr, u32 end)
+static __always_inline void uaccess_end_32s(unsigned long addr)
{
- addr &= 0xf0000000; /* align addr to start of segment */
- barrier(); /* make sure thread.kuap is updated before playing with SRs */
- while (addr < end) {
- mtsrin(sr, addr);
- sr += 0x111; /* next VSID */
- sr &= 0xf0ffffff; /* clear VSID overflow */
- addr += 0x10000000; /* address of next segment */
- }
- isync(); /* Context sync required after mtsrin() */
+ unsigned long tmp;
+
+ asm volatile(ASM_MMU_FTR_IFSET(
+ "mfsrin %0, %1;"
+ "oris %0, %0, %2;"
+ "mtsrin %0, %1;"
+ "isync", "", %3)
+ : "=&r"(tmp)
+ : "r"(addr), "i"(SR_KS >> 16), "i"(MMU_FTR_KUAP)
+ : "memory");
}
-static inline void allow_user_access(void __user *to, const void __user *from, u32 size)
+static __always_inline void __kuap_save_and_lock(struct pt_regs *regs)
{
- u32 addr, end;
+ unsigned long kuap = current->thread.kuap;
- if (__builtin_constant_p(to) && to == NULL)
+ regs->kuap = kuap;
+ if (unlikely(kuap == KUAP_NONE))
return;
- addr = (__force u32)to;
+ current->thread.kuap = KUAP_NONE;
+ kuap_lock_one(kuap);
+}
+#define __kuap_save_and_lock __kuap_save_and_lock
+
+static __always_inline void kuap_user_restore(struct pt_regs *regs)
+{
+}
- if (!addr || addr >= TASK_SIZE || !size)
+static __always_inline void __kuap_kernel_restore(struct pt_regs *regs, unsigned long kuap)
+{
+ if (unlikely(kuap != KUAP_NONE)) {
+ current->thread.kuap = KUAP_NONE;
+ kuap_lock_one(kuap);
+ }
+
+ if (likely(regs->kuap == KUAP_NONE))
return;
- end = min(addr + size, TASK_SIZE);
- current->thread.kuap = (addr & 0xf0000000) | ((((end - 1) >> 28) + 1) & 0xf);
- kuap_update_sr(mfsrin(addr) & ~SR_KS, addr, end); /* Clear Ks */
+ current->thread.kuap = regs->kuap;
+
+ kuap_unlock_one(regs->kuap);
}
-static inline void prevent_user_access(void __user *to, const void __user *from, u32 size)
+static __always_inline unsigned long __kuap_get_and_assert_locked(void)
{
- u32 addr = (__force u32)to;
- u32 end = min(addr + size, TASK_SIZE);
+ unsigned long kuap = current->thread.kuap;
+
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_PPC_KUAP_DEBUG) && kuap != KUAP_NONE);
+
+ return kuap;
+}
+#define __kuap_get_and_assert_locked __kuap_get_and_assert_locked
- if (!addr || addr >= TASK_SIZE || !size)
+static __always_inline void allow_user_access(void __user *to, const void __user *from,
+ u32 size, unsigned long dir)
+{
+ BUILD_BUG_ON(!__builtin_constant_p(dir));
+
+ if (!(dir & KUAP_WRITE))
return;
- current->thread.kuap = 0;
- kuap_update_sr(mfsrin(addr) | SR_KS, addr, end); /* set Ks */
+ current->thread.kuap = (__force u32)to;
+ uaccess_begin_32s((__force u32)to);
}
-static inline bool bad_kuap_fault(struct pt_regs *regs, bool is_write)
+static __always_inline void prevent_user_access(unsigned long dir)
{
+ u32 kuap = current->thread.kuap;
+
+ BUILD_BUG_ON(!__builtin_constant_p(dir));
+
+ if (!(dir & KUAP_WRITE))
+ return;
+
+ current->thread.kuap = KUAP_NONE;
+ uaccess_end_32s(kuap);
+}
+
+static __always_inline unsigned long prevent_user_access_return(void)
+{
+ unsigned long flags = current->thread.kuap;
+
+ if (flags != KUAP_NONE) {
+ current->thread.kuap = KUAP_NONE;
+ uaccess_end_32s(flags);
+ }
+
+ return flags;
+}
+
+static __always_inline void restore_user_access(unsigned long flags)
+{
+ if (flags != KUAP_NONE) {
+ current->thread.kuap = flags;
+ uaccess_begin_32s(flags);
+ }
+}
+
+static __always_inline bool
+__bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write)
+{
+ unsigned long kuap = regs->kuap;
+
if (!is_write)
return false;
+ if (kuap == KUAP_NONE)
+ return true;
+
+ /*
+ * If faulting address doesn't match unlocked segment, change segment.
+ * In case of unaligned store crossing two segments, emulate store.
+ */
+ if ((kuap ^ address) & 0xf0000000) {
+ if (!(kuap & 0x0fffffff) && address > kuap - 4 && fix_alignment(regs)) {
+ regs_add_return_ip(regs, 4);
+ emulate_single_step(regs);
+ } else {
+ regs->kuap = address;
+ }
+ }
- return WARN(!regs->kuap, "Bug: write fault blocked by segment registers !");
+ return false;
}
#endif /* CONFIG_PPC_KUAP */
diff --git a/arch/powerpc/include/asm/book3s/32/mmu-hash.h b/arch/powerpc/include/asm/book3s/32/mmu-hash.h
index 2e277ca0170f..78c6a5fde1d6 100644
--- a/arch/powerpc/include/asm/book3s/32/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/32/mmu-hash.h
@@ -64,7 +64,92 @@ struct ppc_bat {
#define SR_KP 0x20000000 /* User key */
#define SR_KS 0x40000000 /* Supervisor key */
-#ifndef __ASSEMBLY__
+#ifdef __ASSEMBLY__
+
+#include <asm/asm-offsets.h>
+
+.macro uus_addi sr reg1 reg2 imm
+ .if NUM_USER_SEGMENTS > \sr
+ addi \reg1,\reg2,\imm
+ .endif
+.endm
+
+.macro uus_mtsr sr reg1
+ .if NUM_USER_SEGMENTS > \sr
+ mtsr \sr, \reg1
+ .endif
+.endm
+
+/*
+ * This isync() shouldn't be necessary as the kernel is not excepted to run
+ * any instruction in userspace soon after the update of segments and 'rfi'
+ * instruction is used to return to userspace, but hash based cores
+ * (at least G3) seem to exhibit a random behaviour when the 'isync' is not
+ * there. 603 cores don't have this behaviour so don't do the 'isync' as it
+ * saves several CPU cycles.
+ */
+.macro uus_isync
+#ifdef CONFIG_PPC_BOOK3S_604
+BEGIN_MMU_FTR_SECTION
+ isync
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE)
+#endif
+.endm
+
+.macro update_user_segments_by_4 tmp1 tmp2 tmp3 tmp4
+ uus_addi 1, \tmp2, \tmp1, 0x111
+ uus_addi 2, \tmp3, \tmp1, 0x222
+ uus_addi 3, \tmp4, \tmp1, 0x333
+
+ uus_mtsr 0, \tmp1
+ uus_mtsr 1, \tmp2
+ uus_mtsr 2, \tmp3
+ uus_mtsr 3, \tmp4
+
+ uus_addi 4, \tmp1, \tmp1, 0x444
+ uus_addi 5, \tmp2, \tmp2, 0x444
+ uus_addi 6, \tmp3, \tmp3, 0x444
+ uus_addi 7, \tmp4, \tmp4, 0x444
+
+ uus_mtsr 4, \tmp1
+ uus_mtsr 5, \tmp2
+ uus_mtsr 6, \tmp3
+ uus_mtsr 7, \tmp4
+
+ uus_addi 8, \tmp1, \tmp1, 0x444
+ uus_addi 9, \tmp2, \tmp2, 0x444
+ uus_addi 10, \tmp3, \tmp3, 0x444
+ uus_addi 11, \tmp4, \tmp4, 0x444
+
+ uus_mtsr 8, \tmp1
+ uus_mtsr 9, \tmp2
+ uus_mtsr 10, \tmp3
+ uus_mtsr 11, \tmp4
+
+ uus_addi 12, \tmp1, \tmp1, 0x444
+ uus_addi 13, \tmp2, \tmp2, 0x444
+ uus_addi 14, \tmp3, \tmp3, 0x444
+ uus_addi 15, \tmp4, \tmp4, 0x444
+
+ uus_mtsr 12, \tmp1
+ uus_mtsr 13, \tmp2
+ uus_mtsr 14, \tmp3
+ uus_mtsr 15, \tmp4
+
+ uus_isync
+.endm
+
+#else
+
+/*
+ * This macro defines the mapping from contexts to VSIDs (virtual
+ * segment IDs). We use a skew on both the context and the high 4 bits
+ * of the 32-bit virtual address (the "effective segment ID") in order
+ * to spread out the entries in the MMU hash table. Note, if this
+ * function is changed then hash functions will have to be
+ * changed to correspond.
+ */
+#define CTX_TO_VSID(c, id) ((((c) * (897 * 16)) + (id * 0x111)) & 0xffffff)
/*
* Hardware Page Table Entry
@@ -90,10 +175,16 @@ struct hash_pte {
typedef struct {
unsigned long id;
- unsigned long vdso_base;
+ unsigned long sr0;
+ void __user *vdso;
} mm_context_t;
+#ifdef CONFIG_PPC_KUEP
+#define INIT_MM_CONTEXT(mm) .context.sr0 = SR_NX
+#endif
+
void update_bats(void);
+static inline void cleanup_cpu_mmu_context(void) { }
/* patch sites */
extern s32 patch__hash_page_A0, patch__hash_page_A1, patch__hash_page_A2;
@@ -101,6 +192,39 @@ extern s32 patch__hash_page_B, patch__hash_page_C;
extern s32 patch__flush_hash_A0, patch__flush_hash_A1, patch__flush_hash_A2;
extern s32 patch__flush_hash_B;
+#include <asm/reg.h>
+#include <asm/task_size_32.h>
+
+static __always_inline void update_user_segment(u32 n, u32 val)
+{
+ if (n << 28 < TASK_SIZE)
+ mtsr(val + n * 0x111, n << 28);
+}
+
+static __always_inline void update_user_segments(u32 val)
+{
+ val &= 0xf0ffffff;
+
+ update_user_segment(0, val);
+ update_user_segment(1, val);
+ update_user_segment(2, val);
+ update_user_segment(3, val);
+ update_user_segment(4, val);
+ update_user_segment(5, val);
+ update_user_segment(6, val);
+ update_user_segment(7, val);
+ update_user_segment(8, val);
+ update_user_segment(9, val);
+ update_user_segment(10, val);
+ update_user_segment(11, val);
+ update_user_segment(12, val);
+ update_user_segment(13, val);
+ update_user_segment(14, val);
+ update_user_segment(15, val);
+}
+
+int __init find_free_bat(void);
+unsigned int bat_block_size(unsigned long base, unsigned long top);
#endif /* !__ASSEMBLY__ */
/* We happily ignore the smaller BATs on 601, we don't actually use
diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h b/arch/powerpc/include/asm/book3s/32/pgalloc.h
index 998317702630..dc5c039eb28e 100644
--- a/arch/powerpc/include/asm/book3s/32/pgalloc.h
+++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h
@@ -49,7 +49,6 @@ static inline void pgtable_free(void *table, unsigned index_size)
#define get_hugepd_cache_index(x) (x)
-#ifdef CONFIG_SMP
static inline void pgtable_free_tlb(struct mmu_gather *tlb,
void *table, int shift)
{
@@ -66,13 +65,6 @@ static inline void __tlb_remove_table(void *_table)
pgtable_free(table, shift);
}
-#else
-static inline void pgtable_free_tlb(struct mmu_gather *tlb,
- void *table, int shift)
-{
- pgtable_free(table, shift);
-}
-#endif
static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
unsigned long address)
diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h
index 0796533d37dd..52971ee30717 100644
--- a/arch/powerpc/include/asm/book3s/32/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@@ -2,27 +2,52 @@
#ifndef _ASM_POWERPC_BOOK3S_32_PGTABLE_H
#define _ASM_POWERPC_BOOK3S_32_PGTABLE_H
-#define __ARCH_USE_5LEVEL_HACK
#include <asm-generic/pgtable-nopmd.h>
-#include <asm/book3s/32/hash.h>
+/*
+ * The "classic" 32-bit implementation of the PowerPC MMU uses a hash
+ * table containing PTEs, together with a set of 16 segment registers,
+ * to define the virtual to physical address mapping.
+ *
+ * We use the hash table as an extended TLB, i.e. a cache of currently
+ * active mappings. We maintain a two-level page table tree, much
+ * like that used by the i386, for the sake of the Linux memory
+ * management code. Low-level assembler code in hash_low_32.S
+ * (procedure hash_page) is responsible for extracting ptes from the
+ * tree and putting them into the hash table when necessary, and
+ * updating the accessed and modified bits in the page table tree.
+ */
-/* And here we include common definitions */
+#define _PAGE_PRESENT 0x001 /* software: pte contains a translation */
+#define _PAGE_HASHPTE 0x002 /* hash_page has made an HPTE for this pte */
+#define _PAGE_READ 0x004 /* software: read access allowed */
+#define _PAGE_GUARDED 0x008 /* G: prohibit speculative access */
+#define _PAGE_COHERENT 0x010 /* M: enforce memory coherence (SMP systems) */
+#define _PAGE_NO_CACHE 0x020 /* I: cache inhibit */
+#define _PAGE_WRITETHRU 0x040 /* W: cache write-through */
+#define _PAGE_DIRTY 0x080 /* C: page changed */
+#define _PAGE_ACCESSED 0x100 /* R: page referenced */
+#define _PAGE_EXEC 0x200 /* software: exec allowed */
+#define _PAGE_WRITE 0x400 /* software: user write access allowed */
+#define _PAGE_SPECIAL 0x800 /* software: Special page */
-#define _PAGE_KERNEL_RO 0
-#define _PAGE_KERNEL_ROX (_PAGE_EXEC)
-#define _PAGE_KERNEL_RW (_PAGE_DIRTY | _PAGE_RW)
-#define _PAGE_KERNEL_RWX (_PAGE_DIRTY | _PAGE_RW | _PAGE_EXEC)
+#ifdef CONFIG_PTE_64BIT
+/* We never clear the high word of the pte */
+#define _PTE_NONE_MASK (0xffffffff00000000ULL | _PAGE_HASHPTE)
+#else
+#define _PTE_NONE_MASK _PAGE_HASHPTE
+#endif
-#define _PAGE_HPTEFLAGS _PAGE_HASHPTE
+#define _PMD_PRESENT 0
+#define _PMD_PRESENT_MASK (PAGE_MASK)
+#define _PMD_BAD (~PAGE_MASK)
-#ifndef __ASSEMBLY__
+/* We borrow the _PAGE_READ bit to store the exclusive marker in swap PTEs. */
+#define _PAGE_SWP_EXCLUSIVE _PAGE_READ
-static inline bool pte_user(pte_t pte)
-{
- return pte_val(pte) & _PAGE_USER;
-}
-#endif /* __ASSEMBLY__ */
+/* And here we include common definitions */
+
+#define _PAGE_HPTEFLAGS _PAGE_HASHPTE
/*
* Location of the PFN in the PTE. Most 32-bit platforms use the same
@@ -37,8 +62,10 @@ static inline bool pte_user(pte_t pte)
*/
#ifdef CONFIG_PTE_64BIT
#define PTE_RPN_MASK (~((1ULL << PTE_RPN_SHIFT) - 1))
+#define MAX_POSSIBLE_PHYSMEM_BITS 36
#else
#define PTE_RPN_MASK (~((1UL << PTE_RPN_SHIFT) - 1))
+#define MAX_POSSIBLE_PHYSMEM_BITS 32
#endif
/*
@@ -57,49 +84,16 @@ static inline bool pte_user(pte_t pte)
#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED)
#define _PAGE_BASE (_PAGE_BASE_NC | _PAGE_COHERENT)
-/*
- * Permission masks used to generate the __P and __S table.
- *
- * Note:__pgprot is defined in arch/powerpc/include/asm/page.h
- *
- * Write permissions imply read permissions for now.
- */
-#define PAGE_NONE __pgprot(_PAGE_BASE)
-#define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW)
-#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW | _PAGE_EXEC)
-#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_USER)
-#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC)
-#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_USER)
-#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC)
+#include <asm/pgtable-masks.h>
/* Permission masks used for kernel mappings */
#define PAGE_KERNEL __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW)
#define PAGE_KERNEL_NC __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | _PAGE_NO_CACHE)
-#define PAGE_KERNEL_NCG __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \
- _PAGE_NO_CACHE | _PAGE_GUARDED)
+#define PAGE_KERNEL_NCG __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | _PAGE_NO_CACHE | _PAGE_GUARDED)
#define PAGE_KERNEL_X __pgprot(_PAGE_BASE | _PAGE_KERNEL_RWX)
#define PAGE_KERNEL_RO __pgprot(_PAGE_BASE | _PAGE_KERNEL_RO)
#define PAGE_KERNEL_ROX __pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX)
-/*
- * Protection used for kernel text. We want the debuggers to be able to
- * set breakpoints anywhere, so don't write protect the kernel text
- * on platforms where such control is possible.
- */
-#if defined(CONFIG_KGDB) || defined(CONFIG_XMON) || defined(CONFIG_BDI_SWITCH) ||\
- defined(CONFIG_KPROBES) || defined(CONFIG_DYNAMIC_FTRACE)
-#define PAGE_KERNEL_TEXT PAGE_KERNEL_X
-#else
-#define PAGE_KERNEL_TEXT PAGE_KERNEL_ROX
-#endif
-
-/* Make modules code happy. We don't set RO yet */
-#define PAGE_KERNEL_EXEC PAGE_KERNEL_X
-
-/* Advertise special mapping type for AGP */
-#define PAGE_AGP (PAGE_KERNEL_NC)
-#define HAVE_PAGE_AGP
-
#define PTE_INDEX_SIZE PTE_SHIFT
#define PMD_INDEX_SIZE 0
#define PUD_INDEX_SIZE 0
@@ -113,6 +107,9 @@ static inline bool pte_user(pte_t pte)
#define PMD_TABLE_SIZE 0
#define PUD_TABLE_SIZE 0
#define PGD_TABLE_SIZE (sizeof(pgd_t) << PGD_INDEX_SIZE)
+
+/* Bits to mask out from a PMD to get to the PTE page */
+#define PMD_MASKED_BITS (PTE_TABLE_SIZE - 1)
#endif /* __ASSEMBLY__ */
#define PTRS_PER_PTE (1 << PTE_INDEX_SIZE)
@@ -138,6 +135,7 @@ static inline bool pte_user(pte_t pte)
#ifndef __ASSEMBLY__
int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot);
+void unmap_kernel_page(unsigned long va);
#endif /* !__ASSEMBLY__ */
@@ -146,7 +144,14 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot);
* value (for now) on others, from where we can start layout kernel
* virtual space that goes below PKMAP and FIXMAP
*/
-#include <asm/fixmap.h>
+
+#define FIXADDR_SIZE 0
+#ifdef CONFIG_KASAN
+#include <asm/kasan.h>
+#define FIXADDR_TOP (KASAN_SHADOW_START - PAGE_SIZE)
+#else
+#define FIXADDR_TOP ((unsigned long)(-PAGE_SIZE))
+#endif
/*
* ioremap_bot starts at that address. Early ioremaps move down from there,
@@ -182,18 +187,16 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot);
*/
#define VMALLOC_OFFSET (0x1000000) /* 16M */
-/*
- * With CONFIG_STRICT_KERNEL_RWX, kernel segments are set NX. But when modules
- * are used, NX cannot be set on VMALLOC space. So vmalloc VM space and linear
- * memory shall not share segments.
- */
-#if defined(CONFIG_STRICT_KERNEL_RWX) && defined(CONFIG_MODULES)
-#define VMALLOC_START ((_ALIGN((long)high_memory, 256L << 20) + VMALLOC_OFFSET) & \
- ~(VMALLOC_OFFSET - 1))
-#else
#define VMALLOC_START ((((long)high_memory + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1)))
-#endif
+
+#ifdef CONFIG_KASAN_VMALLOC
+#define VMALLOC_END ALIGN_DOWN(ioremap_bot, PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT)
+#else
#define VMALLOC_END ioremap_bot
+#endif
+
+#define MODULES_END ALIGN_DOWN(PAGE_OFFSET, SZ_256M)
+#define MODULES_VADDR (MODULES_END - SZ_256M)
#ifndef __ASSEMBLY__
#include <linux/sched.h>
@@ -202,9 +205,6 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot);
/* Bits to mask out from a PGD to get to the PUD page */
#define PGD_MASKED_BITS 0
-#define pte_ERROR(e) \
- pr_err("%s:%d: bad pte %llx.\n", __FILE__, __LINE__, \
- (unsigned long long)pte_val(e))
#define pgd_ERROR(e) \
pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
/*
@@ -213,7 +213,7 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot);
*/
#define pte_clear(mm, addr, ptep) \
- do { pte_update(ptep, ~_PAGE_HASHPTE, 0); } while (0)
+ do { pte_update(mm, addr, ptep, ~_PAGE_HASHPTE, 0, 0); } while (0)
#define pmd_none(pmd) (!pmd_val(pmd))
#define pmd_bad(pmd) (pmd_val(pmd) & _PMD_BAD)
@@ -236,8 +236,14 @@ extern void add_hash_page(unsigned context, unsigned long va,
unsigned long pmdval);
/* Flush an entry from the TLB/hash table */
-extern void flush_hash_entry(struct mm_struct *mm, pte_t *ptep,
- unsigned long address);
+static inline void flush_hash_entry(struct mm_struct *mm, pte_t *ptep, unsigned long addr)
+{
+ if (mmu_has_feature(MMU_FTR_HPTE_TABLE)) {
+ unsigned long ptephys = __pa(ptep) & PAGE_MASK;
+
+ flush_hash_pages(mm->context.id, addr, ptephys, 1);
+ }
+}
/*
* PTE updates. This function is called whenever an existing
@@ -248,84 +254,74 @@ extern void flush_hash_entry(struct mm_struct *mm, pte_t *ptep,
* and the PTE may be either 32 or 64 bit wide. In the later case,
* when using atomic updates, only the low part of the PTE is
* accessed atomically.
- *
- * In addition, on 44x, we also maintain a global flag indicating
- * that an executable user mapping was modified, which is needed
- * to properly flush the virtually tagged instruction cache of
- * those implementations.
*/
+static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, pte_t *p,
+ unsigned long clr, unsigned long set, int huge)
+{
+ pte_basic_t old;
+
+ if (mmu_has_feature(MMU_FTR_HPTE_TABLE)) {
+ unsigned long tmp;
+
+ asm volatile(
+#ifndef CONFIG_PTE_64BIT
+ "1: lwarx %0, 0, %3\n"
+ " andc %1, %0, %4\n"
+#else
+ "1: lwarx %L0, 0, %3\n"
+ " lwz %0, -4(%3)\n"
+ " andc %1, %L0, %4\n"
+#endif
+ " or %1, %1, %5\n"
+ " stwcx. %1, 0, %3\n"
+ " bne- 1b"
+ : "=&r" (old), "=&r" (tmp), "=m" (*p)
#ifndef CONFIG_PTE_64BIT
-static inline unsigned long pte_update(pte_t *p,
- unsigned long clr,
- unsigned long set)
-{
- unsigned long old, tmp;
-
- __asm__ __volatile__("\
-1: lwarx %0,0,%3\n\
- andc %1,%0,%4\n\
- or %1,%1,%5\n"
-" stwcx. %1,0,%3\n\
- bne- 1b"
- : "=&r" (old), "=&r" (tmp), "=m" (*p)
- : "r" (p), "r" (clr), "r" (set), "m" (*p)
- : "cc" );
+ : "r" (p),
+#else
+ : "b" ((unsigned long)(p) + 4),
+#endif
+ "r" (clr), "r" (set), "m" (*p)
+ : "cc" );
+ } else {
+ old = pte_val(*p);
- return old;
-}
-#else /* CONFIG_PTE_64BIT */
-static inline unsigned long long pte_update(pte_t *p,
- unsigned long clr,
- unsigned long set)
-{
- unsigned long long old;
- unsigned long tmp;
-
- __asm__ __volatile__("\
-1: lwarx %L0,0,%4\n\
- lwzx %0,0,%3\n\
- andc %1,%L0,%5\n\
- or %1,%1,%6\n"
-" stwcx. %1,0,%4\n\
- bne- 1b"
- : "=&r" (old), "=&r" (tmp), "=m" (*p)
- : "r" (p), "r" ((unsigned long)(p) + 4), "r" (clr), "r" (set), "m" (*p)
- : "cc" );
+ *p = __pte((old & ~(pte_basic_t)clr) | set);
+ }
return old;
}
-#endif /* CONFIG_PTE_64BIT */
/*
* 2.6 calls this without flushing the TLB entry; this is wrong
* for our hash-based implementation, we fix that up here.
*/
#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-static inline int __ptep_test_and_clear_young(unsigned int context, unsigned long addr, pte_t *ptep)
+static inline int __ptep_test_and_clear_young(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
{
unsigned long old;
- old = pte_update(ptep, _PAGE_ACCESSED, 0);
- if (old & _PAGE_HASHPTE) {
- unsigned long ptephys = __pa(ptep) & PAGE_MASK;
- flush_hash_pages(context, addr, ptephys, 1);
- }
+ old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0, 0);
+ if (old & _PAGE_HASHPTE)
+ flush_hash_entry(mm, ptep, addr);
+
return (old & _PAGE_ACCESSED) != 0;
}
#define ptep_test_and_clear_young(__vma, __addr, __ptep) \
- __ptep_test_and_clear_young((__vma)->vm_mm->context.id, __addr, __ptep)
+ __ptep_test_and_clear_young((__vma)->vm_mm, __addr, __ptep)
#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
- return __pte(pte_update(ptep, ~_PAGE_HASHPTE, 0));
+ return __pte(pte_update(mm, addr, ptep, ~_PAGE_HASHPTE, 0, 0));
}
#define __HAVE_ARCH_PTEP_SET_WRPROTECT
static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
- pte_update(ptep, _PAGE_RW, 0);
+ pte_update(mm, addr, ptep, _PAGE_WRITE, 0, 0);
}
static inline void __ptep_set_access_flags(struct vm_area_struct *vma,
@@ -336,7 +332,7 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long set = pte_val(entry) &
(_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC);
- pte_update(ptep, 0, set);
+ pte_update(vma->vm_mm, address, ptep, 0, set, 0);
flush_tlb_page(vma, address);
}
@@ -344,43 +340,56 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma,
#define __HAVE_ARCH_PTE_SAME
#define pte_same(A,B) (((pte_val(A) ^ pte_val(B)) & ~_PAGE_HASHPTE) == 0)
-#define pmd_page_vaddr(pmd) \
- ((unsigned long)__va(pmd_val(pmd) & ~(PTE_TABLE_SIZE - 1)))
-#define pmd_page(pmd) \
- pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)
-
-/* to find an entry in a kernel page-table-directory */
-#define pgd_offset_k(address) pgd_offset(&init_mm, address)
-
-/* to find an entry in a page-table-directory */
-#define pgd_index(address) ((address) >> PGDIR_SHIFT)
-#define pgd_offset(mm, address) ((mm)->pgd + pgd_index(address))
-
-/* Find an entry in the third-level page table.. */
-#define pte_index(address) \
- (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
-#define pte_offset_kernel(dir, addr) \
- ((pte_t *) pmd_page_vaddr(*(dir)) + pte_index(addr))
-#define pte_offset_map(dir, addr) \
- ((pte_t *)(kmap_atomic(pmd_page(*(dir))) + \
- (pmd_page_vaddr(*(dir)) & ~PAGE_MASK)) + pte_index(addr))
-#define pte_unmap(pte) kunmap_atomic(pte)
+#define pmd_pfn(pmd) (pmd_val(pmd) >> PAGE_SHIFT)
+#define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd))
/*
- * Encode and decode a swap entry.
- * Note that the bits we use in a PTE for representing a swap entry
- * must not include the _PAGE_PRESENT bit or the _PAGE_HASHPTE bit (if used).
- * -- paulus
+ * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that
+ * are !pte_none() && !pte_present().
+ *
+ * Format of swap PTEs (32bit PTEs):
+ *
+ * 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * <----------------- offset --------------------> < type -> E H P
+ *
+ * E is the exclusive marker that is not stored in swap entries.
+ * _PAGE_PRESENT (P) and __PAGE_HASHPTE (H) must be 0.
+ *
+ * For 64bit PTEs, the offset is extended by 32bit.
*/
#define __swp_type(entry) ((entry).val & 0x1f)
#define __swp_offset(entry) ((entry).val >> 5)
-#define __swp_entry(type, offset) ((swp_entry_t) { (type) | ((offset) << 5) })
+#define __swp_entry(type, offset) ((swp_entry_t) { ((type) & 0x1f) | ((offset) << 5) })
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) >> 3 })
#define __swp_entry_to_pte(x) ((pte_t) { (x).val << 3 })
+static inline int pte_swp_exclusive(pte_t pte)
+{
+ return pte_val(pte) & _PAGE_SWP_EXCLUSIVE;
+}
+
+static inline pte_t pte_swp_mkexclusive(pte_t pte)
+{
+ return __pte(pte_val(pte) | _PAGE_SWP_EXCLUSIVE);
+}
+
+static inline pte_t pte_swp_clear_exclusive(pte_t pte)
+{
+ return __pte(pte_val(pte) & ~_PAGE_SWP_EXCLUSIVE);
+}
+
/* Generic accessors to PTE bits */
-static inline int pte_write(pte_t pte) { return !!(pte_val(pte) & _PAGE_RW);}
-static inline int pte_read(pte_t pte) { return 1; }
+static inline bool pte_read(pte_t pte)
+{
+ return !!(pte_val(pte) & _PAGE_READ);
+}
+
+static inline bool pte_write(pte_t pte)
+{
+ return !!(pte_val(pte) & _PAGE_WRITE);
+}
+
static inline int pte_dirty(pte_t pte) { return !!(pte_val(pte) & _PAGE_DIRTY); }
static inline int pte_young(pte_t pte) { return !!(pte_val(pte) & _PAGE_ACCESSED); }
static inline int pte_special(pte_t pte) { return !!(pte_val(pte) & _PAGE_SPECIAL); }
@@ -415,10 +424,10 @@ static inline bool pte_ci(pte_t pte)
static inline bool pte_access_permitted(pte_t pte, bool write)
{
/*
- * A read-only access is controlled by _PAGE_USER bit.
- * We have _PAGE_READ set for WRITE and EXECUTE
+ * A read-only access is controlled by _PAGE_READ bit.
+ * We have _PAGE_READ set for WRITE
*/
- if (!pte_present(pte) || !pte_user(pte) || !pte_read(pte))
+ if (!pte_present(pte) || !pte_read(pte))
return false;
if (write && !pte_write(pte))
@@ -439,15 +448,10 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot)
pgprot_val(pgprot));
}
-static inline unsigned long pte_pfn(pte_t pte)
-{
- return pte_val(pte) >> PTE_RPN_SHIFT;
-}
-
/* Generic modifiers for PTE bits */
static inline pte_t pte_wrprotect(pte_t pte)
{
- return __pte(pte_val(pte) & ~_PAGE_RW);
+ return __pte(pte_val(pte) & ~_PAGE_WRITE);
}
static inline pte_t pte_exprotect(pte_t pte)
@@ -475,8 +479,11 @@ static inline pte_t pte_mkpte(pte_t pte)
return pte;
}
-static inline pte_t pte_mkwrite(pte_t pte)
+static inline pte_t pte_mkwrite_novma(pte_t pte)
{
+ /*
+ * write implies read, hence set both
+ */
return __pte(pte_val(pte) | _PAGE_RW);
}
@@ -500,16 +507,6 @@ static inline pte_t pte_mkhuge(pte_t pte)
return pte;
}
-static inline pte_t pte_mkprivileged(pte_t pte)
-{
- return __pte(pte_val(pte) & ~_PAGE_USER);
-}
-
-static inline pte_t pte_mkuser(pte_t pte)
-{
- return __pte(pte_val(pte) | _PAGE_USER);
-}
-
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot));
@@ -518,58 +515,43 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
/* This low level function performs the actual PTE insertion
- * Setting the PTE depends on the MMU type and other factors. It's
- * an horrible mess that I'm not going to try to clean up now but
- * I'm keeping it in one place rather than spread around
+ * Setting the PTE depends on the MMU type and other factors.
+ *
+ * First case is 32-bit in UP mode with 32-bit PTEs, we need to preserve
+ * the _PAGE_HASHPTE bit since we may not have invalidated the previous
+ * translation in the hash yet (done in a subsequent flush_tlb_xxx())
+ * and see we need to keep track that this PTE needs invalidating.
+ *
+ * Second case is 32-bit with 64-bit PTE. In this case, we
+ * can just store as long as we do the two halves in the right order
+ * with a barrier in between. This is possible because we take care,
+ * in the hash code, to pre-invalidate if the PTE was already hashed,
+ * which synchronizes us with any concurrent invalidation.
+ * In the percpu case, we fallback to the simple update preserving
+ * the hash bits (ie, same as the non-SMP case).
+ *
+ * Third case is 32-bit in SMP mode with 32-bit PTEs. We use the
+ * helper pte_update() which does an atomic update. We need to do that
+ * because a concurrent invalidation can clear _PAGE_HASHPTE. If it's a
+ * per-CPU PTE such as a kmap_atomic, we also do a simple update preserving
+ * the hash bits instead.
*/
static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte, int percpu)
{
-#if defined(CONFIG_SMP) && !defined(CONFIG_PTE_64BIT)
- /* First case is 32-bit Hash MMU in SMP mode with 32-bit PTEs. We use the
- * helper pte_update() which does an atomic update. We need to do that
- * because a concurrent invalidation can clear _PAGE_HASHPTE. If it's a
- * per-CPU PTE such as a kmap_atomic, we do a simple update preserving
- * the hash bits instead (ie, same as the non-SMP case)
- */
- if (percpu)
- *ptep = __pte((pte_val(*ptep) & _PAGE_HASHPTE)
- | (pte_val(pte) & ~_PAGE_HASHPTE));
- else
- pte_update(ptep, ~_PAGE_HASHPTE, pte_val(pte));
-
-#elif defined(CONFIG_PTE_64BIT)
- /* Second case is 32-bit with 64-bit PTE. In this case, we
- * can just store as long as we do the two halves in the right order
- * with a barrier in between. This is possible because we take care,
- * in the hash code, to pre-invalidate if the PTE was already hashed,
- * which synchronizes us with any concurrent invalidation.
- * In the percpu case, we also fallback to the simple update preserving
- * the hash bits
- */
- if (percpu) {
- *ptep = __pte((pte_val(*ptep) & _PAGE_HASHPTE)
- | (pte_val(pte) & ~_PAGE_HASHPTE));
- return;
+ if ((!IS_ENABLED(CONFIG_SMP) && !IS_ENABLED(CONFIG_PTE_64BIT)) || percpu) {
+ *ptep = __pte((pte_val(*ptep) & _PAGE_HASHPTE) |
+ (pte_val(pte) & ~_PAGE_HASHPTE));
+ } else if (IS_ENABLED(CONFIG_PTE_64BIT)) {
+ if (pte_val(*ptep) & _PAGE_HASHPTE)
+ flush_hash_entry(mm, ptep, addr);
+
+ asm volatile("stw%X0 %2,%0; eieio; stw%X1 %L2,%1" :
+ "=m" (*ptep), "=m" (*((unsigned char *)ptep+4)) :
+ "r" (pte) : "memory");
+ } else {
+ pte_update(mm, addr, ptep, ~_PAGE_HASHPTE, pte_val(pte), 0);
}
- if (pte_val(*ptep) & _PAGE_HASHPTE)
- flush_hash_entry(mm, ptep, addr);
- __asm__ __volatile__("\
- stw%U0%X0 %2,%0\n\
- eieio\n\
- stw%U0%X0 %L2,%1"
- : "=m" (*ptep), "=m" (*((unsigned char *)ptep+4))
- : "r" (pte) : "memory");
-
-#else
- /* Third case is 32-bit hash table in UP mode, we need to preserve
- * the _PAGE_HASHPTE bit since we may not have invalidated the previous
- * translation in the hash yet (done in a subsequent flush_tlb_xxx())
- * and see we need to keep track that this PTE needs invalidating
- */
- *ptep = __pte((pte_val(*ptep) & _PAGE_HASHPTE)
- | (pte_val(pte) & ~_PAGE_HASHPTE));
-#endif
}
/*
diff --git a/arch/powerpc/include/asm/book3s/32/tlbflush.h b/arch/powerpc/include/asm/book3s/32/tlbflush.h
index 068085b709fb..e43534da5207 100644
--- a/arch/powerpc/include/asm/book3s/32/tlbflush.h
+++ b/arch/powerpc/include/asm/book3s/32/tlbflush.h
@@ -2,24 +2,90 @@
#ifndef _ASM_POWERPC_BOOK3S_32_TLBFLUSH_H
#define _ASM_POWERPC_BOOK3S_32_TLBFLUSH_H
+#include <linux/build_bug.h>
+
#define MMU_NO_CONTEXT (0)
/*
* TLB flushing for "classic" hash-MMU 32-bit CPUs, 6xx, 7xx, 7xxx
*/
-extern void flush_tlb_mm(struct mm_struct *mm);
-extern void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
-extern void flush_tlb_page_nohash(struct vm_area_struct *vma, unsigned long addr);
-extern void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
- unsigned long end);
-extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
+void hash__flush_tlb_mm(struct mm_struct *mm);
+void hash__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
+void hash__flush_range(struct mm_struct *mm, unsigned long start, unsigned long end);
+
+#ifdef CONFIG_SMP
+void _tlbie(unsigned long address);
+#else
+static inline void _tlbie(unsigned long address)
+{
+ asm volatile ("tlbie %0; sync" : : "r" (address) : "memory");
+}
+#endif
+void _tlbia(void);
+
+/*
+ * Called at the end of a mmu_gather operation to make sure the
+ * TLB flush is completely done.
+ */
+static inline void tlb_flush(struct mmu_gather *tlb)
+{
+ /* 603 needs to flush the whole TLB here since it doesn't use a hash table. */
+ if (!mmu_has_feature(MMU_FTR_HPTE_TABLE))
+ _tlbia();
+}
+
+static inline void flush_range(struct mm_struct *mm, unsigned long start, unsigned long end)
+{
+ start &= PAGE_MASK;
+ if (mmu_has_feature(MMU_FTR_HPTE_TABLE))
+ hash__flush_range(mm, start, end);
+ else if (end - start <= PAGE_SIZE)
+ _tlbie(start);
+ else
+ _tlbia();
+}
+
+static inline void flush_tlb_mm(struct mm_struct *mm)
+{
+ if (mmu_has_feature(MMU_FTR_HPTE_TABLE))
+ hash__flush_tlb_mm(mm);
+ else
+ _tlbia();
+}
+
+static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+ if (mmu_has_feature(MMU_FTR_HPTE_TABLE))
+ hash__flush_tlb_page(vma, vmaddr);
+ else
+ _tlbie(vmaddr);
+}
+
+static inline void
+flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end)
+{
+ flush_range(vma->vm_mm, start, end);
+}
+
+static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+ flush_range(&init_mm, start, end);
+}
+
static inline void local_flush_tlb_page(struct vm_area_struct *vma,
unsigned long vmaddr)
{
flush_tlb_page(vma, vmaddr);
}
+
+static inline void local_flush_tlb_page_psize(struct mm_struct *mm,
+ unsigned long vmaddr, int psize)
+{
+ flush_range(mm, vmaddr, vmaddr);
+}
+
static inline void local_flush_tlb_mm(struct mm_struct *mm)
{
flush_tlb_mm(mm);
}
-#endif /* _ASM_POWERPC_TLBFLUSH_H */
+#endif /* _ASM_POWERPC_BOOK3S_32_TLBFLUSH_H */
diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h
index 8fd8599c9395..6472b08fa1b0 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
@@ -13,20 +13,24 @@
*/
#define MAX_EA_BITS_PER_CONTEXT 46
-#define REGION_SHIFT (MAX_EA_BITS_PER_CONTEXT - 2)
/*
- * Our page table limit us to 64TB. Hence for the kernel mapping,
- * each MAP area is limited to 16 TB.
- * The four map areas are: linear mapping, vmap, IO and vmemmap
+ * Our page table limit us to 64TB. For 64TB physical memory, we only need 64GB
+ * of vmemmap space. To better support sparse memory layout, we use 61TB
+ * linear map range, 1TB of vmalloc, 1TB of I/O and 1TB of vmememmap.
*/
+#define REGION_SHIFT (40)
#define H_KERN_MAP_SIZE (ASM_CONST(1) << REGION_SHIFT)
/*
- * Define the address range of the kernel non-linear virtual area
- * 16TB
+ * Limits the linear mapping range
*/
-#define H_KERN_VIRT_START ASM_CONST(0xc000100000000000)
+#define H_MAX_PHYSMEM_BITS 46
+
+/*
+ * Define the address range of the kernel non-linear virtual area (61TB)
+ */
+#define H_KERN_VIRT_START ASM_CONST(0xc0003d0000000000)
#ifndef __ASSEMBLY__
#define H_PTE_TABLE_SIZE (sizeof(pte_t) << H_PTE_INDEX_SIZE)
@@ -34,11 +38,11 @@
#define H_PUD_TABLE_SIZE (sizeof(pud_t) << H_PUD_INDEX_SIZE)
#define H_PGD_TABLE_SIZE (sizeof(pgd_t) << H_PGD_INDEX_SIZE)
-#define H_PAGE_F_GIX_SHIFT 53
-#define H_PAGE_F_SECOND _RPAGE_RPN44 /* HPTE is in 2ndary HPTEG */
-#define H_PAGE_F_GIX (_RPAGE_RPN43 | _RPAGE_RPN42 | _RPAGE_RPN41)
-#define H_PAGE_BUSY _RPAGE_RSV1 /* software: PTE & hash are busy */
-#define H_PAGE_HASHPTE _RPAGE_RSV2 /* software: PTE & hash are busy */
+#define H_PAGE_F_GIX_SHIFT _PAGE_PA_MAX
+#define H_PAGE_F_SECOND _RPAGE_PKEY_BIT0 /* HPTE is in 2ndary HPTEG */
+#define H_PAGE_F_GIX (_RPAGE_RPN43 | _RPAGE_RPN42 | _RPAGE_RPN41)
+#define H_PAGE_BUSY _RPAGE_RSV1
+#define H_PAGE_HASHPTE _RPAGE_PKEY_BIT4
/* PTE flags to conserve for HPTE identification */
#define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_HASHPTE | \
@@ -57,11 +61,12 @@
#define H_PMD_FRAG_NR (PAGE_SIZE >> H_PMD_FRAG_SIZE_SHIFT)
/* memory key bits, only 8 keys supported */
-#define H_PTE_PKEY_BIT0 0
-#define H_PTE_PKEY_BIT1 0
-#define H_PTE_PKEY_BIT2 _RPAGE_RSV3
-#define H_PTE_PKEY_BIT3 _RPAGE_RSV4
-#define H_PTE_PKEY_BIT4 _RPAGE_RSV5
+#define H_PTE_PKEY_BIT4 0
+#define H_PTE_PKEY_BIT3 0
+#define H_PTE_PKEY_BIT2 _RPAGE_PKEY_BIT3
+#define H_PTE_PKEY_BIT1 _RPAGE_PKEY_BIT2
+#define H_PTE_PKEY_BIT0 _RPAGE_PKEY_BIT1
+
/*
* On all 4K setups, remap_4k_pfn() equates to remap_pfn_range()
@@ -131,12 +136,6 @@ static inline int hash__pmd_trans_huge(pmd_t pmd)
return 0;
}
-static inline int hash__pmd_same(pmd_t pmd_a, pmd_t pmd_b)
-{
- BUG();
- return 0;
-}
-
static inline pmd_t hash__pmd_mkhuge(pmd_t pmd)
{
BUG();
@@ -156,6 +155,12 @@ extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
extern int hash__has_transparent_hugepage(void);
#endif
+static inline pmd_t hash__pmd_mkdevmap(pmd_t pmd)
+{
+ BUG();
+ return pmd;
+}
+
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_POWERPC_BOOK3S_64_HASH_4K_H */
diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index d1d9177d9ebd..0bf6fd0bf42a 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -7,6 +7,19 @@
#define H_PUD_INDEX_SIZE 10 // size: 8B << 10 = 8KB, maps 2^10 x 16GB = 16TB
#define H_PGD_INDEX_SIZE 8 // size: 8B << 8 = 2KB, maps 2^8 x 16TB = 4PB
+/*
+ * If we store section details in page->flags we can't increase the MAX_PHYSMEM_BITS
+ * if we increase SECTIONS_WIDTH we will not store node details in page->flags and
+ * page_to_nid does a page->section->node lookup
+ * Hence only increase for VMEMMAP. Further depending on SPARSEMEM_EXTREME reduce
+ * memory requirements with large number of sections.
+ * 51 bits is the max physical real address on POWER9
+ */
+#if defined(CONFIG_SPARSEMEM_VMEMMAP) && defined(CONFIG_SPARSEMEM_EXTREME)
+#define H_MAX_PHYSMEM_BITS 51
+#else
+#define H_MAX_PHYSMEM_BITS 46
+#endif
/*
* Each context is 512TB size. SLB miss for first context/default context
@@ -32,15 +45,15 @@
*/
#define H_PAGE_COMBO _RPAGE_RPN0 /* this is a combo 4k page */
#define H_PAGE_4K_PFN _RPAGE_RPN1 /* PFN is for a single 4k page */
-#define H_PAGE_BUSY _RPAGE_RPN44 /* software: PTE & hash are busy */
+#define H_PAGE_BUSY _RPAGE_RSV1 /* software: PTE & hash are busy */
#define H_PAGE_HASHPTE _RPAGE_RPN43 /* PTE has associated HPTE */
/* memory key bits. */
-#define H_PTE_PKEY_BIT0 _RPAGE_RSV1
-#define H_PTE_PKEY_BIT1 _RPAGE_RSV2
-#define H_PTE_PKEY_BIT2 _RPAGE_RSV3
-#define H_PTE_PKEY_BIT3 _RPAGE_RSV4
-#define H_PTE_PKEY_BIT4 _RPAGE_RSV5
+#define H_PTE_PKEY_BIT4 _RPAGE_PKEY_BIT4
+#define H_PTE_PKEY_BIT3 _RPAGE_PKEY_BIT3
+#define H_PTE_PKEY_BIT2 _RPAGE_PKEY_BIT2
+#define H_PTE_PKEY_BIT1 _RPAGE_PKEY_BIT1
+#define H_PTE_PKEY_BIT0 _RPAGE_PKEY_BIT0
/*
* We need to differentiate between explicit huge page and THP huge
@@ -246,15 +259,10 @@ static inline void mark_hpte_slot_valid(unsigned char *hpte_slot_array,
*/
static inline int hash__pmd_trans_huge(pmd_t pmd)
{
- return !!((pmd_val(pmd) & (_PAGE_PTE | H_PAGE_THP_HUGE)) ==
+ return !!((pmd_val(pmd) & (_PAGE_PTE | H_PAGE_THP_HUGE | _PAGE_DEVMAP)) ==
(_PAGE_PTE | H_PAGE_THP_HUGE));
}
-static inline int hash__pmd_same(pmd_t pmd_a, pmd_t pmd_b)
-{
- return (((pmd_raw(pmd_a) ^ pmd_raw(pmd_b)) & ~cpu_to_be64(_PAGE_HPTEFLAGS)) == 0);
-}
-
static inline pmd_t hash__pmd_mkhuge(pmd_t pmd)
{
return __pmd(pmd_val(pmd) | (_PAGE_PTE | H_PAGE_THP_HUGE));
@@ -272,6 +280,12 @@ extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
unsigned long addr, pmd_t *pmdp);
extern int hash__has_transparent_hugepage(void);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+static inline pmd_t hash__pmd_mkdevmap(pmd_t pmd)
+{
+ return __pmd(pmd_val(pmd) | (_PAGE_PTE | H_PAGE_THP_HUGE | _PAGE_DEVMAP));
+}
+
#endif /* __ASSEMBLY__ */
#endif /* _ASM_POWERPC_BOOK3S_64_HASH_64K_H */
diff --git a/arch/powerpc/include/asm/book3s/64/hash-pkey.h b/arch/powerpc/include/asm/book3s/64/hash-pkey.h
new file mode 100644
index 000000000000..6c5564c4fae4
--- /dev/null
+++ b/arch/powerpc/include/asm/book3s/64/hash-pkey.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_POWERPC_BOOK3S_64_HASH_PKEY_H
+#define _ASM_POWERPC_BOOK3S_64_HASH_PKEY_H
+
+/* We use key 3 for KERNEL */
+#define HASH_DEFAULT_KERNEL_KEY (HPTE_R_KEY_BIT0 | HPTE_R_KEY_BIT1)
+
+static inline u64 hash__vmflag_to_pte_pkey_bits(u64 vm_flags)
+{
+ return (((vm_flags & VM_PKEY_BIT0) ? H_PTE_PKEY_BIT0 : 0x0UL) |
+ ((vm_flags & VM_PKEY_BIT1) ? H_PTE_PKEY_BIT1 : 0x0UL) |
+ ((vm_flags & VM_PKEY_BIT2) ? H_PTE_PKEY_BIT2 : 0x0UL) |
+ ((vm_flags & VM_PKEY_BIT3) ? H_PTE_PKEY_BIT3 : 0x0UL) |
+ ((vm_flags & VM_PKEY_BIT4) ? H_PTE_PKEY_BIT4 : 0x0UL));
+}
+
+static inline u64 pte_to_hpte_pkey_bits(u64 pteflags, unsigned long flags)
+{
+ unsigned long pte_pkey;
+
+ pte_pkey = (((pteflags & H_PTE_PKEY_BIT4) ? HPTE_R_KEY_BIT4 : 0x0UL) |
+ ((pteflags & H_PTE_PKEY_BIT3) ? HPTE_R_KEY_BIT3 : 0x0UL) |
+ ((pteflags & H_PTE_PKEY_BIT2) ? HPTE_R_KEY_BIT2 : 0x0UL) |
+ ((pteflags & H_PTE_PKEY_BIT1) ? HPTE_R_KEY_BIT1 : 0x0UL) |
+ ((pteflags & H_PTE_PKEY_BIT0) ? HPTE_R_KEY_BIT0 : 0x0UL));
+
+ if (mmu_has_feature(MMU_FTR_KUAP) ||
+ mmu_has_feature(MMU_FTR_BOOK3S_KUEP)) {
+ if ((pte_pkey == 0) && (flags & HPTE_USE_KERNEL_KEY))
+ return HASH_DEFAULT_KERNEL_KEY;
+ }
+
+ return pte_pkey;
+}
+
+static inline u16 hash__pte_to_pkey_bits(u64 pteflags)
+{
+ return (((pteflags & H_PTE_PKEY_BIT4) ? 0x10 : 0x0UL) |
+ ((pteflags & H_PTE_PKEY_BIT3) ? 0x8 : 0x0UL) |
+ ((pteflags & H_PTE_PKEY_BIT2) ? 0x4 : 0x0UL) |
+ ((pteflags & H_PTE_PKEY_BIT1) ? 0x2 : 0x0UL) |
+ ((pteflags & H_PTE_PKEY_BIT0) ? 0x1 : 0x0UL));
+}
+
+#endif
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
index 2781ebf6add4..6e70ae511631 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -18,6 +18,10 @@
#include <asm/book3s/64/hash-4k.h>
#endif
+#define H_PTRS_PER_PTE (1 << H_PTE_INDEX_SIZE)
+#define H_PTRS_PER_PMD (1 << H_PMD_INDEX_SIZE)
+#define H_PTRS_PER_PUD (1 << H_PUD_INDEX_SIZE)
+
/* Bits to set in a PMD/PUD/PGD entry valid bit*/
#define HASH_PMD_VAL_BITS (0x8000000000000000UL)
#define HASH_PUD_VAL_BITS (0x8000000000000000UL)
@@ -99,10 +103,6 @@
* Defines the address of the vmemap area, in its own region on
* hash table CPUs.
*/
-#ifdef CONFIG_PPC_MM_SLICES
-#define HAVE_ARCH_UNMAPPED_AREA
-#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
-#endif /* CONFIG_PPC_MM_SLICES */
/* PTEIDX nibble */
#define _PTEIDX_SECONDARY 0x8
@@ -132,11 +132,25 @@ static inline int get_region_id(unsigned long ea)
return region_id;
}
+static inline int hash__pmd_same(pmd_t pmd_a, pmd_t pmd_b)
+{
+ return (((pmd_raw(pmd_a) ^ pmd_raw(pmd_b)) & ~cpu_to_be64(_PAGE_HPTEFLAGS)) == 0);
+}
+
#define hash__pmd_bad(pmd) (pmd_val(pmd) & H_PMD_BAD_BITS)
+
+/*
+ * pud comparison that will work with both pte and page table pointer.
+ */
+static inline int hash__pud_same(pud_t pud_a, pud_t pud_b)
+{
+ return (((pud_raw(pud_a) ^ pud_raw(pud_b)) & ~cpu_to_be64(_PAGE_HPTEFLAGS)) == 0);
+}
#define hash__pud_bad(pud) (pud_val(pud) & H_PUD_BAD_BITS)
-static inline int hash__pgd_bad(pgd_t pgd)
+
+static inline int hash__p4d_bad(p4d_t p4d)
{
- return (pgd_val(pgd) == 0);
+ return (p4d_val(p4d) == 0);
}
#ifdef CONFIG_STRICT_KERNEL_RWX
extern void hash__mark_rodata_ro(void);
@@ -145,7 +159,7 @@ extern void hash__mark_initmem_nx(void);
extern void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned long pte, int huge);
-extern unsigned long htab_convert_pte_flags(unsigned long pteflags);
+unsigned long htab_convert_pte_flags(unsigned long pteflags, unsigned long flags);
/* Atomic PTE updates */
static inline unsigned long hash__pte_update(struct mm_struct *mm,
unsigned long addr,
@@ -251,9 +265,12 @@ extern int __meminit hash__vmemmap_create_mapping(unsigned long start,
extern void hash__vmemmap_remove_mapping(unsigned long start,
unsigned long page_size);
-int hash__create_section_mapping(unsigned long start, unsigned long end, int nid);
+int hash__create_section_mapping(unsigned long start, unsigned long end,
+ int nid, pgprot_t prot);
int hash__remove_section_mapping(unsigned long start, unsigned long end);
+void hash__kernel_map_pages(struct page *page, int numpages, int enable);
+
#endif /* !__ASSEMBLY__ */
#endif /* __KERNEL__ */
#endif /* _ASM_POWERPC_BOOK3S_64_HASH_H */
diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h
index 12e150e615b7..aa1c67c8bfc8 100644
--- a/arch/powerpc/include/asm/book3s/64/hugetlb.h
+++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h
@@ -1,6 +1,9 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_POWERPC_BOOK3S_64_HUGETLB_H
#define _ASM_POWERPC_BOOK3S_64_HUGETLB_H
+
+#include <asm/firmware.h>
+
/*
* For radix we want generic code to handle hugetlb. But then if we want
* both hash and radix to be enabled together we need to workaround the
@@ -8,10 +11,6 @@
*/
void radix__flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
void radix__local_flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
-extern unsigned long
-radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
- unsigned long len, unsigned long pgoff,
- unsigned long flags);
extern void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep,
diff --git a/arch/powerpc/include/asm/book3s/64/kexec.h b/arch/powerpc/include/asm/book3s/64/kexec.h
new file mode 100644
index 000000000000..df37a76c1e9f
--- /dev/null
+++ b/arch/powerpc/include/asm/book3s/64/kexec.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _ASM_POWERPC_BOOK3S_64_KEXEC_H_
+#define _ASM_POWERPC_BOOK3S_64_KEXEC_H_
+
+#include <asm/plpar_wrappers.h>
+
+#define reset_sprs reset_sprs
+static inline void reset_sprs(void)
+{
+ if (cpu_has_feature(CPU_FTR_ARCH_206)) {
+ mtspr(SPRN_AMR, 0);
+ mtspr(SPRN_UAMOR, 0);
+ }
+
+ if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
+ mtspr(SPRN_IAMR, 0);
+ if (cpu_has_feature(CPU_FTR_HVMODE))
+ mtspr(SPRN_CIABR, 0);
+ else
+ plpar_set_ciabr(0);
+ }
+
+ if (cpu_has_feature(CPU_FTR_ARCH_31)) {
+ mtspr(SPRN_DEXCR, 0);
+ mtspr(SPRN_HASHKEYR, 0);
+ }
+
+ /* Do we need isync()? We are going via a kexec reset */
+ isync();
+}
+
+#endif
diff --git a/arch/powerpc/include/asm/book3s/64/kup-radix.h b/arch/powerpc/include/asm/book3s/64/kup-radix.h
deleted file mode 100644
index f254de956d6a..000000000000
--- a/arch/powerpc/include/asm/book3s/64/kup-radix.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_POWERPC_BOOK3S_64_KUP_RADIX_H
-#define _ASM_POWERPC_BOOK3S_64_KUP_RADIX_H
-
-#include <linux/const.h>
-
-#define AMR_KUAP_BLOCK_READ UL(0x4000000000000000)
-#define AMR_KUAP_BLOCK_WRITE UL(0x8000000000000000)
-#define AMR_KUAP_BLOCKED (AMR_KUAP_BLOCK_READ | AMR_KUAP_BLOCK_WRITE)
-#define AMR_KUAP_SHIFT 62
-
-#ifdef __ASSEMBLY__
-
-.macro kuap_restore_amr gpr
-#ifdef CONFIG_PPC_KUAP
- BEGIN_MMU_FTR_SECTION_NESTED(67)
- ld \gpr, STACK_REGS_KUAP(r1)
- mtspr SPRN_AMR, \gpr
- END_MMU_FTR_SECTION_NESTED_IFSET(MMU_FTR_RADIX_KUAP, 67)
-#endif
-.endm
-
-.macro kuap_check_amr gpr1, gpr2
-#ifdef CONFIG_PPC_KUAP_DEBUG
- BEGIN_MMU_FTR_SECTION_NESTED(67)
- mfspr \gpr1, SPRN_AMR
- li \gpr2, (AMR_KUAP_BLOCKED >> AMR_KUAP_SHIFT)
- sldi \gpr2, \gpr2, AMR_KUAP_SHIFT
-999: tdne \gpr1, \gpr2
- EMIT_BUG_ENTRY 999b, __FILE__, __LINE__, (BUGFLAG_WARNING | BUGFLAG_ONCE)
- END_MMU_FTR_SECTION_NESTED_IFSET(MMU_FTR_RADIX_KUAP, 67)
-#endif
-.endm
-
-.macro kuap_save_amr_and_lock gpr1, gpr2, use_cr, msr_pr_cr
-#ifdef CONFIG_PPC_KUAP
- BEGIN_MMU_FTR_SECTION_NESTED(67)
- .ifnb \msr_pr_cr
- bne \msr_pr_cr, 99f
- .endif
- mfspr \gpr1, SPRN_AMR
- std \gpr1, STACK_REGS_KUAP(r1)
- li \gpr2, (AMR_KUAP_BLOCKED >> AMR_KUAP_SHIFT)
- sldi \gpr2, \gpr2, AMR_KUAP_SHIFT
- cmpd \use_cr, \gpr1, \gpr2
- beq \use_cr, 99f
- // We don't isync here because we very recently entered via rfid
- mtspr SPRN_AMR, \gpr2
- isync
-99:
- END_MMU_FTR_SECTION_NESTED_IFSET(MMU_FTR_RADIX_KUAP, 67)
-#endif
-.endm
-
-#else /* !__ASSEMBLY__ */
-
-#ifdef CONFIG_PPC_KUAP
-
-#include <asm/reg.h>
-
-/*
- * We support individually allowing read or write, but we don't support nesting
- * because that would require an expensive read/modify write of the AMR.
- */
-
-static inline void set_kuap(unsigned long value)
-{
- if (!early_mmu_has_feature(MMU_FTR_RADIX_KUAP))
- return;
-
- /*
- * ISA v3.0B says we need a CSI (Context Synchronising Instruction) both
- * before and after the move to AMR. See table 6 on page 1134.
- */
- isync();
- mtspr(SPRN_AMR, value);
- isync();
-}
-
-static inline void allow_user_access(void __user *to, const void __user *from,
- unsigned long size)
-{
- // This is written so we can resolve to a single case at build time
- if (__builtin_constant_p(to) && to == NULL)
- set_kuap(AMR_KUAP_BLOCK_WRITE);
- else if (__builtin_constant_p(from) && from == NULL)
- set_kuap(AMR_KUAP_BLOCK_READ);
- else
- set_kuap(0);
-}
-
-static inline void prevent_user_access(void __user *to, const void __user *from,
- unsigned long size)
-{
- set_kuap(AMR_KUAP_BLOCKED);
-}
-
-static inline bool bad_kuap_fault(struct pt_regs *regs, bool is_write)
-{
- return WARN(mmu_has_feature(MMU_FTR_RADIX_KUAP) &&
- (regs->kuap & (is_write ? AMR_KUAP_BLOCK_WRITE : AMR_KUAP_BLOCK_READ)),
- "Bug: %s fault blocked by AMR!", is_write ? "Write" : "Read");
-}
-#endif /* CONFIG_PPC_KUAP */
-
-#endif /* __ASSEMBLY__ */
-
-#endif /* _ASM_POWERPC_BOOK3S_64_KUP_RADIX_H */
diff --git a/arch/powerpc/include/asm/book3s/64/kup.h b/arch/powerpc/include/asm/book3s/64/kup.h
new file mode 100644
index 000000000000..497a7bd31ecc
--- /dev/null
+++ b/arch/powerpc/include/asm/book3s/64/kup.h
@@ -0,0 +1,418 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_POWERPC_BOOK3S_64_KUP_H
+#define _ASM_POWERPC_BOOK3S_64_KUP_H
+
+#include <linux/const.h>
+#include <asm/reg.h>
+
+#define AMR_KUAP_BLOCK_READ UL(0x5455555555555555)
+#define AMR_KUAP_BLOCK_WRITE UL(0xa8aaaaaaaaaaaaaa)
+#define AMR_KUEP_BLOCKED UL(0x5455555555555555)
+#define AMR_KUAP_BLOCKED (AMR_KUAP_BLOCK_READ | AMR_KUAP_BLOCK_WRITE)
+
+#ifdef __ASSEMBLY__
+
+.macro kuap_user_restore gpr1, gpr2
+#if defined(CONFIG_PPC_PKEY)
+ BEGIN_MMU_FTR_SECTION_NESTED(67)
+ b 100f // skip_restore_amr
+ END_MMU_FTR_SECTION_NESTED_IFCLR(MMU_FTR_PKEY, 67)
+ /*
+ * AMR and IAMR are going to be different when
+ * returning to userspace.
+ */
+ ld \gpr1, STACK_REGS_AMR(r1)
+
+ /*
+ * If kuap feature is not enabled, do the mtspr
+ * only if AMR value is different.
+ */
+ BEGIN_MMU_FTR_SECTION_NESTED(68)
+ mfspr \gpr2, SPRN_AMR
+ cmpd \gpr1, \gpr2
+ beq 99f
+ END_MMU_FTR_SECTION_NESTED_IFCLR(MMU_FTR_KUAP, 68)
+
+ isync
+ mtspr SPRN_AMR, \gpr1
+99:
+ /*
+ * Restore IAMR only when returning to userspace
+ */
+ ld \gpr1, STACK_REGS_IAMR(r1)
+
+ /*
+ * If kuep feature is not enabled, do the mtspr
+ * only if IAMR value is different.
+ */
+ BEGIN_MMU_FTR_SECTION_NESTED(69)
+ mfspr \gpr2, SPRN_IAMR
+ cmpd \gpr1, \gpr2
+ beq 100f
+ END_MMU_FTR_SECTION_NESTED_IFCLR(MMU_FTR_BOOK3S_KUEP, 69)
+
+ isync
+ mtspr SPRN_IAMR, \gpr1
+
+100: //skip_restore_amr
+ /* No isync required, see kuap_user_restore() */
+#endif
+.endm
+
+.macro kuap_kernel_restore gpr1, gpr2
+#if defined(CONFIG_PPC_PKEY)
+
+ BEGIN_MMU_FTR_SECTION_NESTED(67)
+ /*
+ * AMR is going to be mostly the same since we are
+ * returning to the kernel. Compare and do a mtspr.
+ */
+ ld \gpr2, STACK_REGS_AMR(r1)
+ mfspr \gpr1, SPRN_AMR
+ cmpd \gpr1, \gpr2
+ beq 100f
+ isync
+ mtspr SPRN_AMR, \gpr2
+ /*
+ * No isync required, see kuap_restore_amr()
+ * No need to restore IAMR when returning to kernel space.
+ */
+100:
+ END_MMU_FTR_SECTION_NESTED_IFSET(MMU_FTR_KUAP, 67)
+#endif
+.endm
+
+#ifdef CONFIG_PPC_KUAP
+.macro kuap_check_amr gpr1, gpr2
+#ifdef CONFIG_PPC_KUAP_DEBUG
+ BEGIN_MMU_FTR_SECTION_NESTED(67)
+ mfspr \gpr1, SPRN_AMR
+ /* Prevent access to userspace using any key values */
+ LOAD_REG_IMMEDIATE(\gpr2, AMR_KUAP_BLOCKED)
+999: tdne \gpr1, \gpr2
+ EMIT_WARN_ENTRY 999b, __FILE__, __LINE__, (BUGFLAG_WARNING | BUGFLAG_ONCE)
+ END_MMU_FTR_SECTION_NESTED_IFSET(MMU_FTR_KUAP, 67)
+#endif
+.endm
+#endif
+
+/*
+ * if (pkey) {
+ *
+ * save AMR -> stack;
+ * if (kuap) {
+ * if (AMR != BLOCKED)
+ * KUAP_BLOCKED -> AMR;
+ * }
+ * if (from_user) {
+ * save IAMR -> stack;
+ * if (kuep) {
+ * KUEP_BLOCKED ->IAMR
+ * }
+ * }
+ * return;
+ * }
+ *
+ * if (kuap) {
+ * if (from_kernel) {
+ * save AMR -> stack;
+ * if (AMR != BLOCKED)
+ * KUAP_BLOCKED -> AMR;
+ * }
+ *
+ * }
+ */
+.macro kuap_save_amr_and_lock gpr1, gpr2, use_cr, msr_pr_cr
+#if defined(CONFIG_PPC_PKEY)
+
+ /*
+ * if both pkey and kuap is disabled, nothing to do
+ */
+ BEGIN_MMU_FTR_SECTION_NESTED(68)
+ b 100f // skip_save_amr
+ END_MMU_FTR_SECTION_NESTED_IFCLR(MMU_FTR_PKEY | MMU_FTR_KUAP, 68)
+
+ /*
+ * if pkey is disabled and we are entering from userspace
+ * don't do anything.
+ */
+ BEGIN_MMU_FTR_SECTION_NESTED(67)
+ .ifnb \msr_pr_cr
+ /*
+ * Without pkey we are not changing AMR outside the kernel
+ * hence skip this completely.
+ */
+ bne \msr_pr_cr, 100f // from userspace
+ .endif
+ END_MMU_FTR_SECTION_NESTED_IFCLR(MMU_FTR_PKEY, 67)
+
+ /*
+ * pkey is enabled or pkey is disabled but entering from kernel
+ */
+ mfspr \gpr1, SPRN_AMR
+ std \gpr1, STACK_REGS_AMR(r1)
+
+ /*
+ * update kernel AMR with AMR_KUAP_BLOCKED only
+ * if KUAP feature is enabled
+ */
+ BEGIN_MMU_FTR_SECTION_NESTED(69)
+ LOAD_REG_IMMEDIATE(\gpr2, AMR_KUAP_BLOCKED)
+ cmpd \use_cr, \gpr1, \gpr2
+ beq \use_cr, 102f
+ /*
+ * We don't isync here because we very recently entered via an interrupt
+ */
+ mtspr SPRN_AMR, \gpr2
+ isync
+102:
+ END_MMU_FTR_SECTION_NESTED_IFSET(MMU_FTR_KUAP, 69)
+
+ /*
+ * if entering from kernel we don't need save IAMR
+ */
+ .ifnb \msr_pr_cr
+ beq \msr_pr_cr, 100f // from kernel space
+ mfspr \gpr1, SPRN_IAMR
+ std \gpr1, STACK_REGS_IAMR(r1)
+
+ /*
+ * update kernel IAMR with AMR_KUEP_BLOCKED only
+ * if KUEP feature is enabled
+ */
+ BEGIN_MMU_FTR_SECTION_NESTED(70)
+ LOAD_REG_IMMEDIATE(\gpr2, AMR_KUEP_BLOCKED)
+ mtspr SPRN_IAMR, \gpr2
+ isync
+ END_MMU_FTR_SECTION_NESTED_IFSET(MMU_FTR_BOOK3S_KUEP, 70)
+ .endif
+
+100: // skip_save_amr
+#endif
+.endm
+
+#else /* !__ASSEMBLY__ */
+
+#include <linux/jump_label.h>
+#include <linux/sched.h>
+
+DECLARE_STATIC_KEY_FALSE(uaccess_flush_key);
+
+#ifdef CONFIG_PPC_PKEY
+
+extern u64 __ro_after_init default_uamor;
+extern u64 __ro_after_init default_amr;
+extern u64 __ro_after_init default_iamr;
+
+#include <asm/mmu.h>
+#include <asm/ptrace.h>
+
+/* usage of kthread_use_mm() should inherit the
+ * AMR value of the operating address space. But, the AMR value is
+ * thread-specific and we inherit the address space and not thread
+ * access restrictions. Because of this ignore AMR value when accessing
+ * userspace via kernel thread.
+ */
+static __always_inline u64 current_thread_amr(void)
+{
+ if (current->thread.regs)
+ return current->thread.regs->amr;
+ return default_amr;
+}
+
+static __always_inline u64 current_thread_iamr(void)
+{
+ if (current->thread.regs)
+ return current->thread.regs->iamr;
+ return default_iamr;
+}
+#endif /* CONFIG_PPC_PKEY */
+
+#ifdef CONFIG_PPC_KUAP
+
+static __always_inline void kuap_user_restore(struct pt_regs *regs)
+{
+ bool restore_amr = false, restore_iamr = false;
+ unsigned long amr, iamr;
+
+ if (!mmu_has_feature(MMU_FTR_PKEY))
+ return;
+
+ if (!mmu_has_feature(MMU_FTR_KUAP)) {
+ amr = mfspr(SPRN_AMR);
+ if (amr != regs->amr)
+ restore_amr = true;
+ } else {
+ restore_amr = true;
+ }
+
+ if (!mmu_has_feature(MMU_FTR_BOOK3S_KUEP)) {
+ iamr = mfspr(SPRN_IAMR);
+ if (iamr != regs->iamr)
+ restore_iamr = true;
+ } else {
+ restore_iamr = true;
+ }
+
+
+ if (restore_amr || restore_iamr) {
+ isync();
+ if (restore_amr)
+ mtspr(SPRN_AMR, regs->amr);
+ if (restore_iamr)
+ mtspr(SPRN_IAMR, regs->iamr);
+ }
+ /*
+ * No isync required here because we are about to rfi
+ * back to previous context before any user accesses
+ * would be made, which is a CSI.
+ */
+}
+
+static __always_inline void __kuap_kernel_restore(struct pt_regs *regs, unsigned long amr)
+{
+ if (likely(regs->amr == amr))
+ return;
+
+ isync();
+ mtspr(SPRN_AMR, regs->amr);
+ /*
+ * No isync required here because we are about to rfi
+ * back to previous context before any user accesses
+ * would be made, which is a CSI.
+ *
+ * No need to restore IAMR when returning to kernel space.
+ */
+}
+
+static __always_inline unsigned long __kuap_get_and_assert_locked(void)
+{
+ unsigned long amr = mfspr(SPRN_AMR);
+
+ if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG)) /* kuap_check_amr() */
+ WARN_ON_ONCE(amr != AMR_KUAP_BLOCKED);
+ return amr;
+}
+#define __kuap_get_and_assert_locked __kuap_get_and_assert_locked
+
+/* __kuap_lock() not required, book3s/64 does that in ASM */
+
+/*
+ * We support individually allowing read or write, but we don't support nesting
+ * because that would require an expensive read/modify write of the AMR.
+ */
+
+static __always_inline unsigned long get_kuap(void)
+{
+ /*
+ * We return AMR_KUAP_BLOCKED when we don't support KUAP because
+ * prevent_user_access_return needs to return AMR_KUAP_BLOCKED to
+ * cause restore_user_access to do a flush.
+ *
+ * This has no effect in terms of actually blocking things on hash,
+ * so it doesn't break anything.
+ */
+ if (!mmu_has_feature(MMU_FTR_KUAP))
+ return AMR_KUAP_BLOCKED;
+
+ return mfspr(SPRN_AMR);
+}
+
+static __always_inline void set_kuap(unsigned long value)
+{
+ if (!mmu_has_feature(MMU_FTR_KUAP))
+ return;
+
+ /*
+ * ISA v3.0B says we need a CSI (Context Synchronising Instruction) both
+ * before and after the move to AMR. See table 6 on page 1134.
+ */
+ isync();
+ mtspr(SPRN_AMR, value);
+ isync();
+}
+
+static __always_inline bool
+__bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write)
+{
+ /*
+ * For radix this will be a storage protection fault (DSISR_PROTFAULT).
+ * For hash this will be a key fault (DSISR_KEYFAULT)
+ */
+ /*
+ * We do have exception table entry, but accessing the
+ * userspace results in fault. This could be because we
+ * didn't unlock the AMR or access is denied by userspace
+ * using a key value that blocks access. We are only interested
+ * in catching the use case of accessing without unlocking
+ * the AMR. Hence check for BLOCK_WRITE/READ against AMR.
+ */
+ if (is_write) {
+ return (regs->amr & AMR_KUAP_BLOCK_WRITE) == AMR_KUAP_BLOCK_WRITE;
+ }
+ return (regs->amr & AMR_KUAP_BLOCK_READ) == AMR_KUAP_BLOCK_READ;
+}
+
+static __always_inline void allow_user_access(void __user *to, const void __user *from,
+ unsigned long size, unsigned long dir)
+{
+ unsigned long thread_amr = 0;
+
+ // This is written so we can resolve to a single case at build time
+ BUILD_BUG_ON(!__builtin_constant_p(dir));
+
+ if (mmu_has_feature(MMU_FTR_PKEY))
+ thread_amr = current_thread_amr();
+
+ if (dir == KUAP_READ)
+ set_kuap(thread_amr | AMR_KUAP_BLOCK_WRITE);
+ else if (dir == KUAP_WRITE)
+ set_kuap(thread_amr | AMR_KUAP_BLOCK_READ);
+ else if (dir == KUAP_READ_WRITE)
+ set_kuap(thread_amr);
+ else
+ BUILD_BUG();
+}
+
+#else /* CONFIG_PPC_KUAP */
+
+static __always_inline unsigned long get_kuap(void)
+{
+ return AMR_KUAP_BLOCKED;
+}
+
+static __always_inline void set_kuap(unsigned long value) { }
+
+static __always_inline void allow_user_access(void __user *to, const void __user *from,
+ unsigned long size, unsigned long dir)
+{ }
+
+#endif /* !CONFIG_PPC_KUAP */
+
+static __always_inline void prevent_user_access(unsigned long dir)
+{
+ set_kuap(AMR_KUAP_BLOCKED);
+ if (static_branch_unlikely(&uaccess_flush_key))
+ do_uaccess_flush();
+}
+
+static __always_inline unsigned long prevent_user_access_return(void)
+{
+ unsigned long flags = get_kuap();
+
+ set_kuap(AMR_KUAP_BLOCKED);
+ if (static_branch_unlikely(&uaccess_flush_key))
+ do_uaccess_flush();
+
+ return flags;
+}
+
+static __always_inline void restore_user_access(unsigned long flags)
+{
+ set_kuap(flags);
+ if (static_branch_unlikely(&uaccess_flush_key) && flags == AMR_KUAP_BLOCKED)
+ do_uaccess_flush();
+}
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_POWERPC_BOOK3S_64_KUP_H */
diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 15b75005bc34..1c4eebbc69c9 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -18,7 +18,7 @@
* complete pgtable.h but only a portion of it.
*/
#include <asm/book3s/64/pgtable.h>
-#include <asm/bug.h>
+#include <asm/book3s/64/slice.h>
#include <asm/task_size_64.h>
#include <asm/cpu_has_feature.h>
@@ -86,8 +86,8 @@
#define HPTE_R_PP0 ASM_CONST(0x8000000000000000)
#define HPTE_R_TS ASM_CONST(0x4000000000000000)
#define HPTE_R_KEY_HI ASM_CONST(0x3000000000000000)
-#define HPTE_R_KEY_BIT0 ASM_CONST(0x2000000000000000)
-#define HPTE_R_KEY_BIT1 ASM_CONST(0x1000000000000000)
+#define HPTE_R_KEY_BIT4 ASM_CONST(0x2000000000000000)
+#define HPTE_R_KEY_BIT3 ASM_CONST(0x1000000000000000)
#define HPTE_R_RPN_SHIFT 12
#define HPTE_R_RPN ASM_CONST(0x0ffffffffffff000)
#define HPTE_R_RPN_3_0 ASM_CONST(0x01fffffffffff000)
@@ -103,8 +103,8 @@
#define HPTE_R_R ASM_CONST(0x0000000000000100)
#define HPTE_R_KEY_LO ASM_CONST(0x0000000000000e00)
#define HPTE_R_KEY_BIT2 ASM_CONST(0x0000000000000800)
-#define HPTE_R_KEY_BIT3 ASM_CONST(0x0000000000000400)
-#define HPTE_R_KEY_BIT4 ASM_CONST(0x0000000000000200)
+#define HPTE_R_KEY_BIT1 ASM_CONST(0x0000000000000400)
+#define HPTE_R_KEY_BIT0 ASM_CONST(0x0000000000000200)
#define HPTE_R_KEY (HPTE_R_KEY_LO | HPTE_R_KEY_HI)
#define HPTE_V_1TB_SEG ASM_CONST(0x4000000000000000)
@@ -452,7 +452,10 @@ static inline unsigned long hpt_hash(unsigned long vpn,
#define HPTE_LOCAL_UPDATE 0x1
#define HPTE_NOHPTE_UPDATE 0x2
+#define HPTE_USE_KERNEL_KEY 0x4
+long hpte_insert_repeating(unsigned long hash, unsigned long vpn, unsigned long pa,
+ unsigned long rlags, unsigned long vflags, int psize, int ssize);
extern int __hash_page_4K(unsigned long ea, unsigned long access,
unsigned long vsid, pte_t *ptep, unsigned long trap,
unsigned long flags, int ssize, int subpage_prot);
@@ -466,6 +469,8 @@ extern int hash_page_mm(struct mm_struct *mm, unsigned long ea,
unsigned long flags);
extern int hash_page(unsigned long ea, unsigned long access, unsigned long trap,
unsigned long dsisr);
+void low_hash_fault(struct pt_regs *regs, unsigned long address, int rc);
+int __hash_page(unsigned long trap, unsigned long ea, unsigned long dsisr, unsigned long msr);
int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
pte_t *ptep, unsigned long trap, unsigned long flags,
int ssize, unsigned int shift, unsigned int mmu_psize);
@@ -519,7 +524,14 @@ void slb_save_contents(struct slb_entry *slb_ptr);
void slb_dump_contents(struct slb_entry *slb_ptr);
extern void slb_vmalloc_update(void);
-extern void slb_set_size(u16 size);
+void preload_new_slb_context(unsigned long start, unsigned long sp);
+
+#ifdef CONFIG_PPC_64S_HASH_MMU
+void slb_set_size(u16 size);
+#else
+static inline void slb_set_size(u16 size) { }
+#endif
+
#endif /* __ASSEMBLY__ */
/*
@@ -577,8 +589,8 @@ extern void slb_set_size(u16 size);
* For vmalloc and memmap, we use just one context with 512TB. With 64 byte
* struct page size, we need ony 32 TB in memmap for 2PB (51 bits (MAX_PHYSMEM_BITS)).
*/
-#if (MAX_PHYSMEM_BITS > MAX_EA_BITS_PER_CONTEXT)
-#define MAX_KERNEL_CTX_CNT (1UL << (MAX_PHYSMEM_BITS - MAX_EA_BITS_PER_CONTEXT))
+#if (H_MAX_PHYSMEM_BITS > MAX_EA_BITS_PER_CONTEXT)
+#define MAX_KERNEL_CTX_CNT (1UL << (H_MAX_PHYSMEM_BITS - MAX_EA_BITS_PER_CONTEXT))
#else
#define MAX_KERNEL_CTX_CNT 1
#endif
@@ -600,8 +612,11 @@ extern void slb_set_size(u16 size);
*
*/
#define MAX_USER_CONTEXT ((ASM_CONST(1) << CONTEXT_BITS) - 2)
+
+// The + 2 accounts for INVALID_REGION and 1 more to avoid overlap with kernel
#define MIN_USER_CONTEXT (MAX_KERNEL_CTX_CNT + MAX_VMALLOC_CTX_CNT + \
- MAX_IO_CTX_CNT + MAX_VMEMMAP_CTX_CNT)
+ MAX_IO_CTX_CNT + MAX_VMEMMAP_CTX_CNT + 2)
+
/*
* For platforms that support on 65bit VA we limit the context bits
*/
@@ -790,7 +805,7 @@ static inline unsigned long get_vsid(unsigned long context, unsigned long ea,
}
/*
- * For kernel space, we use context ids as below
+ * For kernel space, we use context ids as
* below. Range is 512TB per context.
*
* 0x00001 - [ 0xc000000000000000 - 0xc001ffffffffffff]
@@ -839,6 +854,32 @@ static inline unsigned long get_kernel_vsid(unsigned long ea, int ssize)
unsigned htab_shift_for_mem_size(unsigned long mem_size);
-#endif /* __ASSEMBLY__ */
+enum slb_index {
+ LINEAR_INDEX = 0, /* Kernel linear map (0xc000000000000000) */
+ KSTACK_INDEX = 1, /* Kernel stack map */
+};
+
+#define slb_esid_mask(ssize) \
+ (((ssize) == MMU_SEGSIZE_256M) ? ESID_MASK : ESID_MASK_1T)
+static inline unsigned long mk_esid_data(unsigned long ea, int ssize,
+ enum slb_index index)
+{
+ return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | index;
+}
+
+static inline unsigned long __mk_vsid_data(unsigned long vsid, int ssize,
+ unsigned long flags)
+{
+ return (vsid << slb_vsid_shift(ssize)) | flags |
+ ((unsigned long)ssize << SLB_VSID_SSIZE_SHIFT);
+}
+
+static inline unsigned long mk_vsid_data(unsigned long ea, int ssize,
+ unsigned long flags)
+{
+ return __mk_vsid_data(get_kernel_vsid(ea, ssize), ssize, flags);
+}
+
+#endif /* __ASSEMBLY__ */
#endif /* _ASM_POWERPC_BOOK3S_64_MMU_HASH_H_ */
diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
index bb3deb76c951..fedbc5d38191 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -19,6 +19,7 @@ struct mmu_psize_def {
int penc[MMU_PAGE_COUNT]; /* HPTE encoding */
unsigned int tlbiel; /* tlbiel supported for that page size */
unsigned long avpnm; /* bits to mask out in AVPN in the HPTE */
+ unsigned long h_rpt_pgsize; /* H_RPT_INVALIDATE page size encoding */
union {
unsigned long sllp; /* SLB L||LP (exact mask to use in slbmte) */
unsigned long ap; /* Ap encoding used by PowerISA 3.0 */
@@ -27,21 +28,6 @@ struct mmu_psize_def {
extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
#endif /* __ASSEMBLY__ */
-/*
- * If we store section details in page->flags we can't increase the MAX_PHYSMEM_BITS
- * if we increase SECTIONS_WIDTH we will not store node details in page->flags and
- * page_to_nid does a page->section->node lookup
- * Hence only increase for VMEMMAP. Further depending on SPARSEMEM_EXTREME reduce
- * memory requirements with large number of sections.
- * 51 bits is the max physical real address on POWER9
- */
-#if defined(CONFIG_SPARSEMEM_VMEMMAP) && defined(CONFIG_SPARSEMEM_EXTREME) && \
- defined(CONFIG_PPC_64K_PAGES)
-#define MAX_PHYSMEM_BITS 51
-#else
-#define MAX_PHYSMEM_BITS 46
-#endif
-
/* 64-bit classic hash table MMU */
#include <asm/book3s/64/mmu-hash.h>
@@ -76,19 +62,22 @@ extern struct patb_entry *partition_tb;
#define PRTS_MASK 0x1f /* process table size field */
#define PRTB_MASK 0x0ffffffffffff000UL
+/* Number of supported LPID bits */
+extern unsigned int mmu_lpid_bits;
+
/* Number of supported PID bits */
extern unsigned int mmu_pid_bits;
/* Base PID to allocate from */
extern unsigned int mmu_base_pid;
+extern unsigned long __ro_after_init memory_block_size;
+
#define PRTB_SIZE_SHIFT (mmu_pid_bits + 4)
#define PRTB_ENTRIES (1ul << mmu_pid_bits)
-/*
- * Power9 currently only support 64K partition table size.
- */
-#define PATB_SIZE_SHIFT 16
+#define PATB_SIZE_SHIFT (mmu_lpid_bits + 4)
+#define PATB_ENTRIES (1ul << mmu_lpid_bits)
typedef unsigned long mm_context_id_t;
struct spinlock;
@@ -107,7 +96,9 @@ typedef struct {
* from EA and new context ids to build the new VAs.
*/
mm_context_id_t id;
+#ifdef CONFIG_PPC_64S_HASH_MMU
mm_context_id_t extended_id[TASK_SIZE_USER64/TASK_CONTEXT_SIZE];
+#endif
};
/* Number of bits in the mm_cpumask */
@@ -116,9 +107,14 @@ typedef struct {
/* Number of users of the external (Nest) MMU */
atomic_t copros;
+ /* Number of user space windows opened in process mm_context */
+ atomic_t vas_windows;
+
+#ifdef CONFIG_PPC_64S_HASH_MMU
struct hash_mm_context *hash_context;
+#endif
- unsigned long vdso_base;
+ void __user *vdso;
/*
* pagetable fragment support
*/
@@ -139,6 +135,7 @@ typedef struct {
#endif
} mm_context_t;
+#ifdef CONFIG_PPC_64S_HASH_MMU
static inline u16 mm_ctx_user_psize(mm_context_t *ctx)
{
return ctx->hash_context->user_psize;
@@ -196,19 +193,32 @@ static inline struct subpage_prot_table *mm_ctx_subpage_prot(mm_context_t *ctx)
/*
* The current system page and segment sizes
*/
-extern int mmu_linear_psize;
extern int mmu_virtual_psize;
extern int mmu_vmalloc_psize;
-extern int mmu_vmemmap_psize;
extern int mmu_io_psize;
+#else /* CONFIG_PPC_64S_HASH_MMU */
+#ifdef CONFIG_PPC_64K_PAGES
+#define mmu_virtual_psize MMU_PAGE_64K
+#else
+#define mmu_virtual_psize MMU_PAGE_4K
+#endif
+#endif
+extern int mmu_linear_psize;
+extern int mmu_vmemmap_psize;
/* MMU initialization */
void mmu_early_init_devtree(void);
void hash__early_init_devtree(void);
void radix__early_init_devtree(void);
+#ifdef CONFIG_PPC_PKEY
+void pkey_early_init_devtree(void);
+#else
+static inline void pkey_early_init_devtree(void) {}
+#endif
+
extern void hash__early_init_mmu(void);
extern void radix__early_init_mmu(void);
-static inline void early_init_mmu(void)
+static inline void __init early_init_mmu(void)
{
if (radix_enabled())
return radix__early_init_mmu();
@@ -225,24 +235,38 @@ static inline void early_init_mmu_secondary(void)
extern void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base,
phys_addr_t first_memblock_size);
-extern void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
- phys_addr_t first_memblock_size);
static inline void setup_initial_memory_limit(phys_addr_t first_memblock_base,
phys_addr_t first_memblock_size)
{
- if (early_radix_enabled())
- return radix__setup_initial_memory_limit(first_memblock_base,
- first_memblock_size);
- return hash__setup_initial_memory_limit(first_memblock_base,
- first_memblock_size);
+ /*
+ * Hash has more strict restrictions. At this point we don't
+ * know which translations we will pick. Hence go with hash
+ * restrictions.
+ */
+ if (!early_radix_enabled())
+ hash__setup_initial_memory_limit(first_memblock_base,
+ first_memblock_size);
}
#ifdef CONFIG_PPC_PSERIES
-extern void radix_init_pseries(void);
+void __init radix_init_pseries(void);
#else
-static inline void radix_init_pseries(void) { };
+static inline void radix_init_pseries(void) { }
#endif
+#ifdef CONFIG_HOTPLUG_CPU
+#define arch_clear_mm_cpumask_cpu(cpu, mm) \
+ do { \
+ if (cpumask_test_cpu(cpu, mm_cpumask(mm))) { \
+ dec_mm_active_cpus(mm); \
+ cpumask_clear_cpu(cpu, mm_cpumask(mm)); \
+ } \
+ } while (0)
+
+void cleanup_cpu_mmu_context(void);
+#endif
+
+#ifdef CONFIG_PPC_64S_HASH_MMU
static inline int get_user_context(mm_context_t *ctx, unsigned long ea)
{
int index = ea >> MAX_EA_BITS_PER_CONTEXT;
@@ -262,6 +286,7 @@ static inline unsigned long get_user_vsid(mm_context_t *ctx,
return get_vsid(context, ea, ssize);
}
+#endif
#endif /* __ASSEMBLY__ */
#endif /* _ASM_POWERPC_BOOK3S_64_MMU_H_ */
diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h
index f6968c811026..dd2cff53a111 100644
--- a/arch/powerpc/include/asm/book3s/64/pgalloc.h
+++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h
@@ -19,9 +19,7 @@ extern struct vmemmap_backing *vmemmap_list;
extern pmd_t *pmd_fragment_alloc(struct mm_struct *, unsigned long);
extern void pmd_fragment_free(unsigned long *);
extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift);
-#ifdef CONFIG_SMP
extern void __tlb_remove_table(void *_table);
-#endif
void pte_frag_destroy(void *pte_frag);
static inline pgd_t *radix__pgd_alloc(struct mm_struct *mm)
@@ -87,9 +85,9 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd);
}
-static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
+static inline void p4d_populate(struct mm_struct *mm, p4d_t *pgd, pud_t *pud)
{
- *pgd = __pgd(__pgtable_ptr_val(pud) | PGD_VAL_BITS);
+ *pgd = __p4d(__pgtable_ptr_val(pud) | PGD_VAL_BITS);
}
static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
@@ -109,9 +107,25 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
return pud;
}
+static inline void __pud_free(pud_t *pud)
+{
+ struct page *page = virt_to_page(pud);
+
+ /*
+ * Early pud pages allocated via memblock allocator
+ * can't be directly freed to slab. KFENCE pages have
+ * both reserved and slab flags set so need to be freed
+ * kmem_cache_free.
+ */
+ if (PageReserved(page) && !PageSlab(page))
+ free_reserved_page(page);
+ else
+ kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), pud);
+}
+
static inline void pud_free(struct mm_struct *mm, pud_t *pud)
{
- kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), pud);
+ return __pud_free(pud);
}
static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-4k.h b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h
index 4e697bc2f4cd..48f21820afe2 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h
@@ -26,16 +26,6 @@ static inline int pud_huge(pud_t pud)
return 0;
}
-static inline int pgd_huge(pgd_t pgd)
-{
- /*
- * leaf pte for huge page
- */
- if (radix_enabled())
- return !!(pgd_raw(pgd) & cpu_to_be64(_PAGE_PTE));
- return 0;
-}
-#define pgd_huge pgd_huge
/*
* With radix , we have hugepage ptes in the pud and pmd entries. We don't
* need to setup hugepage directory for them. Our pte and page directory format
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h
index 34d1018896b3..2fce3498b000 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h
@@ -30,15 +30,6 @@ static inline int pud_huge(pud_t pud)
return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE));
}
-static inline int pgd_huge(pgd_t pgd)
-{
- /*
- * leaf pte for huge page
- */
- return !!(pgd_raw(pgd) & cpu_to_be64(_PAGE_PTE));
-}
-#define pgd_huge pgd_huge
-
/*
* With 64k page size, we have hugepage ptes in the pgd and pmd entries. We don't
* need to setup hugepage directory for them. Our pte and page directory format
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index b01624e5c467..927d585652bc 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -2,23 +2,21 @@
#ifndef _ASM_POWERPC_BOOK3S_64_PGTABLE_H_
#define _ASM_POWERPC_BOOK3S_64_PGTABLE_H_
-#include <asm-generic/5level-fixup.h>
+#include <asm-generic/pgtable-nop4d.h>
#ifndef __ASSEMBLY__
#include <linux/mmdebug.h>
#include <linux/bug.h>
+#include <linux/sizes.h>
#endif
/*
* Common bits between hash and Radix page table
*/
-#define _PAGE_BIT_SWAP_TYPE 0
#define _PAGE_EXEC 0x00001 /* execute permission */
#define _PAGE_WRITE 0x00002 /* write access allowed */
#define _PAGE_READ 0x00004 /* read access allowed */
-#define _PAGE_RW (_PAGE_READ | _PAGE_WRITE)
-#define _PAGE_RWX (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC)
#define _PAGE_PRIVILEGED 0x00008 /* kernel access only */
#define _PAGE_SAO 0x00010 /* Strong access order */
#define _PAGE_NON_IDEMPOTENT 0x00020 /* non idempotent memory */
@@ -32,11 +30,13 @@
#define _RPAGE_SW1 0x00800
#define _RPAGE_SW2 0x00400
#define _RPAGE_SW3 0x00200
-#define _RPAGE_RSV1 0x1000000000000000UL
-#define _RPAGE_RSV2 0x0800000000000000UL
-#define _RPAGE_RSV3 0x0400000000000000UL
-#define _RPAGE_RSV4 0x0200000000000000UL
-#define _RPAGE_RSV5 0x00040UL
+#define _RPAGE_RSV1 0x00040UL
+
+#define _RPAGE_PKEY_BIT4 0x1000000000000000UL
+#define _RPAGE_PKEY_BIT3 0x0800000000000000UL
+#define _RPAGE_PKEY_BIT2 0x0400000000000000UL
+#define _RPAGE_PKEY_BIT1 0x0200000000000000UL
+#define _RPAGE_PKEY_BIT0 0x0100000000000000UL
#define _PAGE_PTE 0x4000000000000000UL /* distinguishes PTEs from pointers */
#define _PAGE_PRESENT 0x8000000000000000UL /* pte contains a translation */
@@ -58,13 +58,12 @@
*/
#define _RPAGE_RPN0 0x01000
#define _RPAGE_RPN1 0x02000
-#define _RPAGE_RPN44 0x0100000000000000UL
#define _RPAGE_RPN43 0x0080000000000000UL
#define _RPAGE_RPN42 0x0040000000000000UL
#define _RPAGE_RPN41 0x0020000000000000UL
/* Max physical address bit as per radix table */
-#define _RPAGE_PA_MAX 57
+#define _RPAGE_PA_MAX 56
/*
* Max physical address bit we will use for now.
@@ -103,6 +102,7 @@
* and every thing below PAGE_SHIFT;
*/
#define PTE_RPN_MASK (((1UL << _PAGE_PA_MAX) - 1) & (PAGE_MASK))
+#define PTE_RPN_SHIFT PAGE_SHIFT
/*
* set of bits not changed in pmd_modify. Even though we have hash specific bits
* in here, on radix we expect them to be zero.
@@ -115,8 +115,8 @@
*/
#define _PAGE_KERNEL_RW (_PAGE_PRIVILEGED | _PAGE_RW | _PAGE_DIRTY)
#define _PAGE_KERNEL_RO (_PAGE_PRIVILEGED | _PAGE_READ)
-#define _PAGE_KERNEL_RWX (_PAGE_PRIVILEGED | _PAGE_DIRTY | \
- _PAGE_RW | _PAGE_EXEC)
+#define _PAGE_KERNEL_ROX (_PAGE_PRIVILEGED | _PAGE_READ | _PAGE_EXEC)
+#define _PAGE_KERNEL_RWX (_PAGE_PRIVILEGED | _PAGE_DIRTY | _PAGE_RW | _PAGE_EXEC)
/*
* _PAGE_CHG_MASK masks of bits that are to be preserved across
* pgprot changes
@@ -125,8 +125,6 @@
_PAGE_ACCESSED | _PAGE_SPECIAL | _PAGE_PTE | \
_PAGE_SOFT_DIRTY | _PAGE_DEVMAP)
-#define H_PTE_PKEY (H_PTE_PKEY_BIT0 | H_PTE_PKEY_BIT1 | H_PTE_PKEY_BIT2 | \
- H_PTE_PKEY_BIT3 | H_PTE_PKEY_BIT4)
/*
* We define 2 sets of base prot bits, one for basic pages (ie,
* cacheable kernel and user pages) and one for non cacheable
@@ -136,48 +134,16 @@
#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED)
#define _PAGE_BASE (_PAGE_BASE_NC)
-/* Permission masks used to generate the __P and __S table,
- *
- * Note:__pgprot is defined in arch/powerpc/include/asm/page.h
- *
- * Write permissions imply read permissions for now (we could make write-only
- * pages on BookE but we don't bother for now). Execute permission control is
- * possible on platforms that define _PAGE_EXEC
- */
-#define PAGE_NONE __pgprot(_PAGE_BASE | _PAGE_PRIVILEGED)
-#define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_RW)
-#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_RW | _PAGE_EXEC)
-#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_READ)
-#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_EXEC)
-#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_READ)
-#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_EXEC)
+#include <asm/pgtable-masks.h>
/* Permission masks used for kernel mappings */
#define PAGE_KERNEL __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW)
-#define PAGE_KERNEL_NC __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \
- _PAGE_TOLERANT)
-#define PAGE_KERNEL_NCG __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \
- _PAGE_NON_IDEMPOTENT)
+#define PAGE_KERNEL_NC __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | _PAGE_TOLERANT)
+#define PAGE_KERNEL_NCG __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | _PAGE_NON_IDEMPOTENT)
#define PAGE_KERNEL_X __pgprot(_PAGE_BASE | _PAGE_KERNEL_RWX)
#define PAGE_KERNEL_RO __pgprot(_PAGE_BASE | _PAGE_KERNEL_RO)
#define PAGE_KERNEL_ROX __pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX)
-/*
- * Protection used for kernel text. We want the debuggers to be able to
- * set breakpoints anywhere, so don't write protect the kernel text
- * on platforms where such control is possible.
- */
-#if defined(CONFIG_KGDB) || defined(CONFIG_XMON) || defined(CONFIG_BDI_SWITCH) || \
- defined(CONFIG_KPROBES) || defined(CONFIG_DYNAMIC_FTRACE)
-#define PAGE_KERNEL_TEXT PAGE_KERNEL_X
-#else
-#define PAGE_KERNEL_TEXT PAGE_KERNEL_ROX
-#endif
-
-/* Make modules code happy. We don't set RO yet */
-#define PAGE_KERNEL_EXEC PAGE_KERNEL_X
-#define PAGE_AGP (PAGE_KERNEL_NC)
-
#ifndef __ASSEMBLY__
/*
* page table defines
@@ -231,6 +197,12 @@ extern unsigned long __pmd_frag_size_shift;
#define PTRS_PER_PUD (1 << PUD_INDEX_SIZE)
#define PTRS_PER_PGD (1 << PGD_INDEX_SIZE)
+#define MAX_PTRS_PER_PTE ((H_PTRS_PER_PTE > R_PTRS_PER_PTE) ? H_PTRS_PER_PTE : R_PTRS_PER_PTE)
+#define MAX_PTRS_PER_PMD ((H_PTRS_PER_PMD > R_PTRS_PER_PMD) ? H_PTRS_PER_PMD : R_PTRS_PER_PMD)
+#define MAX_PTRS_PER_PUD ((H_PTRS_PER_PUD > R_PTRS_PER_PUD) ? H_PTRS_PER_PUD : R_PTRS_PER_PUD)
+#define MAX_PTRS_PER_PGD (1 << (H_PGD_INDEX_SIZE > RADIX_PGD_INDEX_SIZE ? \
+ H_PGD_INDEX_SIZE : RADIX_PGD_INDEX_SIZE))
+
/* PMD_SHIFT determines what a second-level page table entry can map */
#define PMD_SHIFT (PAGE_SHIFT + PTE_INDEX_SIZE)
#define PMD_SIZE (1UL << PMD_SHIFT)
@@ -251,7 +223,7 @@ extern unsigned long __pmd_frag_size_shift;
/* Bits to mask out from a PUD to get to the PMD page */
#define PUD_MASKED_BITS 0xc0000000000000ffUL
/* Bits to mask out from a PGD to get to the PUD page */
-#define PGD_MASKED_BITS 0xc0000000000000ffUL
+#define P4D_MASKED_BITS 0xc0000000000000ffUL
/*
* Used as an indicator for rcu callback functions
@@ -295,6 +267,13 @@ extern unsigned long pci_io_base;
#include <asm/book3s/64/hash.h>
#include <asm/book3s/64/radix.h>
+#if H_MAX_PHYSMEM_BITS > R_MAX_PHYSMEM_BITS
+#define MAX_PHYSMEM_BITS H_MAX_PHYSMEM_BITS
+#else
+#define MAX_PHYSMEM_BITS R_MAX_PHYSMEM_BITS
+#endif
+
+
#ifdef CONFIG_PPC_64K_PAGES
#include <asm/book3s/64/pgtable-64k.h>
#else
@@ -317,10 +296,9 @@ extern unsigned long pci_io_base;
#define PHB_IO_END (KERN_IO_START + FULL_IO_SIZE)
#define IOREMAP_BASE (PHB_IO_END)
#define IOREMAP_START (ioremap_bot)
-#define IOREMAP_END (KERN_IO_END)
-
-/* Advertise special mapping type for AGP */
-#define HAVE_PAGE_AGP
+#define IOREMAP_END (KERN_IO_END - FIXADDR_SIZE)
+#define FIXADDR_SIZE SZ_32M
+#define FIXADDR_TOP (IOREMAP_END + FIXADDR_SIZE)
#ifndef __ASSEMBLY__
@@ -382,40 +360,34 @@ static inline int __ptep_test_and_clear_young(struct mm_struct *mm,
#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
#define ptep_test_and_clear_young(__vma, __addr, __ptep) \
({ \
- int __r; \
- __r = __ptep_test_and_clear_young((__vma)->vm_mm, __addr, __ptep); \
- __r; \
+ __ptep_test_and_clear_young((__vma)->vm_mm, __addr, __ptep); \
})
-static inline int __pte_write(pte_t pte)
-{
- return !!(pte_raw(pte) & cpu_to_be64(_PAGE_WRITE));
-}
+/*
+ * On Book3S CPUs, clearing the accessed bit without a TLB flush
+ * doesn't cause data corruption. [ It could cause incorrect
+ * page aging and the (mistaken) reclaim of hot pages, but the
+ * chance of that should be relatively low. ]
+ *
+ * So as a performance optimization don't flush the TLB when
+ * clearing the accessed bit, it will eventually be flushed by
+ * a context switch or a VM operation anyway. [ In the rare
+ * event of it not getting flushed for a long time the delay
+ * shouldn't really matter because there's no real memory
+ * pressure for swapout to react to. ]
+ *
+ * Note: this optimisation also exists in pte_needs_flush() and
+ * huge_pmd_needs_flush().
+ */
+#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
+#define ptep_clear_flush_young ptep_test_and_clear_young
-#ifdef CONFIG_NUMA_BALANCING
-#define pte_savedwrite pte_savedwrite
-static inline bool pte_savedwrite(pte_t pte)
-{
- /*
- * Saved write ptes are prot none ptes that doesn't have
- * privileged bit sit. We mark prot none as one which has
- * present and pviliged bit set and RWX cleared. To mark
- * protnone which used to have _PAGE_WRITE set we clear
- * the privileged bit.
- */
- return !(pte_raw(pte) & cpu_to_be64(_PAGE_RWX | _PAGE_PRIVILEGED));
-}
-#else
-#define pte_savedwrite pte_savedwrite
-static inline bool pte_savedwrite(pte_t pte)
-{
- return false;
-}
-#endif
+#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
+#define pmdp_clear_flush_young pmdp_test_and_clear_young
static inline int pte_write(pte_t pte)
{
- return __pte_write(pte) || pte_savedwrite(pte);
+ return !!(pte_raw(pte) & cpu_to_be64(_PAGE_WRITE));
}
static inline int pte_read(pte_t pte)
@@ -427,24 +399,16 @@ static inline int pte_read(pte_t pte)
static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
- if (__pte_write(*ptep))
+ if (pte_write(*ptep))
pte_update(mm, addr, ptep, _PAGE_WRITE, 0, 0);
- else if (unlikely(pte_savedwrite(*ptep)))
- pte_update(mm, addr, ptep, 0, _PAGE_PRIVILEGED, 0);
}
#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
- /*
- * We should not find protnone for hugetlb, but this complete the
- * interface.
- */
- if (__pte_write(*ptep))
+ if (pte_write(*ptep))
pte_update(mm, addr, ptep, _PAGE_WRITE, 0, 1);
- else if (unlikely(pte_savedwrite(*ptep)))
- pte_update(mm, addr, ptep, 0, _PAGE_PRIVILEGED, 1);
}
#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
@@ -521,38 +485,14 @@ static inline int pte_protnone(pte_t pte)
return (pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT | _PAGE_PTE | _PAGE_RWX)) ==
cpu_to_be64(_PAGE_PRESENT | _PAGE_PTE);
}
+#endif /* CONFIG_NUMA_BALANCING */
-#define pte_mk_savedwrite pte_mk_savedwrite
-static inline pte_t pte_mk_savedwrite(pte_t pte)
+static inline bool pte_hw_valid(pte_t pte)
{
- /*
- * Used by Autonuma subsystem to preserve the write bit
- * while marking the pte PROT_NONE. Only allow this
- * on PROT_NONE pte
- */
- VM_BUG_ON((pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT | _PAGE_RWX | _PAGE_PRIVILEGED)) !=
- cpu_to_be64(_PAGE_PRESENT | _PAGE_PRIVILEGED));
- return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_PRIVILEGED));
+ return (pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT | _PAGE_PTE)) ==
+ cpu_to_be64(_PAGE_PRESENT | _PAGE_PTE);
}
-#define pte_clear_savedwrite pte_clear_savedwrite
-static inline pte_t pte_clear_savedwrite(pte_t pte)
-{
- /*
- * Used by KSM subsystem to make a protnone pte readonly.
- */
- VM_BUG_ON(!pte_protnone(pte));
- return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_PRIVILEGED));
-}
-#else
-#define pte_clear_savedwrite pte_clear_savedwrite
-static inline pte_t pte_clear_savedwrite(pte_t pte)
-{
- VM_WARN_ON(1);
- return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_WRITE));
-}
-#endif /* CONFIG_NUMA_BALANCING */
-
static inline int pte_present(pte_t pte)
{
/*
@@ -561,12 +501,11 @@ static inline int pte_present(pte_t pte)
* invalid during ptep_set_access_flags. Hence we look for _PAGE_INVALID
* if we find _PAGE_PRESENT cleared.
*/
- return !!(pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT | _PAGE_INVALID));
-}
-static inline bool pte_hw_valid(pte_t pte)
-{
- return !!(pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT));
+ if (pte_hw_valid(pte))
+ return true;
+ return (pte_raw(pte) & cpu_to_be64(_PAGE_INVALID | _PAGE_PTE)) ==
+ cpu_to_be64(_PAGE_INVALID | _PAGE_PTE);
}
#ifdef CONFIG_PPC_MEM_KEYS
@@ -587,8 +526,8 @@ static inline bool pte_user(pte_t pte)
static inline bool pte_access_permitted(pte_t pte, bool write)
{
/*
- * _PAGE_READ is needed for any access and will be
- * cleared for PROT_NONE
+ * _PAGE_READ is needed for any access and will be cleared for
+ * PROT_NONE. Execute-only mapping via PROT_EXEC also returns false.
*/
if (!pte_present(pte) || !pte_user(pte) || !pte_read(pte))
return false;
@@ -611,19 +550,12 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot)
VM_BUG_ON(pfn >> (64 - PAGE_SHIFT));
VM_BUG_ON((pfn << PAGE_SHIFT) & ~PTE_RPN_MASK);
- return __pte(((pte_basic_t)pfn << PAGE_SHIFT) | pgprot_val(pgprot));
-}
-
-static inline unsigned long pte_pfn(pte_t pte)
-{
- return (pte_val(pte) & PTE_RPN_MASK) >> PAGE_SHIFT;
+ return __pte(((pte_basic_t)pfn << PAGE_SHIFT) | pgprot_val(pgprot) | _PAGE_PTE);
}
/* Generic modifiers for PTE bits */
static inline pte_t pte_wrprotect(pte_t pte)
{
- if (unlikely(pte_savedwrite(pte)))
- return pte_clear_savedwrite(pte);
return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_WRITE));
}
@@ -647,12 +579,7 @@ static inline pte_t pte_mkexec(pte_t pte)
return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_EXEC));
}
-static inline pte_t pte_mkpte(pte_t pte)
-{
- return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_PTE));
-}
-
-static inline pte_t pte_mkwrite(pte_t pte)
+static inline pte_t pte_mkwrite_novma(pte_t pte)
{
/*
* write implies read, hence set both
@@ -685,16 +612,6 @@ static inline pte_t pte_mkdevmap(pte_t pte)
return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_SPECIAL | _PAGE_DEVMAP));
}
-static inline pte_t pte_mkprivileged(pte_t pte)
-{
- return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_PRIVILEGED));
-}
-
-static inline pte_t pte_mkuser(pte_t pte)
-{
- return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_PRIVILEGED));
-}
-
/*
* This is potentially called with a pmd as the argument, in which case it's not
* safe to check _PAGE_DEVMAP unless we also confirm that _PAGE_PTE is set.
@@ -703,7 +620,7 @@ static inline pte_t pte_mkuser(pte_t pte)
*/
static inline int pte_devmap(pte_t pte)
{
- u64 mask = cpu_to_be64(_PAGE_DEVMAP | _PAGE_PTE);
+ __be64 mask = cpu_to_be64(_PAGE_DEVMAP | _PAGE_PTE);
return (pte_raw(pte) & mask) == mask;
}
@@ -722,17 +639,17 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
* Don't have overlapping bits with _PAGE_HPTEFLAGS \
* We filter HPTEFLAGS on set_pte. \
*/ \
- BUILD_BUG_ON(_PAGE_HPTEFLAGS & (0x1f << _PAGE_BIT_SWAP_TYPE)); \
+ BUILD_BUG_ON(_PAGE_HPTEFLAGS & SWP_TYPE_MASK); \
BUILD_BUG_ON(_PAGE_HPTEFLAGS & _PAGE_SWP_SOFT_DIRTY); \
+ BUILD_BUG_ON(_PAGE_HPTEFLAGS & _PAGE_SWP_EXCLUSIVE); \
} while (0)
#define SWP_TYPE_BITS 5
-#define __swp_type(x) (((x).val >> _PAGE_BIT_SWAP_TYPE) \
- & ((1UL << SWP_TYPE_BITS) - 1))
+#define SWP_TYPE_MASK ((1UL << SWP_TYPE_BITS) - 1)
+#define __swp_type(x) ((x).val & SWP_TYPE_MASK)
#define __swp_offset(x) (((x).val & PTE_RPN_MASK) >> PAGE_SHIFT)
#define __swp_entry(type, offset) ((swp_entry_t) { \
- ((type) << _PAGE_BIT_SWAP_TYPE) \
- | (((offset) << PAGE_SHIFT) & PTE_RPN_MASK)})
+ (type) | (((offset) << PAGE_SHIFT) & PTE_RPN_MASK)})
/*
* swp_entry_t must be independent of pte bits. We build a swp_entry_t from
* swap type and offset we get from swap and convert that to pte to find a
@@ -745,11 +662,13 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
#define __swp_entry_to_pmd(x) (pte_pmd(__swp_entry_to_pte(x)))
#ifdef CONFIG_MEM_SOFT_DIRTY
-#define _PAGE_SWP_SOFT_DIRTY (1UL << (SWP_TYPE_BITS + _PAGE_BIT_SWAP_TYPE))
+#define _PAGE_SWP_SOFT_DIRTY _PAGE_SOFT_DIRTY
#else
#define _PAGE_SWP_SOFT_DIRTY 0UL
#endif /* CONFIG_MEM_SOFT_DIRTY */
+#define _PAGE_SWP_EXCLUSIVE _PAGE_NON_IDEMPOTENT
+
#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
{
@@ -767,6 +686,21 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
}
#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
+static inline pte_t pte_swp_mkexclusive(pte_t pte)
+{
+ return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_SWP_EXCLUSIVE));
+}
+
+static inline int pte_swp_exclusive(pte_t pte)
+{
+ return !!(pte_raw(pte) & cpu_to_be64(_PAGE_SWP_EXCLUSIVE));
+}
+
+static inline pte_t pte_swp_clear_exclusive(pte_t pte)
+{
+ return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_SWP_EXCLUSIVE));
+}
+
static inline bool check_pte_access(unsigned long access, unsigned long ptev)
{
/*
@@ -815,6 +749,14 @@ static inline int pte_none(pte_t pte)
static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte, int percpu)
{
+
+ VM_WARN_ON(!(pte_raw(pte) & cpu_to_be64(_PAGE_PTE)));
+ /*
+ * Keep the _PAGE_PTE added till we are sure we handle _PAGE_PTE
+ * in all the callers.
+ */
+ pte = __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_PTE));
+
if (radix_enabled())
return radix__set_pte_at(mm, addr, ptep, pte, percpu);
return hash__set_pte_at(mm, addr, ptep, pte, percpu);
@@ -862,6 +804,13 @@ static inline bool pte_ci(pte_t pte)
static inline void pmd_clear(pmd_t *pmdp)
{
+ if (IS_ENABLED(CONFIG_DEBUG_VM) && !radix_enabled()) {
+ /*
+ * Don't use this if we can possibly have a hash page table
+ * entry mapping this.
+ */
+ WARN_ON((pmd_val(*pmdp) & (H_PAGE_HASHPTE | _PAGE_PTE)) == (H_PAGE_HASHPTE | _PAGE_PTE));
+ }
*pmdp = __pmd(0);
}
@@ -910,6 +859,13 @@ static inline int pmd_bad(pmd_t pmd)
static inline void pud_clear(pud_t *pudp)
{
+ if (IS_ENABLED(CONFIG_DEBUG_VM) && !radix_enabled()) {
+ /*
+ * Don't use this if we can possibly have a hash page table
+ * entry mapping this.
+ */
+ WARN_ON((pud_val(*pudp) & (H_PAGE_HASHPTE | _PAGE_PTE)) == (H_PAGE_HASHPTE | _PAGE_PTE));
+ }
*pudp = __pud(0);
}
@@ -934,8 +890,29 @@ static inline pud_t pte_pud(pte_t pte)
{
return __pud_raw(pte_raw(pte));
}
+
+static inline pte_t *pudp_ptep(pud_t *pud)
+{
+ return (pte_t *)pud;
+}
+
+#define pud_pfn(pud) pte_pfn(pud_pte(pud))
+#define pud_dirty(pud) pte_dirty(pud_pte(pud))
+#define pud_young(pud) pte_young(pud_pte(pud))
+#define pud_mkold(pud) pte_pud(pte_mkold(pud_pte(pud)))
+#define pud_wrprotect(pud) pte_pud(pte_wrprotect(pud_pte(pud)))
+#define pud_mkdirty(pud) pte_pud(pte_mkdirty(pud_pte(pud)))
+#define pud_mkclean(pud) pte_pud(pte_mkclean(pud_pte(pud)))
+#define pud_mkyoung(pud) pte_pud(pte_mkyoung(pud_pte(pud)))
+#define pud_mkwrite(pud) pte_pud(pte_mkwrite_novma(pud_pte(pud)))
#define pud_write(pud) pte_write(pud_pte(pud))
+#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
+#define pud_soft_dirty(pmd) pte_soft_dirty(pud_pte(pud))
+#define pud_mksoft_dirty(pmd) pte_pud(pte_mksoft_dirty(pud_pte(pud)))
+#define pud_clear_soft_dirty(pmd) pte_pud(pte_clear_soft_dirty(pud_pte(pud)))
+#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
+
static inline int pud_bad(pud_t pud)
{
if (radix_enabled())
@@ -949,84 +926,67 @@ static inline bool pud_access_permitted(pud_t pud, bool write)
return pte_access_permitted(pud_pte(pud), write);
}
-#define pgd_write(pgd) pte_write(pgd_pte(pgd))
+#define __p4d_raw(x) ((p4d_t) { __pgd_raw(x) })
+static inline __be64 p4d_raw(p4d_t x)
+{
+ return pgd_raw(x.pgd);
+}
+
+#define p4d_write(p4d) pte_write(p4d_pte(p4d))
-static inline void pgd_clear(pgd_t *pgdp)
+static inline void p4d_clear(p4d_t *p4dp)
{
- *pgdp = __pgd(0);
+ *p4dp = __p4d(0);
}
-static inline int pgd_none(pgd_t pgd)
+static inline int p4d_none(p4d_t p4d)
{
- return !pgd_raw(pgd);
+ return !p4d_raw(p4d);
}
-static inline int pgd_present(pgd_t pgd)
+static inline int p4d_present(p4d_t p4d)
{
- return !!(pgd_raw(pgd) & cpu_to_be64(_PAGE_PRESENT));
+ return !!(p4d_raw(p4d) & cpu_to_be64(_PAGE_PRESENT));
}
-static inline pte_t pgd_pte(pgd_t pgd)
+static inline pte_t p4d_pte(p4d_t p4d)
{
- return __pte_raw(pgd_raw(pgd));
+ return __pte_raw(p4d_raw(p4d));
}
-static inline pgd_t pte_pgd(pte_t pte)
+static inline p4d_t pte_p4d(pte_t pte)
{
- return __pgd_raw(pte_raw(pte));
+ return __p4d_raw(pte_raw(pte));
}
-static inline int pgd_bad(pgd_t pgd)
+static inline int p4d_bad(p4d_t p4d)
{
if (radix_enabled())
- return radix__pgd_bad(pgd);
- return hash__pgd_bad(pgd);
+ return radix__p4d_bad(p4d);
+ return hash__p4d_bad(p4d);
}
-#define pgd_access_permitted pgd_access_permitted
-static inline bool pgd_access_permitted(pgd_t pgd, bool write)
+#define p4d_access_permitted p4d_access_permitted
+static inline bool p4d_access_permitted(p4d_t p4d, bool write)
{
- return pte_access_permitted(pgd_pte(pgd), write);
+ return pte_access_permitted(p4d_pte(p4d), write);
}
-extern struct page *pgd_page(pgd_t pgd);
+extern struct page *p4d_page(p4d_t p4d);
/* Pointers in the page table tree are physical addresses */
#define __pgtable_ptr_val(ptr) __pa(ptr)
-#define pmd_page_vaddr(pmd) __va(pmd_val(pmd) & ~PMD_MASKED_BITS)
-#define pud_page_vaddr(pud) __va(pud_val(pud) & ~PUD_MASKED_BITS)
-#define pgd_page_vaddr(pgd) __va(pgd_val(pgd) & ~PGD_MASKED_BITS)
-
-#define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & (PTRS_PER_PGD - 1))
-#define pud_index(address) (((address) >> (PUD_SHIFT)) & (PTRS_PER_PUD - 1))
-#define pmd_index(address) (((address) >> (PMD_SHIFT)) & (PTRS_PER_PMD - 1))
-#define pte_index(address) (((address) >> (PAGE_SHIFT)) & (PTRS_PER_PTE - 1))
-
-/*
- * Find an entry in a page-table-directory. We combine the address region
- * (the high order N bits) and the pgd portion of the address.
- */
-
-#define pgd_offset(mm, address) ((mm)->pgd + pgd_index(address))
-
-#define pud_offset(pgdp, addr) \
- (((pud_t *) pgd_page_vaddr(*(pgdp))) + pud_index(addr))
-#define pmd_offset(pudp,addr) \
- (((pmd_t *) pud_page_vaddr(*(pudp))) + pmd_index(addr))
-#define pte_offset_kernel(dir,addr) \
- (((pte_t *) pmd_page_vaddr(*(dir))) + pte_index(addr))
-
-#define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr))
-
-static inline void pte_unmap(pte_t *pte) { }
+static inline pud_t *p4d_pgtable(p4d_t p4d)
+{
+ return (pud_t *)__va(p4d_val(p4d) & ~P4D_MASKED_BITS);
+}
-/* to find an entry in a kernel page-table-directory */
-/* This now only contains the vmalloc pages */
-#define pgd_offset_k(address) pgd_offset(&init_mm, address)
+static inline pmd_t *pud_pgtable(pud_t pud)
+{
+ return (pmd_t *)__va(pud_val(pud) & ~PUD_MASKED_BITS);
+}
-#define pte_ERROR(e) \
- pr_err("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e))
#define pmd_ERROR(e) \
pr_err("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pmd_val(e))
#define pud_ERROR(e) \
@@ -1046,6 +1006,8 @@ static inline int map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t p
return hash__map_kernel_page(ea, pa, prot);
}
+void unmap_kernel_page(unsigned long va);
+
static inline int __meminit vmemmap_create_mapping(unsigned long start,
unsigned long page_size,
unsigned long phys)
@@ -1065,6 +1027,16 @@ static inline void vmemmap_remove_mapping(unsigned long start,
}
#endif
+#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE)
+static inline void __kernel_map_pages(struct page *page, int numpages, int enable)
+{
+ if (radix_enabled())
+ radix__kernel_map_pages(page, numpages, enable);
+ else
+ hash__kernel_map_pages(page, numpages, enable);
+}
+#endif
+
static inline pte_t pmd_pte(pmd_t pmd)
{
return __pte_raw(pmd_raw(pmd));
@@ -1087,9 +1059,7 @@ static inline pte_t *pmdp_ptep(pmd_t *pmd)
#define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd)))
#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd)))
#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd)))
-#define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd)))
-#define pmd_mk_savedwrite(pmd) pte_pmd(pte_mk_savedwrite(pmd_pte(pmd)))
-#define pmd_clear_savedwrite(pmd) pte_pmd(pte_clear_savedwrite(pmd_pte(pmd)))
+#define pmd_mkwrite_novma(pmd) pte_pmd(pte_mkwrite_novma(pmd_pte(pmd)))
#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
#define pmd_soft_dirty(pmd) pte_soft_dirty(pmd_pte(pmd))
@@ -1111,8 +1081,6 @@ static inline int pmd_protnone(pmd_t pmd)
#endif /* CONFIG_NUMA_BALANCING */
#define pmd_write(pmd) pte_write(pmd_pte(pmd))
-#define __pmd_write(pmd) __pte_write(pmd_pte(pmd))
-#define pmd_savedwrite(pmd) pte_savedwrite(pmd_pte(pmd))
#define pmd_access_permitted pmd_access_permitted
static inline bool pmd_access_permitted(pmd_t pmd, bool write)
@@ -1135,12 +1103,24 @@ static inline bool pmd_access_permitted(pmd_t pmd, bool write)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot);
+extern pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot);
extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot);
extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot);
extern void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd);
-extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd);
+extern void set_pud_at(struct mm_struct *mm, unsigned long addr,
+ pud_t *pudp, pud_t pud);
+
+static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmd)
+{
+}
+
+static inline void update_mmu_cache_pud(struct vm_area_struct *vma,
+ unsigned long addr, pud_t *pud)
+{
+}
+
extern int hash__has_transparent_hugepage(void);
static inline int has_transparent_hugepage(void)
{
@@ -1150,6 +1130,14 @@ static inline int has_transparent_hugepage(void)
}
#define has_transparent_hugepage has_transparent_hugepage
+static inline int has_transparent_pud_hugepage(void)
+{
+ if (radix_enabled())
+ return radix__has_transparent_pud_hugepage();
+ return 0;
+}
+#define has_transparent_pud_hugepage has_transparent_pud_hugepage
+
static inline unsigned long
pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp,
unsigned long clr, unsigned long set)
@@ -1159,6 +1147,16 @@ pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp,
return hash__pmd_hugepage_update(mm, addr, pmdp, clr, set);
}
+static inline unsigned long
+pud_hugepage_update(struct mm_struct *mm, unsigned long addr, pud_t *pudp,
+ unsigned long clr, unsigned long set)
+{
+ if (radix_enabled())
+ return radix__pud_hugepage_update(mm, addr, pudp, clr, set);
+ BUG();
+ return pud_val(*pudp);
+}
+
/*
* returns true for pmd migration entries, THP, devmap, hugetlb
* But compile time dependent on THP config
@@ -1168,10 +1166,11 @@ static inline int pmd_large(pmd_t pmd)
return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE));
}
-static inline pmd_t pmd_mknotpresent(pmd_t pmd)
+static inline int pud_large(pud_t pud)
{
- return __pmd(pmd_val(pmd) & ~_PAGE_PRESENT);
+ return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE));
}
+
/*
* For radix we should always find H_PAGE_HASHPTE zero. Hence
* the below will work for radix too
@@ -1187,14 +1186,31 @@ static inline int __pmdp_test_and_clear_young(struct mm_struct *mm,
return ((old & _PAGE_ACCESSED) != 0);
}
+static inline int __pudp_test_and_clear_young(struct mm_struct *mm,
+ unsigned long addr, pud_t *pudp)
+{
+ unsigned long old;
+
+ if ((pud_raw(*pudp) & cpu_to_be64(_PAGE_ACCESSED | H_PAGE_HASHPTE)) == 0)
+ return 0;
+ old = pud_hugepage_update(mm, addr, pudp, _PAGE_ACCESSED, 0);
+ return ((old & _PAGE_ACCESSED) != 0);
+}
+
#define __HAVE_ARCH_PMDP_SET_WRPROTECT
static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp)
{
- if (__pmd_write((*pmdp)))
+ if (pmd_write(*pmdp))
pmd_hugepage_update(mm, addr, pmdp, _PAGE_WRITE, 0);
- else if (unlikely(pmd_savedwrite(*pmdp)))
- pmd_hugepage_update(mm, addr, pmdp, 0, _PAGE_PRIVILEGED);
+}
+
+#define __HAVE_ARCH_PUDP_SET_WRPROTECT
+static inline void pudp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
+ pud_t *pudp)
+{
+ if (pud_write(*pudp))
+ pud_hugepage_update(mm, addr, pudp, _PAGE_WRITE, 0);
}
/*
@@ -1206,7 +1222,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
* should return true.
* We should not call this on a hugetlb entry. We should check for HugeTLB
* entry using vma->vm_flags
- * The page table walk rule is explained in Documentation/vm/transhuge.rst
+ * The page table walk rule is explained in Documentation/mm/transhuge.rst
*/
static inline int pmd_trans_huge(pmd_t pmd)
{
@@ -1218,6 +1234,17 @@ static inline int pmd_trans_huge(pmd_t pmd)
return hash__pmd_trans_huge(pmd);
}
+static inline int pud_trans_huge(pud_t pud)
+{
+ if (!pud_present(pud))
+ return false;
+
+ if (radix_enabled())
+ return radix__pud_trans_huge(pud);
+ return 0;
+}
+
+
#define __HAVE_ARCH_PMD_SAME
static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
{
@@ -1226,21 +1253,73 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
return hash__pmd_same(pmd_a, pmd_b);
}
-static inline pmd_t pmd_mkhuge(pmd_t pmd)
+#define pud_same pud_same
+static inline int pud_same(pud_t pud_a, pud_t pud_b)
+{
+ if (radix_enabled())
+ return radix__pud_same(pud_a, pud_b);
+ return hash__pud_same(pud_a, pud_b);
+}
+
+
+static inline pmd_t __pmd_mkhuge(pmd_t pmd)
{
if (radix_enabled())
return radix__pmd_mkhuge(pmd);
return hash__pmd_mkhuge(pmd);
}
+static inline pud_t __pud_mkhuge(pud_t pud)
+{
+ if (radix_enabled())
+ return radix__pud_mkhuge(pud);
+ BUG();
+ return pud;
+}
+
+/*
+ * pfn_pmd return a pmd_t that can be used as pmd pte entry.
+ */
+static inline pmd_t pmd_mkhuge(pmd_t pmd)
+{
+#ifdef CONFIG_DEBUG_VM
+ if (radix_enabled())
+ WARN_ON((pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE)) == 0);
+ else
+ WARN_ON((pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE | H_PAGE_THP_HUGE)) !=
+ cpu_to_be64(_PAGE_PTE | H_PAGE_THP_HUGE));
+#endif
+ return pmd;
+}
+
+static inline pud_t pud_mkhuge(pud_t pud)
+{
+#ifdef CONFIG_DEBUG_VM
+ if (radix_enabled())
+ WARN_ON((pud_raw(pud) & cpu_to_be64(_PAGE_PTE)) == 0);
+ else
+ WARN_ON(1);
+#endif
+ return pud;
+}
+
+
#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
extern int pmdp_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp,
pmd_t entry, int dirty);
+#define __HAVE_ARCH_PUDP_SET_ACCESS_FLAGS
+extern int pudp_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pud_t *pudp,
+ pud_t entry, int dirty);
#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp);
+#define __HAVE_ARCH_PUDP_TEST_AND_CLEAR_YOUNG
+extern int pudp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long address, pud_t *pudp);
+
#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
@@ -1251,6 +1330,16 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
return hash__pmdp_huge_get_and_clear(mm, addr, pmdp);
}
+#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
+static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pud_t *pudp)
+{
+ if (radix_enabled())
+ return radix__pudp_huge_get_and_clear(mm, addr, pudp);
+ BUG();
+ return *pudp;
+}
+
static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp)
{
@@ -1260,6 +1349,16 @@ static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
}
#define pmdp_collapse_flush pmdp_collapse_flush
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
+pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
+ unsigned long addr,
+ pmd_t *pmdp, int full);
+
+#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL
+pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma,
+ unsigned long addr,
+ pud_t *pudp, int full);
+
#define __HAVE_ARCH_PGTABLE_DEPOSIT
static inline void pgtable_trans_huge_deposit(struct mm_struct *mm,
pmd_t *pmdp, pgtable_t pgtable)
@@ -1303,7 +1402,17 @@ extern void serialize_against_pte_lookup(struct mm_struct *mm);
static inline pmd_t pmd_mkdevmap(pmd_t pmd)
{
- return __pmd(pmd_val(pmd) | (_PAGE_PTE | _PAGE_DEVMAP));
+ if (radix_enabled())
+ return radix__pmd_mkdevmap(pmd);
+ return hash__pmd_mkdevmap(pmd);
+}
+
+static inline pud_t pud_mkdevmap(pud_t pud)
+{
+ if (radix_enabled())
+ return radix__pud_mkdevmap(pud);
+ BUG();
+ return pud;
}
static inline int pmd_devmap(pmd_t pmd)
@@ -1313,7 +1422,7 @@ static inline int pmd_devmap(pmd_t pmd)
static inline int pud_devmap(pud_t pud)
{
- return 0;
+ return pte_devmap(pud_pte(pud));
}
static inline int pgd_devmap(pgd_t pgd)
@@ -1322,16 +1431,6 @@ static inline int pgd_devmap(pgd_t pgd)
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-static inline int pud_pfn(pud_t pud)
-{
- /*
- * Currently all calls to pud_pfn() are gated around a pud_devmap()
- * check so this should never be used. If it grows another user we
- * want to know about it.
- */
- BUILD_BUG();
- return 0;
-}
#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
pte_t ptep_modify_prot_start(struct vm_area_struct *, unsigned long, pte_t *);
void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long,
@@ -1355,22 +1454,18 @@ static inline bool is_pte_rw_upgrade(unsigned long old_val, unsigned long new_va
* Like pmd_huge() and pmd_large(), but works regardless of config options
*/
#define pmd_is_leaf pmd_is_leaf
+#define pmd_leaf pmd_is_leaf
static inline bool pmd_is_leaf(pmd_t pmd)
{
return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE));
}
#define pud_is_leaf pud_is_leaf
+#define pud_leaf pud_is_leaf
static inline bool pud_is_leaf(pud_t pud)
{
return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE));
}
-#define pgd_is_leaf pgd_is_leaf
-static inline bool pgd_is_leaf(pgd_t pgd)
-{
- return !!(pgd_raw(pgd) & cpu_to_be64(_PAGE_PTE));
-}
-
#endif /* __ASSEMBLY__ */
#endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */
diff --git a/arch/powerpc/include/asm/book3s/64/pkeys.h b/arch/powerpc/include/asm/book3s/64/pkeys.h
new file mode 100644
index 000000000000..5b178139f3c0
--- /dev/null
+++ b/arch/powerpc/include/asm/book3s/64/pkeys.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+
+#ifndef _ASM_POWERPC_BOOK3S_64_PKEYS_H
+#define _ASM_POWERPC_BOOK3S_64_PKEYS_H
+
+#include <asm/book3s/64/hash-pkey.h>
+
+static inline u64 vmflag_to_pte_pkey_bits(u64 vm_flags)
+{
+ if (!mmu_has_feature(MMU_FTR_PKEY))
+ return 0x0UL;
+
+ if (radix_enabled())
+ BUG();
+ return hash__vmflag_to_pte_pkey_bits(vm_flags);
+}
+
+static inline u16 pte_to_pkey_bits(u64 pteflags)
+{
+ if (radix_enabled())
+ BUG();
+ return hash__pte_to_pkey_bits(pteflags);
+}
+
+#endif /*_ASM_POWERPC_KEYS_H */
diff --git a/arch/powerpc/include/asm/book3s/64/radix-4k.h b/arch/powerpc/include/asm/book3s/64/radix-4k.h
index d5f5ab73dc7f..035ceecd6d67 100644
--- a/arch/powerpc/include/asm/book3s/64/radix-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/radix-4k.h
@@ -11,7 +11,7 @@
#define RADIX_PGD_INDEX_SIZE 13 // size: 8B << 13 = 64KB, maps 2^13 x 512GB = 4PB
/*
- * One fragment per per page
+ * One fragment per page
*/
#define RADIX_PTE_FRAG_SIZE_SHIFT (RADIX_PTE_INDEX_SIZE + 3)
#define RADIX_PTE_FRAG_NR (PAGE_SIZE >> RADIX_PTE_FRAG_SIZE_SHIFT)
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index d97db3ad9aae..357e23a403d3 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -30,11 +30,16 @@
/* Don't have anything in the reserved bits and leaf bits */
#define RADIX_PMD_BAD_BITS 0x60000000000000e0UL
#define RADIX_PUD_BAD_BITS 0x60000000000000e0UL
-#define RADIX_PGD_BAD_BITS 0x60000000000000e0UL
+#define RADIX_P4D_BAD_BITS 0x60000000000000e0UL
#define RADIX_PMD_SHIFT (PAGE_SHIFT + RADIX_PTE_INDEX_SIZE)
#define RADIX_PUD_SHIFT (RADIX_PMD_SHIFT + RADIX_PMD_INDEX_SIZE)
#define RADIX_PGD_SHIFT (RADIX_PUD_SHIFT + RADIX_PUD_INDEX_SIZE)
+
+#define R_PTRS_PER_PTE (1 << RADIX_PTE_INDEX_SIZE)
+#define R_PTRS_PER_PMD (1 << RADIX_PMD_INDEX_SIZE)
+#define R_PTRS_PER_PUD (1 << RADIX_PUD_INDEX_SIZE)
+
/*
* Size of EA range mapped by our pagetables.
*/
@@ -68,11 +73,11 @@
*
*
* 3rd quadrant expanded:
- * +------------------------------+
- * | |
+ * +------------------------------+ Highest address (0xc010000000000000)
+ * +------------------------------+ KASAN shadow end (0xc00fc00000000000)
* | |
* | |
- * +------------------------------+ Kernel vmemmap end (0xc010000000000000)
+ * +------------------------------+ Kernel vmemmap end/shadow start (0xc00e000000000000)
* | |
* | 512TB |
* | |
@@ -91,6 +96,23 @@
* +------------------------------+ Kernel linear (0xc.....)
*/
+/* For the sizes of the shadow area, see kasan.h */
+
+/*
+ * If we store section details in page->flags we can't increase the MAX_PHYSMEM_BITS
+ * if we increase SECTIONS_WIDTH we will not store node details in page->flags and
+ * page_to_nid does a page->section->node lookup
+ * Hence only increase for VMEMMAP. Further depending on SPARSEMEM_EXTREME reduce
+ * memory requirements with large number of sections.
+ * 51 bits is the max physical real address on POWER9
+ */
+
+#if defined(CONFIG_SPARSEMEM_VMEMMAP) && defined(CONFIG_SPARSEMEM_EXTREME)
+#define R_MAX_PHYSMEM_BITS 51
+#else
+#define R_MAX_PHYSMEM_BITS 46
+#endif
+
#define RADIX_KERN_VIRT_START ASM_CONST(0xc008000000000000)
/*
* 49 = MAX_EA_BITS_PER_CONTEXT (hash specific). To make sure we pick
@@ -206,8 +228,10 @@ static inline void radix__set_pte_at(struct mm_struct *mm, unsigned long addr,
* from ptesync, it should probably go into update_mmu_cache, rather
* than set_pte_at (which is used to set ptes unrelated to faults).
*
- * Spurious faults to vmalloc region are not tolerated, so there is
- * a ptesync in flush_cache_vmap.
+ * Spurious faults from the kernel memory are not tolerated, so there
+ * is a ptesync in flush_cache_vmap, and __map_kernel_page() follows
+ * the pte update sequence from ISA Book III 6.10 Translation Table
+ * Update Synchronization Requirements.
*/
}
@@ -226,10 +250,14 @@ static inline int radix__pud_bad(pud_t pud)
return !!(pud_val(pud) & RADIX_PUD_BAD_BITS);
}
+static inline int radix__pud_same(pud_t pud_a, pud_t pud_b)
+{
+ return ((pud_raw(pud_a) ^ pud_raw(pud_b)) == 0);
+}
-static inline int radix__pgd_bad(pgd_t pgd)
+static inline int radix__p4d_bad(p4d_t p4d)
{
- return !!(pgd_val(pgd) & RADIX_PGD_BAD_BITS);
+ return !!(p4d_val(p4d) & RADIX_P4D_BAD_BITS);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -244,9 +272,22 @@ static inline pmd_t radix__pmd_mkhuge(pmd_t pmd)
return __pmd(pmd_val(pmd) | _PAGE_PTE);
}
+static inline int radix__pud_trans_huge(pud_t pud)
+{
+ return (pud_val(pud) & (_PAGE_PTE | _PAGE_DEVMAP)) == _PAGE_PTE;
+}
+
+static inline pud_t radix__pud_mkhuge(pud_t pud)
+{
+ return __pud(pud_val(pud) | _PAGE_PTE);
+}
+
extern unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, unsigned long clr,
unsigned long set);
+extern unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr,
+ pud_t *pudp, unsigned long clr,
+ unsigned long set);
extern pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp);
extern void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
@@ -254,6 +295,9 @@ extern void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
extern pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
extern pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
unsigned long addr, pmd_t *pmdp);
+pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pud_t *pudp);
+
static inline int radix__has_transparent_hugepage(void)
{
/* For radix 2M at PMD level means thp */
@@ -261,11 +305,35 @@ static inline int radix__has_transparent_hugepage(void)
return 1;
return 0;
}
+
+static inline int radix__has_transparent_pud_hugepage(void)
+{
+ /* For radix 1G at PUD level means pud hugepage support */
+ if (mmu_psize_defs[MMU_PAGE_1G].shift == PUD_SHIFT)
+ return 1;
+ return 0;
+}
#endif
+static inline pmd_t radix__pmd_mkdevmap(pmd_t pmd)
+{
+ return __pmd(pmd_val(pmd) | (_PAGE_PTE | _PAGE_DEVMAP));
+}
+
+static inline pud_t radix__pud_mkdevmap(pud_t pud)
+{
+ return __pud(pud_val(pud) | (_PAGE_PTE | _PAGE_DEVMAP));
+}
+
+struct vmem_altmap;
+struct dev_pagemap;
extern int __meminit radix__vmemmap_create_mapping(unsigned long start,
unsigned long page_size,
unsigned long phys);
+int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end,
+ int node, struct vmem_altmap *altmap);
+void __ref radix__vmemmap_free(unsigned long start, unsigned long end,
+ struct vmem_altmap *altmap);
extern void radix__vmemmap_remove_mapping(unsigned long start,
unsigned long page_size);
@@ -289,8 +357,22 @@ static inline unsigned long radix__get_tree_size(void)
}
#ifdef CONFIG_MEMORY_HOTPLUG
-int radix__create_section_mapping(unsigned long start, unsigned long end, int nid);
+int radix__create_section_mapping(unsigned long start, unsigned long end,
+ int nid, pgprot_t prot);
int radix__remove_section_mapping(unsigned long start, unsigned long end);
#endif /* CONFIG_MEMORY_HOTPLUG */
+
+void radix__kernel_map_pages(struct page *page, int numpages, int enable);
+
+#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
+#define vmemmap_can_optimize vmemmap_can_optimize
+bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap);
+#endif
+
+#define vmemmap_populate_compound_pages vmemmap_populate_compound_pages
+int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
+ unsigned long start,
+ unsigned long end, int node,
+ struct dev_pagemap *pgmap);
#endif /* __ASSEMBLY__ */
#endif
diff --git a/arch/powerpc/include/asm/book3s/64/slice.h b/arch/powerpc/include/asm/book3s/64/slice.h
index f0d3194ba41b..5fbe18544cbd 100644
--- a/arch/powerpc/include/asm/book3s/64/slice.h
+++ b/arch/powerpc/include/asm/book3s/64/slice.h
@@ -2,6 +2,16 @@
#ifndef _ASM_POWERPC_BOOK3S_64_SLICE_H
#define _ASM_POWERPC_BOOK3S_64_SLICE_H
+#ifndef __ASSEMBLY__
+
+#ifdef CONFIG_PPC_64S_HASH_MMU
+#ifdef CONFIG_HUGETLB_PAGE
+#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
+#endif
+#define HAVE_ARCH_UNMAPPED_AREA
+#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
+#endif
+
#define SLICE_LOW_SHIFT 28
#define SLICE_LOW_TOP (0x100000000ul)
#define SLICE_NUM_LOW (SLICE_LOW_TOP >> SLICE_LOW_SHIFT)
@@ -13,4 +23,20 @@
#define SLB_ADDR_LIMIT_DEFAULT DEFAULT_MAP_WINDOW_USER64
+struct mm_struct;
+
+unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
+ unsigned long flags, unsigned int psize,
+ int topdown);
+
+unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr);
+
+void slice_set_range_psize(struct mm_struct *mm, unsigned long start,
+ unsigned long len, unsigned int psize);
+
+void slice_init_new_context_exec(struct mm_struct *mm);
+void slice_setup_new_exec(void);
+
+#endif /* __ASSEMBLY__ */
+
#endif /* _ASM_POWERPC_BOOK3S_64_SLICE_H */
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
index 64d02a704bcb..146287d9580f 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
@@ -32,6 +32,11 @@ static inline void arch_enter_lazy_mmu_mode(void)
if (radix_enabled())
return;
+ /*
+ * apply_to_page_range can call us this preempt enabled when
+ * operating on kernel page tables.
+ */
+ preempt_disable();
batch = this_cpu_ptr(&ppc64_tlb_batch);
batch->active = 1;
}
@@ -47,6 +52,7 @@ static inline void arch_leave_lazy_mmu_mode(void)
if (batch->index)
__flush_tlb_pending(batch);
batch->active = 0;
+ preempt_enable();
}
#define arch_flush_lazy_mmu_mode() do {} while (0)
@@ -59,62 +65,15 @@ extern void flush_hash_range(unsigned long number, int local);
extern void flush_hash_hugepage(unsigned long vsid, unsigned long addr,
pmd_t *pmdp, unsigned int psize, int ssize,
unsigned long flags);
-static inline void hash__local_flush_tlb_mm(struct mm_struct *mm)
-{
-}
-
-static inline void hash__flush_tlb_mm(struct mm_struct *mm)
-{
-}
-
-static inline void hash__local_flush_all_mm(struct mm_struct *mm)
-{
- /*
- * There's no Page Walk Cache for hash, so what is needed is
- * the same as flush_tlb_mm(), which doesn't really make sense
- * with hash. So the only thing we could do is flush the
- * entire LPID! Punt for now, as it's not being used.
- */
- WARN_ON_ONCE(1);
-}
-
-static inline void hash__flush_all_mm(struct mm_struct *mm)
-{
- /*
- * There's no Page Walk Cache for hash, so what is needed is
- * the same as flush_tlb_mm(), which doesn't really make sense
- * with hash. So the only thing we could do is flush the
- * entire LPID! Punt for now, as it's not being used.
- */
- WARN_ON_ONCE(1);
-}
-
-static inline void hash__local_flush_tlb_page(struct vm_area_struct *vma,
- unsigned long vmaddr)
-{
-}
-
-static inline void hash__flush_tlb_page(struct vm_area_struct *vma,
- unsigned long vmaddr)
-{
-}
-
-static inline void hash__flush_tlb_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
-{
-}
-
-static inline void hash__flush_tlb_kernel_range(unsigned long start,
- unsigned long end)
-{
-}
-
struct mmu_gather;
extern void hash__tlb_flush(struct mmu_gather *tlb);
+
+#ifdef CONFIG_PPC_64S_HASH_MMU
/* Private function for use by PCI IO mapping code */
-extern void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
- unsigned long end);
-extern void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd,
- unsigned long addr);
+extern void __flush_hash_table_range(unsigned long start, unsigned long end);
+void flush_hash_table_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr);
+#else
+static inline void __flush_hash_table_range(unsigned long start, unsigned long end) { }
+#endif
#endif /* _ASM_POWERPC_BOOK3S_64_TLBFLUSH_HASH_H */
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
index ca8db193ae38..a38542259fab 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
@@ -2,10 +2,29 @@
#ifndef _ASM_POWERPC_TLBFLUSH_RADIX_H
#define _ASM_POWERPC_TLBFLUSH_RADIX_H
+#include <asm/hvcall.h>
+
+#define RIC_FLUSH_TLB 0
+#define RIC_FLUSH_PWC 1
+#define RIC_FLUSH_ALL 2
+
struct vm_area_struct;
struct mm_struct;
struct mmu_gather;
+static inline u64 psize_to_rpti_pgsize(unsigned long psize)
+{
+ if (psize == MMU_PAGE_4K)
+ return H_RPTI_PAGE_4K;
+ if (psize == MMU_PAGE_64K)
+ return H_RPTI_PAGE_64K;
+ if (psize == MMU_PAGE_2M)
+ return H_RPTI_PAGE_2M;
+ if (psize == MMU_PAGE_1G)
+ return H_RPTI_PAGE_1G;
+ return H_RPTI_PAGE_ALL;
+}
+
static inline int mmu_get_ap(int psize)
{
return mmu_psize_defs[psize].ap;
@@ -20,7 +39,7 @@ extern void radix__flush_pwc_lpid(unsigned int lpid);
extern void radix__flush_all_lpid(unsigned int lpid);
extern void radix__flush_all_lpid_guest(unsigned int lpid);
#else
-static inline void radix__tlbiel_all(unsigned int action) { WARN_ON(1); };
+static inline void radix__tlbiel_all(unsigned int action) { WARN_ON(1); }
static inline void radix__flush_tlb_lpid_page(unsigned int lpid,
unsigned long addr,
unsigned long page_size)
@@ -45,8 +64,12 @@ extern void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end);
extern void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
unsigned long end, int psize);
+void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
+ unsigned long end, int psize);
extern void radix__flush_pmd_tlb_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end);
+extern void radix__flush_pud_tlb_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end);
extern void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end);
extern void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end);
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h
index dcb5c3839d2f..fd642b729775 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
@@ -5,6 +5,7 @@
#define MMU_NO_CONTEXT ~0UL
#include <linux/mm_types.h>
+#include <linux/mmu_notifier.h>
#include <asm/book3s/64/tlbflush-hash.h>
#include <asm/book3s/64/tlbflush-radix.h>
@@ -14,7 +15,6 @@ enum {
TLB_INVAL_SCOPE_LPID = 1, /* invalidate TLBs for current LPID */
};
-#ifdef CONFIG_PPC_NATIVE
static inline void tlbiel_all(void)
{
/*
@@ -30,9 +30,6 @@ static inline void tlbiel_all(void)
else
hash__tlbiel_all(TLB_INVAL_SCOPE_GLOBAL);
}
-#else
-static inline void tlbiel_all(void) { BUG(); };
-#endif
static inline void tlbiel_all_lpid(bool radix)
{
@@ -51,8 +48,15 @@ static inline void flush_pmd_tlb_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
if (radix_enabled())
- return radix__flush_pmd_tlb_range(vma, start, end);
- return hash__flush_tlb_range(vma, start, end);
+ radix__flush_pmd_tlb_range(vma, start, end);
+}
+
+#define __HAVE_ARCH_FLUSH_PUD_TLB_RANGE
+static inline void flush_pud_tlb_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ if (radix_enabled())
+ radix__flush_pud_tlb_range(vma, start, end);
}
#define __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
@@ -61,91 +65,154 @@ static inline void flush_hugetlb_tlb_range(struct vm_area_struct *vma,
unsigned long end)
{
if (radix_enabled())
- return radix__flush_hugetlb_tlb_range(vma, start, end);
- return hash__flush_tlb_range(vma, start, end);
+ radix__flush_hugetlb_tlb_range(vma, start, end);
}
static inline void flush_tlb_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
if (radix_enabled())
- return radix__flush_tlb_range(vma, start, end);
- return hash__flush_tlb_range(vma, start, end);
+ radix__flush_tlb_range(vma, start, end);
}
static inline void flush_tlb_kernel_range(unsigned long start,
unsigned long end)
{
if (radix_enabled())
- return radix__flush_tlb_kernel_range(start, end);
- return hash__flush_tlb_kernel_range(start, end);
+ radix__flush_tlb_kernel_range(start, end);
}
static inline void local_flush_tlb_mm(struct mm_struct *mm)
{
if (radix_enabled())
- return radix__local_flush_tlb_mm(mm);
- return hash__local_flush_tlb_mm(mm);
+ radix__local_flush_tlb_mm(mm);
}
static inline void local_flush_tlb_page(struct vm_area_struct *vma,
unsigned long vmaddr)
{
if (radix_enabled())
- return radix__local_flush_tlb_page(vma, vmaddr);
- return hash__local_flush_tlb_page(vma, vmaddr);
+ radix__local_flush_tlb_page(vma, vmaddr);
}
-static inline void local_flush_all_mm(struct mm_struct *mm)
+static inline void local_flush_tlb_page_psize(struct mm_struct *mm,
+ unsigned long vmaddr, int psize)
{
if (radix_enabled())
- return radix__local_flush_all_mm(mm);
- return hash__local_flush_all_mm(mm);
+ radix__local_flush_tlb_page_psize(mm, vmaddr, psize);
}
static inline void tlb_flush(struct mmu_gather *tlb)
{
if (radix_enabled())
- return radix__tlb_flush(tlb);
- return hash__tlb_flush(tlb);
+ radix__tlb_flush(tlb);
+ else
+ hash__tlb_flush(tlb);
}
#ifdef CONFIG_SMP
static inline void flush_tlb_mm(struct mm_struct *mm)
{
if (radix_enabled())
- return radix__flush_tlb_mm(mm);
- return hash__flush_tlb_mm(mm);
+ radix__flush_tlb_mm(mm);
}
static inline void flush_tlb_page(struct vm_area_struct *vma,
unsigned long vmaddr)
{
if (radix_enabled())
- return radix__flush_tlb_page(vma, vmaddr);
- return hash__flush_tlb_page(vma, vmaddr);
-}
-
-static inline void flush_all_mm(struct mm_struct *mm)
-{
- if (radix_enabled())
- return radix__flush_all_mm(mm);
- return hash__flush_all_mm(mm);
+ radix__flush_tlb_page(vma, vmaddr);
}
#else
#define flush_tlb_mm(mm) local_flush_tlb_mm(mm)
#define flush_tlb_page(vma, addr) local_flush_tlb_page(vma, addr)
-#define flush_all_mm(mm) local_flush_all_mm(mm)
#endif /* CONFIG_SMP */
#define flush_tlb_fix_spurious_fault flush_tlb_fix_spurious_fault
static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma,
- unsigned long address)
+ unsigned long address,
+ pte_t *ptep)
+{
+ /*
+ * Book3S 64 does not require spurious fault flushes because the PTE
+ * must be re-fetched in case of an access permission problem. So the
+ * only reason for a spurious fault should be concurrent modification
+ * to the PTE, in which case the PTE will eventually be re-fetched by
+ * the MMU when it attempts the access again.
+ *
+ * See: Power ISA Version 3.1B, 6.10.1.2 Modifying a Translation Table
+ * Entry, Setting a Reference or Change Bit or Upgrading Access
+ * Authority (PTE Subject to Atomic Hardware Updates):
+ *
+ * "If the only change being made to a valid PTE that is subject to
+ * atomic hardware updates is to set the Reference or Change bit to
+ * 1 or to upgrade access authority, a simpler sequence suffices
+ * because the translation hardware will refetch the PTE if an
+ * access is attempted for which the only problems were reference
+ * and/or change bits needing to be set or insufficient access
+ * authority."
+ *
+ * The nest MMU in POWER9 does not perform this PTE re-fetch, but
+ * it avoids the spurious fault problem by flushing the TLB before
+ * upgrading PTE permissions, see radix__ptep_set_access_flags.
+ */
+}
+
+static inline bool __pte_flags_need_flush(unsigned long oldval,
+ unsigned long newval)
+{
+ unsigned long delta = oldval ^ newval;
+
+ /*
+ * The return value of this function doesn't matter for hash,
+ * ptep_modify_prot_start() does a pte_update() which does or schedules
+ * any necessary hash table update and flush.
+ */
+ if (!radix_enabled())
+ return true;
+
+ /*
+ * We do not expect kernel mappings or non-PTEs or not-present PTEs.
+ */
+ VM_WARN_ON_ONCE(oldval & _PAGE_PRIVILEGED);
+ VM_WARN_ON_ONCE(newval & _PAGE_PRIVILEGED);
+ VM_WARN_ON_ONCE(!(oldval & _PAGE_PTE));
+ VM_WARN_ON_ONCE(!(newval & _PAGE_PTE));
+ VM_WARN_ON_ONCE(!(oldval & _PAGE_PRESENT));
+ VM_WARN_ON_ONCE(!(newval & _PAGE_PRESENT));
+
+ /*
+ * Must flush on any change except READ, WRITE, EXEC, DIRTY, ACCESSED.
+ *
+ * In theory, some changed software bits could be tolerated, in
+ * practice those should rarely if ever matter.
+ */
+
+ if (delta & ~(_PAGE_RWX | _PAGE_DIRTY | _PAGE_ACCESSED))
+ return true;
+
+ /*
+ * If any of the above was present in old but cleared in new, flush.
+ * With the exception of _PAGE_ACCESSED, don't worry about flushing
+ * if that was cleared (see the comment in ptep_clear_flush_young()).
+ */
+ if ((delta & ~_PAGE_ACCESSED) & oldval)
+ return true;
+
+ return false;
+}
+
+static inline bool pte_needs_flush(pte_t oldpte, pte_t newpte)
+{
+ return __pte_flags_need_flush(pte_val(oldpte), pte_val(newpte));
+}
+#define pte_needs_flush pte_needs_flush
+
+static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd)
{
- /* See ptep_set_access_flags comment */
- if (atomic_read(&vma->vm_mm->context.copros) > 0)
- flush_tlb_page(vma, address);
+ return __pte_flags_need_flush(pmd_val(oldpmd), pmd_val(newpmd));
}
+#define huge_pmd_needs_flush huge_pmd_needs_flush
extern bool tlbie_capable;
extern bool tlbie_enabled;
diff --git a/arch/powerpc/include/asm/book3s/pgtable.h b/arch/powerpc/include/asm/book3s/pgtable.h
index 0e1263455d73..f42d68c6b314 100644
--- a/arch/powerpc/include/asm/book3s/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/pgtable.h
@@ -8,34 +8,4 @@
#include <asm/book3s/32/pgtable.h>
#endif
-#define FIRST_USER_ADDRESS 0UL
-#ifndef __ASSEMBLY__
-/* Insert a PTE, top-level function is out of line. It uses an inline
- * low level function in the respective pgtable-* files
- */
-extern void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
- pte_t pte);
-
-
-#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
-extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address,
- pte_t *ptep, pte_t entry, int dirty);
-
-struct file;
-extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
- unsigned long size, pgprot_t vma_prot);
-#define __HAVE_PHYS_MEM_ACCESS_PROT
-
-/*
- * This gets called at the end of handling a page fault, when
- * the kernel has put a new PTE into the page table for the process.
- * We use it to ensure coherency between the i-cache and d-cache
- * for the page which has just been mapped in.
- * On machines which use an MMU hash table, we use this to put a
- * corresponding HPTE into the hash table ahead of time, instead of
- * waiting for the inevitable extra hash-table miss exception.
- */
-void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep);
-
-#endif /* __ASSEMBLY__ */
#endif