summaryrefslogtreecommitdiff
path: root/arch/powerpc/mm/book3s64/hash_utils.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/mm/book3s64/hash_utils.c')
-rw-r--r--arch/powerpc/mm/book3s64/hash_utils.c597
1 files changed, 429 insertions, 168 deletions
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index b30435c7d804..0626a25b0d72 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -7,7 +7,7 @@
*
* SMP scalability work:
* Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
- *
+ *
* Module name: htab.c
*
* Description:
@@ -35,17 +35,20 @@
#include <linux/pkeys.h>
#include <linux/hugetlb.h>
#include <linux/cpu.h>
+#include <linux/pgtable.h>
+#include <linux/debugfs.h>
+#include <linux/random.h>
+#include <linux/elf-randomize.h>
+#include <linux/of_fdt.h>
-#include <asm/debugfs.h>
+#include <asm/interrupt.h>
#include <asm/processor.h>
-#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/mmu_context.h>
#include <asm/page.h>
#include <asm/types.h>
#include <linux/uaccess.h>
#include <asm/machdep.h>
-#include <asm/prom.h>
#include <asm/io.h>
#include <asm/eeh.h>
#include <asm/tlb.h>
@@ -66,6 +69,9 @@
#include <mm/mmu_decl.h>
+#include "internal.h"
+
+
#ifdef DEBUG
#define DBG(fmt...) udbg_printf(fmt)
#else
@@ -95,8 +101,6 @@
*/
static unsigned long _SDR1;
-struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
-EXPORT_SYMBOL_GPL(mmu_psize_defs);
u8 hpte_page_sizes[1 << LP_BITS];
EXPORT_SYMBOL_GPL(hpte_page_sizes);
@@ -109,9 +113,7 @@ int mmu_linear_psize = MMU_PAGE_4K;
EXPORT_SYMBOL_GPL(mmu_linear_psize);
int mmu_virtual_psize = MMU_PAGE_4K;
int mmu_vmalloc_psize = MMU_PAGE_4K;
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-int mmu_vmemmap_psize = MMU_PAGE_4K;
-#endif
+EXPORT_SYMBOL_GPL(mmu_vmalloc_psize);
int mmu_io_psize = MMU_PAGE_4K;
int mmu_kernel_ssize = MMU_SEGSIZE_256M;
EXPORT_SYMBOL_GPL(mmu_kernel_ssize);
@@ -121,11 +123,8 @@ EXPORT_SYMBOL_GPL(mmu_slb_size);
#ifdef CONFIG_PPC_64K_PAGES
int mmu_ci_restrictions;
#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
static u8 *linear_map_hash_slots;
static unsigned long linear_map_hash_count;
-static DEFINE_SPINLOCK(linear_map_hash_lock);
-#endif /* CONFIG_DEBUG_PAGEALLOC */
struct mmu_hash_ops mmu_hash_ops;
EXPORT_SYMBOL(mmu_hash_ops);
@@ -170,6 +169,110 @@ static struct mmu_psize_def mmu_psize_defaults_gp[] = {
},
};
+static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is)
+{
+ unsigned long rb;
+
+ rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
+
+ asm volatile("tlbiel %0" : : "r" (rb));
+}
+
+/*
+ * tlbiel instruction for hash, set invalidation
+ * i.e., r=1 and is=01 or is=10 or is=11
+ */
+static __always_inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is,
+ unsigned int pid,
+ unsigned int ric, unsigned int prs)
+{
+ unsigned long rb;
+ unsigned long rs;
+ unsigned int r = 0; /* hash format */
+
+ rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
+ rs = ((unsigned long)pid << PPC_BITLSHIFT(31));
+
+ asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4)
+ : : "r"(rb), "r"(rs), "i"(ric), "i"(prs), "i"(r)
+ : "memory");
+}
+
+
+static void tlbiel_all_isa206(unsigned int num_sets, unsigned int is)
+{
+ unsigned int set;
+
+ asm volatile("ptesync": : :"memory");
+
+ for (set = 0; set < num_sets; set++)
+ tlbiel_hash_set_isa206(set, is);
+
+ ppc_after_tlbiel_barrier();
+}
+
+static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is)
+{
+ unsigned int set;
+
+ asm volatile("ptesync": : :"memory");
+
+ /*
+ * Flush the partition table cache if this is HV mode.
+ */
+ if (early_cpu_has_feature(CPU_FTR_HVMODE))
+ tlbiel_hash_set_isa300(0, is, 0, 2, 0);
+
+ /*
+ * Now invalidate the process table cache. UPRT=0 HPT modes (what
+ * current hardware implements) do not use the process table, but
+ * add the flushes anyway.
+ *
+ * From ISA v3.0B p. 1078:
+ * The following forms are invalid.
+ * * PRS=1, R=0, and RIC!=2 (The only process-scoped
+ * HPT caching is of the Process Table.)
+ */
+ tlbiel_hash_set_isa300(0, is, 0, 2, 1);
+
+ /*
+ * Then flush the sets of the TLB proper. Hash mode uses
+ * partition scoped TLB translations, which may be flushed
+ * in !HV mode.
+ */
+ for (set = 0; set < num_sets; set++)
+ tlbiel_hash_set_isa300(set, is, 0, 0, 0);
+
+ ppc_after_tlbiel_barrier();
+
+ asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT "; isync" : : :"memory");
+}
+
+void hash__tlbiel_all(unsigned int action)
+{
+ unsigned int is;
+
+ switch (action) {
+ case TLB_INVAL_SCOPE_GLOBAL:
+ is = 3;
+ break;
+ case TLB_INVAL_SCOPE_LPID:
+ is = 2;
+ break;
+ default:
+ BUG();
+ }
+
+ if (early_cpu_has_feature(CPU_FTR_ARCH_300))
+ tlbiel_all_isa300(POWER9_TLB_SETS_HASH, is);
+ else if (early_cpu_has_feature(CPU_FTR_ARCH_207S))
+ tlbiel_all_isa206(POWER8_TLB_SETS, is);
+ else if (early_cpu_has_feature(CPU_FTR_ARCH_206))
+ tlbiel_all_isa206(POWER7_TLB_SETS, is);
+ else
+ WARN(1, "%s called on pre-POWER7 CPU\n", __func__);
+}
+
/*
* 'R' and 'C' update notes:
* - Under pHyp or KVM, the updatepp path will not set C, thus it *will*
@@ -183,7 +286,7 @@ static struct mmu_psize_def mmu_psize_defaults_gp[] = {
* - We make sure R is always set and never lost
* - C is _PAGE_DIRTY, and *should* always be set for a writeable mapping
*/
-unsigned long htab_convert_pte_flags(unsigned long pteflags)
+unsigned long htab_convert_pte_flags(unsigned long pteflags, unsigned long flags)
{
unsigned long rflags = 0;
@@ -207,9 +310,16 @@ unsigned long htab_convert_pte_flags(unsigned long pteflags)
else
rflags |= 0x3;
}
+ VM_WARN_ONCE(!(pteflags & _PAGE_RWX), "no-access mapping request");
} else {
if (pteflags & _PAGE_RWX)
rflags |= 0x2;
+ /*
+ * We should never hit this in normal fault handling because
+ * a permission check (check_pte_access()) will bubble this
+ * to higher level linux handler even for PAGE_NONE.
+ */
+ VM_WARN_ONCE(!(pteflags & _PAGE_RWX), "no-access mapping request");
if (!((pteflags & _PAGE_WRITE) && (pteflags & _PAGE_DIRTY)))
rflags |= 0x1;
}
@@ -237,7 +347,7 @@ unsigned long htab_convert_pte_flags(unsigned long pteflags)
*/
rflags |= HPTE_R_M;
- rflags |= pte_to_hpte_pkey_bits(pteflags);
+ rflags |= pte_to_hpte_pkey_bits(pteflags, flags);
return rflags;
}
@@ -252,13 +362,17 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
shift = mmu_psize_defs[psize].shift;
step = 1 << shift;
- prot = htab_convert_pte_flags(prot);
+ prot = htab_convert_pte_flags(prot, HPTE_USE_KERNEL_KEY);
DBG("htab_bolt_mapping(%lx..%lx -> %lx (%lx,%d,%d)\n",
vstart, vend, pstart, prot, psize, ssize);
- for (vaddr = vstart, paddr = pstart; vaddr < vend;
- vaddr += step, paddr += step) {
+ /* Carefully map only the possible range */
+ vaddr = ALIGN(vstart, step);
+ paddr = ALIGN(pstart, step);
+ vend = ALIGN_DOWN(vend, step);
+
+ for (; vaddr < vend; vaddr += step, paddr += step) {
unsigned long hash, hpteg;
unsigned long vsid = get_kernel_vsid(vaddr, ssize);
unsigned long vpn = hpt_vpn(vaddr, vsid, ssize);
@@ -298,7 +412,7 @@ repeat:
ssize);
if (ret == -1) {
/*
- * Try to to keep bolted entries in primary.
+ * Try to keep bolted entries in primary.
* Remove non bolted entries and try insert again
*/
ret = mmu_hash_ops.hpte_remove(hpteg);
@@ -317,11 +431,9 @@ repeat:
break;
cond_resched();
-#ifdef CONFIG_DEBUG_PAGEALLOC
- if (debug_pagealloc_enabled() &&
+ if (debug_pagealloc_enabled_or_kfence() &&
(paddr >> PAGE_SHIFT) < linear_map_hash_count)
linear_map_hash_slots[paddr >> PAGE_SHIFT] = ret | 0x80;
-#endif /* CONFIG_DEBUG_PAGEALLOC */
}
return ret < 0 ? ret : 0;
}
@@ -329,7 +441,7 @@ repeat:
int htab_remove_mapping(unsigned long vstart, unsigned long vend,
int psize, int ssize)
{
- unsigned long vaddr;
+ unsigned long vaddr, time_limit;
unsigned int step, shift;
int rc;
int ret = 0;
@@ -340,8 +452,21 @@ int htab_remove_mapping(unsigned long vstart, unsigned long vend,
if (!mmu_hash_ops.hpte_removebolted)
return -ENODEV;
- for (vaddr = vstart; vaddr < vend; vaddr += step) {
+ /* Unmap the full range specificied */
+ vaddr = ALIGN_DOWN(vstart, step);
+ time_limit = jiffies + HZ;
+
+ for (;vaddr < vend; vaddr += step) {
rc = mmu_hash_ops.hpte_removebolted(vaddr, psize, ssize);
+
+ /*
+ * For large number of mappings introduce a cond_resched()
+ * to prevent softlockup warnings.
+ */
+ if (time_after(jiffies, time_limit)) {
+ cond_resched();
+ time_limit = jiffies + HZ;
+ }
if (rc == -ENOENT) {
ret = -ENOENT;
continue;
@@ -353,7 +478,7 @@ int htab_remove_mapping(unsigned long vstart, unsigned long vend,
return ret;
}
-static bool disable_1tb_segments = false;
+static bool disable_1tb_segments __ro_after_init;
static int __init parse_disable_1tb_segments(char *p)
{
@@ -362,6 +487,40 @@ static int __init parse_disable_1tb_segments(char *p)
}
early_param("disable_1tb_segments", parse_disable_1tb_segments);
+bool stress_hpt_enabled __initdata;
+
+static int __init parse_stress_hpt(char *p)
+{
+ stress_hpt_enabled = true;
+ return 0;
+}
+early_param("stress_hpt", parse_stress_hpt);
+
+__ro_after_init DEFINE_STATIC_KEY_FALSE(stress_hpt_key);
+
+/*
+ * per-CPU array allocated if we enable stress_hpt.
+ */
+#define STRESS_MAX_GROUPS 16
+struct stress_hpt_struct {
+ unsigned long last_group[STRESS_MAX_GROUPS];
+};
+
+static inline int stress_nr_groups(void)
+{
+ /*
+ * LPAR H_REMOVE flushes TLB, so need some number > 1 of entries
+ * to allow practical forward progress. Bare metal returns 1, which
+ * seems to help uncover more bugs.
+ */
+ if (firmware_has_feature(FW_FEATURE_LPAR))
+ return STRESS_MAX_GROUPS;
+ else
+ return 1;
+}
+
+static struct stress_hpt_struct *stress_hpt_struct;
+
static int __init htab_dt_scan_seg_sizes(unsigned long node,
const char *uname, int depth,
void *data)
@@ -541,7 +700,7 @@ static int __init htab_dt_scan_hugepage_blocks(unsigned long node,
}
#endif /* CONFIG_HUGETLB_PAGE */
-static void mmu_psize_set_default_penc(void)
+static void __init mmu_psize_set_default_penc(void)
{
int bpsize, apsize;
for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++)
@@ -551,7 +710,7 @@ static void mmu_psize_set_default_penc(void)
#ifdef CONFIG_PPC_64K_PAGES
-static bool might_have_hea(void)
+static bool __init might_have_hea(void)
{
/*
* The HEA ethernet adapter requires awareness of the
@@ -593,7 +752,7 @@ static void __init htab_scan_page_sizes(void)
}
#ifdef CONFIG_HUGETLB_PAGE
- if (!hugetlb_disabled) {
+ if (!hugetlb_disabled && !early_radix_enabled() ) {
/* Reserve 16G huge page memory sections for huge pages */
of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL);
}
@@ -622,7 +781,7 @@ static void __init htab_scan_page_sizes(void)
* low-order N bits as the encoding for the 2^(12+N) byte page size
* (if it exists).
*/
-static void init_hpte_page_sizes(void)
+static void __init init_hpte_page_sizes(void)
{
long int ap, bp;
long int shift, penc;
@@ -652,14 +811,22 @@ static void init_hpte_page_sizes(void)
static void __init htab_init_page_sizes(void)
{
+ bool aligned = true;
init_hpte_page_sizes();
- if (!debug_pagealloc_enabled()) {
+ if (!debug_pagealloc_enabled_or_kfence()) {
/*
* Pick a size for the linear mapping. Currently, we only
* support 16M, 1M and 4K which is the default
*/
- if (mmu_psize_defs[MMU_PAGE_16M].shift)
+ if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) &&
+ (unsigned long)_stext % 0x1000000) {
+ if (mmu_psize_defs[MMU_PAGE_16M].shift)
+ pr_warn("Kernel not 16M aligned, disabling 16M linear map alignment\n");
+ aligned = false;
+ }
+
+ if (mmu_psize_defs[MMU_PAGE_16M].shift && aligned)
mmu_linear_psize = MMU_PAGE_16M;
else if (mmu_psize_defs[MMU_PAGE_1M].shift)
mmu_linear_psize = MMU_PAGE_1M;
@@ -776,7 +943,7 @@ static unsigned long __init htab_get_table_size(void)
}
#ifdef CONFIG_MEMORY_HOTPLUG
-int resize_hpt_for_hotplug(unsigned long new_mem_size)
+static int resize_hpt_for_hotplug(unsigned long new_mem_size)
{
unsigned target_hpt_shift;
@@ -800,7 +967,8 @@ int resize_hpt_for_hotplug(unsigned long new_mem_size)
return 0;
}
-int hash__create_section_mapping(unsigned long start, unsigned long end, int nid)
+int hash__create_section_mapping(unsigned long start, unsigned long end,
+ int nid, pgprot_t prot)
{
int rc;
@@ -809,8 +977,10 @@ int hash__create_section_mapping(unsigned long start, unsigned long end, int nid
return -1;
}
+ resize_hpt_for_hotplug(memblock_phys_mem_size());
+
rc = htab_bolt_mapping(start, end, __pa(start),
- pgprot_val(PAGE_KERNEL), mmu_linear_psize,
+ pgprot_val(prot), mmu_linear_psize,
mmu_kernel_ssize);
if (rc < 0) {
@@ -825,7 +995,10 @@ int hash__remove_section_mapping(unsigned long start, unsigned long end)
{
int rc = htab_remove_mapping(start, end, mmu_linear_psize,
mmu_kernel_ssize);
- WARN_ON(rc < 0);
+
+ if (resize_hpt_for_hotplug(memblock_phys_mem_size()) == -ENOSPC)
+ pr_warn("Hash collision while resizing HPT\n");
+
return rc;
}
#endif /* CONFIG_MEMORY_HOTPLUG */
@@ -844,13 +1017,30 @@ static void __init hash_init_partition_table(phys_addr_t hash_table,
pr_info("Partition table %p\n", partition_tb);
}
+void hpt_clear_stress(void);
+static struct timer_list stress_hpt_timer;
+static void stress_hpt_timer_fn(struct timer_list *timer)
+{
+ int next_cpu;
+
+ hpt_clear_stress();
+ if (!firmware_has_feature(FW_FEATURE_LPAR))
+ tlbiel_all();
+
+ next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
+ if (next_cpu >= nr_cpu_ids)
+ next_cpu = cpumask_first(cpu_online_mask);
+ stress_hpt_timer.expires = jiffies + msecs_to_jiffies(10);
+ add_timer_on(&stress_hpt_timer, next_cpu);
+}
+
static void __init htab_initialize(void)
{
unsigned long table;
unsigned long pteg_count;
unsigned long prot;
- unsigned long base = 0, size = 0;
- struct memblock_region *reg;
+ phys_addr_t base = 0, size = 0, end;
+ u64 i;
DBG(" -> htab_initialize()\n");
@@ -860,10 +1050,28 @@ static void __init htab_initialize(void)
printk(KERN_INFO "Using 1TB segments\n");
}
+ if (stress_slb_enabled)
+ static_branch_enable(&stress_slb_key);
+
+ if (stress_hpt_enabled) {
+ unsigned long tmp;
+ static_branch_enable(&stress_hpt_key);
+ // Too early to use nr_cpu_ids, so use NR_CPUS
+ tmp = memblock_phys_alloc_range(sizeof(struct stress_hpt_struct) * NR_CPUS,
+ __alignof__(struct stress_hpt_struct),
+ 0, MEMBLOCK_ALLOC_ANYWHERE);
+ memset((void *)tmp, 0xff, sizeof(struct stress_hpt_struct) * NR_CPUS);
+ stress_hpt_struct = __va(tmp);
+
+ timer_setup(&stress_hpt_timer, stress_hpt_timer_fn, 0);
+ stress_hpt_timer.expires = jiffies + msecs_to_jiffies(10);
+ add_timer(&stress_hpt_timer);
+ }
+
/*
* Calculate the required size of the htab. We want the number of
* PTEGs to equal one half the number of real pages.
- */
+ */
htab_size_bytes = htab_get_table_size();
pteg_count = htab_size_bytes >> 7;
@@ -873,7 +1081,7 @@ static void __init htab_initialize(void)
firmware_has_feature(FW_FEATURE_PS3_LV1)) {
/* Using a hypervisor which owns the htab */
htab_address = NULL;
- _SDR1 = 0;
+ _SDR1 = 0;
#ifdef CONFIG_FA_DUMP
/*
* If firmware assisted dump is active firmware preserves
@@ -926,8 +1134,7 @@ static void __init htab_initialize(void)
prot = pgprot_val(PAGE_KERNEL);
-#ifdef CONFIG_DEBUG_PAGEALLOC
- if (debug_pagealloc_enabled()) {
+ if (debug_pagealloc_enabled_or_kfence()) {
linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT;
linear_map_hash_slots = memblock_alloc_try_nid(
linear_map_hash_count, 1, MEMBLOCK_LOW_LIMIT,
@@ -936,12 +1143,11 @@ static void __init htab_initialize(void)
panic("%s: Failed to allocate %lu bytes max_addr=%pa\n",
__func__, linear_map_hash_count, &ppc64_rma_size);
}
-#endif /* CONFIG_DEBUG_PAGEALLOC */
/* create bolted the linear mapping in the hash table */
- for_each_memblock(memory, reg) {
- base = (unsigned long)__va(reg->base);
- size = reg->size;
+ for_each_mem_range(i, &base, &end) {
+ size = end - base;
+ base = (unsigned long)__va(base);
DBG("creating mapping for region: %lx..%lx (prot: %lx)\n",
base, size, prot);
@@ -1052,7 +1258,7 @@ void __init hash__early_init_mmu(void)
ps3_early_mm_init();
else if (firmware_has_feature(FW_FEATURE_LPAR))
hpte_init_pseries();
- else if (IS_ENABLED(CONFIG_PPC_NATIVE))
+ else if (IS_ENABLED(CONFIG_PPC_HASH_MMU_NATIVE))
hpte_init_native();
if (!mmu_hash_ops.hpte_insert)
@@ -1095,6 +1301,11 @@ void hash__early_init_mmu_secondary(void)
if (cpu_has_feature(CPU_FTR_ARCH_206)
&& cpu_has_feature(CPU_FTR_HVMODE))
tlbiel_all();
+
+#ifdef CONFIG_PPC_MEM_KEYS
+ if (mmu_has_feature(MMU_FTR_PKEY))
+ mtspr(SPRN_UAMOR, default_uamor);
+#endif
}
#endif /* CONFIG_SMP */
@@ -1103,25 +1314,25 @@ void hash__early_init_mmu_secondary(void)
*/
unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
{
- struct page *page;
+ struct folio *folio;
if (!pfn_valid(pte_pfn(pte)))
return pp;
- page = pte_page(pte);
+ folio = page_folio(pte_page(pte));
/* page is dirty */
- if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
- if (trap == 0x400) {
- flush_dcache_icache_page(page);
- set_bit(PG_arch_1, &page->flags);
+ if (!test_bit(PG_dcache_clean, &folio->flags) &&
+ !folio_test_reserved(folio)) {
+ if (trap == INTERRUPT_INST_STORAGE) {
+ flush_dcache_icache_folio(folio);
+ set_bit(PG_dcache_clean, &folio->flags);
} else
pp |= HPTE_R_N;
}
return pp;
}
-#ifdef CONFIG_PPC_MM_SLICES
static unsigned int get_paca_psize(unsigned long addr)
{
unsigned char *psizes;
@@ -1138,12 +1349,6 @@ static unsigned int get_paca_psize(unsigned long addr)
return (psizes[index >> 1] >> (mask_index * 4)) & 0xF;
}
-#else
-unsigned int get_paca_psize(unsigned long addr)
-{
- return get_paca()->mm_ctx_user_psize;
-}
-#endif
/*
* Demote a segment to using 4k pages.
@@ -1200,7 +1405,7 @@ static int subpage_protection(struct mm_struct *mm, unsigned long ea)
spp >>= 30 - 2 * ((ea >> 12) & 0xf);
/*
- * 0 -> full premission
+ * 0 -> full permission
* 1 -> Read only
* 2 -> no access.
* We return the flag that need to be cleared.
@@ -1256,7 +1461,6 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
unsigned long flags)
{
bool is_thp;
- enum ctx_state prev_state = exception_enter();
pgd_t *pgdir;
unsigned long vsid;
pte_t *ptep;
@@ -1285,12 +1489,14 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
psize = mmu_vmalloc_psize;
ssize = mmu_kernel_ssize;
+ flags |= HPTE_USE_KERNEL_KEY;
break;
case IO_REGION_ID:
vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
psize = mmu_io_psize;
ssize = mmu_kernel_ssize;
+ flags |= HPTE_USE_KERNEL_KEY;
break;
default:
/*
@@ -1340,8 +1546,15 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
goto bail;
}
- /* Add _PAGE_PRESENT to the required access perm */
- access |= _PAGE_PRESENT;
+ /*
+ * Add _PAGE_PRESENT to the required access perm. If there are parallel
+ * updates to the pte that can possibly clear _PAGE_PTE, catch that too.
+ *
+ * We can safely use the return pte address in rest of the function
+ * because we do set H_PAGE_BUSY which prevents further updates to pte
+ * from generic code.
+ */
+ access |= _PAGE_PRESENT | _PAGE_PTE;
/*
* Pre-check access permissions (will be re-checked atomically
@@ -1449,7 +1662,6 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
DBG_LOW(" -> rc=%d\n", rc);
bail:
- exception_exit(prev_state);
return rc;
}
EXPORT_SYMBOL_GPL(hash_page_mm);
@@ -1471,16 +1683,26 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap,
}
EXPORT_SYMBOL_GPL(hash_page);
-int __hash_page(unsigned long trap, unsigned long ea, unsigned long dsisr,
- unsigned long msr)
+DEFINE_INTERRUPT_HANDLER(do_hash_fault)
{
+ unsigned long ea = regs->dar;
+ unsigned long dsisr = regs->dsisr;
unsigned long access = _PAGE_PRESENT | _PAGE_READ;
unsigned long flags = 0;
- struct mm_struct *mm = current->mm;
- unsigned int region_id = get_region_id(ea);
+ struct mm_struct *mm;
+ unsigned int region_id;
+ long err;
+ if (unlikely(dsisr & (DSISR_BAD_FAULT_64S | DSISR_KEYFAULT))) {
+ hash__do_page_fault(regs);
+ return;
+ }
+
+ region_id = get_region_id(ea);
if ((region_id == VMALLOC_REGION_ID) || (region_id == IO_REGION_ID))
mm = &init_mm;
+ else
+ mm = current->mm;
if (dsisr & DSISR_NOHPTE)
flags |= HPTE_NOHPTE_UPDATE;
@@ -1496,16 +1718,30 @@ int __hash_page(unsigned long trap, unsigned long ea, unsigned long dsisr,
* 2) user space access kernel space.
*/
access |= _PAGE_PRIVILEGED;
- if ((msr & MSR_PR) || (region_id == USER_REGION_ID))
+ if (user_mode(regs) || (region_id == USER_REGION_ID))
access &= ~_PAGE_PRIVILEGED;
- if (trap == 0x400)
+ if (TRAP(regs) == INTERRUPT_INST_STORAGE)
access |= _PAGE_EXEC;
- return hash_page_mm(mm, ea, access, trap, flags);
+ err = hash_page_mm(mm, ea, access, TRAP(regs), flags);
+ if (unlikely(err < 0)) {
+ // failed to insert a hash PTE due to an hypervisor error
+ if (user_mode(regs)) {
+ if (IS_ENABLED(CONFIG_PPC_SUBPAGE_PROT) && err == -2)
+ _exception(SIGSEGV, regs, SEGV_ACCERR, ea);
+ else
+ _exception(SIGBUS, regs, BUS_ADRERR, ea);
+ } else {
+ bad_page_fault(regs, SIGBUS);
+ }
+ err = 0;
+
+ } else if (err) {
+ hash__do_page_fault(regs);
+ }
}
-#ifdef CONFIG_PPC_MM_SLICES
static bool should_hash_preload(struct mm_struct *mm, unsigned long ea)
{
int psize = get_slice_psize(mm, ea);
@@ -1522,23 +1758,15 @@ static bool should_hash_preload(struct mm_struct *mm, unsigned long ea)
return true;
}
-#else
-static bool should_hash_preload(struct mm_struct *mm, unsigned long ea)
-{
- return true;
-}
-#endif
-static void hash_preload(struct mm_struct *mm, unsigned long ea,
+static void hash_preload(struct mm_struct *mm, pte_t *ptep, unsigned long ea,
bool is_exec, unsigned long trap)
{
- int hugepage_shift;
unsigned long vsid;
pgd_t *pgdir;
- pte_t *ptep;
- unsigned long flags;
int rc, ssize, update_flags = 0;
unsigned long access = _PAGE_PRESENT | _PAGE_READ | (is_exec ? _PAGE_EXEC : 0);
+ unsigned long flags;
BUG_ON(get_region_id(ea) != USER_REGION_ID);
@@ -1558,32 +1786,34 @@ static void hash_preload(struct mm_struct *mm, unsigned long ea,
vsid = get_user_vsid(&mm->context, ea, ssize);
if (!vsid)
return;
- /*
- * Hash doesn't like irqs. Walking linux page table with irq disabled
- * saves us from holding multiple locks.
- */
- local_irq_save(flags);
- /*
- * THP pages use update_mmu_cache_pmd. We don't do
- * hash preload there. Hence can ignore THP here
- */
- ptep = find_current_mm_pte(pgdir, ea, NULL, &hugepage_shift);
- if (!ptep)
- goto out_exit;
-
- WARN_ON(hugepage_shift);
#ifdef CONFIG_PPC_64K_PAGES
/* If either H_PAGE_4K_PFN or cache inhibited is set (and we are on
* a 64K kernel), then we don't preload, hash_page() will take
* care of it once we actually try to access the page.
* That way we don't have to duplicate all of the logic for segment
* page size demotion here
+ * Called with PTL held, hence can be sure the value won't change in
+ * between.
*/
if ((pte_val(*ptep) & H_PAGE_4K_PFN) || pte_ci(*ptep))
- goto out_exit;
+ return;
#endif /* CONFIG_PPC_64K_PAGES */
+ /*
+ * __hash_page_* must run with interrupts off, including PMI interrupts
+ * off, as it sets the H_PAGE_BUSY bit.
+ *
+ * It's otherwise possible for perf interrupts to hit at any time and
+ * may take a hash fault reading the user stack, which could take a
+ * hash miss and deadlock on the same H_PAGE_BUSY bit.
+ *
+ * Interrupts must also be off for the duration of the
+ * mm_is_thread_local test and update, to prevent preempt running the
+ * mm on another CPU (XXX: this may be racy vs kthread_use_mm).
+ */
+ powerpc_local_irq_pmu_save(flags);
+
/* Is that local to this CPU ? */
if (mm_is_thread_local(mm))
update_flags |= HPTE_LOCAL_UPDATE;
@@ -1606,8 +1836,8 @@ static void hash_preload(struct mm_struct *mm, unsigned long ea,
mm_ctx_user_psize(&mm->context),
mm_ctx_user_psize(&mm->context),
pte_val(*ptep));
-out_exit:
- local_irq_restore(flags);
+
+ powerpc_local_irq_pmu_restore(flags);
}
/*
@@ -1618,7 +1848,7 @@ out_exit:
*
* This must always be called with the pte lock held.
*/
-void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
+void __update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
pte_t *ptep)
{
/*
@@ -1628,11 +1858,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
unsigned long trap;
bool is_exec;
- if (radix_enabled()) {
- prefetch((void *)address);
- return;
- }
-
/* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
if (!pte_young(*ptep) || address >= TASK_SIZE)
return;
@@ -1658,32 +1883,8 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
return;
}
- hash_preload(vma->vm_mm, address, is_exec, trap);
-}
-
-#ifdef CONFIG_PPC_MEM_KEYS
-/*
- * Return the protection key associated with the given address and the
- * mm_struct.
- */
-u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address)
-{
- pte_t *ptep;
- u16 pkey = 0;
- unsigned long flags;
-
- if (!mm || !mm->pgd)
- return 0;
-
- local_irq_save(flags);
- ptep = find_linux_pte(mm->pgd, address, NULL, NULL);
- if (ptep)
- pkey = pte_to_pkey_bits(pte_val(READ_ONCE(*ptep)));
- local_irq_restore(flags);
-
- return pkey;
+ hash_preload(vma->vm_mm, ptep, address, is_exec, trap);
}
-#endif /* CONFIG_PPC_MEM_KEYS */
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
static inline void tm_flush_hash_page(int local)
@@ -1726,10 +1927,6 @@ unsigned long pte_get_hash_gslot(unsigned long vpn, unsigned long shift,
return gslot;
}
-/*
- * WARNING: This is called from hash_low_64.S, if you change this prototype,
- * do not forget to update the assembly call site !
- */
void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize,
unsigned long flags)
{
@@ -1824,27 +2021,6 @@ void flush_hash_range(unsigned long number, int local)
}
}
-/*
- * low_hash_fault is called when we the low level hash code failed
- * to instert a PTE due to an hypervisor error
- */
-void low_hash_fault(struct pt_regs *regs, unsigned long address, int rc)
-{
- enum ctx_state prev_state = exception_enter();
-
- if (user_mode(regs)) {
-#ifdef CONFIG_PPC_SUBPAGE_PROT
- if (rc == -2)
- _exception(SIGSEGV, regs, SEGV_ACCERR, address);
- else
-#endif
- _exception(SIGBUS, regs, BUS_ADRERR, address);
- } else
- bad_page_fault(regs, address, SIGBUS);
-
- exception_exit(prev_state);
-}
-
long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
unsigned long pa, unsigned long rflags,
unsigned long vflags, int psize, int ssize)
@@ -1878,13 +2054,78 @@ repeat:
return slot;
}
-#ifdef CONFIG_DEBUG_PAGEALLOC
+void hpt_clear_stress(void)
+{
+ int cpu = raw_smp_processor_id();
+ int g;
+
+ for (g = 0; g < stress_nr_groups(); g++) {
+ unsigned long last_group;
+ last_group = stress_hpt_struct[cpu].last_group[g];
+
+ if (last_group != -1UL) {
+ int i;
+ for (i = 0; i < HPTES_PER_GROUP; i++) {
+ if (mmu_hash_ops.hpte_remove(last_group) == -1)
+ break;
+ }
+ stress_hpt_struct[cpu].last_group[g] = -1;
+ }
+ }
+}
+
+void hpt_do_stress(unsigned long ea, unsigned long hpte_group)
+{
+ unsigned long last_group;
+ int cpu = raw_smp_processor_id();
+
+ last_group = stress_hpt_struct[cpu].last_group[stress_nr_groups() - 1];
+ if (hpte_group == last_group)
+ return;
+
+ if (last_group != -1UL) {
+ int i;
+ /*
+ * Concurrent CPUs might be inserting into this group, so
+ * give up after a number of iterations, to prevent a live
+ * lock.
+ */
+ for (i = 0; i < HPTES_PER_GROUP; i++) {
+ if (mmu_hash_ops.hpte_remove(last_group) == -1)
+ break;
+ }
+ stress_hpt_struct[cpu].last_group[stress_nr_groups() - 1] = -1;
+ }
+
+ if (ea >= PAGE_OFFSET) {
+ /*
+ * We would really like to prefetch to get the TLB loaded, then
+ * remove the PTE before returning from fault interrupt, to
+ * increase the hash fault rate.
+ *
+ * Unfortunately QEMU TCG does not model the TLB in a way that
+ * makes this possible, and systemsim (mambo) emulator does not
+ * bring in TLBs with prefetches (although loads/stores do
+ * work for non-CI PTEs).
+ *
+ * So remember this PTE and clear it on the next hash fault.
+ */
+ memmove(&stress_hpt_struct[cpu].last_group[1],
+ &stress_hpt_struct[cpu].last_group[0],
+ (stress_nr_groups() - 1) * sizeof(unsigned long));
+ stress_hpt_struct[cpu].last_group[0] = hpte_group;
+ }
+}
+
+#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE)
+static DEFINE_RAW_SPINLOCK(linear_map_hash_lock);
+
static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
{
unsigned long hash;
unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
- unsigned long mode = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL));
+ unsigned long mode = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL), HPTE_USE_KERNEL_KEY);
long ret;
hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
@@ -1893,15 +2134,18 @@ static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
if (!vsid)
return;
+ if (linear_map_hash_slots[lmi] & 0x80)
+ return;
+
ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode,
HPTE_V_BOLTED,
mmu_linear_psize, mmu_kernel_ssize);
BUG_ON (ret < 0);
- spin_lock(&linear_map_hash_lock);
+ raw_spin_lock(&linear_map_hash_lock);
BUG_ON(linear_map_hash_slots[lmi] & 0x80);
linear_map_hash_slots[lmi] = ret | 0x80;
- spin_unlock(&linear_map_hash_lock);
+ raw_spin_unlock(&linear_map_hash_lock);
}
static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
@@ -1911,11 +2155,14 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
- spin_lock(&linear_map_hash_lock);
- BUG_ON(!(linear_map_hash_slots[lmi] & 0x80));
+ raw_spin_lock(&linear_map_hash_lock);
+ if (!(linear_map_hash_slots[lmi] & 0x80)) {
+ raw_spin_unlock(&linear_map_hash_lock);
+ return;
+ }
hidx = linear_map_hash_slots[lmi] & 0x7f;
linear_map_hash_slots[lmi] = 0;
- spin_unlock(&linear_map_hash_lock);
+ raw_spin_unlock(&linear_map_hash_lock);
if (hidx & _PTEIDX_SECONDARY)
hash = ~hash;
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
@@ -1925,7 +2172,7 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
mmu_kernel_ssize, 0);
}
-void __kernel_map_pages(struct page *page, int numpages, int enable)
+void hash__kernel_map_pages(struct page *page, int numpages, int enable)
{
unsigned long flags, vaddr, lmi;
int i;
@@ -1943,7 +2190,7 @@ void __kernel_map_pages(struct page *page, int numpages, int enable)
}
local_irq_restore(flags);
}
-#endif /* CONFIG_DEBUG_PAGEALLOC */
+#endif /* CONFIG_DEBUG_PAGEALLOC || CONFIG_KFENCE */
void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base,
phys_addr_t first_memblock_size)
@@ -2009,11 +2256,8 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n")
static int __init hash64_debugfs(void)
{
- if (!debugfs_create_file_unsafe("hpt_order", 0600, powerpc_debugfs_root,
- NULL, &fops_hpt_order)) {
- pr_err("lpar: unable to create hpt_order debugsfs file\n");
- }
-
+ debugfs_create_file("hpt_order", 0600, arch_debugfs_dir, NULL,
+ &fops_hpt_order);
return 0;
}
machine_device_initcall(pseries, hash64_debugfs);
@@ -2026,3 +2270,20 @@ void __init print_system_hash_info(void)
if (htab_hash_mask)
pr_info("htab_hash_mask = 0x%lx\n", htab_hash_mask);
}
+
+unsigned long arch_randomize_brk(struct mm_struct *mm)
+{
+ /*
+ * If we are using 1TB segments and we are allowed to randomise
+ * the heap, we can put it above 1TB so it is backed by a 1TB
+ * segment. Otherwise the heap will be in the bottom 1TB
+ * which always uses 256MB segments and this may result in a
+ * performance penalty.
+ */
+ if (is_32bit_task())
+ return randomize_page(mm->brk, SZ_32M);
+ else if (!radix_enabled() && mmu_highuser_ssize == MMU_SEGSIZE_1T)
+ return randomize_page(max_t(unsigned long, mm->brk, SZ_1T), SZ_1G);
+ else
+ return randomize_page(mm->brk, SZ_1G);
+}