diff options
Diffstat (limited to 'arch/powerpc/mm/book3s64/radix_tlb.c')
-rw-r--r-- | arch/powerpc/mm/book3s64/radix_tlb.c | 812 |
1 files changed, 601 insertions, 211 deletions
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index a95175c0972b..9e1f6558d026 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -10,16 +10,16 @@ #include <linux/memblock.h> #include <linux/mmu_context.h> #include <linux/sched/mm.h> +#include <linux/debugfs.h> #include <asm/ppc-opcode.h> #include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/trace.h> #include <asm/cputhreads.h> +#include <asm/plpar_wrappers.h> -#define RIC_FLUSH_TLB 0 -#define RIC_FLUSH_PWC 1 -#define RIC_FLUSH_ALL 2 +#include "internal.h" /* * tlbiel instruction for radix, set invalidation @@ -55,16 +55,23 @@ static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) if (early_cpu_has_feature(CPU_FTR_HVMODE)) { /* MSR[HV] should flush partition scope translations first. */ tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0); - for (set = 1; set < num_sets; set++) - tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0); + + if (!early_cpu_has_feature(CPU_FTR_ARCH_31)) { + for (set = 1; set < num_sets; set++) + tlbiel_radix_set_isa300(set, is, 0, + RIC_FLUSH_TLB, 0); + } } /* Flush process scoped entries. */ tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1); - for (set = 1; set < num_sets; set++) - tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1); - asm volatile("ptesync": : :"memory"); + if (!early_cpu_has_feature(CPU_FTR_ARCH_31)) { + for (set = 1; set < num_sets; set++) + tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1); + } + + ppc_after_tlbiel_barrier(); } void radix__tlbiel_all(unsigned int action) @@ -244,7 +251,6 @@ static inline void fixup_tlbie_pid(unsigned long pid) } } - static inline void fixup_tlbie_lpid_va(unsigned long va, unsigned long lpid, unsigned long ap) { @@ -281,29 +287,39 @@ static inline void fixup_tlbie_lpid(unsigned long lpid) /* * We use 128 set in radix mode and 256 set in hpt mode. */ -static __always_inline void _tlbiel_pid(unsigned long pid, unsigned long ric) +static inline void _tlbiel_pid(unsigned long pid, unsigned long ric) { int set; asm volatile("ptesync": : :"memory"); - /* - * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL, - * also flush the entire Page Walk Cache. - */ - __tlbiel_pid(pid, 0, ric); + switch (ric) { + case RIC_FLUSH_PWC: - /* For PWC, only one flush is needed */ - if (ric == RIC_FLUSH_PWC) { - asm volatile("ptesync": : :"memory"); + /* For PWC, only one flush is needed */ + __tlbiel_pid(pid, 0, RIC_FLUSH_PWC); + ppc_after_tlbiel_barrier(); return; + case RIC_FLUSH_TLB: + __tlbiel_pid(pid, 0, RIC_FLUSH_TLB); + break; + case RIC_FLUSH_ALL: + default: + /* + * Flush the first set of the TLB, and if + * we're doing a RIC_FLUSH_ALL, also flush + * the entire Page Walk Cache. + */ + __tlbiel_pid(pid, 0, RIC_FLUSH_ALL); } - /* For the remaining sets, just flush the TLB */ - for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++) - __tlbiel_pid(pid, set, RIC_FLUSH_TLB); + if (!cpu_has_feature(CPU_FTR_ARCH_31)) { + /* For the remaining sets, just flush the TLB */ + for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++) + __tlbiel_pid(pid, set, RIC_FLUSH_TLB); + } - asm volatile("ptesync": : :"memory"); + ppc_after_tlbiel_barrier(); asm volatile(PPC_RADIX_INVALIDATE_ERAT_USER "; isync" : : :"memory"); } @@ -313,7 +329,7 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric) /* * Workaround the fact that the "ric" argument to __tlbie_pid - * must be a compile-time contraint to match the "i" constraint + * must be a compile-time constraint to match the "i" constraint * in the asm statement. */ switch (ric) { @@ -430,7 +446,7 @@ static __always_inline void _tlbiel_va(unsigned long va, unsigned long pid, asm volatile("ptesync": : :"memory"); __tlbiel_va(va, pid, ap, ric); - asm volatile("ptesync": : :"memory"); + ppc_after_tlbiel_barrier(); } static inline void _tlbiel_va_range(unsigned long start, unsigned long end, @@ -441,7 +457,7 @@ static inline void _tlbiel_va_range(unsigned long start, unsigned long end, if (also_pwc) __tlbiel_pid(pid, 0, RIC_FLUSH_PWC); __tlbiel_va_range(start, end, pid, page_size, psize); - asm volatile("ptesync": : :"memory"); + ppc_after_tlbiel_barrier(); } static inline void __tlbie_va_range(unsigned long start, unsigned long end, @@ -565,12 +581,13 @@ static inline void _tlbiel_va_range_multicast(struct mm_struct *mm, */ void radix__local_flush_tlb_mm(struct mm_struct *mm) { - unsigned long pid; + unsigned long pid = mm->context.id; + + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) + return; preempt_disable(); - pid = mm->context.id; - if (pid != MMU_NO_CONTEXT) - _tlbiel_pid(pid, RIC_FLUSH_TLB); + _tlbiel_pid(pid, RIC_FLUSH_TLB); preempt_enable(); } EXPORT_SYMBOL(radix__local_flush_tlb_mm); @@ -578,26 +595,33 @@ EXPORT_SYMBOL(radix__local_flush_tlb_mm); #ifndef CONFIG_SMP void radix__local_flush_all_mm(struct mm_struct *mm) { - unsigned long pid; + unsigned long pid = mm->context.id; + + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) + return; preempt_disable(); - pid = mm->context.id; - if (pid != MMU_NO_CONTEXT) - _tlbiel_pid(pid, RIC_FLUSH_ALL); + _tlbiel_pid(pid, RIC_FLUSH_ALL); preempt_enable(); } EXPORT_SYMBOL(radix__local_flush_all_mm); + +static void __flush_all_mm(struct mm_struct *mm, bool fullmm) +{ + radix__local_flush_all_mm(mm); +} #endif /* CONFIG_SMP */ void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, int psize) { - unsigned long pid; + unsigned long pid = mm->context.id; + + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) + return; preempt_disable(); - pid = mm->context.id; - if (pid != MMU_NO_CONTEXT) - _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); + _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); preempt_enable(); } @@ -612,47 +636,86 @@ void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmadd } EXPORT_SYMBOL(radix__local_flush_tlb_page); -static bool mm_is_singlethreaded(struct mm_struct *mm) -{ - if (atomic_read(&mm->context.copros) > 0) - return false; - if (atomic_read(&mm->mm_users) <= 1 && current->mm == mm) - return true; - return false; -} - static bool mm_needs_flush_escalation(struct mm_struct *mm) { /* - * P9 nest MMU has issues with the page walk cache - * caching PTEs and not flushing them properly when - * RIC = 0 for a PID/LPID invalidate + * The P9 nest MMU has issues with the page walk cache caching PTEs + * and not flushing them when RIC = 0 for a PID/LPID invalidate. + * + * This may have been fixed in shipping firmware (by disabling PWC + * or preventing it from caching PTEs), but until that is confirmed, + * this workaround is required - escalate all RIC=0 IS=1/2/3 flushes + * to RIC=2. + * + * POWER10 (and P9P) does not have this problem. */ + if (cpu_has_feature(CPU_FTR_ARCH_31)) + return false; if (atomic_read(&mm->context.copros) > 0) return true; return false; } -#ifdef CONFIG_SMP -static void do_exit_flush_lazy_tlb(void *arg) +/* + * If always_flush is true, then flush even if this CPU can't be removed + * from mm_cpumask. + */ +void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush) { - struct mm_struct *mm = arg; unsigned long pid = mm->context.id; + int cpu = smp_processor_id(); + /* + * A kthread could have done a mmget_not_zero() after the flushing CPU + * checked mm_cpumask, and be in the process of kthread_use_mm when + * interrupted here. In that case, current->mm will be set to mm, + * because kthread_use_mm() setting ->mm and switching to the mm is + * done with interrupts off. + */ if (current->mm == mm) - return; /* Local CPU */ + goto out; if (current->active_mm == mm) { + unsigned long flags; + + WARN_ON_ONCE(current->mm != NULL); /* - * Must be a kernel thread because sender is single-threaded. + * It is a kernel thread and is using mm as the lazy tlb, so + * switch it to init_mm. This is not always called from IPI + * (e.g., flush_type_needed), so must disable irqs. */ - BUG_ON(current->mm); - mmgrab(&init_mm); - switch_mm(mm, &init_mm, current); + local_irq_save(flags); + mmgrab_lazy_tlb(&init_mm); current->active_mm = &init_mm; - mmdrop(mm); + switch_mm_irqs_off(mm, &init_mm, current); + mmdrop_lazy_tlb(mm); + local_irq_restore(flags); } - _tlbiel_pid(pid, RIC_FLUSH_ALL); + + /* + * This IPI may be initiated from any source including those not + * running the mm, so there may be a racing IPI that comes after + * this one which finds the cpumask already clear. Check and avoid + * underflowing the active_cpus count in that case. The race should + * not otherwise be a problem, but the TLB must be flushed because + * that's what the caller expects. + */ + if (cpumask_test_cpu(cpu, mm_cpumask(mm))) { + dec_mm_active_cpus(mm); + cpumask_clear_cpu(cpu, mm_cpumask(mm)); + always_flush = true; + } + +out: + if (always_flush) + _tlbiel_pid(pid, RIC_FLUSH_ALL); +} + +#ifdef CONFIG_SMP +static void do_exit_flush_lazy_tlb(void *arg) +{ + struct mm_struct *mm = arg; + exit_lazy_flush_tlb(mm, true); } static void exit_flush_lazy_tlbs(struct mm_struct *mm) @@ -666,30 +729,136 @@ static void exit_flush_lazy_tlbs(struct mm_struct *mm) */ smp_call_function_many(mm_cpumask(mm), do_exit_flush_lazy_tlb, (void *)mm, 1); - mm_reset_thread_local(mm); } +#else /* CONFIG_SMP */ +static inline void exit_flush_lazy_tlbs(struct mm_struct *mm) { } +#endif /* CONFIG_SMP */ + +static DEFINE_PER_CPU(unsigned int, mm_cpumask_trim_clock); + +/* + * Interval between flushes at which we send out IPIs to check whether the + * mm_cpumask can be trimmed for the case where it's not a single-threaded + * process flushing its own mm. The intent is to reduce the cost of later + * flushes. Don't want this to be so low that it adds noticable cost to TLB + * flushing, or so high that it doesn't help reduce global TLBIEs. + */ +static unsigned long tlb_mm_cpumask_trim_timer = 1073; + +static bool tick_and_test_trim_clock(void) +{ + if (__this_cpu_inc_return(mm_cpumask_trim_clock) == + tlb_mm_cpumask_trim_timer) { + __this_cpu_write(mm_cpumask_trim_clock, 0); + return true; + } + return false; +} + +enum tlb_flush_type { + FLUSH_TYPE_NONE, + FLUSH_TYPE_LOCAL, + FLUSH_TYPE_GLOBAL, +}; + +static enum tlb_flush_type flush_type_needed(struct mm_struct *mm, bool fullmm) +{ + int active_cpus = atomic_read(&mm->context.active_cpus); + int cpu = smp_processor_id(); + + if (active_cpus == 0) + return FLUSH_TYPE_NONE; + if (active_cpus == 1 && cpumask_test_cpu(cpu, mm_cpumask(mm))) { + if (current->mm != mm) { + /* + * Asynchronous flush sources may trim down to nothing + * if the process is not running, so occasionally try + * to trim. + */ + if (tick_and_test_trim_clock()) { + exit_lazy_flush_tlb(mm, true); + return FLUSH_TYPE_NONE; + } + } + return FLUSH_TYPE_LOCAL; + } + + /* Coprocessors require TLBIE to invalidate nMMU. */ + if (atomic_read(&mm->context.copros) > 0) + return FLUSH_TYPE_GLOBAL; + + /* + * In the fullmm case there's no point doing the exit_flush_lazy_tlbs + * because the mm is being taken down anyway, and a TLBIE tends to + * be faster than an IPI+TLBIEL. + */ + if (fullmm) + return FLUSH_TYPE_GLOBAL; + + /* + * If we are running the only thread of a single-threaded process, + * then we should almost always be able to trim off the rest of the + * CPU mask (except in the case of use_mm() races), so always try + * trimming the mask. + */ + if (atomic_read(&mm->mm_users) <= 1 && current->mm == mm) { + exit_flush_lazy_tlbs(mm); + /* + * use_mm() race could prevent IPIs from being able to clear + * the cpumask here, however those users are established + * after our first check (and so after the PTEs are removed), + * and the TLB still gets flushed by the IPI, so this CPU + * will only require a local flush. + */ + return FLUSH_TYPE_LOCAL; + } + + /* + * Occasionally try to trim down the cpumask. It's possible this can + * bring the mask to zero, which results in no flush. + */ + if (tick_and_test_trim_clock()) { + exit_flush_lazy_tlbs(mm); + if (current->mm == mm) + return FLUSH_TYPE_LOCAL; + if (cpumask_test_cpu(cpu, mm_cpumask(mm))) + exit_lazy_flush_tlb(mm, true); + return FLUSH_TYPE_NONE; + } + + return FLUSH_TYPE_GLOBAL; +} + +#ifdef CONFIG_SMP void radix__flush_tlb_mm(struct mm_struct *mm) { unsigned long pid; + enum tlb_flush_type type; pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) return; preempt_disable(); /* - * Order loads of mm_cpumask vs previous stores to clear ptes before - * the invalidate. See barrier in switch_mm_irqs_off + * Order loads of mm_cpumask (in flush_type_needed) vs previous + * stores to clear ptes before the invalidate. See barrier in + * switch_mm_irqs_off */ smp_mb(); - if (!mm_is_thread_local(mm)) { - if (unlikely(mm_is_singlethreaded(mm))) { - exit_flush_lazy_tlbs(mm); - goto local; - } - - if (cputlb_use_tlbie()) { + type = flush_type_needed(mm, false); + if (type == FLUSH_TYPE_LOCAL) { + _tlbiel_pid(pid, RIC_FLUSH_TLB); + } else if (type == FLUSH_TYPE_GLOBAL) { + if (!mmu_has_feature(MMU_FTR_GTSE)) { + unsigned long tgt = H_RPTI_TARGET_CMMU; + + if (atomic_read(&mm->context.copros) > 0) + tgt |= H_RPTI_TARGET_NMMU; + pseries_rpt_invalidate(pid, tgt, H_RPTI_TYPE_TLB, + H_RPTI_PAGE_ALL, 0, -1UL); + } else if (cputlb_use_tlbie()) { if (mm_needs_flush_escalation(mm)) _tlbie_pid(pid, RIC_FLUSH_ALL); else @@ -697,40 +866,43 @@ void radix__flush_tlb_mm(struct mm_struct *mm) } else { _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_TLB); } - } else { -local: - _tlbiel_pid(pid, RIC_FLUSH_TLB); } preempt_enable(); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } EXPORT_SYMBOL(radix__flush_tlb_mm); static void __flush_all_mm(struct mm_struct *mm, bool fullmm) { unsigned long pid; + enum tlb_flush_type type; pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) return; preempt_disable(); smp_mb(); /* see radix__flush_tlb_mm */ - if (!mm_is_thread_local(mm)) { - if (unlikely(mm_is_singlethreaded(mm))) { - if (!fullmm) { - exit_flush_lazy_tlbs(mm); - goto local; - } - } - if (cputlb_use_tlbie()) + type = flush_type_needed(mm, fullmm); + if (type == FLUSH_TYPE_LOCAL) { + _tlbiel_pid(pid, RIC_FLUSH_ALL); + } else if (type == FLUSH_TYPE_GLOBAL) { + if (!mmu_has_feature(MMU_FTR_GTSE)) { + unsigned long tgt = H_RPTI_TARGET_CMMU; + unsigned long type = H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC | + H_RPTI_TYPE_PRT; + + if (atomic_read(&mm->context.copros) > 0) + tgt |= H_RPTI_TARGET_NMMU; + pseries_rpt_invalidate(pid, tgt, type, + H_RPTI_PAGE_ALL, 0, -1UL); + } else if (cputlb_use_tlbie()) _tlbie_pid(pid, RIC_FLUSH_ALL); else _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL); - } else { -local: - _tlbiel_pid(pid, RIC_FLUSH_ALL); } preempt_enable(); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } void radix__flush_all_mm(struct mm_struct *mm) @@ -743,25 +915,34 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, int psize) { unsigned long pid; + enum tlb_flush_type type; pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) return; preempt_disable(); smp_mb(); /* see radix__flush_tlb_mm */ - if (!mm_is_thread_local(mm)) { - if (unlikely(mm_is_singlethreaded(mm))) { - exit_flush_lazy_tlbs(mm); - goto local; - } - if (cputlb_use_tlbie()) + type = flush_type_needed(mm, false); + if (type == FLUSH_TYPE_LOCAL) { + _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); + } else if (type == FLUSH_TYPE_GLOBAL) { + if (!mmu_has_feature(MMU_FTR_GTSE)) { + unsigned long tgt, pg_sizes, size; + + tgt = H_RPTI_TARGET_CMMU; + pg_sizes = psize_to_rpti_pgsize(psize); + size = 1UL << mmu_psize_to_shift(psize); + + if (atomic_read(&mm->context.copros) > 0) + tgt |= H_RPTI_TARGET_NMMU; + pseries_rpt_invalidate(pid, tgt, H_RPTI_TYPE_TLB, + pg_sizes, vmaddr, + vmaddr + size); + } else if (cputlb_use_tlbie()) _tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB); else _tlbiel_va_multicast(mm, vmaddr, pid, psize, RIC_FLUSH_TLB); - } else { -local: - _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); } preempt_enable(); } @@ -776,8 +957,6 @@ void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) } EXPORT_SYMBOL(radix__flush_tlb_page); -#else /* CONFIG_SMP */ -#define radix__flush_all_mm radix__local_flush_all_mm #endif /* CONFIG_SMP */ static void do_tlbiel_kernel(void *info) @@ -805,13 +984,23 @@ static inline void _tlbiel_kernel_broadcast(void) */ void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end) { - if (cputlb_use_tlbie()) + if (!mmu_has_feature(MMU_FTR_GTSE)) { + unsigned long tgt = H_RPTI_TARGET_CMMU | H_RPTI_TARGET_NMMU; + unsigned long type = H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC | + H_RPTI_TYPE_PRT; + + pseries_rpt_invalidate(0, tgt, type, H_RPTI_PAGE_ALL, + start, end); + } else if (cputlb_use_tlbie()) _tlbie_pid(0, RIC_FLUSH_ALL); else _tlbiel_kernel_broadcast(); } EXPORT_SYMBOL(radix__flush_tlb_kernel_range); +/* + * Doesn't appear to be used anywhere. Remove. + */ #define TLB_FLUSH_ALL -1UL /* @@ -823,77 +1012,90 @@ EXPORT_SYMBOL(radix__flush_tlb_kernel_range); * invalidating a full PID, so it has a far lower threshold to change from * individual page flushes to full-pid flushes. */ -static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; -static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2; +static u32 tlb_single_page_flush_ceiling __read_mostly = 33; +static u32 tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2; static inline void __radix__flush_tlb_range(struct mm_struct *mm, unsigned long start, unsigned long end) - { unsigned long pid; unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift; unsigned long page_size = 1UL << page_shift; unsigned long nr_pages = (end - start) >> page_shift; - bool local, full; + bool flush_pid, flush_pwc = false; + enum tlb_flush_type type; pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) return; + WARN_ON_ONCE(end == TLB_FLUSH_ALL); + preempt_disable(); smp_mb(); /* see radix__flush_tlb_mm */ - if (!mm_is_thread_local(mm)) { - if (unlikely(mm_is_singlethreaded(mm))) { - if (end != TLB_FLUSH_ALL) { - exit_flush_lazy_tlbs(mm); - goto is_local; - } - } - local = false; - full = (end == TLB_FLUSH_ALL || - nr_pages > tlb_single_page_flush_ceiling); - } else { -is_local: - local = true; - full = (end == TLB_FLUSH_ALL || - nr_pages > tlb_local_single_page_flush_ceiling); - } + type = flush_type_needed(mm, false); + if (type == FLUSH_TYPE_NONE) + goto out; - if (full) { - if (local) { - _tlbiel_pid(pid, RIC_FLUSH_TLB); + if (type == FLUSH_TYPE_GLOBAL) + flush_pid = nr_pages > tlb_single_page_flush_ceiling; + else + flush_pid = nr_pages > tlb_local_single_page_flush_ceiling; + /* + * full pid flush already does the PWC flush. if it is not full pid + * flush check the range is more than PMD and force a pwc flush + * mremap() depends on this behaviour. + */ + if (!flush_pid && (end - start) >= PMD_SIZE) + flush_pwc = true; + + if (!mmu_has_feature(MMU_FTR_GTSE) && type == FLUSH_TYPE_GLOBAL) { + unsigned long type = H_RPTI_TYPE_TLB; + unsigned long tgt = H_RPTI_TARGET_CMMU; + unsigned long pg_sizes = psize_to_rpti_pgsize(mmu_virtual_psize); + + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) + pg_sizes |= psize_to_rpti_pgsize(MMU_PAGE_2M); + if (atomic_read(&mm->context.copros) > 0) + tgt |= H_RPTI_TARGET_NMMU; + if (flush_pwc) + type |= H_RPTI_TYPE_PWC; + pseries_rpt_invalidate(pid, tgt, type, pg_sizes, start, end); + } else if (flush_pid) { + /* + * We are now flushing a range larger than PMD size force a RIC_FLUSH_ALL + */ + if (type == FLUSH_TYPE_LOCAL) { + _tlbiel_pid(pid, RIC_FLUSH_ALL); } else { if (cputlb_use_tlbie()) { - if (mm_needs_flush_escalation(mm)) - _tlbie_pid(pid, RIC_FLUSH_ALL); - else - _tlbie_pid(pid, RIC_FLUSH_TLB); + _tlbie_pid(pid, RIC_FLUSH_ALL); } else { - _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_TLB); + _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL); } } } else { - bool hflush = false; + bool hflush; unsigned long hstart, hend; - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { - hstart = (start + PMD_SIZE - 1) & PMD_MASK; - hend = end & PMD_MASK; - if (hstart == hend) - hflush = false; - else - hflush = true; - } + hstart = (start + PMD_SIZE - 1) & PMD_MASK; + hend = end & PMD_MASK; + hflush = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hstart < hend; - if (local) { + if (type == FLUSH_TYPE_LOCAL) { asm volatile("ptesync": : :"memory"); + if (flush_pwc) + /* For PWC, only one flush is needed */ + __tlbiel_pid(pid, 0, RIC_FLUSH_PWC); __tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize); if (hflush) __tlbiel_va_range(hstart, hend, pid, PMD_SIZE, MMU_PAGE_2M); - asm volatile("ptesync": : :"memory"); + ppc_after_tlbiel_barrier(); } else if (cputlb_use_tlbie()) { asm volatile("ptesync": : :"memory"); + if (flush_pwc) + __tlbie_pid(pid, RIC_FLUSH_PWC); __tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize); if (hflush) __tlbie_va_range(hstart, hend, pid, @@ -901,13 +1103,15 @@ is_local: asm volatile("eieio; tlbsync; ptesync": : :"memory"); } else { _tlbiel_va_range_multicast(mm, - start, end, pid, page_size, mmu_virtual_psize, false); + start, end, pid, page_size, mmu_virtual_psize, flush_pwc); if (hflush) _tlbiel_va_range_multicast(mm, - hstart, hend, pid, PMD_SIZE, MMU_PAGE_2M, false); + hstart, hend, pid, PMD_SIZE, MMU_PAGE_2M, flush_pwc); } } +out: preempt_enable(); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, @@ -977,9 +1181,6 @@ void radix__flush_all_lpid_guest(unsigned int lpid) _tlbie_lpid_guest(lpid, RIC_FLUSH_ALL); } -static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start, - unsigned long end, int psize); - void radix__tlb_flush(struct mmu_gather *tlb) { int psize = 0; @@ -995,8 +1196,29 @@ void radix__tlb_flush(struct mmu_gather *tlb) * that flushes the process table entry cache upon process teardown. * See the comment for radix in arch_exit_mmap(). */ - if (tlb->fullmm || tlb->need_flush_all) { - __flush_all_mm(mm, true); + if (tlb->fullmm) { + if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) { + /* + * Shootdown based lazy tlb mm refcounting means we + * have to IPI everyone in the mm_cpumask anyway soon + * when the mm goes away, so might as well do it as + * part of the final flush now. + * + * If lazy shootdown was improved to reduce IPIs (e.g., + * by batching), then it may end up being better to use + * tlbies here instead. + */ + preempt_disable(); + + smp_mb(); /* see radix__flush_tlb_mm */ + exit_flush_lazy_tlbs(mm); + __flush_all_mm(mm, true); + + preempt_enable(); + } else { + __flush_all_mm(mm, true); + } + } else if ( (psize = radix_get_mmu_psize(page_size)) == -1) { if (!tlb->freed_tables) radix__flush_tlb_mm(mm); @@ -1010,7 +1232,7 @@ void radix__tlb_flush(struct mmu_gather *tlb) } } -static __always_inline void __radix__flush_tlb_range_psize(struct mm_struct *mm, +static void __radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, unsigned long end, int psize, bool also_pwc) { @@ -1018,33 +1240,38 @@ static __always_inline void __radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned int page_shift = mmu_psize_defs[psize].shift; unsigned long page_size = 1UL << page_shift; unsigned long nr_pages = (end - start) >> page_shift; - bool local, full; + bool flush_pid; + enum tlb_flush_type type; pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) return; + WARN_ON_ONCE(end == TLB_FLUSH_ALL); + preempt_disable(); smp_mb(); /* see radix__flush_tlb_mm */ - if (!mm_is_thread_local(mm)) { - if (unlikely(mm_is_singlethreaded(mm))) { - if (end != TLB_FLUSH_ALL) { - exit_flush_lazy_tlbs(mm); - goto is_local; - } - } - local = false; - full = (end == TLB_FLUSH_ALL || - nr_pages > tlb_single_page_flush_ceiling); - } else { -is_local: - local = true; - full = (end == TLB_FLUSH_ALL || - nr_pages > tlb_local_single_page_flush_ceiling); - } + type = flush_type_needed(mm, false); + if (type == FLUSH_TYPE_NONE) + goto out; - if (full) { - if (local) { + if (type == FLUSH_TYPE_GLOBAL) + flush_pid = nr_pages > tlb_single_page_flush_ceiling; + else + flush_pid = nr_pages > tlb_local_single_page_flush_ceiling; + + if (!mmu_has_feature(MMU_FTR_GTSE) && type == FLUSH_TYPE_GLOBAL) { + unsigned long tgt = H_RPTI_TARGET_CMMU; + unsigned long type = H_RPTI_TYPE_TLB; + unsigned long pg_sizes = psize_to_rpti_pgsize(psize); + + if (also_pwc) + type |= H_RPTI_TYPE_PWC; + if (atomic_read(&mm->context.copros) > 0) + tgt |= H_RPTI_TARGET_NMMU; + pseries_rpt_invalidate(pid, tgt, type, pg_sizes, start, end); + } else if (flush_pid) { + if (type == FLUSH_TYPE_LOCAL) { _tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); } else { if (cputlb_use_tlbie()) { @@ -1060,7 +1287,7 @@ is_local: } } else { - if (local) + if (type == FLUSH_TYPE_LOCAL) _tlbiel_va_range(start, end, pid, page_size, psize, also_pwc); else if (cputlb_use_tlbie()) _tlbie_va_range(start, end, pid, page_size, psize, also_pwc); @@ -1068,7 +1295,9 @@ is_local: _tlbiel_va_range_multicast(mm, start, end, pid, page_size, psize, also_pwc); } +out: preempt_enable(); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, @@ -1077,8 +1306,8 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, return __radix__flush_tlb_range_psize(mm, start, end, psize, false); } -static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start, - unsigned long end, int psize) +void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start, + unsigned long end, int psize) { __radix__flush_tlb_range_psize(mm, start, end, psize, true); } @@ -1087,9 +1316,10 @@ static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) { unsigned long pid, end; + enum tlb_flush_type type; pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) + if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT)) return; /* 4k page size, just blow the world */ @@ -1103,19 +1333,27 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) /* Otherwise first do the PWC, then iterate the pages. */ preempt_disable(); smp_mb(); /* see radix__flush_tlb_mm */ - if (!mm_is_thread_local(mm)) { - if (unlikely(mm_is_singlethreaded(mm))) { - exit_flush_lazy_tlbs(mm); - goto local; - } - if (cputlb_use_tlbie()) + type = flush_type_needed(mm, false); + if (type == FLUSH_TYPE_LOCAL) { + _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); + } else if (type == FLUSH_TYPE_GLOBAL) { + if (!mmu_has_feature(MMU_FTR_GTSE)) { + unsigned long tgt, type, pg_sizes; + + tgt = H_RPTI_TARGET_CMMU; + type = H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC | + H_RPTI_TYPE_PRT; + pg_sizes = psize_to_rpti_pgsize(mmu_virtual_psize); + + if (atomic_read(&mm->context.copros) > 0) + tgt |= H_RPTI_TARGET_NMMU; + pseries_rpt_invalidate(pid, tgt, type, pg_sizes, + addr, end); + } else if (cputlb_use_tlbie()) _tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); else _tlbiel_va_range_multicast(mm, addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); - } else { -local: - _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); } preempt_enable(); @@ -1129,6 +1367,13 @@ void radix__flush_pmd_tlb_range(struct vm_area_struct *vma, } EXPORT_SYMBOL(radix__flush_pmd_tlb_range); +void radix__flush_pud_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_1G); +} +EXPORT_SYMBOL(radix__flush_pud_tlb_range); + void radix__flush_tlb_all(void) { unsigned long rb,prs,r,rs; @@ -1154,44 +1399,189 @@ void radix__flush_tlb_all(void) } #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE -extern void radix_kvm_prefetch_workaround(struct mm_struct *mm) +static __always_inline void __tlbie_pid_lpid(unsigned long pid, + unsigned long lpid, + unsigned long ric) { - unsigned long pid = mm->context.id; + unsigned long rb, rs, prs, r; - if (unlikely(pid == MMU_NO_CONTEXT)) - return; + rb = PPC_BIT(53); /* IS = 1 */ + rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31))); + prs = 1; /* process scoped */ + r = 1; /* radix format */ + + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(0, 0, rb, rs, ric, prs, r); +} + +static __always_inline void __tlbie_va_lpid(unsigned long va, unsigned long pid, + unsigned long lpid, + unsigned long ap, unsigned long ric) +{ + unsigned long rb, rs, prs, r; + rb = va & ~(PPC_BITMASK(52, 63)); + rb |= ap << PPC_BITLSHIFT(58); + rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31))); + prs = 1; /* process scoped */ + r = 1; /* radix format */ + + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(0, 0, rb, rs, ric, prs, r); +} + +static inline void fixup_tlbie_pid_lpid(unsigned long pid, unsigned long lpid) +{ /* - * If this context hasn't run on that CPU before and KVM is - * around, there's a slim chance that the guest on another - * CPU just brought in obsolete translation into the TLB of - * this CPU due to a bad prefetch using the guest PID on - * the way into the hypervisor. - * - * We work around this here. If KVM is possible, we check if - * any sibling thread is in KVM. If it is, the window may exist - * and thus we flush that PID from the core. - * - * A potential future improvement would be to mark which PIDs - * have never been used on the system and avoid it if the PID - * is new and the process has no other cpumask bit set. + * We can use any address for the invalidation, pick one which is + * probably unused as an optimisation. */ - if (cpu_has_feature(CPU_FTR_HVMODE) && radix_enabled()) { - int cpu = smp_processor_id(); - int sib = cpu_first_thread_sibling(cpu); - bool flush = false; - - for (; sib <= cpu_last_thread_sibling(cpu) && !flush; sib++) { - if (sib == cpu) - continue; - if (!cpu_possible(sib)) - continue; - if (paca_ptrs[sib]->kvm_hstate.kvm_vcpu) - flush = true; + unsigned long va = ((1UL << 52) - 1); + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { + asm volatile("ptesync" : : : "memory"); + __tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB); + } + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { + asm volatile("ptesync" : : : "memory"); + __tlbie_va_lpid(va, pid, lpid, mmu_get_ap(MMU_PAGE_64K), + RIC_FLUSH_TLB); + } +} + +static inline void _tlbie_pid_lpid(unsigned long pid, unsigned long lpid, + unsigned long ric) +{ + asm volatile("ptesync" : : : "memory"); + + /* + * Workaround the fact that the "ric" argument to __tlbie_pid + * must be a compile-time contraint to match the "i" constraint + * in the asm statement. + */ + switch (ric) { + case RIC_FLUSH_TLB: + __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB); + fixup_tlbie_pid_lpid(pid, lpid); + break; + case RIC_FLUSH_PWC: + __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC); + break; + case RIC_FLUSH_ALL: + default: + __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_ALL); + fixup_tlbie_pid_lpid(pid, lpid); + } + asm volatile("eieio; tlbsync; ptesync" : : : "memory"); +} + +static inline void fixup_tlbie_va_range_lpid(unsigned long va, + unsigned long pid, + unsigned long lpid, + unsigned long ap) +{ + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { + asm volatile("ptesync" : : : "memory"); + __tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB); + } + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { + asm volatile("ptesync" : : : "memory"); + __tlbie_va_lpid(va, pid, lpid, ap, RIC_FLUSH_TLB); + } +} + +static inline void __tlbie_va_range_lpid(unsigned long start, unsigned long end, + unsigned long pid, unsigned long lpid, + unsigned long page_size, + unsigned long psize) +{ + unsigned long addr; + unsigned long ap = mmu_get_ap(psize); + + for (addr = start; addr < end; addr += page_size) + __tlbie_va_lpid(addr, pid, lpid, ap, RIC_FLUSH_TLB); + + fixup_tlbie_va_range_lpid(addr - page_size, pid, lpid, ap); +} + +static inline void _tlbie_va_range_lpid(unsigned long start, unsigned long end, + unsigned long pid, unsigned long lpid, + unsigned long page_size, + unsigned long psize, bool also_pwc) +{ + asm volatile("ptesync" : : : "memory"); + if (also_pwc) + __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC); + __tlbie_va_range_lpid(start, end, pid, lpid, page_size, psize); + asm volatile("eieio; tlbsync; ptesync" : : : "memory"); +} + +/* + * Performs process-scoped invalidations for a given LPID + * as part of H_RPT_INVALIDATE hcall. + */ +void do_h_rpt_invalidate_prt(unsigned long pid, unsigned long lpid, + unsigned long type, unsigned long pg_sizes, + unsigned long start, unsigned long end) +{ + unsigned long psize, nr_pages; + struct mmu_psize_def *def; + bool flush_pid; + + /* + * A H_RPTI_TYPE_ALL request implies RIC=3, hence + * do a single IS=1 based flush. + */ + if ((type & H_RPTI_TYPE_ALL) == H_RPTI_TYPE_ALL) { + _tlbie_pid_lpid(pid, lpid, RIC_FLUSH_ALL); + return; + } + + if (type & H_RPTI_TYPE_PWC) + _tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC); + + /* Full PID flush */ + if (start == 0 && end == -1) + return _tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB); + + /* Do range invalidation for all the valid page sizes */ + for (psize = 0; psize < MMU_PAGE_COUNT; psize++) { + def = &mmu_psize_defs[psize]; + if (!(pg_sizes & def->h_rpt_pgsize)) + continue; + + nr_pages = (end - start) >> def->shift; + flush_pid = nr_pages > tlb_single_page_flush_ceiling; + + /* + * If the number of pages spanning the range is above + * the ceiling, convert the request into a full PID flush. + * And since PID flush takes out all the page sizes, there + * is no need to consider remaining page sizes. + */ + if (flush_pid) { + _tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB); + return; } - if (flush) - _tlbiel_pid(pid, RIC_FLUSH_ALL); + _tlbie_va_range_lpid(start, end, pid, lpid, + (1UL << def->shift), psize, false); } } -EXPORT_SYMBOL_GPL(radix_kvm_prefetch_workaround); +EXPORT_SYMBOL_GPL(do_h_rpt_invalidate_prt); + #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ + +static int __init create_tlb_single_page_flush_ceiling(void) +{ + debugfs_create_u32("tlb_single_page_flush_ceiling", 0600, + arch_debugfs_dir, &tlb_single_page_flush_ceiling); + debugfs_create_u32("tlb_local_single_page_flush_ceiling", 0600, + arch_debugfs_dir, &tlb_local_single_page_flush_ceiling); + return 0; +} +late_initcall(create_tlb_single_page_flush_ceiling); + |