diff options
Diffstat (limited to 'arch/x86/mm/tlb.c')
| -rw-r--r-- | arch/x86/mm/tlb.c | 1455 |
1 files changed, 1208 insertions, 247 deletions
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 999d6d8f0bef..f5b93e01e347 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #include <linux/init.h> #include <linux/mm.h> @@ -7,16 +8,34 @@ #include <linux/export.h> #include <linux/cpu.h> #include <linux/debugfs.h> +#include <linux/sched/smt.h> +#include <linux/task_work.h> +#include <linux/mmu_notifier.h> +#include <linux/mmu_context.h> +#include <linux/kvm_types.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> #include <asm/nospec-branch.h> #include <asm/cache.h> +#include <asm/cacheflush.h> #include <asm/apic.h> -#include <asm/uv/uv.h> +#include <asm/msr.h> +#include <asm/perf_event.h> +#include <asm/tlb.h> #include "mm_internal.h" +#ifdef CONFIG_PARAVIRT +# define STATIC_NOPV +#else +# define STATIC_NOPV static +# define __flush_tlb_local native_flush_tlb_local +# define __flush_tlb_global native_flush_tlb_global +# define __flush_tlb_one_user(addr) native_flush_tlb_one_user(addr) +# define __flush_tlb_multi(msk, info) native_flush_tlb_multi(msk, info) +#endif + /* * TLB flushing, formerly SMP-only * c/o Linus Torvalds. @@ -32,10 +51,137 @@ */ /* - * Use bit 0 to mangle the TIF_SPEC_IB state into the mm pointer which is - * stored in cpu_tlb_state.last_user_mm_ibpb. + * Bits to mangle the TIF_SPEC_* state into the mm pointer which is + * stored in cpu_tlb_state.last_user_mm_spec. */ #define LAST_USER_MM_IBPB 0x1UL +#define LAST_USER_MM_L1D_FLUSH 0x2UL +#define LAST_USER_MM_SPEC_MASK (LAST_USER_MM_IBPB | LAST_USER_MM_L1D_FLUSH) + +/* Bits to set when tlbstate and flush is (re)initialized */ +#define LAST_USER_MM_INIT LAST_USER_MM_IBPB + +/* + * The x86 feature is called PCID (Process Context IDentifier). It is similar + * to what is traditionally called ASID on the RISC processors. + * + * We don't use the traditional ASID implementation, where each process/mm gets + * its own ASID and flush/restart when we run out of ASID space. 
+ * + * Instead we have a small per-cpu array of ASIDs and cache the last few mm's + * that came by on this CPU, allowing cheaper switch_mm between processes on + * this CPU. + * + * We end up with different spaces for different things. To avoid confusion we + * use different names for each of them: + * + * ASID - [0, TLB_NR_DYN_ASIDS-1] + * the canonical identifier for an mm, dynamically allocated on each CPU + * [TLB_NR_DYN_ASIDS, MAX_ASID_AVAILABLE-1] + * the canonical, global identifier for an mm, identical across all CPUs + * + * kPCID - [1, MAX_ASID_AVAILABLE] + * the value we write into the PCID part of CR3; corresponds to the + * ASID+1, because PCID 0 is special. + * + * uPCID - [2048 + 1, 2048 + MAX_ASID_AVAILABLE] + * for KPTI each mm has two address spaces and thus needs two + * PCID values, but we can still do with a single ASID denomination + * for each mm. Corresponds to kPCID + 2048. + * + */ + +/* + * When enabled, MITIGATION_PAGE_TABLE_ISOLATION consumes a single bit for + * user/kernel switches + */ +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION +# define PTI_CONSUMED_PCID_BITS 1 +#else +# define PTI_CONSUMED_PCID_BITS 0 +#endif + +#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS) + +/* + * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account + * for them being zero-based. Another -1 is because PCID 0 is reserved for + * use by non-PCID-aware users. + */ +#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2) + +/* + * Given @asid, compute kPCID + */ +static inline u16 kern_pcid(u16 asid) +{ + VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); + +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION + /* + * Make sure that the dynamic ASID space does not conflict with the + * bit we are using to switch between user and kernel ASIDs. 
+ */ + BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT)); + + /* + * The ASID being passed in here should have respected the + * MAX_ASID_AVAILABLE and thus never have the switch bit set. + */ + VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT)); +#endif + /* + * The dynamically-assigned ASIDs that get passed in are small + * (<TLB_NR_DYN_ASIDS). They never have the high switch bit set, + * so do not bother to clear it. + * + * If PCID is on, ASID-aware code paths put the ASID+1 into the + * PCID bits. This serves two purposes. It prevents a nasty + * situation in which PCID-unaware code saves CR3, loads some other + * value (with PCID == 0), and then restores CR3, thus corrupting + * the TLB for ASID 0 if the saved ASID was nonzero. It also means + * that any bugs involving loading a PCID-enabled CR3 with + * CR4.PCIDE off will trigger deterministically. + */ + return asid + 1; +} + +/* + * Given @asid, compute uPCID + */ +static inline u16 user_pcid(u16 asid) +{ + u16 ret = kern_pcid(asid); +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION + ret |= 1 << X86_CR3_PTI_PCID_USER_BIT; +#endif + return ret; +} + +static inline unsigned long build_cr3(pgd_t *pgd, u16 asid, unsigned long lam) +{ + unsigned long cr3 = __sme_pa(pgd) | lam; + + if (static_cpu_has(X86_FEATURE_PCID)) { + cr3 |= kern_pcid(asid); + } else { + VM_WARN_ON_ONCE(asid != 0); + } + + return cr3; +} + +static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid, + unsigned long lam) +{ + /* + * Use boot_cpu_has() instead of this_cpu_has() as this function + * might be called during early boot. 
This should work even after + * boot because all CPUs have the same capabilities: + */ + VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID)); + return build_cr3(pgd, asid, lam) | CR3_NOFLUSH; +} /* * We get here when we do something requiring a TLB invalidation @@ -71,16 +217,34 @@ static void clear_asid_other(void) atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); +struct new_asid { + unsigned int asid : 16; + unsigned int need_flush : 1; +}; -static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, - u16 *new_asid, bool *need_flush) +static struct new_asid choose_new_asid(struct mm_struct *next, u64 next_tlb_gen) { + struct new_asid ns; u16 asid; if (!static_cpu_has(X86_FEATURE_PCID)) { - *new_asid = 0; - *need_flush = true; - return; + ns.asid = 0; + ns.need_flush = 1; + return ns; + } + + /* + * TLB consistency for global ASIDs is maintained with hardware assisted + * remote TLB flushing. Global ASIDs are always up to date. + */ + if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) { + u16 global_asid = mm_global_asid(next); + + if (global_asid) { + ns.asid = global_asid; + ns.need_flush = 0; + return ns; + } } if (this_cpu_read(cpu_tlbstate.invalidate_other)) @@ -91,33 +255,323 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, next->context.ctx_id) continue; - *new_asid = asid; - *need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) < - next_tlb_gen); - return; + ns.asid = asid; + ns.need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) < next_tlb_gen); + return ns; } /* * We don't currently own an ASID slot on this CPU. * Allocate a slot.
*/ - *new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1; - if (*new_asid >= TLB_NR_DYN_ASIDS) { - *new_asid = 0; + ns.asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1; + if (ns.asid >= TLB_NR_DYN_ASIDS) { + ns.asid = 0; this_cpu_write(cpu_tlbstate.next_asid, 1); } - *need_flush = true; + ns.need_flush = true; + + return ns; +} + +/* + * Global ASIDs are allocated for multi-threaded processes that are + * active on multiple CPUs simultaneously, giving each of those + * processes the same PCID on every CPU, for use with hardware-assisted + * TLB shootdown on remote CPUs, like AMD INVLPGB or Intel RAR. + * + * These global ASIDs are held for the lifetime of the process. + */ +static DEFINE_RAW_SPINLOCK(global_asid_lock); +static u16 last_global_asid = MAX_ASID_AVAILABLE; +static DECLARE_BITMAP(global_asid_used, MAX_ASID_AVAILABLE); +static DECLARE_BITMAP(global_asid_freed, MAX_ASID_AVAILABLE); +static int global_asid_available = MAX_ASID_AVAILABLE - TLB_NR_DYN_ASIDS - 1; + +/* + * When the search for a free ASID in the global ASID space reaches + * MAX_ASID_AVAILABLE, a global TLB flush guarantees that previously + * freed global ASIDs are safe to re-use. + * + * This way the global flush only needs to happen at ASID rollover + * time, and not at ASID allocation time. + */ +static void reset_global_asid_space(void) +{ + lockdep_assert_held(&global_asid_lock); + + invlpgb_flush_all_nonglobals(); + + /* + * The TLB flush above makes it safe to re-use the previously + * freed global ASIDs. + */ + bitmap_andnot(global_asid_used, global_asid_used, + global_asid_freed, MAX_ASID_AVAILABLE); + bitmap_clear(global_asid_freed, 0, MAX_ASID_AVAILABLE); + + /* Restart the search from the start of global ASID space. 
*/ + last_global_asid = TLB_NR_DYN_ASIDS; +} + +static u16 allocate_global_asid(void) +{ + u16 asid; + + lockdep_assert_held(&global_asid_lock); + + /* The previous allocation hit the edge of available address space */ + if (last_global_asid >= MAX_ASID_AVAILABLE - 1) + reset_global_asid_space(); + + asid = find_next_zero_bit(global_asid_used, MAX_ASID_AVAILABLE, last_global_asid); + + if (asid >= MAX_ASID_AVAILABLE && !global_asid_available) { + /* This should never happen. */ + VM_WARN_ONCE(1, "Unable to allocate global ASID despite %d available\n", + global_asid_available); + return 0; + } + + /* Claim this global ASID. */ + __set_bit(asid, global_asid_used); + last_global_asid = asid; + global_asid_available--; + return asid; +} + +/* + * Check whether a process is currently active on more than @threshold CPUs. + * This is a cheap estimation on whether or not it may make sense to assign + * a global ASID to this process, and use broadcast TLB invalidation. + */ +static bool mm_active_cpus_exceeds(struct mm_struct *mm, int threshold) +{ + int count = 0; + int cpu; + + /* This quick check should eliminate most single threaded programs. */ + if (cpumask_weight(mm_cpumask(mm)) <= threshold) + return false; + + /* Slower check to make sure. */ + for_each_cpu(cpu, mm_cpumask(mm)) { + /* Skip the CPUs that aren't really running this process. */ + if (per_cpu(cpu_tlbstate.loaded_mm, cpu) != mm) + continue; + + if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu)) + continue; + + if (++count > threshold) + return true; + } + return false; } -static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush) +/* + * Assign a global ASID to the current process, protecting against + * races between multiple threads in the process. + */ +static void use_global_asid(struct mm_struct *mm) +{ + u16 asid; + + guard(raw_spinlock_irqsave)(&global_asid_lock); + + /* This process is already using broadcast TLB invalidation. 
*/ + if (mm_global_asid(mm)) + return; + + /* + * The last global ASID was consumed while waiting for the lock. + * + * If this fires, a more aggressive ASID reuse scheme might be + * needed. + */ + if (!global_asid_available) { + VM_WARN_ONCE(1, "Ran out of global ASIDs\n"); + return; + } + + asid = allocate_global_asid(); + if (!asid) + return; + + mm_assign_global_asid(mm, asid); +} + +void mm_free_global_asid(struct mm_struct *mm) +{ + if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) + return; + + if (!mm_global_asid(mm)) + return; + + guard(raw_spinlock_irqsave)(&global_asid_lock); + + /* The global ASID can be re-used only after flush at wrap-around. */ +#ifdef CONFIG_BROADCAST_TLB_FLUSH + __set_bit(mm->context.global_asid, global_asid_freed); + + mm->context.global_asid = 0; + global_asid_available++; +#endif +} + +/* + * Is the mm transitioning from a CPU-local ASID to a global ASID? + */ +static bool mm_needs_global_asid(struct mm_struct *mm, u16 asid) +{ + u16 global_asid = mm_global_asid(mm); + + if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) + return false; + + /* Process is transitioning to a global ASID */ + if (global_asid && asid != global_asid) + return true; + + return false; +} + +/* + * x86 has 4k ASIDs (2k when compiled with KPTI), but the largest x86 + * systems have over 8k CPUs. Because of this potential ASID shortage, + * global ASIDs are handed out to processes that have frequent TLB + * flushes and are active on 4 or more CPUs simultaneously. + */ +static void consider_global_asid(struct mm_struct *mm) +{ + if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) + return; + + /* Check every once in a while. */ + if ((current->pid & 0x1f) != (jiffies & 0x1f)) + return; + + /* + * Assign a global ASID if the process is active on + * 4 or more CPUs simultaneously. 
+ */ + if (mm_active_cpus_exceeds(mm, 3)) + use_global_asid(mm); +} + +static void finish_asid_transition(struct flush_tlb_info *info) +{ + struct mm_struct *mm = info->mm; + int bc_asid = mm_global_asid(mm); + int cpu; + + if (!mm_in_asid_transition(mm)) + return; + + for_each_cpu(cpu, mm_cpumask(mm)) { + /* + * The remote CPU is context switching. Wait for that to + * finish, to catch the unlikely case of it switching to + * the target mm with an out of date ASID. + */ + while (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) == LOADED_MM_SWITCHING) + cpu_relax(); + + if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) != mm) + continue; + + /* + * If at least one CPU is not using the global ASID yet, + * send a TLB flush IPI. The IPI should cause stragglers + * to transition soon. + * + * This can race with the CPU switching to another task; + * that results in a (harmless) extra IPI. + */ + if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm_asid, cpu)) != bc_asid) { + flush_tlb_multi(mm_cpumask(info->mm), info); + return; + } + } + + /* All the CPUs running this process are using the global ASID. */ + mm_clear_asid_transition(mm); +} + +static void broadcast_tlb_flush(struct flush_tlb_info *info) +{ + bool pmd = info->stride_shift == PMD_SHIFT; + unsigned long asid = mm_global_asid(info->mm); + unsigned long addr = info->start; + + /* + * TLB flushes with INVLPGB are kicked off asynchronously. + * The inc_mm_tlb_gen() guarantees page table updates are done + * before these TLB flushes happen. + */ + if (info->end == TLB_FLUSH_ALL) { + invlpgb_flush_single_pcid_nosync(kern_pcid(asid)); + /* Do any CPUs supporting INVLPGB need PTI? 
*/ + if (cpu_feature_enabled(X86_FEATURE_PTI)) + invlpgb_flush_single_pcid_nosync(user_pcid(asid)); + } else do { + unsigned long nr = 1; + + if (info->stride_shift <= PMD_SHIFT) { + nr = (info->end - addr) >> info->stride_shift; + nr = clamp_val(nr, 1, invlpgb_count_max); + } + + invlpgb_flush_user_nr_nosync(kern_pcid(asid), addr, nr, pmd); + if (cpu_feature_enabled(X86_FEATURE_PTI)) + invlpgb_flush_user_nr_nosync(user_pcid(asid), addr, nr, pmd); + + addr += nr << info->stride_shift; + } while (addr < info->end); + + finish_asid_transition(info); + + /* Wait for the INVLPGBs kicked off above to finish. */ + __tlbsync(); +} + +/* + * Given an ASID, flush the corresponding user ASID. We can delay this + * until the next time we switch to it. + * + * See SWITCH_TO_USER_CR3. + */ +static inline void invalidate_user_asid(u16 asid) +{ + /* There is no user ASID if address space separation is off */ + if (!IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)) + return; + + /* + * We only have a single ASID if PCID is off and the CR3 + * write will have flushed it. + */ + if (!cpu_feature_enabled(X86_FEATURE_PCID)) + return; + + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + __set_bit(kern_pcid(asid), + (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask)); +} + +static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, unsigned long lam, + bool need_flush) { unsigned long new_mm_cr3; if (need_flush) { invalidate_user_asid(new_asid); - new_mm_cr3 = build_cr3(pgdir, new_asid); + new_mm_cr3 = build_cr3(pgdir, new_asid, lam); } else { - new_mm_cr3 = build_cr3_noflush(pgdir, new_asid); + new_mm_cr3 = build_cr3_noflush(pgdir, new_asid, lam); } /* @@ -128,7 +582,7 @@ static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush) write_cr3(new_mm_cr3); } -void leave_mm(int cpu) +void leave_mm(void) { struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); @@ -144,7 +598,7 @@ void leave_mm(int cpu) return; /* Warn if we're not lazy. 
*/ - WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy)); + WARN_ON(!this_cpu_read(cpu_tlbstate_shared.is_lazy)); switch_mm(NULL, &init_mm, NULL); } @@ -156,52 +610,74 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, unsigned long flags; local_irq_save(flags); - switch_mm_irqs_off(prev, next, tsk); + switch_mm_irqs_off(NULL, next, tsk); local_irq_restore(flags); } -static void sync_current_stack_to_mm(struct mm_struct *mm) +/* + * Invoked from return to user/guest by a task that opted-in to L1D + * flushing but ended up running on an SMT enabled core due to wrong + * affinity settings or CPU hotplug. This is part of the paranoid L1D flush + * contract which this task requested. + */ +static void l1d_flush_force_sigbus(struct callback_head *ch) { - unsigned long sp = current_stack_pointer; - pgd_t *pgd = pgd_offset(mm, sp); - - if (pgtable_l5_enabled()) { - if (unlikely(pgd_none(*pgd))) { - pgd_t *pgd_ref = pgd_offset_k(sp); + force_sig(SIGBUS); +} - set_pgd(pgd, *pgd_ref); - } - } else { - /* - * "pgd" is faked. The top level entries are "p4d"s, so sync - * the p4d. This compiles to approximately the same code as - * the 5-level case. - */ - p4d_t *p4d = p4d_offset(pgd, sp); +static void l1d_flush_evaluate(unsigned long prev_mm, unsigned long next_mm, + struct task_struct *next) +{ + /* Flush L1D if the outgoing task requests it */ + if (prev_mm & LAST_USER_MM_L1D_FLUSH) + wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH); - if (unlikely(p4d_none(*p4d))) { - pgd_t *pgd_ref = pgd_offset_k(sp); - p4d_t *p4d_ref = p4d_offset(pgd_ref, sp); + /* Check whether the incoming task opted in for L1D flush */ + if (likely(!(next_mm & LAST_USER_MM_L1D_FLUSH))) + return; - set_p4d(p4d, *p4d_ref); - } + /* + * Validate that it is not running on an SMT sibling as this would + * make the exercise pointless because the siblings share L1D. 
If + * it runs on a SMT sibling, notify it with SIGBUS on return to + * user/guest + */ + if (this_cpu_read(cpu_info.smt_active)) { + clear_ti_thread_flag(&next->thread_info, TIF_SPEC_L1D_FLUSH); + next->l1d_flush_kill.func = l1d_flush_force_sigbus; + task_work_add(next, &next->l1d_flush_kill, TWA_RESUME); } } -static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next) +static unsigned long mm_mangle_tif_spec_bits(struct task_struct *next) { - unsigned long next_tif = task_thread_info(next)->flags; - unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB; + unsigned long next_tif = read_task_thread_flags(next); + unsigned long spec_bits = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_SPEC_MASK; - return (unsigned long)next->mm | ibpb; + /* + * Ensure that the bit shift above works as expected and the two flags + * end up in bit 0 and 1. + */ + BUILD_BUG_ON(TIF_SPEC_L1D_FLUSH != TIF_SPEC_IB + 1); + + return (unsigned long)next->mm | spec_bits; } -static void cond_ibpb(struct task_struct *next) +static void cond_mitigation(struct task_struct *next) { + unsigned long prev_mm, next_mm; + if (!next || !next->mm) return; + next_mm = mm_mangle_tif_spec_bits(next); + prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_spec); + /* + * Avoid user->user BTB/RSB poisoning by flushing them when switching + * between processes. This stops one process from doing Spectre-v2 + * attacks on another. + * * Both, the conditional and the always IBPB mode use the mm * pointer to avoid the IBPB when switching between tasks of the * same process. Using the mm pointer instead of mm->context.ctx_id @@ -211,8 +687,6 @@ static void cond_ibpb(struct task_struct *next) * exposed data is not really interesting. 
*/ if (static_branch_likely(&switch_mm_cond_ibpb)) { - unsigned long prev_mm, next_mm; - /* * This is a bit more complex than the always mode because * it has to handle two cases: @@ -242,20 +716,14 @@ static void cond_ibpb(struct task_struct *next) * Optimize this with reasonably small overhead for the * above cases. Mangle the TIF_SPEC_IB bit into the mm * pointer of the incoming task which is stored in - * cpu_tlbstate.last_user_mm_ibpb for comparison. - */ - next_mm = mm_mangle_tif_spec_ib(next); - prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_ibpb); - - /* + * cpu_tlbstate.last_user_mm_spec for comparison. + * * Issue IBPB only if the mm's are different and one or * both have the IBPB bit set. */ if (next_mm != prev_mm && (next_mm | prev_mm) & LAST_USER_MM_IBPB) indirect_branch_prediction_barrier(); - - this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, next_mm); } if (static_branch_unlikely(&switch_mm_always_ibpb)) { @@ -264,34 +732,67 @@ static void cond_ibpb(struct task_struct *next) * different context than the user space task which ran * last on this CPU. */ - if (this_cpu_read(cpu_tlbstate.last_user_mm) != next->mm) { + if ((prev_mm & ~LAST_USER_MM_SPEC_MASK) != (unsigned long)next->mm) indirect_branch_prediction_barrier(); - this_cpu_write(cpu_tlbstate.last_user_mm, next->mm); - } } + + if (static_branch_unlikely(&switch_mm_cond_l1d_flush)) { + /* + * Flush L1D when the outgoing task requested it and/or + * check whether the incoming task requested L1D flushing + * and ended up on an SMT sibling. 
+ */ + if (unlikely((prev_mm | next_mm) & LAST_USER_MM_L1D_FLUSH)) + l1d_flush_evaluate(prev_mm, next_mm, next); + } + + this_cpu_write(cpu_tlbstate.last_user_mm_spec, next_mm); +} + +#ifdef CONFIG_PERF_EVENTS +static inline void cr4_update_pce_mm(struct mm_struct *mm) +{ + if (static_branch_unlikely(&rdpmc_always_available_key) || + (!static_branch_unlikely(&rdpmc_never_available_key) && + atomic_read(&mm->context.perf_rdpmc_allowed))) { + /* + * Clear the existing dirty counters to + * prevent the leak for an RDPMC task. + */ + perf_clear_dirty_counters(); + cr4_set_bits_irqsoff(X86_CR4_PCE); + } else + cr4_clear_bits_irqsoff(X86_CR4_PCE); +} + +void cr4_update_pce(void *ignored) +{ + cr4_update_pce_mm(this_cpu_read(cpu_tlbstate.loaded_mm)); } -void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, +#else +static inline void cr4_update_pce_mm(struct mm_struct *mm) { } +#endif + +/* + * This optimizes when not actually switching mm's. Some architectures use the + * 'unused' argument for this optimization, but x86 must use + * 'cpu_tlbstate.loaded_mm' instead because it does not always keep + * 'current->active_mm' up to date. + */ +void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, struct task_struct *tsk) { - struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); + struct mm_struct *prev = this_cpu_read(cpu_tlbstate.loaded_mm); u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); - bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy); + bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy); unsigned cpu = smp_processor_id(); + unsigned long new_lam; + struct new_asid ns; u64 next_tlb_gen; - bool need_flush; - u16 new_asid; - /* - * NB: The scheduler will call us with prev == next when switching - * from lazy TLB mode to normal mode if active_mm isn't changing. - * When this happens, we don't assume that CR3 (and hence - * cpu_tlbstate.loaded_mm) matches next. 
- * - * NB: leave_mm() calls us with prev == NULL and tsk == NULL. - */ - /* We don't want flush_tlb_func_* to run concurrently with us. */ + /* We don't want flush_tlb_func() to run concurrently with us. */ if (IS_ENABLED(CONFIG_PROVE_LOCKING)) WARN_ON_ONCE(!irqs_disabled()); @@ -305,7 +806,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * isn't free. */ #ifdef CONFIG_DEBUG_VM - if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) { + if (WARN_ON_ONCE(__read_cr3() != build_cr3(prev->pgd, prev_asid, + tlbstate_lam_cr3_mask()))) { /* * If we were to BUG here, we'd be very likely to kill * the system so hard that we don't see the call trace. @@ -321,27 +823,56 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, __flush_tlb_all(); } #endif - this_cpu_write(cpu_tlbstate.is_lazy, false); + if (was_lazy) + this_cpu_write(cpu_tlbstate_shared.is_lazy, false); /* * The membarrier system call requires a full memory barrier and * core serialization before returning to user-space, after - * storing to rq->curr. Writing to CR3 provides that full - * memory barrier and core serializing instruction. + * storing to rq->curr, when changing mm. This is because + * membarrier() sends IPIs to all CPUs that are in the target mm + * to make them issue memory barriers. However, if another CPU + * switches to/from the target mm concurrently with + * membarrier(), it can cause that CPU not to receive an IPI + * when it really should issue a memory barrier. Writing to CR3 + * provides that full memory barrier and core serializing + * instruction. 
*/ - if (real_prev == next) { - VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != + if (prev == next) { + /* Not actually switching mm's */ + VM_WARN_ON(is_dyn_asid(prev_asid) && + this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != next->context.ctx_id); /* + * If this races with another thread that enables lam, 'new_lam' + * might not match tlbstate_lam_cr3_mask(). + */ + + /* * Even in lazy TLB mode, the CPU should stay set in the * mm_cpumask. The TLB shootdown code can figure out from - * from cpu_tlbstate.is_lazy whether or not to send an IPI. + * cpu_tlbstate_shared.is_lazy whether or not to send an IPI. */ - if (WARN_ON_ONCE(real_prev != &init_mm && + if (IS_ENABLED(CONFIG_DEBUG_VM) && + WARN_ON_ONCE(prev != &init_mm && !is_notrack_mm(prev) && !cpumask_test_cpu(cpu, mm_cpumask(next)))) cpumask_set_cpu(cpu, mm_cpumask(next)); + /* Check if the current mm is transitioning to a global ASID */ + if (mm_needs_global_asid(next, prev_asid)) { + next_tlb_gen = atomic64_read(&next->context.tlb_gen); + ns = choose_new_asid(next, next_tlb_gen); + goto reload_tlb; + } + + /* + * Broadcast TLB invalidation keeps this ASID up to date + * all the time. + */ + if (is_global_asid(prev_asid)) + return; + /* * If the CPU is not in lazy TLB mode, we are just switching * from one thread in a process to another thread in the same @@ -366,81 +897,77 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * TLB contents went out of date while we were in lazy * mode. Fall through to the TLB switching code below. */ - new_asid = prev_asid; - need_flush = true; + ns.asid = prev_asid; + ns.need_flush = true; } else { /* - * Avoid user/user BTB poisoning by flushing the branch - * predictor when switching between processes. This stops - * one process from doing Spectre-v2 attacks on another. + * Apply process to process speculation vulnerability + * mitigations if applicable. 
*/ - cond_ibpb(tsk); - - if (IS_ENABLED(CONFIG_VMAP_STACK)) { - /* - * If our current stack is in vmalloc space and isn't - * mapped in the new pgd, we'll double-fault. Forcibly - * map it. - */ - sync_current_stack_to_mm(next); - } + cond_mitigation(tsk); /* - * Stop remote flushes for the previous mm. - * Skip kernel threads; we never send init_mm TLB flushing IPIs, - * but the bitmap manipulation can cause cache line contention. + * Indicate that CR3 is about to change. nmi_uaccess_okay() + * and others are sensitive to the window where mm_cpumask(), + * CR3 and cpu_tlbstate.loaded_mm are not all in sync. */ - if (real_prev != &init_mm) { - VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, - mm_cpumask(real_prev))); - cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); - } + this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); /* - * Start remote flushes and then read tlb_gen. + * Make sure this CPU is set in mm_cpumask() such that we'll + * receive invalidation IPIs. + * + * Rely on the smp_mb() implied by cpumask_set_cpu()'s atomic + * operation, or explicitly provide one. Such that: + * + * switch_mm_irqs_off() flush_tlb_mm_range() + * smp_store_release(loaded_mm, SWITCHING); atomic64_inc_return(tlb_gen) + * smp_mb(); // here // smp_mb() implied + * atomic64_read(tlb_gen); this_cpu_read(loaded_mm); + * + * we properly order against flush_tlb_mm_range(), where the + * loaded_mm load can happen in native_flush_tlb_multi() -> + * should_flush_tlb(). + * + * This way switch_mm() must see the new tlb_gen or + * flush_tlb_mm_range() must see the new loaded_mm, or both. */ - if (next != &init_mm) + if (next != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next))) cpumask_set_cpu(cpu, mm_cpumask(next)); - next_tlb_gen = atomic64_read(&next->context.tlb_gen); + else + smp_mb(); - choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); + next_tlb_gen = atomic64_read(&next->context.tlb_gen); - /* Let nmi_uaccess_okay() know that we're changing CR3.
*/ - this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); - barrier(); + ns = choose_new_asid(next, next_tlb_gen); } - if (need_flush) { - this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); - this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); - load_new_mm_cr3(next->pgd, new_asid, true); +reload_tlb: + new_lam = mm_lam_cr3_mask(next); + if (ns.need_flush) { + VM_WARN_ON_ONCE(is_global_asid(ns.asid)); + this_cpu_write(cpu_tlbstate.ctxs[ns.asid].ctx_id, next->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[ns.asid].tlb_gen, next_tlb_gen); + load_new_mm_cr3(next->pgd, ns.asid, new_lam, true); - /* - * NB: This gets called via leave_mm() in the idle path - * where RCU functions differently. Tracing normally - * uses RCU, so we need to use the _rcuidle variant. - * - * (There is no good reason for this. The idle code should - * be rearranged to call this before rcu_idle_enter().) - */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } else { /* The new ASID is already up to date. */ - load_new_mm_cr3(next->pgd, new_asid, false); + load_new_mm_cr3(next->pgd, ns.asid, new_lam, false); - /* See above wrt _rcuidle. */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0); } /* Make sure we write CR3 before loaded_mm. 
*/ barrier(); this_cpu_write(cpu_tlbstate.loaded_mm, next); - this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); + this_cpu_write(cpu_tlbstate.loaded_mm_asid, ns.asid); + cpu_tlbstate_update_lam(new_lam, mm_untag_mask(next)); - if (next != real_prev) { - load_mm_cr4(next); - switch_ldt(real_prev, next); + if (next != prev) { + cr4_update_pce_mm(next); + switch_ldt(prev, next); } } @@ -462,7 +989,78 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) return; - this_cpu_write(cpu_tlbstate.is_lazy, true); + this_cpu_write(cpu_tlbstate_shared.is_lazy, true); +} + +/* + * Using a temporary mm allows to set temporary mappings that are not accessible + * by other CPUs. Such mappings are needed to perform sensitive memory writes + * that override the kernel memory protections (e.g., W^X), without exposing the + * temporary page-table mappings that are required for these write operations to + * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the + * mapping is torn down. Temporary mms can also be used for EFI runtime service + * calls or similar functionality. + * + * It is illegal to schedule while using a temporary mm -- the context switch + * code is unaware of the temporary mm and does not know how to context switch. + * Use a real (non-temporary) mm in a kernel thread if you need to sleep. + * + * Note: For sensitive memory writes, the temporary mm needs to be used + * exclusively by a single core, and IRQs should be disabled while the + * temporary mm is loaded, thereby preventing interrupt handler bugs from + * overriding the kernel memory protection. 
+ */ +struct mm_struct *use_temporary_mm(struct mm_struct *temp_mm) +{ + struct mm_struct *prev_mm; + + lockdep_assert_preemption_disabled(); + guard(irqsave)(); + + /* + * Make sure not to be in TLB lazy mode, as otherwise we'll end up + * with a stale address space WITHOUT being in lazy mode after + * restoring the previous mm. + */ + if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) + leave_mm(); + + prev_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + switch_mm_irqs_off(NULL, temp_mm, current); + + /* + * If breakpoints are enabled, disable them while the temporary mm is + * used. Userspace might set up watchpoints on addresses that are used + * in the temporary mm, which would lead to wrong signals being sent or + * crashes. + * + * Note that breakpoints are not disabled selectively, which also causes + * kernel breakpoints (e.g., perf's) to be disabled. This might be + * undesirable, but still seems reasonable as the code that runs in the + * temporary mm should be short. + */ + if (hw_breakpoint_active()) + hw_breakpoint_disable(); + + return prev_mm; +} + +void unuse_temporary_mm(struct mm_struct *prev_mm) +{ + lockdep_assert_preemption_disabled(); + guard(irqsave)(); + + /* Clear the cpumask, to indicate no TLB flushing is needed anywhere */ + cpumask_clear_cpu(smp_processor_id(), mm_cpumask(this_cpu_read(cpu_tlbstate.loaded_mm))); + + switch_mm_irqs_off(NULL, prev_mm, current); + + /* + * Restore the breakpoints if they were disabled before the temporary mm + * was loaded. + */ + if (hw_breakpoint_active()) + hw_breakpoint_restore(); } /* @@ -483,11 +1081,16 @@ void initialize_tlbstate_and_flush(void) int i; struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm); u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen); + unsigned long lam = mm_lam_cr3_mask(mm); unsigned long cr3 = __read_cr3(); /* Assert that CR3 already references the right mm. 
*/ WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd)); + /* LAM expected to be disabled */ + WARN_ON(cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57)); + WARN_ON(lam); + /* * Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization * doesn't work like other CR4 bits because it can only be set from @@ -496,29 +1099,29 @@ void initialize_tlbstate_and_flush(void) WARN_ON(boot_cpu_has(X86_FEATURE_PCID) && !(cr4_read_shadow() & X86_CR4_PCIDE)); - /* Force ASID 0 and force a TLB flush. */ - write_cr3(build_cr3(mm->pgd, 0)); + /* Disable LAM, force ASID 0 and force a TLB flush. */ + write_cr3(build_cr3(mm->pgd, 0, 0)); /* Reinitialize tlbstate. */ - this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, LAST_USER_MM_IBPB); + this_cpu_write(cpu_tlbstate.last_user_mm_spec, LAST_USER_MM_INIT); this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0); this_cpu_write(cpu_tlbstate.next_asid, 1); this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id); this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen); + cpu_tlbstate_update_lam(lam, mm_untag_mask(mm)); for (i = 1; i < TLB_NR_DYN_ASIDS; i++) this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0); } /* - * flush_tlb_func_common()'s memory ordering requirement is that any + * flush_tlb_func()'s memory ordering requirement is that any * TLB fills that happen after we flush the TLB are ordered after we * read active_mm's tlb_gen. We don't need any explicit barriers * because all x86 flush operations are serializing and the * atomic64_read operation won't be reordered by the compiler. */ -static void flush_tlb_func_common(const struct flush_tlb_info *f, - bool local, enum tlb_flush_reason reason) +static void flush_tlb_func(void *info) { /* * We have three different tlb_gen values in here. They are: @@ -529,34 +1132,78 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, * - f->new_tlb_gen: the generation that the requester of the flush * wants us to catch up to. 
*/ + const struct flush_tlb_info *f = info; struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); - u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen); - u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen); + u64 local_tlb_gen; + bool local = smp_processor_id() == f->initiating_cpu; + unsigned long nr_invalidate = 0; + u64 mm_tlb_gen; /* This code cannot presently handle being reentered. */ VM_WARN_ON(!irqs_disabled()); + if (!local) { + inc_irq_stat(irq_tlb_count); + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); + } + + /* The CPU was left in the mm_cpumask of the target mm. Clear it. */ + if (f->mm && f->mm != loaded_mm) { + cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(f->mm)); + trace_tlb_flush(TLB_REMOTE_WRONG_CPU, 0); + return; + } + if (unlikely(loaded_mm == &init_mm)) return; + /* Reload the ASID if transitioning into or out of a global ASID */ + if (mm_needs_global_asid(loaded_mm, loaded_mm_asid)) { + switch_mm_irqs_off(NULL, loaded_mm, NULL); + loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + } + + /* Broadcast ASIDs are always kept up to date with INVLPGB. */ + if (is_global_asid(loaded_mm_asid)) + return; + VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) != loaded_mm->context.ctx_id); - if (this_cpu_read(cpu_tlbstate.is_lazy)) { + if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) { /* * We're in lazy mode. We need to at least flush our * paging-structure cache to avoid speculatively reading * garbage into our TLB. Since switching to init_mm is barely * slower than a minimal flush, just switch to init_mm. * - * This should be rare, with native_flush_tlb_others skipping + * This should be rare, with native_flush_tlb_multi() skipping * IPIs to lazy TLB mode CPUs. 
*/ switch_mm_irqs_off(NULL, &init_mm, NULL); return; } + local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen); + + if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID && + f->new_tlb_gen <= local_tlb_gen)) { + /* + * The TLB is already up to date with respect to f->new_tlb_gen. + * While the core might be still behind mm_tlb_gen, checking + * mm_tlb_gen unnecessarily would have negative caching effects + * so avoid it. + */ + return; + } + + /* + * Defer mm_tlb_gen reading as long as possible to avoid cache + * contention. + */ + mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen); + if (unlikely(local_tlb_gen == mm_tlb_gen)) { /* * There's nothing to do: we're already up to date. This can @@ -564,8 +1211,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, * be handled can catch us all the way up, leaving no work for * the second flush. */ - trace_tlb_flush(reason, 0); - return; + goto done; } WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen); @@ -600,7 +1246,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, * 3, we'd break the invariant: we'd update local_tlb_gen above * 1 without the full flush that's needed for tlb_gen 2. * - * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimiation. + * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimization. 
* Partial TLB flushes are not all that much cheaper than full TLB * flushes, so it seems unlikely that it would be a performance win * to do a partial flush if that won't bring our TLB fully up to @@ -612,56 +1258,100 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, f->new_tlb_gen == local_tlb_gen + 1 && f->new_tlb_gen == mm_tlb_gen) { /* Partial flush */ - unsigned long nr_invalidate = (f->end - f->start) >> f->stride_shift; unsigned long addr = f->start; + /* Partial flush cannot have invalid generations */ + VM_WARN_ON(f->new_tlb_gen == TLB_GENERATION_INVALID); + + /* Partial flush must have valid mm */ + VM_WARN_ON(f->mm == NULL); + + nr_invalidate = (f->end - f->start) >> f->stride_shift; + while (addr < f->end) { - __flush_tlb_one_user(addr); + flush_tlb_one_user(addr); addr += 1UL << f->stride_shift; } if (local) count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate); - trace_tlb_flush(reason, nr_invalidate); } else { /* Full flush. */ - local_flush_tlb(); + nr_invalidate = TLB_FLUSH_ALL; + + flush_tlb_local(); if (local) count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - trace_tlb_flush(reason, TLB_FLUSH_ALL); } /* Both paths above update our state to mm_tlb_gen. */ this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen); + + /* Tracing is done in a unified manner to reduce the code size */ +done: + trace_tlb_flush(!local ? TLB_REMOTE_SHOOTDOWN : + (f->mm == NULL) ? TLB_LOCAL_SHOOTDOWN : + TLB_LOCAL_MM_SHOOTDOWN, + nr_invalidate); } -static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason) +static bool should_flush_tlb(int cpu, void *data) { - const struct flush_tlb_info *f = info; + struct mm_struct *loaded_mm = per_cpu(cpu_tlbstate.loaded_mm, cpu); + struct flush_tlb_info *info = data; - flush_tlb_func_common(f, true, reason); -} + /* + * Order the 'loaded_mm' and 'is_lazy' against their + * write ordering in switch_mm_irqs_off(). Ensure + * 'is_lazy' is at least as new as 'loaded_mm'. 
+ */ + smp_rmb(); -static void flush_tlb_func_remote(void *info) -{ - const struct flush_tlb_info *f = info; + /* Lazy TLB will get flushed at the next context switch. */ + if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu)) + return false; - inc_irq_stat(irq_tlb_count); + /* No mm means kernel memory flush. */ + if (!info->mm) + return true; - if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm)) - return; + /* + * While switching, the remote CPU could have state from + * either the prev or next mm. Assume the worst and flush. + */ + if (loaded_mm == LOADED_MM_SWITCHING) + return true; - count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); - flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN); + /* The target mm is loaded, and the CPU is not lazy. */ + if (loaded_mm == info->mm) + return true; + + /* In cpumask, but not the loaded mm? Periodically remove by flushing. */ + if (info->trim_cpumask) + return true; + + return false; } -static bool tlb_is_not_lazy(int cpu, void *data) +static bool should_trim_cpumask(struct mm_struct *mm) { - return !per_cpu(cpu_tlbstate.is_lazy, cpu); + if (time_after(jiffies, READ_ONCE(mm->context.next_trim_cpumask))) { + WRITE_ONCE(mm->context.next_trim_cpumask, jiffies + HZ); + return true; + } + return false; } -void native_flush_tlb_others(const struct cpumask *cpumask, - const struct flush_tlb_info *info) +DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared); +EXPORT_PER_CPU_SYMBOL(cpu_tlbstate_shared); + +STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask, + const struct flush_tlb_info *info) { + /* + * Do accounting and tracing. Note that there are (and have always been) + * cases in which a remote TLB flush will be traced, but eventually + * would not happen. 
+ */ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); if (info->end == TLB_FLUSH_ALL) trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL); @@ -669,32 +1359,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask, trace_tlb_flush(TLB_REMOTE_SEND_IPI, (info->end - info->start) >> PAGE_SHIFT); - if (is_uv_system()) { - /* - * This whole special case is confused. UV has a "Broadcast - * Assist Unit", which seems to be a fancy way to send IPIs. - * Back when x86 used an explicit TLB flush IPI, UV was - * optimized to use its own mechanism. These days, x86 uses - * smp_call_function_many(), but UV still uses a manual IPI, - * and that IPI's action is out of date -- it does a manual - * flush instead of calling flush_tlb_func_remote(). This - * means that the percpu tlb_gen variables won't be updated - * and we'll do pointless flushes on future context switches. - * - * Rather than hooking native_flush_tlb_others() here, I think - * that UV should be updated so that smp_call_function_many(), - * etc, are optimal on UV. - */ - unsigned int cpu; - - cpu = smp_processor_id(); - cpumask = uv_flush_tlb_others(cpumask, info); - if (cpumask) - smp_call_function_many(cpumask, flush_tlb_func_remote, - (void *)info, 1); - return; - } - /* * If no page tables were freed, we can skip sending IPIs to * CPUs in lazy TLB mode. They will flush the CPU themselves @@ -705,16 +1369,21 @@ void native_flush_tlb_others(const struct cpumask *cpumask, * up on the new contents of what used to be page tables, while * doing a speculative memory access. 
*/ - if (info->freed_tables) - smp_call_function_many(cpumask, flush_tlb_func_remote, - (void *)info, 1); + if (info->freed_tables || mm_in_asid_transition(info->mm)) + on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true); else - on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote, - (void *)info, 1, GFP_ATOMIC, cpumask); + on_each_cpu_cond_mask(should_flush_tlb, flush_tlb_func, + (void *)info, 1, cpumask); +} + +void flush_tlb_multi(const struct cpumask *cpumask, + const struct flush_tlb_info *info) +{ + __flush_tlb_multi(cpumask, info); } /* - * See Documentation/x86/tlb.txt for details. We choose 33 + * See Documentation/arch/x86/tlb.rst for details. We choose 33 * because it is large enough to cover the vast majority (at * least 95%) of allocations, and is small enough that we are * confident it will not cause too much overhead. Each single @@ -725,47 +1394,95 @@ void native_flush_tlb_others(const struct cpumask *cpumask, */ unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info); + +#ifdef CONFIG_DEBUG_VM +static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx); +#endif + +static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm, + unsigned long start, unsigned long end, + unsigned int stride_shift, bool freed_tables, + u64 new_tlb_gen) +{ + struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info); + +#ifdef CONFIG_DEBUG_VM + /* + * Ensure that the following code is non-reentrant and flush_tlb_info + * is not overwritten. This means no TLB flushing is initiated by + * interrupt handlers and machine-check exception handlers. + */ + BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1); +#endif + + /* + * If the number of flushes is so large that a full flush + * would be faster, do a full flush. 
+ */ + if ((end - start) >> stride_shift > tlb_single_page_flush_ceiling) { + start = 0; + end = TLB_FLUSH_ALL; + } + + info->start = start; + info->end = end; + info->mm = mm; + info->stride_shift = stride_shift; + info->freed_tables = freed_tables; + info->new_tlb_gen = new_tlb_gen; + info->initiating_cpu = smp_processor_id(); + info->trim_cpumask = 0; + + return info; +} + +static void put_flush_tlb_info(void) +{ +#ifdef CONFIG_DEBUG_VM + /* Complete reentrancy prevention checks */ + barrier(); + this_cpu_dec(flush_tlb_info_idx); +#endif +} + void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned int stride_shift, bool freed_tables) { - int cpu; - - struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = { - .mm = mm, - .stride_shift = stride_shift, - .freed_tables = freed_tables, - }; - - cpu = get_cpu(); + struct flush_tlb_info *info; + int cpu = get_cpu(); + u64 new_tlb_gen; /* This is also a barrier that synchronizes with switch_mm(). */ - info.new_tlb_gen = inc_mm_tlb_gen(mm); + new_tlb_gen = inc_mm_tlb_gen(mm); - /* Should we flush just the requested range? */ - if ((end != TLB_FLUSH_ALL) && - ((end - start) >> stride_shift) <= tlb_single_page_flush_ceiling) { - info.start = start; - info.end = end; - } else { - info.start = 0UL; - info.end = TLB_FLUSH_ALL; - } + info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables, + new_tlb_gen); - if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) { - VM_WARN_ON(irqs_disabled()); + /* + * flush_tlb_multi() is not optimized for the common case in which only + * a local TLB flush is needed. Optimize this use-case by calling + * flush_tlb_func_local() directly in this case. 
+ */ + if (mm_global_asid(mm)) { + broadcast_tlb_flush(info); + } else if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) { + info->trim_cpumask = should_trim_cpumask(mm); + flush_tlb_multi(mm_cpumask(mm), info); + consider_global_asid(mm); + } else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) { + lockdep_assert_irqs_enabled(); local_irq_disable(); - flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN); + flush_tlb_func(info); local_irq_enable(); } - if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) - flush_tlb_others(mm_cpumask(mm), &info); - + put_flush_tlb_info(); put_cpu(); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } - static void do_flush_tlb_all(void *info) { count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); @@ -775,7 +1492,32 @@ static void do_flush_tlb_all(void *info) void flush_tlb_all(void) { count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); - on_each_cpu(do_flush_tlb_all, NULL, 1); + + /* First try (faster) hardware-assisted TLB invalidation. */ + if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) + invlpgb_flush_all(); + else + /* Fall back to the IPI-based invalidation. */ + on_each_cpu(do_flush_tlb_all, NULL, 1); +} + +/* Flush an arbitrarily large range of memory with INVLPGB. */ +static void invlpgb_kernel_range_flush(struct flush_tlb_info *info) +{ + unsigned long addr, nr; + + for (addr = info->start; addr < info->end; addr += nr << PAGE_SHIFT) { + nr = (info->end - addr) >> PAGE_SHIFT; + + /* + * INVLPGB has a limit on the size of ranges it can + * flush. Break up large flushes. 
+ */ + nr = clamp_val(nr, 1, invlpgb_count_max); + + invlpgb_flush_addr_nosync(addr, nr); + } + __tlbsync(); } static void do_kernel_range_flush(void *info) @@ -785,49 +1527,268 @@ static void do_kernel_range_flush(void *info) /* flush range by one by one 'invlpg' */ for (addr = f->start; addr < f->end; addr += PAGE_SIZE) - __flush_tlb_one_kernel(addr); + flush_tlb_one_kernel(addr); +} + +static void kernel_tlb_flush_all(struct flush_tlb_info *info) +{ + if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) + invlpgb_flush_all(); + else + on_each_cpu(do_flush_tlb_all, NULL, 1); +} + +static void kernel_tlb_flush_range(struct flush_tlb_info *info) +{ + if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) + invlpgb_kernel_range_flush(info); + else + on_each_cpu(do_kernel_range_flush, info, 1); } void flush_tlb_kernel_range(unsigned long start, unsigned long end) { + struct flush_tlb_info *info; - /* Balance as user space task's flush, a bit conservative */ - if (end == TLB_FLUSH_ALL || - (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) { - on_each_cpu(do_flush_tlb_all, NULL, 1); + guard(preempt)(); + + info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false, + TLB_GENERATION_INVALID); + + if (info->end == TLB_FLUSH_ALL) + kernel_tlb_flush_all(info); + else + kernel_tlb_flush_range(info); + + put_flush_tlb_info(); +} + +/* + * This can be used from process context to figure out what the value of + * CR3 is without needing to do a (slow) __read_cr3(). + * + * It's intended to be used for code like KVM that sneakily changes CR3 + * and needs to restore it. It needs to be used very carefully. + */ +unsigned long __get_current_cr3_fast(void) +{ + unsigned long cr3 = + build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd, + this_cpu_read(cpu_tlbstate.loaded_mm_asid), + tlbstate_lam_cr3_mask()); + + /* For now, be very restrictive about when this can be called. 
*/ + VM_WARN_ON(in_nmi() || preemptible()); + + VM_BUG_ON(cr3 != __read_cr3()); + return cr3; +} +EXPORT_SYMBOL_FOR_KVM(__get_current_cr3_fast); + +/* + * Flush one page in the kernel mapping + */ +void flush_tlb_one_kernel(unsigned long addr) +{ + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); + + /* + * If PTI is off, then __flush_tlb_one_user() is just INVLPG or its + * paravirt equivalent. Even with PCID, this is sufficient: we only + * use PCID if we also use global PTEs for the kernel mapping, and + * INVLPG flushes global translations across all address spaces. + * + * If PTI is on, then the kernel is mapped with non-global PTEs, and + * __flush_tlb_one_user() will flush the given address for the current + * kernel address space and for its usermode counterpart, but it does + * not flush it for other address spaces. + */ + flush_tlb_one_user(addr); + + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + /* + * See above. We need to propagate the flush to all other address + * spaces. In principle, we only need to propagate it to kernelmode + * address spaces, but the extra bookkeeping we would need is not + * worth it. + */ + this_cpu_write(cpu_tlbstate.invalidate_other, true); +} + +/* + * Flush one page in the user mapping + */ +STATIC_NOPV void native_flush_tlb_one_user(unsigned long addr) +{ + u32 loaded_mm_asid; + bool cpu_pcide; + + /* Flush 'addr' from the kernel PCID: */ + invlpg(addr); + + /* If PTI is off there is no user PCID and nothing to flush. */ + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + cpu_pcide = this_cpu_read(cpu_tlbstate.cr4) & X86_CR4_PCIDE; + + /* + * invpcid_flush_one(pcid>0) will #GP if CR4.PCIDE==0. Check + * 'cpu_pcide' to ensure that *this* CPU will not trigger those + * #GP's even if called before CR4.PCIDE has been initialized. 
+ */ + if (boot_cpu_has(X86_FEATURE_INVPCID) && cpu_pcide) + invpcid_flush_one(user_pcid(loaded_mm_asid), addr); + else + invalidate_user_asid(loaded_mm_asid); +} + +void flush_tlb_one_user(unsigned long addr) +{ + __flush_tlb_one_user(addr); +} + +/* + * Flush everything + */ +STATIC_NOPV void native_flush_tlb_global(void) +{ + unsigned long flags; + + if (static_cpu_has(X86_FEATURE_INVPCID)) { + /* + * Using INVPCID is considerably faster than a pair of writes + * to CR4 sandwiched inside an IRQ flag save/restore. + * + * Note, this works with CR4.PCIDE=0 or 1. + */ + invpcid_flush_all(); + return; + } + + /* + * Read-modify-write to CR4 - protect it from preemption and + * from interrupts. (Use the raw variant because this code can + * be called from deep inside debugging code.) + */ + raw_local_irq_save(flags); + + __native_tlb_flush_global(this_cpu_read(cpu_tlbstate.cr4)); + + raw_local_irq_restore(flags); +} + +/* + * Flush the entire current user mapping + */ +STATIC_NOPV void native_flush_tlb_local(void) +{ + /* + * Preemption or interrupts must be disabled to protect the access + * to the per CPU variable and to prevent being preempted between + * read_cr3() and write_cr3(). + */ + WARN_ON_ONCE(preemptible()); + + invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid)); + + /* If current->mm == NULL then the read_cr3() "borrows" an mm */ + native_write_cr3(__native_read_cr3()); +} + +void flush_tlb_local(void) +{ + __flush_tlb_local(); +} + +/* + * Flush everything + */ +void __flush_tlb_all(void) +{ + /* + * This is to catch users with enabled preemption and the PGE feature + * and don't trigger the warning in __native_flush_tlb(). + */ + VM_WARN_ON_ONCE(preemptible()); + + if (cpu_feature_enabled(X86_FEATURE_PGE)) { + __flush_tlb_global(); } else { - struct flush_tlb_info info; - info.start = start; - info.end = end; - on_each_cpu(do_kernel_range_flush, &info, 1); + /* + * !PGE -> !PCID (setup_pcid()), thus every flush is total. 
*/ + flush_tlb_local(); } } +EXPORT_SYMBOL_FOR_KVM(__flush_tlb_all); void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) { - struct flush_tlb_info info = { - .mm = NULL, - .start = 0UL, - .end = TLB_FLUSH_ALL, - }; + struct flush_tlb_info *info; int cpu = get_cpu(); - if (cpumask_test_cpu(cpu, &batch->cpumask)) { - VM_WARN_ON(irqs_disabled()); + info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false, + TLB_GENERATION_INVALID); + /* + * flush_tlb_multi() is not optimized for the common case in which only + * a local TLB flush is needed. Optimize this use-case by calling + * flush_tlb_func() directly in this case. + */ + if (cpu_feature_enabled(X86_FEATURE_INVLPGB) && batch->unmapped_pages) { + invlpgb_flush_all_nonglobals(); + batch->unmapped_pages = false; + } else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) { + flush_tlb_multi(&batch->cpumask, info); + } else if (cpumask_test_cpu(cpu, &batch->cpumask)) { + lockdep_assert_irqs_enabled(); local_irq_disable(); - flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN); + flush_tlb_func(info); local_irq_enable(); } - if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) - flush_tlb_others(&batch->cpumask, &info); - cpumask_clear(&batch->cpumask); + put_flush_tlb_info(); put_cpu(); } +/* + * Blindly accessing user memory from NMI context can be dangerous + * if we're in the middle of switching the current user task or + * switching the loaded mm. It can also be dangerous if we + * interrupted some kernel code that was temporarily using a + * different mm. + */ +bool nmi_uaccess_okay(void) +{ + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + struct mm_struct *current_mm = current->mm; + + VM_WARN_ON_ONCE(!loaded_mm); + + /* + * The condition we want to check is + * current_mm->pgd == __va(read_cr3_pa()). This may be slow, though, + * if we're running in a VM with shadow paging, and nmi_uaccess_okay() + * is supposed to be reasonably fast. 
+ * + * Instead, we check the almost equivalent but somewhat conservative + * condition below, and we rely on the fact that switch_mm_irqs_off() + * sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3. + */ + if (loaded_mm != current_mm) + return false; + + VM_WARN_ON_ONCE(__pa(current_mm->pgd) != read_cr3_pa()); + + return true; +} + static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { |
