path: root/arch/x86/mm/tlb.c
Diffstat (limited to 'arch/x86/mm/tlb.c')
-rw-r--r--  arch/x86/mm/tlb.c  | 124
1 file changed, 74 insertions(+), 50 deletions(-)
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 5768d386efab..6cf881a942bb 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -11,6 +11,7 @@
#include <linux/sched/smt.h>
#include <linux/task_work.h>
#include <linux/mmu_notifier.h>
+#include <linux/mmu_context.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
@@ -19,6 +20,7 @@
#include <asm/cacheflush.h>
#include <asm/apic.h>
#include <asm/perf_event.h>
+#include <asm/tlb.h>
#include "mm_internal.h"
@@ -85,14 +87,11 @@
*
*/
-/* There are 12 bits of space for ASIDS in CR3 */
-#define CR3_HW_ASID_BITS 12
-
/*
- * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
+ * When enabled, MITIGATION_PAGE_TABLE_ISOLATION consumes a single bit for
* user/kernel switches
*/
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
# define PTI_CONSUMED_PCID_BITS 1
#else
# define PTI_CONSUMED_PCID_BITS 0
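
The renamed option still decides how many of CR3's PCID bits remain for kernel ASIDs. A worked sketch of the arithmetic, assuming the removed CR3_HW_ASID_BITS definition is relocated rather than dropped and that the neighbouring macros keep their usual form (they are not shown in this hunk):

/* Sketch, not part of this patch: how the dynamic ASID space is sized.    */
#define CR3_AVAIL_PCID_BITS	(CR3_HW_ASID_BITS - PTI_CONSUMED_PCID_BITS)
/* PCID 0 is reserved and kern_pcid() maps asid -> asid + 1, so two values */
/* of the hardware space are unusable:                                     */
#define MAX_ASID_AVAILABLE	((1 << CR3_AVAIL_PCID_BITS) - 2)
/* PTI on:  12 - 1 = 11 bits  ->  MAX_ASID_AVAILABLE = 2046                */
/* PTI off: 12 bits           ->  MAX_ASID_AVAILABLE = 4094                */
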
@@ -114,7 +113,7 @@ static inline u16 kern_pcid(u16 asid)
{
VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
/*
* Make sure that the dynamic ASID space does not conflict with the
* bit we are using to switch between user and kernel ASIDs.
@@ -149,7 +148,7 @@ static inline u16 kern_pcid(u16 asid)
static inline u16 user_pcid(u16 asid)
{
u16 ret = kern_pcid(asid);
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
ret |= 1 << X86_CR3_PTI_PCID_USER_BIT;
#endif
return ret;
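
A small worked example of the kernel/user PCID pairing above, assuming kern_pcid() keeps its usual asid + 1 mapping and X86_CR3_PTI_PCID_USER_BIT is bit 11 (neither is visible in this hunk):

/* Sketch: with PTI enabled, one ASID yields two hardware PCIDs.   */
u16 asid  = 1;
u16 kpcid = kern_pcid(asid);	/* asid + 1          = 0x002 */
u16 upcid = user_pcid(asid);	/* 0x002 | (1 << 11) = 0x802 */
/* A user-only flush can then target PCID 0x802 without disturbing
 * the kernel translations cached under PCID 0x002. */
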
@@ -160,7 +159,6 @@ static inline unsigned long build_cr3(pgd_t *pgd, u16 asid, unsigned long lam)
unsigned long cr3 = __sme_pa(pgd) | lam;
if (static_cpu_has(X86_FEATURE_PCID)) {
- VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
cr3 |= kern_pcid(asid);
} else {
VM_WARN_ON_ONCE(asid != 0);
@@ -262,7 +260,7 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
static inline void invalidate_user_asid(u16 asid)
{
/* There is no user ASID if address space separation is off */
- if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+ if (!IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION))
return;
/*
@@ -299,7 +297,7 @@ static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, unsigned long lam,
write_cr3(new_mm_cr3);
}
-void leave_mm(int cpu)
+void leave_mm(void)
{
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
@@ -327,7 +325,7 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
unsigned long flags;
local_irq_save(flags);
- switch_mm_irqs_off(prev, next, tsk);
+ switch_mm_irqs_off(NULL, next, tsk);
local_irq_restore(flags);
}
@@ -492,27 +490,24 @@ void cr4_update_pce(void *ignored)
static inline void cr4_update_pce_mm(struct mm_struct *mm) { }
#endif
-void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
+/*
+ * This optimizes when not actually switching mm's. Some architectures use the
+ * 'unused' argument for this optimization, but x86 must use
+ * 'cpu_tlbstate.loaded_mm' instead because it does not always keep
+ * 'current->active_mm' up to date.
+ */
+void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
struct task_struct *tsk)
{
- struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
+ struct mm_struct *prev = this_cpu_read(cpu_tlbstate.loaded_mm);
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
- unsigned long new_lam = mm_lam_cr3_mask(next);
bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy);
unsigned cpu = smp_processor_id();
+ unsigned long new_lam;
u64 next_tlb_gen;
bool need_flush;
u16 new_asid;
- /*
- * NB: The scheduler will call us with prev == next when switching
- * from lazy TLB mode to normal mode if active_mm isn't changing.
- * When this happens, we don't assume that CR3 (and hence
- * cpu_tlbstate.loaded_mm) matches next.
- *
- * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
- */
-
/* We don't want flush_tlb_func() to run concurrently with us. */
if (IS_ENABLED(CONFIG_PROVE_LOCKING))
WARN_ON_ONCE(!irqs_disabled());
@@ -527,7 +522,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* isn't free.
*/
#ifdef CONFIG_DEBUG_VM
- if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid,
+ if (WARN_ON_ONCE(__read_cr3() != build_cr3(prev->pgd, prev_asid,
tlbstate_lam_cr3_mask()))) {
/*
* If we were to BUG here, we'd be very likely to kill
@@ -559,7 +554,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* provides that full memory barrier and core serializing
* instruction.
*/
- if (real_prev == next) {
+ if (prev == next) {
/* Not actually switching mm's */
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
next->context.ctx_id);
@@ -574,7 +569,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* mm_cpumask. The TLB shootdown code can figure out from
* cpu_tlbstate_shared.is_lazy whether or not to send an IPI.
*/
- if (WARN_ON_ONCE(real_prev != &init_mm &&
+ if (IS_ENABLED(CONFIG_DEBUG_VM) && WARN_ON_ONCE(prev != &init_mm &&
!cpumask_test_cpu(cpu, mm_cpumask(next))))
cpumask_set_cpu(cpu, mm_cpumask(next));
@@ -612,20 +607,15 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
cond_mitigation(tsk);
/*
- * Stop remote flushes for the previous mm.
- * Skip kernel threads; we never send init_mm TLB flushing IPIs,
- * but the bitmap manipulation can cause cache line contention.
+ * Leave this CPU in prev's mm_cpumask. Atomic writes to
+ * mm_cpumask can be expensive under contention. The CPU
+ * will be removed lazily at TLB flush time.
*/
- if (real_prev != &init_mm) {
- VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu,
- mm_cpumask(real_prev)));
- cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
- }
+ VM_WARN_ON_ONCE(prev != &init_mm && !cpumask_test_cpu(cpu,
+ mm_cpumask(prev)));
- /*
- * Start remote flushes and then read tlb_gen.
- */
- if (next != &init_mm)
+ /* Start receiving IPIs and then read tlb_gen (and LAM below) */
+ if (next != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next)))
cpumask_set_cpu(cpu, mm_cpumask(next));
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
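
The lazy removal promised by the new comment happens in flush_tlb_func(), which this patch changes in a later hunk; the relevant lines are repeated here only to show how the two sides pair up:

/* From flush_tlb_func() below: a remote flush IPI that finds a different
 * loaded_mm means this CPU was left in f->mm's cpumask by an earlier
 * context switch, so the bit is cleared here instead of at switch time. */
if (f->mm && f->mm != loaded_mm) {
	cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(f->mm));
	trace_tlb_flush(TLB_REMOTE_WRONG_CPU, 0);
	return;
}
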
@@ -636,7 +626,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
barrier();
}
- set_tlbstate_lam_mode(next);
+ new_lam = mm_lam_cr3_mask(next);
if (need_flush) {
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
@@ -655,10 +645,11 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
this_cpu_write(cpu_tlbstate.loaded_mm, next);
this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+ cpu_tlbstate_update_lam(new_lam, mm_untag_mask(next));
- if (next != real_prev) {
+ if (next != prev) {
cr4_update_pce_mm(next);
- switch_ldt(real_prev, next);
+ switch_ldt(prev, next);
}
}
@@ -701,6 +692,7 @@ void initialize_tlbstate_and_flush(void)
int i;
struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm);
u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen);
+ unsigned long lam = mm_lam_cr3_mask(mm);
unsigned long cr3 = __read_cr3();
/* Assert that CR3 already references the right mm. */
@@ -708,7 +700,7 @@ void initialize_tlbstate_and_flush(void)
/* LAM expected to be disabled */
WARN_ON(cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57));
- WARN_ON(mm_lam_cr3_mask(mm));
+ WARN_ON(lam);
/*
* Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization
@@ -727,7 +719,7 @@ void initialize_tlbstate_and_flush(void)
this_cpu_write(cpu_tlbstate.next_asid, 1);
this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);
- set_tlbstate_lam_mode(mm);
+ cpu_tlbstate_update_lam(lam, mm_untag_mask(mm));
for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
@@ -765,10 +757,13 @@ static void flush_tlb_func(void *info)
if (!local) {
inc_irq_stat(irq_tlb_count);
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+ }
- /* Can only happen on remote CPUs */
- if (f->mm && f->mm != loaded_mm)
- return;
+ /* The CPU was left in the mm_cpumask of the target mm. Clear it. */
+ if (f->mm && f->mm != loaded_mm) {
+ cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(f->mm));
+ trace_tlb_flush(TLB_REMOTE_WRONG_CPU, 0);
+ return;
}
if (unlikely(loaded_mm == &init_mm))
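
TLB_REMOTE_WRONG_CPU is not defined in this file; it is presumably added to enum tlb_flush_reason elsewhere in this series so the tracepoint above gets its own reason code. A sketch with the assumed new entry appended to the existing members:

enum tlb_flush_reason {
	TLB_FLUSH_ON_TASK_SWITCH,
	TLB_REMOTE_SHOOTDOWN,
	TLB_LOCAL_SHOOTDOWN,
	TLB_LOCAL_MM_SHOOTDOWN,
	TLB_REMOTE_SEND_IPI,
	TLB_REMOTE_WRONG_CPU,	/* assumed new: IPI hit a CPU no longer running the mm */
	NR_TLB_FLUSH_REASONS,
};
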
@@ -898,9 +893,36 @@ done:
nr_invalidate);
}
-static bool tlb_is_not_lazy(int cpu, void *data)
+static bool should_flush_tlb(int cpu, void *data)
{
- return !per_cpu(cpu_tlbstate_shared.is_lazy, cpu);
+ struct flush_tlb_info *info = data;
+
+ /* Lazy TLB will get flushed at the next context switch. */
+ if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
+ return false;
+
+ /* No mm means kernel memory flush. */
+ if (!info->mm)
+ return true;
+
+ /* The target mm is loaded, and the CPU is not lazy. */
+ if (per_cpu(cpu_tlbstate.loaded_mm, cpu) == info->mm)
+ return true;
+
+ /* In cpumask, but not the loaded mm? Periodically remove by flushing. */
+ if (info->trim_cpumask)
+ return true;
+
+ return false;
+}
+
+static bool should_trim_cpumask(struct mm_struct *mm)
+{
+ if (time_after(jiffies, READ_ONCE(mm->context.next_trim_cpumask))) {
+ WRITE_ONCE(mm->context.next_trim_cpumask, jiffies + HZ);
+ return true;
+ }
+ return false;
}
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared);
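
should_trim_cpumask() rate-limits trimming to roughly once per second per mm through a next_trim_cpumask timestamp that this file only reads and re-arms. A minimal sketch of the companion change the rest of the series presumably makes (field placement and initialization site are assumptions):

/* Assumed addition to x86's mm_context_t in <asm/mmu.h>: */
typedef struct {
	/* ... existing fields ... */
	unsigned long next_trim_cpumask;	/* jiffies of the next allowed cpumask trim */
} mm_context_t;

/* Assumed initialization, e.g. in init_new_context(), so the first trim
 * can happen at most one second after the mm is created:
 *	mm->context.next_trim_cpumask = jiffies + HZ;
 */
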
@@ -934,7 +956,7 @@ STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask,
if (info->freed_tables)
on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true);
else
- on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func,
+ on_each_cpu_cond_mask(should_flush_tlb, flush_tlb_func,
(void *)info, 1, cpumask);
}
@@ -985,6 +1007,7 @@ static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
info->freed_tables = freed_tables;
info->new_tlb_gen = new_tlb_gen;
info->initiating_cpu = smp_processor_id();
+ info->trim_cpumask = 0;
return info;
}
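
info->trim_cpumask is a new member of struct flush_tlb_info whose definition is outside this hunk. A sketch of the structure with the assumed field appended (the other member names follow the fields used elsewhere in this file; ordering and types are assumptions):

struct flush_tlb_info {
	struct mm_struct	*mm;
	unsigned long		start;
	unsigned long		end;
	u64			new_tlb_gen;
	unsigned int		initiating_cpu;
	u8			stride_shift;
	u8			freed_tables;
	u8			trim_cpumask;	/* assumed new: let remote CPUs drop stale cpumask bits */
};
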
@@ -1027,6 +1050,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
* flush_tlb_func_local() directly in this case.
*/
if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
+ info->trim_cpumask = should_trim_cpumask(mm);
flush_tlb_multi(mm_cpumask(mm), info);
} else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
lockdep_assert_irqs_enabled();
@@ -1146,7 +1170,7 @@ STATIC_NOPV void native_flush_tlb_one_user(unsigned long addr)
bool cpu_pcide;
/* Flush 'addr' from the kernel PCID: */
- asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+ invlpg(addr);
/* If PTI is off there is no user PCID and nothing to flush. */
if (!static_cpu_has(X86_FEATURE_PTI))
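
The invlpg() helper replacing the open-coded asm is presumably provided by the newly included <asm/tlb.h>; judging from the line removed here, it would wrap the same instruction:

/* Sketch of the assumed helper, equivalent to the removed inline asm. */
static inline void invlpg(unsigned long addr)
{
	asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
}
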