diff options
Diffstat (limited to 'arch/x86/mm/tlb.c')
-rw-r--r-- | arch/x86/mm/tlb.c | 645 |
1 files changed, 385 insertions, 260 deletions
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 6e7bedf69af7..dbbcfd59726a 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -15,7 +15,7 @@ #include <linux/debugfs.h> /* - * Smarter SMP flushing macros. + * TLB flushing, formerly SMP-only * c/o Linus Torvalds. * * These mean you can really definitely utterly forget about @@ -28,38 +28,62 @@ * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi */ -#ifdef CONFIG_SMP +atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); -struct flush_tlb_info { - struct mm_struct *flush_mm; - unsigned long flush_start; - unsigned long flush_end; -}; - -/* - * We cannot call mmdrop() because we are in interrupt context, - * instead update mm->cpu_vm_mask. - */ -void leave_mm(int cpu) +static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, + u16 *new_asid, bool *need_flush) { - struct mm_struct *active_mm = this_cpu_read(cpu_tlbstate.active_mm); - if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) - BUG(); - if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) { - cpumask_clear_cpu(cpu, mm_cpumask(active_mm)); - load_cr3(swapper_pg_dir); - /* - * This gets called in the idle path where RCU - * functions differently. Tracing normally - * uses RCU, so we have to call the tracepoint - * specially here. - */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + u16 asid; + + if (!static_cpu_has(X86_FEATURE_PCID)) { + *new_asid = 0; + *need_flush = true; + return; } + + for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) { + if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) != + next->context.ctx_id) + continue; + + *new_asid = asid; + *need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) < + next_tlb_gen); + return; + } + + /* + * We don't currently own an ASID slot on this CPU. + * Allocate a slot. + */ + *new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1; + if (*new_asid >= TLB_NR_DYN_ASIDS) { + *new_asid = 0; + this_cpu_write(cpu_tlbstate.next_asid, 1); + } + *need_flush = true; } -EXPORT_SYMBOL_GPL(leave_mm); -#endif /* CONFIG_SMP */ +void leave_mm(int cpu) +{ + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + + /* + * It's plausible that we're in lazy TLB mode while our mm is init_mm. + * If so, our callers still expect us to flush the TLB, but there + * aren't any user TLB entries in init_mm to worry about. + * + * This needs to happen before any other sanity checks due to + * intel_idle's shenanigans. + */ + if (loaded_mm == &init_mm) + return; + + /* Warn if we're not lazy. */ + WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))); + + switch_mm(NULL, &init_mm, NULL); +} void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) @@ -74,217 +98,340 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { + struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); + u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); unsigned cpu = smp_processor_id(); + u64 next_tlb_gen; + + /* + * NB: The scheduler will call us with prev == next when switching + * from lazy TLB mode to normal mode if active_mm isn't changing. + * When this happens, we don't assume that CR3 (and hence + * cpu_tlbstate.loaded_mm) matches next. + * + * NB: leave_mm() calls us with prev == NULL and tsk == NULL. + */ + + /* We don't want flush_tlb_func_* to run concurrently with us. */ + if (IS_ENABLED(CONFIG_PROVE_LOCKING)) + WARN_ON_ONCE(!irqs_disabled()); + + /* + * Verify that CR3 is what we think it is. This will catch + * hypothetical buggy code that directly switches to swapper_pg_dir + * without going through leave_mm() / switch_mm_irqs_off() or that + * does something like write_cr3(read_cr3_pa()). + */ + VM_BUG_ON(__read_cr3() != (__sme_pa(real_prev->pgd) | prev_asid)); + + if (real_prev == next) { + VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != + next->context.ctx_id); + + if (cpumask_test_cpu(cpu, mm_cpumask(next))) { + /* + * There's nothing to do: we weren't lazy, and we + * aren't changing our mm. We don't need to flush + * anything, nor do we need to update CR3, CR4, or + * LDTR. + */ + return; + } + + /* Resume remote flushes and then read tlb_gen. */ + cpumask_set_cpu(cpu, mm_cpumask(next)); + next_tlb_gen = atomic64_read(&next->context.tlb_gen); + + if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) < + next_tlb_gen) { + /* + * Ideally, we'd have a flush_tlb() variant that + * takes the known CR3 value as input. This would + * be faster on Xen PV and on hypothetical CPUs + * on which INVPCID is fast. + */ + this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen, + next_tlb_gen); + write_cr3(__sme_pa(next->pgd) | prev_asid); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, + TLB_FLUSH_ALL); + } + + /* + * We just exited lazy mode, which means that CR4 and/or LDTR + * may be stale. (Changes to the required CR4 and LDTR states + * are not reflected in tlb_gen.) + */ + } else { + u16 new_asid; + bool need_flush; - if (likely(prev != next)) { if (IS_ENABLED(CONFIG_VMAP_STACK)) { /* * If our current stack is in vmalloc space and isn't * mapped in the new pgd, we'll double-fault. Forcibly * map it. */ - unsigned int stack_pgd_index = pgd_index(current_stack_pointer()); - - pgd_t *pgd = next->pgd + stack_pgd_index; + unsigned int index = pgd_index(current_stack_pointer()); + pgd_t *pgd = next->pgd + index; if (unlikely(pgd_none(*pgd))) - set_pgd(pgd, init_mm.pgd[stack_pgd_index]); + set_pgd(pgd, init_mm.pgd[index]); } -#ifdef CONFIG_SMP - this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); - this_cpu_write(cpu_tlbstate.active_mm, next); -#endif + /* Stop remote flushes for the previous mm */ + if (cpumask_test_cpu(cpu, mm_cpumask(real_prev))) + cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); - cpumask_set_cpu(cpu, mm_cpumask(next)); + VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next))); /* - * Re-load page tables. - * - * This logic has an ordering constraint: - * - * CPU 0: Write to a PTE for 'next' - * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI. - * CPU 1: set bit 1 in next's mm_cpumask - * CPU 1: load from the PTE that CPU 0 writes (implicit) - * - * We need to prevent an outcome in which CPU 1 observes - * the new PTE value and CPU 0 observes bit 1 clear in - * mm_cpumask. (If that occurs, then the IPI will never - * be sent, and CPU 0's TLB will contain a stale entry.) - * - * The bad outcome can occur if either CPU's load is - * reordered before that CPU's store, so both CPUs must - * execute full barriers to prevent this from happening. - * - * Thus, switch_mm needs a full barrier between the - * store to mm_cpumask and any operation that could load - * from next->pgd. TLB fills are special and can happen - * due to instruction fetches or for no reason at all, - * and neither LOCK nor MFENCE orders them. - * Fortunately, load_cr3() is serializing and gives the - * ordering guarantee we need. - * + * Start remote flushes and then read tlb_gen. */ - load_cr3(next->pgd); - - trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + cpumask_set_cpu(cpu, mm_cpumask(next)); + next_tlb_gen = atomic64_read(&next->context.tlb_gen); - /* Stop flush ipis for the previous mm */ - cpumask_clear_cpu(cpu, mm_cpumask(prev)); + choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); - /* Load per-mm CR4 state */ - load_mm_cr4(next); + if (need_flush) { + this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); + write_cr3(__sme_pa(next->pgd) | new_asid); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, + TLB_FLUSH_ALL); + } else { + /* The new ASID is already up to date. */ + write_cr3(__sme_pa(next->pgd) | new_asid | CR3_NOFLUSH); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0); + } -#ifdef CONFIG_MODIFY_LDT_SYSCALL - /* - * Load the LDT, if the LDT is different. - * - * It's possible that prev->context.ldt doesn't match - * the LDT register. This can happen if leave_mm(prev) - * was called and then modify_ldt changed - * prev->context.ldt but suppressed an IPI to this CPU. - * In this case, prev->context.ldt != NULL, because we - * never set context.ldt to NULL while the mm still - * exists. That means that next->context.ldt != - * prev->context.ldt, because mms never share an LDT. - */ - if (unlikely(prev->context.ldt != next->context.ldt)) - load_mm_ldt(next); -#endif + this_cpu_write(cpu_tlbstate.loaded_mm, next); + this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); } -#ifdef CONFIG_SMP - else { - this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); - BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next); - - if (!cpumask_test_cpu(cpu, mm_cpumask(next))) { - /* - * On established mms, the mm_cpumask is only changed - * from irq context, from ptep_clear_flush() while in - * lazy tlb mode, and here. Irqs are blocked during - * schedule, protecting us from simultaneous changes. - */ - cpumask_set_cpu(cpu, mm_cpumask(next)); - /* - * We were in lazy tlb mode and leave_mm disabled - * tlb flush IPI delivery. We must reload CR3 - * to make sure to use no freed page tables. - * - * As above, load_cr3() is serializing and orders TLB - * fills with respect to the mm_cpumask write. - */ - load_cr3(next->pgd); - trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); - load_mm_cr4(next); - load_mm_ldt(next); - } - } -#endif + load_mm_cr4(next); + switch_ldt(real_prev, next); } -#ifdef CONFIG_SMP - /* - * The flush IPI assumes that a thread switch happens in this order: - * [cpu0: the cpu that switches] - * 1) switch_mm() either 1a) or 1b) - * 1a) thread switch to a different mm - * 1a1) set cpu_tlbstate to TLBSTATE_OK - * Now the tlb flush NMI handler flush_tlb_func won't call leave_mm - * if cpu0 was in lazy tlb mode. - * 1a2) update cpu active_mm - * Now cpu0 accepts tlb flushes for the new mm. - * 1a3) cpu_set(cpu, new_mm->cpu_vm_mask); - * Now the other cpus will send tlb flush ipis. - * 1a4) change cr3. - * 1a5) cpu_clear(cpu, old_mm->cpu_vm_mask); - * Stop ipi delivery for the old mm. This is not synchronized with - * the other cpus, but flush_tlb_func ignore flush ipis for the wrong - * mm, and in the worst case we perform a superfluous tlb flush. - * 1b) thread switch without mm change - * cpu active_mm is correct, cpu0 already handles flush ipis. - * 1b1) set cpu_tlbstate to TLBSTATE_OK - * 1b2) test_and_set the cpu bit in cpu_vm_mask. - * Atomically set the bit [other cpus will start sending flush ipis], - * and test the bit. - * 1b3) if the bit was 0: leave_mm was called, flush the tlb. - * 2) switch %%esp, ie current + * Call this when reinitializing a CPU. It fixes the following potential + * problems: * - * The interrupt must handle 2 special cases: - * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm. - * - the cpu performs speculative tlb reads, i.e. even if the cpu only - * runs in kernel space, the cpu could load tlb entries for user space - * pages. + * - The ASID changed from what cpu_tlbstate thinks it is (most likely + * because the CPU was taken down and came back up with CR3's PCID + * bits clear. CPU hotplug can do this. * - * The good news is that cpu_tlbstate is local to each cpu, no - * write/read ordering problems. + * - The TLB contains junk in slots corresponding to inactive ASIDs. + * + * - The CPU went so far out to lunch that it may have missed a TLB + * flush. */ +void initialize_tlbstate_and_flush(void) +{ + int i; + struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm); + u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen); + unsigned long cr3 = __read_cr3(); + + /* Assert that CR3 already references the right mm. */ + WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd)); + + /* + * Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization + * doesn't work like other CR4 bits because it can only be set from + * long mode.) + */ + WARN_ON(boot_cpu_has(X86_CR4_PCIDE) && + !(cr4_read_shadow() & X86_CR4_PCIDE)); + + /* Force ASID 0 and force a TLB flush. */ + write_cr3(cr3 & ~CR3_PCID_MASK); + + /* Reinitialize tlbstate. */ + this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0); + this_cpu_write(cpu_tlbstate.next_asid, 1); + this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen); + + for (i = 1; i < TLB_NR_DYN_ASIDS; i++) + this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0); +} /* - * TLB flush funcation: - * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. - * 2) Leave the mm if we are in the lazy tlb mode. + * flush_tlb_func_common()'s memory ordering requirement is that any + * TLB fills that happen after we flush the TLB are ordered after we + * read active_mm's tlb_gen. We don't need any explicit barriers + * because all x86 flush operations are serializing and the + * atomic64_read operation won't be reordered by the compiler. */ -static void flush_tlb_func(void *info) +static void flush_tlb_func_common(const struct flush_tlb_info *f, + bool local, enum tlb_flush_reason reason) { - struct flush_tlb_info *f = info; + /* + * We have three different tlb_gen values in here. They are: + * + * - mm_tlb_gen: the latest generation. + * - local_tlb_gen: the generation that this CPU has already caught + * up to. + * - f->new_tlb_gen: the generation that the requester of the flush + * wants us to catch up to. + */ + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen); + u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen); - inc_irq_stat(irq_tlb_count); + /* This code cannot presently handle being reentered. */ + VM_WARN_ON(!irqs_disabled()); - if (f->flush_mm && f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) + VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) != + loaded_mm->context.ctx_id); + + if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) { + /* + * We're in lazy mode -- don't flush. We can get here on + * remote flushes due to races and on local flushes if a + * kernel thread coincidentally flushes the mm it's lazily + * still using. + */ return; + } - count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); - if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { - if (f->flush_end == TLB_FLUSH_ALL) { - local_flush_tlb(); - trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL); - } else { - unsigned long addr; - unsigned long nr_pages = - (f->flush_end - f->flush_start) / PAGE_SIZE; - addr = f->flush_start; - while (addr < f->flush_end) { - __flush_tlb_single(addr); - addr += PAGE_SIZE; - } - trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, nr_pages); + if (unlikely(local_tlb_gen == mm_tlb_gen)) { + /* + * There's nothing to do: we're already up to date. This can + * happen if two concurrent flushes happen -- the first flush to + * be handled can catch us all the way up, leaving no work for + * the second flush. + */ + trace_tlb_flush(reason, 0); + return; + } + + WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen); + WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen); + + /* + * If we get to this point, we know that our TLB is out of date. + * This does not strictly imply that we need to flush (it's + * possible that f->new_tlb_gen <= local_tlb_gen), but we're + * going to need to flush in the very near future, so we might + * as well get it over with. + * + * The only question is whether to do a full or partial flush. + * + * We do a partial flush if requested and two extra conditions + * are met: + * + * 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that + * we've always done all needed flushes to catch up to + * local_tlb_gen. If, for example, local_tlb_gen == 2 and + * f->new_tlb_gen == 3, then we know that the flush needed to bring + * us up to date for tlb_gen 3 is the partial flush we're + * processing. + * + * As an example of why this check is needed, suppose that there + * are two concurrent flushes. The first is a full flush that + * changes context.tlb_gen from 1 to 2. The second is a partial + * flush that changes context.tlb_gen from 2 to 3. If they get + * processed on this CPU in reverse order, we'll see + * local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL. + * If we were to use __flush_tlb_single() and set local_tlb_gen to + * 3, we'd be break the invariant: we'd update local_tlb_gen above + * 1 without the full flush that's needed for tlb_gen 2. + * + * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimiation. + * Partial TLB flushes are not all that much cheaper than full TLB + * flushes, so it seems unlikely that it would be a performance win + * to do a partial flush if that won't bring our TLB fully up to + * date. By doing a full flush instead, we can increase + * local_tlb_gen all the way to mm_tlb_gen and we can probably + * avoid another flush in the very near future. + */ + if (f->end != TLB_FLUSH_ALL && + f->new_tlb_gen == local_tlb_gen + 1 && + f->new_tlb_gen == mm_tlb_gen) { + /* Partial flush */ + unsigned long addr; + unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT; + + addr = f->start; + while (addr < f->end) { + __flush_tlb_single(addr); + addr += PAGE_SIZE; } - } else - leave_mm(smp_processor_id()); + if (local) + count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages); + trace_tlb_flush(reason, nr_pages); + } else { + /* Full flush. */ + local_flush_tlb(); + if (local) + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + trace_tlb_flush(reason, TLB_FLUSH_ALL); + } + /* Both paths above update our state to mm_tlb_gen. */ + this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen); } -void native_flush_tlb_others(const struct cpumask *cpumask, - struct mm_struct *mm, unsigned long start, - unsigned long end) +static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason) { - struct flush_tlb_info info; + const struct flush_tlb_info *f = info; - info.flush_mm = mm; - info.flush_start = start; - info.flush_end = end; + flush_tlb_func_common(f, true, reason); +} + +static void flush_tlb_func_remote(void *info) +{ + const struct flush_tlb_info *f = info; + + inc_irq_stat(irq_tlb_count); + if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm)) + return; + + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); + flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN); +} + +void native_flush_tlb_others(const struct cpumask *cpumask, + const struct flush_tlb_info *info) +{ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); - if (end == TLB_FLUSH_ALL) + if (info->end == TLB_FLUSH_ALL) trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL); else trace_tlb_flush(TLB_REMOTE_SEND_IPI, - (end - start) >> PAGE_SHIFT); + (info->end - info->start) >> PAGE_SHIFT); if (is_uv_system()) { + /* + * This whole special case is confused. UV has a "Broadcast + * Assist Unit", which seems to be a fancy way to send IPIs. + * Back when x86 used an explicit TLB flush IPI, UV was + * optimized to use its own mechanism. These days, x86 uses + * smp_call_function_many(), but UV still uses a manual IPI, + * and that IPI's action is out of date -- it does a manual + * flush instead of calling flush_tlb_func_remote(). This + * means that the percpu tlb_gen variables won't be updated + * and we'll do pointless flushes on future context switches. + * + * Rather than hooking native_flush_tlb_others() here, I think + * that UV should be updated so that smp_call_function_many(), + * etc, are optimal on UV. + */ unsigned int cpu; cpu = smp_processor_id(); - cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu); + cpumask = uv_flush_tlb_others(cpumask, info); if (cpumask) - smp_call_function_many(cpumask, flush_tlb_func, - &info, 1); + smp_call_function_many(cpumask, flush_tlb_func_remote, + (void *)info, 1); return; } - smp_call_function_many(cpumask, flush_tlb_func, &info, 1); + smp_call_function_many(cpumask, flush_tlb_func_remote, + (void *)info, 1); } /* @@ -302,91 +449,46 @@ static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long vmflag) { - unsigned long addr; - /* do a global flush by default */ - unsigned long base_pages_to_flush = TLB_FLUSH_ALL; - - preempt_disable(); + int cpu; - if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB)) - base_pages_to_flush = (end - start) >> PAGE_SHIFT; - if (base_pages_to_flush > tlb_single_page_flush_ceiling) - base_pages_to_flush = TLB_FLUSH_ALL; + struct flush_tlb_info info = { + .mm = mm, + }; - if (current->active_mm != mm) { - /* Synchronize with switch_mm. */ - smp_mb(); - - goto out; - } + cpu = get_cpu(); - if (!current->mm) { - leave_mm(smp_processor_id()); + /* This is also a barrier that synchronizes with switch_mm(). */ + info.new_tlb_gen = inc_mm_tlb_gen(mm); - /* Synchronize with switch_mm. */ - smp_mb(); - - goto out; - } - - /* - * Both branches below are implicit full barriers (MOV to CR or - * INVLPG) that synchronize with switch_mm. - */ - if (base_pages_to_flush == TLB_FLUSH_ALL) { - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - local_flush_tlb(); + /* Should we flush just the requested range? */ + if ((end != TLB_FLUSH_ALL) && + !(vmflag & VM_HUGETLB) && + ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) { + info.start = start; + info.end = end; } else { - /* flush range by one by one 'invlpg' */ - for (addr = start; addr < end; addr += PAGE_SIZE) { - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); - __flush_tlb_single(addr); - } - } - trace_tlb_flush(TLB_LOCAL_MM_SHOOTDOWN, base_pages_to_flush); -out: - if (base_pages_to_flush == TLB_FLUSH_ALL) { - start = 0UL; - end = TLB_FLUSH_ALL; + info.start = 0UL; + info.end = TLB_FLUSH_ALL; } - if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(mm_cpumask(mm), mm, start, end); - preempt_enable(); -} - -void flush_tlb_page(struct vm_area_struct *vma, unsigned long start) -{ - struct mm_struct *mm = vma->vm_mm; - preempt_disable(); - - if (current->active_mm == mm) { - if (current->mm) { - /* - * Implicit full barrier (INVLPG) that synchronizes - * with switch_mm. - */ - __flush_tlb_one(start); - } else { - leave_mm(smp_processor_id()); - - /* Synchronize with switch_mm. */ - smp_mb(); - } + if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) { + VM_WARN_ON(irqs_disabled()); + local_irq_disable(); + flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN); + local_irq_enable(); } - if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(mm_cpumask(mm), mm, start, start + PAGE_SIZE); + if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) + flush_tlb_others(mm_cpumask(mm), &info); - preempt_enable(); + put_cpu(); } + static void do_flush_tlb_all(void *info) { count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); __flush_tlb_all(); - if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) - leave_mm(smp_processor_id()); } void flush_tlb_all(void) @@ -401,7 +503,7 @@ static void do_kernel_range_flush(void *info) unsigned long addr; /* flush range by one by one 'invlpg' */ - for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE) + for (addr = f->start; addr < f->end; addr += PAGE_SIZE) __flush_tlb_single(addr); } @@ -410,16 +512,41 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end) /* Balance as user space task's flush, a bit conservative */ if (end == TLB_FLUSH_ALL || - (end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE) { + (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) { on_each_cpu(do_flush_tlb_all, NULL, 1); } else { struct flush_tlb_info info; - info.flush_start = start; - info.flush_end = end; + info.start = start; + info.end = end; on_each_cpu(do_kernel_range_flush, &info, 1); } } +void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) +{ + struct flush_tlb_info info = { + .mm = NULL, + .start = 0UL, + .end = TLB_FLUSH_ALL, + }; + + int cpu = get_cpu(); + + if (cpumask_test_cpu(cpu, &batch->cpumask)) { + VM_WARN_ON(irqs_disabled()); + local_irq_disable(); + flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN); + local_irq_enable(); + } + + if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) + flush_tlb_others(&batch->cpumask, &info); + + cpumask_clear(&batch->cpumask); + + put_cpu(); +} + static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { @@ -465,5 +592,3 @@ static int __init create_tlb_single_page_flush_ceiling(void) return 0; } late_initcall(create_tlb_single_page_flush_ceiling); - -#endif /* CONFIG_SMP */ |