-rw-r--r-- | arch/um/kernel/um_arch.c | 2
-rw-r--r-- | arch/x86/Kconfig | 1
-rw-r--r-- | arch/x86/events/core.c | 9
-rw-r--r-- | arch/x86/include/asm/alternative.h | 11
-rw-r--r-- | arch/x86/include/asm/mmu.h | 4
-rw-r--r-- | arch/x86/include/asm/mmu_context.h | 15
-rw-r--r-- | arch/x86/include/asm/text-patching.h | 29
-rw-r--r-- | arch/x86/kernel/alternative.c | 439
-rw-r--r-- | arch/x86/kernel/callthunks.c | 6
-rw-r--r-- | arch/x86/kernel/ftrace.c | 18
-rw-r--r-- | arch/x86/kernel/jump_label.c | 6
-rw-r--r-- | arch/x86/kernel/kprobes/core.c | 4
-rw-r--r-- | arch/x86/kernel/kprobes/opt.c | 6
-rw-r--r-- | arch/x86/kernel/module.c | 2
-rw-r--r-- | arch/x86/kernel/static_call.c | 2
-rw-r--r-- | arch/x86/kernel/traps.c | 6
-rw-r--r-- | arch/x86/lib/insn-eval.c | 13
-rw-r--r-- | arch/x86/mm/init.c | 19
-rw-r--r-- | arch/x86/mm/tlb.c | 82
-rw-r--r-- | arch/x86/net/bpf_jit_comp.c | 2
-rw-r--r-- | arch/x86/platform/efi/efi_64.c | 8 |
21 files changed, 348 insertions, 336 deletions
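For orientation (not part of the patch itself): a minimal sketch of how the renamed batched patching API is driven, modelled on the jump_label.c and ftrace.c hunks below. The smp_text_poke_*() calls, their signatures and text_mutex are taken from the diff; patch_sites_example() and its parameters are illustrative only.

/*
 * Sketch: queue several patch sites and apply them in one INT3 batch.
 * Callers must hold text_mutex; queued entries are applied by
 * smp_text_poke_batch_finish() (the queue is also flushed automatically
 * when the array fills up or an out-of-order address is added).
 */
#include <linux/mutex.h>
#include <linux/memory.h>		/* text_mutex */
#include <asm/text-patching.h>		/* smp_text_poke_batch_add/_finish() */

static void patch_sites_example(void **addrs, const void **insns,
				size_t len, int nr)
{
	int i;

	mutex_lock(&text_mutex);
	for (i = 0; i < nr; i++) {
		/* Queue the new instruction; nothing is patched yet. */
		smp_text_poke_batch_add(addrs[i], insns[i], len, NULL);
	}
	/* Apply all pending requests via INT3 + SMP sync-core calls. */
	smp_text_poke_batch_finish();
	mutex_unlock(&text_mutex);
}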
diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index d4b3b6742ec8..2f5ee045bc7a 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -477,7 +477,7 @@ void *text_poke_copy(void *addr, const void *opcode, size_t len) return text_poke(addr, opcode, len); } -void text_poke_sync(void) +void smp_text_poke_sync_each_cpu(void) { } diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5873c9e39919..297e33cfa760 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -153,6 +153,7 @@ config X86 select ARCH_WANT_HUGETLB_VMEMMAP_PREINIT if X86_64 select ARCH_WANTS_THP_SWAP if X86_64 select ARCH_HAS_PARANOID_L1D_FLUSH + select ARCH_WANT_IRQS_OFF_ACTIVATE_MM select BUILDTIME_TABLE_SORT select CLKEVT_I8253 select CLOCKSOURCE_WATCHDOG diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 139ad80d1df3..5fe201efc753 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2803,8 +2803,15 @@ static unsigned long get_segment_base(unsigned int segment) #ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; + /* + * If we're not in a valid context with a real (not just lazy) + * user mm, then don't even try. + */ + if (!nmi_uaccess_okay()) + return 0; + /* IRQs are off, so this synchronizes with smp_store_release */ - ldt = READ_ONCE(current->active_mm->context.ldt); + ldt = smp_load_acquire(¤t->mm->context.ldt); if (!ldt || idx >= ldt->nr_entries) return 0; diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 4a37a8bd87fd..e18cdaa1573c 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -82,6 +82,12 @@ struct alt_instr { extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; +extern s32 __retpoline_sites[], __retpoline_sites_end[]; +extern s32 __return_sites[], __return_sites_end[]; +extern s32 __cfi_sites[], __cfi_sites_end[]; +extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[]; +extern s32 __smp_locks[], __smp_locks_end[]; + /* * Debug flag that can be tested to see whether alternative * instructions were patched in already: @@ -335,11 +341,6 @@ void nop_func(void); __ALTERNATIVE(\oldinstr, \newinstr, \ft_flags) .endm -#define old_len 141b-140b -#define new_len1 144f-143f -#define new_len2 145f-144f -#define new_len3 146f-145f - /* * Same as ALTERNATIVE macro above but for two alternatives. If CPU * has @feature1, it replaces @oldinstr with @newinstr1. If CPU has diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 8b8055a8eb9e..0fe9c569d171 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -16,6 +16,8 @@ #define MM_CONTEXT_LOCK_LAM 2 /* Allow LAM and SVA coexisting */ #define MM_CONTEXT_FORCE_TAGGED_SVA 3 +/* Tracks mm_cpumask */ +#define MM_CONTEXT_NOTRACK 4 /* * x86 has arch-specific MMU state beyond what lives in mm_struct. 
@@ -44,9 +46,7 @@ typedef struct { struct ldt_struct *ldt; #endif -#ifdef CONFIG_X86_64 unsigned long flags; -#endif #ifdef CONFIG_ADDRESS_MASKING /* Active LAM mode: X86_CR3_LAM_U48 or X86_CR3_LAM_U57 or 0 (disabled) */ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 2398058b6e83..73bf3b1b44e8 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -190,7 +190,7 @@ extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, #define activate_mm(prev, next) \ do { \ paravirt_enter_mmap(next); \ - switch_mm((prev), (next), NULL); \ + switch_mm_irqs_off((prev), (next), NULL); \ } while (0); #ifdef CONFIG_X86_32 @@ -247,6 +247,16 @@ static inline bool is_64bit_mm(struct mm_struct *mm) } #endif +static inline bool is_notrack_mm(struct mm_struct *mm) +{ + return test_bit(MM_CONTEXT_NOTRACK, &mm->context.flags); +} + +static inline void set_notrack_mm(struct mm_struct *mm) +{ + set_bit(MM_CONTEXT_NOTRACK, &mm->context.flags); +} + /* * We only want to enforce protection keys on the current process * because we effectively have no access to PKRU for other @@ -272,4 +282,7 @@ unsigned long __get_current_cr3_fast(void); #include <asm-generic/mmu_context.h> +extern struct mm_struct *use_temporary_mm(struct mm_struct *temp_mm); +extern void unuse_temporary_mm(struct mm_struct *prev_mm); + #endif /* _ASM_X86_MMU_CONTEXT_H */ diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index ab9e143ec9fe..5337f1be18f6 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -11,11 +11,11 @@ * JUMP_LABEL_NOP_SIZE/RELATIVEJUMP_SIZE, which are 5. * Raise it if needed. */ -#define POKE_MAX_OPCODE_SIZE 5 +#define TEXT_POKE_MAX_OPCODE_SIZE 5 extern void text_poke_early(void *addr, const void *opcode, size_t len); -extern void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len); +extern void text_poke_apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len); /* * Clear and restore the kernel write-protection flag on the local CPU. @@ -32,17 +32,17 @@ extern void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u * an inconsistent instruction while you patch. 
*/ extern void *text_poke(void *addr, const void *opcode, size_t len); -extern void text_poke_sync(void); +extern void smp_text_poke_sync_each_cpu(void); extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len); extern void *text_poke_copy(void *addr, const void *opcode, size_t len); #define text_poke_copy text_poke_copy extern void *text_poke_copy_locked(void *addr, const void *opcode, size_t len, bool core_ok); extern void *text_poke_set(void *addr, int c, size_t len); -extern int poke_int3_handler(struct pt_regs *regs); -extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate); +extern int smp_text_poke_int3_handler(struct pt_regs *regs); +extern void smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate); -extern void text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate); -extern void text_poke_finish(void); +extern void smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate); +extern void smp_text_poke_batch_finish(void); #define INT3_INSN_SIZE 1 #define INT3_INSN_OPCODE 0xCC @@ -82,7 +82,7 @@ static __always_inline int text_opcode_size(u8 opcode) } union text_poke_insn { - u8 text[POKE_MAX_OPCODE_SIZE]; + u8 text[TEXT_POKE_MAX_OPCODE_SIZE]; struct { u8 opcode; s32 disp; @@ -128,8 +128,8 @@ void *text_gen_insn(u8 opcode, const void *addr, const void *dest) } extern int after_bootmem; -extern __ro_after_init struct mm_struct *poking_mm; -extern __ro_after_init unsigned long poking_addr; +extern __ro_after_init struct mm_struct *text_poke_mm; +extern __ro_after_init unsigned long text_poke_mm_addr; #ifndef CONFIG_UML_X86 static __always_inline @@ -142,13 +142,14 @@ static __always_inline void int3_emulate_push(struct pt_regs *regs, unsigned long val) { /* - * The int3 handler in entry_64.S adds a gap between the + * The INT3 handler in entry_64.S adds a gap between the * stack where the break point happened, and the saving of * pt_regs. We can extend the original stack because of - * this gap. See the idtentry macro's create_gap option. + * this gap. See the idtentry macro's X86_TRAP_BP logic. * - * Similarly entry_32.S will have a gap on the stack for (any) hardware - * exception and pt_regs; see FIXUP_FRAME. + * Similarly, entry_32.S will have a gap on the stack for + * (any) hardware exception and pt_regs; see the + * FIXUP_FRAME macro. 
*/ regs->sp -= sizeof(unsigned long); *(unsigned long *)regs->sp = val; diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index bf82c6f7d690..ddbc303e41e3 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1,36 +1,14 @@ // SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) "SMP alternatives: " fmt -#include <linux/module.h> -#include <linux/sched.h> +#include <linux/mmu_context.h> #include <linux/perf_event.h> -#include <linux/mutex.h> -#include <linux/list.h> -#include <linux/stringify.h> -#include <linux/highmem.h> -#include <linux/mm.h> #include <linux/vmalloc.h> #include <linux/memory.h> -#include <linux/stop_machine.h> -#include <linux/slab.h> -#include <linux/kdebug.h> -#include <linux/kprobes.h> -#include <linux/mmu_context.h> -#include <linux/bsearch.h> -#include <linux/sync_core.h> + #include <asm/text-patching.h> -#include <asm/alternative.h> -#include <asm/sections.h> -#include <asm/mce.h> -#include <asm/nmi.h> -#include <asm/cacheflush.h> -#include <asm/tlbflush.h> #include <asm/insn.h> -#include <asm/io.h> -#include <asm/fixmap.h> -#include <asm/paravirt.h> -#include <asm/asm-prototypes.h> -#include <asm/cfi.h> +#include <asm/nmi.h> int __read_mostly alternatives_patched; @@ -171,13 +149,6 @@ static void add_nop(u8 *buf, unsigned int len) *buf = INT3_INSN_OPCODE; } -extern s32 __retpoline_sites[], __retpoline_sites_end[]; -extern s32 __return_sites[], __return_sites_end[]; -extern s32 __cfi_sites[], __cfi_sites_end[]; -extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[]; -extern s32 __smp_locks[], __smp_locks_end[]; -void text_poke_early(void *addr, const void *opcode, size_t len); - /* * Matches NOP and NOPL, not any of the other possible NOPs. */ @@ -369,7 +340,7 @@ static void __apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, } } -void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len) +void text_poke_apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len) { __apply_relocation(buf, instr, instrlen, repl, repl_len); optimize_nops(instr, buf, instrlen); @@ -525,7 +496,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, for (; insn_buff_sz < a->instrlen; insn_buff_sz++) insn_buff[insn_buff_sz] = 0x90; - apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen); + text_poke_apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen); DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr); DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement); @@ -2010,7 +1981,7 @@ __visible noinline void __init __alt_reloc_selftest(void *arg) static noinline void __init alt_reloc_selftest(void) { /* - * Tests apply_relocation(). + * Tests text_poke_apply_relocation(). * * This has a relative immediate (CALL) in a place other than the first * instruction and additionally on x86_64 we get a RIP-relative LEA: @@ -2140,76 +2111,8 @@ void __init_or_module text_poke_early(void *addr, const void *opcode, } } -typedef struct { - struct mm_struct *mm; -} temp_mm_state_t; - -/* - * Using a temporary mm allows to set temporary mappings that are not accessible - * by other CPUs. Such mappings are needed to perform sensitive memory writes - * that override the kernel memory protections (e.g., W^X), without exposing the - * temporary page-table mappings that are required for these write operations to - * other CPUs. 
Using a temporary mm also allows to avoid TLB shootdowns when the - * mapping is torn down. - * - * Context: The temporary mm needs to be used exclusively by a single core. To - * harden security IRQs must be disabled while the temporary mm is - * loaded, thereby preventing interrupt handler bugs from overriding - * the kernel memory protection. - */ -static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) -{ - temp_mm_state_t temp_state; - - lockdep_assert_irqs_disabled(); - - /* - * Make sure not to be in TLB lazy mode, as otherwise we'll end up - * with a stale address space WITHOUT being in lazy mode after - * restoring the previous mm. - */ - if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) - leave_mm(); - - temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm); - switch_mm_irqs_off(NULL, mm, current); - - /* - * If breakpoints are enabled, disable them while the temporary mm is - * used. Userspace might set up watchpoints on addresses that are used - * in the temporary mm, which would lead to wrong signals being sent or - * crashes. - * - * Note that breakpoints are not disabled selectively, which also causes - * kernel breakpoints (e.g., perf's) to be disabled. This might be - * undesirable, but still seems reasonable as the code that runs in the - * temporary mm should be short. - */ - if (hw_breakpoint_active()) - hw_breakpoint_disable(); - - return temp_state; -} - -__ro_after_init struct mm_struct *poking_mm; -__ro_after_init unsigned long poking_addr; - -static inline void unuse_temporary_mm(temp_mm_state_t prev_state) -{ - lockdep_assert_irqs_disabled(); - - switch_mm_irqs_off(NULL, prev_state.mm, current); - - /* Clear the cpumask, to indicate no TLB flushing is needed anywhere */ - cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(poking_mm)); - - /* - * Restore the breakpoints if they were disabled before the temporary mm - * was loaded. - */ - if (hw_breakpoint_active()) - hw_breakpoint_restore(); -} +__ro_after_init struct mm_struct *text_poke_mm; +__ro_after_init unsigned long text_poke_mm_addr; static void text_poke_memcpy(void *dst, const void *src, size_t len) { @@ -2229,7 +2132,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l { bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE; struct page *pages[2] = {NULL}; - temp_mm_state_t prev; + struct mm_struct *prev_mm; unsigned long flags; pte_t pte, *ptep; spinlock_t *ptl; @@ -2266,7 +2169,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l /* * The lock is not really needed, but this allows to avoid open-coding. */ - ptep = get_locked_pte(poking_mm, poking_addr, &ptl); + ptep = get_locked_pte(text_poke_mm, text_poke_mm_addr, &ptl); /* * This must not fail; preallocated in poking_init(). @@ -2276,21 +2179,21 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l local_irq_save(flags); pte = mk_pte(pages[0], pgprot); - set_pte_at(poking_mm, poking_addr, ptep, pte); + set_pte_at(text_poke_mm, text_poke_mm_addr, ptep, pte); if (cross_page_boundary) { pte = mk_pte(pages[1], pgprot); - set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte); + set_pte_at(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1, pte); } /* * Loading the temporary mm behaves as a compiler barrier, which * guarantees that the PTE will be set at the time memcpy() is done. 
*/ - prev = use_temporary_mm(poking_mm); + prev_mm = use_temporary_mm(text_poke_mm); kasan_disable_current(); - func((u8 *)poking_addr + offset_in_page(addr), src, len); + func((u8 *)text_poke_mm_addr + offset_in_page(addr), src, len); kasan_enable_current(); /* @@ -2299,22 +2202,22 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l */ barrier(); - pte_clear(poking_mm, poking_addr, ptep); + pte_clear(text_poke_mm, text_poke_mm_addr, ptep); if (cross_page_boundary) - pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1); + pte_clear(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1); /* * Loading the previous page-table hierarchy requires a serializing * instruction that already allows the core to see the updated version. * Xen-PV is assumed to serialize execution in a similar manner. */ - unuse_temporary_mm(prev); + unuse_temporary_mm(prev_mm); /* * Flushing the TLB might involve IPIs, which would require enabled * IRQs, but not if the mm is not used, as it is in this point. */ - flush_tlb_mm_range(poking_mm, poking_addr, poking_addr + + flush_tlb_mm_range(text_poke_mm, text_poke_mm_addr, text_poke_mm_addr + (cross_page_boundary ? 2 : 1) * PAGE_SIZE, PAGE_SHIFT, false); @@ -2450,7 +2353,7 @@ static void do_sync_core(void *info) sync_core(); } -void text_poke_sync(void) +void smp_text_poke_sync_each_cpu(void) { on_each_cpu(do_sync_core, NULL, 1); } @@ -2460,64 +2363,66 @@ void text_poke_sync(void) * this thing. When len == 6 everything is prefixed with 0x0f and we map * opcode to Jcc.d8, using len to distinguish. */ -struct text_poke_loc { +struct smp_text_poke_loc { /* addr := _stext + rel_addr */ s32 rel_addr; s32 disp; u8 len; u8 opcode; - const u8 text[POKE_MAX_OPCODE_SIZE]; - /* see text_poke_bp_batch() */ + const u8 text[TEXT_POKE_MAX_OPCODE_SIZE]; + /* see smp_text_poke_batch_finish() */ u8 old; }; -struct bp_patching_desc { - struct text_poke_loc *vec; +#define TEXT_POKE_ARRAY_MAX (PAGE_SIZE / sizeof(struct smp_text_poke_loc)) + +static struct smp_text_poke_array { + struct smp_text_poke_loc vec[TEXT_POKE_ARRAY_MAX]; int nr_entries; - atomic_t refs; -}; +} text_poke_array; -static struct bp_patching_desc bp_desc; +static DEFINE_PER_CPU(atomic_t, text_poke_array_refs); -static __always_inline -struct bp_patching_desc *try_get_desc(void) +/* + * These four __always_inline annotations imply noinstr, necessary + * due to smp_text_poke_int3_handler() being noinstr: + */ + +static __always_inline bool try_get_text_poke_array(void) { - struct bp_patching_desc *desc = &bp_desc; + atomic_t *refs = this_cpu_ptr(&text_poke_array_refs); - if (!raw_atomic_inc_not_zero(&desc->refs)) - return NULL; + if (!raw_atomic_inc_not_zero(refs)) + return false; - return desc; + return true; } -static __always_inline void put_desc(void) +static __always_inline void put_text_poke_array(void) { - struct bp_patching_desc *desc = &bp_desc; + atomic_t *refs = this_cpu_ptr(&text_poke_array_refs); smp_mb__before_atomic(); - raw_atomic_dec(&desc->refs); + raw_atomic_dec(refs); } -static __always_inline void *text_poke_addr(struct text_poke_loc *tp) +static __always_inline void *text_poke_addr(const struct smp_text_poke_loc *tpl) { - return _stext + tp->rel_addr; + return _stext + tpl->rel_addr; } -static __always_inline int patch_cmp(const void *key, const void *elt) +static __always_inline int patch_cmp(const void *tpl_a, const void *tpl_b) { - struct text_poke_loc *tp = (struct text_poke_loc *) elt; - - if (key < text_poke_addr(tp)) + if (tpl_a < text_poke_addr(tpl_b)) return 
-1; - if (key > text_poke_addr(tp)) + if (tpl_a > text_poke_addr(tpl_b)) return 1; return 0; } -noinstr int poke_int3_handler(struct pt_regs *regs) +noinstr int smp_text_poke_int3_handler(struct pt_regs *regs) { - struct bp_patching_desc *desc; - struct text_poke_loc *tp; + struct smp_text_poke_loc *tpl; int ret = 0; void *ip; @@ -2526,41 +2431,40 @@ noinstr int poke_int3_handler(struct pt_regs *regs) /* * Having observed our INT3 instruction, we now must observe - * bp_desc with non-zero refcount: + * text_poke_array with non-zero refcount: * - * bp_desc.refs = 1 INT3 - * WMB RMB - * write INT3 if (bp_desc.refs != 0) + * text_poke_array_refs = 1 INT3 + * WMB RMB + * write INT3 if (text_poke_array_refs != 0) */ smp_rmb(); - desc = try_get_desc(); - if (!desc) + if (!try_get_text_poke_array()) return 0; /* - * Discount the INT3. See text_poke_bp_batch(). + * Discount the INT3. See smp_text_poke_batch_finish(). */ ip = (void *) regs->ip - INT3_INSN_SIZE; /* * Skip the binary search if there is a single member in the vector. */ - if (unlikely(desc->nr_entries > 1)) { - tp = __inline_bsearch(ip, desc->vec, desc->nr_entries, - sizeof(struct text_poke_loc), + if (unlikely(text_poke_array.nr_entries > 1)) { + tpl = __inline_bsearch(ip, text_poke_array.vec, text_poke_array.nr_entries, + sizeof(struct smp_text_poke_loc), patch_cmp); - if (!tp) + if (!tpl) goto out_put; } else { - tp = desc->vec; - if (text_poke_addr(tp) != ip) + tpl = text_poke_array.vec; + if (text_poke_addr(tpl) != ip) goto out_put; } - ip += tp->len; + ip += tpl->len; - switch (tp->opcode) { + switch (tpl->opcode) { case INT3_INSN_OPCODE: /* * Someone poked an explicit INT3, they'll want to handle it, @@ -2573,16 +2477,16 @@ noinstr int poke_int3_handler(struct pt_regs *regs) break; case CALL_INSN_OPCODE: - int3_emulate_call(regs, (long)ip + tp->disp); + int3_emulate_call(regs, (long)ip + tpl->disp); break; case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: - int3_emulate_jmp(regs, (long)ip + tp->disp); + int3_emulate_jmp(regs, (long)ip + tpl->disp); break; case 0x70 ... 0x7f: /* Jcc */ - int3_emulate_jcc(regs, tp->opcode & 0xf, (long)ip, tp->disp); + int3_emulate_jcc(regs, tpl->opcode & 0xf, (long)ip, tpl->disp); break; default: @@ -2592,51 +2496,50 @@ noinstr int poke_int3_handler(struct pt_regs *regs) ret = 1; out_put: - put_desc(); + put_text_poke_array(); return ret; } -#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc)) -static struct text_poke_loc tp_vec[TP_VEC_MAX]; -static int tp_vec_nr; - /** - * text_poke_bp_batch() -- update instructions on live kernel on SMP - * @tp: vector of instructions to patch - * @nr_entries: number of entries in the vector + * smp_text_poke_batch_finish() -- update instructions on live kernel on SMP * - * Modify multi-byte instruction by using int3 breakpoint on SMP. - * We completely avoid stop_machine() here, and achieve the - * synchronization using int3 breakpoint. + * Input state: + * text_poke_array.vec: vector of instructions to patch + * text_poke_array.nr_entries: number of entries in the vector + * + * Modify multi-byte instructions by using INT3 breakpoints on SMP. + * We completely avoid using stop_machine() here, and achieve the + * synchronization using INT3 breakpoints and SMP cross-calls. 
* * The way it is done: * - For each entry in the vector: - * - add a int3 trap to the address that will be patched - * - sync cores + * - add an INT3 trap to the address that will be patched + * - SMP sync all CPUs * - For each entry in the vector: * - update all but the first byte of the patched range - * - sync cores + * - SMP sync all CPUs * - For each entry in the vector: - * - replace the first byte (int3) by the first byte of + * - replace the first byte (INT3) by the first byte of the * replacing opcode - * - sync cores + * - SMP sync all CPUs */ -static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) +void smp_text_poke_batch_finish(void) { unsigned char int3 = INT3_INSN_OPCODE; unsigned int i; int do_sync; - lockdep_assert_held(&text_mutex); + if (!text_poke_array.nr_entries) + return; - bp_desc.vec = tp; - bp_desc.nr_entries = nr_entries; + lockdep_assert_held(&text_mutex); /* - * Corresponds to the implicit memory barrier in try_get_desc() to - * ensure reading a non-zero refcount provides up to date bp_desc data. + * Corresponds to the implicit memory barrier in try_get_text_poke_array() to + * ensure reading a non-zero refcount provides up to date text_poke_array data. */ - atomic_set_release(&bp_desc.refs, 1); + for_each_possible_cpu(i) + atomic_set_release(per_cpu_ptr(&text_poke_array_refs, i), 1); /* * Function tracing can enable thousands of places that need to be @@ -2649,33 +2552,33 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries cond_resched(); /* - * Corresponding read barrier in int3 notifier for making sure the - * nr_entries and handler are correctly ordered wrt. patching. + * Corresponding read barrier in INT3 notifier for making sure the + * text_poke_array.nr_entries and handler are correctly ordered wrt. patching. */ smp_wmb(); /* - * First step: add a int3 trap to the address that will be patched. + * First step: add a INT3 trap to the address that will be patched. */ - for (i = 0; i < nr_entries; i++) { - tp[i].old = *(u8 *)text_poke_addr(&tp[i]); - text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE); + for (i = 0; i < text_poke_array.nr_entries; i++) { + text_poke_array.vec[i].old = *(u8 *)text_poke_addr(&text_poke_array.vec[i]); + text_poke(text_poke_addr(&text_poke_array.vec[i]), &int3, INT3_INSN_SIZE); } - text_poke_sync(); + smp_text_poke_sync_each_cpu(); /* * Second step: update all but the first byte of the patched range. 
*/ - for (do_sync = 0, i = 0; i < nr_entries; i++) { - u8 old[POKE_MAX_OPCODE_SIZE+1] = { tp[i].old, }; - u8 _new[POKE_MAX_OPCODE_SIZE+1]; - const u8 *new = tp[i].text; - int len = tp[i].len; + for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) { + u8 old[TEXT_POKE_MAX_OPCODE_SIZE+1] = { text_poke_array.vec[i].old, }; + u8 _new[TEXT_POKE_MAX_OPCODE_SIZE+1]; + const u8 *new = text_poke_array.vec[i].text; + int len = text_poke_array.vec[i].len; if (len - INT3_INSN_SIZE > 0) { memcpy(old + INT3_INSN_SIZE, - text_poke_addr(&tp[i]) + INT3_INSN_SIZE, + text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE, len - INT3_INSN_SIZE); if (len == 6) { @@ -2684,7 +2587,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries new = _new; } - text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE, + text_poke(text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE, new + INT3_INSN_SIZE, len - INT3_INSN_SIZE); @@ -2715,7 +2618,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries * The old instruction is recorded so that the event can be * processed forwards or backwards. */ - perf_event_text_poke(text_poke_addr(&tp[i]), old, len, new, len); + perf_event_text_poke(text_poke_addr(&text_poke_array.vec[i]), old, len, new, len); } if (do_sync) { @@ -2724,63 +2627,79 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries * not necessary and we'd be safe even without it. But * better safe than sorry (plus there's not only Intel). */ - text_poke_sync(); + smp_text_poke_sync_each_cpu(); } /* - * Third step: replace the first byte (int3) by the first byte of + * Third step: replace the first byte (INT3) by the first byte of the * replacing opcode. */ - for (do_sync = 0, i = 0; i < nr_entries; i++) { - u8 byte = tp[i].text[0]; + for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) { + u8 byte = text_poke_array.vec[i].text[0]; - if (tp[i].len == 6) + if (text_poke_array.vec[i].len == 6) byte = 0x0f; if (byte == INT3_INSN_OPCODE) continue; - text_poke(text_poke_addr(&tp[i]), &byte, INT3_INSN_SIZE); + text_poke(text_poke_addr(&text_poke_array.vec[i]), &byte, INT3_INSN_SIZE); do_sync++; } if (do_sync) - text_poke_sync(); + smp_text_poke_sync_each_cpu(); /* * Remove and wait for refs to be zero. + * + * Notably, if after step-3 above the INT3 got removed, then the + * smp_text_poke_sync_each_cpu() will have serialized against any running INT3 + * handlers and the below spin-wait will not happen. + * + * IOW. unless the replacement instruction is INT3, this case goes + * unused. 
*/ - if (!atomic_dec_and_test(&bp_desc.refs)) - atomic_cond_read_acquire(&bp_desc.refs, !VAL); + for_each_possible_cpu(i) { + atomic_t *refs = per_cpu_ptr(&text_poke_array_refs, i); + + if (unlikely(!atomic_dec_and_test(refs))) + atomic_cond_read_acquire(refs, !VAL); + } + + /* They are all completed: */ + text_poke_array.nr_entries = 0; } -static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, - const void *opcode, size_t len, const void *emulate) +static void __smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate) { + struct smp_text_poke_loc *tpl; struct insn insn; int ret, i = 0; + tpl = &text_poke_array.vec[text_poke_array.nr_entries++]; + if (len == 6) i = 1; - memcpy((void *)tp->text, opcode+i, len-i); + memcpy((void *)tpl->text, opcode+i, len-i); if (!emulate) emulate = opcode; ret = insn_decode_kernel(&insn, emulate); BUG_ON(ret < 0); - tp->rel_addr = addr - (void *)_stext; - tp->len = len; - tp->opcode = insn.opcode.bytes[0]; + tpl->rel_addr = addr - (void *)_stext; + tpl->len = len; + tpl->opcode = insn.opcode.bytes[0]; if (is_jcc32(&insn)) { /* * Map Jcc.d32 onto Jcc.d8 and use len to distinguish. */ - tp->opcode = insn.opcode.bytes[1] - 0x10; + tpl->opcode = insn.opcode.bytes[1] - 0x10; } - switch (tp->opcode) { + switch (tpl->opcode) { case RET_INSN_OPCODE: case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: @@ -2789,14 +2708,14 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, * next instruction can be padded with INT3. */ for (i = insn.length; i < len; i++) - BUG_ON(tp->text[i] != INT3_INSN_OPCODE); + BUG_ON(tpl->text[i] != INT3_INSN_OPCODE); break; default: BUG_ON(len != insn.length); } - switch (tp->opcode) { + switch (tpl->opcode) { case INT3_INSN_OPCODE: case RET_INSN_OPCODE: break; @@ -2805,21 +2724,21 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: case 0x70 ... 0x7f: /* Jcc */ - tp->disp = insn.immediate.value; + tpl->disp = insn.immediate.value; break; default: /* assume NOP */ switch (len) { case 2: /* NOP2 -- emulate as JMP8+0 */ BUG_ON(memcmp(emulate, x86_nops[len], len)); - tp->opcode = JMP8_INSN_OPCODE; - tp->disp = 0; + tpl->opcode = JMP8_INSN_OPCODE; + tpl->disp = 0; break; case 5: /* NOP5 -- emulate as JMP32+0 */ BUG_ON(memcmp(emulate, x86_nops[len], len)); - tp->opcode = JMP32_INSN_OPCODE; - tp->disp = 0; + tpl->opcode = JMP32_INSN_OPCODE; + tpl->disp = 0; break; default: /* unknown instruction */ @@ -2830,51 +2749,50 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, } /* - * We hard rely on the tp_vec being ordered; ensure this is so by flushing + * We hard rely on the text_poke_array.vec being ordered; ensure this is so by flushing * early if needed. 
*/ -static bool tp_order_fail(void *addr) +static bool text_poke_addr_ordered(void *addr) { - struct text_poke_loc *tp; - - if (!tp_vec_nr) - return false; + WARN_ON_ONCE(!addr); - if (!addr) /* force */ + if (!text_poke_array.nr_entries) return true; - tp = &tp_vec[tp_vec_nr - 1]; - if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr) - return true; - - return false; -} - -static void text_poke_flush(void *addr) -{ - if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) { - text_poke_bp_batch(tp_vec, tp_vec_nr); - tp_vec_nr = 0; - } -} + /* + * If the last current entry's address is higher than the + * new entry's address we'd like to add, then ordering + * is violated and we must first flush all pending patching + * requests: + */ + if (text_poke_addr(text_poke_array.vec + text_poke_array.nr_entries-1) > addr) + return false; -void text_poke_finish(void) -{ - text_poke_flush(NULL); + return true; } -void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate) +/** + * smp_text_poke_batch_add() -- update instruction on live kernel on SMP, batched + * @addr: address to patch + * @opcode: opcode of new instruction + * @len: length to copy + * @emulate: instruction to be emulated + * + * Add a new instruction to the current queue of to-be-patched instructions + * the kernel maintains. The patching request will not be executed immediately, + * but becomes part of an array of patching requests, optimized for batched + * execution. All pending patching requests will be executed on the next + * smp_text_poke_batch_finish() call. + */ +void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate) { - struct text_poke_loc *tp; - - text_poke_flush(addr); - - tp = &tp_vec[tp_vec_nr++]; - text_poke_loc_init(tp, addr, opcode, len, emulate); + if (text_poke_array.nr_entries == TEXT_POKE_ARRAY_MAX || !text_poke_addr_ordered(addr)) + smp_text_poke_batch_finish(); + __smp_text_poke_batch_add(addr, opcode, len, emulate); } /** - * text_poke_bp() -- update instructions on live kernel on SMP + * smp_text_poke_single() -- update instruction on live kernel on SMP immediately * @addr: address to patch * @opcode: opcode of new instruction * @len: length to copy @@ -2882,12 +2800,11 @@ void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const voi * * Update a single instruction with the vector in the stack, avoiding * dynamically allocated memory. This function should be used when it is - * not possible to allocate memory. + * not possible to allocate memory for a vector. The single instruction + * is patched in immediately. */ -void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate) +void __ref smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate) { - struct text_poke_loc tp; - - text_poke_loc_init(&tp, addr, opcode, len, emulate); - text_poke_bp_batch(&tp, 1); + __smp_text_poke_batch_add(addr, opcode, len, emulate); + smp_text_poke_batch_finish(); } diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index d86d7d6e750c..a951333c5995 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -185,7 +185,7 @@ static void *patch_dest(void *dest, bool direct) u8 *pad = dest - tsize; memcpy(insn_buff, skl_call_thunk_template, tsize); - apply_relocation(insn_buff, pad, tsize, skl_call_thunk_template, tsize); + text_poke_apply_relocation(insn_buff, pad, tsize, skl_call_thunk_template, tsize); /* Already patched? 
*/ if (!bcmp(pad, insn_buff, tsize)) @@ -294,7 +294,7 @@ static bool is_callthunk(void *addr) pad = (void *)(dest - tmpl_size); memcpy(insn_buff, skl_call_thunk_template, tmpl_size); - apply_relocation(insn_buff, pad, tmpl_size, skl_call_thunk_template, tmpl_size); + text_poke_apply_relocation(insn_buff, pad, tmpl_size, skl_call_thunk_template, tmpl_size); return !bcmp(pad, insn_buff, tmpl_size); } @@ -312,7 +312,7 @@ int x86_call_depth_emit_accounting(u8 **pprog, void *func, void *ip) return 0; memcpy(insn_buff, skl_call_thunk_template, tmpl_size); - apply_relocation(insn_buff, ip, tmpl_size, skl_call_thunk_template, tmpl_size); + text_poke_apply_relocation(insn_buff, ip, tmpl_size, skl_call_thunk_template, tmpl_size); memcpy(*pprog, insn_buff, tmpl_size); *pprog += tmpl_size; diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index cace6e8d7cc7..0853ba3fd04a 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -55,10 +55,10 @@ void ftrace_arch_code_modify_post_process(void) { /* * ftrace_make_{call,nop}() may be called during - * module load, and we need to finish the text_poke_queue() + * module load, and we need to finish the smp_text_poke_batch_add() * that they do, here. */ - text_poke_finish(); + smp_text_poke_batch_finish(); ftrace_poke_late = 0; mutex_unlock(&text_mutex); } @@ -119,7 +119,7 @@ ftrace_modify_code_direct(unsigned long ip, const char *old_code, /* replace the text with the new text */ if (ftrace_poke_late) - text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL); + smp_text_poke_batch_add((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL); else text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE); return 0; @@ -186,11 +186,11 @@ int ftrace_update_ftrace_func(ftrace_func_t func) ip = (unsigned long)(&ftrace_call); new = ftrace_call_replace(ip, (unsigned long)func); - text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); + smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL); ip = (unsigned long)(&ftrace_regs_call); new = ftrace_call_replace(ip, (unsigned long)func); - text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); + smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL); return 0; } @@ -247,10 +247,10 @@ void ftrace_replace_code(int enable) break; } - text_poke_queue((void *)rec->ip, new, MCOUNT_INSN_SIZE, NULL); + smp_text_poke_batch_add((void *)rec->ip, new, MCOUNT_INSN_SIZE, NULL); ftrace_update_record(rec, enable); } - text_poke_finish(); + smp_text_poke_batch_finish(); } void arch_ftrace_update_code(int command) @@ -492,7 +492,7 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops) mutex_lock(&text_mutex); /* Do a safe modify in case the trampoline is executing */ new = ftrace_call_replace(ip, (unsigned long)func); - text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); + smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL); mutex_unlock(&text_mutex); } @@ -586,7 +586,7 @@ static int ftrace_mod_jmp(unsigned long ip, void *func) const char *new; new = ftrace_jmp_replace(ip, (unsigned long)func); - text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); + smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL); return 0; } diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index f5b8ef02d172..a7949a54a0ff 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -102,7 +102,7 @@ __jump_label_transform(struct jump_entry *entry, return; } - text_poke_bp((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); + smp_text_poke_single((void 
*)jump_entry_code(entry), jlp.code, jlp.size, NULL); } static void __ref jump_label_transform(struct jump_entry *entry, @@ -135,7 +135,7 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry, mutex_lock(&text_mutex); jlp = __jump_label_patch(entry, type); - text_poke_queue((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); + smp_text_poke_batch_add((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); mutex_unlock(&text_mutex); return true; } @@ -143,6 +143,6 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry, void arch_jump_label_transform_apply(void) { mutex_lock(&text_mutex); - text_poke_finish(); + smp_text_poke_batch_finish(); mutex_unlock(&text_mutex); } diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 09608fd93687..47cb8eb138ba 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -808,7 +808,7 @@ void arch_arm_kprobe(struct kprobe *p) u8 int3 = INT3_INSN_OPCODE; text_poke(p->addr, &int3, 1); - text_poke_sync(); + smp_text_poke_sync_each_cpu(); perf_event_text_poke(p->addr, &p->opcode, 1, &int3, 1); } @@ -818,7 +818,7 @@ void arch_disarm_kprobe(struct kprobe *p) perf_event_text_poke(p->addr, &int3, 1, &p->opcode, 1); text_poke(p->addr, &p->opcode, 1); - text_poke_sync(); + smp_text_poke_sync_each_cpu(); } void arch_remove_kprobe(struct kprobe *p) diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 36d6809c6c9e..0aabd4c4e2c4 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -488,7 +488,7 @@ void arch_optimize_kprobes(struct list_head *oplist) insn_buff[0] = JMP32_INSN_OPCODE; *(s32 *)(&insn_buff[1]) = rel; - text_poke_bp(op->kp.addr, insn_buff, JMP32_INSN_SIZE, NULL); + smp_text_poke_single(op->kp.addr, insn_buff, JMP32_INSN_SIZE, NULL); list_del_init(&op->list); } @@ -513,11 +513,11 @@ void arch_unoptimize_kprobe(struct optimized_kprobe *op) JMP32_INSN_SIZE - INT3_INSN_SIZE); text_poke(addr, new, INT3_INSN_SIZE); - text_poke_sync(); + smp_text_poke_sync_each_cpu(); text_poke(addr + INT3_INSN_SIZE, new + INT3_INSN_SIZE, JMP32_INSN_SIZE - INT3_INSN_SIZE); - text_poke_sync(); + smp_text_poke_sync_each_cpu(); perf_event_text_poke(op->kp.addr, old, JMP32_INSN_SIZE, new, JMP32_INSN_SIZE); } diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index a7998f351701..231d6326d1fd 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -206,7 +206,7 @@ static int write_relocate_add(Elf64_Shdr *sechdrs, write, apply); if (!early) { - text_poke_sync(); + smp_text_poke_sync_each_cpu(); mutex_unlock(&text_mutex); } diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c index a59c72e77645..8164a7323c17 100644 --- a/arch/x86/kernel/static_call.c +++ b/arch/x86/kernel/static_call.c @@ -108,7 +108,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, if (system_state == SYSTEM_BOOTING || modinit) return text_poke_early(insn, code, size); - text_poke_bp(insn, code, size, emulate); + smp_text_poke_single(insn, code, size, emulate); } static void __static_call_validate(u8 *insn, bool tail, bool tramp) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 9f88b8a78e50..d67407c623f3 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -882,16 +882,16 @@ static void do_int3_user(struct pt_regs *regs) DEFINE_IDTENTRY_RAW(exc_int3) { /* - * poke_int3_handler() is completely self contained code; it does (and + * smp_text_poke_int3_handler() is 
completely self contained code; it does (and * must) *NOT* call out to anything, lest it hits upon yet another * INT3. */ - if (poke_int3_handler(regs)) + if (smp_text_poke_int3_handler(regs)) return; /* * irqentry_enter_from_user_mode() uses static_branch_{,un}likely() - * and therefore can trigger INT3, hence poke_int3_handler() must + * and therefore can trigger INT3, hence smp_text_poke_int3_handler() must * be done before. If the entry came from kernel mode, then use * nmi_enter() because the INT3 could have been hit in any context * including NMI. diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index 98631c0e7a11..f786401ac15d 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -631,14 +631,21 @@ static bool get_desc(struct desc_struct *out, unsigned short sel) /* Bits [15:3] contain the index of the desired entry. */ sel >>= 3; - mutex_lock(¤t->active_mm->context.lock); - ldt = current->active_mm->context.ldt; + /* + * If we're not in a valid context with a real (not just lazy) + * user mm, then don't even try. + */ + if (!nmi_uaccess_okay()) + return false; + + mutex_lock(¤t->mm->context.lock); + ldt = current->mm->context.ldt; if (ldt && sel < ldt->nr_entries) { *out = ldt->entries[sel]; success = true; } - mutex_unlock(¤t->active_mm->context.lock); + mutex_unlock(¤t->mm->context.lock); return success; } diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index bfa444a7dbb0..aa56d9ac0b8f 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -28,6 +28,7 @@ #include <asm/text-patching.h> #include <asm/memtype.h> #include <asm/paravirt.h> +#include <asm/mmu_context.h> /* * We need to define the tracepoints somewhere, and tlb.c @@ -824,31 +825,33 @@ void __init poking_init(void) spinlock_t *ptl; pte_t *ptep; - poking_mm = mm_alloc(); - BUG_ON(!poking_mm); + text_poke_mm = mm_alloc(); + BUG_ON(!text_poke_mm); /* Xen PV guests need the PGD to be pinned. */ - paravirt_enter_mmap(poking_mm); + paravirt_enter_mmap(text_poke_mm); + + set_notrack_mm(text_poke_mm); /* * Randomize the poking address, but make sure that the following page * will be mapped at the same PMD. We need 2 pages, so find space for 3, * and adjust the address if the PMD ends after the first one. */ - poking_addr = TASK_UNMAPPED_BASE; + text_poke_mm_addr = TASK_UNMAPPED_BASE; if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) - poking_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) % + text_poke_mm_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) % (TASK_SIZE - TASK_UNMAPPED_BASE - 3 * PAGE_SIZE); - if (((poking_addr + PAGE_SIZE) & ~PMD_MASK) == 0) - poking_addr += PAGE_SIZE; + if (((text_poke_mm_addr + PAGE_SIZE) & ~PMD_MASK) == 0) + text_poke_mm_addr += PAGE_SIZE; /* * We need to trigger the allocation of the page-tables that will be * needed for poking now. Later, poking may be performed in an atomic * section, which might cause allocation to fail. */ - ptep = get_locked_pte(poking_mm, poking_addr, &ptl); + ptep = get_locked_pte(text_poke_mm, text_poke_mm_addr, &ptl); BUG_ON(!ptep); pte_unmap_unlock(ptep, ptl); } diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index b6d6750e4bd1..2ea8cec6cd49 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -847,7 +847,8 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, * mm_cpumask. The TLB shootdown code can figure out from * cpu_tlbstate_shared.is_lazy whether or not to send an IPI. 
*/ - if (IS_ENABLED(CONFIG_DEBUG_VM) && WARN_ON_ONCE(prev != &init_mm && + if (IS_ENABLED(CONFIG_DEBUG_VM) && + WARN_ON_ONCE(prev != &init_mm && !is_notrack_mm(prev) && !cpumask_test_cpu(cpu, mm_cpumask(next)))) cpumask_set_cpu(cpu, mm_cpumask(next)); @@ -906,14 +907,6 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); barrier(); - /* - * Leave this CPU in prev's mm_cpumask. Atomic writes to - * mm_cpumask can be expensive under contention. The CPU - * will be removed lazily at TLB flush time. - */ - VM_WARN_ON_ONCE(prev != &init_mm && !cpumask_test_cpu(cpu, - mm_cpumask(prev))); - /* Start receiving IPIs and then read tlb_gen (and LAM below) */ if (next != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next))) cpumask_set_cpu(cpu, mm_cpumask(next)); @@ -973,6 +966,77 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) } /* + * Using a temporary mm allows to set temporary mappings that are not accessible + * by other CPUs. Such mappings are needed to perform sensitive memory writes + * that override the kernel memory protections (e.g., W^X), without exposing the + * temporary page-table mappings that are required for these write operations to + * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the + * mapping is torn down. Temporary mms can also be used for EFI runtime service + * calls or similar functionality. + * + * It is illegal to schedule while using a temporary mm -- the context switch + * code is unaware of the temporary mm and does not know how to context switch. + * Use a real (non-temporary) mm in a kernel thread if you need to sleep. + * + * Note: For sensitive memory writes, the temporary mm needs to be used + * exclusively by a single core, and IRQs should be disabled while the + * temporary mm is loaded, thereby preventing interrupt handler bugs from + * overriding the kernel memory protection. + */ +struct mm_struct *use_temporary_mm(struct mm_struct *temp_mm) +{ + struct mm_struct *prev_mm; + + lockdep_assert_preemption_disabled(); + guard(irqsave)(); + + /* + * Make sure not to be in TLB lazy mode, as otherwise we'll end up + * with a stale address space WITHOUT being in lazy mode after + * restoring the previous mm. + */ + if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) + leave_mm(); + + prev_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + switch_mm_irqs_off(NULL, temp_mm, current); + + /* + * If breakpoints are enabled, disable them while the temporary mm is + * used. Userspace might set up watchpoints on addresses that are used + * in the temporary mm, which would lead to wrong signals being sent or + * crashes. + * + * Note that breakpoints are not disabled selectively, which also causes + * kernel breakpoints (e.g., perf's) to be disabled. This might be + * undesirable, but still seems reasonable as the code that runs in the + * temporary mm should be short. + */ + if (hw_breakpoint_active()) + hw_breakpoint_disable(); + + return prev_mm; +} + +void unuse_temporary_mm(struct mm_struct *prev_mm) +{ + lockdep_assert_preemption_disabled(); + guard(irqsave)(); + + /* Clear the cpumask, to indicate no TLB flushing is needed anywhere */ + cpumask_clear_cpu(smp_processor_id(), mm_cpumask(this_cpu_read(cpu_tlbstate.loaded_mm))); + + switch_mm_irqs_off(NULL, prev_mm, current); + + /* + * Restore the breakpoints if they were disabled before the temporary mm + * was loaded. 
+ */ + if (hw_breakpoint_active()) + hw_breakpoint_restore(); +} + +/* * Call this when reinitializing a CPU. It fixes the following potential * problems: * diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 9e5fe2ba858f..e2b9991c3326 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -629,7 +629,7 @@ static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, goto out; ret = 1; if (memcmp(ip, new_insn, X86_PATCH_SIZE)) { - text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL); + smp_text_poke_single(ip, new_insn, X86_PATCH_SIZE, NULL); ret = 0; } out: diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index a4b4ebd41b8f..e7e8f77f77f8 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -89,6 +89,7 @@ int __init efi_alloc_page_tables(void) efi_mm.pgd = efi_pgd; mm_init_cpumask(&efi_mm); init_new_context(NULL, &efi_mm); + set_notrack_mm(&efi_mm); return 0; @@ -434,15 +435,12 @@ void __init efi_dump_pagetable(void) */ static void efi_enter_mm(void) { - efi_prev_mm = current->active_mm; - current->active_mm = &efi_mm; - switch_mm(efi_prev_mm, &efi_mm, NULL); + efi_prev_mm = use_temporary_mm(&efi_mm); } static void efi_leave_mm(void) { - current->active_mm = efi_prev_mm; - switch_mm(&efi_mm, efi_prev_mm, NULL); + unuse_temporary_mm(efi_prev_mm); } void arch_efi_call_virt_setup(void) |
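As a closing note (again not part of the patch): the temporary-mm helpers exported above are used following the pattern below, mirroring the new efi_enter_mm()/efi_leave_mm() pair; run_with_temporary_mm() and its argument are illustrative only.

/*
 * Sketch: run a short, non-sleeping piece of work under a private mm.
 * Per the comments added in tlb.c, the caller must not schedule while
 * the temporary mm is loaded, and preemption must already be disabled.
 */
#include <asm/mmu_context.h>	/* use_temporary_mm(), unuse_temporary_mm() */

static void run_with_temporary_mm(struct mm_struct *some_private_mm)
{
	struct mm_struct *prev_mm;

	prev_mm = use_temporary_mm(some_private_mm);

	/* ... touch mappings that exist only in some_private_mm ... */

	unuse_temporary_mm(prev_mm);
}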