diff options
Diffstat (limited to 'kernel/entry')
-rw-r--r-- | kernel/entry/common.c | 185 | ||||
-rw-r--r-- | kernel/entry/kvm.c | 11 | ||||
-rw-r--r-- | kernel/entry/syscall_user_dispatch.c | 86 |
3 files changed, 140 insertions, 142 deletions
diff --git a/kernel/entry/common.c b/kernel/entry/common.c index f9d491b17b78..90843cc38588 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -2,34 +2,19 @@ #include <linux/context_tracking.h> #include <linux/entry-common.h> +#include <linux/resume_user_mode.h> #include <linux/highmem.h> +#include <linux/jump_label.h> +#include <linux/kmsan.h> #include <linux/livepatch.h> #include <linux/audit.h> +#include <linux/tick.h> #include "common.h" #define CREATE_TRACE_POINTS #include <trace/events/syscalls.h> -/* See comment for enter_from_user_mode() in entry-common.h */ -static __always_inline void __enter_from_user_mode(struct pt_regs *regs) -{ - arch_check_user_regs(regs); - lockdep_hardirqs_off(CALLER_ADDR0); - - CT_WARN_ON(ct_state() != CONTEXT_USER); - user_exit_irqoff(); - - instrumentation_begin(); - trace_hardirqs_off_finish(); - instrumentation_end(); -} - -void noinstr enter_from_user_mode(struct pt_regs *regs) -{ - __enter_from_user_mode(regs); -} - static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) { if (unlikely(audit_context())) { @@ -40,7 +25,7 @@ static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) } } -static long syscall_trace_enter(struct pt_regs *regs, long syscall, +long syscall_trace_enter(struct pt_regs *regs, long syscall, unsigned long work) { long ret = 0; @@ -57,7 +42,7 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, /* Handle ptrace */ if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) { - ret = arch_syscall_enter_tracehook(regs); + ret = ptrace_report_syscall_entry(regs); if (ret || (work & SYSCALL_WORK_SYSCALL_EMU)) return -1L; } @@ -72,83 +57,38 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, /* Either of the above might have changed the syscall number */ syscall = syscall_get_nr(current, regs); - if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) + if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) { trace_sys_enter(regs, syscall); + /* + * Probes or BPF hooks in the tracepoint may have changed the + * system call number as well. + */ + syscall = syscall_get_nr(current, regs); + } syscall_enter_audit(regs, syscall); return ret ? : syscall; } -static __always_inline long -__syscall_enter_from_user_work(struct pt_regs *regs, long syscall) -{ - unsigned long work = READ_ONCE(current_thread_info()->syscall_work); - - if (work & SYSCALL_WORK_ENTER) - syscall = syscall_trace_enter(regs, syscall, work); - - return syscall; -} - -long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall) -{ - return __syscall_enter_from_user_work(regs, syscall); -} - -noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall) -{ - long ret; - - __enter_from_user_mode(regs); - - instrumentation_begin(); - local_irq_enable(); - ret = __syscall_enter_from_user_work(regs, syscall); - instrumentation_end(); - - return ret; -} - noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs) { - __enter_from_user_mode(regs); + enter_from_user_mode(regs); instrumentation_begin(); local_irq_enable(); instrumentation_end(); } -/* See comment for exit_to_user_mode() in entry-common.h */ -static __always_inline void __exit_to_user_mode(void) -{ - instrumentation_begin(); - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(CALLER_ADDR0); - instrumentation_end(); - - user_enter_irqoff(); - arch_exit_to_user_mode(); - lockdep_hardirqs_on(CALLER_ADDR0); -} - -void noinstr exit_to_user_mode(void) -{ - __exit_to_user_mode(); -} - /* Workaround to allow gradual conversion of architecture code */ -void __weak arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { } - -static void handle_signal_work(struct pt_regs *regs, unsigned long ti_work) -{ - if (ti_work & _TIF_NOTIFY_SIGNAL) - tracehook_notify_signal(); - - arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING); -} +void __weak arch_do_signal_or_restart(struct pt_regs *regs) { } -static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, - unsigned long ti_work) +/** + * exit_to_user_mode_loop - do any pending work before leaving to user space + * @regs: Pointer to pt_regs on entry stack + * @ti_work: TIF work flags as read by the caller + */ +__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, + unsigned long ti_work) { /* * Before returning to user space ensure that all pending work @@ -168,12 +108,10 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, klp_update_patch_state(current); if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) - handle_signal_work(regs, ti_work); + arch_do_signal_or_restart(regs); - if (ti_work & _TIF_NOTIFY_RESUME) { - tracehook_notify_resume(regs); - rseq_handle_notify_resume(NULL, regs); - } + if (ti_work & _TIF_NOTIFY_RESUME) + resume_user_mode_work(regs); /* Architecture specific TIF work */ arch_exit_to_user_mode_work(regs, ti_work); @@ -184,31 +122,17 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, * enabled above. */ local_irq_disable_exit_to_user(); - ti_work = READ_ONCE(current_thread_info()->flags); + + /* Check if any of the above work has queued a deferred wakeup */ + tick_nohz_user_enter_prepare(); + + ti_work = read_thread_flags(); } /* Return the latest work state for arch_exit_to_user_mode() */ return ti_work; } -static void exit_to_user_mode_prepare(struct pt_regs *regs) -{ - unsigned long ti_work = READ_ONCE(current_thread_info()->flags); - - lockdep_assert_irqs_disabled(); - - if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) - ti_work = exit_to_user_mode_loop(regs, ti_work); - - arch_exit_to_user_mode_prepare(regs, ti_work); - - /* Ensure that the address limit is intact and no locks are held */ - addr_limit_user_check(); - kmap_assert_nomap(); - lockdep_assert_irqs_disabled(); - lockdep_sys_exit(); -} - /* * If SYSCALL_EMU is set, then the only reason to report is when * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall @@ -246,7 +170,7 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long work) step = report_single_step(work); if (step || work & SYSCALL_WORK_SYSCALL_TRACE) - arch_syscall_exit_tracehook(regs, step); + ptrace_report_syscall_exit(regs, step); } /* @@ -293,12 +217,12 @@ __visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs) instrumentation_begin(); __syscall_exit_to_user_mode_work(regs); instrumentation_end(); - __exit_to_user_mode(); + exit_to_user_mode(); } noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs) { - __enter_from_user_mode(regs); + enter_from_user_mode(regs); } noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs) @@ -306,7 +230,7 @@ noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs) instrumentation_begin(); exit_to_user_mode_prepare(regs); instrumentation_end(); - __exit_to_user_mode(); + exit_to_user_mode(); } noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) @@ -321,7 +245,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) } /* - * If this entry hit the idle task invoke rcu_irq_enter() whether + * If this entry hit the idle task invoke ct_irq_enter() whether * RCU is watching or not. * * Interrupts can nest when the first interrupt invokes softirq @@ -332,12 +256,12 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) * not nested into another interrupt. * * Checking for rcu_is_watching() here would prevent the nesting - * interrupt to invoke rcu_irq_enter(). If that nested interrupt is + * interrupt to invoke ct_irq_enter(). If that nested interrupt is * the tick then rcu_flavor_sched_clock_irq() would wrongfully - * assume that it is the first interupt and eventually claim + * assume that it is the first interrupt and eventually claim * quiescent state and end grace periods prematurely. * - * Unconditionally invoke rcu_irq_enter() so RCU state stays + * Unconditionally invoke ct_irq_enter() so RCU state stays * consistent. * * TINY_RCU does not support EQS, so let the compiler eliminate @@ -350,8 +274,9 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) * as in irqentry_enter_from_user_mode(). */ lockdep_hardirqs_off(CALLER_ADDR0); - rcu_irq_enter(); + ct_irq_enter(); instrumentation_begin(); + kmsan_unpoison_entry_regs(regs); trace_hardirqs_off_finish(); instrumentation_end(); @@ -367,6 +292,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) */ lockdep_hardirqs_off(CALLER_ADDR0); instrumentation_begin(); + kmsan_unpoison_entry_regs(regs); rcu_irq_enter_check_tick(); trace_hardirqs_off_finish(); instrumentation_end(); @@ -374,7 +300,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) return ret; } -void irqentry_exit_cond_resched(void) +void raw_irqentry_exit_cond_resched(void) { if (!preempt_count()) { /* Sanity check RCU and thread stack */ @@ -385,6 +311,19 @@ void irqentry_exit_cond_resched(void) preempt_schedule_irq(); } } +#ifdef CONFIG_PREEMPT_DYNAMIC +#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +DEFINE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched); +#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); +void dynamic_irqentry_exit_cond_resched(void) +{ + if (!static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched)) + return; + raw_irqentry_exit_cond_resched(); +} +#endif +#endif noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) { @@ -403,9 +342,9 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) instrumentation_begin(); /* Tell the tracer that IRET will enable interrupts */ trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(CALLER_ADDR0); + lockdep_hardirqs_on_prepare(); instrumentation_end(); - rcu_irq_exit(); + ct_irq_exit(); lockdep_hardirqs_on(CALLER_ADDR0); return; } @@ -413,6 +352,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) instrumentation_begin(); if (IS_ENABLED(CONFIG_PREEMPTION)) irqentry_exit_cond_resched(); + /* Covers both tracing and lockdep */ trace_hardirqs_on(); instrumentation_end(); @@ -422,7 +362,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) * was not watching on entry. */ if (state.exit_rcu) - rcu_irq_exit(); + ct_irq_exit(); } } @@ -435,9 +375,10 @@ irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs) __nmi_enter(); lockdep_hardirqs_off(CALLER_ADDR0); lockdep_hardirq_enter(); - rcu_nmi_enter(); + ct_nmi_enter(); instrumentation_begin(); + kmsan_unpoison_entry_regs(regs); trace_hardirqs_off_finish(); ftrace_nmi_enter(); instrumentation_end(); @@ -451,11 +392,11 @@ void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state) ftrace_nmi_exit(); if (irq_state.lockdep) { trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(CALLER_ADDR0); + lockdep_hardirqs_on_prepare(); } instrumentation_end(); - rcu_nmi_exit(); + ct_nmi_exit(); lockdep_hardirq_exit(); if (irq_state.lockdep) lockdep_hardirqs_on(CALLER_ADDR0); diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c index 49972ee99aff..2e0f75bcb7fd 100644 --- a/kernel/entry/kvm.c +++ b/kernel/entry/kvm.c @@ -8,10 +8,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) do { int ret; - if (ti_work & _TIF_NOTIFY_SIGNAL) - tracehook_notify_signal(); - - if (ti_work & _TIF_SIGPENDING) { + if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) { kvm_handle_signal_exit(vcpu); return -EINTR; } @@ -20,13 +17,13 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) schedule(); if (ti_work & _TIF_NOTIFY_RESUME) - tracehook_notify_resume(NULL); + resume_user_mode_work(NULL); ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work); if (ret) return ret; - ti_work = READ_ONCE(current_thread_info()->flags); + ti_work = read_thread_flags(); } while (ti_work & XFER_TO_GUEST_MODE_WORK || need_resched()); return 0; } @@ -43,7 +40,7 @@ int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu) * disabled in the inner loop before going into guest mode. No need * to disable interrupts here. */ - ti_work = READ_ONCE(current_thread_info()->flags); + ti_work = read_thread_flags(); if (!(ti_work & XFER_TO_GUEST_MODE_WORK)) return 0; diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c index c240302f56e2..5340c5aa89e7 100644 --- a/kernel/entry/syscall_user_dispatch.c +++ b/kernel/entry/syscall_user_dispatch.c @@ -4,6 +4,7 @@ */ #include <linux/sched.h> #include <linux/prctl.h> +#include <linux/ptrace.h> #include <linux/syscall_user_dispatch.h> #include <linux/uaccess.h> #include <linux/signal.h> @@ -47,14 +48,18 @@ bool syscall_user_dispatch(struct pt_regs *regs) * access_ok() is performed once, at prctl time, when * the selector is loaded by userspace. */ - if (unlikely(__get_user(state, sd->selector))) - do_exit(SIGSEGV); + if (unlikely(__get_user(state, sd->selector))) { + force_exit_sig(SIGSEGV); + return true; + } if (likely(state == SYSCALL_DISPATCH_FILTER_ALLOW)) return false; - if (state != SYSCALL_DISPATCH_FILTER_BLOCK) - do_exit(SIGSYS); + if (state != SYSCALL_DISPATCH_FILTER_BLOCK) { + force_exit_sig(SIGSYS); + return true; + } } sd->on_dispatch = true; @@ -64,8 +69,9 @@ bool syscall_user_dispatch(struct pt_regs *regs) return true; } -int set_syscall_user_dispatch(unsigned long mode, unsigned long offset, - unsigned long len, char __user *selector) +static int task_set_syscall_user_dispatch(struct task_struct *task, unsigned long mode, + unsigned long offset, unsigned long len, + char __user *selector) { switch (mode) { case PR_SYS_DISPATCH_OFF: @@ -82,7 +88,16 @@ int set_syscall_user_dispatch(unsigned long mode, unsigned long offset, if (offset && offset + len <= offset) return -EINVAL; - if (selector && !access_ok(selector, sizeof(*selector))) + /* + * access_ok() will clear memory tags for tagged addresses + * if current has memory tagging enabled. + + * To enable a tracer to set a tracees selector the + * selector address must be untagged for access_ok(), + * otherwise an untagged tracer will always fail to set a + * tagged tracees selector. + */ + if (selector && !access_ok(untagged_addr(selector), sizeof(*selector))) return -EFAULT; break; @@ -90,15 +105,60 @@ int set_syscall_user_dispatch(unsigned long mode, unsigned long offset, return -EINVAL; } - current->syscall_dispatch.selector = selector; - current->syscall_dispatch.offset = offset; - current->syscall_dispatch.len = len; - current->syscall_dispatch.on_dispatch = false; + task->syscall_dispatch.selector = selector; + task->syscall_dispatch.offset = offset; + task->syscall_dispatch.len = len; + task->syscall_dispatch.on_dispatch = false; if (mode == PR_SYS_DISPATCH_ON) - set_syscall_work(SYSCALL_USER_DISPATCH); + set_task_syscall_work(task, SYSCALL_USER_DISPATCH); + else + clear_task_syscall_work(task, SYSCALL_USER_DISPATCH); + + return 0; +} + +int set_syscall_user_dispatch(unsigned long mode, unsigned long offset, + unsigned long len, char __user *selector) +{ + return task_set_syscall_user_dispatch(current, mode, offset, len, selector); +} + +int syscall_user_dispatch_get_config(struct task_struct *task, unsigned long size, + void __user *data) +{ + struct syscall_user_dispatch *sd = &task->syscall_dispatch; + struct ptrace_sud_config cfg; + + if (size != sizeof(cfg)) + return -EINVAL; + + if (test_task_syscall_work(task, SYSCALL_USER_DISPATCH)) + cfg.mode = PR_SYS_DISPATCH_ON; else - clear_syscall_work(SYSCALL_USER_DISPATCH); + cfg.mode = PR_SYS_DISPATCH_OFF; + + cfg.offset = sd->offset; + cfg.len = sd->len; + cfg.selector = (__u64)(uintptr_t)sd->selector; + + if (copy_to_user(data, &cfg, sizeof(cfg))) + return -EFAULT; return 0; } + +int syscall_user_dispatch_set_config(struct task_struct *task, unsigned long size, + void __user *data) +{ + struct ptrace_sud_config cfg; + + if (size != sizeof(cfg)) + return -EINVAL; + + if (copy_from_user(&cfg, data, sizeof(cfg))) + return -EFAULT; + + return task_set_syscall_user_dispatch(task, cfg.mode, cfg.offset, cfg.len, + (char __user *)(uintptr_t)cfg.selector); +} |