diff options
Diffstat (limited to 'kernel/entry/common.c')
| -rw-r--r-- | kernel/entry/common.c | 295 |
1 files changed, 39 insertions, 256 deletions
diff --git a/kernel/entry/common.c b/kernel/entry/common.c index be61332c66b5..5c792b30c58a 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -1,161 +1,34 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/context_tracking.h> -#include <linux/entry-common.h> +#include <linux/irq-entry-common.h> #include <linux/resume_user_mode.h> #include <linux/highmem.h> #include <linux/jump_label.h> #include <linux/kmsan.h> #include <linux/livepatch.h> -#include <linux/audit.h> #include <linux/tick.h> -#include "common.h" - -#define CREATE_TRACE_POINTS -#include <trace/events/syscalls.h> - -/* See comment for enter_from_user_mode() in entry-common.h */ -static __always_inline void __enter_from_user_mode(struct pt_regs *regs) -{ - arch_enter_from_user_mode(regs); - lockdep_hardirqs_off(CALLER_ADDR0); - - CT_WARN_ON(__ct_state() != CONTEXT_USER); - user_exit_irqoff(); - - instrumentation_begin(); - kmsan_unpoison_entry_regs(regs); - trace_hardirqs_off_finish(); - instrumentation_end(); -} - -void noinstr enter_from_user_mode(struct pt_regs *regs) -{ - __enter_from_user_mode(regs); -} - -static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) -{ - if (unlikely(audit_context())) { - unsigned long args[6]; - - syscall_get_arguments(current, regs, args); - audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]); - } -} - -static long syscall_trace_enter(struct pt_regs *regs, long syscall, - unsigned long work) -{ - long ret = 0; - - /* - * Handle Syscall User Dispatch. This must comes first, since - * the ABI here can be something that doesn't make sense for - * other syscall_work features. - */ - if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { - if (syscall_user_dispatch(regs)) - return -1L; - } - - /* Handle ptrace */ - if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) { - ret = ptrace_report_syscall_entry(regs); - if (ret || (work & SYSCALL_WORK_SYSCALL_EMU)) - return -1L; - } - - /* Do seccomp after ptrace, to catch any tracer changes. */ - if (work & SYSCALL_WORK_SECCOMP) { - ret = __secure_computing(NULL); - if (ret == -1L) - return ret; - } - - /* Either of the above might have changed the syscall number */ - syscall = syscall_get_nr(current, regs); - - if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) - trace_sys_enter(regs, syscall); - - syscall_enter_audit(regs, syscall); - - return ret ? : syscall; -} - -static __always_inline long -__syscall_enter_from_user_work(struct pt_regs *regs, long syscall) -{ - unsigned long work = READ_ONCE(current_thread_info()->syscall_work); - - if (work & SYSCALL_WORK_ENTER) - syscall = syscall_trace_enter(regs, syscall, work); - - return syscall; -} - -long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall) -{ - return __syscall_enter_from_user_work(regs, syscall); -} - -noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall) -{ - long ret; - - __enter_from_user_mode(regs); - - instrumentation_begin(); - local_irq_enable(); - ret = __syscall_enter_from_user_work(regs, syscall); - instrumentation_end(); - - return ret; -} - -noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs) -{ - __enter_from_user_mode(regs); - instrumentation_begin(); - local_irq_enable(); - instrumentation_end(); -} - -/* See comment for exit_to_user_mode() in entry-common.h */ -static __always_inline void __exit_to_user_mode(void) -{ - instrumentation_begin(); - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(); - instrumentation_end(); - - user_enter_irqoff(); - arch_exit_to_user_mode(); - lockdep_hardirqs_on(CALLER_ADDR0); -} - -void noinstr exit_to_user_mode(void) -{ - __exit_to_user_mode(); -} - /* Workaround to allow gradual conversion of architecture code */ void __weak arch_do_signal_or_restart(struct pt_regs *regs) { } -static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, - unsigned long ti_work) +#ifdef CONFIG_HAVE_GENERIC_TIF_BITS +#define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK & ~_TIF_RSEQ) +#else +#define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK) +#endif + +static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *regs, + unsigned long ti_work) { /* * Before returning to user space ensure that all pending work * items have been completed. */ - while (ti_work & EXIT_TO_USER_MODE_WORK) { + while (ti_work & EXIT_TO_USER_MODE_WORK_LOOP) { local_irq_enable_exit_to_user(ti_work); - if (ti_work & _TIF_NEED_RESCHED) + if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) schedule(); if (ti_work & _TIF_UPROBE) @@ -190,126 +63,21 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, return ti_work; } -static void exit_to_user_mode_prepare(struct pt_regs *regs) -{ - unsigned long ti_work; - - lockdep_assert_irqs_disabled(); - - /* Flush pending rcuog wakeup before the last need_resched() check */ - tick_nohz_user_enter_prepare(); - - ti_work = read_thread_flags(); - if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) - ti_work = exit_to_user_mode_loop(regs, ti_work); - - arch_exit_to_user_mode_prepare(regs, ti_work); - - /* Ensure that the address limit is intact and no locks are held */ - addr_limit_user_check(); - kmap_assert_nomap(); - lockdep_assert_irqs_disabled(); - lockdep_sys_exit(); -} - -/* - * If SYSCALL_EMU is set, then the only reason to report is when - * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall - * instruction has been already reported in syscall_enter_from_user_mode(). - */ -static inline bool report_single_step(unsigned long work) -{ - if (work & SYSCALL_WORK_SYSCALL_EMU) - return false; - - return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP; -} - -static void syscall_exit_work(struct pt_regs *regs, unsigned long work) -{ - bool step; - - /* - * If the syscall was rolled back due to syscall user dispatching, - * then the tracers below are not invoked for the same reason as - * the entry side was not invoked in syscall_trace_enter(): The ABI - * of these syscalls is unknown. - */ - if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { - if (unlikely(current->syscall_dispatch.on_dispatch)) { - current->syscall_dispatch.on_dispatch = false; - return; - } - } - - audit_syscall_exit(regs); - - if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT) - trace_sys_exit(regs, syscall_get_return_value(current, regs)); - - step = report_single_step(work); - if (step || work & SYSCALL_WORK_SYSCALL_TRACE) - ptrace_report_syscall_exit(regs, step); -} - -/* - * Syscall specific exit to user mode preparation. Runs with interrupts - * enabled. +/** + * exit_to_user_mode_loop - do any pending work before leaving to user space + * @regs: Pointer to pt_regs on entry stack + * @ti_work: TIF work flags as read by the caller */ -static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) +__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, + unsigned long ti_work) { - unsigned long work = READ_ONCE(current_thread_info()->syscall_work); - unsigned long nr = syscall_get_nr(current, regs); + for (;;) { + ti_work = __exit_to_user_mode_loop(regs, ti_work); - CT_WARN_ON(ct_state() != CONTEXT_KERNEL); - - if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { - if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr)) - local_irq_enable(); + if (likely(!rseq_exit_to_user_mode_restart(regs, ti_work))) + return ti_work; + ti_work = read_thread_flags(); } - - rseq_syscall(regs); - - /* - * Do one-time syscall specific work. If these work items are - * enabled, we want to run them exactly once per syscall exit with - * interrupts enabled. - */ - if (unlikely(work & SYSCALL_WORK_EXIT)) - syscall_exit_work(regs, work); -} - -static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *regs) -{ - syscall_exit_to_user_mode_prepare(regs); - local_irq_disable_exit_to_user(); - exit_to_user_mode_prepare(regs); -} - -void syscall_exit_to_user_mode_work(struct pt_regs *regs) -{ - __syscall_exit_to_user_mode_work(regs); -} - -__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs) -{ - instrumentation_begin(); - __syscall_exit_to_user_mode_work(regs); - instrumentation_end(); - __exit_to_user_mode(); -} - -noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs) -{ - __enter_from_user_mode(regs); -} - -noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs) -{ - instrumentation_begin(); - exit_to_user_mode_prepare(regs); - instrumentation_end(); - __exit_to_user_mode(); } noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) @@ -346,7 +114,8 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) * TINY_RCU does not support EQS, so let the compiler eliminate * this part when enabled. */ - if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) { + if (!IS_ENABLED(CONFIG_TINY_RCU) && + (is_idle_task(current) || arch_in_rcu_eqs())) { /* * If RCU is not watching then the same careful * sequence vs. lockdep and tracing is required @@ -379,6 +148,20 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) return ret; } +/** + * arch_irqentry_exit_need_resched - Architecture specific need resched function + * + * Invoked from raw_irqentry_exit_cond_resched() to check if resched is needed. + * Defaults return true. + * + * The main purpose is to permit arch to avoid preemption of a task from an IRQ. + */ +static inline bool arch_irqentry_exit_need_resched(void); + +#ifndef arch_irqentry_exit_need_resched +static inline bool arch_irqentry_exit_need_resched(void) { return true; } +#endif + void raw_irqentry_exit_cond_resched(void) { if (!preempt_count()) { @@ -386,7 +169,7 @@ void raw_irqentry_exit_cond_resched(void) rcu_irq_exit_check_preempt(); if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) WARN_ON_ONCE(!on_thread_stack()); - if (need_resched()) + if (need_resched() && arch_irqentry_exit_need_resched()) preempt_schedule_irq(); } } |
