diff options
Diffstat (limited to 'kernel/entry')
| -rw-r--r-- | kernel/entry/Makefile | 8 | ||||
| -rw-r--r-- | kernel/entry/common.c | 294 | ||||
| -rw-r--r-- | kernel/entry/syscall-common.c | 104 | ||||
| -rw-r--r-- | kernel/entry/syscall_user_dispatch.c | 94 | ||||
| -rw-r--r-- | kernel/entry/virt.c (renamed from kernel/entry/kvm.c) | 19 |
5 files changed, 237 insertions, 282 deletions
diff --git a/kernel/entry/Makefile b/kernel/entry/Makefile index 095c775e001e..2333d70802e4 100644 --- a/kernel/entry/Makefile +++ b/kernel/entry/Makefile @@ -6,8 +6,12 @@ KASAN_SANITIZE := n UBSAN_SANITIZE := n KCOV_INSTRUMENT := n +# Branch profiling isn't noinstr-safe +ccflags-$(CONFIG_TRACE_BRANCH_PROFILING) += -DDISABLE_BRANCH_PROFILING + CFLAGS_REMOVE_common.o = -fstack-protector -fstack-protector-strong CFLAGS_common.o += -fno-stack-protector -obj-$(CONFIG_GENERIC_ENTRY) += common.o syscall_user_dispatch.o -obj-$(CONFIG_KVM_XFER_TO_GUEST_WORK) += kvm.o +obj-$(CONFIG_GENERIC_IRQ_ENTRY) += common.o +obj-$(CONFIG_GENERIC_SYSCALL) += syscall-common.o syscall_user_dispatch.o +obj-$(CONFIG_VIRT_XFER_TO_GUEST_WORK) += virt.o diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 846add8394c4..5c792b30c58a 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -1,161 +1,34 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/context_tracking.h> -#include <linux/entry-common.h> +#include <linux/irq-entry-common.h> #include <linux/resume_user_mode.h> #include <linux/highmem.h> #include <linux/jump_label.h> #include <linux/kmsan.h> #include <linux/livepatch.h> -#include <linux/audit.h> #include <linux/tick.h> -#include "common.h" - -#define CREATE_TRACE_POINTS -#include <trace/events/syscalls.h> - -/* See comment for enter_from_user_mode() in entry-common.h */ -static __always_inline void __enter_from_user_mode(struct pt_regs *regs) -{ - arch_enter_from_user_mode(regs); - lockdep_hardirqs_off(CALLER_ADDR0); - - CT_WARN_ON(ct_state() != CONTEXT_USER); - user_exit_irqoff(); - - instrumentation_begin(); - kmsan_unpoison_entry_regs(regs); - trace_hardirqs_off_finish(); - instrumentation_end(); -} - -void noinstr enter_from_user_mode(struct pt_regs *regs) -{ - __enter_from_user_mode(regs); -} - -static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) -{ - if (unlikely(audit_context())) { - unsigned long args[6]; - - syscall_get_arguments(current, regs, args); - audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]); - } -} - -static long syscall_trace_enter(struct pt_regs *regs, long syscall, - unsigned long work) -{ - long ret = 0; - - /* - * Handle Syscall User Dispatch. This must comes first, since - * the ABI here can be something that doesn't make sense for - * other syscall_work features. - */ - if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { - if (syscall_user_dispatch(regs)) - return -1L; - } - - /* Handle ptrace */ - if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) { - ret = ptrace_report_syscall_entry(regs); - if (ret || (work & SYSCALL_WORK_SYSCALL_EMU)) - return -1L; - } - - /* Do seccomp after ptrace, to catch any tracer changes. */ - if (work & SYSCALL_WORK_SECCOMP) { - ret = __secure_computing(NULL); - if (ret == -1L) - return ret; - } - - /* Either of the above might have changed the syscall number */ - syscall = syscall_get_nr(current, regs); - - if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) - trace_sys_enter(regs, syscall); - - syscall_enter_audit(regs, syscall); - - return ret ? : syscall; -} - -static __always_inline long -__syscall_enter_from_user_work(struct pt_regs *regs, long syscall) -{ - unsigned long work = READ_ONCE(current_thread_info()->syscall_work); - - if (work & SYSCALL_WORK_ENTER) - syscall = syscall_trace_enter(regs, syscall, work); - - return syscall; -} - -long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall) -{ - return __syscall_enter_from_user_work(regs, syscall); -} - -noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall) -{ - long ret; - - __enter_from_user_mode(regs); - - instrumentation_begin(); - local_irq_enable(); - ret = __syscall_enter_from_user_work(regs, syscall); - instrumentation_end(); - - return ret; -} - -noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs) -{ - __enter_from_user_mode(regs); - instrumentation_begin(); - local_irq_enable(); - instrumentation_end(); -} - -/* See comment for exit_to_user_mode() in entry-common.h */ -static __always_inline void __exit_to_user_mode(void) -{ - instrumentation_begin(); - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(); - instrumentation_end(); - - user_enter_irqoff(); - arch_exit_to_user_mode(); - lockdep_hardirqs_on(CALLER_ADDR0); -} - -void noinstr exit_to_user_mode(void) -{ - __exit_to_user_mode(); -} - /* Workaround to allow gradual conversion of architecture code */ void __weak arch_do_signal_or_restart(struct pt_regs *regs) { } -static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, - unsigned long ti_work) +#ifdef CONFIG_HAVE_GENERIC_TIF_BITS +#define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK & ~_TIF_RSEQ) +#else +#define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK) +#endif + +static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *regs, + unsigned long ti_work) { /* * Before returning to user space ensure that all pending work * items have been completed. */ - while (ti_work & EXIT_TO_USER_MODE_WORK) { + while (ti_work & EXIT_TO_USER_MODE_WORK_LOOP) { local_irq_enable_exit_to_user(ti_work); - if (ti_work & _TIF_NEED_RESCHED) + if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) schedule(); if (ti_work & _TIF_UPROBE) @@ -190,125 +63,21 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, return ti_work; } -static void exit_to_user_mode_prepare(struct pt_regs *regs) -{ - unsigned long ti_work = read_thread_flags(); - - lockdep_assert_irqs_disabled(); - - /* Flush pending rcuog wakeup before the last need_resched() check */ - tick_nohz_user_enter_prepare(); - - if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) - ti_work = exit_to_user_mode_loop(regs, ti_work); - - arch_exit_to_user_mode_prepare(regs, ti_work); - - /* Ensure that the address limit is intact and no locks are held */ - addr_limit_user_check(); - kmap_assert_nomap(); - lockdep_assert_irqs_disabled(); - lockdep_sys_exit(); -} - -/* - * If SYSCALL_EMU is set, then the only reason to report is when - * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall - * instruction has been already reported in syscall_enter_from_user_mode(). - */ -static inline bool report_single_step(unsigned long work) -{ - if (work & SYSCALL_WORK_SYSCALL_EMU) - return false; - - return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP; -} - -static void syscall_exit_work(struct pt_regs *regs, unsigned long work) -{ - bool step; - - /* - * If the syscall was rolled back due to syscall user dispatching, - * then the tracers below are not invoked for the same reason as - * the entry side was not invoked in syscall_trace_enter(): The ABI - * of these syscalls is unknown. - */ - if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { - if (unlikely(current->syscall_dispatch.on_dispatch)) { - current->syscall_dispatch.on_dispatch = false; - return; - } - } - - audit_syscall_exit(regs); - - if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT) - trace_sys_exit(regs, syscall_get_return_value(current, regs)); - - step = report_single_step(work); - if (step || work & SYSCALL_WORK_SYSCALL_TRACE) - ptrace_report_syscall_exit(regs, step); -} - -/* - * Syscall specific exit to user mode preparation. Runs with interrupts - * enabled. +/** + * exit_to_user_mode_loop - do any pending work before leaving to user space + * @regs: Pointer to pt_regs on entry stack + * @ti_work: TIF work flags as read by the caller */ -static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) +__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, + unsigned long ti_work) { - unsigned long work = READ_ONCE(current_thread_info()->syscall_work); - unsigned long nr = syscall_get_nr(current, regs); + for (;;) { + ti_work = __exit_to_user_mode_loop(regs, ti_work); - CT_WARN_ON(ct_state() != CONTEXT_KERNEL); - - if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { - if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr)) - local_irq_enable(); + if (likely(!rseq_exit_to_user_mode_restart(regs, ti_work))) + return ti_work; + ti_work = read_thread_flags(); } - - rseq_syscall(regs); - - /* - * Do one-time syscall specific work. If these work items are - * enabled, we want to run them exactly once per syscall exit with - * interrupts enabled. - */ - if (unlikely(work & SYSCALL_WORK_EXIT)) - syscall_exit_work(regs, work); -} - -static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *regs) -{ - syscall_exit_to_user_mode_prepare(regs); - local_irq_disable_exit_to_user(); - exit_to_user_mode_prepare(regs); -} - -void syscall_exit_to_user_mode_work(struct pt_regs *regs) -{ - __syscall_exit_to_user_mode_work(regs); -} - -__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs) -{ - instrumentation_begin(); - __syscall_exit_to_user_mode_work(regs); - instrumentation_end(); - __exit_to_user_mode(); -} - -noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs) -{ - __enter_from_user_mode(regs); -} - -noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs) -{ - instrumentation_begin(); - exit_to_user_mode_prepare(regs); - instrumentation_end(); - __exit_to_user_mode(); } noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) @@ -345,7 +114,8 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) * TINY_RCU does not support EQS, so let the compiler eliminate * this part when enabled. */ - if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) { + if (!IS_ENABLED(CONFIG_TINY_RCU) && + (is_idle_task(current) || arch_in_rcu_eqs())) { /* * If RCU is not watching then the same careful * sequence vs. lockdep and tracing is required @@ -378,6 +148,20 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) return ret; } +/** + * arch_irqentry_exit_need_resched - Architecture specific need resched function + * + * Invoked from raw_irqentry_exit_cond_resched() to check if resched is needed. + * Defaults return true. + * + * The main purpose is to permit arch to avoid preemption of a task from an IRQ. + */ +static inline bool arch_irqentry_exit_need_resched(void); + +#ifndef arch_irqentry_exit_need_resched +static inline bool arch_irqentry_exit_need_resched(void) { return true; } +#endif + void raw_irqentry_exit_cond_resched(void) { if (!preempt_count()) { @@ -385,7 +169,7 @@ void raw_irqentry_exit_cond_resched(void) rcu_irq_exit_check_preempt(); if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) WARN_ON_ONCE(!on_thread_stack()); - if (need_resched()) + if (need_resched() && arch_irqentry_exit_need_resched()) preempt_schedule_irq(); } } diff --git a/kernel/entry/syscall-common.c b/kernel/entry/syscall-common.c new file mode 100644 index 000000000000..940a597ded40 --- /dev/null +++ b/kernel/entry/syscall-common.c @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/audit.h> +#include <linux/entry-common.h> +#include "common.h" + +#define CREATE_TRACE_POINTS +#include <trace/events/syscalls.h> + +static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) +{ + if (unlikely(audit_context())) { + unsigned long args[6]; + + syscall_get_arguments(current, regs, args); + audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]); + } +} + +long syscall_trace_enter(struct pt_regs *regs, long syscall, + unsigned long work) +{ + long ret = 0; + + /* + * Handle Syscall User Dispatch. This must comes first, since + * the ABI here can be something that doesn't make sense for + * other syscall_work features. + */ + if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { + if (syscall_user_dispatch(regs)) + return -1L; + } + + /* Handle ptrace */ + if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) { + ret = ptrace_report_syscall_entry(regs); + if (ret || (work & SYSCALL_WORK_SYSCALL_EMU)) + return -1L; + } + + /* Do seccomp after ptrace, to catch any tracer changes. */ + if (work & SYSCALL_WORK_SECCOMP) { + ret = __secure_computing(); + if (ret == -1L) + return ret; + } + + /* Either of the above might have changed the syscall number */ + syscall = syscall_get_nr(current, regs); + + if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) { + trace_sys_enter(regs, syscall); + /* + * Probes or BPF hooks in the tracepoint may have changed the + * system call number as well. + */ + syscall = syscall_get_nr(current, regs); + } + + syscall_enter_audit(regs, syscall); + + return ret ? : syscall; +} + +/* + * If SYSCALL_EMU is set, then the only reason to report is when + * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall + * instruction has been already reported in syscall_enter_from_user_mode(). + */ +static inline bool report_single_step(unsigned long work) +{ + if (work & SYSCALL_WORK_SYSCALL_EMU) + return false; + + return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP; +} + +void syscall_exit_work(struct pt_regs *regs, unsigned long work) +{ + bool step; + + /* + * If the syscall was rolled back due to syscall user dispatching, + * then the tracers below are not invoked for the same reason as + * the entry side was not invoked in syscall_trace_enter(): The ABI + * of these syscalls is unknown. + */ + if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { + if (unlikely(current->syscall_dispatch.on_dispatch)) { + current->syscall_dispatch.on_dispatch = false; + return; + } + } + + audit_syscall_exit(regs); + + if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT) + trace_sys_exit(regs, syscall_get_return_value(current, regs)); + + step = report_single_step(work); + if (step || work & SYSCALL_WORK_SYSCALL_TRACE) + ptrace_report_syscall_exit(regs, step); +} diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c index 0b6379adff6b..a9055eccb27e 100644 --- a/kernel/entry/syscall_user_dispatch.c +++ b/kernel/entry/syscall_user_dispatch.c @@ -4,6 +4,7 @@ */ #include <linux/sched.h> #include <linux/prctl.h> +#include <linux/ptrace.h> #include <linux/syscall_user_dispatch.h> #include <linux/uaccess.h> #include <linux/signal.h> @@ -68,15 +69,16 @@ bool syscall_user_dispatch(struct pt_regs *regs) return true; } -int set_syscall_user_dispatch(unsigned long mode, unsigned long offset, - unsigned long len, char __user *selector) +static int task_set_syscall_user_dispatch(struct task_struct *task, unsigned long mode, + unsigned long offset, unsigned long len, + char __user *selector) { switch (mode) { case PR_SYS_DISPATCH_OFF: if (offset || len || selector) return -EINVAL; break; - case PR_SYS_DISPATCH_ON: + case PR_SYS_DISPATCH_EXCLUSIVE_ON: /* * Validate the direct dispatcher region just for basic * sanity against overflow and a 0-sized dispatcher @@ -85,24 +87,88 @@ int set_syscall_user_dispatch(unsigned long mode, unsigned long offset, */ if (offset && offset + len <= offset) return -EINVAL; - - if (selector && !access_ok(selector, sizeof(*selector))) - return -EFAULT; - + break; + case PR_SYS_DISPATCH_INCLUSIVE_ON: + if (len == 0 || offset + len <= offset) + return -EINVAL; + /* + * Invert the range, the check in syscall_user_dispatch() + * supports wrap-around. + */ + offset = offset + len; + len = -len; break; default: return -EINVAL; } - current->syscall_dispatch.selector = selector; - current->syscall_dispatch.offset = offset; - current->syscall_dispatch.len = len; - current->syscall_dispatch.on_dispatch = false; + /* + * access_ok() will clear memory tags for tagged addresses + * if current has memory tagging enabled. + * + * To enable a tracer to set a tracees selector the + * selector address must be untagged for access_ok(), + * otherwise an untagged tracer will always fail to set a + * tagged tracees selector. + */ + if (mode != PR_SYS_DISPATCH_OFF && selector && + !access_ok(untagged_addr(selector), sizeof(*selector))) + return -EFAULT; + + task->syscall_dispatch.selector = selector; + task->syscall_dispatch.offset = offset; + task->syscall_dispatch.len = len; + task->syscall_dispatch.on_dispatch = false; + + if (mode != PR_SYS_DISPATCH_OFF) + set_task_syscall_work(task, SYSCALL_USER_DISPATCH); + else + clear_task_syscall_work(task, SYSCALL_USER_DISPATCH); + + return 0; +} + +int set_syscall_user_dispatch(unsigned long mode, unsigned long offset, + unsigned long len, char __user *selector) +{ + return task_set_syscall_user_dispatch(current, mode, offset, len, selector); +} + +int syscall_user_dispatch_get_config(struct task_struct *task, unsigned long size, + void __user *data) +{ + struct syscall_user_dispatch *sd = &task->syscall_dispatch; + struct ptrace_sud_config cfg; + + if (size != sizeof(cfg)) + return -EINVAL; - if (mode == PR_SYS_DISPATCH_ON) - set_syscall_work(SYSCALL_USER_DISPATCH); + if (test_task_syscall_work(task, SYSCALL_USER_DISPATCH)) + cfg.mode = PR_SYS_DISPATCH_ON; else - clear_syscall_work(SYSCALL_USER_DISPATCH); + cfg.mode = PR_SYS_DISPATCH_OFF; + + cfg.offset = sd->offset; + cfg.len = sd->len; + cfg.selector = (__u64)(uintptr_t)sd->selector; + + if (copy_to_user(data, &cfg, sizeof(cfg))) + return -EFAULT; return 0; } + +int syscall_user_dispatch_set_config(struct task_struct *task, unsigned long size, + void __user *data) +{ + struct ptrace_sud_config cfg; + + if (size != sizeof(cfg)) + return -EINVAL; + + if (copy_from_user(&cfg, data, sizeof(cfg))) + return -EFAULT; + + return task_set_syscall_user_dispatch(task, cfg.mode, cfg.offset, cfg.len, + (char __user *)(uintptr_t)cfg.selector); +} diff --git a/kernel/entry/kvm.c b/kernel/entry/virt.c index 2e0f75bcb7fd..c52f99249763 100644 --- a/kernel/entry/kvm.c +++ b/kernel/entry/virt.c @@ -1,34 +1,31 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/entry-kvm.h> -#include <linux/kvm_host.h> +#include <linux/entry-virt.h> -static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) +static int xfer_to_guest_mode_work(unsigned long ti_work) { do { int ret; - if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) { - kvm_handle_signal_exit(vcpu); + if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) return -EINTR; - } - if (ti_work & _TIF_NEED_RESCHED) + if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) schedule(); if (ti_work & _TIF_NOTIFY_RESUME) resume_user_mode_work(NULL); - ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work); + ret = arch_xfer_to_guest_mode_handle_work(ti_work); if (ret) return ret; ti_work = read_thread_flags(); - } while (ti_work & XFER_TO_GUEST_MODE_WORK || need_resched()); + } while (ti_work & XFER_TO_GUEST_MODE_WORK); return 0; } -int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu) +int xfer_to_guest_mode_handle_work(void) { unsigned long ti_work; @@ -44,6 +41,6 @@ int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu) if (!(ti_work & XFER_TO_GUEST_MODE_WORK)) return 0; - return xfer_to_guest_mode_work(vcpu, ti_work); + return xfer_to_guest_mode_work(ti_work); } EXPORT_SYMBOL_GPL(xfer_to_guest_mode_handle_work); |
