Diffstat (limited to 'kernel/entry')
-rw-r--r--  kernel/entry/common.c                 185
-rw-r--r--  kernel/entry/kvm.c                     11
-rw-r--r--  kernel/entry/syscall_user_dispatch.c   86
3 files changed, 140 insertions, 142 deletions
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index f9d491b17b78..90843cc38588 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -2,34 +2,19 @@
#include <linux/context_tracking.h>
#include <linux/entry-common.h>
+#include <linux/resume_user_mode.h>
#include <linux/highmem.h>
+#include <linux/jump_label.h>
+#include <linux/kmsan.h>
#include <linux/livepatch.h>
#include <linux/audit.h>
+#include <linux/tick.h>
#include "common.h"
#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>
-/* See comment for enter_from_user_mode() in entry-common.h */
-static __always_inline void __enter_from_user_mode(struct pt_regs *regs)
-{
- arch_check_user_regs(regs);
- lockdep_hardirqs_off(CALLER_ADDR0);
-
- CT_WARN_ON(ct_state() != CONTEXT_USER);
- user_exit_irqoff();
-
- instrumentation_begin();
- trace_hardirqs_off_finish();
- instrumentation_end();
-}
-
-void noinstr enter_from_user_mode(struct pt_regs *regs)
-{
- __enter_from_user_mode(regs);
-}
-
static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
{
if (unlikely(audit_context())) {
@@ -40,7 +25,7 @@ static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
}
}
-static long syscall_trace_enter(struct pt_regs *regs, long syscall,
+long syscall_trace_enter(struct pt_regs *regs, long syscall,
unsigned long work)
{
long ret = 0;
@@ -57,7 +42,7 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall,
/* Handle ptrace */
if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
- ret = arch_syscall_enter_tracehook(regs);
+ ret = ptrace_report_syscall_entry(regs);
if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
return -1L;
}
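
The rename from arch_syscall_enter_tracehook() to ptrace_report_syscall_entry() above does not change behaviour: a ptrace tracer is still notified at syscall entry and may rewrite or abort the syscall, which is why the function goes on to re-read the syscall number. A hedged userspace sketch of the tracer side (x86-64 only; the getpid-to-getppid rewrite is purely illustrative):

/*
 * x86-64-only sketch: a PTRACE_SYSCALL tracer rewriting the syscall
 * number at the entry stop, i.e. the case syscall_trace_enter() handles
 * by re-reading the number after ptrace_report_syscall_entry().
 */
#include <signal.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/syscall.h>
#include <sys/user.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    pid_t pid = fork();

    if (pid == 0) {                                 /* tracee */
        ptrace(PTRACE_TRACEME, 0, NULL, NULL);
        raise(SIGSTOP);                             /* sync with the tracer */
        long ret = syscall(SYS_getpid);
        printf("child: syscall returned %ld\n", ret);
        _exit(0);
    }

    waitpid(pid, NULL, 0);                          /* SIGSTOP delivery stop */
    ptrace(PTRACE_SYSCALL, pid, NULL, NULL);        /* run to syscall entry */
    waitpid(pid, NULL, 0);                          /* syscall-entry stop */

    struct user_regs_struct regs;
    ptrace(PTRACE_GETREGS, pid, NULL, &regs);
    regs.orig_rax = SYS_getppid;                    /* rewrite getpid -> getppid */
    ptrace(PTRACE_SETREGS, pid, NULL, &regs);

    ptrace(PTRACE_DETACH, pid, NULL, NULL);
    waitpid(pid, NULL, 0);
    return 0;
}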
@@ -72,83 +57,38 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall,
/* Either of the above might have changed the syscall number */
syscall = syscall_get_nr(current, regs);
- if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
+ if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) {
trace_sys_enter(regs, syscall);
+ /*
+ * Probes or BPF hooks in the tracepoint may have changed the
+ * system call number as well.
+ */
+ syscall = syscall_get_nr(current, regs);
+ }
syscall_enter_audit(regs, syscall);
return ret ? : syscall;
}
-static __always_inline long
-__syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
-{
- unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
-
- if (work & SYSCALL_WORK_ENTER)
- syscall = syscall_trace_enter(regs, syscall, work);
-
- return syscall;
-}
-
-long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
-{
- return __syscall_enter_from_user_work(regs, syscall);
-}
-
-noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
-{
- long ret;
-
- __enter_from_user_mode(regs);
-
- instrumentation_begin();
- local_irq_enable();
- ret = __syscall_enter_from_user_work(regs, syscall);
- instrumentation_end();
-
- return ret;
-}
-
noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
{
- __enter_from_user_mode(regs);
+ enter_from_user_mode(regs);
instrumentation_begin();
local_irq_enable();
instrumentation_end();
}
-/* See comment for exit_to_user_mode() in entry-common.h */
-static __always_inline void __exit_to_user_mode(void)
-{
- instrumentation_begin();
- trace_hardirqs_on_prepare();
- lockdep_hardirqs_on_prepare(CALLER_ADDR0);
- instrumentation_end();
-
- user_enter_irqoff();
- arch_exit_to_user_mode();
- lockdep_hardirqs_on(CALLER_ADDR0);
-}
-
-void noinstr exit_to_user_mode(void)
-{
- __exit_to_user_mode();
-}
-
/* Workaround to allow gradual conversion of architecture code */
-void __weak arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { }
-
-static void handle_signal_work(struct pt_regs *regs, unsigned long ti_work)
-{
- if (ti_work & _TIF_NOTIFY_SIGNAL)
- tracehook_notify_signal();
-
- arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING);
-}
+void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
-static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
- unsigned long ti_work)
+/**
+ * exit_to_user_mode_loop - do any pending work before leaving to user space
+ * @regs: Pointer to pt_regs on entry stack
+ * @ti_work: TIF work flags as read by the caller
+ */
+__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
+ unsigned long ti_work)
{
/*
* Before returning to user space ensure that all pending work
@@ -168,12 +108,10 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
klp_update_patch_state(current);
if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
- handle_signal_work(regs, ti_work);
+ arch_do_signal_or_restart(regs);
- if (ti_work & _TIF_NOTIFY_RESUME) {
- tracehook_notify_resume(regs);
- rseq_handle_notify_resume(NULL, regs);
- }
+ if (ti_work & _TIF_NOTIFY_RESUME)
+ resume_user_mode_work(regs);
/* Architecture specific TIF work */
arch_exit_to_user_mode_work(regs, ti_work);
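
One property of this loop is easy to miss: as the next hunk shows, ti_work is re-read only after interrupts have been disabled again, so work queued while a handler ran (a signal, a resched, the deferred wakeup that tick_nohz_user_enter_prepare() now checks for) is picked up by the next iteration rather than lost. A userspace model of that shape follows; it is not kernel code, and every name in it is a stand-in:

/*
 * Model of the exit_to_user_mode_loop() shape: handle work with
 * "interrupts" enabled, then re-read the flags with them disabled.
 */
#include <stdio.h>

static unsigned long fake_flags = 0x5;  /* pretend TIF work is pending */

static unsigned long read_flags(void)
{
    return fake_flags;
}

static void handle(unsigned long work)
{
    printf("handling %#lx\n", work);
    fake_flags = 0;                     /* handlers clear (or re-queue) work */
}

static unsigned long exit_loop_model(unsigned long ti_work)
{
    while (ti_work) {
        /* kernel: local_irq_enable_exit_to_user(ti_work) */
        handle(ti_work);
        /* kernel: local_irq_disable_exit_to_user(), nohz check, re-read */
        ti_work = read_flags();
    }
    return ti_work;     /* latest state for arch_exit_to_user_mode() */
}

int main(void)
{
    exit_loop_model(read_flags());
    return 0;
}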
@@ -184,31 +122,17 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
* enabled above.
*/
local_irq_disable_exit_to_user();
- ti_work = READ_ONCE(current_thread_info()->flags);
+
+ /* Check if any of the above work has queued a deferred wakeup */
+ tick_nohz_user_enter_prepare();
+
+ ti_work = read_thread_flags();
}
/* Return the latest work state for arch_exit_to_user_mode() */
return ti_work;
}
-static void exit_to_user_mode_prepare(struct pt_regs *regs)
-{
- unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
-
- lockdep_assert_irqs_disabled();
-
- if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
- ti_work = exit_to_user_mode_loop(regs, ti_work);
-
- arch_exit_to_user_mode_prepare(regs, ti_work);
-
- /* Ensure that the address limit is intact and no locks are held */
- addr_limit_user_check();
- kmap_assert_nomap();
- lockdep_assert_irqs_disabled();
- lockdep_sys_exit();
-}
-
/*
* If SYSCALL_EMU is set, then the only reason to report is when
* SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
@@ -246,7 +170,7 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
step = report_single_step(work);
if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
- arch_syscall_exit_tracehook(regs, step);
+ ptrace_report_syscall_exit(regs, step);
}
/*
@@ -293,12 +217,12 @@ __visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
instrumentation_begin();
__syscall_exit_to_user_mode_work(regs);
instrumentation_end();
- __exit_to_user_mode();
+ exit_to_user_mode();
}
noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
{
- __enter_from_user_mode(regs);
+ enter_from_user_mode(regs);
}
noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
@@ -306,7 +230,7 @@ noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
instrumentation_begin();
exit_to_user_mode_prepare(regs);
instrumentation_end();
- __exit_to_user_mode();
+ exit_to_user_mode();
}
noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
@@ -321,7 +245,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
}
/*
- * If this entry hit the idle task invoke rcu_irq_enter() whether
+ * If this entry hit the idle task invoke ct_irq_enter() whether
* RCU is watching or not.
*
* Interrupts can nest when the first interrupt invokes softirq
@@ -332,12 +256,12 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
* not nested into another interrupt.
*
* Checking for rcu_is_watching() here would prevent the nesting
- * interrupt to invoke rcu_irq_enter(). If that nested interrupt is
+ * interrupt from invoking ct_irq_enter(). If that nested interrupt is
* the tick then rcu_flavor_sched_clock_irq() would wrongfully
- * assume that it is the first interupt and eventually claim
+ * assume that it is the first interrupt and eventually claim
* quiescent state and end grace periods prematurely.
*
- * Unconditionally invoke rcu_irq_enter() so RCU state stays
+ * Unconditionally invoke ct_irq_enter() so RCU state stays
* consistent.
*
* TINY_RCU does not support EQS, so let the compiler eliminate
@@ -350,8 +274,9 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
* as in irqentry_enter_from_user_mode().
*/
lockdep_hardirqs_off(CALLER_ADDR0);
- rcu_irq_enter();
+ ct_irq_enter();
instrumentation_begin();
+ kmsan_unpoison_entry_regs(regs);
trace_hardirqs_off_finish();
instrumentation_end();
@@ -367,6 +292,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
*/
lockdep_hardirqs_off(CALLER_ADDR0);
instrumentation_begin();
+ kmsan_unpoison_entry_regs(regs);
rcu_irq_enter_check_tick();
trace_hardirqs_off_finish();
instrumentation_end();
@@ -374,7 +300,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
return ret;
}
-void irqentry_exit_cond_resched(void)
+void raw_irqentry_exit_cond_resched(void)
{
if (!preempt_count()) {
/* Sanity check RCU and thread stack */
@@ -385,6 +311,19 @@ void irqentry_exit_cond_resched(void)
preempt_schedule_irq();
}
}
+#ifdef CONFIG_PREEMPT_DYNAMIC
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+DEFINE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched);
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
+void dynamic_irqentry_exit_cond_resched(void)
+{
+ if (!static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched))
+ return;
+ raw_irqentry_exit_cond_resched();
+}
+#endif
+#endif
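
Both flavours let the preempt= boot parameter retarget this hook: the static-call variant patches the call site, the static-key variant folds the branch. Below is a plain-C model of the two mechanisms; the kernel patches machine code at runtime, so the function pointer and bool here merely stand in:

/*
 * Model of the two PREEMPT_DYNAMIC flavours above (not kernel code).
 */
#include <stdbool.h>
#include <stdio.h>

static void raw_cond_resched(void)
{
    puts("preemption point taken");
}

static void nop(void)
{
}

/* HAVE_PREEMPT_DYNAMIC_CALL model: retarget the "call site" */
static void (*cond_resched_call)(void) = raw_cond_resched;

/* HAVE_PREEMPT_DYNAMIC_KEY model: a default-true "key" gates the call */
static bool dynamic_key = true;

static void dynamic_cond_resched(void)
{
    if (!dynamic_key)
        return;
    raw_cond_resched();
}

int main(void)
{
    cond_resched_call();            /* preempt=full: resched happens */
    cond_resched_call = nop;        /* preempt=none: "patched" to a NOP */
    cond_resched_call();

    dynamic_cond_resched();         /* key enabled: resched happens */
    dynamic_key = false;            /* key disabled: branch "folded" away */
    dynamic_cond_resched();
    return 0;
}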
noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
{
@@ -403,9 +342,9 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
instrumentation_begin();
/* Tell the tracer that IRET will enable interrupts */
trace_hardirqs_on_prepare();
- lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+ lockdep_hardirqs_on_prepare();
instrumentation_end();
- rcu_irq_exit();
+ ct_irq_exit();
lockdep_hardirqs_on(CALLER_ADDR0);
return;
}
@@ -413,6 +352,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
instrumentation_begin();
if (IS_ENABLED(CONFIG_PREEMPTION))
irqentry_exit_cond_resched();
+
/* Covers both tracing and lockdep */
trace_hardirqs_on();
instrumentation_end();
@@ -422,7 +362,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
* was not watching on entry.
*/
if (state.exit_rcu)
- rcu_irq_exit();
+ ct_irq_exit();
}
}
@@ -435,9 +375,10 @@ irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
__nmi_enter();
lockdep_hardirqs_off(CALLER_ADDR0);
lockdep_hardirq_enter();
- rcu_nmi_enter();
+ ct_nmi_enter();
instrumentation_begin();
+ kmsan_unpoison_entry_regs(regs);
trace_hardirqs_off_finish();
ftrace_nmi_enter();
instrumentation_end();
@@ -451,11 +392,11 @@ void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
ftrace_nmi_exit();
if (irq_state.lockdep) {
trace_hardirqs_on_prepare();
- lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+ lockdep_hardirqs_on_prepare();
}
instrumentation_end();
- rcu_nmi_exit();
+ ct_nmi_exit();
lockdep_hardirq_exit();
if (irq_state.lockdep)
lockdep_hardirqs_on(CALLER_ADDR0);
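
With the renames in place, common.c still exports a matched bracket for architecture interrupt entry code. A sketch of that calling convention follows; my_arch_handle_irq() is hypothetical, while the bracketing itself follows include/linux/entry-common.h:

/*
 * Sketch: how an architecture's interrupt entry is expected to use
 * irqentry_enter()/irqentry_exit(). my_arch_handle_irq() is made up.
 */
static noinstr void my_arch_irq_entry(struct pt_regs *regs)
{
    irqentry_state_t state = irqentry_enter(regs); /* RCU/lockdep bookkeeping */

    instrumentation_begin();        /* instrumentable code starts here */
    my_arch_handle_irq(regs);       /* hypothetical device-IRQ dispatch */
    instrumentation_end();

    irqentry_exit(regs, state);     /* undo in reverse order; may preempt */
}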
diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c
index 49972ee99aff..2e0f75bcb7fd 100644
--- a/kernel/entry/kvm.c
+++ b/kernel/entry/kvm.c
@@ -8,10 +8,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
do {
int ret;
- if (ti_work & _TIF_NOTIFY_SIGNAL)
- tracehook_notify_signal();
-
- if (ti_work & _TIF_SIGPENDING) {
+ if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) {
kvm_handle_signal_exit(vcpu);
return -EINTR;
}
@@ -20,13 +17,13 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
schedule();
if (ti_work & _TIF_NOTIFY_RESUME)
- tracehook_notify_resume(NULL);
+ resume_user_mode_work(NULL);
ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work);
if (ret)
return ret;
- ti_work = READ_ONCE(current_thread_info()->flags);
+ ti_work = read_thread_flags();
} while (ti_work & XFER_TO_GUEST_MODE_WORK || need_resched());
return 0;
}
@@ -43,7 +40,7 @@ int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu)
* disabled in the inner loop before going into guest mode. No need
* to disable interrupts here.
*/
- ti_work = READ_ONCE(current_thread_info()->flags);
+ ti_work = read_thread_flags();
if (!(ti_work & XFER_TO_GUEST_MODE_WORK))
return 0;
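
For context, the function whose flag reads changed above is meant to be called from an architecture's vcpu run loop before each guest entry. A hedged sketch of that convention (my_vcpu_run() and the loop body are illustrative, not copied from any arch):

/*
 * Sketch: xfer_to_guest_mode_handle_work() in a hypothetical vcpu run
 * loop. A non-zero return (e.g. -EINTR for a pending signal) makes
 * KVM_RUN return to userspace.
 */
static int my_vcpu_run(struct kvm_vcpu *vcpu)
{
    int ret;

    for (;;) {
        ret = xfer_to_guest_mode_handle_work(vcpu);
        if (ret)
            return ret;

        /* disable IRQs, enter the guest, handle the exit ... */
    }
}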
diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c
index c240302f56e2..5340c5aa89e7 100644
--- a/kernel/entry/syscall_user_dispatch.c
+++ b/kernel/entry/syscall_user_dispatch.c
@@ -4,6 +4,7 @@
*/
#include <linux/sched.h>
#include <linux/prctl.h>
+#include <linux/ptrace.h>
#include <linux/syscall_user_dispatch.h>
#include <linux/uaccess.h>
#include <linux/signal.h>
@@ -47,14 +48,18 @@ bool syscall_user_dispatch(struct pt_regs *regs)
* access_ok() is performed once, at prctl time, when
* the selector is loaded by userspace.
*/
- if (unlikely(__get_user(state, sd->selector)))
- do_exit(SIGSEGV);
+ if (unlikely(__get_user(state, sd->selector))) {
+ force_exit_sig(SIGSEGV);
+ return true;
+ }
if (likely(state == SYSCALL_DISPATCH_FILTER_ALLOW))
return false;
- if (state != SYSCALL_DISPATCH_FILTER_BLOCK)
- do_exit(SIGSYS);
+ if (state != SYSCALL_DISPATCH_FILTER_BLOCK) {
+ force_exit_sig(SIGSYS);
+ return true;
+ }
}
sd->on_dispatch = true;
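
For reference, this is the userspace feature the fast path above services. A minimal sketch of Syscall User Dispatch (Linux 5.11+); the constants are defined as a fallback in case libc headers lack them, with values taken from include/uapi/linux/prctl.h:

/*
 * Sketch: toggle syscall interception with a one-byte selector. With
 * offset/len of 0/0, every syscall site is subject to the selector.
 */
#define _GNU_SOURCE
#include <signal.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef PR_SET_SYSCALL_USER_DISPATCH
#define PR_SET_SYSCALL_USER_DISPATCH    59
#define PR_SYS_DISPATCH_ON              1
#define SYSCALL_DISPATCH_FILTER_ALLOW   0
#define SYSCALL_DISPATCH_FILTER_BLOCK   1
#endif

static volatile char selector = SYSCALL_DISPATCH_FILTER_ALLOW;

static void sigsys_handler(int sig, siginfo_t *info, void *uc)
{
    (void)sig; (void)info; (void)uc;
    /* blocked syscalls land here as SIGSYS instead of entering the kernel */
    selector = SYSCALL_DISPATCH_FILTER_ALLOW;   /* re-allow before write() */
    write(2, "caught SIGSYS\n", 14);
}

int main(void)
{
    struct sigaction sa = {
        .sa_sigaction = sigsys_handler,
        .sa_flags     = SA_SIGINFO,
    };
    sigaction(SIGSYS, &sa, NULL);

    if (prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON,
              0, 0, &selector))
        return 1;

    selector = SYSCALL_DISPATCH_FILTER_BLOCK;
    syscall(SYS_getpid);            /* trapped and delivered as SIGSYS */
    return 0;
}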
@@ -64,8 +69,9 @@ bool syscall_user_dispatch(struct pt_regs *regs)
return true;
}
-int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
- unsigned long len, char __user *selector)
+static int task_set_syscall_user_dispatch(struct task_struct *task, unsigned long mode,
+ unsigned long offset, unsigned long len,
+ char __user *selector)
{
switch (mode) {
case PR_SYS_DISPATCH_OFF:
@@ -82,7 +88,16 @@ int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
if (offset && offset + len <= offset)
return -EINVAL;
- if (selector && !access_ok(selector, sizeof(*selector)))
+ /*
+ * access_ok() will clear memory tags for tagged addresses
+ * if current has memory tagging enabled.
+ *
+ * To enable a tracer to set a tracee's selector the
+ * selector address must be untagged for access_ok(),
+ * otherwise an untagged tracer will always fail to set a
+ * tagged tracee's selector.
+ */
+ if (selector && !access_ok(untagged_addr(selector), sizeof(*selector)))
return -EFAULT;
break;
@@ -90,15 +105,60 @@ int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
return -EINVAL;
}
- current->syscall_dispatch.selector = selector;
- current->syscall_dispatch.offset = offset;
- current->syscall_dispatch.len = len;
- current->syscall_dispatch.on_dispatch = false;
+ task->syscall_dispatch.selector = selector;
+ task->syscall_dispatch.offset = offset;
+ task->syscall_dispatch.len = len;
+ task->syscall_dispatch.on_dispatch = false;
if (mode == PR_SYS_DISPATCH_ON)
- set_syscall_work(SYSCALL_USER_DISPATCH);
+ set_task_syscall_work(task, SYSCALL_USER_DISPATCH);
+ else
+ clear_task_syscall_work(task, SYSCALL_USER_DISPATCH);
+
+ return 0;
+}
+
+int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
+ unsigned long len, char __user *selector)
+{
+ return task_set_syscall_user_dispatch(current, mode, offset, len, selector);
+}
+
+int syscall_user_dispatch_get_config(struct task_struct *task, unsigned long size,
+ void __user *data)
+{
+ struct syscall_user_dispatch *sd = &task->syscall_dispatch;
+ struct ptrace_sud_config cfg;
+
+ if (size != sizeof(cfg))
+ return -EINVAL;
+
+ if (test_task_syscall_work(task, SYSCALL_USER_DISPATCH))
+ cfg.mode = PR_SYS_DISPATCH_ON;
else
- clear_syscall_work(SYSCALL_USER_DISPATCH);
+ cfg.mode = PR_SYS_DISPATCH_OFF;
+
+ cfg.offset = sd->offset;
+ cfg.len = sd->len;
+ cfg.selector = (__u64)(uintptr_t)sd->selector;
+
+ if (copy_to_user(data, &cfg, sizeof(cfg)))
+ return -EFAULT;
return 0;
}
+
+int syscall_user_dispatch_set_config(struct task_struct *task, unsigned long size,
+ void __user *data)
+{
+ struct ptrace_sud_config cfg;
+
+ if (size != sizeof(cfg))
+ return -EINVAL;
+
+ if (copy_from_user(&cfg, data, sizeof(cfg)))
+ return -EFAULT;
+
+ return task_set_syscall_user_dispatch(task, cfg.mode, cfg.offset, cfg.len,
+ (char __user *)(uintptr_t)cfg.selector);
+}
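
A hedged sketch of the tracer side of the two functions above. The struct layout mirrors the cfg fields used here, but the request names and the 0x4210/0x4211 values are assumptions to check against include/uapi/linux/ptrace.h. Note that the ptrace addr argument carries sizeof(cfg), matching the size check in the kernel code:

/*
 * Fragment for a tracer that already has 'pid' attached and stopped.
 */
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>

#ifndef PTRACE_GET_SYSCALL_USER_DISPATCH_CONFIG
#define PTRACE_GET_SYSCALL_USER_DISPATCH_CONFIG 0x4210 /* assumed value */
#define PTRACE_SET_SYSCALL_USER_DISPATCH_CONFIG 0x4211 /* assumed value */
struct ptrace_sud_config {
    unsigned long long mode;    /* assumed field order; verify in uapi */
    unsigned long long selector;
    unsigned long long offset;
    unsigned long long len;
};
#endif

static int dump_sud_config(pid_t pid)
{
    struct ptrace_sud_config cfg;

    if (ptrace(PTRACE_GET_SYSCALL_USER_DISPATCH_CONFIG, pid,
               (void *)sizeof(cfg), &cfg) == -1)
        return -1;

    printf("mode=%llu selector=%#llx offset=%#llx len=%llu\n",
           cfg.mode, cfg.selector, cfg.offset, cfg.len);
    return 0;
}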