diff options
Diffstat (limited to 'kernel/unwind')
-rw-r--r-- | kernel/unwind/Makefile | 1 | ||||
-rw-r--r-- | kernel/unwind/deferred.c | 362 | ||||
-rw-r--r-- | kernel/unwind/user.c | 128 |
3 files changed, 491 insertions, 0 deletions
diff --git a/kernel/unwind/Makefile b/kernel/unwind/Makefile new file mode 100644 index 000000000000..eae37bea54fd --- /dev/null +++ b/kernel/unwind/Makefile @@ -0,0 +1 @@ + obj-$(CONFIG_UNWIND_USER) += user.o deferred.o diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c new file mode 100644 index 000000000000..dc6040aae3ee --- /dev/null +++ b/kernel/unwind/deferred.c @@ -0,0 +1,362 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Deferred user space unwinding + */ +#include <linux/sched/task_stack.h> +#include <linux/unwind_deferred.h> +#include <linux/sched/clock.h> +#include <linux/task_work.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/sizes.h> +#include <linux/slab.h> +#include <linux/mm.h> + +/* + * For requesting a deferred user space stack trace from NMI context + * the architecture must support a safe cmpxchg in NMI context. + * For those architectures that do not have that, then it cannot ask + * for a deferred user space stack trace from an NMI context. If it + * does, then it will get -EINVAL. + */ +#if defined(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) +# define CAN_USE_IN_NMI 1 +static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt) +{ + u32 old = 0; + + return try_cmpxchg(&info->id.cnt, &old, cnt); +} +#else +# define CAN_USE_IN_NMI 0 +/* When NMIs are not allowed, this always succeeds */ +static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt) +{ + info->id.cnt = cnt; + return true; +} +#endif + +/* Make the cache fit in a 4K page */ +#define UNWIND_MAX_ENTRIES \ + ((SZ_4K - sizeof(struct unwind_cache)) / sizeof(long)) + +/* Guards adding to or removing from the list of callbacks */ +static DEFINE_MUTEX(callback_mutex); +static LIST_HEAD(callbacks); + +#define RESERVED_BITS (UNWIND_PENDING | UNWIND_USED) + +/* Zero'd bits are available for assigning callback users */ +static unsigned long unwind_mask = RESERVED_BITS; +DEFINE_STATIC_SRCU(unwind_srcu); + +static inline bool unwind_pending(struct unwind_task_info *info) +{ + return test_bit(UNWIND_PENDING_BIT, &info->unwind_mask); +} + +/* + * This is a unique percpu identifier for a given task entry context. + * Conceptually, it's incremented every time the CPU enters the kernel from + * user space, so that each "entry context" on the CPU gets a unique ID. In + * reality, as an optimization, it's only incremented on demand for the first + * deferred unwind request after a given entry-from-user. + * + * It's combined with the CPU id to make a systemwide-unique "context cookie". + */ +static DEFINE_PER_CPU(u32, unwind_ctx_ctr); + +/* + * The context cookie is a unique identifier that is assigned to a user + * space stacktrace. As the user space stacktrace remains the same while + * the task is in the kernel, the cookie is an identifier for the stacktrace. + * Although it is possible for the stacktrace to get another cookie if another + * request is made after the cookie was cleared and before reentering user + * space. + */ +static u64 get_cookie(struct unwind_task_info *info) +{ + u32 cnt = 1; + + if (info->id.cpu) + return info->id.id; + + /* LSB is always set to ensure 0 is an invalid value */ + cnt |= __this_cpu_read(unwind_ctx_ctr) + 2; + if (try_assign_cnt(info, cnt)) { + /* Update the per cpu counter */ + __this_cpu_write(unwind_ctx_ctr, cnt); + } + /* Interrupts are disabled, the CPU will always be same */ + info->id.cpu = smp_processor_id() + 1; /* Must be non zero */ + + return info->id.id; +} + +/** + * unwind_user_faultable - Produce a user stacktrace in faultable context + * @trace: The descriptor that will store the user stacktrace + * + * This must be called in a known faultable context (usually when entering + * or exiting user space). Depending on the available implementations + * the @trace will be loaded with the addresses of the user space stacktrace + * if it can be found. + * + * Return: 0 on success and negative on error + * On success @trace will contain the user space stacktrace + */ +int unwind_user_faultable(struct unwind_stacktrace *trace) +{ + struct unwind_task_info *info = ¤t->unwind_info; + struct unwind_cache *cache; + + /* Should always be called from faultable context */ + might_fault(); + + if (!current->mm) + return -EINVAL; + + if (!info->cache) { + info->cache = kzalloc(struct_size(cache, entries, UNWIND_MAX_ENTRIES), + GFP_KERNEL); + if (!info->cache) + return -ENOMEM; + } + + cache = info->cache; + trace->entries = cache->entries; + + if (cache->nr_entries) { + /* + * The user stack has already been previously unwound in this + * entry context. Skip the unwind and use the cache. + */ + trace->nr = cache->nr_entries; + return 0; + } + + trace->nr = 0; + unwind_user(trace, UNWIND_MAX_ENTRIES); + + cache->nr_entries = trace->nr; + + /* Clear nr_entries on way back to user space */ + set_bit(UNWIND_USED_BIT, &info->unwind_mask); + + return 0; +} + +static void process_unwind_deferred(struct task_struct *task) +{ + struct unwind_task_info *info = &task->unwind_info; + struct unwind_stacktrace trace; + struct unwind_work *work; + unsigned long bits; + u64 cookie; + + if (WARN_ON_ONCE(!unwind_pending(info))) + return; + + /* Clear pending bit but make sure to have the current bits */ + bits = atomic_long_fetch_andnot(UNWIND_PENDING, + (atomic_long_t *)&info->unwind_mask); + /* + * From here on out, the callback must always be called, even if it's + * just an empty trace. + */ + trace.nr = 0; + trace.entries = NULL; + + unwind_user_faultable(&trace); + + if (info->cache) + bits &= ~(info->cache->unwind_completed); + + cookie = info->id.id; + + guard(srcu)(&unwind_srcu); + list_for_each_entry_srcu(work, &callbacks, list, + srcu_read_lock_held(&unwind_srcu)) { + if (test_bit(work->bit, &bits)) { + work->func(work, &trace, cookie); + if (info->cache) + info->cache->unwind_completed |= BIT(work->bit); + } + } +} + +static void unwind_deferred_task_work(struct callback_head *head) +{ + process_unwind_deferred(current); +} + +void unwind_deferred_task_exit(struct task_struct *task) +{ + struct unwind_task_info *info = ¤t->unwind_info; + + if (!unwind_pending(info)) + return; + + process_unwind_deferred(task); + + task_work_cancel(task, &info->work); +} + +/** + * unwind_deferred_request - Request a user stacktrace on task kernel exit + * @work: Unwind descriptor requesting the trace + * @cookie: The cookie of the first request made for this task + * + * Schedule a user space unwind to be done in task work before exiting the + * kernel. + * + * The returned @cookie output is the generated cookie of the very first + * request for a user space stacktrace for this task since it entered the + * kernel. It can be from a request by any caller of this infrastructure. + * Its value will also be passed to the callback function. It can be + * used to stitch kernel and user stack traces together in post-processing. + * + * It's valid to call this function multiple times for the same @work within + * the same task entry context. Each call will return the same cookie + * while the task hasn't left the kernel. If the callback is not pending + * because it has already been previously called for the same entry context, + * it will be called again with the same stack trace and cookie. + * + * Return: 0 if the callback successfully was queued. + * 1 if the callback is pending or was already executed. + * Negative if there's an error. + * @cookie holds the cookie of the first request by any user + */ +int unwind_deferred_request(struct unwind_work *work, u64 *cookie) +{ + struct unwind_task_info *info = ¤t->unwind_info; + unsigned long old, bits; + unsigned long bit; + int ret; + + *cookie = 0; + + if ((current->flags & (PF_KTHREAD | PF_EXITING)) || + !user_mode(task_pt_regs(current))) + return -EINVAL; + + /* + * NMI requires having safe cmpxchg operations. + * Trigger a warning to make it obvious that an architecture + * is using this in NMI when it should not be. + */ + if (WARN_ON_ONCE(!CAN_USE_IN_NMI && in_nmi())) + return -EINVAL; + + /* Do not allow cancelled works to request again */ + bit = READ_ONCE(work->bit); + if (WARN_ON_ONCE(bit < 0)) + return -EINVAL; + + /* Only need the mask now */ + bit = BIT(bit); + + guard(irqsave)(); + + *cookie = get_cookie(info); + + old = READ_ONCE(info->unwind_mask); + + /* Is this already queued or executed */ + if (old & bit) + return 1; + + /* + * This work's bit hasn't been set yet. Now set it with the PENDING + * bit and fetch the current value of unwind_mask. If ether the + * work's bit or PENDING was already set, then this is already queued + * to have a callback. + */ + bits = UNWIND_PENDING | bit; + old = atomic_long_fetch_or(bits, (atomic_long_t *)&info->unwind_mask); + if (old & bits) { + /* + * If the work's bit was set, whatever set it had better + * have also set pending and queued a callback. + */ + WARN_ON_ONCE(!(old & UNWIND_PENDING)); + return old & bit; + } + + /* The work has been claimed, now schedule it. */ + ret = task_work_add(current, &info->work, TWA_RESUME); + + if (WARN_ON_ONCE(ret)) + WRITE_ONCE(info->unwind_mask, 0); + + return ret; +} + +void unwind_deferred_cancel(struct unwind_work *work) +{ + struct task_struct *g, *t; + int bit; + + if (!work) + return; + + bit = work->bit; + + /* No work should be using a reserved bit */ + if (WARN_ON_ONCE(BIT(bit) & RESERVED_BITS)) + return; + + guard(mutex)(&callback_mutex); + list_del_rcu(&work->list); + + /* Do not allow any more requests and prevent callbacks */ + work->bit = -1; + + __clear_bit(bit, &unwind_mask); + + synchronize_srcu(&unwind_srcu); + + guard(rcu)(); + /* Clear this bit from all threads */ + for_each_process_thread(g, t) { + clear_bit(bit, &t->unwind_info.unwind_mask); + if (t->unwind_info.cache) + clear_bit(bit, &t->unwind_info.cache->unwind_completed); + } +} + +int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func) +{ + memset(work, 0, sizeof(*work)); + + guard(mutex)(&callback_mutex); + + /* See if there's a bit in the mask available */ + if (unwind_mask == ~0UL) + return -EBUSY; + + work->bit = ffz(unwind_mask); + __set_bit(work->bit, &unwind_mask); + + list_add_rcu(&work->list, &callbacks); + work->func = func; + return 0; +} + +void unwind_task_init(struct task_struct *task) +{ + struct unwind_task_info *info = &task->unwind_info; + + memset(info, 0, sizeof(*info)); + init_task_work(&info->work, unwind_deferred_task_work); + info->unwind_mask = 0; +} + +void unwind_task_free(struct task_struct *task) +{ + struct unwind_task_info *info = &task->unwind_info; + + kfree(info->cache); + task_work_cancel(task, &info->work); +} diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c new file mode 100644 index 000000000000..97a8415e3216 --- /dev/null +++ b/kernel/unwind/user.c @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-2.0 +/* +* Generic interfaces for unwinding user space +*/ +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/sched/task_stack.h> +#include <linux/unwind_user.h> +#include <linux/uaccess.h> + +static const struct unwind_user_frame fp_frame = { + ARCH_INIT_USER_FP_FRAME +}; + +#define for_each_user_frame(state) \ + for (unwind_user_start(state); !(state)->done; unwind_user_next(state)) + +static int unwind_user_next_fp(struct unwind_user_state *state) +{ + const struct unwind_user_frame *frame = &fp_frame; + unsigned long cfa, fp, ra; + unsigned int shift; + + if (frame->use_fp) { + if (state->fp < state->sp) + return -EINVAL; + cfa = state->fp; + } else { + cfa = state->sp; + } + + /* Get the Canonical Frame Address (CFA) */ + cfa += frame->cfa_off; + + /* stack going in wrong direction? */ + if (cfa <= state->sp) + return -EINVAL; + + /* Make sure that the address is word aligned */ + shift = sizeof(long) == 4 ? 2 : 3; + if (cfa & ((1 << shift) - 1)) + return -EINVAL; + + /* Find the Return Address (RA) */ + if (get_user(ra, (unsigned long *)(cfa + frame->ra_off))) + return -EINVAL; + + if (frame->fp_off && get_user(fp, (unsigned long __user *)(cfa + frame->fp_off))) + return -EINVAL; + + state->ip = ra; + state->sp = cfa; + if (frame->fp_off) + state->fp = fp; + return 0; +} + +static int unwind_user_next(struct unwind_user_state *state) +{ + unsigned long iter_mask = state->available_types; + unsigned int bit; + + if (state->done) + return -EINVAL; + + for_each_set_bit(bit, &iter_mask, NR_UNWIND_USER_TYPE_BITS) { + enum unwind_user_type type = BIT(bit); + + state->current_type = type; + switch (type) { + case UNWIND_USER_TYPE_FP: + if (!unwind_user_next_fp(state)) + return 0; + continue; + default: + WARN_ONCE(1, "Undefined unwind bit %d", bit); + break; + } + break; + } + + /* No successful unwind method. */ + state->current_type = UNWIND_USER_TYPE_NONE; + state->done = true; + return -EINVAL; +} + +static int unwind_user_start(struct unwind_user_state *state) +{ + struct pt_regs *regs = task_pt_regs(current); + + memset(state, 0, sizeof(*state)); + + if ((current->flags & PF_KTHREAD) || !user_mode(regs)) { + state->done = true; + return -EINVAL; + } + + if (IS_ENABLED(CONFIG_HAVE_UNWIND_USER_FP)) + state->available_types |= UNWIND_USER_TYPE_FP; + + state->ip = instruction_pointer(regs); + state->sp = user_stack_pointer(regs); + state->fp = frame_pointer(regs); + + return 0; +} + +int unwind_user(struct unwind_stacktrace *trace, unsigned int max_entries) +{ + struct unwind_user_state state; + + trace->nr = 0; + + if (!max_entries) + return -EINVAL; + + if (current->flags & PF_KTHREAD) + return 0; + + for_each_user_frame(&state) { + trace->entries[trace->nr++] = state.ip; + if (trace->nr >= max_entries) + break; + } + + return 0; +} |