Diffstat (limited to 'kernel')
 kernel/bpf/btf.c                 |  20
 kernel/bpf/hashtab.c             |   6
 kernel/bpf/map_in_map.c          |   8
 kernel/bpf/offload.c             |   2
 kernel/bpf/syscall.c             |  14
 kernel/bpf/verifier.c            |  12
 kernel/cgroup/cgroup-v1.c        |   4
 kernel/cgroup/cgroup.c           |  37
 kernel/cgroup/legacy_freezer.c   |   8
 kernel/exit.c                    |   5
 kernel/fork.c                    |  15
 kernel/irq/msi.c                 |   4
 kernel/kexec_file.c              |  14
 kernel/locking/lockdep.c         |  28
 kernel/module/decompress.c       |   2
 kernel/module/main.c             |   4
 kernel/module/stats.c            |   4
 kernel/signal.c                  |   8
 kernel/time/tick-common.c        |  13
 kernel/time/tick-sched.c         |  13
 kernel/trace/bpf_trace.c         |  12
 kernel/trace/fprobe.c            |  73
 kernel/trace/rethook.c           |   4
 kernel/trace/trace.c             |  44
 kernel/trace/trace_events.c      |   2
 kernel/trace/trace_events_hist.c |  39
 kernel/trace/trace_events_user.c | 400
 kernel/trace/trace_osnoise.c     |   2
 kernel/trace/trace_output.c      |   2
 kernel/trace/trace_probe.h       |   2
 kernel/trace/trace_selftest.c    |  10
 kernel/vhost_task.c              |  94
 kernel/workqueue.c               |  13
 33 files changed, 644 insertions(+), 274 deletions(-)
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 6b682b8e4b50..72b32b7cd9cd 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -744,13 +744,12 @@ static bool btf_name_offset_valid(const struct btf *btf, u32 offset)
return offset < btf->hdr.str_len;
}
-static bool __btf_name_char_ok(char c, bool first, bool dot_ok)
+static bool __btf_name_char_ok(char c, bool first)
{
if ((first ? !isalpha(c) :
!isalnum(c)) &&
c != '_' &&
- ((c == '.' && !dot_ok) ||
- c != '.'))
+ c != '.')
return false;
return true;
}
@@ -767,20 +766,20 @@ static const char *btf_str_by_offset(const struct btf *btf, u32 offset)
return NULL;
}
-static bool __btf_name_valid(const struct btf *btf, u32 offset, bool dot_ok)
+static bool __btf_name_valid(const struct btf *btf, u32 offset)
{
/* offset must be valid */
const char *src = btf_str_by_offset(btf, offset);
const char *src_limit;
- if (!__btf_name_char_ok(*src, true, dot_ok))
+ if (!__btf_name_char_ok(*src, true))
return false;
/* set a limit on identifier length */
src_limit = src + KSYM_NAME_LEN;
src++;
while (*src && src < src_limit) {
- if (!__btf_name_char_ok(*src, false, dot_ok))
+ if (!__btf_name_char_ok(*src, false))
return false;
src++;
}
@@ -788,17 +787,14 @@ static bool __btf_name_valid(const struct btf *btf, u32 offset, bool dot_ok)
return !*src;
}
-/* Only C-style identifier is permitted. This can be relaxed if
- * necessary.
- */
static bool btf_name_valid_identifier(const struct btf *btf, u32 offset)
{
- return __btf_name_valid(btf, offset, false);
+ return __btf_name_valid(btf, offset);
}
static bool btf_name_valid_section(const struct btf *btf, u32 offset)
{
- return __btf_name_valid(btf, offset, true);
+ return __btf_name_valid(btf, offset);
}
static const char *__btf_name_by_offset(const struct btf *btf, u32 offset)
@@ -4422,7 +4418,7 @@ static s32 btf_var_check_meta(struct btf_verifier_env *env,
}
if (!t->name_off ||
- !__btf_name_valid(env->btf, t->name_off, true)) {
+ !__btf_name_valid(env->btf, t->name_off)) {
btf_verifier_log_type(env, t, "Invalid name");
return -EINVAL;
}
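
The net effect of the btf.c change is that '.' is now accepted in identifiers as well as section names, so the dot_ok parameter became dead weight. A minimal user-space sketch of the relaxed check (standalone; KSYM_NAME_LEN assumed to be 512 as in current kernels):

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>

#define KSYM_NAME_LEN 512

static bool name_char_ok(char c, bool first)
{
	if ((first ? !isalpha(c) : !isalnum(c)) && c != '_' && c != '.')
		return false;
	return true;
}

static bool name_valid(const char *src)
{
	const char *limit = src + KSYM_NAME_LEN;

	if (!name_char_ok(*src, true))
		return false;
	for (src++; *src && src < limit; src++)
		if (!name_char_ok(*src, false))
			return false;
	return !*src;
}

int main(void)
{
	printf("%d %d %d\n",
	       name_valid("bpf_iter.task"),	/* 1: dot now allowed */
	       name_valid("_tmp"),		/* 1 */
	       name_valid("9lives"));		/* 0: must not start with a digit */
	return 0;
}
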
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 00c253b84bf5..9901efee4339 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1215,7 +1215,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value
ret = htab_lock_bucket(htab, b, hash, &flags);
if (ret)
- return ret;
+ goto err_lock_bucket;
l_old = lookup_elem_raw(head, hash, key, key_size);
@@ -1236,6 +1236,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value
err:
htab_unlock_bucket(htab, b, hash, flags);
+err_lock_bucket:
if (ret)
htab_lru_push_free(htab, l_new);
else if (l_old)
@@ -1338,7 +1339,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
ret = htab_lock_bucket(htab, b, hash, &flags);
if (ret)
- return ret;
+ goto err_lock_bucket;
l_old = lookup_elem_raw(head, hash, key, key_size);
@@ -1361,6 +1362,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
ret = 0;
err:
htab_unlock_bucket(htab, b, hash, flags);
+err_lock_bucket:
if (l_new)
bpf_lru_push_free(&htab->lru, &l_new->lru_node);
return ret;
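
The two hashtab.c hunks fix a leak: returning directly after a failed htab_lock_bucket() skipped the cleanup that frees the preallocated element. A minimal sketch of the unwind pattern, with illustrative stand-in names (try_lock, insert, update_elem are not kernel APIs):

#include <stdio.h>
#include <stdlib.h>

static int try_lock(int fail) { return fail ? -16 /* -EBUSY */ : 0; }
static void unlock(void) { puts("unlock"); }
static void insert(void *elem) { free(elem); /* stand-in: table takes ownership */ }

static int update_elem(int fail_lock)
{
	void *l_new = malloc(16);	/* preallocated element */
	int ret;

	if (!l_new)
		return -12;		/* -ENOMEM */

	ret = try_lock(fail_lock);
	if (ret)
		goto err_lock_bucket;	/* lock never taken: skip unlock */

	/* ... update the bucket using l_new ... */
	unlock();
err_lock_bucket:
	if (ret)
		free(l_new);		/* failure: give the element back */
	else
		insert(l_new);		/* success: table consumed it */
	return ret;
}

int main(void)
{
	printf("%d\n", update_elem(1));	/* -16; "unlock" is not printed */
	printf("%d\n", update_elem(0));	/* 0 */
	return 0;
}
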
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 2c5c64c2a53b..cd5eafaba97e 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -69,9 +69,13 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
/* Misc members not needed in bpf_map_meta_equal() check. */
inner_map_meta->ops = inner_map->ops;
if (inner_map->ops == &array_map_ops) {
+ struct bpf_array *inner_array_meta =
+ container_of(inner_map_meta, struct bpf_array, map);
+ struct bpf_array *inner_array = container_of(inner_map, struct bpf_array, map);
+
+ inner_array_meta->index_mask = inner_array->index_mask;
+ inner_array_meta->elem_size = inner_array->elem_size;
inner_map_meta->bypass_spec_v1 = inner_map->bypass_spec_v1;
- container_of(inner_map_meta, struct bpf_array, map)->index_mask =
- container_of(inner_map, struct bpf_array, map)->index_mask;
}
fdput(f);
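
Besides copying elem_size into the inner map meta (the functional fix), the map_in_map.c hunk replaces chained container_of() expressions with named locals. A self-contained sketch of the container_of() pattern, with simplified stand-in struct names:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct map { int id; };
struct array_map {
	int index_mask;
	int elem_size;
	struct map map;		/* embedded base object */
};

int main(void)
{
	struct array_map src = { .index_mask = 7, .elem_size = 8 };
	struct array_map dst = { 0 };
	struct map *inner = &src.map, *meta = &dst.map;

	/* recover the outer structs from pointers to the embedded member */
	struct array_map *a_meta = container_of(meta, struct array_map, map);
	struct array_map *a_src  = container_of(inner, struct array_map, map);

	a_meta->index_mask = a_src->index_mask;
	a_meta->elem_size  = a_src->elem_size;

	printf("%d %d\n", dst.index_mask, dst.elem_size);	/* 7 8 */
	return 0;
}
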
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index d9c9f45e3529..8a26cd8814c1 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -859,4 +859,4 @@ static int __init bpf_offload_init(void)
return rhashtable_init(&offdevs, &offdevs_params);
}
-late_initcall(bpf_offload_init);
+core_initcall(bpf_offload_init);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 14f39c1e573e..f1c8733f76b8 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2433,6 +2433,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
default:
return -EINVAL;
}
+ case BPF_PROG_TYPE_NETFILTER:
+ if (expected_attach_type == BPF_NETFILTER)
+ return 0;
+ return -EINVAL;
case BPF_PROG_TYPE_SYSCALL:
case BPF_PROG_TYPE_EXT:
if (expected_attach_type)
@@ -3436,6 +3440,11 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
return prog->enforce_expected_attach_type &&
prog->expected_attach_type != attach_type ?
-EINVAL : 0;
+ case BPF_PROG_TYPE_KPROBE:
+ if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI &&
+ attach_type != BPF_TRACE_KPROBE_MULTI)
+ return -EINVAL;
+ return 0;
default:
return 0;
}
@@ -4590,7 +4599,12 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
switch (prog->type) {
case BPF_PROG_TYPE_EXT:
+ break;
case BPF_PROG_TYPE_NETFILTER:
+ if (attr->link_create.attach_type != BPF_NETFILTER) {
+ ret = -EINVAL;
+ goto out;
+ }
break;
case BPF_PROG_TYPE_PERF_EVENT:
case BPF_PROG_TYPE_TRACEPOINT:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index fbcf5a4e2fcd..cf5f230360f5 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3868,6 +3868,9 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
return err;
}
save_register_state(state, spi, reg, size);
+ /* Break the relation on a narrowing spill. */
+ if (fls64(reg->umax_value) > BITS_PER_BYTE * size)
+ state->stack[spi].spilled_ptr.id = 0;
} else if (!reg && !(off % BPF_REG_SIZE) && is_bpf_st_mem(insn) &&
insn->imm != 0 && env->bpf_capable) {
struct bpf_reg_state fake_reg = {};
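
The added check breaks the ID link when a spill is narrower than the register's value range, since the truncated stack slot can no longer be assumed equal to the source register. A sketch of the width test (fls64() modeled with a compiler builtin for a user-space demo):

#include <stdint.h>
#include <stdio.h>

#define BITS_PER_BYTE 8

/* fls64(): 1-based index of the most significant set bit, 0 for 0 */
static int fls64(uint64_t x)
{
	return x ? 64 - __builtin_clzll(x) : 0;
}

static int spill_narrows(uint64_t umax, int size)
{
	return fls64(umax) > BITS_PER_BYTE * size;
}

int main(void)
{
	printf("%d\n", spill_narrows(0xff, 1));	   /* 0: fits one byte */
	printf("%d\n", spill_narrows(0x100, 1));   /* 1: would be truncated */
	printf("%d\n", spill_narrows(0x1ffff, 2)); /* 1 */
	return 0;
}
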
@@ -17033,7 +17036,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH,
insn->dst_reg,
shift);
- insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg,
+ insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
(1ULL << size * 8) - 1);
}
}
@@ -17214,9 +17217,10 @@ static int jit_subprogs(struct bpf_verifier_env *env)
}
/* finally lock prog and jit images for all functions and
- * populate kallsysm
+	 * populate kallsyms. Begin at the first subprogram, since
+ * bpf_prog_load will add the kallsyms for the main program.
*/
- for (i = 0; i < env->subprog_cnt; i++) {
+ for (i = 1; i < env->subprog_cnt; i++) {
bpf_prog_lock_ro(func[i]);
bpf_prog_kallsyms_add(func[i]);
}
@@ -17242,6 +17246,8 @@ static int jit_subprogs(struct bpf_verifier_env *env)
prog->jited = 1;
prog->bpf_func = func[0]->bpf_func;
prog->jited_len = func[0]->jited_len;
+ prog->aux->extable = func[0]->aux->extable;
+ prog->aux->num_exentries = func[0]->aux->num_exentries;
prog->aux->func = func;
prog->aux->func_cnt = env->subprog_cnt;
bpf_prog_jit_attempt_done(prog);
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index aeef06c465ef..5407241dbb45 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -108,7 +108,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
cgroup_lock();
- percpu_down_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_lock(true);
/* all tasks in @from are being moved, all csets are source */
spin_lock_irq(&css_set_lock);
@@ -144,7 +144,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
} while (task && !ret);
out_err:
cgroup_migrate_finish(&mgctx);
- percpu_up_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_unlock(true);
cgroup_unlock();
return ret;
}
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 625d7483951c..4d42f0cbc11e 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1798,7 +1798,7 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
{
struct cgroup *dcgrp = &dst_root->cgrp;
struct cgroup_subsys *ss;
- int ssid, i, ret;
+ int ssid, ret;
u16 dfl_disable_ss_mask = 0;
lockdep_assert_held(&cgroup_mutex);
@@ -1842,7 +1842,8 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
struct cgroup_root *src_root = ss->root;
struct cgroup *scgrp = &src_root->cgrp;
struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
- struct css_set *cset;
+ struct css_set *cset, *cset_pos;
+ struct css_task_iter *it;
WARN_ON(!css || cgroup_css(dcgrp, ss));
@@ -1860,9 +1861,22 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
css->cgroup = dcgrp;
spin_lock_irq(&css_set_lock);
- hash_for_each(css_set_table, i, cset, hlist)
+ WARN_ON(!list_empty(&dcgrp->e_csets[ss->id]));
+ list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id],
+ e_cset_node[ss->id]) {
list_move_tail(&cset->e_cset_node[ss->id],
&dcgrp->e_csets[ss->id]);
+		/*
+		 * All css_sets of scgrp are moved to dcgrp together and in
+		 * the same order, so patch in-flight iterators to keep the
+		 * iteration correct. An iterator is always advanced right
+		 * away and finishes when it->cset_pos meets it->cset_head,
+		 * so updating it->cset_head is enough here.
+		 */
+ list_for_each_entry(it, &cset->task_iters, iters_node)
+ if (it->cset_head == &scgrp->e_csets[ss->id])
+ it->cset_head = &dcgrp->e_csets[ss->id];
+ }
spin_unlock_irq(&css_set_lock);
if (ss->css_rstat_flush) {
@@ -6486,19 +6500,18 @@ err:
static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
+ struct cgroup *cgrp = kargs->cgrp;
+ struct css_set *cset = kargs->cset;
+
cgroup_threadgroup_change_end(current);
- if (kargs->flags & CLONE_INTO_CGROUP) {
- struct cgroup *cgrp = kargs->cgrp;
- struct css_set *cset = kargs->cset;
+ if (cset) {
+ put_css_set(cset);
+ kargs->cset = NULL;
+ }
+ if (kargs->flags & CLONE_INTO_CGROUP) {
cgroup_unlock();
-
- if (cset) {
- put_css_set(cset);
- kargs->cset = NULL;
- }
-
if (cgrp) {
cgroup_put(cgrp);
kargs->cgrp = NULL;
diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c
index 936473203a6b..122dacb3a443 100644
--- a/kernel/cgroup/legacy_freezer.c
+++ b/kernel/cgroup/legacy_freezer.c
@@ -108,16 +108,18 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
struct freezer *freezer = css_freezer(css);
struct freezer *parent = parent_freezer(freezer);
+ cpus_read_lock();
mutex_lock(&freezer_mutex);
freezer->state |= CGROUP_FREEZER_ONLINE;
if (parent && (parent->state & CGROUP_FREEZING)) {
freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN;
- static_branch_inc(&freezer_active);
+ static_branch_inc_cpuslocked(&freezer_active);
}
mutex_unlock(&freezer_mutex);
+ cpus_read_unlock();
return 0;
}
@@ -132,14 +134,16 @@ static void freezer_css_offline(struct cgroup_subsys_state *css)
{
struct freezer *freezer = css_freezer(css);
+ cpus_read_lock();
mutex_lock(&freezer_mutex);
if (freezer->state & CGROUP_FREEZING)
- static_branch_dec(&freezer_active);
+ static_branch_dec_cpuslocked(&freezer_active);
freezer->state = 0;
mutex_unlock(&freezer_mutex);
+ cpus_read_unlock();
}
static void freezer_css_free(struct cgroup_subsys_state *css)
diff --git a/kernel/exit.c b/kernel/exit.c
index 34b90e2e7cf7..edb50b4c9972 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -411,7 +411,10 @@ static void coredump_task_exit(struct task_struct *tsk)
tsk->flags |= PF_POSTCOREDUMP;
core_state = tsk->signal->core_state;
spin_unlock_irq(&tsk->sighand->siglock);
- if (core_state) {
+
+	/* The vhost_worker does not participate in coredumps */
+ if (core_state &&
+ ((tsk->flags & (PF_IO_WORKER | PF_USER_WORKER)) != PF_USER_WORKER)) {
struct core_thread self;
self.task = current;
diff --git a/kernel/fork.c b/kernel/fork.c
index ed4e01daccaa..41c964104b58 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -627,6 +627,7 @@ void free_task(struct task_struct *tsk)
arch_release_task_struct(tsk);
if (tsk->flags & PF_KTHREAD)
free_kthread_struct(tsk);
+ bpf_task_storage_free(tsk);
free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);
@@ -979,7 +980,6 @@ void __put_task_struct(struct task_struct *tsk)
cgroup_free(tsk);
task_numa_free(tsk, true);
security_task_free(tsk);
- bpf_task_storage_free(tsk);
exit_creds(tsk);
delayacct_tsk_free(tsk);
put_signal_struct(tsk->signal);
@@ -2336,16 +2336,16 @@ __latent_entropy struct task_struct *copy_process(
p->flags &= ~PF_KTHREAD;
if (args->kthread)
p->flags |= PF_KTHREAD;
- if (args->user_worker)
- p->flags |= PF_USER_WORKER;
- if (args->io_thread) {
+ if (args->user_worker) {
/*
- * Mark us an IO worker, and block any signal that isn't
+ * Mark us a user worker, and block any signal that isn't
* fatal or STOP
*/
- p->flags |= PF_IO_WORKER;
+ p->flags |= PF_USER_WORKER;
siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
}
+ if (args->io_thread)
+ p->flags |= PF_IO_WORKER;
if (args->name)
strscpy_pad(p->comm, args->name, sizeof(p->comm));
@@ -2517,9 +2517,6 @@ __latent_entropy struct task_struct *copy_process(
if (retval)
goto bad_fork_cleanup_io;
- if (args->ignore_signals)
- ignore_signals(p);
-
stackleak_task_init(p);
if (pid != &init_struct_pid) {
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 7a97bcb086bf..b4c31a5c1147 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -542,7 +542,7 @@ fail:
return ret;
}
-#ifdef CONFIG_PCI_MSI_ARCH_FALLBACKS
+#if defined(CONFIG_PCI_MSI_ARCH_FALLBACKS) || defined(CONFIG_PCI_XEN)
/**
* msi_device_populate_sysfs - Populate msi_irqs sysfs entries for a device
* @dev: The device (PCI, platform etc) which will get sysfs entries
@@ -574,7 +574,7 @@ void msi_device_destroy_sysfs(struct device *dev)
msi_for_each_desc(desc, dev, MSI_DESC_ALL)
msi_sysfs_remove_desc(dev, desc);
}
-#endif /* CONFIG_PCI_MSI_ARCH_FALLBACK */
+#endif /* CONFIG_PCI_MSI_ARCH_FALLBACKS || CONFIG_PCI_XEN */
#else /* CONFIG_SYSFS */
static inline int msi_sysfs_create_group(struct device *dev) { return 0; }
static inline int msi_sysfs_populate_desc(struct device *dev, struct msi_desc *desc) { return 0; }
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index f989f5f1933b..69ee4a29136f 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -901,10 +901,22 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi,
}
offset = ALIGN(offset, align);
+
+ /*
+	 * Check if the segment contains the entry point. If so,
+	 * calculate the value of image->start based on it.
+	 * If the compiler has produced more than one .text section
+	 * (e.g. .text.hot), they are generally after the main .text
+ * section, and they shall not be used to calculate
+ * image->start. So do not re-calculate image->start if it
+ * is not set to the initial value, and warn the user so they
+ * have a chance to fix their purgatory's linker script.
+ */
if (sechdrs[i].sh_flags & SHF_EXECINSTR &&
pi->ehdr->e_entry >= sechdrs[i].sh_addr &&
pi->ehdr->e_entry < (sechdrs[i].sh_addr
- + sechdrs[i].sh_size)) {
+ + sechdrs[i].sh_size) &&
+ !WARN_ON(kbuf->image->start != pi->ehdr->e_entry)) {
kbuf->image->start -= sechdrs[i].sh_addr;
kbuf->image->start += kbuf->mem + offset;
}
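
The kexec_file change only recalculates image->start once and warns when a second executable section also covers e_entry. The relocation itself keeps the entry's offset within its section while the section moves from its link-time address (sh_addr) to its load address (mem + offset). A worked sketch of that arithmetic, with made-up addresses:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t e_entry = 0x401010;	/* link-time entry point */
	uint64_t sh_addr = 0x401000;	/* link-time section start */
	uint64_t mem     = 0x80000000;	/* where the segment is loaded */
	uint64_t offset  = 0x200;	/* section offset within the segment */

	uint64_t start = e_entry;
	start -= sh_addr;		/* entry offset inside the section */
	start += mem + offset;		/* rebase onto the load address */

	printf("start = %#lx\n", (unsigned long)start);	/* 0x80000210 */
	return 0;
}
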
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index dcd1d5bfc1e0..4dfd2f3e09b2 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -2263,6 +2263,9 @@ static inline bool usage_match(struct lock_list *entry, void *mask)
static inline bool usage_skip(struct lock_list *entry, void *mask)
{
+ if (entry->class->lock_type == LD_LOCK_NORMAL)
+ return false;
+
/*
* Skip local_lock() for irq inversion detection.
*
@@ -2289,14 +2292,16 @@ static inline bool usage_skip(struct lock_list *entry, void *mask)
* As a result, we will skip local_lock(), when we search for irq
* inversion bugs.
*/
- if (entry->class->lock_type == LD_LOCK_PERCPU) {
- if (DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG))
- return false;
+ if (entry->class->lock_type == LD_LOCK_PERCPU &&
+ DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG))
+ return false;
- return true;
- }
-	return false;
+	/*
+	 * Skip WAIT_OVERRIDE for irq inversion detection -- it's not actually
+	 * a lock and only used to override the wait_type.
+	 */
+	return true;
}
/*
@@ -4768,7 +4773,8 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next)
for (; depth < curr->lockdep_depth; depth++) {
struct held_lock *prev = curr->held_locks + depth;
- u8 prev_inner = hlock_class(prev)->wait_type_inner;
+ struct lock_class *class = hlock_class(prev);
+ u8 prev_inner = class->wait_type_inner;
if (prev_inner) {
/*
@@ -4778,6 +4784,14 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next)
* Also due to trylocks.
*/
curr_inner = min(curr_inner, prev_inner);
+
+ /*
+ * Allow override for annotations -- this is typically
+ * only valid/needed for code that only exists when
+ * CONFIG_PREEMPT_RT=n.
+ */
+ if (unlikely(class->lock_type == LD_LOCK_WAIT_OVERRIDE))
+ curr_inner = prev_inner;
}
}
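
A sketch of the wait-context folding in check_wait_context(): the allowed wait type is the minimum over held locks, except that a WAIT_OVERRIDE annotation resets it to the override's own type. Enum values here are illustrative, not the kernel's exact numbers:

#include <stdio.h>

enum { LD_LOCK_NORMAL, LD_LOCK_WAIT_OVERRIDE };
enum { LD_WAIT_SPIN = 1, LD_WAIT_CONFIG = 2, LD_WAIT_SLEEP = 4 };

struct held { int lock_type; int wait_type_inner; };

static int fold_wait_context(const struct held *locks, int n)
{
	int curr_inner = LD_WAIT_SLEEP;	/* most permissive to start */

	for (int i = 0; i < n; i++) {
		int prev_inner = locks[i].wait_type_inner;

		if (!prev_inner)
			continue;
		curr_inner = curr_inner < prev_inner ? curr_inner : prev_inner;
		if (locks[i].lock_type == LD_LOCK_WAIT_OVERRIDE)
			curr_inner = prev_inner;	/* annotation wins */
	}
	return curr_inner;
}

int main(void)
{
	struct held stack[] = {
		{ LD_LOCK_NORMAL,        LD_WAIT_SPIN },
		{ LD_LOCK_WAIT_OVERRIDE, LD_WAIT_SLEEP },
	};

	printf("%d\n", fold_wait_context(stack, 2));	/* 4: override wins */
	return 0;
}
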
diff --git a/kernel/module/decompress.c b/kernel/module/decompress.c
index e97232b125eb..8a5d6d63b06c 100644
--- a/kernel/module/decompress.c
+++ b/kernel/module/decompress.c
@@ -257,7 +257,7 @@ static ssize_t module_zstd_decompress(struct load_info *info,
do {
struct page *page = module_get_next_page(info);
- if (!IS_ERR(page)) {
+ if (IS_ERR(page)) {
retval = PTR_ERR(page);
goto out;
}
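
The one-character change above fixes an inverted error test: the old code treated every successful module_get_next_page() as a failure and only proceeded with error pointers. A user-space sketch of the ERR_PTR encoding behind IS_ERR() (macros re-derived here for the demo; MAX_ERRNO is 4095 as in the kernel):

#include <stdio.h>

#define MAX_ERRNO	4095
#define ERR_PTR(err)	((void *)(long)(err))
#define PTR_ERR(ptr)	((long)(ptr))
#define IS_ERR(ptr)	((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

static void *get_next_page(int fail)
{
	static char page[4096];

	return fail ? ERR_PTR(-12 /* -ENOMEM */) : page;
}

int main(void)
{
	void *page = get_next_page(1);

	if (IS_ERR(page))	/* the fixed test: bail out on error */
		printf("error %ld\n", PTR_ERR(page));	/* error -12 */
	else
		printf("got page %p\n", page);
	return 0;
}
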
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 044aa2c9e3cb..4e2cf784cf8c 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -1521,14 +1521,14 @@ static void __layout_sections(struct module *mod, struct load_info *info, bool i
MOD_RODATA,
MOD_RO_AFTER_INIT,
MOD_DATA,
- MOD_INVALID, /* This is needed to match the masks array */
+ MOD_DATA,
};
static const int init_m_to_mem_type[] = {
MOD_INIT_TEXT,
MOD_INIT_RODATA,
MOD_INVALID,
MOD_INIT_DATA,
- MOD_INVALID, /* This is needed to match the masks array */
+ MOD_INIT_DATA,
};
for (m = 0; m < ARRAY_SIZE(masks); ++m) {
diff --git a/kernel/module/stats.c b/kernel/module/stats.c
index ad7b6ada29f2..6ab2c94d6bc3 100644
--- a/kernel/module/stats.c
+++ b/kernel/module/stats.c
@@ -276,6 +276,7 @@ static ssize_t read_file_mod_stats(struct file *file, char __user *user_buf,
struct mod_fail_load *mod_fail;
unsigned int len, size, count_failed = 0;
char *buf;
+ int ret;
u32 live_mod_count, fkreads, fdecompress, fbecoming, floads;
unsigned long total_size, text_size, ikread_bytes, ibecoming_bytes,
idecompress_bytes, imod_bytes, total_virtual_lost;
@@ -390,8 +391,9 @@ static ssize_t read_file_mod_stats(struct file *file, char __user *user_buf,
out_unlock:
mutex_unlock(&module_mutex);
out:
+ ret = simple_read_from_buffer(user_buf, count, ppos, buf, len);
kfree(buf);
- return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+ return ret;
}
#undef MAX_PREAMBLE
#undef MAX_FAILED_MOD_PRINT
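
The stats.c change fixes a use-after-free: buf was freed before simple_read_from_buffer() consumed it, so the return value must be captured in a local first. A sketch of the capture-before-free pattern, with plain malloc/free standing in for the kernel allocator:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static long copy_out(char *dst, size_t cap, const char *src, size_t len)
{
	size_t n = len < cap ? len : cap;

	memcpy(dst, src, n);	/* must run while src is still alive */
	return (long)n;
}

int main(void)
{
	char user_buf[64];
	char *buf = strdup("module stats\n");
	long ret;

	ret = copy_out(user_buf, sizeof(user_buf), buf, strlen(buf));
	free(buf);		/* only now is it safe to free */
	printf("copied %ld bytes\n", ret);
	return 0;
}
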
diff --git a/kernel/signal.c b/kernel/signal.c
index 8f6330f0e9ca..2547fa73bde5 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1368,7 +1368,9 @@ int zap_other_threads(struct task_struct *p)
while_each_thread(p, t) {
task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
- count++;
+ /* Don't require de_thread to wait for the vhost_worker */
+ if ((t->flags & (PF_IO_WORKER | PF_USER_WORKER)) != PF_USER_WORKER)
+ count++;
/* Don't bother with already dead threads */
if (t->exit_state)
@@ -2861,11 +2863,11 @@ relock:
}
/*
- * PF_IO_WORKER threads will catch and exit on fatal signals
+ * PF_USER_WORKER threads will catch and exit on fatal signals
* themselves. They have cleanup that must be performed, so
* we cannot call do_exit() on their behalf.
*/
- if (current->flags & PF_IO_WORKER)
+ if (current->flags & PF_USER_WORKER)
goto out;
/*
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 65b8658da829..e9138cd7a0f5 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -218,19 +218,8 @@ static void tick_setup_device(struct tick_device *td,
* this cpu:
*/
if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
- ktime_t next_p;
- u32 rem;
-
tick_do_timer_cpu = cpu;
-
- next_p = ktime_get();
- div_u64_rem(next_p, TICK_NSEC, &rem);
- if (rem) {
- next_p -= rem;
- next_p += TICK_NSEC;
- }
-
- tick_next_period = next_p;
+ tick_next_period = ktime_get();
#ifdef CONFIG_NO_HZ_FULL
/*
* The boot CPU may be nohz_full, in which case set
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 52254679ec48..42c0be3080bd 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -161,8 +161,19 @@ static ktime_t tick_init_jiffy_update(void)
raw_spin_lock(&jiffies_lock);
write_seqcount_begin(&jiffies_seq);
/* Did we start the jiffies update yet ? */
- if (last_jiffies_update == 0)
+ if (last_jiffies_update == 0) {
+ u32 rem;
+
+ /*
+ * Ensure that the tick is aligned to a multiple of
+ * TICK_NSEC.
+ */
+ div_u64_rem(tick_next_period, TICK_NSEC, &rem);
+ if (rem)
+ tick_next_period += TICK_NSEC - rem;
+
last_jiffies_update = tick_next_period;
+ }
period = last_jiffies_update;
write_seqcount_end(&jiffies_seq);
raw_spin_unlock(&jiffies_lock);
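
Taken together, the two timer hunks move the TICK_NSEC alignment from tick_setup_device() into tick_init_jiffy_update(), so it happens exactly once, under jiffies_lock. A sketch of the round-up arithmetic (the TICK_NSEC value assumes HZ=250; div_u64_rem() is modeled with the '%' operator):

#include <stdint.h>
#include <stdio.h>

#define TICK_NSEC 4000000ULL	/* assumes HZ=250 */

static uint64_t align_to_tick(uint64_t t)
{
	uint64_t rem = t % TICK_NSEC;	/* div_u64_rem() equivalent */

	if (rem)
		t += TICK_NSEC - rem;	/* round up to the next tick edge */
	return t;
}

int main(void)
{
	printf("%llu\n", (unsigned long long)align_to_tick(4000001)); /* 8000000 */
	printf("%llu\n", (unsigned long long)align_to_tick(8000000)); /* unchanged */
	return 0;
}
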
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 9a050e36dc6c..1f4b07da327a 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -900,13 +900,23 @@ static const struct bpf_func_proto bpf_send_signal_thread_proto = {
BPF_CALL_3(bpf_d_path, struct path *, path, char *, buf, u32, sz)
{
+ struct path copy;
long len;
char *p;
if (!sz)
return 0;
- p = d_path(path, buf, sz);
+ /*
+ * The path pointer is verified as trusted and safe to use,
+ * but let's double check it's valid anyway to workaround
+ * potentially broken verifier.
+ */
+ len = copy_from_kernel_nofault(&copy, path, sizeof(*path));
+ if (len < 0)
+ return len;
+
+ p = d_path(&copy, buf, sz);
if (IS_ERR(p)) {
len = PTR_ERR(p);
} else {
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 9abb3905bc8e..18d36842faf5 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -17,36 +17,30 @@
struct fprobe_rethook_node {
struct rethook_node node;
unsigned long entry_ip;
+ unsigned long entry_parent_ip;
char data[];
};
-static void fprobe_handler(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *ops, struct ftrace_regs *fregs)
+static inline void __fprobe_handler(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
struct fprobe_rethook_node *fpr;
struct rethook_node *rh = NULL;
struct fprobe *fp;
void *entry_data = NULL;
- int bit, ret;
+ int ret = 0;
fp = container_of(ops, struct fprobe, ops);
- if (fprobe_disabled(fp))
- return;
-
- bit = ftrace_test_recursion_trylock(ip, parent_ip);
- if (bit < 0) {
- fp->nmissed++;
- return;
- }
if (fp->exit_handler) {
rh = rethook_try_get(fp->rethook);
if (!rh) {
fp->nmissed++;
- goto out;
+ return;
}
fpr = container_of(rh, struct fprobe_rethook_node, node);
fpr->entry_ip = ip;
+ fpr->entry_parent_ip = parent_ip;
if (fp->entry_data_size)
entry_data = fpr->data;
}
@@ -61,23 +55,60 @@ static void fprobe_handler(unsigned long ip, unsigned long parent_ip,
else
rethook_hook(rh, ftrace_get_regs(fregs), true);
}
-out:
+}
+
+static void fprobe_handler(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct ftrace_regs *fregs)
+{
+ struct fprobe *fp;
+ int bit;
+
+ fp = container_of(ops, struct fprobe, ops);
+ if (fprobe_disabled(fp))
+ return;
+
+ /* recursion detection has to go before any traceable function and
+ * all functions before this point should be marked as notrace
+ */
+ bit = ftrace_test_recursion_trylock(ip, parent_ip);
+ if (bit < 0) {
+ fp->nmissed++;
+ return;
+ }
+ __fprobe_handler(ip, parent_ip, ops, fregs);
ftrace_test_recursion_unlock(bit);
}
NOKPROBE_SYMBOL(fprobe_handler);
static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
- struct fprobe *fp = container_of(ops, struct fprobe, ops);
+ struct fprobe *fp;
+ int bit;
+
+ fp = container_of(ops, struct fprobe, ops);
+ if (fprobe_disabled(fp))
+ return;
+
+ /* recursion detection has to go before any traceable function and
+ * all functions called before this point should be marked as notrace
+ */
+ bit = ftrace_test_recursion_trylock(ip, parent_ip);
+ if (bit < 0) {
+ fp->nmissed++;
+ return;
+ }
if (unlikely(kprobe_running())) {
fp->nmissed++;
return;
}
+
kprobe_busy_begin();
- fprobe_handler(ip, parent_ip, ops, fregs);
+ __fprobe_handler(ip, parent_ip, ops, fregs);
kprobe_busy_end();
+ ftrace_test_recursion_unlock(bit);
}
static void fprobe_exit_handler(struct rethook_node *rh, void *data,
@@ -85,14 +116,26 @@ static void fprobe_exit_handler(struct rethook_node *rh, void *data,
{
struct fprobe *fp = (struct fprobe *)data;
struct fprobe_rethook_node *fpr;
+ int bit;
if (!fp || fprobe_disabled(fp))
return;
fpr = container_of(rh, struct fprobe_rethook_node, node);
+	/*
+	 * We need to ensure there are no calls to traceable functions
+	 * between the end of fprobe_handler and the beginning of
+	 * fprobe_exit_handler.
+	 */
+ bit = ftrace_test_recursion_trylock(fpr->entry_ip, fpr->entry_parent_ip);
+ if (bit < 0) {
+ fp->nmissed++;
+ return;
+ }
+
fp->exit_handler(fp, fpr->entry_ip, regs,
fp->entry_data_size ? (void *)fpr->data : NULL);
+ ftrace_test_recursion_unlock(bit);
}
NOKPROBE_SYMBOL(fprobe_exit_handler);
diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c
index 32c3dfdb4d6a..60f6cb2b486b 100644
--- a/kernel/trace/rethook.c
+++ b/kernel/trace/rethook.c
@@ -288,7 +288,7 @@ unsigned long rethook_trampoline_handler(struct pt_regs *regs,
* These loops must be protected from rethook_free_rcu() because those
* are accessing 'rhn->rethook'.
*/
- preempt_disable();
+ preempt_disable_notrace();
/*
* Run the handler on the shadow stack. Do not unlink the list here because
@@ -321,7 +321,7 @@ unsigned long rethook_trampoline_handler(struct pt_regs *regs,
first = first->next;
rethook_recycle(rhn);
}
- preempt_enable();
+ preempt_enable_notrace();
return correct_ret_addr;
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ebc59781456a..64a4dde073ef 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -60,6 +60,7 @@
*/
bool ring_buffer_expanded;
+#ifdef CONFIG_FTRACE_STARTUP_TEST
/*
* We need to change this state when a selftest is running.
* A selftest will lurk into the ring-buffer to count the
@@ -75,7 +76,6 @@ static bool __read_mostly tracing_selftest_running;
*/
bool __read_mostly tracing_selftest_disabled;
-#ifdef CONFIG_FTRACE_STARTUP_TEST
void __init disable_tracing_selftest(const char *reason)
{
if (!tracing_selftest_disabled) {
@@ -83,6 +83,9 @@ void __init disable_tracing_selftest(const char *reason)
pr_info("Ftrace startup test is disabled due to %s\n", reason);
}
}
+#else
+#define tracing_selftest_running 0
+#define tracing_selftest_disabled 0
#endif
/* Pipe tracepoints to printk */
@@ -1051,7 +1054,10 @@ int __trace_array_puts(struct trace_array *tr, unsigned long ip,
if (!(tr->trace_flags & TRACE_ITER_PRINTK))
return 0;
- if (unlikely(tracing_selftest_running || tracing_disabled))
+ if (unlikely(tracing_selftest_running && tr == &global_trace))
+ return 0;
+
+ if (unlikely(tracing_disabled))
return 0;
alloc = sizeof(*entry) + size + 2; /* possible \n added */
@@ -2041,6 +2047,24 @@ static int run_tracer_selftest(struct tracer *type)
return 0;
}
+static int do_run_tracer_selftest(struct tracer *type)
+{
+ int ret;
+
+ /*
+ * Tests can take a long time, especially if they are run one after the
+ * other, as does happen during bootup when all the tracers are
+ * registered. This could cause the soft lockup watchdog to trigger.
+ */
+ cond_resched();
+
+ tracing_selftest_running = true;
+ ret = run_tracer_selftest(type);
+ tracing_selftest_running = false;
+
+ return ret;
+}
+
static __init int init_trace_selftests(void)
{
struct trace_selftests *p, *n;
@@ -2092,6 +2116,10 @@ static inline int run_tracer_selftest(struct tracer *type)
{
return 0;
}
+static inline int do_run_tracer_selftest(struct tracer *type)
+{
+ return 0;
+}
#endif /* CONFIG_FTRACE_STARTUP_TEST */
static void add_tracer_options(struct trace_array *tr, struct tracer *t);
@@ -2127,8 +2155,6 @@ int __init register_tracer(struct tracer *type)
mutex_lock(&trace_types_lock);
- tracing_selftest_running = true;
-
for (t = trace_types; t; t = t->next) {
if (strcmp(type->name, t->name) == 0) {
/* already found */
@@ -2157,7 +2183,7 @@ int __init register_tracer(struct tracer *type)
/* store the tracer for __set_tracer_option */
type->flags->trace = type;
- ret = run_tracer_selftest(type);
+ ret = do_run_tracer_selftest(type);
if (ret < 0)
goto out;
@@ -2166,7 +2192,6 @@ int __init register_tracer(struct tracer *type)
add_tracer_options(&global_trace, type);
out:
- tracing_selftest_running = false;
mutex_unlock(&trace_types_lock);
if (ret || !default_bootup_tracer)
@@ -3490,7 +3515,7 @@ __trace_array_vprintk(struct trace_buffer *buffer,
unsigned int trace_ctx;
char *tbuffer;
- if (tracing_disabled || tracing_selftest_running)
+ if (tracing_disabled)
return 0;
/* Don't pollute graph traces with trace_vprintk internals */
@@ -3538,6 +3563,9 @@ __printf(3, 0)
int trace_array_vprintk(struct trace_array *tr,
unsigned long ip, const char *fmt, va_list args)
{
+ if (tracing_selftest_running && tr == &global_trace)
+ return 0;
+
return __trace_array_vprintk(tr->array_buffer.buffer, ip, fmt, args);
}
@@ -5752,7 +5780,7 @@ static const char readme_msg[] =
"\t table using the key(s) and value(s) named, and the value of a\n"
"\t sum called 'hitcount' is incremented. Keys and values\n"
"\t correspond to fields in the event's format description. Keys\n"
- "\t can be any field, or the special string 'stacktrace'.\n"
+ "\t can be any field, or the special string 'common_stacktrace'.\n"
"\t Compound keys consisting of up to two fields can be specified\n"
"\t by the 'keys' keyword. Values must correspond to numeric\n"
"\t fields. Sort keys consisting of up to two fields can be\n"
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 654ffa40457a..57e539d47989 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -194,6 +194,8 @@ static int trace_define_generic_fields(void)
__generic_field(int, common_cpu, FILTER_CPU);
__generic_field(char *, COMM, FILTER_COMM);
__generic_field(char *, comm, FILTER_COMM);
+ __generic_field(char *, stacktrace, FILTER_STACKTRACE);
+ __generic_field(char *, STACKTRACE, FILTER_STACKTRACE);
return ret;
}
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 486cca3c2b75..b97d3ad832f1 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -1364,7 +1364,7 @@ static const char *hist_field_name(struct hist_field *field,
if (field->field)
field_name = field->field->name;
else
- field_name = "stacktrace";
+ field_name = "common_stacktrace";
} else if (field->flags & HIST_FIELD_FL_HITCOUNT)
field_name = "hitcount";
@@ -2367,7 +2367,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
hist_data->enable_timestamps = true;
if (*flags & HIST_FIELD_FL_TIMESTAMP_USECS)
hist_data->attrs->ts_in_usecs = true;
- } else if (strcmp(field_name, "stacktrace") == 0) {
+ } else if (strcmp(field_name, "common_stacktrace") == 0) {
*flags |= HIST_FIELD_FL_STACKTRACE;
} else if (strcmp(field_name, "common_cpu") == 0)
*flags |= HIST_FIELD_FL_CPU;
@@ -2378,11 +2378,15 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
if (!field || !field->size) {
/*
* For backward compatibility, if field_name
- * was "cpu", then we treat this the same as
- * common_cpu. This also works for "CPU".
+ * was "cpu" or "stacktrace", then we treat this
+ * the same as common_cpu and common_stacktrace
+ * respectively. This also works for "CPU", and
+ * "STACKTRACE".
*/
if (field && field->filter_type == FILTER_CPU) {
*flags |= HIST_FIELD_FL_CPU;
+ } else if (field && field->filter_type == FILTER_STACKTRACE) {
+ *flags |= HIST_FIELD_FL_STACKTRACE;
} else {
hist_err(tr, HIST_ERR_FIELD_NOT_FOUND,
errpos(field_name));
@@ -4238,13 +4242,19 @@ static int __create_val_field(struct hist_trigger_data *hist_data,
goto out;
}
- /* Some types cannot be a value */
- if (hist_field->flags & (HIST_FIELD_FL_GRAPH | HIST_FIELD_FL_PERCENT |
- HIST_FIELD_FL_BUCKET | HIST_FIELD_FL_LOG2 |
- HIST_FIELD_FL_SYM | HIST_FIELD_FL_SYM_OFFSET |
- HIST_FIELD_FL_SYSCALL | HIST_FIELD_FL_STACKTRACE)) {
- hist_err(file->tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(field_str));
- ret = -EINVAL;
+ /* values and variables should not have some modifiers */
+ if (hist_field->flags & HIST_FIELD_FL_VAR) {
+ /* Variable */
+ if (hist_field->flags & (HIST_FIELD_FL_GRAPH | HIST_FIELD_FL_PERCENT |
+ HIST_FIELD_FL_BUCKET | HIST_FIELD_FL_LOG2))
+ goto err;
+ } else {
+ /* Value */
+ if (hist_field->flags & (HIST_FIELD_FL_GRAPH | HIST_FIELD_FL_PERCENT |
+ HIST_FIELD_FL_BUCKET | HIST_FIELD_FL_LOG2 |
+ HIST_FIELD_FL_SYM | HIST_FIELD_FL_SYM_OFFSET |
+ HIST_FIELD_FL_SYSCALL | HIST_FIELD_FL_STACKTRACE))
+ goto err;
}
hist_data->fields[val_idx] = hist_field;
@@ -4256,6 +4266,9 @@ static int __create_val_field(struct hist_trigger_data *hist_data,
ret = -EINVAL;
out:
return ret;
+ err:
+ hist_err(file->tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(field_str));
+ return -EINVAL;
}
static int create_val_field(struct hist_trigger_data *hist_data,
@@ -5385,7 +5398,7 @@ static void hist_trigger_print_key(struct seq_file *m,
if (key_field->field)
seq_printf(m, "%s.stacktrace", key_field->field->name);
else
- seq_puts(m, "stacktrace:\n");
+ seq_puts(m, "common_stacktrace:\n");
hist_trigger_stacktrace_print(m,
key + key_field->offset,
HIST_STACKTRACE_DEPTH);
@@ -5968,7 +5981,7 @@ static int event_hist_trigger_print(struct seq_file *m,
if (field->field)
seq_printf(m, "%s.stacktrace", field->field->name);
else
- seq_puts(m, "stacktrace");
+ seq_puts(m, "common_stacktrace");
} else
hist_field_print(m, field);
}
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index b1ecd7677642..8df0550415e7 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -50,6 +50,18 @@
#define EVENT_STATUS_OTHER BIT(7)
/*
+ * User register flags are not allowed yet; keep them here until we are
+ * ready to expose them to the user ABI.
+ */
+enum user_reg_flag {
+ /* Event will not delete upon last reference closing */
+ USER_EVENT_REG_PERSIST = 1U << 0,
+
+ /* This value or above is currently non-ABI */
+ USER_EVENT_REG_MAX = 1U << 1,
+};
+
+/*
* Stores the system name, tables, and locks for a group of events. This
* allows isolation for events by various means.
*/
@@ -85,8 +97,10 @@ struct user_event {
struct hlist_node node;
struct list_head fields;
struct list_head validators;
+ struct work_struct put_work;
refcount_t refcnt;
int min_size;
+ int reg_flags;
char status;
};
@@ -96,12 +110,12 @@ struct user_event {
* these to track enablement sites that are tied to an event.
*/
struct user_event_enabler {
- struct list_head link;
+ struct list_head mm_enablers_link;
struct user_event *event;
unsigned long addr;
/* Track enable bit, flags, etc. Aligned for bitops. */
- unsigned int values;
+ unsigned long values;
};
/* Bits 0-5 are for the bit to update upon enable/disable (0-63 allowed) */
@@ -116,7 +130,9 @@ struct user_event_enabler {
/* Only duplicate the bit value */
#define ENABLE_VAL_DUP_MASK ENABLE_VAL_BIT_MASK
-#define ENABLE_BITOPS(e) ((unsigned long *)&(e)->values)
+#define ENABLE_BITOPS(e) (&(e)->values)
+
+#define ENABLE_BIT(e) ((int)((e)->values & ENABLE_VAL_BIT_MASK))
/* Used for asynchronous faulting in of pages */
struct user_event_enabler_fault {
@@ -153,7 +169,7 @@ struct user_event_file_info {
#define VALIDATOR_REL (1 << 1)
struct user_event_validator {
- struct list_head link;
+ struct list_head user_event_link;
int offset;
int flags;
};
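
A note on the enabler 'values' word reworked a few hunks up: bits 0-5 hold the bit number to flip in user memory (0-63) and the higher bits hold flags, which is why it became an unsigned long, so the bitops helpers operate on a correctly sized word. A sketch of the packing (the flag position shown is illustrative):

#include <stdio.h>

#define ENABLE_VAL_BIT_MASK	0x3FUL		/* bits 0-5: bit number 0-63 */
#define ENABLE_VAL_FAULTING	(1UL << 6)	/* illustrative flag position */

struct enabler { unsigned long values; };

/* mask the bit number back out of the packed word */
#define ENABLE_BIT(e) ((int)((e)->values & ENABLE_VAL_BIT_MASK))

int main(void)
{
	struct enabler e = { .values = 31 | ENABLE_VAL_FAULTING };

	printf("bit=%d faulting=%d\n", ENABLE_BIT(&e),
	       !!(e.values & ENABLE_VAL_FAULTING));	/* bit=31 faulting=1 */
	return 0;
}
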
@@ -163,76 +179,151 @@ typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i,
static int user_event_parse(struct user_event_group *group, char *name,
char *args, char *flags,
- struct user_event **newuser);
+ struct user_event **newuser, int reg_flags);
static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm);
static struct user_event_mm *user_event_mm_get_all(struct user_event *user);
static void user_event_mm_put(struct user_event_mm *mm);
+static int destroy_user_event(struct user_event *user);
static u32 user_event_key(char *name)
{
return jhash(name, strlen(name), 0);
}
-static void user_event_group_destroy(struct user_event_group *group)
+static struct user_event *user_event_get(struct user_event *user)
{
- kfree(group->system_name);
- kfree(group);
+ refcount_inc(&user->refcnt);
+
+ return user;
}
-static char *user_event_group_system_name(struct user_namespace *user_ns)
+static void delayed_destroy_user_event(struct work_struct *work)
{
- char *system_name;
- int len = sizeof(USER_EVENTS_SYSTEM) + 1;
+ struct user_event *user = container_of(
+ work, struct user_event, put_work);
+
+ mutex_lock(&event_mutex);
- if (user_ns != &init_user_ns) {
+ if (!refcount_dec_and_test(&user->refcnt))
+ goto out;
+
+ if (destroy_user_event(user)) {
/*
- * Unexpected at this point:
- * We only currently support init_user_ns.
- * When we enable more, this will trigger a failure so log.
+ * The only reason this would fail here is if we cannot
+ * update the visibility of the event. In this case the
+ * event stays in the hashtable, waiting for someone to
+ * attempt to delete it later.
*/
- pr_warn("user_events: Namespace other than init_user_ns!\n");
- return NULL;
+ pr_warn("user_events: Unable to delete event\n");
+ refcount_set(&user->refcnt, 1);
}
+out:
+ mutex_unlock(&event_mutex);
+}
- system_name = kmalloc(len, GFP_KERNEL);
+static void user_event_put(struct user_event *user, bool locked)
+{
+ bool delete;
- if (!system_name)
- return NULL;
+ if (unlikely(!user))
+ return;
- snprintf(system_name, len, "%s", USER_EVENTS_SYSTEM);
+ /*
+ * When the event is not enabled for auto-delete there will always
+ * be at least 1 reference to the event. During the event creation
+ * we initially set the refcnt to 2 to achieve this. In those cases
+ * the caller must acquire event_mutex and after decrement check if
+ * the refcnt is 1, meaning this is the last reference. When auto
+ * delete is enabled, there will only be 1 ref, i.e. refcnt will
+ * only be set to 1 during creation to allow the checks below to go
+ * through upon the last put. The last put must always be done with
+ * the event mutex held.
+ */
+ if (!locked) {
+ lockdep_assert_not_held(&event_mutex);
+ delete = refcount_dec_and_mutex_lock(&user->refcnt, &event_mutex);
+ } else {
+ lockdep_assert_held(&event_mutex);
+ delete = refcount_dec_and_test(&user->refcnt);
+ }
- return system_name;
+ if (!delete)
+ return;
+
+ /*
+ * We now have the event_mutex in all cases, which ensures that
+ * no new references will be taken until event_mutex is released.
+ * New references come through find_user_event(), which requires
+ * the event_mutex to be held.
+ */
+
+ if (user->reg_flags & USER_EVENT_REG_PERSIST) {
+ /* We should not get here when persist flag is set */
+ pr_alert("BUG: Auto-delete engaged on persistent event\n");
+ goto out;
+ }
+
+ /*
+ * Unfortunately we have to attempt the actual destroy in a work
+ * queue. This is because not all cases handle a trace_event_call
+ * being removed within the class->reg() operation for unregister.
+ */
+ INIT_WORK(&user->put_work, delayed_destroy_user_event);
+
+ /*
+ * Since the event is still in the hashtable, we have to re-inc
+ * the ref count to 1. This count will be decremented and checked
+ * in the work queue to ensure it's still the last ref. This is
+ * needed because a user-process could register the same event in
+ * between the time of event_mutex release and the work queue
+ * running the delayed destroy. If we removed the item now from
+ * the hashtable, this would result in a timing window where a
+ * user process would fail a register because the trace_event_call
+ * register would fail in the tracing layers.
+ */
+ refcount_set(&user->refcnt, 1);
+
+ if (WARN_ON_ONCE(!schedule_work(&user->put_work))) {
+ /*
+ * If we fail we must wait for an admin to attempt delete or
+ * another register/close of the event, whichever is first.
+ */
+ pr_warn("user_events: Unable to queue delayed destroy\n");
+ }
+out:
+ /* Ensure if we didn't have event_mutex before we unlock it */
+ if (!locked)
+ mutex_unlock(&event_mutex);
}
-static inline struct user_event_group
-*user_event_group_from_user_ns(struct user_namespace *user_ns)
+static void user_event_group_destroy(struct user_event_group *group)
{
- if (user_ns == &init_user_ns)
- return init_group;
-
- return NULL;
+ kfree(group->system_name);
+ kfree(group);
}
-static struct user_event_group *current_user_event_group(void)
+static char *user_event_group_system_name(void)
{
- struct user_namespace *user_ns = current_user_ns();
- struct user_event_group *group = NULL;
+ char *system_name;
+ int len = sizeof(USER_EVENTS_SYSTEM) + 1;
- while (user_ns) {
- group = user_event_group_from_user_ns(user_ns);
+ system_name = kmalloc(len, GFP_KERNEL);
- if (group)
- break;
+ if (!system_name)
+ return NULL;
- user_ns = user_ns->parent;
- }
+ snprintf(system_name, len, "%s", USER_EVENTS_SYSTEM);
- return group;
+ return system_name;
+}
+
+static struct user_event_group *current_user_event_group(void)
+{
+ return init_group;
}
-static struct user_event_group
-*user_event_group_create(struct user_namespace *user_ns)
+static struct user_event_group *user_event_group_create(void)
{
struct user_event_group *group;
@@ -241,7 +332,7 @@ static struct user_event_group
if (!group)
return NULL;
- group->system_name = user_event_group_system_name(user_ns);
+ group->system_name = user_event_group_system_name();
if (!group->system_name)
goto error;
@@ -257,12 +348,13 @@ error:
return NULL;
};
-static void user_event_enabler_destroy(struct user_event_enabler *enabler)
+static void user_event_enabler_destroy(struct user_event_enabler *enabler,
+ bool locked)
{
- list_del_rcu(&enabler->link);
+ list_del_rcu(&enabler->mm_enablers_link);
/* No longer tracking the event via the enabler */
- refcount_dec(&enabler->event->refcnt);
+ user_event_put(enabler->event, locked);
kfree(enabler);
}
@@ -324,7 +416,7 @@ static void user_event_enabler_fault_fixup(struct work_struct *work)
/* User asked for enabler to be removed during fault */
if (test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler))) {
- user_event_enabler_destroy(enabler);
+ user_event_enabler_destroy(enabler, true);
goto out;
}
@@ -423,9 +515,9 @@ static int user_event_enabler_write(struct user_event_mm *mm,
/* Update bit atomically, user tracers must be atomic as well */
if (enabler->event && enabler->event->status)
- set_bit(enabler->values & ENABLE_VAL_BIT_MASK, ptr);
+ set_bit(ENABLE_BIT(enabler), ptr);
else
- clear_bit(enabler->values & ENABLE_VAL_BIT_MASK, ptr);
+ clear_bit(ENABLE_BIT(enabler), ptr);
kunmap_local(kaddr);
unpin_user_pages_dirty_lock(&page, 1, true);
@@ -437,11 +529,9 @@ static bool user_event_enabler_exists(struct user_event_mm *mm,
unsigned long uaddr, unsigned char bit)
{
struct user_event_enabler *enabler;
- struct user_event_enabler *next;
- list_for_each_entry_safe(enabler, next, &mm->enablers, link) {
- if (enabler->addr == uaddr &&
- (enabler->values & ENABLE_VAL_BIT_MASK) == bit)
+ list_for_each_entry(enabler, &mm->enablers, mm_enablers_link) {
+ if (enabler->addr == uaddr && ENABLE_BIT(enabler) == bit)
return true;
}
@@ -451,23 +541,36 @@ static bool user_event_enabler_exists(struct user_event_mm *mm,
static void user_event_enabler_update(struct user_event *user)
{
struct user_event_enabler *enabler;
- struct user_event_mm *mm = user_event_mm_get_all(user);
struct user_event_mm *next;
+ struct user_event_mm *mm;
int attempt;
+ lockdep_assert_held(&event_mutex);
+
+ /*
+ * We need to build a one-shot list of all the mms that have an
+ * enabler for the user_event passed in. This list is only valid
+	 * while holding the event_mutex. This is because the global mm
+	 * list is RCU protected and we use methods which can wait
+	 * (mmap_read_lock and pin_user_pages_remote).
+ *
+ * NOTE: user_event_mm_get_all() increments the ref count of each
+ * mm that is added to the list to prevent removal timing windows.
+ * We must always put each mm after they are used, which may wait.
+ */
+ mm = user_event_mm_get_all(user);
+
while (mm) {
next = mm->next;
mmap_read_lock(mm->mm);
- rcu_read_lock();
- list_for_each_entry_rcu(enabler, &mm->enablers, link) {
+ list_for_each_entry(enabler, &mm->enablers, mm_enablers_link) {
if (enabler->event == user) {
attempt = 0;
user_event_enabler_write(mm, enabler, true, &attempt);
}
}
- rcu_read_unlock();
mmap_read_unlock(mm->mm);
user_event_mm_put(mm);
mm = next;
@@ -488,14 +591,14 @@ static bool user_event_enabler_dup(struct user_event_enabler *orig,
if (!enabler)
return false;
- enabler->event = orig->event;
+ enabler->event = user_event_get(orig->event);
enabler->addr = orig->addr;
/* Only dup part of value (ignore future flags, etc) */
enabler->values = orig->values & ENABLE_VAL_DUP_MASK;
- refcount_inc(&enabler->event->refcnt);
- list_add_rcu(&enabler->link, &mm->enablers);
+ /* Enablers not exposed yet, RCU not required */
+ list_add(&enabler->mm_enablers_link, &mm->enablers);
return true;
}
@@ -514,6 +617,14 @@ static struct user_event_mm *user_event_mm_get_all(struct user_event *user)
struct user_event_mm *mm;
/*
+ * We use the mm->next field to build a one-shot list from the global
+ * RCU protected list. To build this list the event_mutex must be held.
+ * This lets us build a list without requiring allocs that could fail
+ * when user based events are most wanted for diagnostics.
+ */
+ lockdep_assert_held(&event_mutex);
+
+ /*
* We do not want to block fork/exec while enablements are being
* updated, so we use RCU to walk the current tasks that have used
* user_events ABI for 1 or more events. Each enabler found in each
@@ -525,23 +636,24 @@ static struct user_event_mm *user_event_mm_get_all(struct user_event *user)
*/
rcu_read_lock();
- list_for_each_entry_rcu(mm, &user_event_mms, link)
- list_for_each_entry_rcu(enabler, &mm->enablers, link)
+ list_for_each_entry_rcu(mm, &user_event_mms, mms_link) {
+ list_for_each_entry_rcu(enabler, &mm->enablers, mm_enablers_link) {
if (enabler->event == user) {
mm->next = found;
found = user_event_mm_get(mm);
break;
}
+ }
+ }
rcu_read_unlock();
return found;
}
-static struct user_event_mm *user_event_mm_create(struct task_struct *t)
+static struct user_event_mm *user_event_mm_alloc(struct task_struct *t)
{
struct user_event_mm *user_mm;
- unsigned long flags;
user_mm = kzalloc(sizeof(*user_mm), GFP_KERNEL_ACCOUNT);
@@ -553,12 +665,6 @@ static struct user_event_mm *user_event_mm_create(struct task_struct *t)
refcount_set(&user_mm->refcnt, 1);
refcount_set(&user_mm->tasks, 1);
- spin_lock_irqsave(&user_event_mms_lock, flags);
- list_add_rcu(&user_mm->link, &user_event_mms);
- spin_unlock_irqrestore(&user_event_mms_lock, flags);
-
- t->user_event_mm = user_mm;
-
/*
* The lifetime of the memory descriptor can slightly outlast
* the task lifetime if a ref to the user_event_mm is taken
@@ -572,6 +678,17 @@ static struct user_event_mm *user_event_mm_create(struct task_struct *t)
return user_mm;
}
+static void user_event_mm_attach(struct user_event_mm *user_mm, struct task_struct *t)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&user_event_mms_lock, flags);
+ list_add_rcu(&user_mm->mms_link, &user_event_mms);
+ spin_unlock_irqrestore(&user_event_mms_lock, flags);
+
+ t->user_event_mm = user_mm;
+}
+
static struct user_event_mm *current_user_event_mm(void)
{
struct user_event_mm *user_mm = current->user_event_mm;
@@ -579,10 +696,12 @@ static struct user_event_mm *current_user_event_mm(void)
if (user_mm)
goto inc;
- user_mm = user_event_mm_create(current);
+ user_mm = user_event_mm_alloc(current);
if (!user_mm)
goto error;
+
+ user_event_mm_attach(user_mm, current);
inc:
refcount_inc(&user_mm->refcnt);
error:
@@ -593,8 +712,8 @@ static void user_event_mm_destroy(struct user_event_mm *mm)
{
struct user_event_enabler *enabler, *next;
- list_for_each_entry_safe(enabler, next, &mm->enablers, link)
- user_event_enabler_destroy(enabler);
+ list_for_each_entry_safe(enabler, next, &mm->enablers, mm_enablers_link)
+ user_event_enabler_destroy(enabler, false);
mmdrop(mm->mm);
kfree(mm);
@@ -630,7 +749,7 @@ void user_event_mm_remove(struct task_struct *t)
/* Remove the mm from the list, so it can no longer be enabled */
spin_lock_irqsave(&user_event_mms_lock, flags);
- list_del_rcu(&mm->link);
+ list_del_rcu(&mm->mms_link);
spin_unlock_irqrestore(&user_event_mms_lock, flags);
/*
@@ -670,7 +789,7 @@ void user_event_mm_remove(struct task_struct *t)
void user_event_mm_dup(struct task_struct *t, struct user_event_mm *old_mm)
{
- struct user_event_mm *mm = user_event_mm_create(t);
+ struct user_event_mm *mm = user_event_mm_alloc(t);
struct user_event_enabler *enabler;
if (!mm)
@@ -678,16 +797,18 @@ void user_event_mm_dup(struct task_struct *t, struct user_event_mm *old_mm)
rcu_read_lock();
- list_for_each_entry_rcu(enabler, &old_mm->enablers, link)
+ list_for_each_entry_rcu(enabler, &old_mm->enablers, mm_enablers_link) {
if (!user_event_enabler_dup(enabler, mm))
goto error;
+ }
rcu_read_unlock();
+ user_event_mm_attach(mm, t);
return;
error:
rcu_read_unlock();
- user_event_mm_remove(t);
+ user_event_mm_destroy(mm);
}
static bool current_user_event_enabler_exists(unsigned long uaddr,
@@ -747,8 +868,8 @@ retry:
* exit or run exec(), which includes forks and clones.
*/
if (!*write_result) {
- refcount_inc(&enabler->event->refcnt);
- list_add_rcu(&enabler->link, &user_mm->enablers);
+ user_event_get(user);
+ list_add_rcu(&enabler->mm_enablers_link, &user_mm->enablers);
}
mutex_unlock(&event_mutex);
@@ -770,7 +891,12 @@ out:
static __always_inline __must_check
bool user_event_last_ref(struct user_event *user)
{
- return refcount_read(&user->refcnt) == 1;
+ int last = 0;
+
+ if (user->reg_flags & USER_EVENT_REG_PERSIST)
+ last = 1;
+
+ return refcount_read(&user->refcnt) == last;
}
static __always_inline __must_check
@@ -809,7 +935,8 @@ static struct list_head *user_event_get_fields(struct trace_event_call *call)
* Upon success user_event has its ref count increased by 1.
*/
static int user_event_parse_cmd(struct user_event_group *group,
- char *raw_command, struct user_event **newuser)
+ char *raw_command, struct user_event **newuser,
+ int reg_flags)
{
char *name = raw_command;
char *args = strpbrk(name, " ");
@@ -823,7 +950,7 @@ static int user_event_parse_cmd(struct user_event_group *group,
if (flags)
*flags++ = '\0';
- return user_event_parse(group, name, args, flags, newuser);
+ return user_event_parse(group, name, args, flags, newuser, reg_flags);
}
static int user_field_array_size(const char *type)
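
user_event_parse_cmd() above now forwards reg_flags; the in-place splitting of the raw command is unchanged. For reference, a user-space sketch of that strpbrk() splitting (the demo string is made up):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char cmd[] = "myevent:FLAG u32 count; char msg[16]";
	char *name = cmd;
	char *args = strpbrk(name, " ");
	char *flags;

	if (args)
		*args++ = '\0';		/* split the name from the arguments */

	flags = strpbrk(name, ":");
	if (flags)
		*flags++ = '\0';	/* split the flags from the name */

	printf("name=%s flags=%s args=%s\n",
	       name, flags ? flags : "(none)", args ? args : "(none)");
	/* name=myevent flags=FLAG args=u32 count; char msg[16] */
	return 0;
}
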
@@ -904,8 +1031,8 @@ static void user_event_destroy_validators(struct user_event *user)
struct user_event_validator *validator, *next;
struct list_head *head = &user->validators;
- list_for_each_entry_safe(validator, next, head, link) {
- list_del(&validator->link);
+ list_for_each_entry_safe(validator, next, head, user_event_link) {
+ list_del(&validator->user_event_link);
kfree(validator);
}
}
@@ -959,7 +1086,7 @@ add_validator:
validator->offset = offset;
/* Want sequential access when validating */
- list_add_tail(&validator->link, &user->validators);
+ list_add_tail(&validator->user_event_link, &user->validators);
add_field:
field->type = type;
@@ -1334,10 +1461,8 @@ static struct user_event *find_user_event(struct user_event_group *group,
*outkey = key;
hash_for_each_possible(group->register_table, user, node, key)
- if (!strcmp(EVENT_NAME(user), name)) {
- refcount_inc(&user->refcnt);
- return user;
- }
+ if (!strcmp(EVENT_NAME(user), name))
+ return user_event_get(user);
return NULL;
}
@@ -1349,7 +1474,7 @@ static int user_event_validate(struct user_event *user, void *data, int len)
void *pos, *end = data + len;
u32 loc, offset, size;
- list_for_each_entry(validator, head, link) {
+ list_for_each_entry(validator, head, user_event_link) {
pos = data + validator->offset;
/* Already done min_size check, no bounds check here */
@@ -1399,7 +1524,7 @@ static void user_event_ftrace(struct user_event *user, struct iov_iter *i,
if (unlikely(!entry))
return;
- if (unlikely(!copy_nofault(entry + 1, i->count, i)))
+ if (unlikely(i->count != 0 && !copy_nofault(entry + 1, i->count, i)))
goto discard;
if (!list_empty(&user->validators) &&
@@ -1440,7 +1565,7 @@ static void user_event_perf(struct user_event *user, struct iov_iter *i,
perf_fetch_caller_regs(regs);
- if (unlikely(!copy_nofault(perf_entry + 1, i->count, i)))
+ if (unlikely(i->count != 0 && !copy_nofault(perf_entry + 1, i->count, i)))
goto discard;
if (!list_empty(&user->validators) &&
@@ -1551,12 +1676,12 @@ static int user_event_reg(struct trace_event_call *call,
return ret;
inc:
- refcount_inc(&user->refcnt);
+ user_event_get(user);
update_enable_bit_for(user);
return 0;
dec:
update_enable_bit_for(user);
- refcount_dec(&user->refcnt);
+ user_event_put(user, true);
return 0;
}
@@ -1587,10 +1712,11 @@ static int user_event_create(const char *raw_command)
mutex_lock(&group->reg_mutex);
- ret = user_event_parse_cmd(group, name, &user);
+	/* Dyn events persist, otherwise they would be cleaned up immediately */
+ ret = user_event_parse_cmd(group, name, &user, USER_EVENT_REG_PERSIST);
if (!ret)
- refcount_dec(&user->refcnt);
+ user_event_put(user, false);
mutex_unlock(&group->reg_mutex);
@@ -1712,6 +1838,8 @@ static bool user_event_match(const char *system, const char *event,
if (match && argc > 0)
match = user_fields_match(user, argc, argv);
+ else if (match && argc == 0)
+ match = list_empty(&user->fields);
return match;
}
@@ -1748,11 +1876,17 @@ static int user_event_trace_register(struct user_event *user)
*/
static int user_event_parse(struct user_event_group *group, char *name,
char *args, char *flags,
- struct user_event **newuser)
+ struct user_event **newuser, int reg_flags)
{
int ret;
u32 key;
struct user_event *user;
+ int argc = 0;
+ char **argv;
+
+ /* User register flags are not ready yet */
+ if (reg_flags != 0 || flags != NULL)
+ return -EINVAL;
/* Prevent dyn_event from racing */
mutex_lock(&event_mutex);
@@ -1760,13 +1894,35 @@ static int user_event_parse(struct user_event_group *group, char *name,
mutex_unlock(&event_mutex);
if (user) {
- *newuser = user;
- /*
- * Name is allocated by caller, free it since it already exists.
- * Caller only worries about failure cases for freeing.
- */
- kfree(name);
+ if (args) {
+ argv = argv_split(GFP_KERNEL, args, &argc);
+ if (!argv) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ ret = user_fields_match(user, argc, (const char **)argv);
+ argv_free(argv);
+
+ } else
+ ret = list_empty(&user->fields);
+
+ if (ret) {
+ *newuser = user;
+ /*
+ * Name is allocated by caller, free it since it already exists.
+ * Caller only worries about failure cases for freeing.
+ */
+ kfree(name);
+ } else {
+ ret = -EADDRINUSE;
+ goto error;
+ }
+
return 0;
+error:
+ user_event_put(user, false);
+ return ret;
}
user = kzalloc(sizeof(*user), GFP_KERNEL_ACCOUNT);
@@ -1819,8 +1975,15 @@ static int user_event_parse(struct user_event_group *group, char *name,
if (ret)
goto put_user_lock;
- /* Ensure we track self ref and caller ref (2) */
- refcount_set(&user->refcnt, 2);
+ user->reg_flags = reg_flags;
+
+ if (user->reg_flags & USER_EVENT_REG_PERSIST) {
+ /* Ensure we track self ref and caller ref (2) */
+ refcount_set(&user->refcnt, 2);
+ } else {
+ /* Ensure we track only caller ref (1) */
+ refcount_set(&user->refcnt, 1);
+ }
dyn_event_init(&user->devent, &user_event_dops);
dyn_event_add(&user->devent, &user->call);
@@ -1852,7 +2015,7 @@ static int delete_user_event(struct user_event_group *group, char *name)
if (!user)
return -ENOENT;
- refcount_dec(&user->refcnt);
+ user_event_put(user, true);
if (!user_event_last_ref(user))
return -EBUSY;
@@ -2011,9 +2174,7 @@ static int user_events_ref_add(struct user_event_file_info *info,
for (i = 0; i < count; ++i)
new_refs->events[i] = refs->events[i];
- new_refs->events[i] = user;
-
- refcount_inc(&user->refcnt);
+ new_refs->events[i] = user_event_get(user);
rcu_assign_pointer(info->refs, new_refs);
@@ -2044,8 +2205,8 @@ static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg)
if (ret)
return ret;
- /* Ensure no flags, since we don't support any yet */
- if (kreg->flags != 0)
+ /* Ensure only valid flags */
+ if (kreg->flags & ~(USER_EVENT_REG_MAX-1))
return -EINVAL;
/* Ensure supported size */
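
The new check accepts exactly the bits below USER_EVENT_REG_MAX, which the uapi header defines as the first non-ABI bit, so flags & ~(USER_EVENT_REG_MAX - 1) means "some unknown bit is set". A sketch of the idiom, with flag values assumed to mirror include/uapi/linux/user_events.h:

	#include <assert.h>

	/* Assumed to mirror the uapi definitions. */
	#define USER_EVENT_REG_PERSIST	(1U << 0)
	#define USER_EVENT_REG_MAX	(1U << 1)	/* first non-ABI bit */

	static int validate_reg_flags(unsigned int flags)
	{
		/* REG_MAX - 1 is a mask of every currently defined flag. */
		if (flags & ~(USER_EVENT_REG_MAX - 1))
			return -22;	/* -EINVAL */

		return 0;
	}

	int main(void)
	{
		assert(validate_reg_flags(0) == 0);
		assert(validate_reg_flags(USER_EVENT_REG_PERSIST) == 0);
		assert(validate_reg_flags(1U << 3) != 0);	/* unknown bit */
		return 0;
	}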
@@ -2117,7 +2278,7 @@ static long user_events_ioctl_reg(struct user_event_file_info *info,
return ret;
}
- ret = user_event_parse_cmd(info->group, name, &user);
+ ret = user_event_parse_cmd(info->group, name, &user, reg.flags);
if (ret) {
kfree(name);
@@ -2127,7 +2288,7 @@ static long user_events_ioctl_reg(struct user_event_file_info *info,
ret = user_events_ref_add(info, user);
/* The parse ref is no longer needed; ref_add either succeeded or failed */
- refcount_dec(&user->refcnt);
+ user_event_put(user, false);
/* Positive number is index and valid */
if (ret < 0)
@@ -2270,17 +2431,18 @@ static long user_events_ioctl_unreg(unsigned long uarg)
*/
mutex_lock(&event_mutex);
- list_for_each_entry_safe(enabler, next, &mm->enablers, link)
+ list_for_each_entry_safe(enabler, next, &mm->enablers, mm_enablers_link) {
if (enabler->addr == reg.disable_addr &&
- (enabler->values & ENABLE_VAL_BIT_MASK) == reg.disable_bit) {
+ ENABLE_BIT(enabler) == reg.disable_bit) {
set_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler));
if (!test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler)))
- user_event_enabler_destroy(enabler);
+ user_event_enabler_destroy(enabler, true);
/* Removed at least one */
ret = 0;
}
+ }
mutex_unlock(&event_mutex);
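
ENABLE_BIT() and ENABLE_BITOPS() hide the layout of enabler->values: the low bits store which bit of the user-space word to flip, while higher bits carry state such as the FAULTING and FREEING flags. A compact model of that packing (mask and bit positions assumed from the file's defines):

	#include <assert.h>

	/* Assumed layout: bits 0-5 = target bit index, bits 6/7 = state. */
	#define ENABLE_VAL_BIT_MASK	0x3FUL
	#define ENABLE_VAL_FAULTING	(1UL << 6)
	#define ENABLE_VAL_FREEING	(1UL << 7)

	struct enabler {
		unsigned long values;
	};

	static int enable_bit(const struct enabler *e)
	{
		return (int)(e->values & ENABLE_VAL_BIT_MASK);
	}

	int main(void)
	{
		struct enabler e = { .values = 31 | ENABLE_VAL_FREEING };

		/* State bits do not leak into the reported bit index. */
		assert(enable_bit(&e) == 31);
		return 0;
	}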
@@ -2333,7 +2495,6 @@ static int user_events_release(struct inode *node, struct file *file)
struct user_event_file_info *info = file->private_data;
struct user_event_group *group;
struct user_event_refs *refs;
- struct user_event *user;
int i;
if (!info)
@@ -2357,12 +2518,9 @@ static int user_events_release(struct inode *node, struct file *file)
* The underlying user_events are ref counted, and cannot be freed.
* After this decrement, the user_events may be freed elsewhere.
*/
- for (i = 0; i < refs->count; ++i) {
- user = refs->events[i];
+ for (i = 0; i < refs->count; ++i)
+ user_event_put(refs->events[i], false);
- if (user)
- refcount_dec(&user->refcnt);
- }
out:
file->private_data = NULL;
@@ -2543,7 +2701,7 @@ static int __init trace_events_user_init(void)
if (!fault_cache)
return -ENOMEM;
- init_group = user_event_group_create(&init_user_ns);
+ init_group = user_event_group_create();
if (!init_group) {
kmem_cache_destroy(fault_cache);
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index efbbec2caff8..e97e3fa5cbed 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -1652,6 +1652,8 @@ static enum hrtimer_restart timerlat_irq(struct hrtimer *timer)
osnoise_stop_tracing();
notify_new_max_latency(diff);
+ wake_up_process(tlat->kthread);
+
return HRTIMER_NORESTART;
}
}
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 15f05faaae44..1e33f367783e 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -847,7 +847,7 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c
int ret;
void *pos;
- list_for_each_entry(field, head, link) {
+ list_for_each_entry_reverse(field, head, link) {
trace_seq_printf(&iter->seq, " %s=", field->name);
if (field->offset + field->size > iter->ent_size) {
trace_seq_puts(&iter->seq, "<OVERFLOW>");
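
Event fields are prepended to the list as they are parsed, so a forward walk visits them in reverse of their definition order; switching to list_for_each_entry_reverse() makes print_fields() emit them as the event defines them. A toy illustration with a hand-rolled list:

	#include <stdio.h>

	struct field {
		const char *name;
		struct field *next;
	};

	/* Walk to the tail first and print on the way back out — the same
	 * effect list_for_each_entry_reverse() achieves on the kernel list. */
	static void print_reverse(const struct field *f)
	{
		if (!f)
			return;

		print_reverse(f->next);
		printf(" %s=...", f->name);
	}

	int main(void)
	{
		/* Defined a, then b, then c; each new field is prepended. */
		struct field a = { "a", NULL };
		struct field b = { "b", &a };
		struct field c = { "c", &b };

		print_reverse(&c);	/* prints " a=... b=... c=..." */
		putchar('\n');
		return 0;
	}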
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index ef8ed3b65d05..6a4ecfb1da43 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -308,7 +308,7 @@ trace_probe_primary_from_call(struct trace_event_call *call)
{
struct trace_probe_event *tpe = trace_probe_event_from_call(call);
- return list_first_entry(&tpe->probes, struct trace_probe, list);
+ return list_first_entry_or_null(&tpe->probes, struct trace_probe, list);
}
static inline struct list_head *trace_probe_probe_list(struct trace_probe *tp)
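
list_first_entry() blindly converts head->next into an entry, so on an empty list (where head->next points back at the head) it yields a garbage pointer derived from the head itself; the _or_null variant lets trace_probe_primary_from_call() report emptiness instead. A minimal re-implementation of the two macros to show the difference:

	#include <stddef.h>
	#include <stdio.h>

	struct list_head {
		struct list_head *next, *prev;
	};

	#define LIST_HEAD_INIT(name)	{ &(name), &(name) }
	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))
	#define list_first_entry(head, type, member) \
		container_of((head)->next, type, member)
	#define list_first_entry_or_null(head, type, member) \
		((head)->next != (head) ? \
			list_first_entry(head, type, member) : NULL)

	struct trace_probe {
		int id;
		struct list_head list;
	};

	int main(void)
	{
		struct list_head probes = LIST_HEAD_INIT(probes);

		/* Empty list: the _or_null form reports NULL instead of a
		 * pointer computed from &probes itself. */
		printf("%p\n", (void *)list_first_entry_or_null(&probes,
						struct trace_probe, list));
		return 0;
	}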
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index a931d9aaea26..529590499b1f 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -848,6 +848,12 @@ trace_selftest_startup_function_graph(struct tracer *trace,
}
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+ /*
+ * These tests can take some time to run. Make sure that on non-PREEMPT
+ * kernels we do not trigger the softlockup detector.
+ */
+ cond_resched();
+
tracing_reset_online_cpus(&tr->array_buffer);
set_graph_array(tr);
@@ -869,6 +875,8 @@ trace_selftest_startup_function_graph(struct tracer *trace,
if (ret)
goto out;
+ cond_resched();
+
ret = register_ftrace_graph(&fgraph_ops);
if (ret) {
warn_failed_init_tracer(trace, ret);
@@ -891,6 +899,8 @@ trace_selftest_startup_function_graph(struct tracer *trace,
if (ret)
goto out;
+ cond_resched();
+
tracing_start();
if (!ret && !count) {
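
The cond_resched() calls give the scheduler a chance to run between the long selftest phases; on non-PREEMPT kernels nothing else can interrupt a busy kernel thread, which is what trips the softlockup watchdog. A userspace analog of the same courtesy, yielding periodically inside a CPU-bound loop:

	#include <sched.h>

	/* sched_yield() stands in for cond_resched(): the loop occasionally
	 * offers the CPU to anyone runnable instead of hogging it. */
	static unsigned long long busy_selftest(void)
	{
		unsigned long long sum = 0;

		for (unsigned long i = 0; i < 100000000UL; i++) {
			sum += i;		/* expensive verification work */

			if ((i & 0xFFFFF) == 0)
				sched_yield();	/* cond_resched() */
		}

		return sum;
	}

	int main(void)
	{
		return busy_selftest() == 0;	/* exits 0: sum is nonzero */
	}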
diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c
index b7cbd66f889e..da35e5b7f047 100644
--- a/kernel/vhost_task.c
+++ b/kernel/vhost_task.c
@@ -12,58 +12,90 @@ enum vhost_task_flags {
VHOST_TASK_FLAGS_STOP,
};
+struct vhost_task {
+ bool (*fn)(void *data);
+ void *data;
+ struct completion exited;
+ unsigned long flags;
+ struct task_struct *task;
+};
+
static int vhost_task_fn(void *data)
{
struct vhost_task *vtsk = data;
- int ret;
+ bool dead = false;
+
+ for (;;) {
+ bool did_work;
+
+ if (!dead && signal_pending(current)) {
+ struct ksignal ksig;
+ /*
+ * Calling get_signal will block in SIGSTOP,
+ * or clear fatal_signal_pending, but remember
+ * what was set.
+ *
+ * This thread won't actually exit until all
+ * of the file descriptors are closed, and
+ * the release function is called.
+ */
+ dead = get_signal(&ksig);
+ if (dead)
+ clear_thread_flag(TIF_SIGPENDING);
+ }
+
+ /* mb paired w/ vhost_task_stop */
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (test_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags)) {
+ __set_current_state(TASK_RUNNING);
+ break;
+ }
+
+ did_work = vtsk->fn(vtsk->data);
+ if (!did_work)
+ schedule();
+ }
- ret = vtsk->fn(vtsk->data);
complete(&vtsk->exited);
- do_exit(ret);
+ do_exit(0);
}
/**
+ * vhost_task_wake - wakeup the vhost_task
+ * @vtsk: vhost_task to wake
+ *
+ * wake up the vhost_task worker thread
+ */
+void vhost_task_wake(struct vhost_task *vtsk)
+{
+ wake_up_process(vtsk->task);
+}
+EXPORT_SYMBOL_GPL(vhost_task_wake);
+
+/**
* vhost_task_stop - stop a vhost_task
* @vtsk: vhost_task to stop
*
- * Callers must call vhost_task_should_stop and return from their worker
- * function when it returns true;
+ * vhost_task_fn ensures the worker thread exits after
+ * VHOST_TASK_FLAGS_STOP is set.
*/
void vhost_task_stop(struct vhost_task *vtsk)
{
- pid_t pid = vtsk->task->pid;
-
set_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags);
- wake_up_process(vtsk->task);
+ vhost_task_wake(vtsk);
/*
* Make sure vhost_task_fn is no longer accessing the vhost_task before
- * freeing it below. If userspace crashed or exited without closing,
- * then the vhost_task->task could already be marked dead so
- * kernel_wait will return early.
+ * freeing it below.
*/
wait_for_completion(&vtsk->exited);
- /*
- * If we are just closing/removing a device and the parent process is
- * not exiting then reap the task.
- */
- kernel_wait4(pid, NULL, __WCLONE, NULL);
kfree(vtsk);
}
EXPORT_SYMBOL_GPL(vhost_task_stop);
/**
- * vhost_task_should_stop - should the vhost task return from the work function
- * @vtsk: vhost_task to stop
- */
-bool vhost_task_should_stop(struct vhost_task *vtsk)
-{
- return test_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags);
-}
-EXPORT_SYMBOL_GPL(vhost_task_should_stop);
-
-/**
- * vhost_task_create - create a copy of a process to be used by the kernel
- * @fn: thread stack
+ * vhost_task_create - create a copy of a task to be used by the kernel
+ * @fn: vhost worker function
* @arg: data to be passed to fn
* @name: the thread's name
*
@@ -71,17 +103,17 @@ EXPORT_SYMBOL_GPL(vhost_task_should_stop);
* failure. The returned task is inactive, and the caller must fire it up
* through vhost_task_start().
*/
-struct vhost_task *vhost_task_create(int (*fn)(void *), void *arg,
+struct vhost_task *vhost_task_create(bool (*fn)(void *), void *arg,
const char *name)
{
struct kernel_clone_args args = {
- .flags = CLONE_FS | CLONE_UNTRACED | CLONE_VM,
+ .flags = CLONE_FS | CLONE_UNTRACED | CLONE_VM |
+ CLONE_THREAD | CLONE_SIGHAND,
.exit_signal = 0,
.fn = vhost_task_fn,
.name = name,
.user_worker = 1,
.no_files = 1,
- .ignore_signals = 1,
};
struct vhost_task *vtsk;
struct task_struct *tsk;
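
The new worker loop relies on a classic ordering to avoid missed wakeups: the task publishes that it is about to sleep (set_current_state()) before testing the stop flag, and the stopper sets the flag before issuing the wake. Modeled abstractly with C11 atomics — sleeping reduced to a state variable, names hypothetical:

	#include <stdatomic.h>
	#include <stdbool.h>

	static atomic_bool stop_flag;	/* VHOST_TASK_FLAGS_STOP */
	static atomic_int task_state;	/* 0 = RUNNING, 1 = INTERRUPTIBLE */

	/* One worker iteration; returns true when the task should exit. */
	static bool worker_should_stop(void)
	{
		/* Publish "about to sleep" first... */
		atomic_store(&task_state, 1);		/* set_current_state() */

		/* ...then test the flag. If the stopper sets it after this
		 * test, its wake-up still sees task_state == 1 and cancels
		 * the sleep, so the flag cannot be missed. */
		if (atomic_load(&stop_flag)) {
			atomic_store(&task_state, 0);	/* __set_current_state() */
			return true;
		}

		return false;	/* do work, or schedule() if there was none */
	}

	static void stopper(void)
	{
		atomic_store(&stop_flag, true);		/* set_bit(STOP) */
		/* wake_up_process(): cancels a sleep if task_state == 1. */
		atomic_store(&task_state, 0);
	}

	int main(void)
	{
		stopper();
		return worker_should_stop() ? 0 : 1;
	}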
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4666a1a92a31..c913e333cce8 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -705,12 +705,17 @@ static void clear_work_data(struct work_struct *work)
set_work_data(work, WORK_STRUCT_NO_POOL, 0);
}
+static inline struct pool_workqueue *work_struct_pwq(unsigned long data)
+{
+ return (struct pool_workqueue *)(data & WORK_STRUCT_WQ_DATA_MASK);
+}
+
static struct pool_workqueue *get_work_pwq(struct work_struct *work)
{
unsigned long data = atomic_long_read(&work->data);
if (data & WORK_STRUCT_PWQ)
- return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
+ return work_struct_pwq(data);
else
return NULL;
}
@@ -738,8 +743,7 @@ static struct worker_pool *get_work_pool(struct work_struct *work)
assert_rcu_or_pool_mutex();
if (data & WORK_STRUCT_PWQ)
- return ((struct pool_workqueue *)
- (data & WORK_STRUCT_WQ_DATA_MASK))->pool;
+ return work_struct_pwq(data)->pool;
pool_id = data >> WORK_OFFQ_POOL_SHIFT;
if (pool_id == WORK_OFFQ_POOL_NONE)
@@ -760,8 +764,7 @@ static int get_work_pool_id(struct work_struct *work)
unsigned long data = atomic_long_read(&work->data);
if (data & WORK_STRUCT_PWQ)
- return ((struct pool_workqueue *)
- (data & WORK_STRUCT_WQ_DATA_MASK))->pool->id;
+ return work_struct_pwq(data)->pool->id;
return data >> WORK_OFFQ_POOL_SHIFT;
}
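
work_struct_pwq() centralizes the pointer-tagging trick used in all three callers: work->data is either a pool_workqueue pointer (whose low bits are zero by alignment) OR'd with flag bits, or an off-queue word holding a pool id. A runnable miniature of the tagging scheme — the mask width and alignment here are assumptions, not the kernel's exact values:

	#include <assert.h>
	#include <stdio.h>
	#include <stdlib.h>

	/* Assumed: low 8 bits hold flags, so pwqs need 256-byte alignment. */
	#define FLAG_PWQ	0x1UL
	#define WQ_DATA_MASK	(~0xFFUL)

	struct pwq {
		int pool_id;
	};

	static struct pwq *work_struct_pwq(unsigned long data)
	{
		return (struct pwq *)(data & WQ_DATA_MASK);
	}

	int main(void)
	{
		struct pwq *pwq = aligned_alloc(256, 256);
		unsigned long data;

		pwq->pool_id = 7;
		data = (unsigned long)pwq | FLAG_PWQ;	/* tag the pointer */

		if (data & FLAG_PWQ)
			printf("pool id %d\n", work_struct_pwq(data)->pool_id);

		assert(work_struct_pwq(data) == pwq);
		free(pwq);
		return 0;
	}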