summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/bpf_inode_storage.c2
-rw-r--r--kernel/bpf/bpf_struct_ops.c2
-rw-r--r--kernel/bpf/btf.c2
-rw-r--r--kernel/bpf/core.c10
-rw-r--r--kernel/bpf/disasm.c2
-rw-r--r--kernel/bpf/inode.c4
-rw-r--r--kernel/bpf/preload/bpf_preload_kern.c19
-rw-r--r--kernel/bpf/stackmap.c12
-rw-r--r--kernel/bpf/syscall.c5
-rw-r--r--kernel/bpf/trampoline.c248
-rw-r--r--kernel/bpf/verifier.c341
-rw-r--r--kernel/events/core.c42
-rw-r--r--kernel/fork.c24
-rw-r--r--kernel/futex.c3
-rw-r--r--kernel/gcov/clang.c88
-rw-r--r--kernel/irq/irq_sim.c4
-rw-r--r--kernel/irq/irqdomain.c9
-rw-r--r--kernel/irq/manage.c4
-rw-r--r--kernel/jump_label.c8
-rw-r--r--kernel/locking/lockdep.c5
-rw-r--r--kernel/locking/mutex.c25
-rw-r--r--kernel/power/energy_model.c2
-rw-r--r--kernel/ptrace.c2
-rw-r--r--kernel/reboot.c2
-rw-r--r--kernel/sched/core.c126
-rw-r--r--kernel/sched/membarrier.c4
-rw-r--r--kernel/signal.c14
-rw-r--r--kernel/static_call.c47
-rw-r--r--kernel/sys.c2
-rw-r--r--kernel/time/alarmtimer.c2
-rw-r--r--kernel/time/hrtimer.c62
-rw-r--r--kernel/time/posix-cpu-timers.c2
-rw-r--r--kernel/trace/ftrace.c52
-rw-r--r--kernel/trace/trace.c13
-rw-r--r--kernel/trace/trace_dynevent.c6
-rw-r--r--kernel/user_namespace.c65
-rw-r--r--kernel/usermode_driver.c21
-rw-r--r--kernel/watchdog.c5
-rw-r--r--kernel/workqueue.c19
39 files changed, 937 insertions, 368 deletions
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 6639640523c0..b58b2efb9b43 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -109,7 +109,7 @@ static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)
fd = *(int *)key;
f = fget_raw(fd);
if (!f)
- return NULL;
+ return ERR_PTR(-EBADF);
sdata = inode_storage_lookup(f->f_inode, map, true);
fput(f);
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 1a666a975416..70f6fd4fa305 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -430,7 +430,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
tprogs[BPF_TRAMP_FENTRY].progs[0] = prog;
tprogs[BPF_TRAMP_FENTRY].nr_progs = 1;
- err = arch_prepare_bpf_trampoline(image,
+ err = arch_prepare_bpf_trampoline(NULL, image,
st_map->image + PAGE_SIZE,
&st_ops->func_models[i], 0,
tprogs, NULL);
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 2efeb5f4b343..b1a76fe046cb 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -4321,8 +4321,6 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf,
* is not supported yet.
* BPF_PROG_TYPE_RAW_TRACEPOINT is fine.
*/
- if (log->level & BPF_LOG_LEVEL)
- bpf_log(log, "arg#%d type is not a struct\n", arg);
return NULL;
}
tname = btf_name_by_offset(btf, t->name_off);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 0ae015ad1e05..75244ecb2389 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -827,7 +827,7 @@ static int __init bpf_jit_charge_init(void)
}
pure_initcall(bpf_jit_charge_init);
-static int bpf_jit_charge_modmem(u32 pages)
+int bpf_jit_charge_modmem(u32 pages)
{
if (atomic_long_add_return(pages, &bpf_jit_current) >
(bpf_jit_limit >> PAGE_SHIFT)) {
@@ -840,7 +840,7 @@ static int bpf_jit_charge_modmem(u32 pages)
return 0;
}
-static void bpf_jit_uncharge_modmem(u32 pages)
+void bpf_jit_uncharge_modmem(u32 pages)
{
atomic_long_sub(pages, &bpf_jit_current);
}
@@ -1118,6 +1118,8 @@ static void bpf_prog_clone_free(struct bpf_prog *fp)
* clone is guaranteed to not be locked.
*/
fp->aux = NULL;
+ fp->stats = NULL;
+ fp->active = NULL;
__bpf_prog_free(fp);
}
@@ -2342,6 +2344,10 @@ bool __weak bpf_helper_changes_pkt_data(void *func)
/* Return TRUE if the JIT backend wants verifier to enable sub-register usage
* analysis code and wants explicit zero extension inserted by verifier.
* Otherwise, return FALSE.
+ *
+ * The verifier inserts an explicit zero extension after BPF_CMPXCHGs even if
+ * you don't override this. JITs that don't want these extra insns can detect
+ * them using insn_is_zext.
*/
bool __weak bpf_jit_needs_zext(void)
{
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index 3acc7e0b6916..faa54d58972c 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -84,7 +84,7 @@ static const char *const bpf_atomic_alu_string[16] = {
[BPF_ADD >> 4] = "add",
[BPF_AND >> 4] = "and",
[BPF_OR >> 4] = "or",
- [BPF_XOR >> 4] = "or",
+ [BPF_XOR >> 4] = "xor",
};
static const char *const bpf_ldst_string[] = {
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 1576ff331ee4..d2de2abec35b 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -543,11 +543,11 @@ int bpf_obj_get_user(const char __user *pathname, int flags)
return PTR_ERR(raw);
if (type == BPF_TYPE_PROG)
- ret = bpf_prog_new_fd(raw);
+ ret = (f_flags != O_RDWR) ? -EINVAL : bpf_prog_new_fd(raw);
else if (type == BPF_TYPE_MAP)
ret = bpf_map_new_fd(raw, f_flags);
else if (type == BPF_TYPE_LINK)
- ret = bpf_link_new_fd(raw);
+ ret = (f_flags != O_RDWR) ? -EINVAL : bpf_link_new_fd(raw);
else
return -ENOENT;
diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c
index 79c5772465f1..53736e52c1df 100644
--- a/kernel/bpf/preload/bpf_preload_kern.c
+++ b/kernel/bpf/preload/bpf_preload_kern.c
@@ -60,9 +60,12 @@ static int finish(void)
&magic, sizeof(magic), &pos);
if (n != sizeof(magic))
return -EPIPE;
+
tgid = umd_ops.info.tgid;
- wait_event(tgid->wait_pidfd, thread_group_exited(tgid));
- umd_ops.info.tgid = NULL;
+ if (tgid) {
+ wait_event(tgid->wait_pidfd, thread_group_exited(tgid));
+ umd_cleanup_helper(&umd_ops.info);
+ }
return 0;
}
@@ -80,10 +83,18 @@ static int __init load_umd(void)
static void __exit fini_umd(void)
{
+ struct pid *tgid;
+
bpf_preload_ops = NULL;
+
/* kill UMD in case it's still there due to earlier error */
- kill_pid(umd_ops.info.tgid, SIGKILL, 1);
- umd_ops.info.tgid = NULL;
+ tgid = umd_ops.info.tgid;
+ if (tgid) {
+ kill_pid(tgid, SIGKILL, 1);
+
+ wait_event(tgid->wait_pidfd, thread_group_exited(tgid));
+ umd_cleanup_helper(&umd_ops.info);
+ }
umd_unload_blob(&umd_ops.info);
}
late_initcall(load_umd);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index be35bfb7fb13..6fbc2abe9c91 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -517,9 +517,17 @@ const struct bpf_func_proto bpf_get_stack_proto = {
BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
u32, size, u64, flags)
{
- struct pt_regs *regs = task_pt_regs(task);
+ struct pt_regs *regs;
+ long res;
- return __bpf_get_stack(regs, task, NULL, buf, size, flags);
+ if (!try_get_task_stack(task))
+ return -EFAULT;
+
+ regs = task_pt_regs(task);
+ res = __bpf_get_stack(regs, task, NULL, buf, size, flags);
+ put_task_stack(task);
+
+ return res;
}
BTF_ID_LIST_SINGLE(bpf_get_task_stack_btf_ids, struct, task_struct)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c859bc46d06c..250503482cda 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -854,6 +854,11 @@ static int map_create(union bpf_attr *attr)
err = PTR_ERR(btf);
goto free_map;
}
+ if (btf_is_kernel(btf)) {
+ btf_put(btf);
+ err = -EACCES;
+ goto free_map;
+ }
map->btf = btf;
if (attr->btf_value_type_id) {
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 7bc3b3209224..4aa8b52adf25 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -9,6 +9,7 @@
#include <linux/btf.h>
#include <linux/rcupdate_trace.h>
#include <linux/rcupdate_wait.h>
+#include <linux/module.h>
/* dummy _ops. The verifier will operate on target program's ops. */
const struct bpf_verifier_ops bpf_extension_verifier_ops = {
@@ -57,19 +58,10 @@ void bpf_image_ksym_del(struct bpf_ksym *ksym)
PAGE_SIZE, true, ksym->name);
}
-static void bpf_trampoline_ksym_add(struct bpf_trampoline *tr)
-{
- struct bpf_ksym *ksym = &tr->ksym;
-
- snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", tr->key);
- bpf_image_ksym_add(tr->image, ksym);
-}
-
static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
{
struct bpf_trampoline *tr;
struct hlist_head *head;
- void *image;
int i;
mutex_lock(&trampoline_mutex);
@@ -84,14 +76,6 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
if (!tr)
goto out;
- /* is_root was checked earlier. No need for bpf_jit_charge_modmem() */
- image = bpf_jit_alloc_exec_page();
- if (!image) {
- kfree(tr);
- tr = NULL;
- goto out;
- }
-
tr->key = key;
INIT_HLIST_NODE(&tr->hlist);
hlist_add_head(&tr->hlist, head);
@@ -99,14 +83,31 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
mutex_init(&tr->mutex);
for (i = 0; i < BPF_TRAMP_MAX; i++)
INIT_HLIST_HEAD(&tr->progs_hlist[i]);
- tr->image = image;
- INIT_LIST_HEAD_RCU(&tr->ksym.lnode);
- bpf_trampoline_ksym_add(tr);
out:
mutex_unlock(&trampoline_mutex);
return tr;
}
+static int bpf_trampoline_module_get(struct bpf_trampoline *tr)
+{
+ struct module *mod;
+ int err = 0;
+
+ preempt_disable();
+ mod = __module_text_address((unsigned long) tr->func.addr);
+ if (mod && !try_module_get(mod))
+ err = -ENOENT;
+ preempt_enable();
+ tr->mod = mod;
+ return err;
+}
+
+static void bpf_trampoline_module_put(struct bpf_trampoline *tr)
+{
+ module_put(tr->mod);
+ tr->mod = NULL;
+}
+
static int is_ftrace_location(void *ip)
{
long addr;
@@ -128,6 +129,9 @@ static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
ret = unregister_ftrace_direct((long)ip, (long)old_addr);
else
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
+
+ if (!ret)
+ bpf_trampoline_module_put(tr);
return ret;
}
@@ -154,10 +158,16 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
return ret;
tr->func.ftrace_managed = ret;
+ if (bpf_trampoline_module_get(tr))
+ return -ENOENT;
+
if (tr->func.ftrace_managed)
ret = register_ftrace_direct((long)ip, (long)new_addr);
else
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
+
+ if (ret)
+ bpf_trampoline_module_put(tr);
return ret;
}
@@ -185,10 +195,142 @@ bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total)
return tprogs;
}
+static void __bpf_tramp_image_put_deferred(struct work_struct *work)
+{
+ struct bpf_tramp_image *im;
+
+ im = container_of(work, struct bpf_tramp_image, work);
+ bpf_image_ksym_del(&im->ksym);
+ bpf_jit_free_exec(im->image);
+ bpf_jit_uncharge_modmem(1);
+ percpu_ref_exit(&im->pcref);
+ kfree_rcu(im, rcu);
+}
+
+/* callback, fexit step 3 or fentry step 2 */
+static void __bpf_tramp_image_put_rcu(struct rcu_head *rcu)
+{
+ struct bpf_tramp_image *im;
+
+ im = container_of(rcu, struct bpf_tramp_image, rcu);
+ INIT_WORK(&im->work, __bpf_tramp_image_put_deferred);
+ schedule_work(&im->work);
+}
+
+/* callback, fexit step 2. Called after percpu_ref_kill confirms. */
+static void __bpf_tramp_image_release(struct percpu_ref *pcref)
+{
+ struct bpf_tramp_image *im;
+
+ im = container_of(pcref, struct bpf_tramp_image, pcref);
+ call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu);
+}
+
+/* callback, fexit or fentry step 1 */
+static void __bpf_tramp_image_put_rcu_tasks(struct rcu_head *rcu)
+{
+ struct bpf_tramp_image *im;
+
+ im = container_of(rcu, struct bpf_tramp_image, rcu);
+ if (im->ip_after_call)
+ /* the case of fmod_ret/fexit trampoline and CONFIG_PREEMPTION=y */
+ percpu_ref_kill(&im->pcref);
+ else
+ /* the case of fentry trampoline */
+ call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu);
+}
+
+static void bpf_tramp_image_put(struct bpf_tramp_image *im)
+{
+ /* The trampoline image that calls original function is using:
+ * rcu_read_lock_trace to protect sleepable bpf progs
+ * rcu_read_lock to protect normal bpf progs
+ * percpu_ref to protect trampoline itself
+ * rcu tasks to protect trampoline asm not covered by percpu_ref
+ * (which are few asm insns before __bpf_tramp_enter and
+ * after __bpf_tramp_exit)
+ *
+ * The trampoline is unreachable before bpf_tramp_image_put().
+ *
+ * First, patch the trampoline to avoid calling into fexit progs.
+ * The progs will be freed even if the original function is still
+ * executing or sleeping.
+ * In case of CONFIG_PREEMPT=y use call_rcu_tasks() to wait on
+ * first few asm instructions to execute and call into
+ * __bpf_tramp_enter->percpu_ref_get.
+ * Then use percpu_ref_kill to wait for the trampoline and the original
+ * function to finish.
+ * Then use call_rcu_tasks() to make sure few asm insns in
+ * the trampoline epilogue are done as well.
+ *
+ * In !PREEMPT case the task that got interrupted in the first asm
+ * insns won't go through an RCU quiescent state which the
+ * percpu_ref_kill will be waiting for. Hence the first
+ * call_rcu_tasks() is not necessary.
+ */
+ if (im->ip_after_call) {
+ int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP,
+ NULL, im->ip_epilogue);
+ WARN_ON(err);
+ if (IS_ENABLED(CONFIG_PREEMPTION))
+ call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
+ else
+ percpu_ref_kill(&im->pcref);
+ return;
+ }
+
+ /* The trampoline without fexit and fmod_ret progs doesn't call original
+ * function and doesn't use percpu_ref.
+ * Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
+ * Then use call_rcu_tasks() to wait for the rest of trampoline asm
+ * and normal progs.
+ */
+ call_rcu_tasks_trace(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
+}
+
+static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
+{
+ struct bpf_tramp_image *im;
+ struct bpf_ksym *ksym;
+ void *image;
+ int err = -ENOMEM;
+
+ im = kzalloc(sizeof(*im), GFP_KERNEL);
+ if (!im)
+ goto out;
+
+ err = bpf_jit_charge_modmem(1);
+ if (err)
+ goto out_free_im;
+
+ err = -ENOMEM;
+ im->image = image = bpf_jit_alloc_exec_page();
+ if (!image)
+ goto out_uncharge;
+
+ err = percpu_ref_init(&im->pcref, __bpf_tramp_image_release, 0, GFP_KERNEL);
+ if (err)
+ goto out_free_image;
+
+ ksym = &im->ksym;
+ INIT_LIST_HEAD_RCU(&ksym->lnode);
+ snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu_%u", key, idx);
+ bpf_image_ksym_add(image, ksym);
+ return im;
+
+out_free_image:
+ bpf_jit_free_exec(im->image);
+out_uncharge:
+ bpf_jit_uncharge_modmem(1);
+out_free_im:
+ kfree(im);
+out:
+ return ERR_PTR(err);
+}
+
static int bpf_trampoline_update(struct bpf_trampoline *tr)
{
- void *old_image = tr->image + ((tr->selector + 1) & 1) * PAGE_SIZE/2;
- void *new_image = tr->image + (tr->selector & 1) * PAGE_SIZE/2;
+ struct bpf_tramp_image *im;
struct bpf_tramp_progs *tprogs;
u32 flags = BPF_TRAMP_F_RESTORE_REGS;
int err, total;
@@ -198,41 +340,42 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
return PTR_ERR(tprogs);
if (total == 0) {
- err = unregister_fentry(tr, old_image);
+ err = unregister_fentry(tr, tr->cur_image->image);
+ bpf_tramp_image_put(tr->cur_image);
+ tr->cur_image = NULL;
tr->selector = 0;
goto out;
}
+ im = bpf_tramp_image_alloc(tr->key, tr->selector);
+ if (IS_ERR(im)) {
+ err = PTR_ERR(im);
+ goto out;
+ }
+
if (tprogs[BPF_TRAMP_FEXIT].nr_progs ||
tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs)
flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;
- /* Though the second half of trampoline page is unused a task could be
- * preempted in the middle of the first half of trampoline and two
- * updates to trampoline would change the code from underneath the
- * preempted task. Hence wait for tasks to voluntarily schedule or go
- * to userspace.
- * The same trampoline can hold both sleepable and non-sleepable progs.
- * synchronize_rcu_tasks_trace() is needed to make sure all sleepable
- * programs finish executing.
- * Wait for these two grace periods together.
- */
- synchronize_rcu_mult(call_rcu_tasks, call_rcu_tasks_trace);
-
- err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2,
+ err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE,
&tr->func.model, flags, tprogs,
tr->func.addr);
if (err < 0)
goto out;
- if (tr->selector)
+ WARN_ON(tr->cur_image && tr->selector == 0);
+ WARN_ON(!tr->cur_image && tr->selector);
+ if (tr->cur_image)
/* progs already running at this address */
- err = modify_fentry(tr, old_image, new_image);
+ err = modify_fentry(tr, tr->cur_image->image, im->image);
else
/* first time registering */
- err = register_fentry(tr, new_image);
+ err = register_fentry(tr, im->image);
if (err)
goto out;
+ if (tr->cur_image)
+ bpf_tramp_image_put(tr->cur_image);
+ tr->cur_image = im;
tr->selector++;
out:
kfree(tprogs);
@@ -364,17 +507,12 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)
goto out;
if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT])))
goto out;
- bpf_image_ksym_del(&tr->ksym);
- /* This code will be executed when all bpf progs (both sleepable and
- * non-sleepable) went through
- * bpf_prog_put()->call_rcu[_tasks_trace]()->bpf_prog_free_deferred().
- * Hence no need for another synchronize_rcu_tasks_trace() here,
- * but synchronize_rcu_tasks() is still needed, since trampoline
- * may not have had any sleepable programs and we need to wait
- * for tasks to get out of trampoline code before freeing it.
+ /* This code will be executed even when the last bpf_tramp_image
+ * is alive. All progs are detached from the trampoline and the
+ * trampoline image is patched with jmp into epilogue to skip
+ * fexit progs. The fentry-only trampoline will be freed via
+ * multiple rcu callbacks.
*/
- synchronize_rcu_tasks();
- bpf_jit_free_exec(tr->image);
hlist_del(&tr->hlist);
kfree(tr);
out:
@@ -478,8 +616,18 @@ void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start)
rcu_read_unlock_trace();
}
+void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr)
+{
+ percpu_ref_get(&tr->pcref);
+}
+
+void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr)
+{
+ percpu_ref_put(&tr->pcref);
+}
+
int __weak
-arch_prepare_bpf_trampoline(void *image, void *image_end,
+arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end,
const struct btf_func_model *m, u32 flags,
struct bpf_tramp_progs *tprogs,
void *orig_call)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1dda9d81f12c..0399ac092b36 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -504,6 +504,13 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id)
func_id == BPF_FUNC_skc_to_tcp_request_sock;
}
+static bool is_cmpxchg_insn(const struct bpf_insn *insn)
+{
+ return BPF_CLASS(insn->code) == BPF_STX &&
+ BPF_MODE(insn->code) == BPF_ATOMIC &&
+ insn->imm == BPF_CMPXCHG;
+}
+
/* string representation of 'enum bpf_reg_type' */
static const char * const reg_type_str[] = {
[NOT_INIT] = "?",
@@ -1120,7 +1127,7 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
reg->type = PTR_TO_RDWR_BUF;
break;
default:
- WARN_ON("unknown nullable register type");
+ WARN_ONCE(1, "unknown nullable register type");
}
}
@@ -1703,7 +1710,11 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
if (class == BPF_STX) {
- if (reg->type != SCALAR_VALUE)
+ /* BPF_STX (including atomic variants) has multiple source
+ * operands, one of which is a ptr. Check whether the caller is
+ * asking about it.
+ */
+ if (t == SRC_OP && reg->type != SCALAR_VALUE)
return true;
return BPF_SIZE(code) == BPF_DW;
}
@@ -1735,22 +1746,38 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,
return true;
}
-/* Return TRUE if INSN doesn't have explicit value define. */
-static bool insn_no_def(struct bpf_insn *insn)
+/* Return the regno defined by the insn, or -1. */
+static int insn_def_regno(const struct bpf_insn *insn)
{
- u8 class = BPF_CLASS(insn->code);
-
- return (class == BPF_JMP || class == BPF_JMP32 ||
- class == BPF_STX || class == BPF_ST);
+ switch (BPF_CLASS(insn->code)) {
+ case BPF_JMP:
+ case BPF_JMP32:
+ case BPF_ST:
+ return -1;
+ case BPF_STX:
+ if (BPF_MODE(insn->code) == BPF_ATOMIC &&
+ (insn->imm & BPF_FETCH)) {
+ if (insn->imm == BPF_CMPXCHG)
+ return BPF_REG_0;
+ else
+ return insn->src_reg;
+ } else {
+ return -1;
+ }
+ default:
+ return insn->dst_reg;
+ }
}
/* Return TRUE if INSN has defined any 32-bit value explicitly. */
static bool insn_has_def32(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
- if (insn_no_def(insn))
+ int dst_reg = insn_def_regno(insn);
+
+ if (dst_reg == -1)
return false;
- return !is_reg64(env, insn, insn->dst_reg, NULL, DST_OP);
+ return !is_reg64(env, insn, dst_reg, NULL, DST_OP);
}
static void mark_insn_zext(struct bpf_verifier_env *env,
@@ -5829,35 +5856,51 @@ static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env)
return &env->insn_aux_data[env->insn_idx];
}
+enum {
+ REASON_BOUNDS = -1,
+ REASON_TYPE = -2,
+ REASON_PATHS = -3,
+ REASON_LIMIT = -4,
+ REASON_STACK = -5,
+};
+
static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
- u32 *ptr_limit, u8 opcode, bool off_is_neg)
+ const struct bpf_reg_state *off_reg,
+ u32 *alu_limit, u8 opcode)
{
+ bool off_is_neg = off_reg->smin_value < 0;
bool mask_to_left = (opcode == BPF_ADD && off_is_neg) ||
(opcode == BPF_SUB && !off_is_neg);
- u32 off;
+ u32 max = 0, ptr_limit = 0;
+
+ if (!tnum_is_const(off_reg->var_off) &&
+ (off_reg->smin_value < 0) != (off_reg->smax_value < 0))
+ return REASON_BOUNDS;
switch (ptr_reg->type) {
case PTR_TO_STACK:
- /* Indirect variable offset stack access is prohibited in
- * unprivileged mode so it's not handled here.
+ /* Offset 0 is out-of-bounds, but acceptable start for the
+ * left direction, see BPF_REG_FP. Also, unknown scalar
+ * offset where we would need to deal with min/max bounds is
+ * currently prohibited for unprivileged.
*/
- off = ptr_reg->off + ptr_reg->var_off.value;
- if (mask_to_left)
- *ptr_limit = MAX_BPF_STACK + off;
- else
- *ptr_limit = -off;
- return 0;
+ max = MAX_BPF_STACK + mask_to_left;
+ ptr_limit = -(ptr_reg->var_off.value + ptr_reg->off);
+ break;
case PTR_TO_MAP_VALUE:
- if (mask_to_left) {
- *ptr_limit = ptr_reg->umax_value + ptr_reg->off;
- } else {
- off = ptr_reg->smin_value + ptr_reg->off;
- *ptr_limit = ptr_reg->map_ptr->value_size - off;
- }
- return 0;
+ max = ptr_reg->map_ptr->value_size;
+ ptr_limit = (mask_to_left ?
+ ptr_reg->smin_value :
+ ptr_reg->umax_value) + ptr_reg->off;
+ break;
default:
- return -EINVAL;
+ return REASON_TYPE;
}
+
+ if (ptr_limit >= max)
+ return REASON_LIMIT;
+ *alu_limit = ptr_limit;
+ return 0;
}
static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env,
@@ -5875,7 +5918,7 @@ static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux,
if (aux->alu_state &&
(aux->alu_state != alu_state ||
aux->alu_limit != alu_limit))
- return -EACCES;
+ return REASON_PATHS;
/* Corresponding fixup done in fixup_bpf_calls(). */
aux->alu_state = alu_state;
@@ -5894,19 +5937,28 @@ static int sanitize_val_alu(struct bpf_verifier_env *env,
return update_alu_sanitation_state(aux, BPF_ALU_NON_POINTER, 0);
}
+static bool sanitize_needed(u8 opcode)
+{
+ return opcode == BPF_ADD || opcode == BPF_SUB;
+}
+
static int sanitize_ptr_alu(struct bpf_verifier_env *env,
struct bpf_insn *insn,
const struct bpf_reg_state *ptr_reg,
+ const struct bpf_reg_state *off_reg,
struct bpf_reg_state *dst_reg,
- bool off_is_neg)
+ struct bpf_insn_aux_data *tmp_aux,
+ const bool commit_window)
{
+ struct bpf_insn_aux_data *aux = commit_window ? cur_aux(env) : tmp_aux;
struct bpf_verifier_state *vstate = env->cur_state;
- struct bpf_insn_aux_data *aux = cur_aux(env);
+ bool off_is_neg = off_reg->smin_value < 0;
bool ptr_is_dst_reg = ptr_reg == dst_reg;
u8 opcode = BPF_OP(insn->code);
u32 alu_state, alu_limit;
struct bpf_reg_state tmp;
bool ret;
+ int err;
if (can_skip_alu_sanitation(env, insn))
return 0;
@@ -5918,15 +5970,33 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
if (vstate->speculative)
goto do_sim;
- alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0;
- alu_state |= ptr_is_dst_reg ?
- BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST;
+ err = retrieve_ptr_limit(ptr_reg, off_reg, &alu_limit, opcode);
+ if (err < 0)
+ return err;
- if (retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg))
- return 0;
- if (update_alu_sanitation_state(aux, alu_state, alu_limit))
- return -EACCES;
+ if (commit_window) {
+ /* In commit phase we narrow the masking window based on
+ * the observed pointer move after the simulated operation.
+ */
+ alu_state = tmp_aux->alu_state;
+ alu_limit = abs(tmp_aux->alu_limit - alu_limit);
+ } else {
+ alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0;
+ alu_state |= ptr_is_dst_reg ?
+ BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST;
+ }
+
+ err = update_alu_sanitation_state(aux, alu_state, alu_limit);
+ if (err < 0)
+ return err;
do_sim:
+ /* If we're in commit phase, we're done here given we already
+ * pushed the truncated dst_reg into the speculative verification
+ * stack.
+ */
+ if (commit_window)
+ return 0;
+
/* Simulate and find potential out-of-bounds access under
* speculative execution from truncation as a result of
* masking when off was not within expected range. If off
@@ -5943,7 +6013,46 @@ do_sim:
ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true);
if (!ptr_is_dst_reg && ret)
*dst_reg = tmp;
- return !ret ? -EFAULT : 0;
+ return !ret ? REASON_STACK : 0;
+}
+
+static int sanitize_err(struct bpf_verifier_env *env,
+ const struct bpf_insn *insn, int reason,
+ const struct bpf_reg_state *off_reg,
+ const struct bpf_reg_state *dst_reg)
+{
+ static const char *err = "pointer arithmetic with it prohibited for !root";
+ const char *op = BPF_OP(insn->code) == BPF_ADD ? "add" : "sub";
+ u32 dst = insn->dst_reg, src = insn->src_reg;
+
+ switch (reason) {
+ case REASON_BOUNDS:
+ verbose(env, "R%d has unknown scalar with mixed signed bounds, %s\n",
+ off_reg == dst_reg ? dst : src, err);
+ break;
+ case REASON_TYPE:
+ verbose(env, "R%d has pointer with unsupported alu operation, %s\n",
+ off_reg == dst_reg ? src : dst, err);
+ break;
+ case REASON_PATHS:
+ verbose(env, "R%d tried to %s from different maps, paths or scalars, %s\n",
+ dst, op, err);
+ break;
+ case REASON_LIMIT:
+ verbose(env, "R%d tried to %s beyond pointer bounds, %s\n",
+ dst, op, err);
+ break;
+ case REASON_STACK:
+ verbose(env, "R%d could not be pushed for speculative verification, %s\n",
+ dst, err);
+ break;
+ default:
+ verbose(env, "verifier internal error: unknown reason (%d)\n",
+ reason);
+ break;
+ }
+
+ return -EACCES;
}
/* check that stack access falls within stack limits and that 'reg' doesn't
@@ -5980,6 +6089,37 @@ static int check_stack_access_for_ptr_arithmetic(
return 0;
}
+static int sanitize_check_bounds(struct bpf_verifier_env *env,
+ const struct bpf_insn *insn,
+ const struct bpf_reg_state *dst_reg)
+{
+ u32 dst = insn->dst_reg;
+
+ /* For unprivileged we require that resulting offset must be in bounds
+ * in order to be able to sanitize access later on.
+ */
+ if (env->bypass_spec_v1)
+ return 0;
+
+ switch (dst_reg->type) {
+ case PTR_TO_STACK:
+ if (check_stack_access_for_ptr_arithmetic(env, dst, dst_reg,
+ dst_reg->off + dst_reg->var_off.value))
+ return -EACCES;
+ break;
+ case PTR_TO_MAP_VALUE:
+ if (check_map_access(env, dst, dst_reg->off, 1, false)) {
+ verbose(env, "R%d pointer arithmetic of map value goes out of range, "
+ "prohibited for !root\n", dst);
+ return -EACCES;
+ }
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
/* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off.
* Caller should also handle BPF_MOV case separately.
@@ -5999,8 +6139,9 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value;
u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value,
umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value;
- u32 dst = insn->dst_reg, src = insn->src_reg;
+ struct bpf_insn_aux_data tmp_aux = {};
u8 opcode = BPF_OP(insn->code);
+ u32 dst = insn->dst_reg;
int ret;
dst_reg = &regs[dst];
@@ -6048,13 +6189,6 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
verbose(env, "R%d pointer arithmetic on %s prohibited\n",
dst, reg_type_str[ptr_reg->type]);
return -EACCES;
- case PTR_TO_MAP_VALUE:
- if (!env->allow_ptr_leaks && !known && (smin_val < 0) != (smax_val < 0)) {
- verbose(env, "R%d has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root\n",
- off_reg == dst_reg ? dst : src);
- return -EACCES;
- }
- fallthrough;
default:
break;
}
@@ -6072,13 +6206,15 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
/* pointer types do not carry 32-bit bounds at the moment. */
__mark_reg32_unbounded(dst_reg);
+ if (sanitize_needed(opcode)) {
+ ret = sanitize_ptr_alu(env, insn, ptr_reg, off_reg, dst_reg,
+ &tmp_aux, false);
+ if (ret < 0)
+ return sanitize_err(env, insn, ret, off_reg, dst_reg);
+ }
+
switch (opcode) {
case BPF_ADD:
- ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0);
- if (ret < 0) {
- verbose(env, "R%d tried to add from different maps or paths\n", dst);
- return ret;
- }
/* We can take a fixed offset as long as it doesn't overflow
* the s32 'off' field
*/
@@ -6129,11 +6265,6 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
}
break;
case BPF_SUB:
- ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0);
- if (ret < 0) {
- verbose(env, "R%d tried to sub from different maps or paths\n", dst);
- return ret;
- }
if (dst_reg == off_reg) {
/* scalar -= pointer. Creates an unknown scalar */
verbose(env, "R%d tried to subtract pointer from scalar\n",
@@ -6214,21 +6345,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
__reg_deduce_bounds(dst_reg);
__reg_bound_offset(dst_reg);
- /* For unprivileged we require that resulting offset must be in bounds
- * in order to be able to sanitize access later on.
- */
- if (!env->bypass_spec_v1) {
- if (dst_reg->type == PTR_TO_MAP_VALUE &&
- check_map_access(env, dst, dst_reg->off, 1, false)) {
- verbose(env, "R%d pointer arithmetic of map value goes out of range, "
- "prohibited for !root\n", dst);
- return -EACCES;
- } else if (dst_reg->type == PTR_TO_STACK &&
- check_stack_access_for_ptr_arithmetic(
- env, dst, dst_reg, dst_reg->off +
- dst_reg->var_off.value)) {
- return -EACCES;
- }
+ if (sanitize_check_bounds(env, insn, dst_reg) < 0)
+ return -EACCES;
+ if (sanitize_needed(opcode)) {
+ ret = sanitize_ptr_alu(env, insn, dst_reg, off_reg, dst_reg,
+ &tmp_aux, true);
+ if (ret < 0)
+ return sanitize_err(env, insn, ret, off_reg, dst_reg);
}
return 0;
@@ -6822,9 +6945,8 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
s32 s32_min_val, s32_max_val;
u32 u32_min_val, u32_max_val;
u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32;
- u32 dst = insn->dst_reg;
- int ret;
bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64);
+ int ret;
smin_val = src_reg.smin_value;
smax_val = src_reg.smax_value;
@@ -6866,6 +6988,12 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
return 0;
}
+ if (sanitize_needed(opcode)) {
+ ret = sanitize_val_alu(env, insn);
+ if (ret < 0)
+ return sanitize_err(env, insn, ret, NULL, NULL);
+ }
+
/* Calculate sign/unsigned bounds and tnum for alu32 and alu64 bit ops.
* There are two classes of instructions: The first class we track both
* alu32 and alu64 sign/unsigned bounds independently this provides the
@@ -6882,21 +7010,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
*/
switch (opcode) {
case BPF_ADD:
- ret = sanitize_val_alu(env, insn);
- if (ret < 0) {
- verbose(env, "R%d tried to add from different pointers or scalars\n", dst);
- return ret;
- }
scalar32_min_max_add(dst_reg, &src_reg);
scalar_min_max_add(dst_reg, &src_reg);
dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off);
break;
case BPF_SUB:
- ret = sanitize_val_alu(env, insn);
- if (ret < 0) {
- verbose(env, "R%d tried to sub from different pointers or scalars\n", dst);
- return ret;
- }
scalar32_min_max_sub(dst_reg, &src_reg);
scalar_min_max_sub(dst_reg, &src_reg);
dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off);
@@ -9029,6 +9147,10 @@ static int check_btf_info(struct bpf_verifier_env *env,
btf = btf_get_by_fd(attr->prog_btf_fd);
if (IS_ERR(btf))
return PTR_ERR(btf);
+ if (btf_is_kernel(btf)) {
+ btf_put(btf);
+ return -EACCES;
+ }
env->prog->aux->btf = btf;
err = check_btf_func(env, attr, uattr);
@@ -11006,9 +11128,10 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
for (i = 0; i < len; i++) {
int adj_idx = i + delta;
struct bpf_insn insn;
- u8 load_reg;
+ int load_reg;
insn = insns[adj_idx];
+ load_reg = insn_def_regno(&insn);
if (!aux[adj_idx].zext_dst) {
u8 code, class;
u32 imm_rnd;
@@ -11018,14 +11141,14 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
code = insn.code;
class = BPF_CLASS(code);
- if (insn_no_def(&insn))
+ if (load_reg == -1)
continue;
/* NOTE: arg "reg" (the fourth one) is only used for
- * BPF_STX which has been ruled out in above
- * check, it is safe to pass NULL here.
+ * BPF_STX + SRC_OP, so it is safe to pass NULL
+ * here.
*/
- if (is_reg64(env, &insn, insn.dst_reg, NULL, DST_OP)) {
+ if (is_reg64(env, &insn, load_reg, NULL, DST_OP)) {
if (class == BPF_LD &&
BPF_MODE(code) == BPF_IMM)
i++;
@@ -11040,31 +11163,28 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
imm_rnd = get_random_int();
rnd_hi32_patch[0] = insn;
rnd_hi32_patch[1].imm = imm_rnd;
- rnd_hi32_patch[3].dst_reg = insn.dst_reg;
+ rnd_hi32_patch[3].dst_reg = load_reg;
patch = rnd_hi32_patch;
patch_len = 4;
goto apply_patch_buffer;
}
- if (!bpf_jit_needs_zext())
+ /* Add in an zero-extend instruction if a) the JIT has requested
+ * it or b) it's a CMPXCHG.
+ *
+ * The latter is because: BPF_CMPXCHG always loads a value into
+ * R0, therefore always zero-extends. However some archs'
+ * equivalent instruction only does this load when the
+ * comparison is successful. This detail of CMPXCHG is
+ * orthogonal to the general zero-extension behaviour of the
+ * CPU, so it's treated independently of bpf_jit_needs_zext.
+ */
+ if (!bpf_jit_needs_zext() && !is_cmpxchg_insn(&insn))
continue;
- /* zext_dst means that we want to zero-extend whatever register
- * the insn defines, which is dst_reg most of the time, with
- * the notable exception of BPF_STX + BPF_ATOMIC + BPF_FETCH.
- */
- if (BPF_CLASS(insn.code) == BPF_STX &&
- BPF_MODE(insn.code) == BPF_ATOMIC) {
- /* BPF_STX + BPF_ATOMIC insns without BPF_FETCH do not
- * define any registers, therefore zext_dst cannot be
- * set.
- */
- if (WARN_ON(!(insn.imm & BPF_FETCH)))
- return -EINVAL;
- load_reg = insn.imm == BPF_CMPXCHG ? BPF_REG_0
- : insn.src_reg;
- } else {
- load_reg = insn.dst_reg;
+ if (WARN_ON(load_reg == -1)) {
+ verbose(env, "verifier bug. zext_dst is set, but no reg is defined\n");
+ return -EFAULT;
}
zext_patch[0] = insn;
@@ -11635,7 +11755,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
off_reg = issrc ? insn->src_reg : insn->dst_reg;
if (isneg)
*patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
- *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit - 1);
+ *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);
*patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg);
*patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg);
*patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0);
@@ -12120,6 +12240,11 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
u32 btf_id, member_idx;
const char *mname;
+ if (!prog->gpl_compatible) {
+ verbose(env, "struct ops programs must have a GPL compatible license\n");
+ return -EINVAL;
+ }
+
btf_id = prog->aux->attach_btf_id;
st_ops = bpf_struct_ops_find(btf_id);
if (!st_ops) {
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0aeca5f3c0ac..03db40f6cba9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -386,6 +386,7 @@ static DEFINE_MUTEX(perf_sched_mutex);
static atomic_t perf_sched_count;
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
+static DEFINE_PER_CPU(int, perf_sched_cb_usages);
static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
static atomic_t nr_mmap_events __read_mostly;
@@ -3461,11 +3462,16 @@ unlock:
}
}
+static DEFINE_PER_CPU(struct list_head, sched_cb_list);
+
void perf_sched_cb_dec(struct pmu *pmu)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
- --cpuctx->sched_cb_usage;
+ this_cpu_dec(perf_sched_cb_usages);
+
+ if (!--cpuctx->sched_cb_usage)
+ list_del(&cpuctx->sched_cb_entry);
}
@@ -3473,7 +3479,10 @@ void perf_sched_cb_inc(struct pmu *pmu)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
- cpuctx->sched_cb_usage++;
+ if (!cpuctx->sched_cb_usage++)
+ list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+
+ this_cpu_inc(perf_sched_cb_usages);
}
/*
@@ -3502,6 +3511,24 @@ static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
+static void perf_pmu_sched_task(struct task_struct *prev,
+ struct task_struct *next,
+ bool sched_in)
+{
+ struct perf_cpu_context *cpuctx;
+
+ if (prev == next)
+ return;
+
+ list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
+ /* will be handled in perf_event_context_sched_in/out */
+ if (cpuctx->task_ctx)
+ continue;
+
+ __perf_pmu_sched_task(cpuctx, sched_in);
+ }
+}
+
static void perf_event_switch(struct task_struct *task,
struct task_struct *next_prev, bool sched_in);
@@ -3524,6 +3551,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
{
int ctxn;
+ if (__this_cpu_read(perf_sched_cb_usages))
+ perf_pmu_sched_task(task, next, false);
+
if (atomic_read(&nr_switch_events))
perf_event_switch(task, next, false);
@@ -3832,6 +3862,9 @@ void __perf_event_task_sched_in(struct task_struct *prev,
if (atomic_read(&nr_switch_events))
perf_event_switch(task, prev, true);
+
+ if (__this_cpu_read(perf_sched_cb_usages))
+ perf_pmu_sched_task(prev, task, true);
}
static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -4656,7 +4689,7 @@ static void unaccount_event(struct perf_event *event)
if (event->parent)
return;
- if (event->attach_state & PERF_ATTACH_TASK)
+ if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
dec = true;
if (event->attr.mmap || event->attr.mmap_data)
atomic_dec(&nr_mmap_events);
@@ -11175,7 +11208,7 @@ static void account_event(struct perf_event *event)
if (event->parent)
return;
- if (event->attach_state & PERF_ATTACH_TASK)
+ if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
inc = true;
if (event->attr.mmap || event->attr.mmap_data)
atomic_inc(&nr_mmap_events);
@@ -12972,6 +13005,7 @@ static void __init perf_event_init_all_cpus(void)
#ifdef CONFIG_CGROUP_PERF
INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
#endif
+ INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
}
}
diff --git a/kernel/fork.c b/kernel/fork.c
index d3171e8e88e5..426cd0c51f9e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -994,6 +994,13 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
#endif
}
+static void mm_init_pasid(struct mm_struct *mm)
+{
+#ifdef CONFIG_IOMMU_SUPPORT
+ mm->pasid = INIT_PASID;
+#endif
+}
+
static void mm_init_uprobes_state(struct mm_struct *mm)
{
#ifdef CONFIG_UPROBES
@@ -1024,6 +1031,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm_init_cpumask(mm);
mm_init_aio(mm);
mm_init_owner(mm, p);
+ mm_init_pasid(mm);
RCU_INIT_POINTER(mm->exe_file, NULL);
mmu_notifier_subscriptions_init(mm);
init_tlb_flush_pending(mm);
@@ -1940,8 +1948,14 @@ static __latent_entropy struct task_struct *copy_process(
p = dup_task_struct(current, node);
if (!p)
goto fork_out;
- if (args->io_thread)
+ if (args->io_thread) {
+ /*
+ * Mark us an IO worker, and block any signal that isn't
+ * fatal or STOP
+ */
p->flags |= PF_IO_WORKER;
+ siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
+ }
/*
* This _must_ happen before we call free_task(), i.e. before we jump
@@ -2430,14 +2444,8 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
.stack_size = (unsigned long)arg,
.io_thread = 1,
};
- struct task_struct *tsk;
- tsk = copy_process(NULL, 0, node, &args);
- if (!IS_ERR(tsk)) {
- sigfillset(&tsk->blocked);
- sigdelsetmask(&tsk->blocked, sigmask(SIGKILL));
- }
- return tsk;
+ return copy_process(NULL, 0, node, &args);
}
/*
diff --git a/kernel/futex.c b/kernel/futex.c
index e68db7745039..00febd6dea9c 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2728,14 +2728,13 @@ retry:
goto out;
restart = &current->restart_block;
- restart->fn = futex_wait_restart;
restart->futex.uaddr = uaddr;
restart->futex.val = val;
restart->futex.time = *abs_time;
restart->futex.bitset = bitset;
restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
- ret = -ERESTART_RESTARTBLOCK;
+ ret = set_restart_fn(restart, futex_wait_restart);
out:
if (to) {
diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c
index c94b820a1b62..c466c7fbdece 100644
--- a/kernel/gcov/clang.c
+++ b/kernel/gcov/clang.c
@@ -70,12 +70,16 @@ struct gcov_fn_info {
u32 ident;
u32 checksum;
+#if CONFIG_CLANG_VERSION < 110000
u8 use_extra_checksum;
+#endif
u32 cfg_checksum;
u32 num_counters;
u64 *counters;
+#if CONFIG_CLANG_VERSION < 110000
const char *function_name;
+#endif
};
static struct gcov_info *current_info;
@@ -105,6 +109,7 @@ void llvm_gcov_init(llvm_gcov_callback writeout, llvm_gcov_callback flush)
}
EXPORT_SYMBOL(llvm_gcov_init);
+#if CONFIG_CLANG_VERSION < 110000
void llvm_gcda_start_file(const char *orig_filename, const char version[4],
u32 checksum)
{
@@ -113,7 +118,17 @@ void llvm_gcda_start_file(const char *orig_filename, const char version[4],
current_info->checksum = checksum;
}
EXPORT_SYMBOL(llvm_gcda_start_file);
+#else
+void llvm_gcda_start_file(const char *orig_filename, u32 version, u32 checksum)
+{
+ current_info->filename = orig_filename;
+ current_info->version = version;
+ current_info->checksum = checksum;
+}
+EXPORT_SYMBOL(llvm_gcda_start_file);
+#endif
+#if CONFIG_CLANG_VERSION < 110000
void llvm_gcda_emit_function(u32 ident, const char *function_name,
u32 func_checksum, u8 use_extra_checksum, u32 cfg_checksum)
{
@@ -132,6 +147,21 @@ void llvm_gcda_emit_function(u32 ident, const char *function_name,
list_add_tail(&info->head, &current_info->functions);
}
+#else
+void llvm_gcda_emit_function(u32 ident, u32 func_checksum, u32 cfg_checksum)
+{
+ struct gcov_fn_info *info = kzalloc(sizeof(*info), GFP_KERNEL);
+
+ if (!info)
+ return;
+
+ INIT_LIST_HEAD(&info->head);
+ info->ident = ident;
+ info->checksum = func_checksum;
+ info->cfg_checksum = cfg_checksum;
+ list_add_tail(&info->head, &current_info->functions);
+}
+#endif
EXPORT_SYMBOL(llvm_gcda_emit_function);
void llvm_gcda_emit_arcs(u32 num_counters, u64 *counters)
@@ -262,11 +292,16 @@ int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
!list_is_last(&fn_ptr2->head, &info2->functions)) {
if (fn_ptr1->checksum != fn_ptr2->checksum)
return false;
+#if CONFIG_CLANG_VERSION < 110000
if (fn_ptr1->use_extra_checksum != fn_ptr2->use_extra_checksum)
return false;
if (fn_ptr1->use_extra_checksum &&
fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum)
return false;
+#else
+ if (fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum)
+ return false;
+#endif
fn_ptr1 = list_next_entry(fn_ptr1, head);
fn_ptr2 = list_next_entry(fn_ptr2, head);
}
@@ -295,6 +330,7 @@ void gcov_info_add(struct gcov_info *dst, struct gcov_info *src)
}
}
+#if CONFIG_CLANG_VERSION < 110000
static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn)
{
size_t cv_size; /* counter values size */
@@ -322,6 +358,28 @@ err_name:
kfree(fn_dup);
return NULL;
}
+#else
+static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn)
+{
+ size_t cv_size; /* counter values size */
+ struct gcov_fn_info *fn_dup = kmemdup(fn, sizeof(*fn),
+ GFP_KERNEL);
+ if (!fn_dup)
+ return NULL;
+ INIT_LIST_HEAD(&fn_dup->head);
+
+ cv_size = fn->num_counters * sizeof(fn->counters[0]);
+ fn_dup->counters = vmalloc(cv_size);
+ if (!fn_dup->counters) {
+ kfree(fn_dup);
+ return NULL;
+ }
+
+ memcpy(fn_dup->counters, fn->counters, cv_size);
+
+ return fn_dup;
+}
+#endif
/**
* gcov_info_dup - duplicate profiling data set
@@ -362,6 +420,7 @@ err:
* gcov_info_free - release memory for profiling data set duplicate
* @info: profiling data set duplicate to free
*/
+#if CONFIG_CLANG_VERSION < 110000
void gcov_info_free(struct gcov_info *info)
{
struct gcov_fn_info *fn, *tmp;
@@ -375,6 +434,20 @@ void gcov_info_free(struct gcov_info *info)
kfree(info->filename);
kfree(info);
}
+#else
+void gcov_info_free(struct gcov_info *info)
+{
+ struct gcov_fn_info *fn, *tmp;
+
+ list_for_each_entry_safe(fn, tmp, &info->functions, head) {
+ vfree(fn->counters);
+ list_del(&fn->head);
+ kfree(fn);
+ }
+ kfree(info->filename);
+ kfree(info);
+}
+#endif
#define ITER_STRIDE PAGE_SIZE
@@ -460,17 +533,22 @@ static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
list_for_each_entry(fi_ptr, &info->functions, head) {
u32 i;
- u32 len = 2;
-
- if (fi_ptr->use_extra_checksum)
- len++;
pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION);
- pos += store_gcov_u32(buffer, pos, len);
+#if CONFIG_CLANG_VERSION < 110000
+ pos += store_gcov_u32(buffer, pos,
+ fi_ptr->use_extra_checksum ? 3 : 2);
+#else
+ pos += store_gcov_u32(buffer, pos, 3);
+#endif
pos += store_gcov_u32(buffer, pos, fi_ptr->ident);
pos += store_gcov_u32(buffer, pos, fi_ptr->checksum);
+#if CONFIG_CLANG_VERSION < 110000
if (fi_ptr->use_extra_checksum)
pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum);
+#else
+ pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum);
+#endif
pos += store_gcov_u32(buffer, pos, GCOV_TAG_COUNTER_BASE);
pos += store_gcov_u32(buffer, pos, fi_ptr->num_counters * 2);
diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c
index 48006608baf0..40880c350b95 100644
--- a/kernel/irq/irq_sim.c
+++ b/kernel/irq/irq_sim.c
@@ -159,7 +159,7 @@ static const struct irq_domain_ops irq_sim_domain_ops = {
* irq_domain_create_sim - Create a new interrupt simulator irq_domain and
* allocate a range of dummy interrupts.
*
- * @fnode: struct fwnode_handle to be associated with this domain.
+ * @fwnode: struct fwnode_handle to be associated with this domain.
* @num_irqs: Number of interrupts to allocate.
*
* On success: return a new irq_domain object.
@@ -228,7 +228,7 @@ static void devm_irq_domain_release_sim(struct device *dev, void *res)
* a managed device.
*
* @dev: Device to initialize the simulator object for.
- * @fnode: struct fwnode_handle to be associated with this domain.
+ * @fwnode: struct fwnode_handle to be associated with this domain.
* @num_irqs: Number of interrupts to allocate
*
* On success: return a new irq_domain object.
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 288151393a06..d10ab1d689d5 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1898,16 +1898,15 @@ DEFINE_SHOW_ATTRIBUTE(irq_domain_debug);
static void debugfs_add_domain_dir(struct irq_domain *d)
{
- if (!d->name || !domain_dir || d->debugfs_file)
+ if (!d->name || !domain_dir)
return;
- d->debugfs_file = debugfs_create_file(d->name, 0444, domain_dir, d,
- &irq_domain_debug_fops);
+ debugfs_create_file(d->name, 0444, domain_dir, d,
+ &irq_domain_debug_fops);
}
static void debugfs_remove_domain_dir(struct irq_domain *d)
{
- debugfs_remove(d->debugfs_file);
- d->debugfs_file = NULL;
+ debugfs_remove(debugfs_lookup(d->name, domain_dir));
}
void __init irq_domain_debugfs_init(struct dentry *root)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index dec3f73e8db9..21ea370fccda 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1142,11 +1142,15 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
irqreturn_t ret;
local_bh_disable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_disable();
ret = action->thread_fn(action->irq, action->dev_id);
if (ret == IRQ_HANDLED)
atomic_inc(&desc->threads_handled);
irq_finalize_oneshot(desc, action);
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_enable();
local_bh_enable();
return ret;
}
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index c6a39d662935..ba39fbb1f8e7 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -407,6 +407,14 @@ static bool jump_label_can_update(struct jump_entry *entry, bool init)
return false;
if (!kernel_text_address(jump_entry_code(entry))) {
+ /*
+ * This skips patching built-in __exit, which
+ * is part of init_section_contains() but is
+ * not part of kernel_text_address().
+ *
+ * Skipping built-in __exit is fine since it
+ * will never be executed.
+ */
WARN_ONCE(!jump_entry_is_init(entry),
"can't patch jump_label at %pS",
(void *)jump_entry_code(entry));
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index c6d0c1dc6253..f160f1c97ca1 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -705,7 +705,7 @@ static void print_lock_name(struct lock_class *class)
printk(KERN_CONT " (");
__print_lock_name(class);
- printk(KERN_CONT "){%s}-{%hd:%hd}", usage,
+ printk(KERN_CONT "){%s}-{%d:%d}", usage,
class->wait_type_outer ?: class->wait_type_inner,
class->wait_type_inner);
}
@@ -930,7 +930,8 @@ static bool assign_lock_key(struct lockdep_map *lock)
/* Debug-check: all keys must be persistent! */
debug_locks_off();
pr_err("INFO: trying to register non-static key.\n");
- pr_err("the code is fine but needs lockdep annotation.\n");
+ pr_err("The code is fine but needs lockdep annotation, or maybe\n");
+ pr_err("you didn't initialize this object before use?\n");
pr_err("turning off the locking correctness validator.\n");
dump_stack();
return false;
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index adb935090768..622ebdfcd083 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -626,7 +626,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
*/
static __always_inline bool
mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
- const bool use_ww_ctx, struct mutex_waiter *waiter)
+ struct mutex_waiter *waiter)
{
if (!waiter) {
/*
@@ -702,7 +702,7 @@ fail:
#else
static __always_inline bool
mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
- const bool use_ww_ctx, struct mutex_waiter *waiter)
+ struct mutex_waiter *waiter)
{
return false;
}
@@ -922,6 +922,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
struct ww_mutex *ww;
int ret;
+ if (!use_ww_ctx)
+ ww_ctx = NULL;
+
might_sleep();
#ifdef CONFIG_DEBUG_MUTEXES
@@ -929,7 +932,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
#endif
ww = container_of(lock, struct ww_mutex, base);
- if (use_ww_ctx && ww_ctx) {
+ if (ww_ctx) {
if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))
return -EALREADY;
@@ -946,10 +949,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
if (__mutex_trylock(lock) ||
- mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, NULL)) {
+ mutex_optimistic_spin(lock, ww_ctx, NULL)) {
/* got the lock, yay! */
lock_acquired(&lock->dep_map, ip);
- if (use_ww_ctx && ww_ctx)
+ if (ww_ctx)
ww_mutex_set_context_fastpath(ww, ww_ctx);
preempt_enable();
return 0;
@@ -960,7 +963,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* After waiting to acquire the wait_lock, try again.
*/
if (__mutex_trylock(lock)) {
- if (use_ww_ctx && ww_ctx)
+ if (ww_ctx)
__ww_mutex_check_waiters(lock, ww_ctx);
goto skip_wait;
@@ -1013,7 +1016,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
goto err;
}
- if (use_ww_ctx && ww_ctx) {
+ if (ww_ctx) {
ret = __ww_mutex_check_kill(lock, &waiter, ww_ctx);
if (ret)
goto err;
@@ -1026,7 +1029,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* ww_mutex needs to always recheck its position since its waiter
* list is not FIFO ordered.
*/
- if ((use_ww_ctx && ww_ctx) || !first) {
+ if (ww_ctx || !first) {
first = __mutex_waiter_is_first(lock, &waiter);
if (first)
__mutex_set_flag(lock, MUTEX_FLAG_HANDOFF);
@@ -1039,7 +1042,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* or we must see its unlock and acquire.
*/
if (__mutex_trylock(lock) ||
- (first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, &waiter)))
+ (first && mutex_optimistic_spin(lock, ww_ctx, &waiter)))
break;
spin_lock(&lock->wait_lock);
@@ -1048,7 +1051,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
acquired:
__set_current_state(TASK_RUNNING);
- if (use_ww_ctx && ww_ctx) {
+ if (ww_ctx) {
/*
* Wound-Wait; we stole the lock (!first_waiter), check the
* waiters as anyone might want to wound us.
@@ -1068,7 +1071,7 @@ skip_wait:
/* got the lock - cleanup and rejoice! */
lock_acquired(&lock->dep_map, ip);
- if (use_ww_ctx && ww_ctx)
+ if (ww_ctx)
ww_mutex_lock_acquired(ww, ww_ctx);
spin_unlock(&lock->wait_lock);
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 1358fa4abfa8..0f4530b3a8cd 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -98,7 +98,7 @@ static int __init em_debug_init(void)
return 0;
}
-core_initcall(em_debug_init);
+fs_initcall(em_debug_init);
#else /* CONFIG_DEBUG_FS */
static void em_debug_create_pd(struct device *dev) {}
static void em_debug_remove_pd(struct device *dev) {}
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 821cf1723814..61db50f7ca86 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -375,7 +375,7 @@ static int ptrace_attach(struct task_struct *task, long request,
audit_ptrace(task);
retval = -EPERM;
- if (unlikely(task->flags & (PF_KTHREAD | PF_IO_WORKER)))
+ if (unlikely(task->flags & PF_KTHREAD))
goto out;
if (same_thread_group(task, current))
goto out;
diff --git a/kernel/reboot.c b/kernel/reboot.c
index eb1b15850761..a6ad5eb2fa73 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -244,8 +244,6 @@ void migrate_to_reboot_cpu(void)
void kernel_restart(char *cmd)
{
kernel_restart_prepare(cmd);
- if (pm_power_off_prepare)
- pm_power_off_prepare();
migrate_to_reboot_cpu();
syscore_shutdown();
if (!cmd)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ca2bb629595f..98191218d891 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1862,8 +1862,13 @@ struct migration_arg {
struct set_affinity_pending *pending;
};
+/*
+ * @refs: number of wait_for_completion()
+ * @stop_pending: is @stop_work in use
+ */
struct set_affinity_pending {
refcount_t refs;
+ unsigned int stop_pending;
struct completion done;
struct cpu_stop_work stop_work;
struct migration_arg arg;
@@ -1898,8 +1903,8 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
*/
static int migration_cpu_stop(void *data)
{
- struct set_affinity_pending *pending;
struct migration_arg *arg = data;
+ struct set_affinity_pending *pending = arg->pending;
struct task_struct *p = arg->task;
int dest_cpu = arg->dest_cpu;
struct rq *rq = this_rq();
@@ -1921,7 +1926,6 @@ static int migration_cpu_stop(void *data)
raw_spin_lock(&p->pi_lock);
rq_lock(rq, &rf);
- pending = p->migration_pending;
/*
* If task_rq(p) != rq, it cannot be migrated here, because we're
* holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
@@ -1932,21 +1936,14 @@ static int migration_cpu_stop(void *data)
goto out;
if (pending) {
- p->migration_pending = NULL;
+ if (p->migration_pending == pending)
+ p->migration_pending = NULL;
complete = true;
}
- /* migrate_enable() -- we must not race against SCA */
if (dest_cpu < 0) {
- /*
- * When this was migrate_enable() but we no longer
- * have a @pending, a concurrent SCA 'fixed' things
- * and we should be valid again. Nothing to do.
- */
- if (!pending) {
- WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask));
+ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask))
goto out;
- }
dest_cpu = cpumask_any_distribute(&p->cpus_mask);
}
@@ -1956,7 +1953,14 @@ static int migration_cpu_stop(void *data)
else
p->wake_cpu = dest_cpu;
- } else if (dest_cpu < 0 || pending) {
+ /*
+ * XXX __migrate_task() can fail, at which point we might end
+ * up running on a dodgy CPU, AFAICT this can only happen
+ * during CPU hotplug, at which point we'll get pushed out
+ * anyway, so it's probably not a big deal.
+ */
+
+ } else if (pending) {
/*
* This happens when we get migrated between migrate_enable()'s
* preempt_enable() and scheduling the stopper task. At that
@@ -1971,43 +1975,32 @@ static int migration_cpu_stop(void *data)
* ->pi_lock, so the allowed mask is stable - if it got
* somewhere allowed, we're done.
*/
- if (pending && cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
- p->migration_pending = NULL;
+ if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
+ if (p->migration_pending == pending)
+ p->migration_pending = NULL;
complete = true;
goto out;
}
/*
- * When this was migrate_enable() but we no longer have an
- * @pending, a concurrent SCA 'fixed' things and we should be
- * valid again. Nothing to do.
- */
- if (!pending) {
- WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask));
- goto out;
- }
-
- /*
* When migrate_enable() hits a rq mis-match we can't reliably
* determine is_migration_disabled() and so have to chase after
* it.
*/
+ WARN_ON_ONCE(!pending->stop_pending);
task_rq_unlock(rq, p, &rf);
stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
&pending->arg, &pending->stop_work);
return 0;
}
out:
+ if (pending)
+ pending->stop_pending = false;
task_rq_unlock(rq, p, &rf);
if (complete)
complete_all(&pending->done);
- /* For pending->{arg,stop_work} */
- pending = arg->pending;
- if (pending && refcount_dec_and_test(&pending->refs))
- wake_up_var(&pending->refs);
-
return 0;
}
@@ -2194,11 +2187,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
int dest_cpu, unsigned int flags)
{
struct set_affinity_pending my_pending = { }, *pending = NULL;
- struct migration_arg arg = {
- .task = p,
- .dest_cpu = dest_cpu,
- };
- bool complete = false;
+ bool stop_pending, complete = false;
/* Can the task run on the task's current CPU? If so, we're done */
if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
@@ -2210,12 +2199,16 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
push_task = get_task_struct(p);
}
+ /*
+ * If there are pending waiters, but no pending stop_work,
+ * then complete now.
+ */
pending = p->migration_pending;
- if (pending) {
- refcount_inc(&pending->refs);
+ if (pending && !pending->stop_pending) {
p->migration_pending = NULL;
complete = true;
}
+
task_rq_unlock(rq, p, rf);
if (push_task) {
@@ -2224,7 +2217,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
}
if (complete)
- goto do_complete;
+ complete_all(&pending->done);
return 0;
}
@@ -2235,6 +2228,12 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
/* Install the request */
refcount_set(&my_pending.refs, 1);
init_completion(&my_pending.done);
+ my_pending.arg = (struct migration_arg) {
+ .task = p,
+ .dest_cpu = -1, /* any */
+ .pending = &my_pending,
+ };
+
p->migration_pending = &my_pending;
} else {
pending = p->migration_pending;
@@ -2259,45 +2258,41 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
return -EINVAL;
}
- if (flags & SCA_MIGRATE_ENABLE) {
-
- refcount_inc(&pending->refs); /* pending->{arg,stop_work} */
- p->migration_flags &= ~MDF_PUSH;
- task_rq_unlock(rq, p, rf);
-
- pending->arg = (struct migration_arg) {
- .task = p,
- .dest_cpu = -1,
- .pending = pending,
- };
-
- stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
- &pending->arg, &pending->stop_work);
-
- return 0;
- }
-
if (task_running(rq, p) || p->state == TASK_WAKING) {
/*
- * Lessen races (and headaches) by delegating
- * is_migration_disabled(p) checks to the stopper, which will
- * run on the same CPU as said p.
+ * MIGRATE_ENABLE gets here because 'p == current', but for
+ * anything else we cannot do is_migration_disabled(), punt
+ * and have the stopper function handle it all race-free.
*/
+ stop_pending = pending->stop_pending;
+ if (!stop_pending)
+ pending->stop_pending = true;
+
+ if (flags & SCA_MIGRATE_ENABLE)
+ p->migration_flags &= ~MDF_PUSH;
+
task_rq_unlock(rq, p, rf);
- stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+ if (!stop_pending) {
+ stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
+ &pending->arg, &pending->stop_work);
+ }
+
+ if (flags & SCA_MIGRATE_ENABLE)
+ return 0;
} else {
if (!is_migration_disabled(p)) {
if (task_on_rq_queued(p))
rq = move_queued_task(rq, rf, p, dest_cpu);
- p->migration_pending = NULL;
- complete = true;
+ if (!pending->stop_pending) {
+ p->migration_pending = NULL;
+ complete = true;
+ }
}
task_rq_unlock(rq, p, rf);
-do_complete:
if (complete)
complete_all(&pending->done);
}
@@ -2305,7 +2300,7 @@ do_complete:
wait_for_completion(&pending->done);
if (refcount_dec_and_test(&pending->refs))
- wake_up_var(&pending->refs);
+ wake_up_var(&pending->refs); /* No UaF, just an address */
/*
* Block the original owner of &pending until all subsequent callers
@@ -2313,6 +2308,9 @@ do_complete:
*/
wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
+ /* ARGH */
+ WARN_ON_ONCE(my_pending.stop_pending);
+
return 0;
}
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index acdae625c636..b5add64d9698 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -471,9 +471,7 @@ static int sync_runqueues_membarrier_state(struct mm_struct *mm)
}
rcu_read_unlock();
- preempt_disable();
- smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1);
- preempt_enable();
+ on_each_cpu_mask(tmpmask, ipi_sync_rq_state, mm, true);
free_cpumask_var(tmpmask);
cpus_read_unlock();
diff --git a/kernel/signal.c b/kernel/signal.c
index ba4d1ef39a9e..f2718350bf4b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -91,7 +91,7 @@ static bool sig_task_ignored(struct task_struct *t, int sig, bool force)
return true;
/* Only allow kernel generated signals to this kthread */
- if (unlikely((t->flags & (PF_KTHREAD | PF_IO_WORKER)) &&
+ if (unlikely((t->flags & PF_KTHREAD) &&
(handler == SIG_KTHREAD_KERNEL) && !force))
return true;
@@ -1096,7 +1096,7 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc
/*
* Skip useless siginfo allocation for SIGKILL and kernel threads.
*/
- if ((sig == SIGKILL) || (t->flags & (PF_KTHREAD | PF_IO_WORKER)))
+ if ((sig == SIGKILL) || (t->flags & PF_KTHREAD))
goto out_set;
/*
@@ -2768,13 +2768,21 @@ relock:
}
/*
+ * PF_IO_WORKER threads will catch and exit on fatal signals
+ * themselves. They have cleanup that must be performed, so
+ * we cannot call do_exit() on their behalf.
+ */
+ if (current->flags & PF_IO_WORKER)
+ goto out;
+
+ /*
* Death signals, no core dump.
*/
do_group_exit(ksig->info.si_signo);
/* NOTREACHED */
}
spin_unlock_irq(&sighand->siglock);
-
+out:
ksig->sig = signr;
if (!(ksig->ka.sa.sa_flags & SA_EXPOSE_TAGBITS))
diff --git a/kernel/static_call.c b/kernel/static_call.c
index 6906c6ec4c97..2c5950b0b90e 100644
--- a/kernel/static_call.c
+++ b/kernel/static_call.c
@@ -35,27 +35,30 @@ static inline void *static_call_addr(struct static_call_site *site)
return (void *)((long)site->addr + (long)&site->addr);
}
+static inline unsigned long __static_call_key(const struct static_call_site *site)
+{
+ return (long)site->key + (long)&site->key;
+}
static inline struct static_call_key *static_call_key(const struct static_call_site *site)
{
- return (struct static_call_key *)
- (((long)site->key + (long)&site->key) & ~STATIC_CALL_SITE_FLAGS);
+ return (void *)(__static_call_key(site) & ~STATIC_CALL_SITE_FLAGS);
}
/* These assume the key is word-aligned. */
static inline bool static_call_is_init(struct static_call_site *site)
{
- return ((long)site->key + (long)&site->key) & STATIC_CALL_SITE_INIT;
+ return __static_call_key(site) & STATIC_CALL_SITE_INIT;
}
static inline bool static_call_is_tail(struct static_call_site *site)
{
- return ((long)site->key + (long)&site->key) & STATIC_CALL_SITE_TAIL;
+ return __static_call_key(site) & STATIC_CALL_SITE_TAIL;
}
static inline void static_call_set_init(struct static_call_site *site)
{
- site->key = ((long)static_call_key(site) | STATIC_CALL_SITE_INIT) -
+ site->key = (__static_call_key(site) | STATIC_CALL_SITE_INIT) -
(long)&site->key;
}
@@ -146,6 +149,7 @@ void __static_call_update(struct static_call_key *key, void *tramp, void *func)
};
for (site_mod = &first; site_mod; site_mod = site_mod->next) {
+ bool init = system_state < SYSTEM_RUNNING;
struct module *mod = site_mod->mod;
if (!site_mod->sites) {
@@ -165,6 +169,7 @@ void __static_call_update(struct static_call_key *key, void *tramp, void *func)
if (mod) {
stop = mod->static_call_sites +
mod->num_static_call_sites;
+ init = mod->state == MODULE_STATE_COMING;
}
#endif
@@ -172,25 +177,26 @@ void __static_call_update(struct static_call_key *key, void *tramp, void *func)
site < stop && static_call_key(site) == key; site++) {
void *site_addr = static_call_addr(site);
- if (static_call_is_init(site)) {
- /*
- * Don't write to call sites which were in
- * initmem and have since been freed.
- */
- if (!mod && system_state >= SYSTEM_RUNNING)
- continue;
- if (mod && !within_module_init((unsigned long)site_addr, mod))
- continue;
- }
+ if (!init && static_call_is_init(site))
+ continue;
if (!kernel_text_address((unsigned long)site_addr)) {
- WARN_ONCE(1, "can't patch static call site at %pS",
+ /*
+ * This skips patching built-in __exit, which
+ * is part of init_section_contains() but is
+ * not part of kernel_text_address().
+ *
+ * Skipping built-in __exit is fine since it
+ * will never be executed.
+ */
+ WARN_ONCE(!static_call_is_init(site),
+ "can't patch static call site at %pS",
site_addr);
continue;
}
arch_static_call_transform(site_addr, NULL, func,
- static_call_is_tail(site));
+ static_call_is_tail(site));
}
}
@@ -349,7 +355,8 @@ static int static_call_add_module(struct module *mod)
struct static_call_site *site;
for (site = start; site != stop; site++) {
- unsigned long addr = (unsigned long)static_call_key(site);
+ unsigned long s_key = __static_call_key(site);
+ unsigned long addr = s_key & ~STATIC_CALL_SITE_FLAGS;
unsigned long key;
/*
@@ -373,8 +380,8 @@ static int static_call_add_module(struct module *mod)
return -EINVAL;
}
- site->key = (key - (long)&site->key) |
- (site->key & STATIC_CALL_SITE_FLAGS);
+ key |= s_key & STATIC_CALL_SITE_FLAGS;
+ site->key = key - (long)&site->key;
}
return __static_call_init(mod, start, stop);
diff --git a/kernel/sys.c b/kernel/sys.c
index b09fe21e88ff..2e2e3f378d97 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2079,7 +2079,7 @@ static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr,
* up to the caller to provide sane values here, otherwise userspace
* tools which use this vector might be unhappy.
*/
- unsigned long user_auxv[AT_VECTOR_SIZE];
+ unsigned long user_auxv[AT_VECTOR_SIZE] = {};
if (len > sizeof(user_auxv))
return -EINVAL;
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 98d7a15e8cf6..4d94e2b5499d 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -854,9 +854,9 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
if (flags == TIMER_ABSTIME)
return -ERESTARTNOHAND;
- restart->fn = alarm_timer_nsleep_restart;
restart->nanosleep.clockid = type;
restart->nanosleep.expires = exp;
+ set_restart_fn(restart, alarm_timer_nsleep_restart);
return ret;
}
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 743c852e10f2..5c9d968187ae 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -546,8 +546,11 @@ static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
}
/*
- * Recomputes cpu_base::*next_timer and returns the earliest expires_next but
- * does not set cpu_base::*expires_next, that is done by hrtimer_reprogram.
+ * Recomputes cpu_base::*next_timer and returns the earliest expires_next
+ * but does not set cpu_base::*expires_next, that is done by
+ * hrtimer[_force]_reprogram and hrtimer_interrupt only. When updating
+ * cpu_base::*expires_next right away, reprogramming logic would no longer
+ * work.
*
* When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases,
* those timers will get run whenever the softirq gets handled, at the end of
@@ -588,6 +591,37 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_
return expires_next;
}
+static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base)
+{
+ ktime_t expires_next, soft = KTIME_MAX;
+
+ /*
+ * If the soft interrupt has already been activated, ignore the
+ * soft bases. They will be handled in the already raised soft
+ * interrupt.
+ */
+ if (!cpu_base->softirq_activated) {
+ soft = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);
+ /*
+ * Update the soft expiry time. clock_settime() might have
+ * affected it.
+ */
+ cpu_base->softirq_expires_next = soft;
+ }
+
+ expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD);
+ /*
+ * If a softirq timer is expiring first, update cpu_base->next_timer
+ * and program the hardware with the soft expiry time.
+ */
+ if (expires_next > soft) {
+ cpu_base->next_timer = cpu_base->softirq_next_timer;
+ expires_next = soft;
+ }
+
+ return expires_next;
+}
+
static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
{
ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
@@ -628,23 +662,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
{
ktime_t expires_next;
- /*
- * Find the current next expiration time.
- */
- expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
-
- if (cpu_base->next_timer && cpu_base->next_timer->is_soft) {
- /*
- * When the softirq is activated, hrtimer has to be
- * programmed with the first hard hrtimer because soft
- * timer interrupt could occur too late.
- */
- if (cpu_base->softirq_activated)
- expires_next = __hrtimer_get_next_event(cpu_base,
- HRTIMER_ACTIVE_HARD);
- else
- cpu_base->softirq_expires_next = expires_next;
- }
+ expires_next = hrtimer_update_next_event(cpu_base);
if (skip_equal && expires_next == cpu_base->expires_next)
return;
@@ -1644,8 +1662,8 @@ retry:
__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
- /* Reevaluate the clock bases for the next expiry */
- expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
+ /* Reevaluate the clock bases for the [soft] next expiry */
+ expires_next = hrtimer_update_next_event(cpu_base);
/*
* Store the new expiry value so the migration code can verify
* against it.
@@ -1939,9 +1957,9 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
}
restart = &current->restart_block;
- restart->fn = hrtimer_nanosleep_restart;
restart->nanosleep.clockid = t.timer.base->clockid;
restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
+ set_restart_fn(restart, hrtimer_nanosleep_restart);
out:
destroy_hrtimer_on_stack(&t.timer);
return ret;
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index a71758e34e45..9abe15255bc4 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1480,8 +1480,8 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
if (flags & TIMER_ABSTIME)
return -ERESTARTNOHAND;
- restart_block->fn = posix_cpu_nsleep_restart;
restart_block->nanosleep.clockid = which_clock;
+ set_restart_fn(restart_block, posix_cpu_nsleep_restart);
}
return error;
}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4d8e35575549..3ba52d4e1314 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3231,7 +3231,8 @@ ftrace_allocate_pages(unsigned long num_to_init)
pg = start_pg;
while (pg) {
order = get_count_order(pg->size / ENTRIES_PER_PAGE);
- free_pages((unsigned long)pg->records, order);
+ if (order >= 0)
+ free_pages((unsigned long)pg->records, order);
start_pg = pg->next;
kfree(pg);
pg = start_pg;
@@ -5045,6 +5046,20 @@ struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr)
return NULL;
}
+static struct ftrace_direct_func *ftrace_alloc_direct_func(unsigned long addr)
+{
+ struct ftrace_direct_func *direct;
+
+ direct = kmalloc(sizeof(*direct), GFP_KERNEL);
+ if (!direct)
+ return NULL;
+ direct->addr = addr;
+ direct->count = 0;
+ list_add_rcu(&direct->next, &ftrace_direct_funcs);
+ ftrace_direct_func_count++;
+ return direct;
+}
+
/**
* register_ftrace_direct - Call a custom trampoline directly
* @ip: The address of the nop at the beginning of a function
@@ -5120,15 +5135,11 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr)
direct = ftrace_find_direct_func(addr);
if (!direct) {
- direct = kmalloc(sizeof(*direct), GFP_KERNEL);
+ direct = ftrace_alloc_direct_func(addr);
if (!direct) {
kfree(entry);
goto out_unlock;
}
- direct->addr = addr;
- direct->count = 0;
- list_add_rcu(&direct->next, &ftrace_direct_funcs);
- ftrace_direct_func_count++;
}
entry->ip = ip;
@@ -5329,6 +5340,7 @@ int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry,
int modify_ftrace_direct(unsigned long ip,
unsigned long old_addr, unsigned long new_addr)
{
+ struct ftrace_direct_func *direct, *new_direct = NULL;
struct ftrace_func_entry *entry;
struct dyn_ftrace *rec;
int ret = -ENODEV;
@@ -5344,6 +5356,20 @@ int modify_ftrace_direct(unsigned long ip,
if (entry->direct != old_addr)
goto out_unlock;
+ direct = ftrace_find_direct_func(old_addr);
+ if (WARN_ON(!direct))
+ goto out_unlock;
+ if (direct->count > 1) {
+ ret = -ENOMEM;
+ new_direct = ftrace_alloc_direct_func(new_addr);
+ if (!new_direct)
+ goto out_unlock;
+ direct->count--;
+ new_direct->count++;
+ } else {
+ direct->addr = new_addr;
+ }
+
/*
* If there's no other ftrace callback on the rec->ip location,
* then it can be changed directly by the architecture.
@@ -5357,6 +5383,14 @@ int modify_ftrace_direct(unsigned long ip,
ret = 0;
}
+ if (unlikely(ret && new_direct)) {
+ direct->count++;
+ list_del_rcu(&new_direct->next);
+ synchronize_rcu_tasks();
+ kfree(new_direct);
+ ftrace_direct_func_count--;
+ }
+
out_unlock:
mutex_unlock(&ftrace_lock);
mutex_unlock(&direct_mutex);
@@ -6418,7 +6452,8 @@ void ftrace_release_mod(struct module *mod)
clear_mod_from_hashes(pg);
order = get_count_order(pg->size / ENTRIES_PER_PAGE);
- free_pages((unsigned long)pg->records, order);
+ if (order >= 0)
+ free_pages((unsigned long)pg->records, order);
tmp_page = pg->next;
kfree(pg);
ftrace_number_of_pages -= 1 << order;
@@ -6778,7 +6813,8 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
if (!pg->index) {
*last_pg = pg->next;
order = get_count_order(pg->size / ENTRIES_PER_PAGE);
- free_pages((unsigned long)pg->records, order);
+ if (order >= 0)
+ free_pages((unsigned long)pg->records, order);
ftrace_number_of_pages -= 1 << order;
ftrace_number_of_groups--;
kfree(pg);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index eccb4e1187cc..c0c9aa5cd8e2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2984,7 +2984,8 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,
size = nr_entries * sizeof(unsigned long);
event = __trace_buffer_lock_reserve(buffer, TRACE_STACK,
- sizeof(*entry) + size, trace_ctx);
+ (sizeof(*entry) - sizeof(entry->caller)) + size,
+ trace_ctx);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
@@ -3544,7 +3545,11 @@ static char *trace_iter_expand_format(struct trace_iterator *iter)
{
char *tmp;
- if (iter->fmt == static_fmt_buf)
+ /*
+ * iter->tr is NULL when used with tp_printk, which makes
+ * this get called where it is not safe to call krealloc().
+ */
+ if (!iter->tr || iter->fmt == static_fmt_buf)
return NULL;
tmp = krealloc(iter->fmt, iter->fmt_size + STATIC_FMT_BUF_SIZE,
@@ -3565,7 +3570,7 @@ const char *trace_event_format(struct trace_iterator *iter, const char *fmt)
if (WARN_ON_ONCE(!fmt))
return fmt;
- if (iter->tr->trace_flags & TRACE_ITER_HASH_PTR)
+ if (!iter->tr || iter->tr->trace_flags & TRACE_ITER_HASH_PTR)
return fmt;
p = fmt;
@@ -9691,7 +9696,7 @@ void __init early_trace_init(void)
{
if (tracepoint_printk) {
tracepoint_print_iter =
- kmalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL);
+ kzalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL);
if (MEM_FAIL(!tracepoint_print_iter,
"Failed to allocate trace iterator\n"))
tracepoint_printk = 0;
diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
index dc971a68dda4..e57cc0870892 100644
--- a/kernel/trace/trace_dynevent.c
+++ b/kernel/trace/trace_dynevent.c
@@ -63,8 +63,10 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type
event = p + 1;
*p = '\0';
}
- if (event[0] == '\0')
- return -EINVAL;
+ if (event[0] == '\0') {
+ ret = -EINVAL;
+ goto out;
+ }
mutex_lock(&event_mutex);
for_each_dyn_event_safe(pos, n) {
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index af612945a4d0..9a4b980d695b 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -106,6 +106,7 @@ int create_user_ns(struct cred *new)
if (!ns)
goto fail_dec;
+ ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP);
ret = ns_alloc_inum(&ns->ns);
if (ret)
goto fail_free;
@@ -841,6 +842,60 @@ static int sort_idmaps(struct uid_gid_map *map)
return 0;
}
+/**
+ * verify_root_map() - check the uid 0 mapping
+ * @file: idmapping file
+ * @map_ns: user namespace of the target process
+ * @new_map: requested idmap
+ *
+ * If a process requests mapping parent uid 0 into the new ns, verify that the
+ * process writing the map had the CAP_SETFCAP capability as the target process
+ * will be able to write fscaps that are valid in ancestor user namespaces.
+ *
+ * Return: true if the mapping is allowed, false if not.
+ */
+static bool verify_root_map(const struct file *file,
+ struct user_namespace *map_ns,
+ struct uid_gid_map *new_map)
+{
+ int idx;
+ const struct user_namespace *file_ns = file->f_cred->user_ns;
+ struct uid_gid_extent *extent0 = NULL;
+
+ for (idx = 0; idx < new_map->nr_extents; idx++) {
+ if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+ extent0 = &new_map->extent[idx];
+ else
+ extent0 = &new_map->forward[idx];
+ if (extent0->lower_first == 0)
+ break;
+
+ extent0 = NULL;
+ }
+
+ if (!extent0)
+ return true;
+
+ if (map_ns == file_ns) {
+ /* The process unshared its ns and is writing to its own
+ * /proc/self/uid_map. User already has full capabilites in
+ * the new namespace. Verify that the parent had CAP_SETFCAP
+ * when it unshared.
+ * */
+ if (!file_ns->parent_could_setfcap)
+ return false;
+ } else {
+ /* Process p1 is writing to uid_map of p2, who is in a child
+ * user namespace to p1's. Verify that the opener of the map
+ * file has CAP_SETFCAP against the parent of the new map
+ * namespace */
+ if (!file_ns_capable(file, map_ns->parent, CAP_SETFCAP))
+ return false;
+ }
+
+ return true;
+}
+
static ssize_t map_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos,
int cap_setid,
@@ -848,7 +903,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
struct uid_gid_map *parent_map)
{
struct seq_file *seq = file->private_data;
- struct user_namespace *ns = seq->private;
+ struct user_namespace *map_ns = seq->private;
struct uid_gid_map new_map;
unsigned idx;
struct uid_gid_extent extent;
@@ -895,7 +950,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
/*
* Adjusting namespace settings requires capabilities on the target.
*/
- if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN))
+ if (cap_valid(cap_setid) && !file_ns_capable(file, map_ns, CAP_SYS_ADMIN))
goto out;
/* Parse the user data */
@@ -965,7 +1020,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
ret = -EPERM;
/* Validate the user is allowed to use user id's mapped to. */
- if (!new_idmap_permitted(file, ns, cap_setid, &new_map))
+ if (!new_idmap_permitted(file, map_ns, cap_setid, &new_map))
goto out;
ret = -EPERM;
@@ -1086,6 +1141,10 @@ static bool new_idmap_permitted(const struct file *file,
struct uid_gid_map *new_map)
{
const struct cred *cred = file->f_cred;
+
+ if (cap_setid == CAP_SETUID && !verify_root_map(file, ns, new_map))
+ return false;
+
/* Don't allow mappings that would allow anything that wouldn't
* be allowed without the establishment of unprivileged mappings.
*/
diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c
index 0b35212ffc3d..bb7bb3b478ab 100644
--- a/kernel/usermode_driver.c
+++ b/kernel/usermode_driver.c
@@ -139,13 +139,22 @@ static void umd_cleanup(struct subprocess_info *info)
struct umd_info *umd_info = info->data;
/* cleanup if umh_setup() was successful but exec failed */
- if (info->retval) {
- fput(umd_info->pipe_to_umh);
- fput(umd_info->pipe_from_umh);
- put_pid(umd_info->tgid);
- umd_info->tgid = NULL;
- }
+ if (info->retval)
+ umd_cleanup_helper(umd_info);
+}
+
+/**
+ * umd_cleanup_helper - release the resources which were allocated in umd_setup
+ * @info: information about usermode driver
+ */
+void umd_cleanup_helper(struct umd_info *info)
+{
+ fput(info->pipe_to_umh);
+ fput(info->pipe_from_umh);
+ put_pid(info->tgid);
+ info->tgid = NULL;
}
+EXPORT_SYMBOL_GPL(umd_cleanup_helper);
/**
* fork_usermode_driver - fork a usermode driver
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 71109065bd8e..107bc38b1945 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -278,9 +278,10 @@ void touch_all_softlockup_watchdogs(void)
* update as well, the only side effect might be a cycle delay for
* the softlockup check.
*/
- for_each_cpu(cpu, &watchdog_allowed_mask)
+ for_each_cpu(cpu, &watchdog_allowed_mask) {
per_cpu(watchdog_touch_ts, cpu) = SOFTLOCKUP_RESET;
- wq_watchdog_touch(-1);
+ wq_watchdog_touch(cpu);
+ }
}
void touch_softlockup_watchdog_sync(void)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0d150da252e8..79f2319543ce 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1412,7 +1412,6 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
*/
lockdep_assert_irqs_disabled();
- debug_work_activate(work);
/* if draining, only works from the same workqueue are allowed */
if (unlikely(wq->flags & __WQ_DRAINING) &&
@@ -1494,6 +1493,7 @@ retry:
worklist = &pwq->delayed_works;
}
+ debug_work_activate(work);
insert_work(pwq, work, worklist, work_flags);
out:
@@ -5787,22 +5787,17 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
continue;
/* get the latest of pool and touched timestamps */
+ if (pool->cpu >= 0)
+ touched = READ_ONCE(per_cpu(wq_watchdog_touched_cpu, pool->cpu));
+ else
+ touched = READ_ONCE(wq_watchdog_touched);
pool_ts = READ_ONCE(pool->watchdog_ts);
- touched = READ_ONCE(wq_watchdog_touched);
if (time_after(pool_ts, touched))
ts = pool_ts;
else
ts = touched;
- if (pool->cpu >= 0) {
- unsigned long cpu_touched =
- READ_ONCE(per_cpu(wq_watchdog_touched_cpu,
- pool->cpu));
- if (time_after(cpu_touched, ts))
- ts = cpu_touched;
- }
-
/* did we stall? */
if (time_after(jiffies, ts + thresh)) {
lockup_detected = true;
@@ -5826,8 +5821,8 @@ notrace void wq_watchdog_touch(int cpu)
{
if (cpu >= 0)
per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
- else
- wq_watchdog_touched = jiffies;
+
+ wq_watchdog_touched = jiffies;
}
static void wq_watchdog_set_thresh(unsigned long thresh)