Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.locks | 12
-rw-r--r--  kernel/audit.c | 5
-rw-r--r--  kernel/bpf/Makefile | 4
-rw-r--r--  kernel/bpf/arraymap.c | 2
-rw-r--r--  kernel/bpf/bpf_struct_ops.c | 634
-rw-r--r--  kernel/bpf/bpf_struct_ops_types.h | 9
-rw-r--r--  kernel/bpf/btf.c | 505
-rw-r--r--  kernel/bpf/cgroup.c | 110
-rw-r--r--  kernel/bpf/core.c | 24
-rw-r--r--  kernel/bpf/cpumap.c | 76
-rw-r--r--  kernel/bpf/devmap.c | 190
-rw-r--r--  kernel/bpf/dispatcher.c | 158
-rw-r--r--  kernel/bpf/hashtab.c | 264
-rw-r--r--  kernel/bpf/helpers.c | 12
-rw-r--r--  kernel/bpf/inode.c | 46
-rw-r--r--  kernel/bpf/local_storage.c | 28
-rw-r--r--  kernel/bpf/map_in_map.c | 3
-rw-r--r--  kernel/bpf/syscall.c | 695
-rw-r--r--  kernel/bpf/tnum.c | 9
-rw-r--r--  kernel/bpf/trampoline.c | 221
-rw-r--r--  kernel/bpf/verifier.c | 593
-rw-r--r--  kernel/bpf/xskmap.c | 18
-rw-r--r--  kernel/cgroup/cgroup.c | 16
-rw-r--r--  kernel/cgroup/rstat.c | 2
-rw-r--r--  kernel/cpu.c | 156
-rw-r--r--  kernel/cred.c | 10
-rw-r--r--  kernel/events/core.c | 6
-rw-r--r--  kernel/exit.c | 12
-rw-r--r--  kernel/extable.c | 7
-rw-r--r--  kernel/fork.c | 26
-rw-r--r--  kernel/futex.c | 1
-rw-r--r--  kernel/gcov/Kconfig | 2
-rw-r--r--  kernel/irq/cpuhotplug.c | 21
-rw-r--r--  kernel/irq/irqdesc.c | 1
-rw-r--r--  kernel/irq/irqdomain.c | 18
-rw-r--r--  kernel/irq/manage.c | 45
-rw-r--r--  kernel/irq/spurious.c | 1
-rw-r--r--  kernel/kexec.c | 4
-rw-r--r--  kernel/kexec_core.c | 8
-rw-r--r--  kernel/kexec_file.c | 4
-rw-r--r--  kernel/kexec_internal.h | 2
-rw-r--r--  kernel/kprobes.c | 71
-rw-r--r--  kernel/locking/lockdep.c | 7
-rw-r--r--  kernel/locking/lockdep_proc.c | 4
-rw-r--r--  kernel/locking/mutex.c | 4
-rw-r--r--  kernel/locking/osq_lock.c | 23
-rw-r--r--  kernel/locking/qspinlock.c | 13
-rw-r--r--  kernel/locking/rwsem.c | 4
-rw-r--r--  kernel/locking/spinlock_debug.c | 32
-rw-r--r--  kernel/module.c | 49
-rw-r--r--  kernel/nsproxy.c | 41
-rw-r--r--  kernel/padata.c | 386
-rw-r--r--  kernel/power/Kconfig | 5
-rw-r--r--  kernel/power/hibernate.c | 23
-rw-r--r--  kernel/power/main.c | 33
-rw-r--r--  kernel/power/snapshot.c | 48
-rw-r--r--  kernel/power/suspend.c | 2
-rw-r--r--  kernel/power/suspend_test.c | 6
-rw-r--r--  kernel/ptrace.c | 15
-rw-r--r--  kernel/rseq.c | 2
-rw-r--r--  kernel/sched/clock.c | 6
-rw-r--r--  kernel/sched/core.c | 34
-rw-r--r--  kernel/sched/cpufreq.c | 18
-rw-r--r--  kernel/sched/cpufreq_schedutil.c | 10
-rw-r--r--  kernel/sched/cpupri.c | 25
-rw-r--r--  kernel/sched/cpupri.h | 4
-rw-r--r--  kernel/sched/cputime.c | 15
-rw-r--r--  kernel/sched/debug.c | 11
-rw-r--r--  kernel/sched/fair.c | 184
-rw-r--r--  kernel/sched/idle.c | 2
-rw-r--r--  kernel/sched/isolation.c | 6
-rw-r--r--  kernel/sched/pelt.c | 20
-rw-r--r--  kernel/sched/psi.c | 15
-rw-r--r--  kernel/sched/rt.c | 83
-rw-r--r--  kernel/sched/sched.h | 24
-rw-r--r--  kernel/sched/topology.c | 39
-rw-r--r--  kernel/sched/wait_bit.c | 1
-rw-r--r--  kernel/seccomp.c | 7
-rw-r--r--  kernel/smp.c | 99
-rw-r--r--  kernel/stop_machine.c | 32
-rw-r--r--  kernel/taskstats.c | 30
-rw-r--r--  kernel/time/Makefile | 1
-rw-r--r--  kernel/time/alarmtimer.c | 121
-rw-r--r--  kernel/time/hrtimer.c | 14
-rw-r--r--  kernel/time/namespace.c | 468
-rw-r--r--  kernel/time/posix-clock.c | 39
-rw-r--r--  kernel/time/posix-cpu-timers.c | 32
-rw-r--r--  kernel/time/posix-stubs.c | 18
-rw-r--r--  kernel/time/posix-timers.c | 88
-rw-r--r--  kernel/time/posix-timers.h | 7
-rw-r--r--  kernel/time/sched_clock.c | 7
-rw-r--r--  kernel/time/tick-common.c | 2
-rw-r--r--  kernel/time/tick-sched.c | 14
-rw-r--r--  kernel/time/vsyscall.c | 37
-rw-r--r--  kernel/trace/bpf_trace.c | 27
-rw-r--r--  kernel/trace/fgraph.c | 23
-rw-r--r--  kernel/trace/ftrace.c | 25
-rw-r--r--  kernel/trace/ring_buffer.c | 2
-rw-r--r--  kernel/trace/trace.c | 19
-rw-r--r--  kernel/trace/trace.h | 31
-rw-r--r--  kernel/trace/trace_entries.h | 66
-rw-r--r--  kernel/trace/trace_events.c | 28
-rw-r--r--  kernel/trace/trace_events_filter.c | 2
-rw-r--r--  kernel/trace/trace_events_hist.c | 92
-rw-r--r--  kernel/trace/trace_events_inject.c | 4
-rw-r--r--  kernel/trace/trace_events_trigger.c | 20
-rw-r--r--  kernel/trace/trace_export.c | 106
-rw-r--r--  kernel/trace/trace_kprobe.c | 18
-rw-r--r--  kernel/trace/trace_probe.c | 14
-rw-r--r--  kernel/trace/trace_probe.h | 9
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 4
-rw-r--r--  kernel/trace/trace_seq.c | 2
-rw-r--r--  kernel/trace/trace_stack.c | 5
-rw-r--r--  kernel/trace/trace_syscalls.c | 51
-rw-r--r--  kernel/trace/trace_uprobe.c | 130
-rw-r--r--  kernel/trace/tracing_map.c | 4
-rw-r--r--  kernel/up.c | 12
-rw-r--r--  kernel/watchdog.c | 31
-rw-r--r--  kernel/workqueue.c | 8
119 files changed, 5098 insertions(+), 1876 deletions(-)
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index e0852dc333ac..3de8fd11873b 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -101,7 +101,7 @@ config UNINLINE_SPIN_UNLOCK
# unlock and unlock_irq functions are inlined when:
# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
# or
-# - DEBUG_SPINLOCK=n and PREEMPT=n
+# - DEBUG_SPINLOCK=n and PREEMPTION=n
#
# unlock_bh and unlock_irqrestore functions are inlined when:
# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
@@ -139,7 +139,7 @@ config INLINE_SPIN_UNLOCK_BH
config INLINE_SPIN_UNLOCK_IRQ
def_bool y
- depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_IRQ
+ depends on !PREEMPTION || ARCH_INLINE_SPIN_UNLOCK_IRQ
config INLINE_SPIN_UNLOCK_IRQRESTORE
def_bool y
@@ -168,7 +168,7 @@ config INLINE_READ_LOCK_IRQSAVE
config INLINE_READ_UNLOCK
def_bool y
- depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK
+ depends on !PREEMPTION || ARCH_INLINE_READ_UNLOCK
config INLINE_READ_UNLOCK_BH
def_bool y
@@ -176,7 +176,7 @@ config INLINE_READ_UNLOCK_BH
config INLINE_READ_UNLOCK_IRQ
def_bool y
- depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_IRQ
+ depends on !PREEMPTION || ARCH_INLINE_READ_UNLOCK_IRQ
config INLINE_READ_UNLOCK_IRQRESTORE
def_bool y
@@ -205,7 +205,7 @@ config INLINE_WRITE_LOCK_IRQSAVE
config INLINE_WRITE_UNLOCK
def_bool y
- depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK
+ depends on !PREEMPTION || ARCH_INLINE_WRITE_UNLOCK
config INLINE_WRITE_UNLOCK_BH
def_bool y
@@ -213,7 +213,7 @@ config INLINE_WRITE_UNLOCK_BH
config INLINE_WRITE_UNLOCK_IRQ
def_bool y
- depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_IRQ
+ depends on !PREEMPTION || ARCH_INLINE_WRITE_UNLOCK_IRQ
config INLINE_WRITE_UNLOCK_IRQRESTORE
def_bool y
diff --git a/kernel/audit.c b/kernel/audit.c
index 8e09f0f55b4b..17b0d523afb3 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -102,12 +102,13 @@ struct audit_net {
* This struct is RCU protected; you must either hold the RCU lock for reading
* or the associated spinlock for writing.
*/
-static struct auditd_connection {
+struct auditd_connection {
struct pid *pid;
u32 portid;
struct net *net;
struct rcu_head rcu;
-} *auditd_conn = NULL;
+};
+static struct auditd_connection __rcu *auditd_conn;
static DEFINE_SPINLOCK(auditd_conn_lock);
/* If audit_rate_limit is non-zero, limit the rate of sending audit records
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 3f671bf617e8..046ce5d98033 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o
obj-$(CONFIG_BPF_JIT) += trampoline.o
obj-$(CONFIG_BPF_SYSCALL) += btf.o
+obj-$(CONFIG_BPF_JIT) += dispatcher.o
ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_BPF_SYSCALL) += devmap.o
obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
@@ -26,3 +27,6 @@ endif
ifeq ($(CONFIG_SYSFS),y)
obj-$(CONFIG_DEBUG_INFO_BTF) += sysfs_btf.o
endif
+ifeq ($(CONFIG_BPF_JIT),y)
+obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o
+endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index f0d19bbb9211..95d77770353c 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -503,6 +503,8 @@ const struct bpf_map_ops array_map_ops = {
.map_mmap = array_map_mmap,
.map_seq_show_elem = array_map_seq_show_elem,
.map_check_btf = array_map_check_btf,
+ .map_lookup_batch = generic_map_lookup_batch,
+ .map_update_batch = generic_map_update_batch,
};
const struct bpf_map_ops percpu_array_map_ops = {
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
new file mode 100644
index 000000000000..8ad1c9ea26b2
--- /dev/null
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -0,0 +1,634 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2019 Facebook */
+
+#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
+#include <linux/btf.h>
+#include <linux/filter.h>
+#include <linux/slab.h>
+#include <linux/numa.h>
+#include <linux/seq_file.h>
+#include <linux/refcount.h>
+#include <linux/mutex.h>
+
+enum bpf_struct_ops_state {
+ BPF_STRUCT_OPS_STATE_INIT,
+ BPF_STRUCT_OPS_STATE_INUSE,
+ BPF_STRUCT_OPS_STATE_TOBEFREE,
+};
+
+#define BPF_STRUCT_OPS_COMMON_VALUE \
+ refcount_t refcnt; \
+ enum bpf_struct_ops_state state
+
+struct bpf_struct_ops_value {
+ BPF_STRUCT_OPS_COMMON_VALUE;
+ char data[0] ____cacheline_aligned_in_smp;
+};
+
+struct bpf_struct_ops_map {
+ struct bpf_map map;
+ const struct bpf_struct_ops *st_ops;
+ /* protect map_update */
+ struct mutex lock;
+ /* progs has all the bpf_prog that is populated
+ * to the func ptr of the kernel's struct
+ * (in kvalue.data).
+ */
+ struct bpf_prog **progs;
+ /* image is a page that has all the trampolines
+ * that stores the func args before calling the bpf_prog.
+ * A PAGE_SIZE "image" is enough to store all trampoline for
+ * "progs[]".
+ */
+ void *image;
+ /* uvalue->data stores the kernel struct
+ * (e.g. tcp_congestion_ops) that is more useful
+ * to userspace than the kvalue. For example,
+ * the bpf_prog's id is stored instead of the kernel
+ * address of a func ptr.
+ */
+ struct bpf_struct_ops_value *uvalue;
+ /* kvalue.data stores the actual kernel's struct
+ * (e.g. tcp_congestion_ops) that will be
+ * registered to the kernel subsystem.
+ */
+ struct bpf_struct_ops_value kvalue;
+};
+
+#define VALUE_PREFIX "bpf_struct_ops_"
+#define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1)
+
+/* bpf_struct_ops_##_name (e.g. bpf_struct_ops_tcp_congestion_ops) is
+ * the map's value exposed to the userspace and its btf-type-id is
+ * stored at the map->btf_vmlinux_value_type_id.
+ *
+ */
+#define BPF_STRUCT_OPS_TYPE(_name) \
+extern struct bpf_struct_ops bpf_##_name; \
+ \
+struct bpf_struct_ops_##_name { \
+ BPF_STRUCT_OPS_COMMON_VALUE; \
+ struct _name data ____cacheline_aligned_in_smp; \
+};
+#include "bpf_struct_ops_types.h"
+#undef BPF_STRUCT_OPS_TYPE
+
+enum {
+#define BPF_STRUCT_OPS_TYPE(_name) BPF_STRUCT_OPS_TYPE_##_name,
+#include "bpf_struct_ops_types.h"
+#undef BPF_STRUCT_OPS_TYPE
+ __NR_BPF_STRUCT_OPS_TYPE,
+};
+
+static struct bpf_struct_ops * const bpf_struct_ops[] = {
+#define BPF_STRUCT_OPS_TYPE(_name) \
+ [BPF_STRUCT_OPS_TYPE_##_name] = &bpf_##_name,
+#include "bpf_struct_ops_types.h"
+#undef BPF_STRUCT_OPS_TYPE
+};
+
+const struct bpf_verifier_ops bpf_struct_ops_verifier_ops = {
+};
+
+const struct bpf_prog_ops bpf_struct_ops_prog_ops = {
+};
+
+static const struct btf_type *module_type;
+
+void bpf_struct_ops_init(struct btf *btf)
+{
+ s32 type_id, value_id, module_id;
+ const struct btf_member *member;
+ struct bpf_struct_ops *st_ops;
+ struct bpf_verifier_log log = {};
+ const struct btf_type *t;
+ char value_name[128];
+ const char *mname;
+ u32 i, j;
+
+ /* Ensure BTF type is emitted for "struct bpf_struct_ops_##_name" */
+#define BPF_STRUCT_OPS_TYPE(_name) BTF_TYPE_EMIT(struct bpf_struct_ops_##_name);
+#include "bpf_struct_ops_types.h"
+#undef BPF_STRUCT_OPS_TYPE
+
+ module_id = btf_find_by_name_kind(btf, "module", BTF_KIND_STRUCT);
+ if (module_id < 0) {
+ pr_warn("Cannot find struct module in btf_vmlinux\n");
+ return;
+ }
+ module_type = btf_type_by_id(btf, module_id);
+
+ for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
+ st_ops = bpf_struct_ops[i];
+
+ if (strlen(st_ops->name) + VALUE_PREFIX_LEN >=
+ sizeof(value_name)) {
+ pr_warn("struct_ops name %s is too long\n",
+ st_ops->name);
+ continue;
+ }
+ sprintf(value_name, "%s%s", VALUE_PREFIX, st_ops->name);
+
+ value_id = btf_find_by_name_kind(btf, value_name,
+ BTF_KIND_STRUCT);
+ if (value_id < 0) {
+ pr_warn("Cannot find struct %s in btf_vmlinux\n",
+ value_name);
+ continue;
+ }
+
+ type_id = btf_find_by_name_kind(btf, st_ops->name,
+ BTF_KIND_STRUCT);
+ if (type_id < 0) {
+ pr_warn("Cannot find struct %s in btf_vmlinux\n",
+ st_ops->name);
+ continue;
+ }
+ t = btf_type_by_id(btf, type_id);
+ if (btf_type_vlen(t) > BPF_STRUCT_OPS_MAX_NR_MEMBERS) {
+ pr_warn("Cannot support #%u members in struct %s\n",
+ btf_type_vlen(t), st_ops->name);
+ continue;
+ }
+
+ for_each_member(j, t, member) {
+ const struct btf_type *func_proto;
+
+ mname = btf_name_by_offset(btf, member->name_off);
+ if (!*mname) {
+ pr_warn("anon member in struct %s is not supported\n",
+ st_ops->name);
+ break;
+ }
+
+ if (btf_member_bitfield_size(t, member)) {
+ pr_warn("bit field member %s in struct %s is not supported\n",
+ mname, st_ops->name);
+ break;
+ }
+
+ func_proto = btf_type_resolve_func_ptr(btf,
+ member->type,
+ NULL);
+ if (func_proto &&
+ btf_distill_func_proto(&log, btf,
+ func_proto, mname,
+ &st_ops->func_models[j])) {
+ pr_warn("Error in parsing func ptr %s in struct %s\n",
+ mname, st_ops->name);
+ break;
+ }
+ }
+
+ if (j == btf_type_vlen(t)) {
+ if (st_ops->init(btf)) {
+ pr_warn("Error in init bpf_struct_ops %s\n",
+ st_ops->name);
+ } else {
+ st_ops->type_id = type_id;
+ st_ops->type = t;
+ st_ops->value_id = value_id;
+ st_ops->value_type = btf_type_by_id(btf,
+ value_id);
+ }
+ }
+ }
+}
+
+extern struct btf *btf_vmlinux;
+
+static const struct bpf_struct_ops *
+bpf_struct_ops_find_value(u32 value_id)
+{
+ unsigned int i;
+
+ if (!value_id || !btf_vmlinux)
+ return NULL;
+
+ for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
+ if (bpf_struct_ops[i]->value_id == value_id)
+ return bpf_struct_ops[i];
+ }
+
+ return NULL;
+}
+
+const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id)
+{
+ unsigned int i;
+
+ if (!type_id || !btf_vmlinux)
+ return NULL;
+
+ for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
+ if (bpf_struct_ops[i]->type_id == type_id)
+ return bpf_struct_ops[i];
+ }
+
+ return NULL;
+}
+
+static int bpf_struct_ops_map_get_next_key(struct bpf_map *map, void *key,
+ void *next_key)
+{
+ if (key && *(u32 *)key == 0)
+ return -ENOENT;
+
+ *(u32 *)next_key = 0;
+ return 0;
+}
+
+int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
+ void *value)
+{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+ struct bpf_struct_ops_value *uvalue, *kvalue;
+ enum bpf_struct_ops_state state;
+
+ if (unlikely(*(u32 *)key != 0))
+ return -ENOENT;
+
+ kvalue = &st_map->kvalue;
+ /* Pair with smp_store_release() during map_update */
+ state = smp_load_acquire(&kvalue->state);
+ if (state == BPF_STRUCT_OPS_STATE_INIT) {
+ memset(value, 0, map->value_size);
+ return 0;
+ }
+
+ /* No lock is needed. state and refcnt do not need
+ * to be updated together under atomic context.
+ */
+ uvalue = (struct bpf_struct_ops_value *)value;
+ memcpy(uvalue, st_map->uvalue, map->value_size);
+ uvalue->state = state;
+ refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt));
+
+ return 0;
+}
+
+static void *bpf_struct_ops_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ return ERR_PTR(-EINVAL);
+}
+
+static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map)
+{
+ const struct btf_type *t = st_map->st_ops->type;
+ u32 i;
+
+ for (i = 0; i < btf_type_vlen(t); i++) {
+ if (st_map->progs[i]) {
+ bpf_prog_put(st_map->progs[i]);
+ st_map->progs[i] = NULL;
+ }
+ }
+}
+
+static int check_zero_holes(const struct btf_type *t, void *data)
+{
+ const struct btf_member *member;
+ u32 i, moff, msize, prev_mend = 0;
+ const struct btf_type *mtype;
+
+ for_each_member(i, t, member) {
+ moff = btf_member_bit_offset(t, member) / 8;
+ if (moff > prev_mend &&
+ memchr_inv(data + prev_mend, 0, moff - prev_mend))
+ return -EINVAL;
+
+ mtype = btf_type_by_id(btf_vmlinux, member->type);
+ mtype = btf_resolve_size(btf_vmlinux, mtype, &msize,
+ NULL, NULL);
+ if (IS_ERR(mtype))
+ return PTR_ERR(mtype);
+ prev_mend = moff + msize;
+ }
+
+ if (t->size > prev_mend &&
+ memchr_inv(data + prev_mend, 0, t->size - prev_mend))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+ const struct bpf_struct_ops *st_ops = st_map->st_ops;
+ struct bpf_struct_ops_value *uvalue, *kvalue;
+ const struct btf_member *member;
+ const struct btf_type *t = st_ops->type;
+ void *udata, *kdata;
+ int prog_fd, err = 0;
+ void *image;
+ u32 i;
+
+ if (flags)
+ return -EINVAL;
+
+ if (*(u32 *)key != 0)
+ return -E2BIG;
+
+ err = check_zero_holes(st_ops->value_type, value);
+ if (err)
+ return err;
+
+ uvalue = (struct bpf_struct_ops_value *)value;
+ err = check_zero_holes(t, uvalue->data);
+ if (err)
+ return err;
+
+ if (uvalue->state || refcount_read(&uvalue->refcnt))
+ return -EINVAL;
+
+ uvalue = (struct bpf_struct_ops_value *)st_map->uvalue;
+ kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue;
+
+ mutex_lock(&st_map->lock);
+
+ if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT) {
+ err = -EBUSY;
+ goto unlock;
+ }
+
+ memcpy(uvalue, value, map->value_size);
+
+ udata = &uvalue->data;
+ kdata = &kvalue->data;
+ image = st_map->image;
+
+ for_each_member(i, t, member) {
+ const struct btf_type *mtype, *ptype;
+ struct bpf_prog *prog;
+ u32 moff;
+
+ moff = btf_member_bit_offset(t, member) / 8;
+ ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL);
+ if (ptype == module_type) {
+ if (*(void **)(udata + moff))
+ goto reset_unlock;
+ *(void **)(kdata + moff) = BPF_MODULE_OWNER;
+ continue;
+ }
+
+ err = st_ops->init_member(t, member, kdata, udata);
+ if (err < 0)
+ goto reset_unlock;
+
+ /* The ->init_member() has handled this member */
+ if (err > 0)
+ continue;
+
+ /* If st_ops->init_member does not handle it,
+ * we will only handle func ptrs and zero-ed members
+ * here. Reject everything else.
+ */
+
+ /* All non func ptr member must be 0 */
+ if (!ptype || !btf_type_is_func_proto(ptype)) {
+ u32 msize;
+
+ mtype = btf_type_by_id(btf_vmlinux, member->type);
+ mtype = btf_resolve_size(btf_vmlinux, mtype, &msize,
+ NULL, NULL);
+ if (IS_ERR(mtype)) {
+ err = PTR_ERR(mtype);
+ goto reset_unlock;
+ }
+
+ if (memchr_inv(udata + moff, 0, msize)) {
+ err = -EINVAL;
+ goto reset_unlock;
+ }
+
+ continue;
+ }
+
+ prog_fd = (int)(*(unsigned long *)(udata + moff));
+ /* Similar check as the attr->attach_prog_fd */
+ if (!prog_fd)
+ continue;
+
+ prog = bpf_prog_get(prog_fd);
+ if (IS_ERR(prog)) {
+ err = PTR_ERR(prog);
+ goto reset_unlock;
+ }
+ st_map->progs[i] = prog;
+
+ if (prog->type != BPF_PROG_TYPE_STRUCT_OPS ||
+ prog->aux->attach_btf_id != st_ops->type_id ||
+ prog->expected_attach_type != i) {
+ err = -EINVAL;
+ goto reset_unlock;
+ }
+
+ err = arch_prepare_bpf_trampoline(image,
+ st_map->image + PAGE_SIZE,
+ &st_ops->func_models[i], 0,
+ &prog, 1, NULL, 0, NULL);
+ if (err < 0)
+ goto reset_unlock;
+
+ *(void **)(kdata + moff) = image;
+ image += err;
+
+ /* put prog_id to udata */
+ *(unsigned long *)(udata + moff) = prog->aux->id;
+ }
+
+ refcount_set(&kvalue->refcnt, 1);
+ bpf_map_inc(map);
+
+ set_memory_ro((long)st_map->image, 1);
+ set_memory_x((long)st_map->image, 1);
+ err = st_ops->reg(kdata);
+ if (likely(!err)) {
+ /* Pair with smp_load_acquire() during lookup_elem().
+ * It ensures the above udata updates (e.g. prog->aux->id)
+ * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set.
+ */
+ smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_INUSE);
+ goto unlock;
+ }
+
+ /* Error during st_ops->reg(). It is very unlikely since
+ * the above init_member() should have caught it earlier
+ * before reg(). The only possibility is if there was a race
+ * in registering the struct_ops (under the same name) to
+ * a sub-system through different struct_ops's maps.
+ */
+ set_memory_nx((long)st_map->image, 1);
+ set_memory_rw((long)st_map->image, 1);
+ bpf_map_put(map);
+
+reset_unlock:
+ bpf_struct_ops_map_put_progs(st_map);
+ memset(uvalue, 0, map->value_size);
+ memset(kvalue, 0, map->value_size);
+unlock:
+ mutex_unlock(&st_map->lock);
+ return err;
+}
+
+static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
+{
+ enum bpf_struct_ops_state prev_state;
+ struct bpf_struct_ops_map *st_map;
+
+ st_map = (struct bpf_struct_ops_map *)map;
+ prev_state = cmpxchg(&st_map->kvalue.state,
+ BPF_STRUCT_OPS_STATE_INUSE,
+ BPF_STRUCT_OPS_STATE_TOBEFREE);
+ if (prev_state == BPF_STRUCT_OPS_STATE_INUSE) {
+ st_map->st_ops->unreg(&st_map->kvalue.data);
+ if (refcount_dec_and_test(&st_map->kvalue.refcnt))
+ bpf_map_put(map);
+ }
+
+ return 0;
+}
+
+static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key,
+ struct seq_file *m)
+{
+ void *value;
+ int err;
+
+ value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);
+ if (!value)
+ return;
+
+ err = bpf_struct_ops_map_sys_lookup_elem(map, key, value);
+ if (!err) {
+ btf_type_seq_show(btf_vmlinux, map->btf_vmlinux_value_type_id,
+ value, m);
+ seq_puts(m, "\n");
+ }
+
+ kfree(value);
+}
+
+static void bpf_struct_ops_map_free(struct bpf_map *map)
+{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+
+ if (st_map->progs)
+ bpf_struct_ops_map_put_progs(st_map);
+ bpf_map_area_free(st_map->progs);
+ bpf_jit_free_exec(st_map->image);
+ bpf_map_area_free(st_map->uvalue);
+ bpf_map_area_free(st_map);
+}
+
+static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
+{
+ if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
+ attr->map_flags || !attr->btf_vmlinux_value_type_id)
+ return -EINVAL;
+ return 0;
+}
+
+static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
+{
+ const struct bpf_struct_ops *st_ops;
+ size_t map_total_size, st_map_size;
+ struct bpf_struct_ops_map *st_map;
+ const struct btf_type *t, *vt;
+ struct bpf_map_memory mem;
+ struct bpf_map *map;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id);
+ if (!st_ops)
+ return ERR_PTR(-ENOTSUPP);
+
+ vt = st_ops->value_type;
+ if (attr->value_size != vt->size)
+ return ERR_PTR(-EINVAL);
+
+ t = st_ops->type;
+
+ st_map_size = sizeof(*st_map) +
+ /* kvalue stores the
+ * struct bpf_struct_ops_tcp_congestions_ops
+ */
+ (vt->size - sizeof(struct bpf_struct_ops_value));
+ map_total_size = st_map_size +
+ /* uvalue */
+ sizeof(vt->size) +
+ /* struct bpf_progs **progs */
+ btf_type_vlen(t) * sizeof(struct bpf_prog *);
+ err = bpf_map_charge_init(&mem, map_total_size);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE);
+ if (!st_map) {
+ bpf_map_charge_finish(&mem);
+ return ERR_PTR(-ENOMEM);
+ }
+ st_map->st_ops = st_ops;
+ map = &st_map->map;
+
+ st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
+ st_map->progs =
+ bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_prog *),
+ NUMA_NO_NODE);
+ st_map->image = bpf_jit_alloc_exec(PAGE_SIZE);
+ if (!st_map->uvalue || !st_map->progs || !st_map->image) {
+ bpf_struct_ops_map_free(map);
+ bpf_map_charge_finish(&mem);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ mutex_init(&st_map->lock);
+ set_vm_flush_reset_perms(st_map->image);
+ bpf_map_init_from_attr(map, attr);
+ bpf_map_charge_move(&map->memory, &mem);
+
+ return map;
+}
+
+const struct bpf_map_ops bpf_struct_ops_map_ops = {
+ .map_alloc_check = bpf_struct_ops_map_alloc_check,
+ .map_alloc = bpf_struct_ops_map_alloc,
+ .map_free = bpf_struct_ops_map_free,
+ .map_get_next_key = bpf_struct_ops_map_get_next_key,
+ .map_lookup_elem = bpf_struct_ops_map_lookup_elem,
+ .map_delete_elem = bpf_struct_ops_map_delete_elem,
+ .map_update_elem = bpf_struct_ops_map_update_elem,
+ .map_seq_show_elem = bpf_struct_ops_map_seq_show_elem,
+};
+
+/* "const void *" because some subsystem is
+ * passing a const (e.g. const struct tcp_congestion_ops *)
+ */
+bool bpf_struct_ops_get(const void *kdata)
+{
+ struct bpf_struct_ops_value *kvalue;
+
+ kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
+
+ return refcount_inc_not_zero(&kvalue->refcnt);
+}
+
+void bpf_struct_ops_put(const void *kdata)
+{
+ struct bpf_struct_ops_value *kvalue;
+
+ kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
+ if (refcount_dec_and_test(&kvalue->refcnt)) {
+ struct bpf_struct_ops_map *st_map;
+
+ st_map = container_of(kvalue, struct bpf_struct_ops_map,
+ kvalue);
+ bpf_map_put(&st_map->map);
+ }
+}
diff --git a/kernel/bpf/bpf_struct_ops_types.h b/kernel/bpf/bpf_struct_ops_types.h
new file mode 100644
index 000000000000..066d83ea1c99
--- /dev/null
+++ b/kernel/bpf/bpf_struct_ops_types.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* internal file - do not include directly */
+
+#ifdef CONFIG_BPF_JIT
+#ifdef CONFIG_INET
+#include <net/tcp.h>
+BPF_STRUCT_OPS_TYPE(tcp_congestion_ops)
+#endif
+#endif
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 7d40da240891..b7c1660fb594 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -180,11 +180,6 @@
*/
#define BTF_MAX_SIZE (16 * 1024 * 1024)
-#define for_each_member(i, struct_type, member) \
- for (i = 0, member = btf_type_member(struct_type); \
- i < btf_type_vlen(struct_type); \
- i++, member++)
-
#define for_each_member_from(i, from, struct_type, member) \
for (i = from, member = btf_type_member(struct_type) + from; \
i < btf_type_vlen(struct_type); \
@@ -281,6 +276,11 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
[BTF_KIND_DATASEC] = "DATASEC",
};
+static const char *btf_type_str(const struct btf_type *t)
+{
+ return btf_kind_str[BTF_INFO_KIND(t->info)];
+}
+
struct btf_kind_operations {
s32 (*check_meta)(struct btf_verifier_env *env,
const struct btf_type *t,
@@ -382,6 +382,65 @@ static bool btf_type_is_datasec(const struct btf_type *t)
return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
}
+s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind)
+{
+ const struct btf_type *t;
+ const char *tname;
+ u32 i;
+
+ for (i = 1; i <= btf->nr_types; i++) {
+ t = btf->types[i];
+ if (BTF_INFO_KIND(t->info) != kind)
+ continue;
+
+ tname = btf_name_by_offset(btf, t->name_off);
+ if (!strcmp(tname, name))
+ return i;
+ }
+
+ return -ENOENT;
+}
+
+const struct btf_type *btf_type_skip_modifiers(const struct btf *btf,
+ u32 id, u32 *res_id)
+{
+ const struct btf_type *t = btf_type_by_id(btf, id);
+
+ while (btf_type_is_modifier(t)) {
+ id = t->type;
+ t = btf_type_by_id(btf, t->type);
+ }
+
+ if (res_id)
+ *res_id = id;
+
+ return t;
+}
+
+const struct btf_type *btf_type_resolve_ptr(const struct btf *btf,
+ u32 id, u32 *res_id)
+{
+ const struct btf_type *t;
+
+ t = btf_type_skip_modifiers(btf, id, NULL);
+ if (!btf_type_is_ptr(t))
+ return NULL;
+
+ return btf_type_skip_modifiers(btf, t->type, res_id);
+}
+
+const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf,
+ u32 id, u32 *res_id)
+{
+ const struct btf_type *ptype;
+
+ ptype = btf_type_resolve_ptr(btf, id, res_id);
+ if (ptype && btf_type_is_func_proto(ptype))
+ return ptype;
+
+ return NULL;
+}
+
/* Types that act only as a source, not sink or intermediate
* type when resolving.
*/
@@ -446,30 +505,6 @@ static const char *btf_int_encoding_str(u8 encoding)
return "UNKN";
}
-static u16 btf_type_vlen(const struct btf_type *t)
-{
- return BTF_INFO_VLEN(t->info);
-}
-
-static bool btf_type_kflag(const struct btf_type *t)
-{
- return BTF_INFO_KFLAG(t->info);
-}
-
-static u32 btf_member_bit_offset(const struct btf_type *struct_type,
- const struct btf_member *member)
-{
- return btf_type_kflag(struct_type) ? BTF_MEMBER_BIT_OFFSET(member->offset)
- : member->offset;
-}
-
-static u32 btf_member_bitfield_size(const struct btf_type *struct_type,
- const struct btf_member *member)
-{
- return btf_type_kflag(struct_type) ? BTF_MEMBER_BITFIELD_SIZE(member->offset)
- : 0;
-}
-
static u32 btf_type_int(const struct btf_type *t)
{
return *(u32 *)(t + 1);
@@ -480,11 +515,6 @@ static const struct btf_array *btf_type_array(const struct btf_type *t)
return (const struct btf_array *)(t + 1);
}
-static const struct btf_member *btf_type_member(const struct btf_type *t)
-{
- return (const struct btf_member *)(t + 1);
-}
-
static const struct btf_enum *btf_type_enum(const struct btf_type *t)
{
return (const struct btf_enum *)(t + 1);
@@ -1057,7 +1087,7 @@ static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env)
* *elem_type: same as return type ("struct X")
* *total_nelems: 1
*/
-static const struct btf_type *
+const struct btf_type *
btf_resolve_size(const struct btf *btf, const struct btf_type *type,
u32 *type_size, const struct btf_type **elem_type,
u32 *total_nelems)
@@ -1111,8 +1141,10 @@ resolved:
return ERR_PTR(-EINVAL);
*type_size = nelems * size;
- *total_nelems = nelems;
- *elem_type = type;
+ if (total_nelems)
+ *total_nelems = nelems;
+ if (elem_type)
+ *elem_type = type;
return array_type ? : type;
}
@@ -1826,7 +1858,10 @@ static void btf_modifier_seq_show(const struct btf *btf,
u32 type_id, void *data,
u8 bits_offset, struct seq_file *m)
{
- t = btf_type_id_resolve(btf, &type_id);
+ if (btf->resolved_ids)
+ t = btf_type_id_resolve(btf, &type_id);
+ else
+ t = btf_type_skip_modifiers(btf, type_id, NULL);
btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m);
}
@@ -2621,8 +2656,8 @@ static s32 btf_func_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
- if (btf_type_vlen(t)) {
- btf_verifier_log_type(env, t, "vlen != 0");
+ if (btf_type_vlen(t) > BTF_FUNC_GLOBAL) {
+ btf_verifier_log_type(env, t, "Invalid func linkage");
return -EINVAL;
}
@@ -3470,12 +3505,14 @@ static u8 bpf_ctx_convert_map[] = {
[_id] = __ctx_convert##_id,
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
+ 0, /* avoid empty array */
};
#undef BPF_MAP_TYPE
static const struct btf_member *
btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf,
- const struct btf_type *t, enum bpf_prog_type prog_type)
+ const struct btf_type *t, enum bpf_prog_type prog_type,
+ int arg)
{
const struct btf_type *conv_struct;
const struct btf_type *ctx_struct;
@@ -3496,12 +3533,13 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf,
* is not supported yet.
* BPF_PROG_TYPE_RAW_TRACEPOINT is fine.
*/
- bpf_log(log, "BPF program ctx type is not a struct\n");
+ if (log->level & BPF_LOG_LEVEL)
+ bpf_log(log, "arg#%d type is not a struct\n", arg);
return NULL;
}
tname = btf_name_by_offset(btf, t->name_off);
if (!tname) {
- bpf_log(log, "BPF program ctx struct doesn't have a name\n");
+ bpf_log(log, "arg#%d struct doesn't have a name\n", arg);
return NULL;
}
/* prog_type is valid bpf program type. No need for bounds check. */
@@ -3534,11 +3572,12 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf,
static int btf_translate_to_vmlinux(struct bpf_verifier_log *log,
struct btf *btf,
const struct btf_type *t,
- enum bpf_prog_type prog_type)
+ enum bpf_prog_type prog_type,
+ int arg)
{
const struct btf_member *prog_ctx_type, *kern_ctx_type;
- prog_ctx_type = btf_get_prog_ctx_type(log, btf, t, prog_type);
+ prog_ctx_type = btf_get_prog_ctx_type(log, btf, t, prog_type, arg);
if (!prog_ctx_type)
return -ENOENT;
kern_ctx_type = prog_ctx_type + 1;
@@ -3604,6 +3643,8 @@ struct btf *btf_parse_vmlinux(void)
goto errout;
}
+ bpf_struct_ops_init(btf);
+
btf_verifier_env_free(env);
refcount_set(&btf->refcnt, 1);
return btf;
@@ -3628,6 +3669,19 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog)
}
}
+static bool is_string_ptr(struct btf *btf, const struct btf_type *t)
+{
+ /* t comes in already as a pointer */
+ t = btf_type_by_id(btf, t->type);
+
+ /* allow const */
+ if (BTF_INFO_KIND(t->info) == BTF_KIND_CONST)
+ t = btf_type_by_id(btf, t->type);
+
+ /* char, signed char, unsigned char */
+ return btf_type_is_int(t) && t->size == 1;
+}
+
bool btf_ctx_access(int off, int size, enum bpf_access_type type,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
@@ -3676,7 +3730,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
/* skip modifiers */
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
- if (btf_type_is_int(t))
+ if (btf_type_is_int(t) || btf_type_is_enum(t))
/* accessing a scalar */
return true;
if (!btf_type_is_ptr(t)) {
@@ -3694,12 +3748,14 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
*/
return true;
+ if (is_string_ptr(btf, t))
+ return true;
+
/* this is a pointer to another type */
info->reg_type = PTR_TO_BTF_ID;
- info->btf_id = t->type;
if (tgt_prog) {
- ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type);
+ ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg);
if (ret > 0) {
info->btf_id = ret;
return true;
@@ -3707,10 +3763,14 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
return false;
}
}
+
+ info->btf_id = t->type;
t = btf_type_by_id(btf, t->type);
/* skip modifiers */
- while (btf_type_is_modifier(t))
+ while (btf_type_is_modifier(t)) {
+ info->btf_id = t->type;
t = btf_type_by_id(btf, t->type);
+ }
if (!btf_type_is_struct(t)) {
bpf_log(log,
"func '%s' arg%d type %s is not a struct\n",
@@ -3736,23 +3796,57 @@ int btf_struct_access(struct bpf_verifier_log *log,
again:
tname = __btf_name_by_offset(btf_vmlinux, t->name_off);
if (!btf_type_is_struct(t)) {
- bpf_log(log, "Type '%s' is not a struct", tname);
+ bpf_log(log, "Type '%s' is not a struct\n", tname);
return -EINVAL;
}
- for_each_member(i, t, member) {
- if (btf_member_bitfield_size(t, member))
- /* bitfields are not supported yet */
- continue;
+ if (off + size > t->size) {
+ bpf_log(log, "access beyond struct %s at off %u size %u\n",
+ tname, off, size);
+ return -EACCES;
+ }
+ for_each_member(i, t, member) {
/* offset of the field in bytes */
moff = btf_member_bit_offset(t, member) / 8;
if (off + size <= moff)
/* won't find anything, field is already too far */
break;
+
+ if (btf_member_bitfield_size(t, member)) {
+ u32 end_bit = btf_member_bit_offset(t, member) +
+ btf_member_bitfield_size(t, member);
+
+ /* off <= moff instead of off == moff because clang
+ * does not generate a BTF member for anonymous
+ * bitfield like the ":16" here:
+ * struct {
+ * int :16;
+ * int x:8;
+ * };
+ */
+ if (off <= moff &&
+ BITS_ROUNDUP_BYTES(end_bit) <= off + size)
+ return SCALAR_VALUE;
+
+ /* off may be accessing a following member
+ *
+ * or
+ *
+ * Doing partial access at either end of this
+ * bitfield. Continue on this case also to
+ * treat it as not accessing this bitfield
+ * and eventually error out as field not
+ * found to keep it simple.
+ * It could be relaxed if there was a legit
+ * partial access case later.
+ */
+ continue;
+ }
+
/* In case of "off" is pointing to holes of a struct */
if (off < moff)
- continue;
+ break;
/* type of the field */
mtype = btf_type_by_id(btf_vmlinux, member->type);
@@ -4042,11 +4136,158 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
return 0;
}
-int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog)
+/* Compare BTFs of two functions assuming only scalars and pointers to context.
+ * t1 points to BTF_KIND_FUNC in btf1
+ * t2 points to BTF_KIND_FUNC in btf2
+ * Returns:
+ * EINVAL - function prototype mismatch
+ * EFAULT - verifier bug
+ * 0 - 99% match. The last 1% is validated by the verifier.
+ */
+int btf_check_func_type_match(struct bpf_verifier_log *log,
+ struct btf *btf1, const struct btf_type *t1,
+ struct btf *btf2, const struct btf_type *t2)
+{
+ const struct btf_param *args1, *args2;
+ const char *fn1, *fn2, *s1, *s2;
+ u32 nargs1, nargs2, i;
+
+ fn1 = btf_name_by_offset(btf1, t1->name_off);
+ fn2 = btf_name_by_offset(btf2, t2->name_off);
+
+ if (btf_func_linkage(t1) != BTF_FUNC_GLOBAL) {
+ bpf_log(log, "%s() is not a global function\n", fn1);
+ return -EINVAL;
+ }
+ if (btf_func_linkage(t2) != BTF_FUNC_GLOBAL) {
+ bpf_log(log, "%s() is not a global function\n", fn2);
+ return -EINVAL;
+ }
+
+ t1 = btf_type_by_id(btf1, t1->type);
+ if (!t1 || !btf_type_is_func_proto(t1))
+ return -EFAULT;
+ t2 = btf_type_by_id(btf2, t2->type);
+ if (!t2 || !btf_type_is_func_proto(t2))
+ return -EFAULT;
+
+ args1 = (const struct btf_param *)(t1 + 1);
+ nargs1 = btf_type_vlen(t1);
+ args2 = (const struct btf_param *)(t2 + 1);
+ nargs2 = btf_type_vlen(t2);
+
+ if (nargs1 != nargs2) {
+ bpf_log(log, "%s() has %d args while %s() has %d args\n",
+ fn1, nargs1, fn2, nargs2);
+ return -EINVAL;
+ }
+
+ t1 = btf_type_skip_modifiers(btf1, t1->type, NULL);
+ t2 = btf_type_skip_modifiers(btf2, t2->type, NULL);
+ if (t1->info != t2->info) {
+ bpf_log(log,
+ "Return type %s of %s() doesn't match type %s of %s()\n",
+ btf_type_str(t1), fn1,
+ btf_type_str(t2), fn2);
+ return -EINVAL;
+ }
+
+ for (i = 0; i < nargs1; i++) {
+ t1 = btf_type_skip_modifiers(btf1, args1[i].type, NULL);
+ t2 = btf_type_skip_modifiers(btf2, args2[i].type, NULL);
+
+ if (t1->info != t2->info) {
+ bpf_log(log, "arg%d in %s() is %s while %s() has %s\n",
+ i, fn1, btf_type_str(t1),
+ fn2, btf_type_str(t2));
+ return -EINVAL;
+ }
+ if (btf_type_has_size(t1) && t1->size != t2->size) {
+ bpf_log(log,
+ "arg%d in %s() has size %d while %s() has %d\n",
+ i, fn1, t1->size,
+ fn2, t2->size);
+ return -EINVAL;
+ }
+
+ /* global functions are validated with scalars and pointers
+ * to context only. And only global functions can be replaced.
+ * Hence type check only those types.
+ */
+ if (btf_type_is_int(t1) || btf_type_is_enum(t1))
+ continue;
+ if (!btf_type_is_ptr(t1)) {
+ bpf_log(log,
+ "arg%d in %s() has unrecognized type\n",
+ i, fn1);
+ return -EINVAL;
+ }
+ t1 = btf_type_skip_modifiers(btf1, t1->type, NULL);
+ t2 = btf_type_skip_modifiers(btf2, t2->type, NULL);
+ if (!btf_type_is_struct(t1)) {
+ bpf_log(log,
+ "arg%d in %s() is not a pointer to context\n",
+ i, fn1);
+ return -EINVAL;
+ }
+ if (!btf_type_is_struct(t2)) {
+ bpf_log(log,
+ "arg%d in %s() is not a pointer to context\n",
+ i, fn2);
+ return -EINVAL;
+ }
+ /* This is an optional check to make program writing easier.
+ * Compare names of structs and report an error to the user.
+ * btf_prepare_func_args() already checked that t2 struct
+ * is a context type. btf_prepare_func_args() will check
+ * later that t1 struct is a context type as well.
+ */
+ s1 = btf_name_by_offset(btf1, t1->name_off);
+ s2 = btf_name_by_offset(btf2, t2->name_off);
+ if (strcmp(s1, s2)) {
+ bpf_log(log,
+ "arg%d %s(struct %s *) doesn't match %s(struct %s *)\n",
+ i, fn1, s1, fn2, s2);
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+/* Compare BTFs of given program with BTF of target program */
+int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog,
+ struct btf *btf2, const struct btf_type *t2)
+{
+ struct btf *btf1 = prog->aux->btf;
+ const struct btf_type *t1;
+ u32 btf_id = 0;
+
+ if (!prog->aux->func_info) {
+ bpf_log(&env->log, "Program extension requires BTF\n");
+ return -EINVAL;
+ }
+
+ btf_id = prog->aux->func_info[0].type_id;
+ if (!btf_id)
+ return -EFAULT;
+
+ t1 = btf_type_by_id(btf1, btf_id);
+ if (!t1 || !btf_type_is_func(t1))
+ return -EFAULT;
+
+ return btf_check_func_type_match(&env->log, btf1, t1, btf2, t2);
+}
+
+/* Compare BTF of a function with given bpf_reg_state.
+ * Returns:
+ * EFAULT - there is a verifier bug. Abort verification.
+ * EINVAL - there is a type mismatch or BTF is not available.
+ * 0 - BTF matches with what bpf_reg_state expects.
+ * Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
+ */
+int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
+ struct bpf_reg_state *reg)
{
- struct bpf_verifier_state *st = env->cur_state;
- struct bpf_func_state *func = st->frame[st->curframe];
- struct bpf_reg_state *reg = func->regs;
struct bpf_verifier_log *log = &env->log;
struct bpf_prog *prog = env->prog;
struct btf *btf = prog->aux->btf;
@@ -4056,27 +4297,30 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog)
const char *tname;
if (!prog->aux->func_info)
- return 0;
+ return -EINVAL;
btf_id = prog->aux->func_info[subprog].type_id;
if (!btf_id)
- return 0;
+ return -EFAULT;
if (prog->aux->func_info_aux[subprog].unreliable)
- return 0;
+ return -EINVAL;
t = btf_type_by_id(btf, btf_id);
if (!t || !btf_type_is_func(t)) {
- bpf_log(log, "BTF of subprog %d doesn't point to KIND_FUNC\n",
+ /* These checks were already done by the verifier while loading
+ * struct bpf_func_info
+ */
+ bpf_log(log, "BTF of func#%d doesn't point to KIND_FUNC\n",
subprog);
- return -EINVAL;
+ return -EFAULT;
}
tname = btf_name_by_offset(btf, t->name_off);
t = btf_type_by_id(btf, t->type);
if (!t || !btf_type_is_func_proto(t)) {
- bpf_log(log, "Invalid type of func %s\n", tname);
- return -EINVAL;
+ bpf_log(log, "Invalid BTF of func %s\n", tname);
+ return -EFAULT;
}
args = (const struct btf_param *)(t + 1);
nargs = btf_type_vlen(t);
@@ -4102,25 +4346,130 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog)
bpf_log(log, "R%d is not a pointer\n", i + 1);
goto out;
}
- /* If program is passing PTR_TO_CTX into subprogram
- * check that BTF type matches.
+ /* If function expects ctx type in BTF check that caller
+ * is passing PTR_TO_CTX.
*/
- if (reg[i + 1].type == PTR_TO_CTX &&
- !btf_get_prog_ctx_type(log, btf, t, prog->type))
- goto out;
- /* All other pointers are ok */
- continue;
+ if (btf_get_prog_ctx_type(log, btf, t, prog->type, i)) {
+ if (reg[i + 1].type != PTR_TO_CTX) {
+ bpf_log(log,
+ "arg#%d expected pointer to ctx, but got %s\n",
+ i, btf_kind_str[BTF_INFO_KIND(t->info)]);
+ goto out;
+ }
+ if (check_ctx_reg(env, &reg[i + 1], i + 1))
+ goto out;
+ continue;
+ }
}
- bpf_log(log, "Unrecognized argument type %s\n",
- btf_kind_str[BTF_INFO_KIND(t->info)]);
+ bpf_log(log, "Unrecognized arg#%d type %s\n",
+ i, btf_kind_str[BTF_INFO_KIND(t->info)]);
goto out;
}
return 0;
out:
- /* LLVM optimizations can remove arguments from static functions. */
- bpf_log(log,
- "Type info disagrees with actual arguments due to compiler optimizations\n");
+ /* Compiler optimizations can remove arguments from static functions
+ * or mismatched type can be passed into a global function.
+ * In such cases mark the function as unreliable from BTF point of view.
+ */
prog->aux->func_info_aux[subprog].unreliable = true;
+ return -EINVAL;
+}
+
+/* Convert BTF of a function into bpf_reg_state if possible
+ * Returns:
+ * EFAULT - there is a verifier bug. Abort verification.
+ * EINVAL - cannot convert BTF.
+ * 0 - Successfully converted BTF into bpf_reg_state
+ * (either PTR_TO_CTX or SCALAR_VALUE).
+ */
+int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
+ struct bpf_reg_state *reg)
+{
+ struct bpf_verifier_log *log = &env->log;
+ struct bpf_prog *prog = env->prog;
+ enum bpf_prog_type prog_type = prog->type;
+ struct btf *btf = prog->aux->btf;
+ const struct btf_param *args;
+ const struct btf_type *t;
+ u32 i, nargs, btf_id;
+ const char *tname;
+
+ if (!prog->aux->func_info ||
+ prog->aux->func_info_aux[subprog].linkage != BTF_FUNC_GLOBAL) {
+ bpf_log(log, "Verifier bug\n");
+ return -EFAULT;
+ }
+
+ btf_id = prog->aux->func_info[subprog].type_id;
+ if (!btf_id) {
+ bpf_log(log, "Global functions need valid BTF\n");
+ return -EFAULT;
+ }
+
+ t = btf_type_by_id(btf, btf_id);
+ if (!t || !btf_type_is_func(t)) {
+ /* These checks were already done by the verifier while loading
+ * struct bpf_func_info
+ */
+ bpf_log(log, "BTF of func#%d doesn't point to KIND_FUNC\n",
+ subprog);
+ return -EFAULT;
+ }
+ tname = btf_name_by_offset(btf, t->name_off);
+
+ if (log->level & BPF_LOG_LEVEL)
+ bpf_log(log, "Validating %s() func#%d...\n",
+ tname, subprog);
+
+ if (prog->aux->func_info_aux[subprog].unreliable) {
+ bpf_log(log, "Verifier bug in function %s()\n", tname);
+ return -EFAULT;
+ }
+ if (prog_type == BPF_PROG_TYPE_EXT)
+ prog_type = prog->aux->linked_prog->type;
+
+ t = btf_type_by_id(btf, t->type);
+ if (!t || !btf_type_is_func_proto(t)) {
+ bpf_log(log, "Invalid type of function %s()\n", tname);
+ return -EFAULT;
+ }
+ args = (const struct btf_param *)(t + 1);
+ nargs = btf_type_vlen(t);
+ if (nargs > 5) {
+ bpf_log(log, "Global function %s() with %d > 5 args. Buggy compiler.\n",
+ tname, nargs);
+ return -EINVAL;
+ }
+ /* check that function returns int */
+ t = btf_type_by_id(btf, t->type);
+ while (btf_type_is_modifier(t))
+ t = btf_type_by_id(btf, t->type);
+ if (!btf_type_is_int(t) && !btf_type_is_enum(t)) {
+ bpf_log(log,
+ "Global function %s() doesn't return scalar. Only those are supported.\n",
+ tname);
+ return -EINVAL;
+ }
+ /* Convert BTF function arguments into verifier types.
+ * Only PTR_TO_CTX and SCALAR are supported atm.
+ */
+ for (i = 0; i < nargs; i++) {
+ t = btf_type_by_id(btf, args[i].type);
+ while (btf_type_is_modifier(t))
+ t = btf_type_by_id(btf, t->type);
+ if (btf_type_is_int(t) || btf_type_is_enum(t)) {
+ reg[i + 1].type = SCALAR_VALUE;
+ continue;
+ }
+ if (btf_type_is_ptr(t) &&
+ btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
+ reg[i + 1].type = PTR_TO_CTX;
+ continue;
+ }
+ bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n",
+ i, btf_kind_str[BTF_INFO_KIND(t->info)], tname);
+ return -EINVAL;
+ }
return 0;
}
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 9f90d3c92bda..9a500fadbef5 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -35,8 +35,8 @@ void cgroup_bpf_offline(struct cgroup *cgrp)
*/
static void cgroup_bpf_release(struct work_struct *work)
{
- struct cgroup *cgrp = container_of(work, struct cgroup,
- bpf.release_work);
+ struct cgroup *p, *cgrp = container_of(work, struct cgroup,
+ bpf.release_work);
enum bpf_cgroup_storage_type stype;
struct bpf_prog_array *old_array;
unsigned int type;
@@ -65,6 +65,9 @@ static void cgroup_bpf_release(struct work_struct *work)
mutex_unlock(&cgroup_mutex);
+ for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
+ cgroup_bpf_put(p);
+
percpu_ref_exit(&cgrp->bpf.refcnt);
cgroup_put(cgrp);
}
@@ -103,8 +106,7 @@ static u32 prog_list_length(struct list_head *head)
* if parent has overridable or multi-prog, allow attaching
*/
static bool hierarchy_allows_attach(struct cgroup *cgrp,
- enum bpf_attach_type type,
- u32 new_flags)
+ enum bpf_attach_type type)
{
struct cgroup *p;
@@ -199,6 +201,7 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
*/
#define NR ARRAY_SIZE(cgrp->bpf.effective)
struct bpf_prog_array *arrays[NR] = {};
+ struct cgroup *p;
int ret, i;
ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
@@ -206,6 +209,9 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
if (ret)
return ret;
+ for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
+ cgroup_bpf_get(p);
+
for (i = 0; i < NR; i++)
INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
@@ -283,31 +289,34 @@ cleanup:
* propagate the change to descendants
* @cgrp: The cgroup which descendants to traverse
* @prog: A program to attach
+ * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set
* @type: Type of attach operation
* @flags: Option flags
*
* Must be called with cgroup_mutex held.
*/
int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+ struct bpf_prog *replace_prog,
enum bpf_attach_type type, u32 flags)
{
+ u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
struct list_head *progs = &cgrp->bpf.progs[type];
struct bpf_prog *old_prog = NULL;
struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE],
*old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL};
+ struct bpf_prog_list *pl, *replace_pl = NULL;
enum bpf_cgroup_storage_type stype;
- struct bpf_prog_list *pl;
- bool pl_was_allocated;
int err;
- if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI))
+ if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
+ ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI)))
/* invalid combination */
return -EINVAL;
- if (!hierarchy_allows_attach(cgrp, type, flags))
+ if (!hierarchy_allows_attach(cgrp, type))
return -EPERM;
- if (!list_empty(progs) && cgrp->bpf.flags[type] != flags)
+ if (!list_empty(progs) && cgrp->bpf.flags[type] != saved_flags)
/* Disallow attaching non-overridable on top
* of existing overridable in this cgroup.
* Disallow attaching multi-prog if overridable or none
@@ -317,6 +326,21 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
return -E2BIG;
+ if (flags & BPF_F_ALLOW_MULTI) {
+ list_for_each_entry(pl, progs, node) {
+ if (pl->prog == prog)
+ /* disallow attaching the same prog twice */
+ return -EINVAL;
+ if (pl->prog == replace_prog)
+ replace_pl = pl;
+ }
+ if ((flags & BPF_F_REPLACE) && !replace_pl)
+ /* prog to replace not found for cgroup */
+ return -ENOENT;
+ } else if (!list_empty(progs)) {
+ replace_pl = list_first_entry(progs, typeof(*pl), node);
+ }
+
for_each_cgroup_storage_type(stype) {
storage[stype] = bpf_cgroup_storage_alloc(prog, stype);
if (IS_ERR(storage[stype])) {
@@ -327,53 +351,28 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
}
}
- if (flags & BPF_F_ALLOW_MULTI) {
- list_for_each_entry(pl, progs, node) {
- if (pl->prog == prog) {
- /* disallow attaching the same prog twice */
- for_each_cgroup_storage_type(stype)
- bpf_cgroup_storage_free(storage[stype]);
- return -EINVAL;
- }
+ if (replace_pl) {
+ pl = replace_pl;
+ old_prog = pl->prog;
+ for_each_cgroup_storage_type(stype) {
+ old_storage[stype] = pl->storage[stype];
+ bpf_cgroup_storage_unlink(old_storage[stype]);
}
-
+ } else {
pl = kmalloc(sizeof(*pl), GFP_KERNEL);
if (!pl) {
for_each_cgroup_storage_type(stype)
bpf_cgroup_storage_free(storage[stype]);
return -ENOMEM;
}
-
- pl_was_allocated = true;
- pl->prog = prog;
- for_each_cgroup_storage_type(stype)
- pl->storage[stype] = storage[stype];
list_add_tail(&pl->node, progs);
- } else {
- if (list_empty(progs)) {
- pl = kmalloc(sizeof(*pl), GFP_KERNEL);
- if (!pl) {
- for_each_cgroup_storage_type(stype)
- bpf_cgroup_storage_free(storage[stype]);
- return -ENOMEM;
- }
- pl_was_allocated = true;
- list_add_tail(&pl->node, progs);
- } else {
- pl = list_first_entry(progs, typeof(*pl), node);
- old_prog = pl->prog;
- for_each_cgroup_storage_type(stype) {
- old_storage[stype] = pl->storage[stype];
- bpf_cgroup_storage_unlink(old_storage[stype]);
- }
- pl_was_allocated = false;
- }
- pl->prog = prog;
- for_each_cgroup_storage_type(stype)
- pl->storage[stype] = storage[stype];
}
- cgrp->bpf.flags[type] = flags;
+ pl->prog = prog;
+ for_each_cgroup_storage_type(stype)
+ pl->storage[stype] = storage[stype];
+
+ cgrp->bpf.flags[type] = saved_flags;
err = update_effective_progs(cgrp, type);
if (err)
@@ -401,7 +400,7 @@ cleanup:
pl->storage[stype] = old_storage[stype];
bpf_cgroup_storage_link(old_storage[stype], cgrp, type);
}
- if (pl_was_allocated) {
+ if (!replace_pl) {
list_del(&pl->node);
kfree(pl);
}
@@ -539,6 +538,7 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
int cgroup_bpf_prog_attach(const union bpf_attr *attr,
enum bpf_prog_type ptype, struct bpf_prog *prog)
{
+ struct bpf_prog *replace_prog = NULL;
struct cgroup *cgrp;
int ret;
@@ -546,8 +546,20 @@ int cgroup_bpf_prog_attach(const union bpf_attr *attr,
if (IS_ERR(cgrp))
return PTR_ERR(cgrp);
- ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
+ if ((attr->attach_flags & BPF_F_ALLOW_MULTI) &&
+ (attr->attach_flags & BPF_F_REPLACE)) {
+ replace_prog = bpf_prog_get_type(attr->replace_bpf_fd, ptype);
+ if (IS_ERR(replace_prog)) {
+ cgroup_put(cgrp);
+ return PTR_ERR(replace_prog);
+ }
+ }
+
+ ret = cgroup_bpf_attach(cgrp, prog, replace_prog, attr->attach_type,
attr->attach_flags);
+
+ if (replace_prog)
+ bpf_prog_put(replace_prog);
cgroup_put(cgrp);
return ret;
}
@@ -1341,7 +1353,7 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
*insn++ = BPF_LDX_MEM(
BPF_SIZE(si->code), si->dst_reg, si->src_reg,
bpf_target_off(struct bpf_sysctl_kern, write,
- FIELD_SIZEOF(struct bpf_sysctl_kern,
+ sizeof_field(struct bpf_sysctl_kern,
write),
target_size));
break;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 49e32acad7d8..973a20d49749 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -222,8 +222,6 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
u32 pages, delta;
int ret;
- BUG_ON(fp_old == NULL);
-
size = round_up(size, PAGE_SIZE);
pages = size / PAGE_SIZE;
if (pages <= fp_old->pages)
@@ -520,9 +518,9 @@ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp)
#ifdef CONFIG_BPF_JIT
/* All BPF JIT sysctl knobs here. */
-int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON);
+int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
+int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
int bpf_jit_harden __read_mostly;
-int bpf_jit_kallsyms __read_mostly;
long bpf_jit_limit __read_mostly;
static __always_inline void
@@ -2043,23 +2041,28 @@ static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux)
for_each_cgroup_storage_type(stype) {
if (!aux->cgroup_storage[stype])
continue;
- bpf_cgroup_storage_release(aux->prog,
- aux->cgroup_storage[stype]);
+ bpf_cgroup_storage_release(aux, aux->cgroup_storage[stype]);
}
}
-static void bpf_free_used_maps(struct bpf_prog_aux *aux)
+void __bpf_free_used_maps(struct bpf_prog_aux *aux,
+ struct bpf_map **used_maps, u32 len)
{
struct bpf_map *map;
- int i;
+ u32 i;
bpf_free_cgroup_storage(aux);
- for (i = 0; i < aux->used_map_cnt; i++) {
- map = aux->used_maps[i];
+ for (i = 0; i < len; i++) {
+ map = used_maps[i];
if (map->ops->map_poke_untrack)
map->ops->map_poke_untrack(map, aux);
bpf_map_put(map);
}
+}
+
+static void bpf_free_used_maps(struct bpf_prog_aux *aux)
+{
+ __bpf_free_used_maps(aux, aux->used_maps, aux->used_map_cnt);
kfree(aux->used_maps);
}
@@ -2134,6 +2137,7 @@ const struct bpf_func_proto bpf_map_pop_elem_proto __weak;
const struct bpf_func_proto bpf_map_peek_elem_proto __weak;
const struct bpf_func_proto bpf_spin_lock_proto __weak;
const struct bpf_func_proto bpf_spin_unlock_proto __weak;
+const struct bpf_func_proto bpf_jiffies64_proto __weak;
const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index ef49e17ae47c..70f71b154fa5 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -72,17 +72,18 @@ struct bpf_cpu_map {
struct bpf_map map;
/* Below members specific for map type */
struct bpf_cpu_map_entry **cpu_map;
- struct list_head __percpu *flush_list;
};
-static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx);
+static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list);
+
+static int bq_flush_to_queue(struct xdp_bulk_queue *bq);
static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
{
struct bpf_cpu_map *cmap;
int err = -ENOMEM;
- int ret, cpu;
u64 cost;
+ int ret;
if (!capable(CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
@@ -106,7 +107,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
/* make sure page count doesn't overflow */
cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *);
- cost += sizeof(struct list_head) * num_possible_cpus();
/* Notice returns -EPERM on if map size is larger than memlock limit */
ret = bpf_map_charge_init(&cmap->map.memory, cost);
@@ -115,23 +115,14 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
goto free_cmap;
}
- cmap->flush_list = alloc_percpu(struct list_head);
- if (!cmap->flush_list)
- goto free_charge;
-
- for_each_possible_cpu(cpu)
- INIT_LIST_HEAD(per_cpu_ptr(cmap->flush_list, cpu));
-
/* Alloc array for possible remote "destination" CPUs */
cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
sizeof(struct bpf_cpu_map_entry *),
cmap->map.numa_node);
if (!cmap->cpu_map)
- goto free_percpu;
+ goto free_charge;
return &cmap->map;
-free_percpu:
- free_percpu(cmap->flush_list);
free_charge:
bpf_map_charge_finish(&cmap->map.memory);
free_cmap:
@@ -399,22 +390,14 @@ free_rcu:
static void __cpu_map_entry_free(struct rcu_head *rcu)
{
struct bpf_cpu_map_entry *rcpu;
- int cpu;
/* This cpu_map_entry have been disconnected from map and one
- * RCU graze-period have elapsed. Thus, XDP cannot queue any
+ * RCU grace-period have elapsed. Thus, XDP cannot queue any
* new packets and cannot change/set flush_needed that can
* find this entry.
*/
rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu);
- /* Flush remaining packets in percpu bulkq */
- for_each_online_cpu(cpu) {
- struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu);
-
- /* No concurrent bq_enqueue can run at this point */
- bq_flush_to_queue(bq, false);
- }
free_percpu(rcpu->bulkq);
/* Cannot kthread_stop() here, last put free rcpu resources */
put_cpu_map_entry(rcpu);
@@ -436,7 +419,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu)
* percpu bulkq to queue. Due to caller map_delete_elem() disable
* preemption, cannot call kthread_stop() to make sure queue is empty.
* Instead a work_queue is started for stopping kthread,
- * cpu_map_kthread_stop, which waits for an RCU graze period before
+ * cpu_map_kthread_stop, which waits for an RCU grace period before
* stopping kthread, emptying the queue.
*/
static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
@@ -507,7 +490,6 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
static void cpu_map_free(struct bpf_map *map)
{
struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
- int cpu;
u32 i;
/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
@@ -522,18 +504,6 @@ static void cpu_map_free(struct bpf_map *map)
bpf_clear_redirect_map(map);
synchronize_rcu();
- /* To ensure all pending flush operations have completed wait for flush
- * list be empty on _all_ cpus. Because the above synchronize_rcu()
- * ensures the map is disconnected from the program we can assume no new
- * items will be added to the list.
- */
- for_each_online_cpu(cpu) {
- struct list_head *flush_list = per_cpu_ptr(cmap->flush_list, cpu);
-
- while (!list_empty(flush_list))
- cond_resched();
- }
-
/* For cpu_map the remote CPUs can still be using the entries
* (struct bpf_cpu_map_entry).
*/
@@ -544,10 +514,9 @@ static void cpu_map_free(struct bpf_map *map)
if (!rcpu)
continue;
- /* bq flush and cleanup happens after RCU graze-period */
+ /* bq flush and cleanup happens after RCU grace-period */
__cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
}
- free_percpu(cmap->flush_list);
bpf_map_area_free(cmap->cpu_map);
kfree(cmap);
}
@@ -599,7 +568,7 @@ const struct bpf_map_ops cpu_map_ops = {
.map_check_btf = map_check_no_btf,
};
-static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx)
+static int bq_flush_to_queue(struct xdp_bulk_queue *bq)
{
struct bpf_cpu_map_entry *rcpu = bq->obj;
unsigned int processed = 0, drops = 0;
@@ -620,10 +589,7 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx)
err = __ptr_ring_produce(q, xdpf);
if (err) {
drops++;
- if (likely(in_napi_ctx))
- xdp_return_frame_rx_napi(xdpf);
- else
- xdp_return_frame(xdpf);
+ xdp_return_frame_rx_napi(xdpf);
}
processed++;
}
@@ -642,11 +608,11 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx)
*/
static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
{
- struct list_head *flush_list = this_cpu_ptr(rcpu->cmap->flush_list);
+ struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
- bq_flush_to_queue(bq, true);
+ bq_flush_to_queue(bq);
/* Notice, xdp_buff/page MUST be queued here, long enough for
* the driver code invoking us to finish, due to driver
@@ -681,16 +647,26 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
return 0;
}
-void __cpu_map_flush(struct bpf_map *map)
+void __cpu_map_flush(void)
{
- struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
- struct list_head *flush_list = this_cpu_ptr(cmap->flush_list);
+ struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
struct xdp_bulk_queue *bq, *tmp;
list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
- bq_flush_to_queue(bq, true);
+ bq_flush_to_queue(bq);
/* If already running, costs spin_lock_irqsave + smp_mb */
wake_up_process(bq->obj->kthread);
}
}
+
+static int __init cpu_map_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ INIT_LIST_HEAD(&per_cpu(cpu_map_flush_list, cpu));
+ return 0;
+}
+
+subsys_initcall(cpu_map_init);
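For reference, the pattern this hunk moves to (a single global per-CPU flush list, initialized once at boot, fed from softirq context and drained at the end of the NAPI cycle) looks roughly like the standalone sketch below; the my_* names are illustrative and not part of the patch.

#include <linux/cpumask.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/percpu.h>

struct my_bq {
	struct list_head flush_node;
	unsigned int count;
};

static DEFINE_PER_CPU(struct list_head, my_flush_list);
static DEFINE_PER_CPU(struct my_bq, my_bq_pcpu);

static void my_enqueue(void)
{
	struct list_head *flush_list = this_cpu_ptr(&my_flush_list);
	struct my_bq *bq = this_cpu_ptr(&my_bq_pcpu);

	/* first enqueue on this CPU since the last flush */
	if (!bq->flush_node.prev)
		list_add(&bq->flush_node, flush_list);
	bq->count++;
}

static void my_flush(void)
{
	struct list_head *flush_list = this_cpu_ptr(&my_flush_list);
	struct my_bq *bq, *tmp;

	list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
		/* drain the queued work here */
		bq->count = 0;
		__list_del_clearprev(&bq->flush_node);
	}
}

static int __init my_init(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		INIT_LIST_HEAD(&per_cpu(my_flush_list, cpu));
	return 0;
}
subsys_initcall(my_init);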
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 3d3d61b5985b..58bdca5d978a 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -53,13 +53,11 @@
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
#define DEV_MAP_BULK_SIZE 16
-struct bpf_dtab_netdev;
-
-struct xdp_bulk_queue {
+struct xdp_dev_bulk_queue {
struct xdp_frame *q[DEV_MAP_BULK_SIZE];
struct list_head flush_node;
+ struct net_device *dev;
struct net_device *dev_rx;
- struct bpf_dtab_netdev *obj;
unsigned int count;
};
@@ -67,15 +65,13 @@ struct bpf_dtab_netdev {
struct net_device *dev; /* must be first member, due to tracepoint */
struct hlist_node index_hlist;
struct bpf_dtab *dtab;
- struct xdp_bulk_queue __percpu *bulkq;
struct rcu_head rcu;
- unsigned int idx; /* keep track of map index for tracepoint */
+ unsigned int idx;
};
struct bpf_dtab {
struct bpf_map map;
struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */
- struct list_head __percpu *flush_list;
struct list_head list;
/* these are only used for DEVMAP_HASH type maps */
@@ -85,6 +81,7 @@ struct bpf_dtab {
u32 n_buckets;
};
+static DEFINE_PER_CPU(struct list_head, dev_flush_list);
static DEFINE_SPINLOCK(dev_map_lock);
static LIST_HEAD(dev_map_list);
@@ -109,8 +106,8 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
{
- int err, cpu;
- u64 cost;
+ u64 cost = 0;
+ int err;
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
@@ -125,9 +122,6 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
bpf_map_init_from_attr(&dtab->map, attr);
- /* make sure page count doesn't overflow */
- cost = (u64) sizeof(struct list_head) * num_possible_cpus();
-
if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries);
@@ -143,17 +137,10 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
if (err)
return -EINVAL;
- dtab->flush_list = alloc_percpu(struct list_head);
- if (!dtab->flush_list)
- goto free_charge;
-
- for_each_possible_cpu(cpu)
- INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu));
-
if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets);
if (!dtab->dev_index_head)
- goto free_percpu;
+ goto free_charge;
spin_lock_init(&dtab->index_lock);
} else {
@@ -161,13 +148,11 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
sizeof(struct bpf_dtab_netdev *),
dtab->map.numa_node);
if (!dtab->netdev_map)
- goto free_percpu;
+ goto free_charge;
}
return 0;
-free_percpu:
- free_percpu(dtab->flush_list);
free_charge:
bpf_map_charge_finish(&dtab->map.memory);
return -ENOMEM;
@@ -201,14 +186,16 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
static void dev_map_free(struct bpf_map *map)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
- int i, cpu;
+ int i;
/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
* so the programs (can be more than one that used this map) were
- * disconnected from events. Wait for outstanding critical sections in
- * these programs to complete. The rcu critical section only guarantees
- * no further reads against netdev_map. It does __not__ ensure pending
- * flush operations (if any) are complete.
+ * disconnected from events. The following synchronize_rcu() guarantees
+ * that outstanding rcu read critical sections have completed and waits
+ * for preempt-disable regions (NAPI being the relevant context here), so
+ * we are certain there will be no further reads against the netdev_map
+ * and that all flush operations are complete. Flush operations can only
+ * be done from NAPI context for this reason.
*/
spin_lock(&dev_map_lock);
@@ -221,18 +208,6 @@ static void dev_map_free(struct bpf_map *map)
/* Make sure prior __dev_map_entry_free() have completed. */
rcu_barrier();
- /* To ensure all pending flush operations have completed wait for flush
- * list to empty on _all_ cpus.
- * Because the above synchronize_rcu() ensures the map is disconnected
- * from the program we can assume no new items will be added.
- */
- for_each_online_cpu(cpu) {
- struct list_head *flush_list = per_cpu_ptr(dtab->flush_list, cpu);
-
- while (!list_empty(flush_list))
- cond_resched();
- }
-
if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
for (i = 0; i < dtab->n_buckets; i++) {
struct bpf_dtab_netdev *dev;
@@ -243,7 +218,6 @@ static void dev_map_free(struct bpf_map *map)
hlist_for_each_entry_safe(dev, next, head, index_hlist) {
hlist_del_rcu(&dev->index_hlist);
- free_percpu(dev->bulkq);
dev_put(dev->dev);
kfree(dev);
}
@@ -258,7 +232,6 @@ static void dev_map_free(struct bpf_map *map)
if (!dev)
continue;
- free_percpu(dev->bulkq);
dev_put(dev->dev);
kfree(dev);
}
@@ -266,7 +239,6 @@ static void dev_map_free(struct bpf_map *map)
bpf_map_area_free(dtab->netdev_map);
}
- free_percpu(dtab->flush_list);
kfree(dtab);
}
@@ -293,7 +265,8 @@ struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
struct hlist_head *head = dev_map_index_hash(dtab, key);
struct bpf_dtab_netdev *dev;
- hlist_for_each_entry_rcu(dev, head, index_hlist)
+ hlist_for_each_entry_rcu(dev, head, index_hlist,
+ lockdep_is_held(&dtab->index_lock))
if (dev->idx == key)
return dev;
@@ -345,11 +318,9 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
return -ENOENT;
}
-static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags,
- bool in_napi_ctx)
+static int bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
{
- struct bpf_dtab_netdev *obj = bq->obj;
- struct net_device *dev = obj->dev;
+ struct net_device *dev = bq->dev;
int sent = 0, drops = 0, err = 0;
int i;
@@ -372,8 +343,7 @@ static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags,
out:
bq->count = 0;
- trace_xdp_devmap_xmit(&obj->dtab->map, obj->idx,
- sent, drops, bq->dev_rx, dev, err);
+ trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err);
bq->dev_rx = NULL;
__list_del_clearprev(&bq->flush_node);
return 0;
@@ -384,33 +354,29 @@ error:
for (i = 0; i < bq->count; i++) {
struct xdp_frame *xdpf = bq->q[i];
- /* RX path under NAPI protection, can return frames faster */
- if (likely(in_napi_ctx))
- xdp_return_frame_rx_napi(xdpf);
- else
- xdp_return_frame(xdpf);
+ xdp_return_frame_rx_napi(xdpf);
drops++;
}
goto out;
}
-/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
+/* __dev_flush is called from xdp_do_flush() which _must_ be signaled
* from the driver before returning from its napi->poll() routine. The poll()
* routine is called either from busy_poll context or net_rx_action signaled
* from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
* net device can be torn down. On devmap tear down we ensure the flush list
* is empty before completing to ensure all flush operations have completed.
+ * When drivers update the bpf program they may need to ensure any flush ops
+ * are also complete. Using synchronize_rcu or call_rcu will suffice for this
+ * because both wait for napi context to exit.
*/
-void __dev_map_flush(struct bpf_map *map)
+void __dev_flush(void)
{
- struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
- struct list_head *flush_list = this_cpu_ptr(dtab->flush_list);
- struct xdp_bulk_queue *bq, *tmp;
+ struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
+ struct xdp_dev_bulk_queue *bq, *tmp;
- rcu_read_lock();
list_for_each_entry_safe(bq, tmp, flush_list, flush_node)
- bq_xmit_all(bq, XDP_XMIT_FLUSH, true);
- rcu_read_unlock();
+ bq_xmit_all(bq, XDP_XMIT_FLUSH);
}
/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
@@ -432,15 +398,14 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
/* Runs under RCU-read-side, plus in softirq under NAPI protection.
* Thus, safe percpu variable access.
*/
-static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
+static int bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
struct net_device *dev_rx)
-
{
- struct list_head *flush_list = this_cpu_ptr(obj->dtab->flush_list);
- struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
+ struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
+ struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
- bq_xmit_all(bq, 0, true);
+ bq_xmit_all(bq, 0);
/* Ingress dev_rx will be the same for all xdp_frames in
* the bulk_queue, because the bq is stored per-CPU and must be flushed
@@ -457,10 +422,9 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
return 0;
}
-int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
- struct net_device *dev_rx)
+static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
+ struct net_device *dev_rx)
{
- struct net_device *dev = dst->dev;
struct xdp_frame *xdpf;
int err;
@@ -475,7 +439,21 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
if (unlikely(!xdpf))
return -EOVERFLOW;
- return bq_enqueue(dst, xdpf, dev_rx);
+ return bq_enqueue(dev, xdpf, dev_rx);
+}
+
+int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
+ struct net_device *dev_rx)
+{
+ return __xdp_enqueue(dev, xdp, dev_rx);
+}
+
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
+ struct net_device *dev_rx)
+{
+ struct net_device *dev = dst->dev;
+
+ return __xdp_enqueue(dev, xdp, dev_rx);
}
int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
@@ -509,28 +487,11 @@ static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key)
return dev ? &dev->ifindex : NULL;
}
-static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
-{
- if (dev->dev->netdev_ops->ndo_xdp_xmit) {
- struct xdp_bulk_queue *bq;
- int cpu;
-
- rcu_read_lock();
- for_each_online_cpu(cpu) {
- bq = per_cpu_ptr(dev->bulkq, cpu);
- bq_xmit_all(bq, XDP_XMIT_FLUSH, false);
- }
- rcu_read_unlock();
- }
-}
-
static void __dev_map_entry_free(struct rcu_head *rcu)
{
struct bpf_dtab_netdev *dev;
dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
- dev_map_flush_old(dev);
- free_percpu(dev->bulkq);
dev_put(dev->dev);
kfree(dev);
}
@@ -545,12 +506,11 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key)
return -EINVAL;
/* Use call_rcu() here to ensure any rcu critical sections have
- * completed, but this does not guarantee a flush has happened
- * yet. Because driver side rcu_read_lock/unlock only protects the
- * running XDP program. However, for pending flush operations the
- * dev and ctx are stored in another per cpu map. And additionally,
- * the driver tear down ensures all soft irqs are complete before
- * removing the net device in the case of dev_put equals zero.
+ * completed, as well as any flush operations, because call_rcu
+ * also waits for the preempt-disable region (NAPI in this context)
+ * to complete. Additionally, the driver tear down ensures all
+ * soft irqs are complete before removing the net device once
+ * dev_put() has dropped the last reference.
*/
old_dev = xchg(&dtab->netdev_map[k], NULL);
if (old_dev)
@@ -585,30 +545,15 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
u32 ifindex,
unsigned int idx)
{
- gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
struct bpf_dtab_netdev *dev;
- struct xdp_bulk_queue *bq;
- int cpu;
- dev = kmalloc_node(sizeof(*dev), gfp, dtab->map.numa_node);
+ dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
+ dtab->map.numa_node);
if (!dev)
return ERR_PTR(-ENOMEM);
- dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq),
- sizeof(void *), gfp);
- if (!dev->bulkq) {
- kfree(dev);
- return ERR_PTR(-ENOMEM);
- }
-
- for_each_possible_cpu(cpu) {
- bq = per_cpu_ptr(dev->bulkq, cpu);
- bq->obj = dev;
- }
-
dev->dev = dev_get_by_index(net, ifindex);
if (!dev->dev) {
- free_percpu(dev->bulkq);
kfree(dev);
return ERR_PTR(-EINVAL);
}
@@ -768,9 +713,23 @@ static int dev_map_notification(struct notifier_block *notifier,
{
struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
struct bpf_dtab *dtab;
- int i;
+ int i, cpu;
switch (event) {
+ case NETDEV_REGISTER:
+ if (!netdev->netdev_ops->ndo_xdp_xmit || netdev->xdp_bulkq)
+ break;
+
+ /* will be freed in free_netdev() */
+ netdev->xdp_bulkq =
+ __alloc_percpu_gfp(sizeof(struct xdp_dev_bulk_queue),
+ sizeof(void *), GFP_ATOMIC);
+ if (!netdev->xdp_bulkq)
+ return NOTIFY_BAD;
+
+ for_each_possible_cpu(cpu)
+ per_cpu_ptr(netdev->xdp_bulkq, cpu)->dev = netdev;
+ break;
case NETDEV_UNREGISTER:
/* This rcu_read_lock/unlock pair is needed because
* dev_map_list is an RCU list AND to ensure a delete
@@ -810,10 +769,15 @@ static struct notifier_block dev_map_notifier = {
static int __init dev_map_init(void)
{
+ int cpu;
+
/* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */
BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) !=
offsetof(struct _bpf_dtab_netdev, dev));
register_netdevice_notifier(&dev_map_notifier);
+
+ for_each_possible_cpu(cpu)
+ INIT_LIST_HEAD(&per_cpu(dev_flush_list, cpu));
return 0;
}
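As a rough driver-side illustration of the contract described above (everything redirected during one napi->poll() pass sits on the per-CPU dev_flush_list/cpu_map_flush_list until the driver kicks it), a hypothetical poll routine would look something like the following; rx_fetch_frame() and rx_xdp_prog() are made-up driver hooks and error handling is omitted.

#include <linux/filter.h>
#include <linux/netdevice.h>
#include <net/xdp.h>

/* assumed driver-specific helpers, not real APIs */
bool rx_fetch_frame(struct napi_struct *napi, struct xdp_buff *xdp);
struct bpf_prog *rx_xdp_prog(struct napi_struct *napi);

static int hypothetical_poll(struct napi_struct *napi, int budget)
{
	struct bpf_prog *prog = rx_xdp_prog(napi);
	struct net_device *netdev = napi->dev;
	int done = 0;

	while (done < budget) {
		struct xdp_buff xdp;

		if (!rx_fetch_frame(napi, &xdp))
			break;

		switch (bpf_prog_run_xdp(prog, &xdp)) {
		case XDP_REDIRECT:
			/* may park a bulk queue on this CPU's flush list */
			xdp_do_redirect(netdev, &xdp, prog);
			break;
		default:
			/* other verdicts handled as usual (omitted) */
			break;
		}
		done++;
	}

	/* drain this CPU's flush lists before leaving NAPI context */
	xdp_do_flush();

	if (done < budget)
		napi_complete_done(napi, done);
	return done;
}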
diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c
new file mode 100644
index 000000000000..b3e5b214fed8
--- /dev/null
+++ b/kernel/bpf/dispatcher.c
@@ -0,0 +1,158 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2019 Intel Corporation. */
+
+#include <linux/hash.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+
+/* The BPF dispatcher is a multiway branch code generator. The
+ * dispatcher is a mechanism to avoid the performance penalty of an
+ * indirect call, which is expensive when retpolines are enabled. A
+ * dispatch client registers a BPF program into the dispatcher, and if
+ * there is available room in the dispatcher a direct call to the BPF
+ * program will be generated. All calls to the BPF programs called via
+ * the dispatcher will then be a direct call, instead of an
+ * indirect one. The dispatcher hijacks a trampoline function via the
+ * __fentry__ of the trampoline. The trampoline function has the
+ * following signature:
+ *
+ * unsigned int trampoline(const void *ctx, const struct bpf_insn *insnsi,
+ * unsigned int (*bpf_func)(const void *,
+ * const struct bpf_insn *));
+ */
+
+static struct bpf_dispatcher_prog *bpf_dispatcher_find_prog(
+ struct bpf_dispatcher *d, struct bpf_prog *prog)
+{
+ int i;
+
+ for (i = 0; i < BPF_DISPATCHER_MAX; i++) {
+ if (prog == d->progs[i].prog)
+ return &d->progs[i];
+ }
+ return NULL;
+}
+
+static struct bpf_dispatcher_prog *bpf_dispatcher_find_free(
+ struct bpf_dispatcher *d)
+{
+ return bpf_dispatcher_find_prog(d, NULL);
+}
+
+static bool bpf_dispatcher_add_prog(struct bpf_dispatcher *d,
+ struct bpf_prog *prog)
+{
+ struct bpf_dispatcher_prog *entry;
+
+ if (!prog)
+ return false;
+
+ entry = bpf_dispatcher_find_prog(d, prog);
+ if (entry) {
+ refcount_inc(&entry->users);
+ return false;
+ }
+
+ entry = bpf_dispatcher_find_free(d);
+ if (!entry)
+ return false;
+
+ bpf_prog_inc(prog);
+ entry->prog = prog;
+ refcount_set(&entry->users, 1);
+ d->num_progs++;
+ return true;
+}
+
+static bool bpf_dispatcher_remove_prog(struct bpf_dispatcher *d,
+ struct bpf_prog *prog)
+{
+ struct bpf_dispatcher_prog *entry;
+
+ if (!prog)
+ return false;
+
+ entry = bpf_dispatcher_find_prog(d, prog);
+ if (!entry)
+ return false;
+
+ if (refcount_dec_and_test(&entry->users)) {
+ entry->prog = NULL;
+ bpf_prog_put(prog);
+ d->num_progs--;
+ return true;
+ }
+ return false;
+}
+
+int __weak arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs)
+{
+ return -ENOTSUPP;
+}
+
+static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image)
+{
+ s64 ips[BPF_DISPATCHER_MAX] = {}, *ipsp = &ips[0];
+ int i;
+
+ for (i = 0; i < BPF_DISPATCHER_MAX; i++) {
+ if (d->progs[i].prog)
+ *ipsp++ = (s64)(uintptr_t)d->progs[i].prog->bpf_func;
+ }
+ return arch_prepare_bpf_dispatcher(image, &ips[0], d->num_progs);
+}
+
+static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs)
+{
+ void *old, *new;
+ u32 noff;
+ int err;
+
+ if (!prev_num_progs) {
+ old = NULL;
+ noff = 0;
+ } else {
+ old = d->image + d->image_off;
+ noff = d->image_off ^ (BPF_IMAGE_SIZE / 2);
+ }
+
+ new = d->num_progs ? d->image + noff : NULL;
+ if (new) {
+ if (bpf_dispatcher_prepare(d, new))
+ return;
+ }
+
+ err = bpf_arch_text_poke(d->func, BPF_MOD_JUMP, old, new);
+ if (err || !new)
+ return;
+
+ d->image_off = noff;
+}
+
+void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
+ struct bpf_prog *to)
+{
+ bool changed = false;
+ int prev_num_progs;
+
+ if (from == to)
+ return;
+
+ mutex_lock(&d->mutex);
+ if (!d->image) {
+ d->image = bpf_image_alloc();
+ if (!d->image)
+ goto out;
+ }
+
+ prev_num_progs = d->num_progs;
+ changed |= bpf_dispatcher_remove_prog(d, from);
+ changed |= bpf_dispatcher_add_prog(d, to);
+
+ if (!changed)
+ goto out;
+
+ bpf_dispatcher_update(d, prev_num_progs);
+out:
+ mutex_unlock(&d->mutex);
+}
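Not the emitted machine code, but logically the generated dispatcher with, say, two registered programs is equivalent to the C sketch below: registered bpf_func values get retpoline-free direct calls, anything else falls back to the ordinary indirect call performed by the trampoline body (prog_a_func/prog_b_func are illustrative).

#include <linux/bpf.h>

extern unsigned int prog_a_func(const void *ctx, const struct bpf_insn *insnsi);
extern unsigned int prog_b_func(const void *ctx, const struct bpf_insn *insnsi);

static unsigned int dispatcher_equivalent(const void *ctx,
					  const struct bpf_insn *insnsi,
					  unsigned int (*bpf_func)(const void *,
								   const struct bpf_insn *))
{
	if (bpf_func == prog_a_func)
		return prog_a_func(ctx, insnsi);	/* direct call */
	if (bpf_func == prog_b_func)
		return prog_b_func(ctx, insnsi);	/* direct call */
	return bpf_func(ctx, insnsi);			/* unregistered: indirect call */
}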
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 22066a62c8c9..2d182c4ee9d9 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -17,6 +17,16 @@
(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \
BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)
+#define BATCH_OPS(_name) \
+ .map_lookup_batch = \
+ _name##_map_lookup_batch, \
+ .map_lookup_and_delete_batch = \
+ _name##_map_lookup_and_delete_batch, \
+ .map_update_batch = \
+ generic_map_update_batch, \
+ .map_delete_batch = \
+ generic_map_delete_batch
+
struct bucket {
struct hlist_nulls_head head;
raw_spinlock_t lock;
@@ -1232,6 +1242,256 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
rcu_read_unlock();
}
+static int
+__htab_map_lookup_and_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr,
+ bool do_delete, bool is_lru_map,
+ bool is_percpu)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ u32 bucket_cnt, total, key_size, value_size, roundup_key_size;
+ void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
+ void __user *uvalues = u64_to_user_ptr(attr->batch.values);
+ void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
+ void *ubatch = u64_to_user_ptr(attr->batch.in_batch);
+ u32 batch, max_count, size, bucket_size;
+ u64 elem_map_flags, map_flags;
+ struct hlist_nulls_head *head;
+ struct hlist_nulls_node *n;
+ unsigned long flags;
+ struct htab_elem *l;
+ struct bucket *b;
+ int ret = 0;
+
+ elem_map_flags = attr->batch.elem_flags;
+ if ((elem_map_flags & ~BPF_F_LOCK) ||
+ ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
+ return -EINVAL;
+
+ map_flags = attr->batch.flags;
+ if (map_flags)
+ return -EINVAL;
+
+ max_count = attr->batch.count;
+ if (!max_count)
+ return 0;
+
+ if (put_user(0, &uattr->batch.count))
+ return -EFAULT;
+
+ batch = 0;
+ if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch)))
+ return -EFAULT;
+
+ if (batch >= htab->n_buckets)
+ return -ENOENT;
+
+ key_size = htab->map.key_size;
+ roundup_key_size = round_up(htab->map.key_size, 8);
+ value_size = htab->map.value_size;
+ size = round_up(value_size, 8);
+ if (is_percpu)
+ value_size = size * num_possible_cpus();
+ total = 0;
+ /* While experimenting with hash tables with sizes ranging from 10 to
+ * 1000, it was observed that a bucket can have up to 5 entries.
+ */
+ bucket_size = 5;
+
+alloc:
+ /* We cannot do copy_from_user or copy_to_user inside
+ * the rcu_read_lock. Allocate enough space here.
+ */
+ keys = kvmalloc(key_size * bucket_size, GFP_USER | __GFP_NOWARN);
+ values = kvmalloc(value_size * bucket_size, GFP_USER | __GFP_NOWARN);
+ if (!keys || !values) {
+ ret = -ENOMEM;
+ goto after_loop;
+ }
+
+again:
+ preempt_disable();
+ this_cpu_inc(bpf_prog_active);
+ rcu_read_lock();
+again_nocopy:
+ dst_key = keys;
+ dst_val = values;
+ b = &htab->buckets[batch];
+ head = &b->head;
+ raw_spin_lock_irqsave(&b->lock, flags);
+
+ bucket_cnt = 0;
+ hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
+ bucket_cnt++;
+
+ if (bucket_cnt > (max_count - total)) {
+ if (total == 0)
+ ret = -ENOSPC;
+ raw_spin_unlock_irqrestore(&b->lock, flags);
+ rcu_read_unlock();
+ this_cpu_dec(bpf_prog_active);
+ preempt_enable();
+ goto after_loop;
+ }
+
+ if (bucket_cnt > bucket_size) {
+ bucket_size = bucket_cnt;
+ raw_spin_unlock_irqrestore(&b->lock, flags);
+ rcu_read_unlock();
+ this_cpu_dec(bpf_prog_active);
+ preempt_enable();
+ kvfree(keys);
+ kvfree(values);
+ goto alloc;
+ }
+
+ hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
+ memcpy(dst_key, l->key, key_size);
+
+ if (is_percpu) {
+ int off = 0, cpu;
+ void __percpu *pptr;
+
+ pptr = htab_elem_get_ptr(l, map->key_size);
+ for_each_possible_cpu(cpu) {
+ bpf_long_memcpy(dst_val + off,
+ per_cpu_ptr(pptr, cpu), size);
+ off += size;
+ }
+ } else {
+ value = l->key + roundup_key_size;
+ if (elem_map_flags & BPF_F_LOCK)
+ copy_map_value_locked(map, dst_val, value,
+ true);
+ else
+ copy_map_value(map, dst_val, value);
+ check_and_init_map_lock(map, dst_val);
+ }
+ if (do_delete) {
+ hlist_nulls_del_rcu(&l->hash_node);
+ if (is_lru_map)
+ bpf_lru_push_free(&htab->lru, &l->lru_node);
+ else
+ free_htab_elem(htab, l);
+ }
+ dst_key += key_size;
+ dst_val += value_size;
+ }
+
+ raw_spin_unlock_irqrestore(&b->lock, flags);
+ /* If we are not copying data, we can go to the next bucket and avoid
+ * unlocking the rcu read lock.
+ */
+ if (!bucket_cnt && (batch + 1 < htab->n_buckets)) {
+ batch++;
+ goto again_nocopy;
+ }
+
+ rcu_read_unlock();
+ this_cpu_dec(bpf_prog_active);
+ preempt_enable();
+ if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,
+ key_size * bucket_cnt) ||
+ copy_to_user(uvalues + total * value_size, values,
+ value_size * bucket_cnt))) {
+ ret = -EFAULT;
+ goto after_loop;
+ }
+
+ total += bucket_cnt;
+ batch++;
+ if (batch >= htab->n_buckets) {
+ ret = -ENOENT;
+ goto after_loop;
+ }
+ goto again;
+
+after_loop:
+ if (ret == -EFAULT)
+ goto out;
+
+ /* copy # of entries and next batch */
+ ubatch = u64_to_user_ptr(attr->batch.out_batch);
+ if (copy_to_user(ubatch, &batch, sizeof(batch)) ||
+ put_user(total, &uattr->batch.count))
+ ret = -EFAULT;
+
+out:
+ kvfree(keys);
+ kvfree(values);
+ return ret;
+}
+
+static int
+htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
+ false, true);
+}
+
+static int
+htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
+ false, true);
+}
+
+static int
+htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
+ false, false);
+}
+
+static int
+htab_map_lookup_and_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
+ false, false);
+}
+
+static int
+htab_lru_percpu_map_lookup_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
+ true, true);
+}
+
+static int
+htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
+ true, true);
+}
+
+static int
+htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
+ true, false);
+}
+
+static int
+htab_lru_map_lookup_and_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
+ true, false);
+}
+
const struct bpf_map_ops htab_map_ops = {
.map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
@@ -1242,6 +1502,7 @@ const struct bpf_map_ops htab_map_ops = {
.map_delete_elem = htab_map_delete_elem,
.map_gen_lookup = htab_map_gen_lookup,
.map_seq_show_elem = htab_map_seq_show_elem,
+ BATCH_OPS(htab),
};
const struct bpf_map_ops htab_lru_map_ops = {
@@ -1255,6 +1516,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_delete_elem = htab_lru_map_delete_elem,
.map_gen_lookup = htab_lru_map_gen_lookup,
.map_seq_show_elem = htab_map_seq_show_elem,
+ BATCH_OPS(htab_lru),
};
/* Called from eBPF program */
@@ -1368,6 +1630,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
.map_update_elem = htab_percpu_map_update_elem,
.map_delete_elem = htab_map_delete_elem,
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
+ BATCH_OPS(htab_percpu),
};
const struct bpf_map_ops htab_lru_percpu_map_ops = {
@@ -1379,6 +1642,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
.map_update_elem = htab_lru_percpu_map_update_elem,
.map_delete_elem = htab_lru_map_delete_elem,
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
+ BATCH_OPS(htab_lru_percpu),
};
static int fd_htab_map_alloc_check(union bpf_attr *attr)
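From user space the batch interface added above is driven through the in_batch/out_batch cookie: a NULL in_batch starts at the first bucket, out_batch returns where to resume, and -ENOENT signals that the whole map has been traversed (the final call may still have copied elements). A raw-syscall sketch, assuming a uapi linux/bpf.h that already carries this series and keys/values buffers of at least count * key_size and count * value_size bytes:

#include <errno.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int dump_map(int map_fd, void *keys, void *values, __u32 count)
{
	__u64 in = 0, out = 0;
	union bpf_attr attr;
	bool first = true;
	long err;

	memset(&attr, 0, sizeof(attr));
	attr.batch.map_fd = map_fd;
	attr.batch.out_batch = (__u64)(unsigned long)&out;
	attr.batch.keys = (__u64)(unsigned long)keys;
	attr.batch.values = (__u64)(unsigned long)values;

	do {
		attr.batch.in_batch = first ? 0 : (__u64)(unsigned long)&in;
		attr.batch.count = count;	/* in: room available, out: copied */
		err = syscall(__NR_bpf, BPF_MAP_LOOKUP_BATCH, &attr,
			      sizeof(attr));
		/* attr.batch.count elements are now in keys/values */
		in = out;
		first = false;
	} while (!err);

	return errno == ENOENT ? 0 : -errno;	/* ENOENT: whole map read */
}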
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index cada974c9f4e..d8b7b110a1c5 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -11,6 +11,7 @@
#include <linux/uidgid.h>
#include <linux/filter.h>
#include <linux/ctype.h>
+#include <linux/jiffies.h>
#include "../../lib/kstrtox.h"
@@ -312,6 +313,17 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
preempt_enable();
}
+BPF_CALL_0(bpf_jiffies64)
+{
+ return get_jiffies_64();
+}
+
+const struct bpf_func_proto bpf_jiffies64_proto = {
+ .func = bpf_jiffies64,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
#ifdef CONFIG_CGROUPS
BPF_CALL_0(bpf_get_current_cgroup_id)
{
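A minimal sketch of calling the new helper from a program, assuming libbpf headers that already know about u64 bpf_jiffies64(void) (the helper is wired up to program types elsewhere in this series); jiffies granularity is coarse, but the call is much cheaper than bpf_ktime_get_ns():

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("xdp")
int xdp_stamp(struct xdp_md *ctx)
{
	__u64 now = bpf_jiffies64();

	bpf_printk("jiffies64=%llu\n", now);
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";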
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index ecf42bec38c0..bd2fd8eab470 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -196,6 +196,7 @@ static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos)
void *key = map_iter(m)->key;
void *prev_key;
+ (*pos)++;
if (map_iter(m)->done)
return NULL;
@@ -208,8 +209,6 @@ static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos)
map_iter(m)->done = true;
return NULL;
}
-
- ++(*pos);
return key;
}
@@ -380,7 +379,7 @@ static const struct inode_operations bpf_dir_iops = {
.unlink = simple_unlink,
};
-static int bpf_obj_do_pin(const struct filename *pathname, void *raw,
+static int bpf_obj_do_pin(const char __user *pathname, void *raw,
enum bpf_type type)
{
struct dentry *dentry;
@@ -389,7 +388,7 @@ static int bpf_obj_do_pin(const struct filename *pathname, void *raw,
umode_t mode;
int ret;
- dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0);
+ dentry = user_path_create(AT_FDCWD, pathname, &path, 0);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
@@ -422,30 +421,22 @@ out:
int bpf_obj_pin_user(u32 ufd, const char __user *pathname)
{
- struct filename *pname;
enum bpf_type type;
void *raw;
int ret;
- pname = getname(pathname);
- if (IS_ERR(pname))
- return PTR_ERR(pname);
-
raw = bpf_fd_probe_obj(ufd, &type);
- if (IS_ERR(raw)) {
- ret = PTR_ERR(raw);
- goto out;
- }
+ if (IS_ERR(raw))
+ return PTR_ERR(raw);
- ret = bpf_obj_do_pin(pname, raw, type);
+ ret = bpf_obj_do_pin(pathname, raw, type);
if (ret != 0)
bpf_any_put(raw, type);
-out:
- putname(pname);
+
return ret;
}
-static void *bpf_obj_do_get(const struct filename *pathname,
+static void *bpf_obj_do_get(const char __user *pathname,
enum bpf_type *type, int flags)
{
struct inode *inode;
@@ -453,7 +444,7 @@ static void *bpf_obj_do_get(const struct filename *pathname,
void *raw;
int ret;
- ret = kern_path(pathname->name, LOOKUP_FOLLOW, &path);
+ ret = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW, &path);
if (ret)
return ERR_PTR(ret);
@@ -480,36 +471,27 @@ out:
int bpf_obj_get_user(const char __user *pathname, int flags)
{
enum bpf_type type = BPF_TYPE_UNSPEC;
- struct filename *pname;
- int ret = -ENOENT;
int f_flags;
void *raw;
+ int ret;
f_flags = bpf_get_file_flag(flags);
if (f_flags < 0)
return f_flags;
- pname = getname(pathname);
- if (IS_ERR(pname))
- return PTR_ERR(pname);
-
- raw = bpf_obj_do_get(pname, &type, f_flags);
- if (IS_ERR(raw)) {
- ret = PTR_ERR(raw);
- goto out;
- }
+ raw = bpf_obj_do_get(pathname, &type, f_flags);
+ if (IS_ERR(raw))
+ return PTR_ERR(raw);
if (type == BPF_TYPE_PROG)
ret = bpf_prog_new_fd(raw);
else if (type == BPF_TYPE_MAP)
ret = bpf_map_new_fd(raw, f_flags);
else
- goto out;
+ return -ENOENT;
if (ret < 0)
bpf_any_put(raw, type);
-out:
- putname(pname);
return ret;
}
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 2ba750725cb2..33d01866bcc2 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -20,7 +20,7 @@ struct bpf_cgroup_storage_map {
struct bpf_map map;
spinlock_t lock;
- struct bpf_prog *prog;
+ struct bpf_prog_aux *aux;
struct rb_root root;
struct list_head list;
};
@@ -357,7 +357,7 @@ static int cgroup_storage_check_btf(const struct bpf_map *map,
* The first field must be a 64 bit integer at 0 offset.
*/
m = (struct btf_member *)(key_type + 1);
- size = FIELD_SIZEOF(struct bpf_cgroup_storage_key, cgroup_inode_id);
+ size = sizeof_field(struct bpf_cgroup_storage_key, cgroup_inode_id);
if (!btf_member_is_reg_int(btf, key_type, m, 0, size))
return -EINVAL;
@@ -366,7 +366,7 @@ static int cgroup_storage_check_btf(const struct bpf_map *map,
*/
m++;
offset = offsetof(struct bpf_cgroup_storage_key, attach_type);
- size = FIELD_SIZEOF(struct bpf_cgroup_storage_key, attach_type);
+ size = sizeof_field(struct bpf_cgroup_storage_key, attach_type);
if (!btf_member_is_reg_int(btf, key_type, m, offset, size))
return -EINVAL;
@@ -420,7 +420,7 @@ const struct bpf_map_ops cgroup_storage_map_ops = {
.map_seq_show_elem = cgroup_storage_seq_show_elem,
};
-int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map)
+int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *_map)
{
enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map);
struct bpf_cgroup_storage_map *map = map_to_storage(_map);
@@ -428,14 +428,14 @@ int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map)
spin_lock_bh(&map->lock);
- if (map->prog && map->prog != prog)
+ if (map->aux && map->aux != aux)
goto unlock;
- if (prog->aux->cgroup_storage[stype] &&
- prog->aux->cgroup_storage[stype] != _map)
+ if (aux->cgroup_storage[stype] &&
+ aux->cgroup_storage[stype] != _map)
goto unlock;
- map->prog = prog;
- prog->aux->cgroup_storage[stype] = _map;
+ map->aux = aux;
+ aux->cgroup_storage[stype] = _map;
ret = 0;
unlock:
spin_unlock_bh(&map->lock);
@@ -443,16 +443,16 @@ unlock:
return ret;
}
-void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *_map)
+void bpf_cgroup_storage_release(struct bpf_prog_aux *aux, struct bpf_map *_map)
{
enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map);
struct bpf_cgroup_storage_map *map = map_to_storage(_map);
spin_lock_bh(&map->lock);
- if (map->prog == prog) {
- WARN_ON(prog->aux->cgroup_storage[stype] != _map);
- map->prog = NULL;
- prog->aux->cgroup_storage[stype] = NULL;
+ if (map->aux == aux) {
+ WARN_ON(aux->cgroup_storage[stype] != _map);
+ map->aux = NULL;
+ aux->cgroup_storage[stype] = NULL;
}
spin_unlock_bh(&map->lock);
}
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 5e9366b33f0f..b3c48d1533cb 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -22,7 +22,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
*/
if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE ||
- inner_map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
+ inner_map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE ||
+ inner_map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
fdput(f);
return ERR_PTR(-ENOTSUPP);
}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e3461ec59570..a91ad518c050 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -23,6 +23,7 @@
#include <linux/timekeeping.h>
#include <linux/ctype.h>
#include <linux/nospec.h>
+#include <linux/audit.h>
#include <uapi/linux/btf.h>
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
@@ -128,6 +129,152 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
return map;
}
+static u32 bpf_map_value_size(struct bpf_map *map)
+{
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
+ return round_up(map->value_size, 8) * num_possible_cpus();
+ else if (IS_FD_MAP(map))
+ return sizeof(u32);
+ else
+ return map->value_size;
+}
+
+static void maybe_wait_bpf_programs(struct bpf_map *map)
+{
+ /* Wait for any running BPF programs to complete so that
+ * userspace, when we return to it, knows that all programs
+ * that could be running use the new map value.
+ */
+ if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS ||
+ map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
+ synchronize_rcu();
+}
+
+static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
+ void *value, __u64 flags)
+{
+ int err;
+
+ /* Need to create a kthread, thus must support schedule */
+ if (bpf_map_is_dev_bound(map)) {
+ return bpf_map_offload_update_elem(map, key, value, flags);
+ } else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
+ map->map_type == BPF_MAP_TYPE_SOCKHASH ||
+ map->map_type == BPF_MAP_TYPE_SOCKMAP ||
+ map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
+ return map->ops->map_update_elem(map, key, value, flags);
+ } else if (IS_FD_PROG_ARRAY(map)) {
+ return bpf_fd_array_map_update_elem(map, f.file, key, value,
+ flags);
+ }
+
+ /* must increment bpf_prog_active to avoid kprobe+bpf triggering from
+ * inside bpf map update or delete otherwise deadlocks are possible
+ */
+ preempt_disable();
+ __this_cpu_inc(bpf_prog_active);
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+ err = bpf_percpu_hash_update(map, key, value, flags);
+ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+ err = bpf_percpu_array_update(map, key, value, flags);
+ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
+ err = bpf_percpu_cgroup_storage_update(map, key, value,
+ flags);
+ } else if (IS_FD_ARRAY(map)) {
+ rcu_read_lock();
+ err = bpf_fd_array_map_update_elem(map, f.file, key, value,
+ flags);
+ rcu_read_unlock();
+ } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
+ rcu_read_lock();
+ err = bpf_fd_htab_map_update_elem(map, f.file, key, value,
+ flags);
+ rcu_read_unlock();
+ } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
+ /* rcu_read_lock() is not needed */
+ err = bpf_fd_reuseport_array_update_elem(map, key, value,
+ flags);
+ } else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
+ map->map_type == BPF_MAP_TYPE_STACK) {
+ err = map->ops->map_push_elem(map, value, flags);
+ } else {
+ rcu_read_lock();
+ err = map->ops->map_update_elem(map, key, value, flags);
+ rcu_read_unlock();
+ }
+ __this_cpu_dec(bpf_prog_active);
+ preempt_enable();
+ maybe_wait_bpf_programs(map);
+
+ return err;
+}
+
+static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
+ __u64 flags)
+{
+ void *ptr;
+ int err;
+
+ if (bpf_map_is_dev_bound(map))
+ return bpf_map_offload_lookup_elem(map, key, value);
+
+ preempt_disable();
+ this_cpu_inc(bpf_prog_active);
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+ err = bpf_percpu_hash_copy(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+ err = bpf_percpu_array_copy(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
+ err = bpf_percpu_cgroup_storage_copy(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
+ err = bpf_stackmap_copy(map, key, value);
+ } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) {
+ err = bpf_fd_array_map_lookup_elem(map, key, value);
+ } else if (IS_FD_HASH(map)) {
+ err = bpf_fd_htab_map_lookup_elem(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
+ err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
+ map->map_type == BPF_MAP_TYPE_STACK) {
+ err = map->ops->map_peek_elem(map, value);
+ } else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
+ /* struct_ops map requires directly updating "value" */
+ err = bpf_struct_ops_map_sys_lookup_elem(map, key, value);
+ } else {
+ rcu_read_lock();
+ if (map->ops->map_lookup_elem_sys_only)
+ ptr = map->ops->map_lookup_elem_sys_only(map, key);
+ else
+ ptr = map->ops->map_lookup_elem(map, key);
+ if (IS_ERR(ptr)) {
+ err = PTR_ERR(ptr);
+ } else if (!ptr) {
+ err = -ENOENT;
+ } else {
+ err = 0;
+ if (flags & BPF_F_LOCK)
+ /* lock 'ptr' and copy everything but lock */
+ copy_map_value_locked(map, value, ptr, true);
+ else
+ copy_map_value(map, value, ptr);
+ /* mask lock, since value wasn't zero inited */
+ check_and_init_map_lock(map, value);
+ }
+ rcu_read_unlock();
+ }
+
+ this_cpu_dec(bpf_prog_active);
+ preempt_enable();
+ maybe_wait_bpf_programs(map);
+
+ return err;
+}
+
static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
{
/* We really just want to fail instead of triggering OOM killer
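The per-CPU sizing rule in bpf_map_value_size() has a direct user-space consequence: lookups on per-CPU maps must supply round_up(value_size, 8) bytes for every possible CPU. A sketch using libbpf (bpf_map_lookup_elem() and libbpf_num_possible_cpus() are existing libbpf calls):

#include <errno.h>
#include <stdlib.h>
#include <linux/types.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

static int read_percpu_value(int map_fd, __u32 value_size, const void *key)
{
	int ncpus = libbpf_num_possible_cpus();
	size_t stride = (value_size + 7) & ~(size_t)7;	/* round_up(value_size, 8) */
	void *buf;
	int err;

	if (ncpus < 0)
		return ncpus;
	buf = calloc(ncpus, stride);		/* one slot per possible CPU */
	if (!buf)
		return -ENOMEM;
	err = bpf_map_lookup_elem(map_fd, key, buf);
	/* buf + cpu * stride now holds that CPU's copy of the value */
	free(buf);
	return err;
}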
@@ -627,7 +774,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
return ret;
}
-#define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id
+#define BPF_MAP_CREATE_LAST_FIELD btf_vmlinux_value_type_id
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
@@ -641,6 +788,14 @@ static int map_create(union bpf_attr *attr)
if (err)
return -EINVAL;
+ if (attr->btf_vmlinux_value_type_id) {
+ if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS ||
+ attr->btf_key_type_id || attr->btf_value_type_id)
+ return -EINVAL;
+ } else if (attr->btf_key_type_id && !attr->btf_value_type_id) {
+ return -EINVAL;
+ }
+
f_flags = bpf_get_file_flag(attr->map_flags);
if (f_flags < 0)
return f_flags;
@@ -663,32 +818,35 @@ static int map_create(union bpf_attr *attr)
atomic64_set(&map->usercnt, 1);
mutex_init(&map->freeze_mutex);
- if (attr->btf_key_type_id || attr->btf_value_type_id) {
+ map->spin_lock_off = -EINVAL;
+ if (attr->btf_key_type_id || attr->btf_value_type_id ||
+ /* Even if the map's value is a kernel struct,
+ * the bpf_prog.o must have BTF to begin with
+ * to figure out the corresponding kernel
+ * counterpart. Thus, attr->btf_fd has
+ * to be valid also.
+ */
+ attr->btf_vmlinux_value_type_id) {
struct btf *btf;
- if (!attr->btf_value_type_id) {
- err = -EINVAL;
- goto free_map;
- }
-
btf = btf_get_by_fd(attr->btf_fd);
if (IS_ERR(btf)) {
err = PTR_ERR(btf);
goto free_map;
}
+ map->btf = btf;
- err = map_check_btf(map, btf, attr->btf_key_type_id,
- attr->btf_value_type_id);
- if (err) {
- btf_put(btf);
- goto free_map;
+ if (attr->btf_value_type_id) {
+ err = map_check_btf(map, btf, attr->btf_key_type_id,
+ attr->btf_value_type_id);
+ if (err)
+ goto free_map;
}
- map->btf = btf;
map->btf_key_type_id = attr->btf_key_type_id;
map->btf_value_type_id = attr->btf_value_type_id;
- } else {
- map->spin_lock_off = -EINVAL;
+ map->btf_vmlinux_value_type_id =
+ attr->btf_vmlinux_value_type_id;
}
err = security_bpf_map_alloc(map);
@@ -815,7 +973,7 @@ static int map_lookup_elem(union bpf_attr *attr)
void __user *uvalue = u64_to_user_ptr(attr->value);
int ufd = attr->map_fd;
struct bpf_map *map;
- void *key, *value, *ptr;
+ void *key, *value;
u32 value_size;
struct fd f;
int err;
@@ -847,72 +1005,14 @@ static int map_lookup_elem(union bpf_attr *attr)
goto err_put;
}
- if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
- map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
- value_size = round_up(map->value_size, 8) * num_possible_cpus();
- else if (IS_FD_MAP(map))
- value_size = sizeof(u32);
- else
- value_size = map->value_size;
+ value_size = bpf_map_value_size(map);
err = -ENOMEM;
value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
if (!value)
goto free_key;
- if (bpf_map_is_dev_bound(map)) {
- err = bpf_map_offload_lookup_elem(map, key, value);
- goto done;
- }
-
- preempt_disable();
- this_cpu_inc(bpf_prog_active);
- if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
- err = bpf_percpu_hash_copy(map, key, value);
- } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
- err = bpf_percpu_array_copy(map, key, value);
- } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
- err = bpf_percpu_cgroup_storage_copy(map, key, value);
- } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
- err = bpf_stackmap_copy(map, key, value);
- } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) {
- err = bpf_fd_array_map_lookup_elem(map, key, value);
- } else if (IS_FD_HASH(map)) {
- err = bpf_fd_htab_map_lookup_elem(map, key, value);
- } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
- err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
- } else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
- map->map_type == BPF_MAP_TYPE_STACK) {
- err = map->ops->map_peek_elem(map, value);
- } else {
- rcu_read_lock();
- if (map->ops->map_lookup_elem_sys_only)
- ptr = map->ops->map_lookup_elem_sys_only(map, key);
- else
- ptr = map->ops->map_lookup_elem(map, key);
- if (IS_ERR(ptr)) {
- err = PTR_ERR(ptr);
- } else if (!ptr) {
- err = -ENOENT;
- } else {
- err = 0;
- if (attr->flags & BPF_F_LOCK)
- /* lock 'ptr' and copy everything but lock */
- copy_map_value_locked(map, value, ptr, true);
- else
- copy_map_value(map, value, ptr);
- /* mask lock, since value wasn't zero inited */
- check_and_init_map_lock(map, value);
- }
- rcu_read_unlock();
- }
- this_cpu_dec(bpf_prog_active);
- preempt_enable();
-
-done:
+ err = bpf_map_copy_value(map, key, value, attr->flags);
if (err)
goto free_value;
@@ -931,16 +1031,6 @@ err_put:
return err;
}
-static void maybe_wait_bpf_programs(struct bpf_map *map)
-{
- /* Wait for any running BPF programs to complete so that
- * userspace, when we return to it, knows that all programs
- * that could be running use the new map value.
- */
- if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS ||
- map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
- synchronize_rcu();
-}
#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
@@ -996,60 +1086,8 @@ static int map_update_elem(union bpf_attr *attr)
if (copy_from_user(value, uvalue, value_size) != 0)
goto free_value;
- /* Need to create a kthread, thus must support schedule */
- if (bpf_map_is_dev_bound(map)) {
- err = bpf_map_offload_update_elem(map, key, value, attr->flags);
- goto out;
- } else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
- map->map_type == BPF_MAP_TYPE_SOCKHASH ||
- map->map_type == BPF_MAP_TYPE_SOCKMAP) {
- err = map->ops->map_update_elem(map, key, value, attr->flags);
- goto out;
- } else if (IS_FD_PROG_ARRAY(map)) {
- err = bpf_fd_array_map_update_elem(map, f.file, key, value,
- attr->flags);
- goto out;
- }
+ err = bpf_map_update_value(map, f, key, value, attr->flags);
- /* must increment bpf_prog_active to avoid kprobe+bpf triggering from
- * inside bpf map update or delete otherwise deadlocks are possible
- */
- preempt_disable();
- __this_cpu_inc(bpf_prog_active);
- if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
- err = bpf_percpu_hash_update(map, key, value, attr->flags);
- } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
- err = bpf_percpu_array_update(map, key, value, attr->flags);
- } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
- err = bpf_percpu_cgroup_storage_update(map, key, value,
- attr->flags);
- } else if (IS_FD_ARRAY(map)) {
- rcu_read_lock();
- err = bpf_fd_array_map_update_elem(map, f.file, key, value,
- attr->flags);
- rcu_read_unlock();
- } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
- rcu_read_lock();
- err = bpf_fd_htab_map_update_elem(map, f.file, key, value,
- attr->flags);
- rcu_read_unlock();
- } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
- /* rcu_read_lock() is not needed */
- err = bpf_fd_reuseport_array_update_elem(map, key, value,
- attr->flags);
- } else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
- map->map_type == BPF_MAP_TYPE_STACK) {
- err = map->ops->map_push_elem(map, value, attr->flags);
- } else {
- rcu_read_lock();
- err = map->ops->map_update_elem(map, key, value, attr->flags);
- rcu_read_unlock();
- }
- __this_cpu_dec(bpf_prog_active);
- preempt_enable();
- maybe_wait_bpf_programs(map);
-out:
free_value:
kfree(value);
free_key:
@@ -1091,7 +1129,9 @@ static int map_delete_elem(union bpf_attr *attr)
if (bpf_map_is_dev_bound(map)) {
err = bpf_map_offload_delete_elem(map, key);
goto out;
- } else if (IS_FD_PROG_ARRAY(map)) {
+ } else if (IS_FD_PROG_ARRAY(map) ||
+ map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
+ /* These maps require a sleepable context */
err = map->ops->map_delete_elem(map, key);
goto out;
}
@@ -1178,6 +1218,220 @@ err_put:
return err;
}
+int generic_map_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ void __user *keys = u64_to_user_ptr(attr->batch.keys);
+ u32 cp, max_count;
+ int err = 0;
+ void *key;
+
+ if (attr->batch.elem_flags & ~BPF_F_LOCK)
+ return -EINVAL;
+
+ if ((attr->batch.elem_flags & BPF_F_LOCK) &&
+ !map_value_has_spin_lock(map)) {
+ return -EINVAL;
+ }
+
+ max_count = attr->batch.count;
+ if (!max_count)
+ return 0;
+
+ key = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
+ if (!key)
+ return -ENOMEM;
+
+ for (cp = 0; cp < max_count; cp++) {
+ err = -EFAULT;
+ if (copy_from_user(key, keys + cp * map->key_size,
+ map->key_size))
+ break;
+
+ if (bpf_map_is_dev_bound(map)) {
+ err = bpf_map_offload_delete_elem(map, key);
+ break;
+ }
+
+ preempt_disable();
+ __this_cpu_inc(bpf_prog_active);
+ rcu_read_lock();
+ err = map->ops->map_delete_elem(map, key);
+ rcu_read_unlock();
+ __this_cpu_dec(bpf_prog_active);
+ preempt_enable();
+ maybe_wait_bpf_programs(map);
+ if (err)
+ break;
+ }
+ if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
+ err = -EFAULT;
+
+ kfree(key);
+ return err;
+}
+
+int generic_map_update_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ void __user *values = u64_to_user_ptr(attr->batch.values);
+ void __user *keys = u64_to_user_ptr(attr->batch.keys);
+ u32 value_size, cp, max_count;
+ int ufd = attr->map_fd;
+ void *key, *value;
+ struct fd f;
+ int err = 0;
+
+ f = fdget(ufd);
+ if (attr->batch.elem_flags & ~BPF_F_LOCK)
+ return -EINVAL;
+
+ if ((attr->batch.elem_flags & BPF_F_LOCK) &&
+ !map_value_has_spin_lock(map)) {
+ return -EINVAL;
+ }
+
+ value_size = bpf_map_value_size(map);
+
+ max_count = attr->batch.count;
+ if (!max_count)
+ return 0;
+
+ key = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
+ if (!key)
+ return -ENOMEM;
+
+ value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
+ if (!value) {
+ kfree(key);
+ return -ENOMEM;
+ }
+
+ for (cp = 0; cp < max_count; cp++) {
+ err = -EFAULT;
+ if (copy_from_user(key, keys + cp * map->key_size,
+ map->key_size) ||
+ copy_from_user(value, values + cp * value_size, value_size))
+ break;
+
+ err = bpf_map_update_value(map, f, key, value,
+ attr->batch.elem_flags);
+
+ if (err)
+ break;
+ }
+
+ if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
+ err = -EFAULT;
+
+ kfree(value);
+ kfree(key);
+ return err;
+}
+
+#define MAP_LOOKUP_RETRIES 3
+
+int generic_map_lookup_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ void __user *uobatch = u64_to_user_ptr(attr->batch.out_batch);
+ void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
+ void __user *values = u64_to_user_ptr(attr->batch.values);
+ void __user *keys = u64_to_user_ptr(attr->batch.keys);
+ void *buf, *buf_prevkey, *prev_key, *key, *value;
+ int err, retry = MAP_LOOKUP_RETRIES;
+ u32 value_size, cp, max_count;
+
+ if (attr->batch.elem_flags & ~BPF_F_LOCK)
+ return -EINVAL;
+
+ if ((attr->batch.elem_flags & BPF_F_LOCK) &&
+ !map_value_has_spin_lock(map))
+ return -EINVAL;
+
+ value_size = bpf_map_value_size(map);
+
+ max_count = attr->batch.count;
+ if (!max_count)
+ return 0;
+
+ if (put_user(0, &uattr->batch.count))
+ return -EFAULT;
+
+ buf_prevkey = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
+ if (!buf_prevkey)
+ return -ENOMEM;
+
+ buf = kmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN);
+ if (!buf) {
+ kvfree(buf_prevkey);
+ return -ENOMEM;
+ }
+
+ err = -EFAULT;
+ prev_key = NULL;
+ if (ubatch && copy_from_user(buf_prevkey, ubatch, map->key_size))
+ goto free_buf;
+ key = buf;
+ value = key + map->key_size;
+ if (ubatch)
+ prev_key = buf_prevkey;
+
+ for (cp = 0; cp < max_count;) {
+ rcu_read_lock();
+ err = map->ops->map_get_next_key(map, prev_key, key);
+ rcu_read_unlock();
+ if (err)
+ break;
+ err = bpf_map_copy_value(map, key, value,
+ attr->batch.elem_flags);
+
+ if (err == -ENOENT) {
+ if (retry) {
+ retry--;
+ continue;
+ }
+ err = -EINTR;
+ break;
+ }
+
+ if (err)
+ goto free_buf;
+
+ if (copy_to_user(keys + cp * map->key_size, key,
+ map->key_size)) {
+ err = -EFAULT;
+ goto free_buf;
+ }
+ if (copy_to_user(values + cp * value_size, value, value_size)) {
+ err = -EFAULT;
+ goto free_buf;
+ }
+
+ if (!prev_key)
+ prev_key = buf_prevkey;
+
+ swap(prev_key, key);
+ retry = MAP_LOOKUP_RETRIES;
+ cp++;
+ }
+
+ if (err == -EFAULT)
+ goto free_buf;
+
+ if ((copy_to_user(&uattr->batch.count, &cp, sizeof(cp)) ||
+ (cp && copy_to_user(uobatch, prev_key, map->key_size))))
+ err = -EFAULT;
+
+free_buf:
+ kfree(buf_prevkey);
+ kfree(buf);
+ return err;
+}
+
#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value
static int map_lookup_and_delete_elem(union bpf_attr *attr)
@@ -1306,6 +1560,36 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
return 0;
}
+enum bpf_audit {
+ BPF_AUDIT_LOAD,
+ BPF_AUDIT_UNLOAD,
+ BPF_AUDIT_MAX,
+};
+
+static const char * const bpf_audit_str[BPF_AUDIT_MAX] = {
+ [BPF_AUDIT_LOAD] = "LOAD",
+ [BPF_AUDIT_UNLOAD] = "UNLOAD",
+};
+
+static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op)
+{
+ struct audit_context *ctx = NULL;
+ struct audit_buffer *ab;
+
+ if (WARN_ON_ONCE(op >= BPF_AUDIT_MAX))
+ return;
+ if (audit_enabled == AUDIT_OFF)
+ return;
+ if (op == BPF_AUDIT_LOAD)
+ ctx = audit_context();
+ ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF);
+ if (unlikely(!ab))
+ return;
+ audit_log_format(ab, "prog-id=%u op=%s",
+ prog->aux->id, bpf_audit_str[op]);
+ audit_log_end(ab);
+}
+
int __bpf_prog_charge(struct user_struct *user, u32 pages)
{
unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
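For reference, the record emitted by bpf_audit_prog() carries only the program ID and the operation, so a successful load would surface in the audit log as an AUDIT_BPF record whose payload is of the form "prog-id=<id> op=LOAD" (the record header, timestamp and serial come from the audit core, not from this hunk), with a matching op=UNLOAD record once the last program reference is dropped.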
@@ -1421,6 +1705,7 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
{
if (atomic64_dec_and_test(&prog->aux->refcnt)) {
perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
+ bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
/* bpf_prog_free_id() must be called first */
bpf_prog_free_id(prog, do_idr_lock);
__bpf_prog_put_noref(prog, true);
@@ -1640,17 +1925,24 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
enum bpf_attach_type expected_attach_type,
u32 btf_id, u32 prog_fd)
{
- switch (prog_type) {
- case BPF_PROG_TYPE_TRACING:
+ if (btf_id) {
if (btf_id > BTF_MAX_TYPE)
return -EINVAL;
- break;
- default:
- if (btf_id || prog_fd)
+
+ switch (prog_type) {
+ case BPF_PROG_TYPE_TRACING:
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ case BPF_PROG_TYPE_EXT:
+ break;
+ default:
return -EINVAL;
- break;
+ }
}
+ if (prog_fd && prog_type != BPF_PROG_TYPE_TRACING &&
+ prog_type != BPF_PROG_TYPE_EXT)
+ return -EINVAL;
+
switch (prog_type) {
case BPF_PROG_TYPE_CGROUP_SOCK:
switch (expected_attach_type) {
@@ -1691,6 +1983,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
default:
return -EINVAL;
}
+ case BPF_PROG_TYPE_EXT:
+ if (expected_attach_type)
+ return -EINVAL;
+ /* fallthrough */
default:
return 0;
}
@@ -1830,6 +2126,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
*/
bpf_prog_kallsyms_add(prog);
perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0);
+ bpf_audit_prog(prog, BPF_AUDIT_LOAD);
err = bpf_prog_new_fd(prog);
if (err < 0)
@@ -1892,7 +2189,8 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog)
int tr_fd, err;
if (prog->expected_attach_type != BPF_TRACE_FENTRY &&
- prog->expected_attach_type != BPF_TRACE_FEXIT) {
+ prog->expected_attach_type != BPF_TRACE_FEXIT &&
+ prog->type != BPF_PROG_TYPE_EXT) {
err = -EINVAL;
goto out_put_prog;
}
@@ -1959,12 +2257,14 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT &&
prog->type != BPF_PROG_TYPE_TRACING &&
+ prog->type != BPF_PROG_TYPE_EXT &&
prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) {
err = -EINVAL;
goto out_put_prog;
}
- if (prog->type == BPF_PROG_TYPE_TRACING) {
+ if (prog->type == BPF_PROG_TYPE_TRACING ||
+ prog->type == BPF_PROG_TYPE_EXT) {
if (attr->raw_tracepoint.name) {
/* The attach point for this category of programs
* should be specified via btf_id during program load.
@@ -2040,10 +2340,10 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
}
}
-#define BPF_PROG_ATTACH_LAST_FIELD attach_flags
+#define BPF_PROG_ATTACH_LAST_FIELD replace_bpf_fd
#define BPF_F_ATTACH_MASK \
- (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)
+ (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI | BPF_F_REPLACE)
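/* Editor's sketch (not part of the patch): atomically swapping an already
 * attached cgroup program using the new BPF_F_REPLACE flag from user space.
 * The replace_bpf_fd attr field name follows the BPF_PROG_ATTACH_LAST_FIELD
 * update above; the helper name and attach type are examples only, and a
 * UAPI header matching this series is assumed.
 */
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int cgroup_prog_replace(int cgroup_fd, int new_prog_fd, int old_prog_fd)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.target_fd = cgroup_fd;
	attr.attach_bpf_fd = new_prog_fd;
	attr.replace_bpf_fd = old_prog_fd;
	attr.attach_type = BPF_CGROUP_INET_EGRESS;	/* example attach type */
	attr.attach_flags = BPF_F_ALLOW_MULTI | BPF_F_REPLACE;

	return syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));
}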
static int bpf_prog_attach(const union bpf_attr *attr)
{
@@ -2305,17 +2605,12 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr,
#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
-static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
+struct bpf_prog *bpf_prog_by_id(u32 id)
{
struct bpf_prog *prog;
- u32 id = attr->prog_id;
- int fd;
- if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
- return -EINVAL;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
+ if (!id)
+ return ERR_PTR(-ENOENT);
spin_lock_bh(&prog_idr_lock);
prog = idr_find(&prog_idr, id);
@@ -2324,7 +2619,22 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
else
prog = ERR_PTR(-ENOENT);
spin_unlock_bh(&prog_idr_lock);
+ return prog;
+}
+
+static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
+{
+ struct bpf_prog *prog;
+ u32 id = attr->prog_id;
+ int fd;
+
+ if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ prog = bpf_prog_by_id(id);
if (IS_ERR(prog))
return PTR_ERR(prog);
@@ -2774,6 +3084,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
info.btf_key_type_id = map->btf_key_type_id;
info.btf_value_type_id = map->btf_value_type_id;
}
+ info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id;
if (bpf_map_is_dev_bound(map)) {
err = bpf_map_offload_info_fill(&info, map);
@@ -2986,6 +3297,61 @@ out:
return err;
}
+#define BPF_MAP_BATCH_LAST_FIELD batch.flags
+
+#define BPF_DO_BATCH(fn) \
+ do { \
+ if (!fn) { \
+ err = -ENOTSUPP; \
+ goto err_put; \
+ } \
+ err = fn(map, attr, uattr); \
+ } while (0)
+
+static int bpf_map_do_batch(const union bpf_attr *attr,
+ union bpf_attr __user *uattr,
+ int cmd)
+{
+ struct bpf_map *map;
+ int err, ufd;
+ struct fd f;
+
+ if (CHECK_ATTR(BPF_MAP_BATCH))
+ return -EINVAL;
+
+ ufd = attr->batch.map_fd;
+ f = fdget(ufd);
+ map = __bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ if ((cmd == BPF_MAP_LOOKUP_BATCH ||
+ cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) &&
+ !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
+ if (cmd != BPF_MAP_LOOKUP_BATCH &&
+ !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
+ if (cmd == BPF_MAP_LOOKUP_BATCH)
+ BPF_DO_BATCH(map->ops->map_lookup_batch);
+ else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH)
+ BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch);
+ else if (cmd == BPF_MAP_UPDATE_BATCH)
+ BPF_DO_BATCH(map->ops->map_update_batch);
+ else
+ BPF_DO_BATCH(map->ops->map_delete_batch);
+
+err_put:
+ fdput(f);
+ return err;
+}
+
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
union bpf_attr attr = {};
@@ -3083,6 +3449,19 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
err = map_lookup_and_delete_elem(&attr);
break;
+ case BPF_MAP_LOOKUP_BATCH:
+ err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_BATCH);
+ break;
+ case BPF_MAP_LOOKUP_AND_DELETE_BATCH:
+ err = bpf_map_do_batch(&attr, uattr,
+ BPF_MAP_LOOKUP_AND_DELETE_BATCH);
+ break;
+ case BPF_MAP_UPDATE_BATCH:
+ err = bpf_map_do_batch(&attr, uattr, BPF_MAP_UPDATE_BATCH);
+ break;
+ case BPF_MAP_DELETE_BATCH:
+ err = bpf_map_do_batch(&attr, uattr, BPF_MAP_DELETE_BATCH);
+ break;
default:
err = -EINVAL;
break;
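/* Editor's sketch (not part of the patch): driving the new BPF_MAP_LOOKUP_BATCH
 * command from user space through the raw bpf(2) syscall. The attr.batch field
 * names (map_fd, in_batch, out_batch, keys, values, count) are assumed to match
 * the UAPI additions that accompany this series, and the helper name is an
 * example only.
 */
#include <errno.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int map_lookup_batch(int map_fd, void *out_batch, void *keys,
			    void *values, uint32_t *count)
{
	union bpf_attr attr;
	int err;

	memset(&attr, 0, sizeof(attr));
	attr.batch.map_fd = map_fd;
	/* in_batch left zero: start from the beginning of the map.
	 * out_batch receives the token to resume from on the next call.
	 */
	attr.batch.out_batch = (uint64_t)(unsigned long)out_batch;
	attr.batch.keys = (uint64_t)(unsigned long)keys;
	attr.batch.values = (uint64_t)(unsigned long)values;
	attr.batch.count = *count;

	err = syscall(__NR_bpf, BPF_MAP_LOOKUP_BATCH, &attr, sizeof(attr));
	/* the kernel writes back how many elements were actually copied */
	*count = attr.batch.count;
	return err ? -errno : 0;
}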
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
index ca52b9642943..d4f335a9a899 100644
--- a/kernel/bpf/tnum.c
+++ b/kernel/bpf/tnum.c
@@ -44,14 +44,19 @@ struct tnum tnum_rshift(struct tnum a, u8 shift)
return TNUM(a.value >> shift, a.mask >> shift);
}
-struct tnum tnum_arshift(struct tnum a, u8 min_shift)
+struct tnum tnum_arshift(struct tnum a, u8 min_shift, u8 insn_bitness)
{
/* if a.value is negative, arithmetic shifting by minimum shift
* will have larger negative offset compared to more shifting.
* If a.value is nonnegative, arithmetic shifting by minimum shift
* will have larger positive offset compare to more shifting.
*/
- return TNUM((s64)a.value >> min_shift, (s64)a.mask >> min_shift);
+ if (insn_bitness == 32)
+ return TNUM((u32)(((s32)a.value) >> min_shift),
+ (u32)(((s32)a.mask) >> min_shift));
+ else
+ return TNUM((s64)a.value >> min_shift,
+ (s64)a.mask >> min_shift);
}
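/* Editor's illustration (not part of the patch): a plain-integer check of the
 * 32- vs 64-bit distinction handled above, using value 0x80000000, which is
 * negative as an s32 but positive as an s64.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t value = 0x80000000ULL;
	uint8_t shift = 4;

	/* 32-bit subregister view: the sign bit is replicated within 32 bits */
	printf("32-bit: %#x\n", (uint32_t)(((int32_t)value) >> shift));
	/* prints 0xf8000000 */

	/* 64-bit view: the same bit pattern is a positive 64-bit number */
	printf("64-bit: %#llx\n", (unsigned long long)(((int64_t)value) >> shift));
	/* prints 0x8000000 */

	return 0;
}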
struct tnum tnum_add(struct tnum a, struct tnum b)
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 7e89f1f49d77..6b264a92064b 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -3,16 +3,99 @@
#include <linux/hash.h>
#include <linux/bpf.h>
#include <linux/filter.h>
+#include <linux/ftrace.h>
+#include <linux/rbtree_latch.h>
+
+/* dummy _ops. The verifier will operate on the target program's ops. */
+const struct bpf_verifier_ops bpf_extension_verifier_ops = {
+};
+const struct bpf_prog_ops bpf_extension_prog_ops = {
+};
/* btf_vmlinux has ~22k attachable functions. 1k htab is enough. */
#define TRAMPOLINE_HASH_BITS 10
#define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS)
static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];
+static struct latch_tree_root image_tree __cacheline_aligned;
-/* serializes access to trampoline_table */
+/* serializes access to trampoline_table and image_tree */
static DEFINE_MUTEX(trampoline_mutex);
+static void *bpf_jit_alloc_exec_page(void)
+{
+ void *image;
+
+ image = bpf_jit_alloc_exec(PAGE_SIZE);
+ if (!image)
+ return NULL;
+
+ set_vm_flush_reset_perms(image);
+ /* Keep the image writable. The alternative is to keep flipping ro/rw
+ * every time a new program is attached or detached.
+ */
+ set_memory_x((long)image, 1);
+ return image;
+}
+
+static __always_inline bool image_tree_less(struct latch_tree_node *a,
+ struct latch_tree_node *b)
+{
+ struct bpf_image *ia = container_of(a, struct bpf_image, tnode);
+ struct bpf_image *ib = container_of(b, struct bpf_image, tnode);
+
+ return ia < ib;
+}
+
+static __always_inline int image_tree_comp(void *addr, struct latch_tree_node *n)
+{
+ void *image = container_of(n, struct bpf_image, tnode);
+
+ if (addr < image)
+ return -1;
+ if (addr >= image + PAGE_SIZE)
+ return 1;
+
+ return 0;
+}
+
+static const struct latch_tree_ops image_tree_ops = {
+ .less = image_tree_less,
+ .comp = image_tree_comp,
+};
+
+static void *__bpf_image_alloc(bool lock)
+{
+ struct bpf_image *image;
+
+ image = bpf_jit_alloc_exec_page();
+ if (!image)
+ return NULL;
+
+ if (lock)
+ mutex_lock(&trampoline_mutex);
+ latch_tree_insert(&image->tnode, &image_tree, &image_tree_ops);
+ if (lock)
+ mutex_unlock(&trampoline_mutex);
+ return image->data;
+}
+
+void *bpf_image_alloc(void)
+{
+ return __bpf_image_alloc(true);
+}
+
+bool is_bpf_image_address(unsigned long addr)
+{
+ bool ret;
+
+ rcu_read_lock();
+ ret = latch_tree_find((void *) addr, &image_tree, &image_tree_ops) != NULL;
+ rcu_read_unlock();
+
+ return ret;
+}
+
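/* Editor's note: the image_tree code above relies on struct bpf_image and
 * BPF_IMAGE_SIZE being declared elsewhere in this series (assumed to live in
 * the accompanying include/linux/bpf.h changes). A sketch of the assumed
 * layout, with the latch tree node at the start of the page and the
 * trampoline using the remainder:
 *
 *	struct bpf_image {
 *		struct latch_tree_node tnode;
 *		u8 data[];
 *	};
 *	#define BPF_IMAGE_SIZE	(PAGE_SIZE - sizeof(struct bpf_image))
 */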
struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
{
struct bpf_trampoline *tr;
@@ -33,7 +116,7 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
goto out;
/* is_root was checked earlier. No need for bpf_jit_charge_modmem() */
- image = bpf_jit_alloc_exec(PAGE_SIZE);
+ image = __bpf_image_alloc(false);
if (!image) {
kfree(tr);
tr = NULL;
@@ -47,27 +130,75 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
mutex_init(&tr->mutex);
for (i = 0; i < BPF_TRAMP_MAX; i++)
INIT_HLIST_HEAD(&tr->progs_hlist[i]);
-
- set_vm_flush_reset_perms(image);
- /* Keep image as writeable. The alternative is to keep flipping ro/rw
- * everytime new program is attached or detached.
- */
- set_memory_x((long)image, 1);
tr->image = image;
out:
mutex_unlock(&trampoline_mutex);
return tr;
}
+static int is_ftrace_location(void *ip)
+{
+ long addr;
+
+ addr = ftrace_location((long)ip);
+ if (!addr)
+ return 0;
+ if (WARN_ON_ONCE(addr != (long)ip))
+ return -EFAULT;
+ return 1;
+}
+
+static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
+{
+ void *ip = tr->func.addr;
+ int ret;
+
+ if (tr->func.ftrace_managed)
+ ret = unregister_ftrace_direct((long)ip, (long)old_addr);
+ else
+ ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
+ return ret;
+}
+
+static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr)
+{
+ void *ip = tr->func.addr;
+ int ret;
+
+ if (tr->func.ftrace_managed)
+ ret = modify_ftrace_direct((long)ip, (long)old_addr, (long)new_addr);
+ else
+ ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr);
+ return ret;
+}
+
+/* first time registering */
+static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
+{
+ void *ip = tr->func.addr;
+ int ret;
+
+ ret = is_ftrace_location(ip);
+ if (ret < 0)
+ return ret;
+ tr->func.ftrace_managed = ret;
+
+ if (tr->func.ftrace_managed)
+ ret = register_ftrace_direct((long)ip, (long)new_addr);
+ else
+ ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
+ return ret;
+}
+
/* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50
- * bytes on x86. Pick a number to fit into PAGE_SIZE / 2
+ * bytes on x86. Pick a number to fit into BPF_IMAGE_SIZE / 2
*/
#define BPF_MAX_TRAMP_PROGS 40
static int bpf_trampoline_update(struct bpf_trampoline *tr)
{
- void *old_image = tr->image + ((tr->selector + 1) & 1) * PAGE_SIZE/2;
- void *new_image = tr->image + (tr->selector & 1) * PAGE_SIZE/2;
+ void *old_image = tr->image + ((tr->selector + 1) & 1) * BPF_IMAGE_SIZE/2;
+ void *new_image = tr->image + (tr->selector & 1) * BPF_IMAGE_SIZE/2;
struct bpf_prog *progs_to_run[BPF_MAX_TRAMP_PROGS];
int fentry_cnt = tr->progs_cnt[BPF_TRAMP_FENTRY];
int fexit_cnt = tr->progs_cnt[BPF_TRAMP_FEXIT];
@@ -77,8 +208,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
int err;
if (fentry_cnt + fexit_cnt == 0) {
- err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL,
- old_image, NULL);
+ err = unregister_fentry(tr, old_image);
tr->selector = 0;
goto out;
}
@@ -96,21 +226,28 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
if (fexit_cnt)
flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;
- err = arch_prepare_bpf_trampoline(new_image, &tr->func.model, flags,
+ /* Though the second half of the trampoline page is unused, a task could be
+ * preempted in the middle of the first half of the trampoline, and two
+ * updates to the trampoline would then change the code underneath the
+ * preempted task. Hence wait for tasks to voluntarily schedule or go
+ * to userspace.
+ */
+ synchronize_rcu_tasks();
+
+ err = arch_prepare_bpf_trampoline(new_image, new_image + BPF_IMAGE_SIZE / 2,
+ &tr->func.model, flags,
fentry, fentry_cnt,
fexit, fexit_cnt,
tr->func.addr);
- if (err)
+ if (err < 0)
goto out;
if (tr->selector)
/* progs already running at this address */
- err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL,
- old_image, new_image);
+ err = modify_fentry(tr, old_image, new_image);
else
/* first time registering */
- err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL, NULL,
- new_image);
+ err = register_fentry(tr, new_image);
if (err)
goto out;
tr->selector++;
@@ -123,8 +260,10 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(enum bpf_attach_type t)
switch (t) {
case BPF_TRACE_FENTRY:
return BPF_TRAMP_FENTRY;
- default:
+ case BPF_TRACE_FEXIT:
return BPF_TRAMP_FEXIT;
+ default:
+ return BPF_TRAMP_REPLACE;
}
}
@@ -133,12 +272,31 @@ int bpf_trampoline_link_prog(struct bpf_prog *prog)
enum bpf_tramp_prog_type kind;
struct bpf_trampoline *tr;
int err = 0;
+ int cnt;
tr = prog->aux->trampoline;
kind = bpf_attach_type_to_tramp(prog->expected_attach_type);
mutex_lock(&tr->mutex);
- if (tr->progs_cnt[BPF_TRAMP_FENTRY] + tr->progs_cnt[BPF_TRAMP_FEXIT]
- >= BPF_MAX_TRAMP_PROGS) {
+ if (tr->extension_prog) {
+ /* cannot attach fentry/fexit if extension prog is attached.
+ * cannot overwrite extension prog either.
+ */
+ err = -EBUSY;
+ goto out;
+ }
+ cnt = tr->progs_cnt[BPF_TRAMP_FENTRY] + tr->progs_cnt[BPF_TRAMP_FEXIT];
+ if (kind == BPF_TRAMP_REPLACE) {
+ /* Cannot attach extension if fentry/fexit are in use. */
+ if (cnt) {
+ err = -EBUSY;
+ goto out;
+ }
+ tr->extension_prog = prog;
+ err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL,
+ prog->bpf_func);
+ goto out;
+ }
+ if (cnt >= BPF_MAX_TRAMP_PROGS) {
err = -E2BIG;
goto out;
}
@@ -169,15 +327,25 @@ int bpf_trampoline_unlink_prog(struct bpf_prog *prog)
tr = prog->aux->trampoline;
kind = bpf_attach_type_to_tramp(prog->expected_attach_type);
mutex_lock(&tr->mutex);
+ if (kind == BPF_TRAMP_REPLACE) {
+ WARN_ON_ONCE(!tr->extension_prog);
+ err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP,
+ tr->extension_prog->bpf_func, NULL);
+ tr->extension_prog = NULL;
+ goto out;
+ }
hlist_del(&prog->aux->tramp_hlist);
tr->progs_cnt[kind]--;
err = bpf_trampoline_update(prog->aux->trampoline);
+out:
mutex_unlock(&tr->mutex);
return err;
}
void bpf_trampoline_put(struct bpf_trampoline *tr)
{
+ struct bpf_image *image;
+
if (!tr)
return;
mutex_lock(&trampoline_mutex);
@@ -188,7 +356,11 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)
goto out;
if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT])))
goto out;
- bpf_jit_free_exec(tr->image);
+ image = container_of(tr->image, struct bpf_image, data);
+ latch_tree_erase(&image->tnode, &image_tree, &image_tree_ops);
+ /* wait for tasks to get out of trampoline before freeing it */
+ synchronize_rcu_tasks();
+ bpf_jit_free_exec(image);
hlist_del(&tr->hlist);
kfree(tr);
out:
@@ -234,7 +406,8 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
}
int __weak
-arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags,
+arch_prepare_bpf_trampoline(void *image, void *image_end,
+ const struct btf_func_model *m, u32 flags,
struct bpf_prog **fentry_progs, int fentry_cnt,
struct bpf_prog **fexit_progs, int fexit_cnt,
void *orig_call)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 034ef81f935b..1cc945daa9c8 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -907,7 +907,8 @@ static const int caller_saved[CALLER_SAVED_REGS] = {
BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
};
-static void __mark_reg_not_init(struct bpf_reg_state *reg);
+static void __mark_reg_not_init(const struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg);
/* Mark the unknown part of a register (variable offset or scalar value) as
* known to have the value @imm.
@@ -945,7 +946,7 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env,
verbose(env, "mark_reg_known_zero(regs, %u)\n", regno);
/* Something bad happened, let's kill all regs */
for (regno = 0; regno < MAX_BPF_REG; regno++)
- __mark_reg_not_init(regs + regno);
+ __mark_reg_not_init(env, regs + regno);
return;
}
__mark_reg_known_zero(regs + regno);
@@ -1054,7 +1055,8 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg)
}
/* Mark a register as having a completely unknown (scalar) value. */
-static void __mark_reg_unknown(struct bpf_reg_state *reg)
+static void __mark_reg_unknown(const struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg)
{
/*
* Clear type, id, off, and union(map_ptr, range) and
@@ -1064,6 +1066,8 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg)
reg->type = SCALAR_VALUE;
reg->var_off = tnum_unknown;
reg->frameno = 0;
+ reg->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks ?
+ true : false;
__mark_reg_unbounded(reg);
}
@@ -1074,19 +1078,16 @@ static void mark_reg_unknown(struct bpf_verifier_env *env,
verbose(env, "mark_reg_unknown(regs, %u)\n", regno);
/* Something bad happened, let's kill all regs except FP */
for (regno = 0; regno < BPF_REG_FP; regno++)
- __mark_reg_not_init(regs + regno);
+ __mark_reg_not_init(env, regs + regno);
return;
}
- regs += regno;
- __mark_reg_unknown(regs);
- /* constant backtracking is enabled for root without bpf2bpf calls */
- regs->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks ?
- true : false;
+ __mark_reg_unknown(env, regs + regno);
}
-static void __mark_reg_not_init(struct bpf_reg_state *reg)
+static void __mark_reg_not_init(const struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg)
{
- __mark_reg_unknown(reg);
+ __mark_reg_unknown(env, reg);
reg->type = NOT_INIT;
}
@@ -1097,10 +1098,10 @@ static void mark_reg_not_init(struct bpf_verifier_env *env,
verbose(env, "mark_reg_not_init(regs, %u)\n", regno);
/* Something bad happened, let's kill all regs except FP */
for (regno = 0; regno < BPF_REG_FP; regno++)
- __mark_reg_not_init(regs + regno);
+ __mark_reg_not_init(env, regs + regno);
return;
}
- __mark_reg_not_init(regs + regno);
+ __mark_reg_not_init(env, regs + regno);
}
#define DEF_NOT_SUBREG (0)
@@ -1121,10 +1122,6 @@ static void init_reg_state(struct bpf_verifier_env *env,
regs[BPF_REG_FP].type = PTR_TO_STACK;
mark_reg_known_zero(env, regs, BPF_REG_FP);
regs[BPF_REG_FP].frameno = state->frameno;
-
- /* 1st arg to a function */
- regs[BPF_REG_1].type = PTR_TO_CTX;
- mark_reg_known_zero(env, regs, BPF_REG_1);
}
#define BPF_MAIN_FUNC (-1)
@@ -1915,6 +1912,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case PTR_TO_TCP_SOCK:
case PTR_TO_TCP_SOCK_OR_NULL:
case PTR_TO_XDP_SOCK:
+ case PTR_TO_BTF_ID:
return true;
default:
return false;
@@ -2737,8 +2735,8 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env,
}
#endif
-static int check_ctx_reg(struct bpf_verifier_env *env,
- const struct bpf_reg_state *reg, int regno)
+int check_ctx_reg(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg, int regno)
{
/* Access to ctx or passing it to a helper is only allowed in
* its original, unmodified form.
@@ -2857,11 +2855,6 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
u32 btf_id;
int ret;
- if (atype != BPF_READ) {
- verbose(env, "only read is supported\n");
- return -EACCES;
- }
-
if (off < 0) {
verbose(env,
"R%d is ptr_%s invalid negative access: off=%d\n",
@@ -2878,17 +2871,32 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
return -EACCES;
}
- ret = btf_struct_access(&env->log, t, off, size, atype, &btf_id);
+ if (env->ops->btf_struct_access) {
+ ret = env->ops->btf_struct_access(&env->log, t, off, size,
+ atype, &btf_id);
+ } else {
+ if (atype != BPF_READ) {
+ verbose(env, "only read is supported\n");
+ return -EACCES;
+ }
+
+ ret = btf_struct_access(&env->log, t, off, size, atype,
+ &btf_id);
+ }
+
if (ret < 0)
return ret;
- if (ret == SCALAR_VALUE) {
- mark_reg_unknown(env, regs, value_regno);
- return 0;
+ if (atype == BPF_READ) {
+ if (ret == SCALAR_VALUE) {
+ mark_reg_unknown(env, regs, value_regno);
+ return 0;
+ }
+ mark_reg_known_zero(env, regs, value_regno);
+ regs[value_regno].type = PTR_TO_BTF_ID;
+ regs[value_regno].btf_id = btf_id;
}
- mark_reg_known_zero(env, regs, value_regno);
- regs[value_regno].type = PTR_TO_BTF_ID;
- regs[value_regno].btf_id = btf_id;
+
return 0;
}
@@ -3234,7 +3242,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
}
if (state->stack[spi].slot_type[0] == STACK_SPILL &&
state->stack[spi].spilled_ptr.type == SCALAR_VALUE) {
- __mark_reg_unknown(&state->stack[spi].spilled_ptr);
+ __mark_reg_unknown(env, &state->stack[spi].spilled_ptr);
for (j = 0; j < BPF_REG_SIZE; j++)
state->stack[spi].slot_type[j] = STACK_MISC;
goto mark;
@@ -3892,7 +3900,7 @@ static void __clear_all_pkt_pointers(struct bpf_verifier_env *env,
if (!reg)
continue;
if (reg_is_pkt_pointer_any(reg))
- __mark_reg_unknown(reg);
+ __mark_reg_unknown(env, reg);
}
}
@@ -3920,7 +3928,7 @@ static void release_reg_references(struct bpf_verifier_env *env,
if (!reg)
continue;
if (reg->ref_obj_id == ref_obj_id)
- __mark_reg_unknown(reg);
+ __mark_reg_unknown(env, reg);
}
}
@@ -3944,12 +3952,26 @@ static int release_reference(struct bpf_verifier_env *env,
return 0;
}
+static void clear_caller_saved_regs(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs)
+{
+ int i;
+
+ /* after the call registers r0 - r5 were scratched */
+ for (i = 0; i < CALLER_SAVED_REGS; i++) {
+ mark_reg_not_init(env, regs, caller_saved[i]);
+ check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
+ }
+}
+
static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx)
{
struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_func_info_aux *func_info_aux;
struct bpf_func_state *caller, *callee;
int i, err, subprog, target_insn;
+ bool is_global = false;
if (state->curframe + 1 >= MAX_CALL_FRAMES) {
verbose(env, "the call stack of %d frames is too deep\n",
@@ -3972,6 +3994,32 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return -EFAULT;
}
+ func_info_aux = env->prog->aux->func_info_aux;
+ if (func_info_aux)
+ is_global = func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
+ err = btf_check_func_arg_match(env, subprog, caller->regs);
+ if (err == -EFAULT)
+ return err;
+ if (is_global) {
+ if (err) {
+ verbose(env, "Caller passes invalid args into func#%d\n",
+ subprog);
+ return err;
+ } else {
+ if (env->log.level & BPF_LOG_LEVEL)
+ verbose(env,
+ "Func#%d is global and valid. Skipping.\n",
+ subprog);
+ clear_caller_saved_regs(env, caller->regs);
+
+ /* All global functions return SCALAR_VALUE */
+ mark_reg_unknown(env, caller->regs, BPF_REG_0);
+
+ /* continue with next insn after call */
+ return 0;
+ }
+ }
+
callee = kzalloc(sizeof(*callee), GFP_KERNEL);
if (!callee)
return -ENOMEM;
@@ -3998,18 +4046,11 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
for (i = BPF_REG_1; i <= BPF_REG_5; i++)
callee->regs[i] = caller->regs[i];
- /* after the call registers r0 - r5 were scratched */
- for (i = 0; i < CALLER_SAVED_REGS; i++) {
- mark_reg_not_init(env, caller->regs, caller_saved[i]);
- check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
- }
+ clear_caller_saved_regs(env, caller->regs);
/* only increment it after check_reg_arg() finished */
state->curframe++;
- if (btf_check_func_arg_match(env, subprog))
- return -EINVAL;
-
/* and go analyze first insn of the callee */
*insn_idx = target_insn;
@@ -4134,6 +4175,7 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
struct bpf_map *map = meta->map_ptr;
struct tnum range;
u64 val;
+ int err;
if (func_id != BPF_FUNC_tail_call)
return 0;
@@ -4150,6 +4192,10 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
return 0;
}
+ err = mark_chain_precision(env, BPF_REG_3);
+ if (err)
+ return err;
+
val = reg->var_off.value;
if (bpf_map_key_unseen(aux))
bpf_map_key_store(aux, val);
@@ -4577,7 +4623,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
/* Taint dst register if offset had invalid bounds derived from
* e.g. dead branches.
*/
- __mark_reg_unknown(dst_reg);
+ __mark_reg_unknown(env, dst_reg);
return 0;
}
@@ -4829,13 +4875,13 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
/* Taint dst register if offset had invalid bounds derived from
* e.g. dead branches.
*/
- __mark_reg_unknown(dst_reg);
+ __mark_reg_unknown(env, dst_reg);
return 0;
}
if (!src_known &&
opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) {
- __mark_reg_unknown(dst_reg);
+ __mark_reg_unknown(env, dst_reg);
return 0;
}
@@ -5043,9 +5089,16 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
/* Upon reaching here, src_known is true and
* umax_val is equal to umin_val.
*/
- dst_reg->smin_value >>= umin_val;
- dst_reg->smax_value >>= umin_val;
- dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val);
+ if (insn_bitness == 32) {
+ dst_reg->smin_value = (u32)(((s32)dst_reg->smin_value) >> umin_val);
+ dst_reg->smax_value = (u32)(((s32)dst_reg->smax_value) >> umin_val);
+ } else {
+ dst_reg->smin_value >>= umin_val;
+ dst_reg->smax_value >>= umin_val;
+ }
+
+ dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val,
+ insn_bitness);
/* blow away the dst_reg umin_value/umax_value and rely on
* dst_reg var_off to refine the result.
@@ -6258,6 +6311,7 @@ static bool may_access_skb(enum bpf_prog_type type)
static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
struct bpf_reg_state *regs = cur_regs(env);
+ static const int ctx_reg = BPF_REG_6;
u8 mode = BPF_MODE(insn->code);
int i, err;
@@ -6291,7 +6345,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check whether implicit source operand (register R6) is readable */
- err = check_reg_arg(env, BPF_REG_6, SRC_OP);
+ err = check_reg_arg(env, ctx_reg, SRC_OP);
if (err)
return err;
@@ -6310,7 +6364,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return -EINVAL;
}
- if (regs[BPF_REG_6].type != PTR_TO_CTX) {
+ if (regs[ctx_reg].type != PTR_TO_CTX) {
verbose(env,
"at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
return -EINVAL;
@@ -6323,6 +6377,10 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
}
+ err = check_ctx_reg(env, &regs[ctx_reg], ctx_reg);
+ if (err < 0)
+ return err;
+
/* reset caller saved regs to unreadable */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
mark_reg_not_init(env, regs, caller_saved[i]);
@@ -6342,8 +6400,30 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
static int check_return_code(struct bpf_verifier_env *env)
{
struct tnum enforce_attach_type_range = tnum_unknown;
+ const struct bpf_prog *prog = env->prog;
struct bpf_reg_state *reg;
struct tnum range = tnum_range(0, 1);
+ int err;
+
+ /* The struct_ops func-ptr's return type could be "void" */
+ if (env->prog->type == BPF_PROG_TYPE_STRUCT_OPS &&
+ !prog->aux->attach_func_proto->type)
+ return 0;
+
+ /* The eBPF calling convention is such that R0 is used
+ * to return the value from the eBPF program.
+ * Make sure that it's readable at this point, i.e. at
+ * bpf_exit, which means that the program wrote
+ * something into it earlier.
+ */
+ err = check_reg_arg(env, BPF_REG_0, SRC_OP);
+ if (err)
+ return err;
+
+ if (is_pointer_value(env, BPF_REG_0)) {
+ verbose(env, "R0 leaks addr as return value\n");
+ return -EACCES;
+ }
switch (env->prog->type) {
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
@@ -6732,12 +6812,13 @@ static int check_btf_func(struct bpf_verifier_env *env,
/* check type_id */
type = btf_type_by_id(btf, krecord[i].type_id);
- if (!type || BTF_INFO_KIND(type->info) != BTF_KIND_FUNC) {
+ if (!type || !btf_type_is_func(type)) {
verbose(env, "invalid type id %d in func info",
krecord[i].type_id);
ret = -EINVAL;
goto err_free;
}
+ info_aux[i].linkage = BTF_INFO_VLEN(type->info);
prev_offset = krecord[i].insn_off;
urecord += urec_size;
}
@@ -6977,7 +7058,7 @@ static void clean_func_state(struct bpf_verifier_env *env,
/* since the register is unused, clear its state
* to make further comparison simpler
*/
- __mark_reg_not_init(&st->regs[i]);
+ __mark_reg_not_init(env, &st->regs[i]);
}
for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) {
@@ -6985,7 +7066,7 @@ static void clean_func_state(struct bpf_verifier_env *env,
/* liveness must not touch this stack slot anymore */
st->stack[i].spilled_ptr.live |= REG_LIVE_DONE;
if (!(live & REG_LIVE_READ)) {
- __mark_reg_not_init(&st->stack[i].spilled_ptr);
+ __mark_reg_not_init(env, &st->stack[i].spilled_ptr);
for (j = 0; j < BPF_REG_SIZE; j++)
st->stack[i].slot_type[j] = STACK_INVALID;
}
@@ -7717,35 +7798,13 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev)
static int do_check(struct bpf_verifier_env *env)
{
- struct bpf_verifier_state *state;
+ struct bpf_verifier_state *state = env->cur_state;
struct bpf_insn *insns = env->prog->insnsi;
struct bpf_reg_state *regs;
int insn_cnt = env->prog->len;
bool do_print_state = false;
int prev_insn_idx = -1;
- env->prev_linfo = NULL;
-
- state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL);
- if (!state)
- return -ENOMEM;
- state->curframe = 0;
- state->speculative = false;
- state->branches = 1;
- state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);
- if (!state->frame[0]) {
- kfree(state);
- return -ENOMEM;
- }
- env->cur_state = state;
- init_func_state(env, state->frame[0],
- BPF_MAIN_FUNC /* callsite */,
- 0 /* frameno */,
- 0 /* subprogno, zero == main subprog */);
-
- if (btf_check_func_arg_match(env, 0))
- return -EINVAL;
-
for (;;) {
struct bpf_insn *insn;
u8 class;
@@ -7823,7 +7882,7 @@ static int do_check(struct bpf_verifier_env *env)
}
regs = cur_regs(env);
- env->insn_aux_data[env->insn_idx].seen = true;
+ env->insn_aux_data[env->insn_idx].seen = env->pass_cnt;
prev_insn_idx = env->insn_idx;
if (class == BPF_ALU || class == BPF_ALU64) {
@@ -8009,21 +8068,6 @@ static int do_check(struct bpf_verifier_env *env)
if (err)
return err;
- /* eBPF calling convetion is such that R0 is used
- * to return the value from eBPF program.
- * Make sure that it's readable at this time
- * of bpf_exit, which means that program wrote
- * something into it earlier
- */
- err = check_reg_arg(env, BPF_REG_0, SRC_OP);
- if (err)
- return err;
-
- if (is_pointer_value(env, BPF_REG_0)) {
- verbose(env, "R0 leaks addr as return value\n");
- return -EACCES;
- }
-
err = check_return_code(env);
if (err)
return err;
@@ -8058,7 +8102,7 @@ process_bpf_exit:
return err;
env->insn_idx++;
- env->insn_aux_data[env->insn_idx].seen = true;
+ env->insn_aux_data[env->insn_idx].seen = env->pass_cnt;
} else {
verbose(env, "invalid BPF_LD mode\n");
return -EINVAL;
@@ -8071,7 +8115,6 @@ process_bpf_exit:
env->insn_idx++;
}
- env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
return 0;
}
@@ -8131,6 +8174,11 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
return -EINVAL;
}
+ if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
+ verbose(env, "bpf_struct_ops map cannot be used in prog\n");
+ return -EINVAL;
+ }
+
return 0;
}
@@ -8268,7 +8316,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
env->used_maps[env->used_map_cnt++] = map;
if (bpf_map_is_cgroup_storage(map) &&
- bpf_cgroup_storage_assign(env->prog, map)) {
+ bpf_cgroup_storage_assign(env->prog->aux, map)) {
verbose(env, "only one cgroup storage of each type is allowed\n");
fdput(f);
return -EBUSY;
@@ -8298,18 +8346,8 @@ next_insn:
/* drop refcnt of maps used by the rejected program */
static void release_maps(struct bpf_verifier_env *env)
{
- enum bpf_cgroup_storage_type stype;
- int i;
-
- for_each_cgroup_storage_type(stype) {
- if (!env->prog->aux->cgroup_storage[stype])
- continue;
- bpf_cgroup_storage_release(env->prog,
- env->prog->aux->cgroup_storage[stype]);
- }
-
- for (i = 0; i < env->used_map_cnt; i++)
- bpf_map_put(env->used_maps[i]);
+ __bpf_free_used_maps(env->prog->aux, env->used_maps,
+ env->used_map_cnt);
}
/* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */
@@ -8353,7 +8391,7 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env,
memcpy(new_data + off + cnt - 1, old_data + off,
sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
for (i = off; i < off + cnt - 1; i++) {
- new_data[i].seen = true;
+ new_data[i].seen = env->pass_cnt;
new_data[i].zext_dst = insn_has_def32(env, insn + i);
}
env->insn_aux_data = new_data;
@@ -8832,12 +8870,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
convert_ctx_access = bpf_xdp_sock_convert_ctx_access;
break;
case PTR_TO_BTF_ID:
- if (type == BPF_WRITE) {
+ if (type == BPF_READ) {
+ insn->code = BPF_LDX | BPF_PROBE_MEM |
+ BPF_SIZE((insn)->code);
+ env->prog->aux->num_exentries++;
+ } else if (env->prog->type != BPF_PROG_TYPE_STRUCT_OPS) {
verbose(env, "Writes through BTF pointers are not allowed\n");
return -EINVAL;
}
- insn->code = BPF_LDX | BPF_PROBE_MEM | BPF_SIZE((insn)->code);
- env->prog->aux->num_exentries++;
continue;
default:
continue;
@@ -9282,7 +9322,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
insn->code = BPF_JMP | BPF_TAIL_CALL;
aux = &env->insn_aux_data[i + delta];
- if (prog->jit_requested && !expect_blinding &&
+ if (env->allow_ptr_leaks && !expect_blinding &&
+ prog->jit_requested &&
!bpf_map_key_poisoned(aux) &&
!bpf_map_ptr_poisoned(aux) &&
!bpf_map_ptr_unpriv(aux)) {
@@ -9416,6 +9457,30 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
goto patch_call_imm;
}
+ if (prog->jit_requested && BITS_PER_LONG == 64 &&
+ insn->imm == BPF_FUNC_jiffies64) {
+ struct bpf_insn ld_jiffies_addr[2] = {
+ BPF_LD_IMM64(BPF_REG_0,
+ (unsigned long)&jiffies),
+ };
+
+ insn_buf[0] = ld_jiffies_addr[0];
+ insn_buf[1] = ld_jiffies_addr[1];
+ insn_buf[2] = BPF_LDX_MEM(BPF_DW, BPF_REG_0,
+ BPF_REG_0, 0);
+ cnt = 3;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf,
+ cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
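/* Editor's illustration (not part of the patch): a BPF program using the
 * bpf_jiffies64() helper that the rewrite above inlines into an address load
 * of &jiffies plus a memory load on 64-bit JITs. Availability of the helper
 * declaration in the program-side headers is assumed.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("tracepoint/syscalls/sys_enter_nanosleep")
int log_jiffies(void *ctx)
{
	__u64 now = bpf_jiffies64();

	bpf_printk("jiffies=%llu\n", now);
	return 0;
}

char _license[] SEC("license") = "GPL";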
patch_call_imm:
fn = env->ops->get_func_proto(insn->imm, env->prog);
/* all functions that have prototype and verifier allowed
@@ -9462,6 +9527,7 @@ static void free_states(struct bpf_verifier_env *env)
kfree(sl);
sl = sln;
}
+ env->free_list = NULL;
if (!env->explored_states)
return;
@@ -9475,11 +9541,164 @@ static void free_states(struct bpf_verifier_env *env)
kfree(sl);
sl = sln;
}
+ env->explored_states[i] = NULL;
}
+}
- kvfree(env->explored_states);
+/* The verifier is using insn_aux_data[] to store temporary data during
+ * verification, and to store information for passes that run after
+ * verification, such as dead code sanitization. do_check_common() for
+ * subprogram N may analyze many other subprograms. sanitize_insn_aux_data()
+ * clears all temporary data after do_check_common() finds that subprogram N
+ * cannot be verified independently. pass_cnt counts the number of times
+ * do_check_common() was run, and insn->aux->seen records the pass number in
+ * which insn_aux_data was touched. These variables are compared to clear
+ * temporary data from a failed pass. For testing and experiments,
+ * do_check_common() can be run multiple times even when a prior attempt to
+ * verify was unsuccessful.
+ */
+static void sanitize_insn_aux_data(struct bpf_verifier_env *env)
+{
+ struct bpf_insn *insn = env->prog->insnsi;
+ struct bpf_insn_aux_data *aux;
+ int i, class;
+
+ for (i = 0; i < env->prog->len; i++) {
+ class = BPF_CLASS(insn[i].code);
+ if (class != BPF_LDX && class != BPF_STX)
+ continue;
+ aux = &env->insn_aux_data[i];
+ if (aux->seen != env->pass_cnt)
+ continue;
+ memset(aux, 0, offsetof(typeof(*aux), orig_idx));
+ }
}
+static int do_check_common(struct bpf_verifier_env *env, int subprog)
+{
+ struct bpf_verifier_state *state;
+ struct bpf_reg_state *regs;
+ int ret, i;
+
+ env->prev_linfo = NULL;
+ env->pass_cnt++;
+
+ state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL);
+ if (!state)
+ return -ENOMEM;
+ state->curframe = 0;
+ state->speculative = false;
+ state->branches = 1;
+ state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);
+ if (!state->frame[0]) {
+ kfree(state);
+ return -ENOMEM;
+ }
+ env->cur_state = state;
+ init_func_state(env, state->frame[0],
+ BPF_MAIN_FUNC /* callsite */,
+ 0 /* frameno */,
+ subprog);
+
+ regs = state->frame[state->curframe]->regs;
+ if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) {
+ ret = btf_prepare_func_args(env, subprog, regs);
+ if (ret)
+ goto out;
+ for (i = BPF_REG_1; i <= BPF_REG_5; i++) {
+ if (regs[i].type == PTR_TO_CTX)
+ mark_reg_known_zero(env, regs, i);
+ else if (regs[i].type == SCALAR_VALUE)
+ mark_reg_unknown(env, regs, i);
+ }
+ } else {
+ /* 1st arg to a function */
+ regs[BPF_REG_1].type = PTR_TO_CTX;
+ mark_reg_known_zero(env, regs, BPF_REG_1);
+ ret = btf_check_func_arg_match(env, subprog, regs);
+ if (ret == -EFAULT)
+ /* Unlikely verifier bug; abort.
+ * ret == 0 and ret < 0 are sadly acceptable for
+ * the main() function due to backward compatibility.
+ * For example, a socket filter program may be written as:
+ * int bpf_prog(struct pt_regs *ctx)
+ * and never dereference that ctx in the program.
+ * 'struct pt_regs' is a type mismatch for socket
+ * filter that should be using 'struct __sk_buff'.
+ */
+ goto out;
+ }
+
+ ret = do_check(env);
+out:
+ /* check for NULL is necessary, since cur_state can be freed inside
+ * do_check() under memory pressure.
+ */
+ if (env->cur_state) {
+ free_verifier_state(env->cur_state, true);
+ env->cur_state = NULL;
+ }
+ while (!pop_stack(env, NULL, NULL));
+ free_states(env);
+ if (ret)
+ /* clean aux data in case subprog was rejected */
+ sanitize_insn_aux_data(env);
+ return ret;
+}
+
+/* Verify all global functions in a BPF program one by one based on their BTF.
+ * All global functions must pass verification. Otherwise the whole program is rejected.
+ * Consider:
+ * int bar(int);
+ * int foo(int f)
+ * {
+ * return bar(f);
+ * }
+ * int bar(int b)
+ * {
+ * ...
+ * }
+ * foo() will be verified first for R1=any_scalar_value. During verification it
+ * will be assumed that bar() has already been verified successfully, and the
+ * call to bar() from foo() will be checked for a type match only. Later bar()
+ * will be verified independently to check that it's safe for
+ * R1=any_scalar_value.
+ */
+static int do_check_subprogs(struct bpf_verifier_env *env)
+{
+ struct bpf_prog_aux *aux = env->prog->aux;
+ int i, ret;
+
+ if (!aux->func_info)
+ return 0;
+
+ for (i = 1; i < env->subprog_cnt; i++) {
+ if (aux->func_info_aux[i].linkage != BTF_FUNC_GLOBAL)
+ continue;
+ env->insn_idx = env->subprog_info[i].start;
+ WARN_ON_ONCE(env->insn_idx == 0);
+ ret = do_check_common(env, i);
+ if (ret) {
+ return ret;
+ } else if (env->log.level & BPF_LOG_LEVEL) {
+ verbose(env,
+ "Func#%d is safe for any args that match its prototype\n",
+ i);
+ }
+ }
+ return 0;
+}
+
+static int do_check_main(struct bpf_verifier_env *env)
+{
+ int ret;
+
+ env->insn_idx = 0;
+ ret = do_check_common(env, 0);
+ if (!ret)
+ env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
+ return ret;
+}
+
+
static void print_verification_stats(struct bpf_verifier_env *env)
{
int i;
@@ -9504,9 +9723,62 @@ static void print_verification_stats(struct bpf_verifier_env *env)
env->peak_states, env->longest_mark_read_walk);
}
+static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
+{
+ const struct btf_type *t, *func_proto;
+ const struct bpf_struct_ops *st_ops;
+ const struct btf_member *member;
+ struct bpf_prog *prog = env->prog;
+ u32 btf_id, member_idx;
+ const char *mname;
+
+ btf_id = prog->aux->attach_btf_id;
+ st_ops = bpf_struct_ops_find(btf_id);
+ if (!st_ops) {
+ verbose(env, "attach_btf_id %u is not a supported struct\n",
+ btf_id);
+ return -ENOTSUPP;
+ }
+
+ t = st_ops->type;
+ member_idx = prog->expected_attach_type;
+ if (member_idx >= btf_type_vlen(t)) {
+ verbose(env, "attach to invalid member idx %u of struct %s\n",
+ member_idx, st_ops->name);
+ return -EINVAL;
+ }
+
+ member = &btf_type_member(t)[member_idx];
+ mname = btf_name_by_offset(btf_vmlinux, member->name_off);
+ func_proto = btf_type_resolve_func_ptr(btf_vmlinux, member->type,
+ NULL);
+ if (!func_proto) {
+ verbose(env, "attach to invalid member %s(@idx %u) of struct %s\n",
+ mname, member_idx, st_ops->name);
+ return -EINVAL;
+ }
+
+ if (st_ops->check_member) {
+ int err = st_ops->check_member(t, member);
+
+ if (err) {
+ verbose(env, "attach to unsupported member %s of struct %s\n",
+ mname, st_ops->name);
+ return err;
+ }
+ }
+
+ prog->aux->attach_func_proto = func_proto;
+ prog->aux->attach_func_name = mname;
+ env->ops = st_ops->verifier_ops;
+
+ return 0;
+}
+
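/* Editor's note (illustration, not part of the patch): for the first
 * struct_ops user in this series, tcp_congestion_ops, expected_attach_type
 * carries the index of the struct member being implemented (for example the
 * .cong_avoid func ptr), and that member's BTF func_proto becomes
 * attach_func_proto so the verifier can type-check the program against the
 * kernel callback signature. The member name is given only as an example.
 */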
static int check_attach_btf_id(struct bpf_verifier_env *env)
{
struct bpf_prog *prog = env->prog;
+ bool prog_extension = prog->type == BPF_PROG_TYPE_EXT;
struct bpf_prog *tgt_prog = prog->aux->linked_prog;
u32 btf_id = prog->aux->attach_btf_id;
const char prefix[] = "btf_trace_";
@@ -9519,7 +9791,10 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
long addr;
u64 key;
- if (prog->type != BPF_PROG_TYPE_TRACING)
+ if (prog->type == BPF_PROG_TYPE_STRUCT_OPS)
+ return check_struct_ops_btf_id(env);
+
+ if (prog->type != BPF_PROG_TYPE_TRACING && !prog_extension)
return 0;
if (!btf_id) {
@@ -9555,8 +9830,59 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
return -EINVAL;
}
conservative = aux->func_info_aux[subprog].unreliable;
+ if (prog_extension) {
+ if (conservative) {
+ verbose(env,
+ "Cannot replace static functions\n");
+ return -EINVAL;
+ }
+ if (!prog->jit_requested) {
+ verbose(env,
+ "Extension programs should be JITed\n");
+ return -EINVAL;
+ }
+ env->ops = bpf_verifier_ops[tgt_prog->type];
+ }
+ if (!tgt_prog->jited) {
+ verbose(env, "Can attach to only JITed progs\n");
+ return -EINVAL;
+ }
+ if (tgt_prog->type == prog->type) {
+ /* Cannot fentry/fexit another fentry/fexit program.
+ * Cannot attach program extension to another extension.
+ * It's ok to attach fentry/fexit to extension program.
+ */
+ verbose(env, "Cannot recursively attach\n");
+ return -EINVAL;
+ }
+ if (tgt_prog->type == BPF_PROG_TYPE_TRACING &&
+ prog_extension &&
+ (tgt_prog->expected_attach_type == BPF_TRACE_FENTRY ||
+ tgt_prog->expected_attach_type == BPF_TRACE_FEXIT)) {
+ /* Program extensions can extend all program types
+ * except fentry/fexit. The reason is the following:
+ * fentry/fexit programs are used for performance
+ * analysis and stats, and can be attached to any program
+ * type except themselves. When an extension program is
+ * replacing an XDP function, it is necessary to allow
+ * performance analysis of all functions, both the original
+ * XDP program and its program extension. Hence
+ * attaching fentry/fexit to BPF_PROG_TYPE_EXT is
+ * allowed. If extending fentry/fexit were allowed, it
+ * would be possible to create a long call chain,
+ * fentry->extension->fentry->extension, beyond a
+ * reasonable stack size. Hence extending fentry is not
+ * allowed.
+ */
+ verbose(env, "Cannot extend fentry/fexit\n");
+ return -EINVAL;
+ }
key = ((u64)aux->id) << 32 | btf_id;
} else {
+ if (prog_extension) {
+ verbose(env, "Cannot replace kernel functions\n");
+ return -EINVAL;
+ }
key = btf_id;
}
@@ -9594,6 +9920,10 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
prog->aux->attach_func_proto = t;
prog->aux->attach_btf_trace = true;
return 0;
+ default:
+ if (!prog_extension)
+ return -EINVAL;
+ /* fallthrough */
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
if (!btf_type_is_func(t)) {
@@ -9601,6 +9931,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
btf_id);
return -EINVAL;
}
+ if (prog_extension &&
+ btf_check_type_match(env, prog, btf, t))
+ return -EINVAL;
t = btf_type_by_id(btf, t->type);
if (!btf_type_is_func_proto(t))
return -EINVAL;
@@ -9624,18 +9957,6 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
if (ret < 0)
goto out;
if (tgt_prog) {
- if (!tgt_prog->jited) {
- /* for now */
- verbose(env, "Can trace only JITed BPF progs\n");
- ret = -EINVAL;
- goto out;
- }
- if (tgt_prog->type == BPF_PROG_TYPE_TRACING) {
- /* prevent cycles */
- verbose(env, "Cannot recursively attach\n");
- ret = -EINVAL;
- goto out;
- }
if (subprog == 0)
addr = (long) tgt_prog->bpf_func;
else
@@ -9657,8 +9978,6 @@ out:
if (ret)
bpf_trampoline_put(tr);
return ret;
- default:
- return -EINVAL;
}
}
@@ -9728,10 +10047,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
goto skip_full_check;
}
- ret = check_attach_btf_id(env);
- if (ret)
- goto skip_full_check;
-
env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
env->strict_alignment = true;
@@ -9768,22 +10083,22 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
if (ret < 0)
goto skip_full_check;
+ ret = check_attach_btf_id(env);
+ if (ret)
+ goto skip_full_check;
+
ret = check_cfg(env);
if (ret < 0)
goto skip_full_check;
- ret = do_check(env);
- if (env->cur_state) {
- free_verifier_state(env->cur_state, true);
- env->cur_state = NULL;
- }
+ ret = do_check_subprogs(env);
+ ret = ret ?: do_check_main(env);
if (ret == 0 && bpf_prog_is_dev_bound(env->prog->aux))
ret = bpf_prog_offload_finalize(env);
skip_full_check:
- while (!pop_stack(env, NULL, NULL));
- free_states(env);
+ kvfree(env->explored_states);
if (ret == 0)
ret = check_max_stack_depth(env);
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index 90c4fce1c981..2cc5c8f4c800 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -72,9 +72,9 @@ static void xsk_map_sock_delete(struct xdp_sock *xs,
static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
{
struct bpf_map_memory mem;
- int cpu, err, numa_node;
+ int err, numa_node;
struct xsk_map *m;
- u64 cost, size;
+ u64 size;
if (!capable(CAP_NET_ADMIN))
return ERR_PTR(-EPERM);
@@ -86,9 +86,8 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
numa_node = bpf_map_attr_numa_node(attr);
size = struct_size(m, xsk_map, attr->max_entries);
- cost = size + array_size(sizeof(*m->flush_list), num_possible_cpus());
- err = bpf_map_charge_init(&mem, cost);
+ err = bpf_map_charge_init(&mem, size);
if (err < 0)
return ERR_PTR(err);
@@ -102,16 +101,6 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
bpf_map_charge_move(&m->map.memory, &mem);
spin_lock_init(&m->lock);
- m->flush_list = alloc_percpu(struct list_head);
- if (!m->flush_list) {
- bpf_map_charge_finish(&m->map.memory);
- bpf_map_area_free(m);
- return ERR_PTR(-ENOMEM);
- }
-
- for_each_possible_cpu(cpu)
- INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu));
-
return &m->map;
}
@@ -121,7 +110,6 @@ static void xsk_map_free(struct bpf_map *map)
bpf_clear_redirect_map(map);
synchronize_net();
- free_percpu(m->flush_list);
bpf_map_area_free(m);
}
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 735af8f15f95..b3744872263e 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3055,8 +3055,6 @@ static int cgroup_apply_control_enable(struct cgroup *cgrp)
for_each_subsys(ss, ssid) {
struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
- WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));
-
if (!(cgroup_ss_mask(dsct) & (1 << ss->id)))
continue;
@@ -3066,6 +3064,8 @@ static int cgroup_apply_control_enable(struct cgroup *cgrp)
return PTR_ERR(css);
}
+ WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));
+
if (css_visible(css)) {
ret = css_populate_dir(css);
if (ret)
@@ -3101,11 +3101,11 @@ static void cgroup_apply_control_disable(struct cgroup *cgrp)
for_each_subsys(ss, ssid) {
struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
- WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));
-
if (!css)
continue;
+ WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));
+
if (css->parent &&
!(cgroup_ss_mask(dsct) & (1 << ss->id))) {
kill_css(css);
@@ -3392,7 +3392,8 @@ static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf,
if (strcmp(strstrip(buf), "threaded"))
return -EINVAL;
- cgrp = cgroup_kn_lock_live(of->kn, false);
+ /* drain dying csses before we re-apply (threaded) subtree control */
+ cgrp = cgroup_kn_lock_live(of->kn, true);
if (!cgrp)
return -ENOENT;
@@ -6288,12 +6289,13 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
#ifdef CONFIG_CGROUP_BPF
int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
- enum bpf_attach_type type, u32 flags)
+ struct bpf_prog *replace_prog, enum bpf_attach_type type,
+ u32 flags)
{
int ret;
mutex_lock(&cgroup_mutex);
- ret = __cgroup_bpf_attach(cgrp, prog, type, flags);
+ ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, type, flags);
mutex_unlock(&cgroup_mutex);
return ret;
}
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index b48b22d4deb6..6f87352f8219 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -33,7 +33,7 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
return;
/*
- * Paired with the one in cgroup_rstat_cpu_pop_upated(). Either we
+ * Paired with the one in cgroup_rstat_cpu_pop_updated(). Either we
* see NULL updated_next or they see our updated stat.
*/
smp_mb();
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a59cc980adad..9c706af713fb 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -525,8 +525,7 @@ static int bringup_wait_for_ap(unsigned int cpu)
if (WARN_ON_ONCE((!cpu_online(cpu))))
return -ECANCELED;
- /* Unpark the stopper thread and the hotplug thread of the target cpu */
- stop_machine_unpark(cpu);
+ /* Unpark the hotplug thread of the target cpu */
kthread_unpark(st->thread);
/*
@@ -1089,8 +1088,8 @@ void notify_cpu_starting(unsigned int cpu)
/*
* Called from the idle task. Wake up the controlling task which brings the
- * stopper and the hotplug thread of the upcoming CPU up and then delegates
- * the rest of the online bringup to the hotplug thread.
+ * hotplug thread of the upcoming CPU up and then delegates the rest of the
+ * online bringup to the hotplug thread.
*/
void cpuhp_online_idle(enum cpuhp_state state)
{
@@ -1100,6 +1099,12 @@ void cpuhp_online_idle(enum cpuhp_state state)
if (state != CPUHP_AP_ONLINE_IDLE)
return;
+ /*
+ * Unpark the stopper thread before we start the idle loop (and start
+ * scheduling); this ensures the stopper task is always available.
+ */
+ stop_machine_unpark(smp_processor_id());
+
st->state = CPUHP_AP_ONLINE_IDLE;
complete_ap_thread(st, true);
}
@@ -1909,6 +1914,78 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
}
EXPORT_SYMBOL(__cpuhp_remove_state);
+#ifdef CONFIG_HOTPLUG_SMT
+static void cpuhp_offline_cpu_device(unsigned int cpu)
+{
+ struct device *dev = get_cpu_device(cpu);
+
+ dev->offline = true;
+ /* Tell user space about the state change */
+ kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
+}
+
+static void cpuhp_online_cpu_device(unsigned int cpu)
+{
+ struct device *dev = get_cpu_device(cpu);
+
+ dev->offline = false;
+ /* Tell user space about the state change */
+ kobject_uevent(&dev->kobj, KOBJ_ONLINE);
+}
+
+int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
+{
+ int cpu, ret = 0;
+
+ cpu_maps_update_begin();
+ for_each_online_cpu(cpu) {
+ if (topology_is_primary_thread(cpu))
+ continue;
+ ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
+ if (ret)
+ break;
+ /*
+ * As this needs to hold the cpu maps lock it's impossible
+ * to call device_offline() because that ends up calling
+ * cpu_down() which takes cpu maps lock. cpu maps lock
+ * needs to be held as this might race against in kernel
+ * abusers of the hotplug machinery (thermal management).
+ *
+ * So nothing would update device:offline state. That would
+ * leave the sysfs entry stale and prevent onlining after
+ * smt control has been changed to 'off' again. This is
+ * called under the sysfs hotplug lock, so it is properly
+ * serialized against the regular offline usage.
+ */
+ cpuhp_offline_cpu_device(cpu);
+ }
+ if (!ret)
+ cpu_smt_control = ctrlval;
+ cpu_maps_update_done();
+ return ret;
+}
+
+int cpuhp_smt_enable(void)
+{
+ int cpu, ret = 0;
+
+ cpu_maps_update_begin();
+ cpu_smt_control = CPU_SMT_ENABLED;
+ for_each_present_cpu(cpu) {
+ /* Skip online CPUs and CPUs on offline nodes */
+ if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))
+ continue;
+ ret = _cpu_up(cpu, 0, CPUHP_ONLINE);
+ if (ret)
+ break;
+ /* See comment in cpuhp_smt_disable() */
+ cpuhp_online_cpu_device(cpu);
+ }
+ cpu_maps_update_done();
+ return ret;
+}
+#endif
+
#if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU)
static ssize_t show_cpuhp_state(struct device *dev,
struct device_attribute *attr, char *buf)
@@ -2063,77 +2140,6 @@ static const struct attribute_group cpuhp_cpu_root_attr_group = {
#ifdef CONFIG_HOTPLUG_SMT
-static void cpuhp_offline_cpu_device(unsigned int cpu)
-{
- struct device *dev = get_cpu_device(cpu);
-
- dev->offline = true;
- /* Tell user space about the state change */
- kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
-}
-
-static void cpuhp_online_cpu_device(unsigned int cpu)
-{
- struct device *dev = get_cpu_device(cpu);
-
- dev->offline = false;
- /* Tell user space about the state change */
- kobject_uevent(&dev->kobj, KOBJ_ONLINE);
-}
-
-int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
-{
- int cpu, ret = 0;
-
- cpu_maps_update_begin();
- for_each_online_cpu(cpu) {
- if (topology_is_primary_thread(cpu))
- continue;
- ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
- if (ret)
- break;
- /*
- * As this needs to hold the cpu maps lock it's impossible
- * to call device_offline() because that ends up calling
- * cpu_down() which takes cpu maps lock. cpu maps lock
- * needs to be held as this might race against in kernel
- * abusers of the hotplug machinery (thermal management).
- *
- * So nothing would update device:offline state. That would
- * leave the sysfs entry stale and prevent onlining after
- * smt control has been changed to 'off' again. This is
- * called under the sysfs hotplug lock, so it is properly
- * serialized against the regular offline usage.
- */
- cpuhp_offline_cpu_device(cpu);
- }
- if (!ret)
- cpu_smt_control = ctrlval;
- cpu_maps_update_done();
- return ret;
-}
-
-int cpuhp_smt_enable(void)
-{
- int cpu, ret = 0;
-
- cpu_maps_update_begin();
- cpu_smt_control = CPU_SMT_ENABLED;
- for_each_present_cpu(cpu) {
- /* Skip online CPUs and CPUs on offline nodes */
- if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))
- continue;
- ret = _cpu_up(cpu, 0, CPUHP_ONLINE);
- if (ret)
- break;
- /* See comment in cpuhp_smt_disable() */
- cpuhp_online_cpu_device(cpu);
- }
- cpu_maps_update_done();
- return ret;
-}
-
-
static ssize_t
__store_smt_control(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
diff --git a/kernel/cred.c b/kernel/cred.c
index c0a4c12d38b2..809a985b1793 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -175,8 +175,8 @@ void exit_creds(struct task_struct *tsk)
put_cred(cred);
#ifdef CONFIG_KEYS_REQUEST_CACHE
- key_put(current->cached_requested_key);
- current->cached_requested_key = NULL;
+ key_put(tsk->cached_requested_key);
+ tsk->cached_requested_key = NULL;
#endif
}
@@ -223,7 +223,7 @@ struct cred *cred_alloc_blank(void)
new->magic = CRED_MAGIC;
#endif
- if (security_cred_alloc_blank(new, GFP_KERNEL) < 0)
+ if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0)
goto error;
return new;
@@ -282,7 +282,7 @@ struct cred *prepare_creds(void)
new->security = NULL;
#endif
- if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
+ if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
goto error;
validate_creds(new);
return new;
@@ -715,7 +715,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
#ifdef CONFIG_SECURITY
new->security = NULL;
#endif
- if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
+ if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
goto error;
put_cred(old);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4ff86d57f9e5..2173c23c25b4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -10523,7 +10523,7 @@ again:
goto unlock;
}
- list_for_each_entry_rcu(pmu, &pmus, entry) {
+ list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
ret = perf_try_init_event(pmu, event);
if (!ret)
goto unlock;
@@ -11465,8 +11465,10 @@ SYSCALL_DEFINE5(perf_event_open,
}
}
- if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader))
+ if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
+ err = -EINVAL;
goto err_locked;
+ }
/*
* Must be under the same ctx::mutex as perf_install_in_context(),
diff --git a/kernel/exit.c b/kernel/exit.c
index bcbd59888e67..2833ffb0c211 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -517,10 +517,6 @@ static struct task_struct *find_child_reaper(struct task_struct *father,
}
write_unlock_irq(&tasklist_lock);
- if (unlikely(pid_ns == &init_pid_ns)) {
- panic("Attempted to kill init! exitcode=0x%08x\n",
- father->signal->group_exit_code ?: father->exit_code);
- }
list_for_each_entry_safe(p, n, dead, ptrace_entry) {
list_del_init(&p->ptrace_entry);
@@ -766,6 +762,14 @@ void __noreturn do_exit(long code)
acct_update_integrals(tsk);
group_dead = atomic_dec_and_test(&tsk->signal->live);
if (group_dead) {
+ /*
+ * If the last thread of global init has exited, panic
+ * immediately to get a usable coredump.
+ */
+ if (unlikely(is_global_init(tsk)))
+ panic("Attempted to kill init! exitcode=0x%08x\n",
+ tsk->signal->group_exit_code ?: (int)code);
+
#ifdef CONFIG_POSIX_TIMERS
hrtimer_cancel(&tsk->signal->real_timer);
exit_itimers(tsk->signal);
diff --git a/kernel/extable.c b/kernel/extable.c
index f6920a11e28a..a0024f27d3a1 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -131,8 +131,9 @@ int kernel_text_address(unsigned long addr)
* triggers a stack trace, or a WARN() that happens during
* coming back from idle, or cpu on or offlining.
*
- * is_module_text_address() as well as the kprobe slots
- * and is_bpf_text_address() require RCU to be watching.
+ * is_module_text_address() as well as the kprobe slots,
+ * is_bpf_text_address() and is_bpf_image_address() require
+ * RCU to be watching.
*/
no_rcu = !rcu_is_watching();
@@ -148,6 +149,8 @@ int kernel_text_address(unsigned long addr)
goto out;
if (is_bpf_text_address(addr))
goto out;
+ if (is_bpf_image_address(addr))
+ goto out;
ret = 0;
out:
if (no_rcu)
diff --git a/kernel/fork.c b/kernel/fork.c
index 2508a4f238a3..ef82feb4bddc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1832,6 +1832,7 @@ static __latent_entropy struct task_struct *copy_process(
struct multiprocess_signals delayed;
struct file *pidfile = NULL;
u64 clone_flags = args->flags;
+ struct nsproxy *nsp = current->nsproxy;
/*
* Don't allow sharing the root directory with processes in a different
@@ -1874,8 +1875,16 @@ static __latent_entropy struct task_struct *copy_process(
*/
if (clone_flags & CLONE_THREAD) {
if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
- (task_active_pid_ns(current) !=
- current->nsproxy->pid_ns_for_children))
+ (task_active_pid_ns(current) != nsp->pid_ns_for_children))
+ return ERR_PTR(-EINVAL);
+ }
+
+ /*
+ * If the new process will be in a different time namespace
+ * do not allow it to share VM or a thread group with the forking task.
+ */
+ if (clone_flags & (CLONE_THREAD | CLONE_VM)) {
+ if (nsp->time_ns != nsp->time_ns_for_children)
return ERR_PTR(-EINVAL);
}
@@ -2578,6 +2587,16 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
#endif
#ifdef __ARCH_WANT_SYS_CLONE3
+
+/*
+ * copy_thread implementations handle CLONE_SETTLS by reading the TLS value from
+ * the registers containing the syscall arguments for clone. This doesn't work
+ * with clone3 since the TLS value is passed in clone_args instead.
+ */
+#ifndef CONFIG_HAVE_COPY_THREAD_TLS
+#error clone3 requires copy_thread_tls support in arch
+#endif
+
noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
struct clone_args __user *uargs,
size_t usize)
@@ -2811,7 +2830,8 @@ static int check_unshare_flags(unsigned long unshare_flags)
if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
- CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP))
+ CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP|
+ CLONE_NEWTIME))
return -EINVAL;
/*
* Not implemented, but pretend it works if there is nothing
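The CLONE_NEWTIME handling above only takes effect for children created after the namespace is unshared (time_ns_for_children), which is why sharing VM or a thread group across differing time namespaces is rejected. Below is a minimal userspace sketch of that semantic, assuming a kernel built with CONFIG_TIME_NS and a caller with CAP_SYS_ADMIN; the /proc/self/timens_offsets read is purely illustrative:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

#ifndef CLONE_NEWTIME
#define CLONE_NEWTIME 0x00000080	/* value from <uapi/linux/sched.h> */
#endif

int main(void)
{
	pid_t pid;

	/* Switches time_ns_for_children only; the caller keeps its namespace. */
	if (unshare(CLONE_NEWTIME)) {
		perror("unshare(CLONE_NEWTIME)");
		return 1;
	}

	/* Children created from now on run in the new time namespace. */
	pid = fork();
	if (pid == 0) {
		execlp("cat", "cat", "/proc/self/timens_offsets", (char *)NULL);
		_exit(127);
	}
	waitpid(pid, NULL, 0);
	return 0;
}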
diff --git a/kernel/futex.c b/kernel/futex.c
index 03c518e9747e..0cf84c8664f2 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1178,6 +1178,7 @@ out_error:
/**
* wait_for_owner_exiting - Block until the owner has exited
+ * @ret: owner's current futex lock status
* @exiting: Pointer to the exiting task
*
* Caller must hold a refcount on @exiting.
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 060e8e726755..3941a9c48f83 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -4,7 +4,7 @@ menu "GCOV-based kernel profiling"
config GCOV_KERNEL
bool "Enable gcov-based kernel profiling"
depends on DEBUG_FS
- select CONSTRUCTORS
+ select CONSTRUCTORS if !UML
default n
---help---
This option enables gcov-based code profiling (e.g. for code coverage
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 6c7ca2e983a5..02236b13b359 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -12,6 +12,7 @@
#include <linux/interrupt.h>
#include <linux/ratelimit.h>
#include <linux/irq.h>
+#include <linux/sched/isolation.h>
#include "internals.h"
@@ -171,6 +172,20 @@ void irq_migrate_all_off_this_cpu(void)
}
}
+static bool hk_should_isolate(struct irq_data *data, unsigned int cpu)
+{
+ const struct cpumask *hk_mask;
+
+ if (!housekeeping_enabled(HK_FLAG_MANAGED_IRQ))
+ return false;
+
+ hk_mask = housekeeping_cpumask(HK_FLAG_MANAGED_IRQ);
+ if (cpumask_subset(irq_data_get_effective_affinity_mask(data), hk_mask))
+ return false;
+
+ return cpumask_test_cpu(cpu, hk_mask);
+}
+
static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu)
{
struct irq_data *data = irq_desc_get_irq_data(desc);
@@ -188,9 +203,11 @@ static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu)
/*
* If the interrupt can only be directed to a single target
* CPU then it is already assigned to a CPU in the affinity
- * mask. No point in trying to move it around.
+ * mask. No point in trying to move it around unless the
+ * isolation mechanism requests to move it to an upcoming
+ * housekeeping CPU.
*/
- if (!irqd_is_single_target(data))
+ if (!irqd_is_single_target(data) || hk_should_isolate(data, cpu))
irq_set_affinity_locked(data, affinity, false);
}
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 5b8fdd659e54..98a5f10d1900 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -891,6 +891,7 @@ __irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
}
void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus)
+ __releases(&desc->lock)
{
raw_spin_unlock_irqrestore(&desc->lock, flags);
if (bus)
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index dd822fd8a7d5..7527e5ef6fe5 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -987,6 +987,23 @@ const struct irq_domain_ops irq_domain_simple_ops = {
EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
/**
+ * irq_domain_translate_onecell() - Generic translate for direct one cell
+ * bindings
+ */
+int irq_domain_translate_onecell(struct irq_domain *d,
+ struct irq_fwspec *fwspec,
+ unsigned long *out_hwirq,
+ unsigned int *out_type)
+{
+ if (WARN_ON(fwspec->param_count < 1))
+ return -EINVAL;
+ *out_hwirq = fwspec->param[0];
+ *out_type = IRQ_TYPE_NONE;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_domain_translate_onecell);
+
+/**
* irq_domain_translate_twocell() - Generic translate for direct two cell
* bindings
*
@@ -1459,6 +1476,7 @@ int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg)
if (rv) {
/* Restore the original irq_data. */
*root_irq_data = *child_irq_data;
+ kfree(child_irq_data);
goto error;
}
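irq_domain_translate_onecell() mirrors the existing twocell helper for firmware nodes that encode only a hwirq number. A sketch of how a driver might wire it up; foo_irq_domain_alloc() is a hypothetical callback, and only the .translate assignment uses the helper added by this patch:

#include <linux/irqdomain.h>

static int foo_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
				unsigned int nr_irqs, void *arg);

static const struct irq_domain_ops foo_domain_ops = {
	.translate	= irq_domain_translate_onecell,
	.alloc		= foo_irq_domain_alloc,
	.free		= irq_domain_free_irqs_common,
};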
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 1753486b440c..818b2802d3e7 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -18,6 +18,7 @@
#include <linux/sched.h>
#include <linux/sched/rt.h>
#include <linux/sched/task.h>
+#include <linux/sched/isolation.h>
#include <uapi/linux/sched/types.h>
#include <linux/task_work.h>
@@ -217,7 +218,45 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
if (!chip || !chip->irq_set_affinity)
return -EINVAL;
- ret = chip->irq_set_affinity(data, mask, force);
+ /*
+ * If this is a managed interrupt and housekeeping is enabled on
+ * it, check whether the requested affinity mask intersects with
+ * a housekeeping CPU. If so, remove the isolated CPUs from the
+ * mask and just keep the housekeeping CPU(s). This prevents the
+ * affinity setter from routing the interrupt to an isolated CPU,
+ * so that I/O submitted from a housekeeping CPU does not cause
+ * interrupts on an isolated one.
+ *
+ * If the masks do not intersect or include online CPU(s) then
+ * keep the requested mask. The isolated target CPUs are only
+ * receiving interrupts when the I/O operation was submitted
+ * directly from them.
+ *
+ * If all housekeeping CPUs in the affinity mask are offline, the
+ * interrupt will be migrated by the CPU hotplug code once a
+ * housekeeping CPU which belongs to the affinity mask comes
+ * online.
+ */
+ if (irqd_affinity_is_managed(data) &&
+ housekeeping_enabled(HK_FLAG_MANAGED_IRQ)) {
+ const struct cpumask *hk_mask, *prog_mask;
+
+ static DEFINE_RAW_SPINLOCK(tmp_mask_lock);
+ static struct cpumask tmp_mask;
+
+ hk_mask = housekeeping_cpumask(HK_FLAG_MANAGED_IRQ);
+
+ raw_spin_lock(&tmp_mask_lock);
+ cpumask_and(&tmp_mask, mask, hk_mask);
+ if (!cpumask_intersects(&tmp_mask, cpu_online_mask))
+ prog_mask = mask;
+ else
+ prog_mask = &tmp_mask;
+ ret = chip->irq_set_affinity(data, prog_mask, force);
+ raw_spin_unlock(&tmp_mask_lock);
+ } else {
+ ret = chip->irq_set_affinity(data, mask, force);
+ }
switch (ret) {
case IRQ_SET_MASK_OK:
case IRQ_SET_MASK_OK_DONE:
@@ -1500,8 +1539,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
* has. The type flags are unreliable as the
* underlying chip implementation can override them.
*/
- pr_err("Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n",
- irq);
+ pr_err("Threaded irq requested with handler=NULL and !ONESHOT for %s (irq %d)\n",
+ new->name, irq);
ret = -EINVAL;
goto out_unlock;
}
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 2ed97a7c9b2a..f865e5f4d382 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -34,6 +34,7 @@ static atomic_t irq_poll_active;
* true and let the handler run.
*/
bool irq_wait_for_poll(struct irq_desc *desc)
+ __must_hold(&desc->lock)
{
if (WARN_ONCE(irq_poll_cpu == smp_processor_id(),
"irq poll in progress on cpu %d for irq %d\n",
diff --git a/kernel/kexec.c b/kernel/kexec.c
index bc933c0db9bf..f977786fe498 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -159,6 +159,10 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
kimage_terminate(image);
+ ret = machine_kexec_post_load(image);
+ if (ret)
+ goto out;
+
/* Install the new kernel and uninstall the old */
image = xchg(dest_image, image);
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 15d70a90b50d..c19c0dad1ebe 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -589,6 +589,12 @@ static void kimage_free_extra_pages(struct kimage *image)
kimage_free_page_list(&image->unusable_pages);
}
+
+int __weak machine_kexec_post_load(struct kimage *image)
+{
+ return 0;
+}
+
void kimage_terminate(struct kimage *image)
{
if (*image->entry != 0)
@@ -1171,7 +1177,7 @@ int kernel_kexec(void)
* CPU hotplug again; so re-enable it here.
*/
cpu_hotplug_enable();
- pr_emerg("Starting new kernel\n");
+ pr_notice("Starting new kernel\n");
machine_shutdown();
}
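machine_kexec_post_load() is a new __weak hook that both kexec_load() and kexec_file_load() invoke after kimage_terminate(), so an architecture can post-process the image once its segments are loaded (for example to prepare relocation code at load time instead of at reboot). A hedged sketch of an override; arch_prepare_kexec_relocation() is an assumed helper, not a real kernel function:

/* Hypothetical arch/<arch>/kernel/machine_kexec.c override */
int machine_kexec_post_load(struct kimage *image)
{
	/* e.g. copy and fix up the relocation stub while loading, not at reboot */
	return arch_prepare_kexec_relocation(image);
}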
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index a2df93948665..faa74d5f6941 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -441,6 +441,10 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
kimage_terminate(image);
+ ret = machine_kexec_post_load(image);
+ if (ret)
+ goto out;
+
/*
* Free up any temporary buffers allocated which are not needed
* after image has been loaded
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
index 48aaf2ac0d0d..39d30ccf8d87 100644
--- a/kernel/kexec_internal.h
+++ b/kernel/kexec_internal.h
@@ -13,6 +13,8 @@ void kimage_terminate(struct kimage *image);
int kimage_is_destination_range(struct kimage *image,
unsigned long start, unsigned long end);
+int machine_kexec_post_load(struct kimage *image);
+
extern struct mutex kexec_mutex;
#ifdef CONFIG_KEXEC_FILE
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 53534aa258a6..2625c241ac00 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -510,6 +510,8 @@ static void do_unoptimize_kprobes(void)
arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
/* Loop free_list for disarming */
list_for_each_entry_safe(op, tmp, &freeing_list, list) {
+ /* Switching from detour code to origin */
+ op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
/* Disarm probes if marked disabled */
if (kprobe_disabled(&op->kp))
arch_disarm_kprobe(&op->kp);
@@ -610,6 +612,18 @@ void wait_for_kprobe_optimizer(void)
mutex_unlock(&kprobe_mutex);
}
+static bool optprobe_queued_unopt(struct optimized_kprobe *op)
+{
+ struct optimized_kprobe *_op;
+
+ list_for_each_entry(_op, &unoptimizing_list, list) {
+ if (op == _op)
+ return true;
+ }
+
+ return false;
+}
+
/* Optimize kprobe if p is ready to be optimized */
static void optimize_kprobe(struct kprobe *p)
{
@@ -631,17 +645,21 @@ static void optimize_kprobe(struct kprobe *p)
return;
/* Check if it is already optimized. */
- if (op->kp.flags & KPROBE_FLAG_OPTIMIZED)
+ if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) {
+ if (optprobe_queued_unopt(op)) {
+ /* This is under unoptimizing. Just dequeue the probe */
+ list_del_init(&op->list);
+ }
return;
+ }
op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
- if (!list_empty(&op->list))
- /* This is under unoptimizing. Just dequeue the probe */
- list_del_init(&op->list);
- else {
- list_add(&op->list, &optimizing_list);
- kick_kprobe_optimizer();
- }
+ /* On unoptimizing/optimizing_list, op must have OPTIMIZED flag */
+ if (WARN_ON_ONCE(!list_empty(&op->list)))
+ return;
+
+ list_add(&op->list, &optimizing_list);
+ kick_kprobe_optimizer();
}
/* Short cut to direct unoptimizing */
@@ -649,6 +667,7 @@ static void force_unoptimize_kprobe(struct optimized_kprobe *op)
{
lockdep_assert_cpus_held();
arch_unoptimize_kprobe(op);
+ op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
if (kprobe_disabled(&op->kp))
arch_disarm_kprobe(&op->kp);
}
@@ -662,31 +681,33 @@ static void unoptimize_kprobe(struct kprobe *p, bool force)
return; /* This is not an optprobe nor optimized */
op = container_of(p, struct optimized_kprobe, kp);
- if (!kprobe_optimized(p)) {
- /* Unoptimized or unoptimizing case */
- if (force && !list_empty(&op->list)) {
- /*
- * Only if this is unoptimizing kprobe and forced,
- * forcibly unoptimize it. (No need to unoptimize
- * unoptimized kprobe again :)
- */
- list_del_init(&op->list);
- force_unoptimize_kprobe(op);
- }
+ if (!kprobe_optimized(p))
return;
- }
- op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
if (!list_empty(&op->list)) {
- /* Dequeue from the optimization queue */
- list_del_init(&op->list);
+ if (optprobe_queued_unopt(op)) {
+ /* Queued in unoptimizing queue */
+ if (force) {
+ /*
+ * Forcibly unoptimize the kprobe here, and queue it
+ * in the freeing list for release afterwards.
+ */
+ force_unoptimize_kprobe(op);
+ list_move(&op->list, &freeing_list);
+ }
+ } else {
+ /* Dequeue from the optimizing queue */
+ list_del_init(&op->list);
+ op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
+ }
return;
}
+
/* Optimized kprobe case */
- if (force)
+ if (force) {
/* Forcibly update the code: this is a special case */
force_unoptimize_kprobe(op);
- else {
+ } else {
list_add(&op->list, &unoptimizing_list);
kick_kprobe_optimizer();
}
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 32282e7112d3..32406ef0d6a2 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -482,7 +482,7 @@ static struct lock_trace *save_trace(void)
struct lock_trace *trace, *t2;
struct hlist_head *hash_head;
u32 hash;
- unsigned int max_entries;
+ int max_entries;
BUILD_BUG_ON_NOT_POWER_OF_2(STACK_TRACE_HASH_SIZE);
BUILD_BUG_ON(LOCK_TRACE_SIZE_IN_LONGS >= MAX_STACK_TRACE_ENTRIES);
@@ -490,10 +490,8 @@ static struct lock_trace *save_trace(void)
trace = (struct lock_trace *)(stack_trace + nr_stack_trace_entries);
max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries -
LOCK_TRACE_SIZE_IN_LONGS;
- trace->nr_entries = stack_trace_save(trace->entries, max_entries, 3);
- if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES -
- LOCK_TRACE_SIZE_IN_LONGS - 1) {
+ if (max_entries <= 0) {
if (!debug_locks_off_graph_unlock())
return NULL;
@@ -502,6 +500,7 @@ static struct lock_trace *save_trace(void)
return NULL;
}
+ trace->nr_entries = stack_trace_save(trace->entries, max_entries, 3);
hash = jhash(trace->entries, trace->nr_entries *
sizeof(trace->entries[0]), 0);
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index dadb7b7fba37..9bb6d2497b04 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -286,9 +286,9 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
seq_printf(m, " stack-trace entries: %11lu [max: %lu]\n",
nr_stack_trace_entries, MAX_STACK_TRACE_ENTRIES);
#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
- seq_printf(m, " number of stack traces: %llu\n",
+ seq_printf(m, " number of stack traces: %11llu\n",
lockdep_stack_trace_count());
- seq_printf(m, " number of stack hash chains: %llu\n",
+ seq_printf(m, " number of stack hash chains: %11llu\n",
lockdep_stack_hash_count());
#endif
seq_printf(m, " combined max dependencies: %11u\n",
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 54cc5f9286e9..5352ce50a97e 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -733,9 +733,6 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
*/
void __sched mutex_unlock(struct mutex *lock)
{
-#ifdef CONFIG_DEBUG_MUTEXES
- WARN_ON(in_interrupt());
-#endif
#ifndef CONFIG_DEBUG_LOCK_ALLOC
if (__mutex_unlock_fast(lock))
return;
@@ -1416,7 +1413,6 @@ int __sched mutex_trylock(struct mutex *lock)
#ifdef CONFIG_DEBUG_MUTEXES
DEBUG_LOCKS_WARN_ON(lock->magic != lock);
- WARN_ON(in_interrupt());
#endif
locked = __mutex_trylock(lock);
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index 6ef600aa0f47..1f7734949ac8 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -134,20 +134,17 @@ bool osq_lock(struct optimistic_spin_queue *lock)
* cmpxchg in an attempt to undo our queueing.
*/
- while (!READ_ONCE(node->locked)) {
- /*
- * If we need to reschedule bail... so we can block.
- * Use vcpu_is_preempted() to avoid waiting for a preempted
- * lock holder:
- */
- if (need_resched() || vcpu_is_preempted(node_cpu(node->prev)))
- goto unqueue;
-
- cpu_relax();
- }
- return true;
+ /*
+ * Wait to acquire the lock or cancellation. Note that need_resched()
+ * will come with an IPI, which will wake smp_cond_load_relaxed() if it
+ * is implemented with a monitor-wait. vcpu_is_preempted() relies on
+ * polling, be careful.
+ */
+ if (smp_cond_load_relaxed(&node->locked, VAL || need_resched() ||
+ vcpu_is_preempted(node_cpu(node->prev))))
+ return true;
-unqueue:
+ /* unqueue */
/*
* Step - A -- stabilize @prev
*
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 2473f10c6956..b9515fcc9b29 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -31,14 +31,15 @@
/*
* The basic principle of a queue-based spinlock can best be understood
* by studying a classic queue-based spinlock implementation called the
- * MCS lock. The paper below provides a good description for this kind
- * of lock.
+ * MCS lock. A copy of the original MCS lock paper ("Algorithms for Scalable
+ * Synchronization on Shared-Memory Multiprocessors" by Mellor-Crummey and
+ * Scott) is available at
*
- * http://www.cise.ufl.edu/tr/DOC/REP-1992-71.pdf
+ * https://bugzilla.kernel.org/show_bug.cgi?id=206115
*
- * This queued spinlock implementation is based on the MCS lock, however to make
- * it fit the 4 bytes we assume spinlock_t to be, and preserve its existing
- * API, we must modify it somehow.
+ * This queued spinlock implementation is based on the MCS lock, however to
+ * make it fit the 4 bytes we assume spinlock_t to be, and preserve its
+ * existing API, we must modify it somehow.
*
* In particular; where the traditional MCS lock consists of a tail pointer
* (8 bytes) and needs the next pointer (another 8 bytes) of its own node to
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 44e68761f432..0d9b6be9ecc8 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1226,8 +1226,8 @@ wait:
* In this case, we attempt to acquire the lock again
* without sleeping.
*/
- if ((wstate == WRITER_HANDOFF) &&
- (rwsem_spin_on_owner(sem, 0) == OWNER_NULL))
+ if (wstate == WRITER_HANDOFF &&
+ rwsem_spin_on_owner(sem, RWSEM_NONSPINNABLE) == OWNER_NULL)
goto trylock_again;
/* Block until there are no active lockers. */
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
index 399669f7eba8..472dd462a40c 100644
--- a/kernel/locking/spinlock_debug.c
+++ b/kernel/locking/spinlock_debug.c
@@ -51,19 +51,19 @@ EXPORT_SYMBOL(__rwlock_init);
static void spin_dump(raw_spinlock_t *lock, const char *msg)
{
- struct task_struct *owner = NULL;
+ struct task_struct *owner = READ_ONCE(lock->owner);
- if (lock->owner && lock->owner != SPINLOCK_OWNER_INIT)
- owner = lock->owner;
+ if (owner == SPINLOCK_OWNER_INIT)
+ owner = NULL;
printk(KERN_EMERG "BUG: spinlock %s on CPU#%d, %s/%d\n",
msg, raw_smp_processor_id(),
current->comm, task_pid_nr(current));
printk(KERN_EMERG " lock: %pS, .magic: %08x, .owner: %s/%d, "
".owner_cpu: %d\n",
- lock, lock->magic,
+ lock, READ_ONCE(lock->magic),
owner ? owner->comm : "<none>",
owner ? task_pid_nr(owner) : -1,
- lock->owner_cpu);
+ READ_ONCE(lock->owner_cpu));
dump_stack();
}
@@ -80,16 +80,16 @@ static void spin_bug(raw_spinlock_t *lock, const char *msg)
static inline void
debug_spin_lock_before(raw_spinlock_t *lock)
{
- SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic");
- SPIN_BUG_ON(lock->owner == current, lock, "recursion");
- SPIN_BUG_ON(lock->owner_cpu == raw_smp_processor_id(),
+ SPIN_BUG_ON(READ_ONCE(lock->magic) != SPINLOCK_MAGIC, lock, "bad magic");
+ SPIN_BUG_ON(READ_ONCE(lock->owner) == current, lock, "recursion");
+ SPIN_BUG_ON(READ_ONCE(lock->owner_cpu) == raw_smp_processor_id(),
lock, "cpu recursion");
}
static inline void debug_spin_lock_after(raw_spinlock_t *lock)
{
- lock->owner_cpu = raw_smp_processor_id();
- lock->owner = current;
+ WRITE_ONCE(lock->owner_cpu, raw_smp_processor_id());
+ WRITE_ONCE(lock->owner, current);
}
static inline void debug_spin_unlock(raw_spinlock_t *lock)
@@ -99,8 +99,8 @@ static inline void debug_spin_unlock(raw_spinlock_t *lock)
SPIN_BUG_ON(lock->owner != current, lock, "wrong owner");
SPIN_BUG_ON(lock->owner_cpu != raw_smp_processor_id(),
lock, "wrong CPU");
- lock->owner = SPINLOCK_OWNER_INIT;
- lock->owner_cpu = -1;
+ WRITE_ONCE(lock->owner, SPINLOCK_OWNER_INIT);
+ WRITE_ONCE(lock->owner_cpu, -1);
}
/*
@@ -187,8 +187,8 @@ static inline void debug_write_lock_before(rwlock_t *lock)
static inline void debug_write_lock_after(rwlock_t *lock)
{
- lock->owner_cpu = raw_smp_processor_id();
- lock->owner = current;
+ WRITE_ONCE(lock->owner_cpu, raw_smp_processor_id());
+ WRITE_ONCE(lock->owner, current);
}
static inline void debug_write_unlock(rwlock_t *lock)
@@ -197,8 +197,8 @@ static inline void debug_write_unlock(rwlock_t *lock)
RWLOCK_BUG_ON(lock->owner != current, lock, "wrong owner");
RWLOCK_BUG_ON(lock->owner_cpu != raw_smp_processor_id(),
lock, "wrong CPU");
- lock->owner = SPINLOCK_OWNER_INIT;
- lock->owner_cpu = -1;
+ WRITE_ONCE(lock->owner, SPINLOCK_OWNER_INIT);
+ WRITE_ONCE(lock->owner_cpu, -1);
}
void do_raw_write_lock(rwlock_t *lock)
diff --git a/kernel/module.c b/kernel/module.c
index 3a486f826224..ac058a5ad1d1 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2031,49 +2031,6 @@ static void module_enable_nx(const struct module *mod)
frob_writable_data(&mod->init_layout, set_memory_nx);
}
-/* Iterate through all modules and set each module's text as RW */
-void set_all_modules_text_rw(void)
-{
- struct module *mod;
-
- if (!rodata_enabled)
- return;
-
- mutex_lock(&module_mutex);
- list_for_each_entry_rcu(mod, &modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
-
- frob_text(&mod->core_layout, set_memory_rw);
- frob_text(&mod->init_layout, set_memory_rw);
- }
- mutex_unlock(&module_mutex);
-}
-
-/* Iterate through all modules and set each module's text as RO */
-void set_all_modules_text_ro(void)
-{
- struct module *mod;
-
- if (!rodata_enabled)
- return;
-
- mutex_lock(&module_mutex);
- list_for_each_entry_rcu(mod, &modules, list) {
- /*
- * Ignore going modules since it's possible that ro
- * protection has already been disabled, otherwise we'll
- * run into protection faults at module deallocation.
- */
- if (mod->state == MODULE_STATE_UNFORMED ||
- mod->state == MODULE_STATE_GOING)
- continue;
-
- frob_text(&mod->core_layout, set_memory_ro);
- frob_text(&mod->init_layout, set_memory_ro);
- }
- mutex_unlock(&module_mutex);
-}
#else /* !CONFIG_STRICT_MODULE_RWX */
static void module_enable_nx(const struct module *mod) { }
#endif /* CONFIG_STRICT_MODULE_RWX */
@@ -3730,6 +3687,7 @@ static int complete_formation(struct module *mod, struct load_info *info)
module_enable_ro(mod, false);
module_enable_nx(mod);
+ module_enable_x(mod);
/* Mark state as coming so strong_try_module_get() ignores us,
* but kallsyms etc. can see us. */
@@ -3752,11 +3710,6 @@ static int prepare_coming_module(struct module *mod)
if (err)
return err;
- /* Make module executable after ftrace is enabled */
- mutex_lock(&module_mutex);
- module_enable_x(mod);
- mutex_unlock(&module_mutex);
-
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_COMING, mod);
return 0;
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index c815f58e6bc0..ed9882108cd2 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -18,6 +18,7 @@
#include <linux/pid_namespace.h>
#include <net/net_namespace.h>
#include <linux/ipc_namespace.h>
+#include <linux/time_namespace.h>
#include <linux/proc_ns.h>
#include <linux/file.h>
#include <linux/syscalls.h>
@@ -40,6 +41,10 @@ struct nsproxy init_nsproxy = {
#ifdef CONFIG_CGROUPS
.cgroup_ns = &init_cgroup_ns,
#endif
+#ifdef CONFIG_TIME_NS
+ .time_ns = &init_time_ns,
+ .time_ns_for_children = &init_time_ns,
+#endif
};
static inline struct nsproxy *create_nsproxy(void)
@@ -106,8 +111,18 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
goto out_net;
}
+ new_nsp->time_ns_for_children = copy_time_ns(flags, user_ns,
+ tsk->nsproxy->time_ns_for_children);
+ if (IS_ERR(new_nsp->time_ns_for_children)) {
+ err = PTR_ERR(new_nsp->time_ns_for_children);
+ goto out_time;
+ }
+ new_nsp->time_ns = get_time_ns(tsk->nsproxy->time_ns);
+
return new_nsp;
+out_time:
+ put_net(new_nsp->net_ns);
out_net:
put_cgroup_ns(new_nsp->cgroup_ns);
out_cgroup:
@@ -136,15 +151,16 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
struct nsproxy *old_ns = tsk->nsproxy;
struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
struct nsproxy *new_ns;
+ int ret;
if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
CLONE_NEWPID | CLONE_NEWNET |
- CLONE_NEWCGROUP)))) {
- get_nsproxy(old_ns);
- return 0;
- }
-
- if (!ns_capable(user_ns, CAP_SYS_ADMIN))
+ CLONE_NEWCGROUP | CLONE_NEWTIME)))) {
+ if (likely(old_ns->time_ns_for_children == old_ns->time_ns)) {
+ get_nsproxy(old_ns);
+ return 0;
+ }
+ } else if (!ns_capable(user_ns, CAP_SYS_ADMIN))
return -EPERM;
/*
@@ -162,6 +178,12 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
if (IS_ERR(new_ns))
return PTR_ERR(new_ns);
+ ret = timens_on_fork(new_ns, tsk);
+ if (ret) {
+ free_nsproxy(new_ns);
+ return ret;
+ }
+
tsk->nsproxy = new_ns;
return 0;
}
@@ -176,6 +198,10 @@ void free_nsproxy(struct nsproxy *ns)
put_ipc_ns(ns->ipc_ns);
if (ns->pid_ns_for_children)
put_pid_ns(ns->pid_ns_for_children);
+ if (ns->time_ns)
+ put_time_ns(ns->time_ns);
+ if (ns->time_ns_for_children)
+ put_time_ns(ns->time_ns_for_children);
put_cgroup_ns(ns->cgroup_ns);
put_net(ns->net_ns);
kmem_cache_free(nsproxy_cachep, ns);
@@ -192,7 +218,8 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
int err = 0;
if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
- CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP)))
+ CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP |
+ CLONE_NEWTIME)))
return 0;
user_ns = new_cred ? new_cred->user_ns : current_user_ns();
diff --git a/kernel/padata.c b/kernel/padata.c
index c3fec1413295..72777c10bb9c 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -2,7 +2,7 @@
/*
* padata.c - generic interface to process data streams in parallel
*
- * See Documentation/padata.txt for an api documentation.
+ * See Documentation/core-api/padata.rst for more information.
*
* Copyright (C) 2008, 2009 secunet Security Networks AG
* Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com>
@@ -35,6 +35,8 @@
#define MAX_OBJ_NUM 1000
+static void padata_free_pd(struct parallel_data *pd);
+
static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
{
int cpu, target_cpu;
@@ -87,7 +89,7 @@ static void padata_parallel_worker(struct work_struct *parallel_work)
/**
* padata_do_parallel - padata parallelization function
*
- * @pinst: padata instance
+ * @ps: padata shell
* @padata: object to be parallelized
* @cb_cpu: pointer to the CPU that the serialization callback function should
* run on. If it's not in the serial cpumask of @pinst
@@ -97,17 +99,20 @@ static void padata_parallel_worker(struct work_struct *parallel_work)
* The parallelization callback function will run with BHs off.
* Note: Every object which is parallelized by padata_do_parallel
* must be seen by padata_do_serial.
+ *
+ * Return: 0 on success or else negative error code.
*/
-int padata_do_parallel(struct padata_instance *pinst,
+int padata_do_parallel(struct padata_shell *ps,
struct padata_priv *padata, int *cb_cpu)
{
+ struct padata_instance *pinst = ps->pinst;
int i, cpu, cpu_index, target_cpu, err;
struct padata_parallel_queue *queue;
struct parallel_data *pd;
rcu_read_lock_bh();
- pd = rcu_dereference_bh(pinst->pd);
+ pd = rcu_dereference_bh(ps->pd);
err = -EINVAL;
if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID)
@@ -160,14 +165,12 @@ EXPORT_SYMBOL(padata_do_parallel);
/*
* padata_find_next - Find the next object that needs serialization.
*
- * Return values are:
- *
- * A pointer to the control struct of the next object that needs
- * serialization, if present in one of the percpu reorder queues.
- *
- * NULL, if the next object that needs serialization will
- * be parallel processed by another cpu and is not yet present in
- * the cpu's reorder queue.
+ * Return:
+ * * A pointer to the control struct of the next object that needs
+ * serialization, if present in one of the percpu reorder queues.
+ * * NULL, if the next object that needs serialization will
+ * be parallel processed by another cpu and is not yet present in
+ * the cpu's reorder queue.
*/
static struct padata_priv *padata_find_next(struct parallel_data *pd,
bool remove_object)
@@ -199,7 +202,6 @@ static struct padata_priv *padata_find_next(struct parallel_data *pd,
if (remove_object) {
list_del_init(&padata->list);
- atomic_dec(&pd->reorder_objects);
++pd->processed;
pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu, -1, false);
}
@@ -210,10 +212,10 @@ static struct padata_priv *padata_find_next(struct parallel_data *pd,
static void padata_reorder(struct parallel_data *pd)
{
+ struct padata_instance *pinst = pd->ps->pinst;
int cb_cpu;
struct padata_priv *padata;
struct padata_serial_queue *squeue;
- struct padata_instance *pinst = pd->pinst;
struct padata_parallel_queue *next_queue;
/*
@@ -283,6 +285,7 @@ static void padata_serial_worker(struct work_struct *serial_work)
struct padata_serial_queue *squeue;
struct parallel_data *pd;
LIST_HEAD(local_list);
+ int cnt;
local_bh_disable();
squeue = container_of(serial_work, struct padata_serial_queue, work);
@@ -292,6 +295,8 @@ static void padata_serial_worker(struct work_struct *serial_work)
list_replace_init(&squeue->serial.list, &local_list);
spin_unlock(&squeue->serial.lock);
+ cnt = 0;
+
while (!list_empty(&local_list)) {
struct padata_priv *padata;
@@ -301,9 +306,12 @@ static void padata_serial_worker(struct work_struct *serial_work)
list_del_init(&padata->list);
padata->serial(padata);
- atomic_dec(&pd->refcnt);
+ cnt++;
}
local_bh_enable();
+
+ if (atomic_sub_and_test(cnt, &pd->refcnt))
+ padata_free_pd(pd);
}
/**
@@ -327,7 +335,6 @@ void padata_do_serial(struct padata_priv *padata)
if (cur->seq_nr < padata->seq_nr)
break;
list_add(&padata->list, &cur->list);
- atomic_inc(&pd->reorder_objects);
spin_unlock(&pqueue->reorder.lock);
/*
@@ -341,36 +348,39 @@ void padata_do_serial(struct padata_priv *padata)
}
EXPORT_SYMBOL(padata_do_serial);
-static int padata_setup_cpumasks(struct parallel_data *pd,
- const struct cpumask *pcpumask,
- const struct cpumask *cbcpumask)
+static int padata_setup_cpumasks(struct padata_instance *pinst)
{
struct workqueue_attrs *attrs;
+ int err;
+
+ attrs = alloc_workqueue_attrs();
+ if (!attrs)
+ return -ENOMEM;
+
+ /* Restrict parallel_wq workers to pd->cpumask.pcpu. */
+ cpumask_copy(attrs->cpumask, pinst->cpumask.pcpu);
+ err = apply_workqueue_attrs(pinst->parallel_wq, attrs);
+ free_workqueue_attrs(attrs);
+
+ return err;
+}
+
+static int pd_setup_cpumasks(struct parallel_data *pd,
+ const struct cpumask *pcpumask,
+ const struct cpumask *cbcpumask)
+{
int err = -ENOMEM;
if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL))
goto out;
- cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_online_mask);
-
if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL))
goto free_pcpu_mask;
- cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_online_mask);
- attrs = alloc_workqueue_attrs();
- if (!attrs)
- goto free_cbcpu_mask;
-
- /* Restrict parallel_wq workers to pd->cpumask.pcpu. */
- cpumask_copy(attrs->cpumask, pd->cpumask.pcpu);
- err = apply_workqueue_attrs(pd->pinst->parallel_wq, attrs);
- free_workqueue_attrs(attrs);
- if (err < 0)
- goto free_cbcpu_mask;
+ cpumask_copy(pd->cpumask.pcpu, pcpumask);
+ cpumask_copy(pd->cpumask.cbcpu, cbcpumask);
return 0;
-free_cbcpu_mask:
- free_cpumask_var(pd->cpumask.cbcpu);
free_pcpu_mask:
free_cpumask_var(pd->cpumask.pcpu);
out:
@@ -414,12 +424,16 @@ static void padata_init_pqueues(struct parallel_data *pd)
}
/* Allocate and initialize the internal cpumask dependend resources. */
-static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
- const struct cpumask *pcpumask,
- const struct cpumask *cbcpumask)
+static struct parallel_data *padata_alloc_pd(struct padata_shell *ps)
{
+ struct padata_instance *pinst = ps->pinst;
+ const struct cpumask *cbcpumask;
+ const struct cpumask *pcpumask;
struct parallel_data *pd;
+ cbcpumask = pinst->rcpumask.cbcpu;
+ pcpumask = pinst->rcpumask.pcpu;
+
pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL);
if (!pd)
goto err;
@@ -432,15 +446,14 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
if (!pd->squeue)
goto err_free_pqueue;
- pd->pinst = pinst;
- if (padata_setup_cpumasks(pd, pcpumask, cbcpumask) < 0)
+ pd->ps = ps;
+ if (pd_setup_cpumasks(pd, pcpumask, cbcpumask))
goto err_free_squeue;
padata_init_pqueues(pd);
padata_init_squeues(pd);
atomic_set(&pd->seq_nr, -1);
- atomic_set(&pd->reorder_objects, 0);
- atomic_set(&pd->refcnt, 0);
+ atomic_set(&pd->refcnt, 1);
spin_lock_init(&pd->lock);
pd->cpu = cpumask_first(pd->cpumask.pcpu);
INIT_WORK(&pd->reorder_work, invoke_padata_reorder);
@@ -466,29 +479,6 @@ static void padata_free_pd(struct parallel_data *pd)
kfree(pd);
}
-/* Flush all objects out of the padata queues. */
-static void padata_flush_queues(struct parallel_data *pd)
-{
- int cpu;
- struct padata_parallel_queue *pqueue;
- struct padata_serial_queue *squeue;
-
- for_each_cpu(cpu, pd->cpumask.pcpu) {
- pqueue = per_cpu_ptr(pd->pqueue, cpu);
- flush_work(&pqueue->work);
- }
-
- if (atomic_read(&pd->reorder_objects))
- padata_reorder(pd);
-
- for_each_cpu(cpu, pd->cpumask.cbcpu) {
- squeue = per_cpu_ptr(pd->squeue, cpu);
- flush_work(&squeue->work);
- }
-
- BUG_ON(atomic_read(&pd->refcnt) != 0);
-}
-
static void __padata_start(struct padata_instance *pinst)
{
pinst->flags |= PADATA_INIT;
@@ -502,72 +492,52 @@ static void __padata_stop(struct padata_instance *pinst)
pinst->flags &= ~PADATA_INIT;
synchronize_rcu();
-
- get_online_cpus();
- padata_flush_queues(pinst->pd);
- put_online_cpus();
}
/* Replace the internal control structure with a new one. */
-static void padata_replace(struct padata_instance *pinst,
- struct parallel_data *pd_new)
+static int padata_replace_one(struct padata_shell *ps)
+{
+ struct parallel_data *pd_new;
+
+ pd_new = padata_alloc_pd(ps);
+ if (!pd_new)
+ return -ENOMEM;
+
+ ps->opd = rcu_dereference_protected(ps->pd, 1);
+ rcu_assign_pointer(ps->pd, pd_new);
+
+ return 0;
+}
+
+static int padata_replace(struct padata_instance *pinst)
{
- struct parallel_data *pd_old = pinst->pd;
- int notification_mask = 0;
+ struct padata_shell *ps;
+ int err;
pinst->flags |= PADATA_RESET;
- rcu_assign_pointer(pinst->pd, pd_new);
+ cpumask_and(pinst->rcpumask.pcpu, pinst->cpumask.pcpu,
+ cpu_online_mask);
- synchronize_rcu();
+ cpumask_and(pinst->rcpumask.cbcpu, pinst->cpumask.cbcpu,
+ cpu_online_mask);
- if (!cpumask_equal(pd_old->cpumask.pcpu, pd_new->cpumask.pcpu))
- notification_mask |= PADATA_CPU_PARALLEL;
- if (!cpumask_equal(pd_old->cpumask.cbcpu, pd_new->cpumask.cbcpu))
- notification_mask |= PADATA_CPU_SERIAL;
+ list_for_each_entry(ps, &pinst->pslist, list) {
+ err = padata_replace_one(ps);
+ if (err)
+ break;
+ }
- padata_flush_queues(pd_old);
- padata_free_pd(pd_old);
+ synchronize_rcu();
- if (notification_mask)
- blocking_notifier_call_chain(&pinst->cpumask_change_notifier,
- notification_mask,
- &pd_new->cpumask);
+ list_for_each_entry_continue_reverse(ps, &pinst->pslist, list)
+ if (atomic_dec_and_test(&ps->opd->refcnt))
+ padata_free_pd(ps->opd);
pinst->flags &= ~PADATA_RESET;
-}
-
-/**
- * padata_register_cpumask_notifier - Registers a notifier that will be called
- * if either pcpu or cbcpu or both cpumasks change.
- *
- * @pinst: A poineter to padata instance
- * @nblock: A pointer to notifier block.
- */
-int padata_register_cpumask_notifier(struct padata_instance *pinst,
- struct notifier_block *nblock)
-{
- return blocking_notifier_chain_register(&pinst->cpumask_change_notifier,
- nblock);
-}
-EXPORT_SYMBOL(padata_register_cpumask_notifier);
-/**
- * padata_unregister_cpumask_notifier - Unregisters cpumask notifier
- * registered earlier using padata_register_cpumask_notifier
- *
- * @pinst: A pointer to data instance.
- * @nlock: A pointer to notifier block.
- */
-int padata_unregister_cpumask_notifier(struct padata_instance *pinst,
- struct notifier_block *nblock)
-{
- return blocking_notifier_chain_unregister(
- &pinst->cpumask_change_notifier,
- nblock);
+ return err;
}
-EXPORT_SYMBOL(padata_unregister_cpumask_notifier);
-
/* If cpumask contains no active cpu, we mark the instance as invalid. */
static bool padata_validate_cpumask(struct padata_instance *pinst,
@@ -587,7 +557,7 @@ static int __padata_set_cpumasks(struct padata_instance *pinst,
cpumask_var_t cbcpumask)
{
int valid;
- struct parallel_data *pd;
+ int err;
valid = padata_validate_cpumask(pinst, pcpumask);
if (!valid) {
@@ -600,29 +570,26 @@ static int __padata_set_cpumasks(struct padata_instance *pinst,
__padata_stop(pinst);
out_replace:
- pd = padata_alloc_pd(pinst, pcpumask, cbcpumask);
- if (!pd)
- return -ENOMEM;
-
cpumask_copy(pinst->cpumask.pcpu, pcpumask);
cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
- padata_replace(pinst, pd);
+ err = padata_setup_cpumasks(pinst) ?: padata_replace(pinst);
if (valid)
__padata_start(pinst);
- return 0;
+ return err;
}
/**
- * padata_set_cpumask: Sets specified by @cpumask_type cpumask to the value
- * equivalent to @cpumask.
- *
+ * padata_set_cpumask - Set the cpumask corresponding to @cpumask_type to
+ * the value equivalent to @cpumask.
* @pinst: padata instance
* @cpumask_type: PADATA_CPU_SERIAL or PADATA_CPU_PARALLEL corresponding
* to parallel and serial cpumasks respectively.
* @cpumask: the cpumask to use
+ *
+ * Return: 0 on success or negative error code
*/
int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
cpumask_var_t cpumask)
@@ -630,8 +597,8 @@ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
struct cpumask *serial_mask, *parallel_mask;
int err = -EINVAL;
- mutex_lock(&pinst->lock);
get_online_cpus();
+ mutex_lock(&pinst->lock);
switch (cpumask_type) {
case PADATA_CPU_PARALLEL:
@@ -649,8 +616,8 @@ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
err = __padata_set_cpumasks(pinst, parallel_mask, serial_mask);
out:
- put_online_cpus();
mutex_unlock(&pinst->lock);
+ put_online_cpus();
return err;
}
@@ -660,6 +627,8 @@ EXPORT_SYMBOL(padata_set_cpumask);
* padata_start - start the parallel processing
*
* @pinst: padata instance to start
+ *
+ * Return: 0 on success or negative error code
*/
int padata_start(struct padata_instance *pinst)
{
@@ -695,82 +664,33 @@ EXPORT_SYMBOL(padata_stop);
static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
{
- struct parallel_data *pd;
+ int err = 0;
if (cpumask_test_cpu(cpu, cpu_online_mask)) {
- pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu,
- pinst->cpumask.cbcpu);
- if (!pd)
- return -ENOMEM;
-
- padata_replace(pinst, pd);
+ err = padata_replace(pinst);
if (padata_validate_cpumask(pinst, pinst->cpumask.pcpu) &&
padata_validate_cpumask(pinst, pinst->cpumask.cbcpu))
__padata_start(pinst);
}
- return 0;
+ return err;
}
static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
{
- struct parallel_data *pd = NULL;
-
- if (cpumask_test_cpu(cpu, cpu_online_mask)) {
+ int err = 0;
+ if (!cpumask_test_cpu(cpu, cpu_online_mask)) {
if (!padata_validate_cpumask(pinst, pinst->cpumask.pcpu) ||
!padata_validate_cpumask(pinst, pinst->cpumask.cbcpu))
__padata_stop(pinst);
- pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu,
- pinst->cpumask.cbcpu);
- if (!pd)
- return -ENOMEM;
-
- padata_replace(pinst, pd);
-
- cpumask_clear_cpu(cpu, pd->cpumask.cbcpu);
- cpumask_clear_cpu(cpu, pd->cpumask.pcpu);
+ err = padata_replace(pinst);
}
- return 0;
-}
-
- /**
- * padata_remove_cpu - remove a cpu from the one or both(serial and parallel)
- * padata cpumasks.
- *
- * @pinst: padata instance
- * @cpu: cpu to remove
- * @mask: bitmask specifying from which cpumask @cpu should be removed
- * The @mask may be any combination of the following flags:
- * PADATA_CPU_SERIAL - serial cpumask
- * PADATA_CPU_PARALLEL - parallel cpumask
- */
-int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask)
-{
- int err;
-
- if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL)))
- return -EINVAL;
-
- mutex_lock(&pinst->lock);
-
- get_online_cpus();
- if (mask & PADATA_CPU_SERIAL)
- cpumask_clear_cpu(cpu, pinst->cpumask.cbcpu);
- if (mask & PADATA_CPU_PARALLEL)
- cpumask_clear_cpu(cpu, pinst->cpumask.pcpu);
-
- err = __padata_remove_cpu(pinst, cpu);
- put_online_cpus();
-
- mutex_unlock(&pinst->lock);
-
return err;
}
-EXPORT_SYMBOL(padata_remove_cpu);
static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu)
{
@@ -793,7 +713,7 @@ static int padata_cpu_online(unsigned int cpu, struct hlist_node *node)
return ret;
}
-static int padata_cpu_prep_down(unsigned int cpu, struct hlist_node *node)
+static int padata_cpu_dead(unsigned int cpu, struct hlist_node *node)
{
struct padata_instance *pinst;
int ret;
@@ -814,11 +734,15 @@ static enum cpuhp_state hp_online;
static void __padata_free(struct padata_instance *pinst)
{
#ifdef CONFIG_HOTPLUG_CPU
+ cpuhp_state_remove_instance_nocalls(CPUHP_PADATA_DEAD, &pinst->node);
cpuhp_state_remove_instance_nocalls(hp_online, &pinst->node);
#endif
+ WARN_ON(!list_empty(&pinst->pslist));
+
padata_stop(pinst);
- padata_free_pd(pinst->pd);
+ free_cpumask_var(pinst->rcpumask.cbcpu);
+ free_cpumask_var(pinst->rcpumask.pcpu);
free_cpumask_var(pinst->cpumask.pcpu);
free_cpumask_var(pinst->cpumask.cbcpu);
destroy_workqueue(pinst->serial_wq);
@@ -959,13 +883,14 @@ static struct kobj_type padata_attr_type = {
* @name: used to identify the instance
* @pcpumask: cpumask that will be used for padata parallelization
* @cbcpumask: cpumask that will be used for padata serialization
+ *
+ * Return: new instance on success, NULL on error
*/
static struct padata_instance *padata_alloc(const char *name,
const struct cpumask *pcpumask,
const struct cpumask *cbcpumask)
{
struct padata_instance *pinst;
- struct parallel_data *pd = NULL;
pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL);
if (!pinst)
@@ -993,29 +918,40 @@ static struct padata_instance *padata_alloc(const char *name,
!padata_validate_cpumask(pinst, cbcpumask))
goto err_free_masks;
- pd = padata_alloc_pd(pinst, pcpumask, cbcpumask);
- if (!pd)
+ if (!alloc_cpumask_var(&pinst->rcpumask.pcpu, GFP_KERNEL))
goto err_free_masks;
+ if (!alloc_cpumask_var(&pinst->rcpumask.cbcpu, GFP_KERNEL))
+ goto err_free_rcpumask_pcpu;
- rcu_assign_pointer(pinst->pd, pd);
+ INIT_LIST_HEAD(&pinst->pslist);
cpumask_copy(pinst->cpumask.pcpu, pcpumask);
cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
+ cpumask_and(pinst->rcpumask.pcpu, pcpumask, cpu_online_mask);
+ cpumask_and(pinst->rcpumask.cbcpu, cbcpumask, cpu_online_mask);
+
+ if (padata_setup_cpumasks(pinst))
+ goto err_free_rcpumask_cbcpu;
pinst->flags = 0;
- BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier);
kobject_init(&pinst->kobj, &padata_attr_type);
mutex_init(&pinst->lock);
#ifdef CONFIG_HOTPLUG_CPU
cpuhp_state_add_instance_nocalls_cpuslocked(hp_online, &pinst->node);
+ cpuhp_state_add_instance_nocalls_cpuslocked(CPUHP_PADATA_DEAD,
+ &pinst->node);
#endif
put_online_cpus();
return pinst;
+err_free_rcpumask_cbcpu:
+ free_cpumask_var(pinst->rcpumask.cbcpu);
+err_free_rcpumask_pcpu:
+ free_cpumask_var(pinst->rcpumask.pcpu);
err_free_masks:
free_cpumask_var(pinst->cpumask.pcpu);
free_cpumask_var(pinst->cpumask.cbcpu);
@@ -1036,6 +972,8 @@ err:
* parallel workers.
*
* @name: used to identify the instance
+ *
+ * Return: new instance on success, NULL on error
*/
struct padata_instance *padata_alloc_possible(const char *name)
{
@@ -1046,7 +984,7 @@ EXPORT_SYMBOL(padata_alloc_possible);
/**
* padata_free - free a padata instance
*
- * @padata_inst: padata instance to free
+ * @pinst: padata instance to free
*/
void padata_free(struct padata_instance *pinst)
{
@@ -1054,6 +992,63 @@ void padata_free(struct padata_instance *pinst)
}
EXPORT_SYMBOL(padata_free);
+/**
+ * padata_alloc_shell - Allocate and initialize padata shell.
+ *
+ * @pinst: Parent padata_instance object.
+ *
+ * Return: new shell on success, NULL on error
+ */
+struct padata_shell *padata_alloc_shell(struct padata_instance *pinst)
+{
+ struct parallel_data *pd;
+ struct padata_shell *ps;
+
+ ps = kzalloc(sizeof(*ps), GFP_KERNEL);
+ if (!ps)
+ goto out;
+
+ ps->pinst = pinst;
+
+ get_online_cpus();
+ pd = padata_alloc_pd(ps);
+ put_online_cpus();
+
+ if (!pd)
+ goto out_free_ps;
+
+ mutex_lock(&pinst->lock);
+ RCU_INIT_POINTER(ps->pd, pd);
+ list_add(&ps->list, &pinst->pslist);
+ mutex_unlock(&pinst->lock);
+
+ return ps;
+
+out_free_ps:
+ kfree(ps);
+out:
+ return NULL;
+}
+EXPORT_SYMBOL(padata_alloc_shell);
+
+/**
+ * padata_free_shell - free a padata shell
+ *
+ * @ps: padata shell to free
+ */
+void padata_free_shell(struct padata_shell *ps)
+{
+ struct padata_instance *pinst = ps->pinst;
+
+ mutex_lock(&pinst->lock);
+ list_del(&ps->list);
+ padata_free_pd(rcu_dereference_protected(ps->pd, 1));
+ mutex_unlock(&pinst->lock);
+
+ kfree(ps);
+}
+EXPORT_SYMBOL(padata_free_shell);
+
#ifdef CONFIG_HOTPLUG_CPU
static __init int padata_driver_init(void)
@@ -1061,17 +1056,24 @@ static __init int padata_driver_init(void)
int ret;
ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "padata:online",
- padata_cpu_online,
- padata_cpu_prep_down);
+ padata_cpu_online, NULL);
if (ret < 0)
return ret;
hp_online = ret;
+
+ ret = cpuhp_setup_state_multi(CPUHP_PADATA_DEAD, "padata:dead",
+ NULL, padata_cpu_dead);
+ if (ret < 0) {
+ cpuhp_remove_multi_state(hp_online);
+ return ret;
+ }
return 0;
}
module_init(padata_driver_init);
static __exit void padata_driver_exit(void)
{
+ cpuhp_remove_multi_state(CPUHP_PADATA_DEAD);
cpuhp_remove_multi_state(hp_online);
}
module_exit(padata_driver_exit);
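With the conversion above, parallel_data hangs off a padata_shell rather than directly off the instance, and padata_do_parallel() now takes the shell. A hedged usage sketch for a hypothetical client module, built only from the interfaces shown in this patch (error paths trimmed; the parallel/serial callbacks are assumed to be filled in by the caller):

#include <linux/padata.h>

static struct padata_instance *pinst;
static struct padata_shell *ps;

static int __init example_init(void)
{
	pinst = padata_alloc_possible("example");
	if (!pinst)
		return -ENOMEM;

	ps = padata_alloc_shell(pinst);
	if (!ps) {
		padata_free(pinst);
		return -ENOMEM;
	}

	return padata_start(pinst);
}

/* @padata must have ->parallel() and ->serial() set; both eventually run. */
static int example_submit(struct padata_priv *padata, int *cb_cpu)
{
	return padata_do_parallel(ps, padata, cb_cpu);
}

static void __exit example_exit(void)
{
	padata_stop(pinst);
	padata_free_shell(ps);
	padata_free(pinst);
}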
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index d3667b4075c1..7cbfbeacd68a 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -27,7 +27,10 @@ config SUSPEND_SKIP_SYNC
Skip the kernel sys_sync() before freezing user processes.
Some systems prefer not to pay this cost on every invocation
of suspend, or they are content with invoking sync() from
- user-space before invoking suspend. Say Y if that's your case.
+ user-space before invoking suspend. There's a run-time switch
+ at '/sys/power/sync_on_suspend' to configure this behaviour.
+ This setting changes the default for the run-time switch. Say Y
+ to change the default to disable the kernel sys_sync().
config HIBERNATE_CALLBACKS
bool
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 3c0a5a8170b0..6dbeedb7354c 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -9,7 +9,7 @@
* Copyright (C) 2012 Bojan Smojver <bojan@rexursive.com>
*/
-#define pr_fmt(fmt) "PM: " fmt
+#define pr_fmt(fmt) "PM: hibernation: " fmt
#include <linux/export.h>
#include <linux/suspend.h>
@@ -106,7 +106,7 @@ EXPORT_SYMBOL(system_entering_hibernation);
#ifdef CONFIG_PM_DEBUG
static void hibernation_debug_sleep(void)
{
- pr_info("hibernation debug: Waiting for 5 seconds.\n");
+ pr_info("debug: Waiting for 5 seconds.\n");
mdelay(5000);
}
@@ -277,7 +277,7 @@ static int create_image(int platform_mode)
error = dpm_suspend_end(PMSG_FREEZE);
if (error) {
- pr_err("Some devices failed to power down, aborting hibernation\n");
+ pr_err("Some devices failed to power down, aborting\n");
return error;
}
@@ -295,7 +295,7 @@ static int create_image(int platform_mode)
error = syscore_suspend();
if (error) {
- pr_err("Some system devices failed to power down, aborting hibernation\n");
+ pr_err("Some system devices failed to power down, aborting\n");
goto Enable_irqs;
}
@@ -310,7 +310,7 @@ static int create_image(int platform_mode)
restore_processor_state();
trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false);
if (error)
- pr_err("Error %d creating hibernation image\n", error);
+ pr_err("Error %d creating image\n", error);
if (!in_suspend) {
events_check_enabled = false;
@@ -680,7 +680,7 @@ static int load_image_and_restore(void)
if (!error)
hibernation_restore(flags & SF_PLATFORM_MODE);
- pr_err("Failed to load hibernation image, recovering.\n");
+ pr_err("Failed to load image, recovering.\n");
swsusp_free();
free_basic_memory_bitmaps();
Unlock:
@@ -743,7 +743,7 @@ int hibernate(void)
else
flags |= SF_CRC32_MODE;
- pm_pr_dbg("Writing image.\n");
+ pm_pr_dbg("Writing hibernation image.\n");
error = swsusp_write(flags);
swsusp_free();
if (!error) {
@@ -755,7 +755,7 @@ int hibernate(void)
in_suspend = 0;
pm_restore_gfp_mask();
} else {
- pm_pr_dbg("Image restored successfully.\n");
+ pm_pr_dbg("Hibernation image restored successfully.\n");
}
Free_bitmaps:
@@ -894,7 +894,7 @@ static int software_resume(void)
goto Close_Finish;
}
- pm_pr_dbg("Preparing processes for restore.\n");
+ pm_pr_dbg("Preparing processes for hibernation restore.\n");
error = freeze_processes();
if (error)
goto Close_Finish;
@@ -903,7 +903,7 @@ static int software_resume(void)
Finish:
__pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);
pm_restore_console();
- pr_info("resume from hibernation failed (%d)\n", error);
+ pr_info("resume failed (%d)\n", error);
atomic_inc(&snapshot_device_available);
/* For success case, the suspend path will release the lock */
Unlock:
@@ -1068,7 +1068,8 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
lock_system_sleep();
swsusp_resume_device = res;
unlock_system_sleep();
- pm_pr_dbg("Configured resume from disk to %u\n", swsusp_resume_device);
+ pm_pr_dbg("Configured hibernation resume from disk to %u\n",
+ swsusp_resume_device);
noresume = 0;
software_resume();
return n;
diff --git a/kernel/power/main.c b/kernel/power/main.c
index e26de7af520b..69b7a8aeca3b 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -190,6 +190,38 @@ static ssize_t mem_sleep_store(struct kobject *kobj, struct kobj_attribute *attr
}
power_attr(mem_sleep);
+
+/*
+ * sync_on_suspend: invoke ksys_sync_helper() before suspend.
+ *
+ * show() returns whether ksys_sync_helper() is invoked before suspend.
+ * store() accepts 0 or 1. 0 disables ksys_sync_helper() and 1 enables it.
+ */
+bool sync_on_suspend_enabled = !IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC);
+
+static ssize_t sync_on_suspend_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", sync_on_suspend_enabled);
+}
+
+static ssize_t sync_on_suspend_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned long val;
+
+ if (kstrtoul(buf, 10, &val))
+ return -EINVAL;
+
+ if (val > 1)
+ return -EINVAL;
+
+ sync_on_suspend_enabled = !!val;
+ return n;
+}
+
+power_attr(sync_on_suspend);
#endif /* CONFIG_SUSPEND */
#ifdef CONFIG_PM_SLEEP_DEBUG
@@ -855,6 +887,7 @@ static struct attribute * g[] = {
&wakeup_count_attr.attr,
#ifdef CONFIG_SUSPEND
&mem_sleep_attr.attr,
+ &sync_on_suspend_attr.attr,
#endif
#ifdef CONFIG_PM_AUTOSLEEP
&autosleep_attr.attr,
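The new /sys/power/sync_on_suspend attribute accepts "0" or "1" and overrides the CONFIG_SUSPEND_SKIP_SYNC default that enter_state() consults (see the suspend.c hunk below). A small userspace sketch, assuming the caller has write access to the sysfs file (normally root):

#include <fcntl.h>
#include <unistd.h>

/* Returns 0 on success; write "1" instead to re-enable the pre-suspend sync. */
static int skip_sync_on_suspend(void)
{
	int fd = open("/sys/power/sync_on_suspend", O_WRONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = write(fd, "0", 1);
	close(fd);
	return n == 1 ? 0 : -1;
}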
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 26b9168321e7..ddade80ad276 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -8,7 +8,7 @@
* Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
*/
-#define pr_fmt(fmt) "PM: " fmt
+#define pr_fmt(fmt) "PM: hibernation: " fmt
#include <linux/version.h>
#include <linux/module.h>
@@ -1147,24 +1147,24 @@ void free_basic_memory_bitmaps(void)
void clear_free_pages(void)
{
-#ifdef CONFIG_PAGE_POISONING_ZERO
struct memory_bitmap *bm = free_pages_map;
unsigned long pfn;
if (WARN_ON(!(free_pages_map)))
return;
- memory_bm_position_reset(bm);
- pfn = memory_bm_next_pfn(bm);
- while (pfn != BM_END_OF_MAP) {
- if (pfn_valid(pfn))
- clear_highpage(pfn_to_page(pfn));
-
+ if (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) || want_init_on_free()) {
+ memory_bm_position_reset(bm);
pfn = memory_bm_next_pfn(bm);
+ while (pfn != BM_END_OF_MAP) {
+ if (pfn_valid(pfn))
+ clear_highpage(pfn_to_page(pfn));
+
+ pfn = memory_bm_next_pfn(bm);
+ }
+ memory_bm_position_reset(bm);
+ pr_info("free pages cleared after restore\n");
}
- memory_bm_position_reset(bm);
- pr_info("free pages cleared after restore\n");
-#endif /* PAGE_POISONING_ZERO */
}
/**
@@ -1566,9 +1566,7 @@ static unsigned long preallocate_image_highmem(unsigned long nr_pages)
*/
static unsigned long __fraction(u64 x, u64 multiplier, u64 base)
{
- x *= multiplier;
- do_div(x, base);
- return (unsigned long)x;
+ return div64_u64(x * multiplier, base);
}
static unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
@@ -1705,16 +1703,20 @@ int hibernate_preallocate_memory(void)
ktime_t start, stop;
int error;
- pr_info("Preallocating image memory... ");
+ pr_info("Preallocating image memory\n");
start = ktime_get();
error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY);
- if (error)
+ if (error) {
+ pr_err("Cannot allocate original bitmap\n");
goto err_out;
+ }
error = memory_bm_create(&copy_bm, GFP_IMAGE, PG_ANY);
- if (error)
+ if (error) {
+ pr_err("Cannot allocate copy bitmap\n");
goto err_out;
+ }
alloc_normal = 0;
alloc_highmem = 0;
@@ -1804,8 +1806,11 @@ int hibernate_preallocate_memory(void)
alloc -= pages;
pages += pages_highmem;
pages_highmem = preallocate_image_highmem(alloc);
- if (pages_highmem < alloc)
+ if (pages_highmem < alloc) {
+ pr_err("Image allocation is %lu pages short\n",
+ alloc - pages_highmem);
goto err_out;
+ }
pages += pages_highmem;
/*
* size is the desired number of saveable pages to leave in
@@ -1836,13 +1841,12 @@ int hibernate_preallocate_memory(void)
out:
stop = ktime_get();
- pr_cont("done (allocated %lu pages)\n", pages);
+ pr_info("Allocated %lu pages for snapshot\n", pages);
swsusp_show_speed(start, stop, pages, "Allocated");
return 0;
err_out:
- pr_cont("\n");
swsusp_free();
return -ENOMEM;
}
@@ -1976,7 +1980,7 @@ asmlinkage __visible int swsusp_save(void)
{
unsigned int nr_pages, nr_highmem;
- pr_info("Creating hibernation image:\n");
+ pr_info("Creating image:\n");
drain_local_pages(NULL);
nr_pages = count_data_pages();
@@ -2010,7 +2014,7 @@ asmlinkage __visible int swsusp_save(void)
nr_copy_pages = nr_pages;
nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE);
- pr_info("Hibernation image created (%d pages copied)\n", nr_pages);
+ pr_info("Image created (%d pages copied)\n", nr_pages);
return 0;
}
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index f3b7239f1892..2c47280fbfc7 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -564,7 +564,7 @@ static int enter_state(suspend_state_t state)
if (state == PM_SUSPEND_TO_IDLE)
s2idle_begin();
- if (!IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC)) {
+ if (sync_on_suspend_enabled) {
trace_suspend_resume(TPS("sync_filesystems"), 0, true);
ksys_sync_helper();
trace_suspend_resume(TPS("sync_filesystems"), 0, false);
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 60564b58de07..e1ed58adb69e 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -70,7 +70,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
static char info_test[] __initdata =
KERN_INFO "PM: test RTC wakeup from '%s' suspend\n";
- unsigned long now;
+ time64_t now;
struct rtc_wkalrm alm;
int status;
@@ -81,10 +81,10 @@ repeat:
printk(err_readtime, dev_name(&rtc->dev), status);
return;
}
- rtc_tm_to_time(&alm.time, &now);
+ now = rtc_tm_to_time64(&alm.time);
memset(&alm, 0, sizeof alm);
- rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time);
+ rtc_time64_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time);
alm.enabled = true;
status = rtc_set_alarm(rtc, &alm);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index cb9ddcc08119..43d6179508d6 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -264,12 +264,17 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
return ret;
}
-static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
+static bool ptrace_has_cap(const struct cred *cred, struct user_namespace *ns,
+ unsigned int mode)
{
+ int ret;
+
if (mode & PTRACE_MODE_NOAUDIT)
- return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE);
+ ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NOAUDIT);
else
- return has_ns_capability(current, ns, CAP_SYS_PTRACE);
+ ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NONE);
+
+ return ret == 0;
}
/* Returns 0 on success, -errno on denial. */
@@ -321,7 +326,7 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
gid_eq(caller_gid, tcred->sgid) &&
gid_eq(caller_gid, tcred->gid))
goto ok;
- if (ptrace_has_cap(tcred->user_ns, mode))
+ if (ptrace_has_cap(cred, tcred->user_ns, mode))
goto ok;
rcu_read_unlock();
return -EPERM;
@@ -340,7 +345,7 @@ ok:
mm = task->mm;
if (mm &&
((get_dumpable(mm) != SUID_DUMP_USER) &&
- !ptrace_has_cap(mm->user_ns, mode)))
+ !ptrace_has_cap(cred, mm->user_ns, mode)))
return -EPERM;
return security_ptrace_access_check(task, mode);
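
The reworked ptrace_has_cap() above takes the caller's credentials explicitly and builds on security_capable(), which follows the usual 0-on-success / negative-errno-on-denial convention. A minimal userspace sketch of converting that convention into a boolean; the helper names here are made up for illustration:

#include <stdbool.h>
#include <stdio.h>
#include <errno.h>

/* Hypothetical stand-in for a kernel-style check that returns
 * 0 on success and a negative errno on denial. */
static int check_capability(int has_it)
{
	return has_it ? 0 : -EPERM;
}

/* Convert the 0/-errno convention into a plain yes/no answer,
 * mirroring the "return ret == 0" in ptrace_has_cap() above. */
static bool has_capability(int has_it)
{
	return check_capability(has_it) == 0;
}

int main(void)
{
	printf("granted: %d\n", has_capability(1));	/* 1 */
	printf("denied:  %d\n", has_capability(0));	/* 0 */
	return 0;
}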
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 27c48eb7de40..a4f86a9d6937 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -310,6 +310,8 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
int ret;
if (flags & RSEQ_FLAG_UNREGISTER) {
+ if (flags & ~RSEQ_FLAG_UNREGISTER)
+ return -EINVAL;
/* Unregister rseq for current thread. */
if (current->rseq != rseq || !current->rseq)
return -EINVAL;
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 1152259a4ca0..12bca64dff73 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -370,7 +370,7 @@ u64 sched_clock_cpu(int cpu)
if (sched_clock_stable())
return sched_clock() + __sched_clock_offset;
- if (!static_branch_unlikely(&sched_clock_running))
+ if (!static_branch_likely(&sched_clock_running))
return sched_clock();
preempt_disable_notrace();
@@ -393,7 +393,7 @@ void sched_clock_tick(void)
if (sched_clock_stable())
return;
- if (!static_branch_unlikely(&sched_clock_running))
+ if (!static_branch_likely(&sched_clock_running))
return;
lockdep_assert_irqs_disabled();
@@ -460,7 +460,7 @@ void __init sched_clock_init(void)
u64 sched_clock_cpu(int cpu)
{
- if (!static_branch_unlikely(&sched_clock_running))
+ if (!static_branch_likely(&sched_clock_running))
return 0;
return sched_clock();
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 90e4b00ace89..fc1dfc007604 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -919,17 +919,17 @@ uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
return uc_req;
}
-unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
+unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
{
struct uclamp_se uc_eff;
/* Task currently refcounted: use back-annotated (effective) value */
if (p->uclamp[clamp_id].active)
- return p->uclamp[clamp_id].value;
+ return (unsigned long)p->uclamp[clamp_id].value;
uc_eff = uclamp_eff_get(p, clamp_id);
- return uc_eff.value;
+ return (unsigned long)uc_eff.value;
}
/*
@@ -1253,7 +1253,8 @@ static void __init init_uclamp(void)
mutex_init(&uclamp_mutex);
for_each_possible_cpu(cpu) {
- memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq));
+ memset(&cpu_rq(cpu)->uclamp, 0,
+ sizeof(struct uclamp_rq)*UCLAMP_CNT);
cpu_rq(cpu)->uclamp_flags = 0;
}
@@ -4504,7 +4505,7 @@ static inline int rt_effective_prio(struct task_struct *p, int prio)
void set_user_nice(struct task_struct *p, long nice)
{
bool queued, running;
- int old_prio, delta;
+ int old_prio;
struct rq_flags rf;
struct rq *rq;
@@ -4538,19 +4539,18 @@ void set_user_nice(struct task_struct *p, long nice)
set_load_weight(p, true);
old_prio = p->prio;
p->prio = effective_prio(p);
- delta = p->prio - old_prio;
- if (queued) {
+ if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
- /*
- * If the task increased its priority or is running and
- * lowered its priority, then reschedule its CPU:
- */
- if (delta < 0 || (delta > 0 && task_running(rq, p)))
- resched_curr(rq);
- }
if (running)
set_next_task(rq, p);
+
+ /*
+ * If the task increased its priority or is running and
+ * lowered its priority, then reschedule its CPU:
+ */
+ p->sched_class->prio_changed(rq, p, old_prio);
+
out_unlock:
task_rq_unlock(rq, p, &rf);
}
@@ -7100,6 +7100,12 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
if (parent)
sched_online_group(tg, parent);
+
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ /* Propagate the effective uclamp value for the new group */
+ cpu_util_update_eff(css);
+#endif
+
return 0;
}
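
The uclamp memset fix above is an instance of a common sizing bug: clearing an array member with sizeof(element) only wipes the first entry. An illustrative standalone sketch; the struct layouts are simplified stand-ins, not the real scheduler types:

#include <stdio.h>
#include <string.h>

#define UCLAMP_CNT 2	/* hypothetical count, mirroring UCLAMP_MIN/UCLAMP_MAX */

struct uclamp_rq { unsigned int value; unsigned int bucket[5]; };

struct rq {
	struct uclamp_rq uclamp[UCLAMP_CNT];
	unsigned int uclamp_flags;
};

int main(void)
{
	struct rq rq;

	memset(&rq, 0xff, sizeof(rq));		/* poison everything */

	/* Wrong: clears only the first array element. */
	memset(&rq.uclamp, 0, sizeof(struct uclamp_rq));
	printf("after short memset, uclamp[1].value = %#x\n", rq.uclamp[1].value);

	/* Right: clear the whole array, as the fix above does
	 * (equivalently: sizeof(rq.uclamp)). */
	memset(&rq.uclamp, 0, sizeof(struct uclamp_rq) * UCLAMP_CNT);
	printf("after full memset,  uclamp[1].value = %#x\n", rq.uclamp[1].value);
	return 0;
}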
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index b5dcd1d83c7f..7c2fe50fd76d 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -5,6 +5,8 @@
* Copyright (C) 2016, Intel Corporation
* Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
*/
+#include <linux/cpufreq.h>
+
#include "sched.h"
DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
@@ -57,3 +59,19 @@ void cpufreq_remove_update_util_hook(int cpu)
rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), NULL);
}
EXPORT_SYMBOL_GPL(cpufreq_remove_update_util_hook);
+
+/**
+ * cpufreq_this_cpu_can_update - Check if cpufreq policy can be updated.
+ * @policy: cpufreq policy to check.
+ *
+ * Return 'true' if:
+ * - the local and remote CPUs share @policy,
+ * - dvfs_possible_from_any_cpu is set in @policy and the local CPU is not going
+ * offline (in which case it is not expected to run cpufreq updates any more).
+ */
+bool cpufreq_this_cpu_can_update(struct cpufreq_policy *policy)
+{
+ return cpumask_test_cpu(smp_processor_id(), policy->cpus) ||
+ (policy->dvfs_possible_from_any_cpu &&
+ rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)));
+}
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 322ca8860f54..7fbaee24c824 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -82,12 +82,10 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
* by the hardware, as calculating the frequency is pointless if
* we cannot in fact act on it.
*
- * For the slow switching platforms, the kthread is always scheduled on
- * the right set of CPUs and any CPU can find the next frequency and
- * schedule the kthread.
+ * This is needed on the slow switching platforms too to prevent CPUs
+ * going offline from leaving stale IRQ work items behind.
*/
- if (sg_policy->policy->fast_switch_enabled &&
- !cpufreq_this_cpu_can_update(sg_policy->policy))
+ if (!cpufreq_this_cpu_can_update(sg_policy->policy))
return false;
if (unlikely(sg_policy->limits_changed)) {
@@ -240,7 +238,7 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
*/
util = util_cfs + cpu_util_rt(rq);
if (type == FREQUENCY_UTIL)
- util = uclamp_util_with(rq, util, p);
+ util = uclamp_rq_util_with(rq, util, p);
dl_util = cpu_util_dl(rq);
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index b7abca987d94..1a2719e1350a 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -46,6 +46,8 @@ static int convert_prio(int prio)
* @cp: The cpupri context
* @p: The task
* @lowest_mask: A mask to fill in with selected CPUs (or NULL)
+ * @fitness_fn: A pointer to a function to do custom checks on whether the CPU
+ * fits specific criteria, so that we only return those CPUs.
*
* Note: This function returns the recommended CPUs as calculated during the
* current invocation. By the time the call returns, the CPUs may have in
@@ -57,7 +59,8 @@ static int convert_prio(int prio)
* Return: (int)bool - CPUs were found
*/
int cpupri_find(struct cpupri *cp, struct task_struct *p,
- struct cpumask *lowest_mask)
+ struct cpumask *lowest_mask,
+ bool (*fitness_fn)(struct task_struct *p, int cpu))
{
int idx = 0;
int task_pri = convert_prio(p->prio);
@@ -98,6 +101,8 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
continue;
if (lowest_mask) {
+ int cpu;
+
cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
/*
@@ -108,7 +113,23 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
* condition, simply act as though we never hit this
* priority level and continue on.
*/
- if (cpumask_any(lowest_mask) >= nr_cpu_ids)
+ if (cpumask_empty(lowest_mask))
+ continue;
+
+ if (!fitness_fn)
+ return 1;
+
+ /* Ensure the capacity of the CPUs fit the task */
+ for_each_cpu(cpu, lowest_mask) {
+ if (!fitness_fn(p, cpu))
+ cpumask_clear_cpu(cpu, lowest_mask);
+ }
+
+ /*
+ * If no CPU at the current priority can fit the task,
+ * continue looking.
+ */
+ if (cpumask_empty(lowest_mask))
continue;
}
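
cpupri_find() now accepts an optional fitness callback and drops candidate CPUs that fail it, falling through to the next priority level if nothing survives. A loose userspace analogue of that filtering step, using a 64-bit word as the CPU mask and hypothetical per-CPU capacities:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical per-CPU capacities for an 8-CPU system. */
static const unsigned int capacity[8] = { 256, 256, 256, 256, 1024, 1024, 1024, 1024 };

/* Fitness check in the spirit of rt_task_fits_capacity(). */
static bool fits(unsigned int required_cap, int cpu)
{
	return capacity[cpu] >= required_cap;
}

/*
 * Filter a candidate mask with a fitness callback, as cpupri_find()
 * now does: drop CPUs that do not fit and report whether any remain.
 */
static bool filter_candidates(uint64_t *mask, unsigned int required_cap)
{
	for (int cpu = 0; cpu < 8; cpu++) {
		if ((*mask & (1ull << cpu)) && !fits(required_cap, cpu))
			*mask &= ~(1ull << cpu);
	}
	return *mask != 0;	/* false => caller continues with next priority */
}

int main(void)
{
	uint64_t mask = 0xff;	/* all eight CPUs are candidates */

	if (filter_candidates(&mask, 512))
		printf("fitting CPUs: %#llx\n", (unsigned long long)mask);	/* 0xf0 */
	return 0;
}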
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index 7dc20a3232e7..32dd520db11f 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -18,7 +18,9 @@ struct cpupri {
};
#ifdef CONFIG_SMP
-int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask);
+int cpupri_find(struct cpupri *cp, struct task_struct *p,
+ struct cpumask *lowest_mask,
+ bool (*fitness_fn)(struct task_struct *p, int cpu));
void cpupri_set(struct cpupri *cp, int cpu, int pri);
int cpupri_init(struct cpupri *cp);
void cpupri_cleanup(struct cpupri *cp);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index d43318a489f2..cff3e656566d 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -355,7 +355,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
* softirq as those do not count in task exec_runtime any more.
*/
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
- struct rq *rq, int ticks)
+ int ticks)
{
u64 other, cputime = TICK_NSEC * ticks;
@@ -381,7 +381,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
account_system_index_time(p, cputime, CPUTIME_SOFTIRQ);
} else if (user_tick) {
account_user_time(p, cputime);
- } else if (p == rq->idle) {
+ } else if (p == this_rq()->idle) {
account_idle_time(cputime);
} else if (p->flags & PF_VCPU) { /* System time or guest time */
account_guest_time(p, cputime);
@@ -392,14 +392,12 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
static void irqtime_account_idle_ticks(int ticks)
{
- struct rq *rq = this_rq();
-
- irqtime_account_process_tick(current, 0, rq, ticks);
+ irqtime_account_process_tick(current, 0, ticks);
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
static inline void irqtime_account_idle_ticks(int ticks) { }
static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
- struct rq *rq, int nr_ticks) { }
+ int nr_ticks) { }
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
/*
@@ -473,13 +471,12 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
void account_process_tick(struct task_struct *p, int user_tick)
{
u64 cputime, steal;
- struct rq *rq = this_rq();
if (vtime_accounting_enabled_this_cpu())
return;
if (sched_clock_irqtime) {
- irqtime_account_process_tick(p, user_tick, rq, 1);
+ irqtime_account_process_tick(p, user_tick, 1);
return;
}
@@ -493,7 +490,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
if (user_tick)
account_user_time(p, cputime);
- else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
+ else if ((p != this_rq()->idle) || (irq_count() != HARDIRQ_OFFSET))
account_system_time(p, HARDIRQ_OFFSET, cputime);
else
account_idle_time(cputime);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index f7e4579e746c..879d3ccf3806 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -751,9 +751,16 @@ void sysrq_sched_debug_show(void)
int cpu;
sched_debug_header(NULL);
- for_each_online_cpu(cpu)
+ for_each_online_cpu(cpu) {
+ /*
+ * Need to reset softlockup watchdogs on all CPUs, because
+ * another CPU might be blocked waiting for us to process
+ * an IPI or stop_machine.
+ */
+ touch_nmi_watchdog();
+ touch_all_softlockup_watchdogs();
print_cpu(NULL, cpu);
-
+ }
}
/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 08a233e97a01..fe4e0d775375 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -801,7 +801,7 @@ void post_init_entity_util_avg(struct task_struct *p)
* For !fair tasks do:
*
update_cfs_rq_load_avg(now, cfs_rq);
- attach_entity_load_avg(cfs_rq, se, 0);
+ attach_entity_load_avg(cfs_rq, se);
switched_from_fair(rq, p);
*
* such that the next switched_to_fair() has the
@@ -3114,7 +3114,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
{
struct rq *rq = rq_of(cfs_rq);
- if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) {
+ if (&rq->cfs == cfs_rq) {
/*
* There are a few boundary cases this might miss but it should
* get called often enough that that should (hopefully) not be
@@ -3366,16 +3366,17 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
- delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum;
- delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
-
- se->avg.runnable_load_sum = runnable_sum;
- se->avg.runnable_load_avg = runnable_load_avg;
if (se->on_rq) {
+ delta_sum = runnable_load_sum -
+ se_weight(se) * se->avg.runnable_load_sum;
+ delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
}
+
+ se->avg.runnable_load_sum = runnable_sum;
+ se->avg.runnable_load_avg = runnable_load_avg;
}
static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
@@ -3520,7 +3521,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
* Must call update_cfs_rq_load_avg() before this, since we rely on
* cfs_rq->avg.last_update_time being current.
*/
-static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
@@ -3556,7 +3557,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
- cfs_rq_util_change(cfs_rq, flags);
+ cfs_rq_util_change(cfs_rq, 0);
trace_pelt_cfs_tp(cfs_rq);
}
@@ -3614,7 +3615,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
*
* IOW we're enqueueing a task on a new CPU.
*/
- attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
+ attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, 0);
} else if (decayed) {
@@ -3711,6 +3712,20 @@ static inline unsigned long task_util_est(struct task_struct *p)
return max(task_util(p), _task_util_est(p));
}
+#ifdef CONFIG_UCLAMP_TASK
+static inline unsigned long uclamp_task_util(struct task_struct *p)
+{
+ return clamp(task_util_est(p),
+ uclamp_eff_value(p, UCLAMP_MIN),
+ uclamp_eff_value(p, UCLAMP_MAX));
+}
+#else
+static inline unsigned long uclamp_task_util(struct task_struct *p)
+{
+ return task_util_est(p);
+}
+#endif
+
static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
struct task_struct *p)
{
@@ -3822,7 +3837,7 @@ done:
static inline int task_fits_capacity(struct task_struct *p, long capacity)
{
- return fits_capacity(task_util_est(p), capacity);
+ return fits_capacity(uclamp_task_util(p), capacity);
}
static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
@@ -3857,7 +3872,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
static inline void remove_entity_load_avg(struct sched_entity *se) {}
static inline void
-attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {}
+attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline void
detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
@@ -5196,6 +5211,20 @@ static inline void update_overutilized_status(struct rq *rq)
static inline void update_overutilized_status(struct rq *rq) { }
#endif
+/* Runqueue only has SCHED_IDLE tasks enqueued */
+static int sched_idle_rq(struct rq *rq)
+{
+ return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
+ rq->nr_running);
+}
+
+#ifdef CONFIG_SMP
+static int sched_idle_cpu(int cpu)
+{
+ return sched_idle_rq(cpu_rq(cpu));
+}
+#endif
+
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
@@ -5310,6 +5339,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct sched_entity *se = &p->se;
int task_sleep = flags & DEQUEUE_SLEEP;
int idle_h_nr_running = task_has_idle_policy(p);
+ bool was_sched_idle = sched_idle_rq(rq);
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
@@ -5356,6 +5386,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!se)
sub_nr_running(rq, 1);
+ /* balance early to pull high priority tasks */
+ if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
+ rq->next_balance = jiffies;
+
util_est_dequeue(&rq->cfs, p, task_sleep);
hrtick_update(rq);
}
@@ -5378,15 +5412,6 @@ static struct {
#endif /* CONFIG_NO_HZ_COMMON */
-/* CPU only has SCHED_IDLE tasks enqueued */
-static int sched_idle_cpu(int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
-
- return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
- rq->nr_running);
-}
-
static unsigned long cpu_load(struct rq *rq)
{
return cfs_rq_load_avg(&rq->cfs);
@@ -5588,7 +5613,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
unsigned int min_exit_latency = UINT_MAX;
u64 latest_idle_timestamp = 0;
int least_loaded_cpu = this_cpu;
- int shallowest_idle_cpu = -1, si_cpu = -1;
+ int shallowest_idle_cpu = -1;
int i;
/* Check if we have any choice: */
@@ -5597,6 +5622,9 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
+ if (sched_idle_cpu(i))
+ return i;
+
if (available_idle_cpu(i)) {
struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
@@ -5619,12 +5647,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
}
- } else if (shallowest_idle_cpu == -1 && si_cpu == -1) {
- if (sched_idle_cpu(i)) {
- si_cpu = i;
- continue;
- }
-
+ } else if (shallowest_idle_cpu == -1) {
load = cpu_load(cpu_rq(i));
if (load < min_load) {
min_load = load;
@@ -5633,11 +5656,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
}
}
- if (shallowest_idle_cpu != -1)
- return shallowest_idle_cpu;
- if (si_cpu != -1)
- return si_cpu;
- return least_loaded_cpu;
+ return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}
static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
@@ -5790,7 +5809,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
*/
static int select_idle_smt(struct task_struct *p, int target)
{
- int cpu, si_cpu = -1;
+ int cpu;
if (!static_branch_likely(&sched_smt_present))
return -1;
@@ -5798,13 +5817,11 @@ static int select_idle_smt(struct task_struct *p, int target)
for_each_cpu(cpu, cpu_smt_mask(target)) {
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
- if (available_idle_cpu(cpu))
+ if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
return cpu;
- if (si_cpu == -1 && sched_idle_cpu(cpu))
- si_cpu = cpu;
}
- return si_cpu;
+ return -1;
}
#else /* CONFIG_SCHED_SMT */
@@ -5828,12 +5845,13 @@ static inline int select_idle_smt(struct task_struct *p, int target)
*/
static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
{
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
struct sched_domain *this_sd;
u64 avg_cost, avg_idle;
u64 time, cost;
s64 delta;
int this = smp_processor_id();
- int cpu, nr = INT_MAX, si_cpu = -1;
+ int cpu, nr = INT_MAX;
this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
if (!this_sd)
@@ -5859,15 +5877,13 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
time = cpu_clock(this);
- for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
+ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+
+ for_each_cpu_wrap(cpu, cpus, target) {
if (!--nr)
- return si_cpu;
- if (!cpumask_test_cpu(cpu, p->cpus_ptr))
- continue;
- if (available_idle_cpu(cpu))
+ return -1;
+ if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
break;
- if (si_cpu == -1 && sched_idle_cpu(cpu))
- si_cpu = cpu;
}
time = cpu_clock(this) - time;
@@ -6268,9 +6284,18 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
- /* Skip CPUs that will be overutilized. */
util = cpu_util_next(cpu, p, cpu);
cpu_cap = capacity_of(cpu);
+ spare_cap = cpu_cap - util;
+
+ /*
+ * Skip CPUs that cannot satisfy the capacity request.
+ * IOW, placing the task there would make the CPU
+ * overutilized. Take uclamp into account to see how
+ * much capacity we can get out of the CPU; this is
+ * aligned with schedutil_cpu_util().
+ */
+ util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
if (!fits_capacity(util, cpu_cap))
continue;
@@ -6285,7 +6310,6 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
* Find the CPU with the maximum spare capacity in
* the performance domain
*/
- spare_cap = cpu_cap - util;
if (spare_cap > max_spare_cap) {
max_spare_cap = spare_cap;
max_spare_cap_cpu = cpu;
@@ -7328,7 +7352,14 @@ static int detach_tasks(struct lb_env *env)
load < 16 && !env->sd->nr_balance_failed)
goto next;
- if (load/2 > env->imbalance)
+ /*
+ * Make sure that we don't migrate too much load.
+ * Nevertheless, let's relax the constraint if the
+ * scheduler fails to find a good waiting task to
+ * migrate.
+ */
+ if (load/2 > env->imbalance &&
+ env->sd->nr_balance_failed <= env->sd->cache_nice_tries)
goto next;
env->imbalance -= load;
@@ -7773,29 +7804,11 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
*/
for_each_cpu(cpu, sched_group_span(sdg)) {
- struct sched_group_capacity *sgc;
- struct rq *rq = cpu_rq(cpu);
-
- /*
- * build_sched_domains() -> init_sched_groups_capacity()
- * gets here before we've attached the domains to the
- * runqueues.
- *
- * Use capacity_of(), which is set irrespective of domains
- * in update_cpu_capacity().
- *
- * This avoids capacity from being 0 and
- * causing divide-by-zero issues on boot.
- */
- if (unlikely(!rq->sd)) {
- capacity += capacity_of(cpu);
- } else {
- sgc = rq->sd->groups->sgc;
- capacity += sgc->capacity;
- }
+ unsigned long cpu_cap = capacity_of(cpu);
- min_capacity = min(capacity, min_capacity);
- max_capacity = max(capacity, max_capacity);
+ capacity += cpu_cap;
+ min_capacity = min(cpu_cap, min_capacity);
+ max_capacity = max(cpu_cap, max_capacity);
}
} else {
/*
@@ -8161,14 +8174,18 @@ static bool update_sd_pick_busiest(struct lb_env *env,
case group_has_spare:
/*
- * Select not overloaded group with lowest number of
- * idle cpus. We could also compare the spare capacity
- * which is more stable but it can end up that the
- * group has less spare capacity but finally more idle
+ * Select not overloaded group with lowest number of idle cpus
+ * and highest number of running tasks. We could also compare
+ * the spare capacity which is more stable but it can end up
+ * that the group has less spare capacity but finally more idle
* CPUs which means less opportunity to pull tasks.
*/
- if (sgs->idle_cpus >= busiest->idle_cpus)
+ if (sgs->idle_cpus > busiest->idle_cpus)
return false;
+ else if ((sgs->idle_cpus == busiest->idle_cpus) &&
+ (sgs->sum_nr_running <= busiest->sum_nr_running))
+ return false;
+
break;
}
@@ -8417,6 +8434,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
if (!idlest)
return NULL;
+ /* The local group has been skipped because of CPU affinity */
+ if (!local)
+ return idlest;
+
/*
* If the local group is idler than the selected idlest group
* don't try and push the task.
@@ -9518,6 +9539,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
{
int continue_balancing = 1;
int cpu = rq->cpu;
+ int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
unsigned long interval;
struct sched_domain *sd;
/* Earliest time when we have to do rebalance again */
@@ -9554,7 +9576,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
break;
}
- interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
+ interval = get_sd_balance_interval(sd, busy);
need_serialize = sd->flags & SD_SERIALIZE;
if (need_serialize) {
@@ -9570,9 +9592,10 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
* state even if we migrated tasks. Update it.
*/
idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
+ busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
}
sd->last_balance = jiffies;
- interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
+ interval = get_sd_balance_interval(sd, busy);
}
if (need_serialize)
spin_unlock(&balancing);
@@ -10322,6 +10345,9 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
if (!task_on_rq_queued(p))
return;
+ if (rq->cfs.nr_running == 1)
+ return;
+
/*
* Reschedule if we are currently running on this runqueue and
* our priority decreased, or if we are not currently running on
@@ -10412,7 +10438,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
/* Synchronize entity with its cfs_rq */
update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
- attach_entity_load_avg(cfs_rq, se, 0);
+ attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, false);
propagate_entity_cfs_rq(se);
}
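
The new uclamp_task_util() helper above clamps a task's estimated utilization between its effective uclamp min and max before task_fits_capacity() compares it against a CPU's capacity, so a boosted task no longer "fits" a small CPU. An illustrative sketch of the effect, assuming the kernel's roughly 80% fits_capacity() margin:

#include <stdio.h>

/* Clamp a utilization value into [lo, hi], like clamp() in the kernel. */
static unsigned long clamp_util(unsigned long util, unsigned long lo, unsigned long hi)
{
	if (util < lo)
		return lo;
	if (util > hi)
		return hi;
	return util;
}

int main(void)
{
	/* A tiny task boosted via a hypothetical uclamp_min of 512 will now
	 * fail the capacity check on a CPU whose capacity is only 400. */
	unsigned long task_util = 80, uclamp_min = 512, uclamp_max = 1024;
	unsigned long boosted = clamp_util(task_util, uclamp_min, uclamp_max);

	printf("clamped util = %lu, fits a 400-capacity CPU: %s\n",
	       boosted, boosted * 1280 < 400 * 1024 ? "yes" : "no");
	return 0;
}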
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index ffa959e91227..b743bf38f08f 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -158,7 +158,7 @@ static void cpuidle_idle_call(void)
/*
* Suspend-to-idle ("s2idle") is a system state in which all user space
* has been frozen, all I/O devices have been suspended and the only
- * activity happens here and in iterrupts (if any). In that case bypass
+ * activity happens here and in interrupts (if any). In that case bypass
* the cpuidle governor and go straight for the deepest idle state
* available. Possibly also suspend the local tick and the entire
* timekeeping to prevent timer interrupts from kicking us out of idle
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 9fcb2a695a41..008d6ac2342b 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -163,6 +163,12 @@ static int __init housekeeping_isolcpus_setup(char *str)
continue;
}
+ if (!strncmp(str, "managed_irq,", 12)) {
+ str += 12;
+ flags |= HK_FLAG_MANAGED_IRQ;
+ continue;
+ }
+
pr_warn("isolcpus: Error, unknown flag\n");
return 0;
}
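
The isolcpus change above recognises a new "managed_irq," prefix in front of the CPU list. A simplified sketch of that prefix-based flag parsing; the flag values and names here are placeholders:

#include <stdio.h>
#include <string.h>

#define FLAG_DOMAIN		0x1
#define FLAG_MANAGED_IRQ	0x2	/* mirrors the new isolcpus flag above */

/* Parse comma-terminated flag prefixes in front of the CPU list,
 * in the spirit of housekeeping_isolcpus_setup(). */
static unsigned int parse_flags(const char **str)
{
	unsigned int flags = 0;
	const char *s = *str;

	for (;;) {
		if (!strncmp(s, "domain,", 7)) {
			s += 7;
			flags |= FLAG_DOMAIN;
		} else if (!strncmp(s, "managed_irq,", 12)) {
			s += 12;
			flags |= FLAG_MANAGED_IRQ;
		} else {
			break;	/* rest of the string is the CPU list */
		}
	}
	*str = s;
	return flags;
}

int main(void)
{
	const char *arg = "managed_irq,domain,2-5";
	unsigned int flags = parse_flags(&arg);

	printf("flags=%#x cpulist=%s\n", flags, arg);	/* flags=0x3 cpulist=2-5 */
	return 0;
}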
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index a96db50d40e0..bd006b79b360 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -129,8 +129,20 @@ accumulate_sum(u64 delta, struct sched_avg *sa,
* Step 2
*/
delta %= 1024;
- contrib = __accumulate_pelt_segments(periods,
- 1024 - sa->period_contrib, delta);
+ if (load) {
+ /*
+ * This relies on the:
+ *
+ * if (!load)
+ * runnable = running = 0;
+ *
+ * clause from ___update_load_sum(); this results in
+ * the below usage of @contrib disappearing entirely,
+ * so there is no point in calculating it.
+ */
+ contrib = __accumulate_pelt_segments(periods,
+ 1024 - sa->period_contrib, delta);
+ }
}
sa->period_contrib = delta;
@@ -205,7 +217,9 @@ ___update_load_sum(u64 now, struct sched_avg *sa,
* This means that weight will be 0 but not running for a sched_entity
* but also for a cfs_rq if the latter becomes idle. As an example,
* this happens during idle_balance() which calls
- * update_blocked_averages()
+ * update_blocked_averages().
+ *
+ * Also see the comment in accumulate_sum().
*/
if (!load)
runnable = running = 0;
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 517e3719027e..db7b50bba3f1 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -185,7 +185,8 @@ static void group_init(struct psi_group *group)
for_each_possible_cpu(cpu)
seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
- group->avg_next_update = sched_clock() + psi_period;
+ group->avg_last_update = sched_clock();
+ group->avg_next_update = group->avg_last_update + psi_period;
INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
mutex_init(&group->avgs_lock);
/* Init trigger-related members */
@@ -481,7 +482,7 @@ static u64 window_update(struct psi_window *win, u64 now, u64 value)
u32 remaining;
remaining = win->size - elapsed;
- growth += div_u64(win->prev_growth * remaining, win->size);
+ growth += div64_u64(win->prev_growth * remaining, win->size);
}
return growth;
@@ -1279,10 +1280,12 @@ static const struct file_operations psi_cpu_fops = {
static int __init psi_proc_init(void)
{
- proc_mkdir("pressure", NULL);
- proc_create("pressure/io", 0, NULL, &psi_io_fops);
- proc_create("pressure/memory", 0, NULL, &psi_memory_fops);
- proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops);
+ if (psi_enable) {
+ proc_mkdir("pressure", NULL);
+ proc_create("pressure/io", 0, NULL, &psi_io_fops);
+ proc_create("pressure/memory", 0, NULL, &psi_memory_fops);
+ proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops);
+ }
return 0;
}
module_init(psi_proc_init);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e591d40fd645..4043abe45459 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -437,6 +437,45 @@ static inline int on_rt_rq(struct sched_rt_entity *rt_se)
return rt_se->on_rq;
}
+#ifdef CONFIG_UCLAMP_TASK
+/*
+ * Verify the fitness of task @p to run on @cpu taking into account the uclamp
+ * settings.
+ *
+ * This check is only important for heterogeneous systems where the uclamp_min
+ * value is higher than the capacity of a @cpu. For non-heterogeneous systems
+ * this function will always return true.
+ *
+ * The function will return true if the capacity of the @cpu is >= the
+ * uclamp_min and false otherwise.
+ *
+ * Note that uclamp_min will be clamped to uclamp_max if uclamp_min
+ * > uclamp_max.
+ */
+static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
+{
+ unsigned int min_cap;
+ unsigned int max_cap;
+ unsigned int cpu_cap;
+
+ /* Only heterogeneous systems can benefit from this check */
+ if (!static_branch_unlikely(&sched_asym_cpucapacity))
+ return true;
+
+ min_cap = uclamp_eff_value(p, UCLAMP_MIN);
+ max_cap = uclamp_eff_value(p, UCLAMP_MAX);
+
+ cpu_cap = capacity_orig_of(cpu);
+
+ return cpu_cap >= min(min_cap, max_cap);
+}
+#else
+static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
+{
+ return true;
+}
+#endif
+
#ifdef CONFIG_RT_GROUP_SCHED
static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
@@ -1391,6 +1430,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
{
struct task_struct *curr;
struct rq *rq;
+ bool test;
/* For anything but wake ups, just return the task_cpu */
if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
@@ -1422,10 +1462,16 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
*
* This test is optimistic, if we get it wrong the load-balancer
* will have to sort it out.
+ *
+ * We take into account the capacity of the CPU to ensure it fits the
+ * requirement of the task - which is only important on heterogeneous
+ * systems like big.LITTLE.
*/
- if (curr && unlikely(rt_task(curr)) &&
- (curr->nr_cpus_allowed < 2 ||
- curr->prio <= p->prio)) {
+ test = curr &&
+ unlikely(rt_task(curr)) &&
+ (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio);
+
+ if (test || !rt_task_fits_capacity(p, cpu)) {
int target = find_lowest_rq(p);
/*
@@ -1449,15 +1495,15 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
* let's hope p can move out.
*/
if (rq->curr->nr_cpus_allowed == 1 ||
- !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
+ !cpupri_find(&rq->rd->cpupri, rq->curr, NULL, NULL))
return;
/*
* p is migratable, so let's not schedule it and
* see if it is pushed or pulled somewhere else.
*/
- if (p->nr_cpus_allowed != 1
- && cpupri_find(&rq->rd->cpupri, p, NULL))
+ if (p->nr_cpus_allowed != 1 &&
+ cpupri_find(&rq->rd->cpupri, p, NULL, NULL))
return;
/*
@@ -1601,7 +1647,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
- cpumask_test_cpu(cpu, p->cpus_ptr))
+ cpumask_test_cpu(cpu, p->cpus_ptr) &&
+ rt_task_fits_capacity(p, cpu))
return 1;
return 0;
@@ -1643,7 +1690,8 @@ static int find_lowest_rq(struct task_struct *task)
if (task->nr_cpus_allowed == 1)
return -1; /* No other targets possible */
- if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
+ if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask,
+ rt_task_fits_capacity))
return -1; /* No targets found */
/*
@@ -2147,12 +2195,14 @@ skip:
*/
static void task_woken_rt(struct rq *rq, struct task_struct *p)
{
- if (!task_running(rq, p) &&
- !test_tsk_need_resched(rq->curr) &&
- p->nr_cpus_allowed > 1 &&
- (dl_task(rq->curr) || rt_task(rq->curr)) &&
- (rq->curr->nr_cpus_allowed < 2 ||
- rq->curr->prio <= p->prio))
+ bool need_to_push = !task_running(rq, p) &&
+ !test_tsk_need_resched(rq->curr) &&
+ p->nr_cpus_allowed > 1 &&
+ (dl_task(rq->curr) || rt_task(rq->curr)) &&
+ (rq->curr->nr_cpus_allowed < 2 ||
+ rq->curr->prio <= p->prio);
+
+ if (need_to_push || !rt_task_fits_capacity(p, cpu_of(rq)))
push_rt_tasks(rq);
}
@@ -2224,7 +2274,10 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
*/
if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
- if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
+ bool need_to_push = rq->rt.overloaded ||
+ !rt_task_fits_capacity(p, cpu_of(rq));
+
+ if (p->nr_cpus_allowed > 1 && need_to_push)
rt_queue_push_tasks(rq);
#endif /* CONFIG_SMP */
if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 280a3c735935..1a88dc8ad11b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2300,14 +2300,14 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#endif /* CONFIG_CPU_FREQ */
#ifdef CONFIG_UCLAMP_TASK
-unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
+unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
static __always_inline
-unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
- struct task_struct *p)
+unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
+ struct task_struct *p)
{
- unsigned int min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
- unsigned int max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
+ unsigned long min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
+ unsigned long max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
if (p) {
min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN));
@@ -2324,18 +2324,10 @@ unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
return clamp(util, min_util, max_util);
}
-
-static inline unsigned int uclamp_util(struct rq *rq, unsigned int util)
-{
- return uclamp_util_with(rq, util, NULL);
-}
#else /* CONFIG_UCLAMP_TASK */
-static inline unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
- struct task_struct *p)
-{
- return util;
-}
-static inline unsigned int uclamp_util(struct rq *rq, unsigned int util)
+static inline
+unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
+ struct task_struct *p)
{
return util;
}
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 6ec1e595b1d4..dfb64c08a407 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1880,6 +1880,42 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
}
/*
+ * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
+ * any two given CPUs at this (non-NUMA) topology level.
+ */
+static bool topology_span_sane(struct sched_domain_topology_level *tl,
+ const struct cpumask *cpu_map, int cpu)
+{
+ int i;
+
+ /* NUMA levels are allowed to overlap */
+ if (tl->flags & SDTL_OVERLAP)
+ return true;
+
+ /*
+ * Non-NUMA levels cannot partially overlap - they must be either
+ * completely equal or completely disjoint. Otherwise we can end up
+ * breaking the sched_group lists - i.e. a later get_group() pass
+ * breaks the linking done for an earlier span.
+ */
+ for_each_cpu(i, cpu_map) {
+ if (i == cpu)
+ continue;
+ /*
+ * We should 'and' all those masks with 'cpu_map' to exactly
+ * match the topology we're about to build, but that can only
+ * remove CPUs, which only lessens our ability to detect
+ * overlaps
+ */
+ if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&
+ cpumask_intersects(tl->mask(cpu), tl->mask(i)))
+ return false;
+ }
+
+ return true;
+}
+
+/*
* Find the sched_domain_topology_level where all CPU capacities are visible
* for all CPUs.
*/
@@ -1975,6 +2011,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
has_asym = true;
}
+ if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
+ goto error;
+
sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i);
if (tl == sched_domain_topology)
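
topology_span_sane() enforces that, at a non-NUMA level, any two CPUs' topology masks are either identical or disjoint. A small standalone illustration of that pairwise rule, with 64-bit words standing in for cpumasks:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Check that per-CPU topology masks are either identical or disjoint,
 * mirroring the topology_span_sane() rule added above.
 */
static bool spans_sane(const uint64_t *mask, int nr_cpus)
{
	for (int i = 0; i < nr_cpus; i++) {
		for (int j = i + 1; j < nr_cpus; j++) {
			bool equal = mask[i] == mask[j];
			bool intersect = (mask[i] & mask[j]) != 0;

			if (!equal && intersect)
				return false;	/* partial overlap */
		}
	}
	return true;
}

int main(void)
{
	uint64_t good[4] = { 0x3, 0x3, 0xc, 0xc };	/* {0,1} {0,1} {2,3} {2,3} */
	uint64_t bad[4]  = { 0x3, 0x6, 0xc, 0xc };	/* {0,1} overlaps {1,2}   */

	printf("good: %d, bad: %d\n", spans_sane(good, 4), spans_sane(bad, 4));
	return 0;
}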
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index 45eba18a2898..02ce292b9bc0 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -179,6 +179,7 @@ void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int
.bit_nr = -1,
},
.wq_entry = {
+ .flags = flags,
.private = current,
.func = var_wake_function,
.entry = LIST_HEAD_INIT(wbq_entry->wq_entry.entry),
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 12d2227e5786..b6ea3dcb57bf 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -1026,6 +1026,13 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
struct seccomp_notif unotif;
ssize_t ret;
+ /* Verify that we're not given garbage to keep struct extensible. */
+ ret = check_zeroed_user(buf, sizeof(unotif));
+ if (ret < 0)
+ return ret;
+ if (!ret)
+ return -EINVAL;
+
memset(&unotif, 0, sizeof(unotif));
ret = down_interruptible(&filter->notif->request);
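
The seccomp change above calls check_zeroed_user() so that a currently-empty struct can grow fields later: any non-zero byte coming from userspace is rejected with -EINVAL. A userspace-only sketch of the same idea:

#include <stddef.h>
#include <stdio.h>
#include <errno.h>

/* Reject buffers that contain non-zero garbage, in the spirit of the
 * check_zeroed_user() call added above (userspace sketch only). */
static int take_extensible_struct(const unsigned char *buf, size_t len)
{
	for (size_t i = 0; i < len; i++) {
		if (buf[i] != 0)
			return -EINVAL;	/* future fields must start out zero */
	}
	return 0;
}

int main(void)
{
	unsigned char clean[16] = { 0 };
	unsigned char dirty[16] = { 0 };

	dirty[7] = 0xaa;
	printf("clean: %d dirty: %d\n",
	       take_extensible_struct(clean, sizeof(clean)),
	       take_extensible_struct(dirty, sizeof(dirty)));	/* 0 and -22 */
	return 0;
}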
diff --git a/kernel/smp.c b/kernel/smp.c
index 7dbcb402c2fc..3b7bedc97af3 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -395,22 +395,9 @@ call:
}
EXPORT_SYMBOL_GPL(smp_call_function_any);
-/**
- * smp_call_function_many(): Run a function on a set of other CPUs.
- * @mask: The set of cpus to run on (only runs on online subset).
- * @func: The function to run. This must be fast and non-blocking.
- * @info: An arbitrary pointer to pass to the function.
- * @wait: If true, wait (atomically) until function has completed
- * on other CPUs.
- *
- * If @wait is true, then returns once @func has returned.
- *
- * You must not call this function with disabled interrupts or from a
- * hardware interrupt handler or from a bottom half handler. Preemption
- * must be disabled when calling this function.
- */
-void smp_call_function_many(const struct cpumask *mask,
- smp_call_func_t func, void *info, bool wait)
+static void smp_call_function_many_cond(const struct cpumask *mask,
+ smp_call_func_t func, void *info,
+ bool wait, smp_cond_func_t cond_func)
{
struct call_function_data *cfd;
int cpu, next_cpu, this_cpu = smp_processor_id();
@@ -448,7 +435,8 @@ void smp_call_function_many(const struct cpumask *mask,
/* Fastpath: do that cpu by itself. */
if (next_cpu >= nr_cpu_ids) {
- smp_call_function_single(cpu, func, info, wait);
+ if (!cond_func || cond_func(cpu, info))
+ smp_call_function_single(cpu, func, info, wait);
return;
}
@@ -465,6 +453,9 @@ void smp_call_function_many(const struct cpumask *mask,
for_each_cpu(cpu, cfd->cpumask) {
call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu);
+ if (cond_func && !cond_func(cpu, info))
+ continue;
+
csd_lock(csd);
if (wait)
csd->flags |= CSD_FLAG_SYNCHRONOUS;
@@ -486,6 +477,26 @@ void smp_call_function_many(const struct cpumask *mask,
}
}
}
+
+/**
+ * smp_call_function_many(): Run a function on a set of other CPUs.
+ * @mask: The set of cpus to run on (only runs on online subset).
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @wait: If true, wait (atomically) until function has completed
+ * on other CPUs.
+ *
+ * If @wait is true, then returns once @func has returned.
+ *
+ * You must not call this function with disabled interrupts or from a
+ * hardware interrupt handler or from a bottom half handler. Preemption
+ * must be disabled when calling this function.
+ */
+void smp_call_function_many(const struct cpumask *mask,
+ smp_call_func_t func, void *info, bool wait)
+{
+ smp_call_function_many_cond(mask, func, info, wait, NULL);
+}
EXPORT_SYMBOL(smp_call_function_many);
/**
@@ -668,11 +679,6 @@ EXPORT_SYMBOL(on_each_cpu_mask);
* @info: An arbitrary pointer to pass to both functions.
* @wait: If true, wait (atomically) until function has
* completed on other CPUs.
- * @gfp_flags: GFP flags to use when allocating the cpumask
- * used internally by the function.
- *
- * The function might sleep if the GFP flags indicates a non
- * atomic allocation is allowed.
*
* Preemption is disabled to protect against CPUs going offline but not online.
* CPUs going online during the call will not be seen or sent an IPI.
@@ -680,46 +686,27 @@ EXPORT_SYMBOL(on_each_cpu_mask);
* You must not call this function with disabled interrupts or
* from a hardware interrupt handler or from a bottom half handler.
*/
-void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
- smp_call_func_t func, void *info, bool wait,
- gfp_t gfp_flags, const struct cpumask *mask)
+void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
+ void *info, bool wait, const struct cpumask *mask)
{
- cpumask_var_t cpus;
- int cpu, ret;
-
- might_sleep_if(gfpflags_allow_blocking(gfp_flags));
-
- if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) {
- preempt_disable();
- for_each_cpu(cpu, mask)
- if (cond_func(cpu, info))
- __cpumask_set_cpu(cpu, cpus);
- on_each_cpu_mask(cpus, func, info, wait);
- preempt_enable();
- free_cpumask_var(cpus);
- } else {
- /*
- * No free cpumask, bother. No matter, we'll
- * just have to IPI them one by one.
- */
- preempt_disable();
- for_each_cpu(cpu, mask)
- if (cond_func(cpu, info)) {
- ret = smp_call_function_single(cpu, func,
- info, wait);
- WARN_ON_ONCE(ret);
- }
- preempt_enable();
+ int cpu = get_cpu();
+
+ smp_call_function_many_cond(mask, func, info, wait, cond_func);
+ if (cpumask_test_cpu(cpu, mask) && cond_func(cpu, info)) {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ func(info);
+ local_irq_restore(flags);
}
+ put_cpu();
}
EXPORT_SYMBOL(on_each_cpu_cond_mask);
-void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
- smp_call_func_t func, void *info, bool wait,
- gfp_t gfp_flags)
+void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func,
+ void *info, bool wait)
{
- on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags,
- cpu_online_mask);
+ on_each_cpu_cond_mask(cond_func, func, info, wait, cpu_online_mask);
}
EXPORT_SYMBOL(on_each_cpu_cond);
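
The smp.c refactor above folds the conditional cross-CPU call into the common path: instead of allocating a temporary cpumask and filtering into it, the predicate is evaluated per CPU while iterating. An illustrative sketch of that shape, with a plain loop standing in for sending IPIs:

#include <stdbool.h>
#include <stdio.h>

typedef bool (*cond_fn)(int cpu, void *info);
typedef void (*work_fn)(int cpu, void *info);

/*
 * Run @func on every CPU in @mask for which @cond says yes, without
 * building a filtered copy of the mask first - the shape of
 * smp_call_function_many_cond() above.
 */
static void for_each_cpu_cond(unsigned long mask, cond_fn cond,
			      work_fn func, void *info)
{
	for (int cpu = 0; cpu < 8; cpu++) {
		if (!(mask & (1ul << cpu)))
			continue;
		if (cond && !cond(cpu, info))
			continue;
		func(cpu, info);
	}
}

static bool is_even(int cpu, void *info) { (void)info; return (cpu & 1) == 0; }
static void say_hi(int cpu, void *info)  { (void)info; printf("cpu%d\n", cpu); }

int main(void)
{
	for_each_cpu_cond(0xff, is_even, say_hi, NULL);	/* cpu0, cpu2, cpu4, cpu6 */
	return 0;
}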
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 1fe34a9fabc2..865bb0228ab6 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -442,7 +442,7 @@ static int __stop_cpus(const struct cpumask *cpumask,
* @cpumask were offline; otherwise, 0 if all executions of @fn
* returned 0, any non zero return value if any returned non zero.
*/
-int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
+static int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
{
int ret;
@@ -453,36 +453,6 @@ int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
return ret;
}
-/**
- * try_stop_cpus - try to stop multiple cpus
- * @cpumask: cpus to stop
- * @fn: function to execute
- * @arg: argument to @fn
- *
- * Identical to stop_cpus() except that it fails with -EAGAIN if
- * someone else is already using the facility.
- *
- * CONTEXT:
- * Might sleep.
- *
- * RETURNS:
- * -EAGAIN if someone else is already stopping cpus, -ENOENT if
- * @fn(@arg) was not executed at all because all cpus in @cpumask were
- * offline; otherwise, 0 if all executions of @fn returned 0, any non
- * zero return value if any returned non zero.
- */
-int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
-{
- int ret;
-
- /* static works are used, process one request at a time */
- if (!mutex_trylock(&stop_cpus_mutex))
- return -EAGAIN;
- ret = __stop_cpus(cpumask, fn, arg);
- mutex_unlock(&stop_cpus_mutex);
- return ret;
-}
-
static int cpu_stop_should_run(unsigned int cpu)
{
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 13a0f2e6ebc2..e2ac0e37c4ae 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -554,25 +554,33 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
{
struct signal_struct *sig = tsk->signal;
- struct taskstats *stats;
+ struct taskstats *stats_new, *stats;
- if (sig->stats || thread_group_empty(tsk))
- goto ret;
+ /* Pairs with smp_store_release() below. */
+ stats = smp_load_acquire(&sig->stats);
+ if (stats || thread_group_empty(tsk))
+ return stats;
/* No problem if kmem_cache_zalloc() fails */
- stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
+ stats_new = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
spin_lock_irq(&tsk->sighand->siglock);
- if (!sig->stats) {
- sig->stats = stats;
- stats = NULL;
+ stats = sig->stats;
+ if (!stats) {
+ /*
+ * Pairs with the smp_load_acquire() above and orders the
+ * kmem_cache_zalloc().
+ */
+ smp_store_release(&sig->stats, stats_new);
+ stats = stats_new;
+ stats_new = NULL;
}
spin_unlock_irq(&tsk->sighand->siglock);
- if (stats)
- kmem_cache_free(taskstats_cache, stats);
-ret:
- return sig->stats;
+ if (stats_new)
+ kmem_cache_free(taskstats_cache, stats_new);
+
+ return stats;
}
/* Send pid data out on exit */
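
The taskstats fix above turns the racy sig->stats handling into a double-checked allocation: an acquire load on the fast path, a re-check under the lock, and a release store to publish the initialised object. A userspace sketch of the same pattern using C11 atomics and a pthread mutex; the names and the '42' payload are placeholders:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

static _Atomic(int *) shared;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static int *get_stats(void)
{
	/* Fast path: acquire pairs with the release below, so a reader
	 * that sees the pointer also sees the initialised contents. */
	int *stats = atomic_load_explicit(&shared, memory_order_acquire);
	if (stats)
		return stats;

	int *stats_new = calloc(1, sizeof(*stats_new));

	pthread_mutex_lock(&lock);
	stats = atomic_load_explicit(&shared, memory_order_relaxed);
	if (!stats && stats_new) {
		*stats_new = 42;	/* initialise before publishing */
		atomic_store_explicit(&shared, stats_new, memory_order_release);
		stats = stats_new;
		stats_new = NULL;
	}
	pthread_mutex_unlock(&lock);

	free(stats_new);	/* lost the race, or was never needed */
	return stats;
}

int main(void)
{
	int *s = get_stats();

	if (s)
		printf("%d\n", *s);	/* 42 */
	return 0;
}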
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 1867044800bb..c8f00168afe8 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -19,3 +19,4 @@ obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o
obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o
obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
+obj-$(CONFIG_TIME_NS) += namespace.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 451f9d05ccfe..2ffb466af77e 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -26,6 +26,7 @@
#include <linux/freezer.h>
#include <linux/compat.h>
#include <linux/module.h>
+#include <linux/time_namespace.h>
#include "posix-timers.h"
@@ -36,13 +37,15 @@
* struct alarm_base - Alarm timer bases
* @lock: Lock for synchronized access to the base
* @timerqueue: Timerqueue head managing the list of events
- * @gettime: Function to read the time correlating to the base
+ * @get_ktime: Function to read the time correlating to the base
+ * @get_timespec: Function to read the namespace time correlating to the base
* @base_clockid: clockid for the base
*/
static struct alarm_base {
spinlock_t lock;
struct timerqueue_head timerqueue;
- ktime_t (*gettime)(void);
+ ktime_t (*get_ktime)(void);
+ void (*get_timespec)(struct timespec64 *tp);
clockid_t base_clockid;
} alarm_bases[ALARM_NUMTYPE];
@@ -55,8 +58,6 @@ static DEFINE_SPINLOCK(freezer_delta_lock);
#endif
#ifdef CONFIG_RTC_CLASS
-static struct wakeup_source *ws;
-
/* rtc timer and device for setting alarm wakeups at suspend */
static struct rtc_timer rtctimer;
static struct rtc_device *rtcdev;
@@ -66,8 +67,6 @@ static DEFINE_SPINLOCK(rtcdev_lock);
* alarmtimer_get_rtcdev - Return selected rtcdevice
*
* This function returns the rtc device to use for wakealarms.
- * If one has not already been chosen, it checks to see if a
- * functional rtc device is available.
*/
struct rtc_device *alarmtimer_get_rtcdev(void)
{
@@ -87,7 +86,8 @@ static int alarmtimer_rtc_add_device(struct device *dev,
{
unsigned long flags;
struct rtc_device *rtc = to_rtc_device(dev);
- struct wakeup_source *__ws;
+ struct platform_device *pdev;
+ int ret = 0;
if (rtcdev)
return -EBUSY;
@@ -97,26 +97,31 @@ static int alarmtimer_rtc_add_device(struct device *dev,
if (!device_may_wakeup(rtc->dev.parent))
return -1;
- __ws = wakeup_source_register(dev, "alarmtimer");
+ pdev = platform_device_register_data(dev, "alarmtimer",
+ PLATFORM_DEVID_AUTO, NULL, 0);
+ if (!IS_ERR(pdev))
+ device_init_wakeup(&pdev->dev, true);
spin_lock_irqsave(&rtcdev_lock, flags);
- if (!rtcdev) {
+ if (!IS_ERR(pdev) && !rtcdev) {
if (!try_module_get(rtc->owner)) {
- spin_unlock_irqrestore(&rtcdev_lock, flags);
- return -1;
+ ret = -1;
+ goto unlock;
}
rtcdev = rtc;
/* hold a reference so it doesn't go away */
get_device(dev);
- ws = __ws;
- __ws = NULL;
+ pdev = NULL;
+ } else {
+ ret = -1;
}
+unlock:
spin_unlock_irqrestore(&rtcdev_lock, flags);
- wakeup_source_unregister(__ws);
+ platform_device_unregister(pdev);
- return 0;
+ return ret;
}
static inline void alarmtimer_rtc_timer_init(void)
@@ -138,11 +143,6 @@ static void alarmtimer_rtc_interface_remove(void)
class_interface_unregister(&alarmtimer_rtc_interface);
}
#else
-struct rtc_device *alarmtimer_get_rtcdev(void)
-{
- return NULL;
-}
-#define rtcdev (NULL)
static inline int alarmtimer_rtc_interface_setup(void) { return 0; }
static inline void alarmtimer_rtc_interface_remove(void) { }
static inline void alarmtimer_rtc_timer_init(void) { }
@@ -207,7 +207,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
spin_unlock_irqrestore(&base->lock, flags);
if (alarm->function)
- restart = alarm->function(alarm, base->gettime());
+ restart = alarm->function(alarm, base->get_ktime());
spin_lock_irqsave(&base->lock, flags);
if (restart != ALARMTIMER_NORESTART) {
@@ -217,7 +217,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
}
spin_unlock_irqrestore(&base->lock, flags);
- trace_alarmtimer_fired(alarm, base->gettime());
+ trace_alarmtimer_fired(alarm, base->get_ktime());
return ret;
}
@@ -225,7 +225,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
ktime_t alarm_expires_remaining(const struct alarm *alarm)
{
struct alarm_base *base = &alarm_bases[alarm->type];
- return ktime_sub(alarm->node.expires, base->gettime());
+ return ktime_sub(alarm->node.expires, base->get_ktime());
}
EXPORT_SYMBOL_GPL(alarm_expires_remaining);
@@ -270,7 +270,7 @@ static int alarmtimer_suspend(struct device *dev)
spin_unlock_irqrestore(&base->lock, flags);
if (!next)
continue;
- delta = ktime_sub(next->expires, base->gettime());
+ delta = ktime_sub(next->expires, base->get_ktime());
if (!min || (delta < min)) {
expires = next->expires;
min = delta;
@@ -281,7 +281,7 @@ static int alarmtimer_suspend(struct device *dev)
return 0;
if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) {
- __pm_wakeup_event(ws, 2 * MSEC_PER_SEC);
+ pm_wakeup_event(dev, 2 * MSEC_PER_SEC);
return -EBUSY;
}
@@ -296,7 +296,7 @@ static int alarmtimer_suspend(struct device *dev)
/* Set alarm, if in the past reject suspend briefly to handle */
ret = rtc_timer_start(rtc, &rtctimer, now, 0);
if (ret < 0)
- __pm_wakeup_event(ws, MSEC_PER_SEC);
+ pm_wakeup_event(dev, MSEC_PER_SEC);
return ret;
}
@@ -364,7 +364,7 @@ void alarm_start(struct alarm *alarm, ktime_t start)
hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS);
spin_unlock_irqrestore(&base->lock, flags);
- trace_alarmtimer_start(alarm, base->gettime());
+ trace_alarmtimer_start(alarm, base->get_ktime());
}
EXPORT_SYMBOL_GPL(alarm_start);
@@ -377,7 +377,7 @@ void alarm_start_relative(struct alarm *alarm, ktime_t start)
{
struct alarm_base *base = &alarm_bases[alarm->type];
- start = ktime_add_safe(start, base->gettime());
+ start = ktime_add_safe(start, base->get_ktime());
alarm_start(alarm, start);
}
EXPORT_SYMBOL_GPL(alarm_start_relative);
@@ -414,7 +414,7 @@ int alarm_try_to_cancel(struct alarm *alarm)
alarmtimer_dequeue(base, alarm);
spin_unlock_irqrestore(&base->lock, flags);
- trace_alarmtimer_cancel(alarm, base->gettime());
+ trace_alarmtimer_cancel(alarm, base->get_ktime());
return ret;
}
EXPORT_SYMBOL_GPL(alarm_try_to_cancel);
@@ -474,7 +474,7 @@ u64 alarm_forward_now(struct alarm *alarm, ktime_t interval)
{
struct alarm_base *base = &alarm_bases[alarm->type];
- return alarm_forward(alarm, base->gettime(), interval);
+ return alarm_forward(alarm, base->get_ktime(), interval);
}
EXPORT_SYMBOL_GPL(alarm_forward_now);
@@ -500,7 +500,7 @@ static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
return;
}
- delta = ktime_sub(absexp, base->gettime());
+ delta = ktime_sub(absexp, base->get_ktime());
spin_lock_irqsave(&freezer_delta_lock, flags);
if (!freezer_delta || (delta < freezer_delta)) {
@@ -632,7 +632,7 @@ static void alarm_timer_arm(struct k_itimer *timr, ktime_t expires,
struct alarm_base *base = &alarm_bases[alarm->type];
if (!absolute)
- expires = ktime_add_safe(expires, base->gettime());
+ expires = ktime_add_safe(expires, base->get_ktime());
if (sigev_none)
alarm->node.expires = expires;
else
@@ -657,24 +657,41 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec64 *tp
}
/**
- * alarm_clock_get - posix clock_get interface
+ * alarm_clock_get_timespec - posix clock_get_timespec interface
* @which_clock: clockid
* @tp: timespec to fill.
*
- * Provides the underlying alarm base time.
+ * Provides the underlying alarm base time in a task's time namespace.
*/
-static int alarm_clock_get(clockid_t which_clock, struct timespec64 *tp)
+static int alarm_clock_get_timespec(clockid_t which_clock, struct timespec64 *tp)
{
struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
if (!alarmtimer_get_rtcdev())
return -EINVAL;
- *tp = ktime_to_timespec64(base->gettime());
+ base->get_timespec(tp);
+
return 0;
}
/**
+ * alarm_clock_get_ktime - posix clock_get_ktime interface
+ * @which_clock: clockid
+ *
+ * Provides the underlying alarm base time in the root namespace.
+ */
+static ktime_t alarm_clock_get_ktime(clockid_t which_clock)
+{
+ struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
+
+ if (!alarmtimer_get_rtcdev())
+ return -EINVAL;
+
+ return base->get_ktime();
+}
+
+/**
* alarm_timer_create - posix timer_create interface
* @new_timer: k_itimer pointer to manage
*
@@ -747,7 +764,7 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp,
struct timespec64 rmt;
ktime_t rem;
- rem = ktime_sub(absexp, alarm_bases[type].gettime());
+ rem = ktime_sub(absexp, alarm_bases[type].get_ktime());
if (rem <= 0)
return 0;
@@ -816,9 +833,11 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
exp = timespec64_to_ktime(*tsreq);
/* Convert (if necessary) to absolute time */
if (flags != TIMER_ABSTIME) {
- ktime_t now = alarm_bases[type].gettime();
+ ktime_t now = alarm_bases[type].get_ktime();
exp = ktime_add_safe(now, exp);
+ } else {
+ exp = timens_ktime_to_host(which_clock, exp);
}
ret = alarmtimer_do_nsleep(&alarm, exp, type);
@@ -837,7 +856,8 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
const struct k_clock alarm_clock = {
.clock_getres = alarm_clock_getres,
- .clock_get = alarm_clock_get,
+ .clock_get_ktime = alarm_clock_get_ktime,
+ .clock_get_timespec = alarm_clock_get_timespec,
.timer_create = alarm_timer_create,
.timer_set = common_timer_set,
.timer_del = common_timer_del,
@@ -866,6 +886,12 @@ static struct platform_driver alarmtimer_driver = {
}
};
+static void get_boottime_timespec(struct timespec64 *tp)
+{
+ ktime_get_boottime_ts64(tp);
+ timens_add_boottime(tp);
+}
+
/**
* alarmtimer_init - Initialize alarm timer code
*
@@ -874,17 +900,18 @@ static struct platform_driver alarmtimer_driver = {
*/
static int __init alarmtimer_init(void)
{
- struct platform_device *pdev;
- int error = 0;
+ int error;
int i;
alarmtimer_rtc_timer_init();
/* Initialize alarm bases */
alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME;
- alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real;
+ alarm_bases[ALARM_REALTIME].get_ktime = &ktime_get_real;
+ alarm_bases[ALARM_REALTIME].get_timespec = ktime_get_real_ts64;
alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME;
- alarm_bases[ALARM_BOOTTIME].gettime = &ktime_get_boottime;
+ alarm_bases[ALARM_BOOTTIME].get_ktime = &ktime_get_boottime;
+ alarm_bases[ALARM_BOOTTIME].get_timespec = get_boottime_timespec;
for (i = 0; i < ALARM_NUMTYPE; i++) {
timerqueue_init_head(&alarm_bases[i].timerqueue);
spin_lock_init(&alarm_bases[i].lock);
@@ -898,15 +925,7 @@ static int __init alarmtimer_init(void)
if (error)
goto out_if;
- pdev = platform_device_register_simple("alarmtimer", -1, NULL, 0);
- if (IS_ERR(pdev)) {
- error = PTR_ERR(pdev);
- goto out_drv;
- }
return 0;
-
-out_drv:
- platform_driver_unregister(&alarmtimer_driver);
out_if:
alarmtimer_rtc_interface_remove();
return error;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 8de90ea31280..3a609e7344f3 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1477,7 +1477,7 @@ EXPORT_SYMBOL_GPL(hrtimer_active);
static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
struct hrtimer_clock_base *base,
struct hrtimer *timer, ktime_t *now,
- unsigned long flags)
+ unsigned long flags) __must_hold(&cpu_base->lock)
{
enum hrtimer_restart (*fn)(struct hrtimer *);
int restart;
@@ -1910,8 +1910,8 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
return ret;
}
-long hrtimer_nanosleep(const struct timespec64 *rqtp,
- const enum hrtimer_mode mode, const clockid_t clockid)
+long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
+ const clockid_t clockid)
{
struct restart_block *restart;
struct hrtimer_sleeper t;
@@ -1923,7 +1923,7 @@ long hrtimer_nanosleep(const struct timespec64 *rqtp,
slack = 0;
hrtimer_init_sleeper_on_stack(&t, clockid, mode);
- hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack);
+ hrtimer_set_expires_range_ns(&t.timer, rqtp, slack);
ret = do_nanosleep(&t, mode);
if (ret != -ERESTART_RESTARTBLOCK)
goto out;
@@ -1958,7 +1958,8 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
- return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
+ return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL,
+ CLOCK_MONOTONIC);
}
#endif
@@ -1978,7 +1979,8 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
- return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
+ return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL,
+ CLOCK_MONOTONIC);
}
#endif
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
new file mode 100644
index 000000000000..12858507d75a
--- /dev/null
+++ b/kernel/time/namespace.c
@@ -0,0 +1,468 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Author: Andrei Vagin <avagin@openvz.org>
+ * Author: Dmitry Safonov <dima@arista.com>
+ */
+
+#include <linux/time_namespace.h>
+#include <linux/user_namespace.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/seq_file.h>
+#include <linux/proc_ns.h>
+#include <linux/export.h>
+#include <linux/time.h>
+#include <linux/slab.h>
+#include <linux/cred.h>
+#include <linux/err.h>
+#include <linux/mm.h>
+
+#include <vdso/datapage.h>
+
+ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim,
+ struct timens_offsets *ns_offsets)
+{
+ ktime_t offset;
+
+ switch (clockid) {
+ case CLOCK_MONOTONIC:
+ offset = timespec64_to_ktime(ns_offsets->monotonic);
+ break;
+ case CLOCK_BOOTTIME:
+ case CLOCK_BOOTTIME_ALARM:
+ offset = timespec64_to_ktime(ns_offsets->boottime);
+ break;
+ default:
+ return tim;
+ }
+
+ /*
+ * Check that @tim value is in [offset, KTIME_MAX + offset]
+ * and subtract offset.
+ */
+ if (tim < offset) {
+ /*
+ * The user may pass an *absolute* @tim value - if it is less than
+ * the time namespace's offset, it has already expired.
+ */
+ tim = 0;
+ } else {
+ tim = ktime_sub(tim, offset);
+ if (unlikely(tim > KTIME_MAX))
+ tim = KTIME_MAX;
+ }
+
+ return tim;
+}
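
do_timens_ktime_to_host() boils down to a clamped subtraction of the namespace offset. As a rough stand-alone illustration only (not kernel code; the function name below is made up), the same arithmetic looks like this:

    #include <stdint.h>
    #include <stdio.h>

    /* Translate a namespace-absolute expiry to host time, as above. */
    static int64_t ns_abs_to_host(int64_t tim, int64_t offset)
    {
    	if (tim < offset)
    		return 0;		/* already expired in host terms */
    	return tim - offset;	/* the kernel additionally clamps to KTIME_MAX */
    }

    int main(void)
    {
    	printf("%lld\n", (long long)ns_abs_to_host(5, 10));	/* 0: expired */
    	printf("%lld\n", (long long)ns_abs_to_host(15, 10));	/* 5 */
    	return 0;
    }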
+
+static struct ucounts *inc_time_namespaces(struct user_namespace *ns)
+{
+ return inc_ucount(ns, current_euid(), UCOUNT_TIME_NAMESPACES);
+}
+
+static void dec_time_namespaces(struct ucounts *ucounts)
+{
+ dec_ucount(ucounts, UCOUNT_TIME_NAMESPACES);
+}
+
+/**
+ * clone_time_ns - Clone a time namespace
+ * @user_ns: User namespace which owns a new namespace.
+ * @old_ns: Namespace to clone
+ *
+ * Clone @old_ns and set the clone refcount to 1
+ *
+ * Return: The new namespace or ERR_PTR.
+ */
+static struct time_namespace *clone_time_ns(struct user_namespace *user_ns,
+ struct time_namespace *old_ns)
+{
+ struct time_namespace *ns;
+ struct ucounts *ucounts;
+ int err;
+
+ err = -ENOSPC;
+ ucounts = inc_time_namespaces(user_ns);
+ if (!ucounts)
+ goto fail;
+
+ err = -ENOMEM;
+ ns = kmalloc(sizeof(*ns), GFP_KERNEL);
+ if (!ns)
+ goto fail_dec;
+
+ kref_init(&ns->kref);
+
+ ns->vvar_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!ns->vvar_page)
+ goto fail_free;
+
+ err = ns_alloc_inum(&ns->ns);
+ if (err)
+ goto fail_free_page;
+
+ ns->ucounts = ucounts;
+ ns->ns.ops = &timens_operations;
+ ns->user_ns = get_user_ns(user_ns);
+ ns->offsets = old_ns->offsets;
+ ns->frozen_offsets = false;
+ return ns;
+
+fail_free_page:
+ __free_page(ns->vvar_page);
+fail_free:
+ kfree(ns);
+fail_dec:
+ dec_time_namespaces(ucounts);
+fail:
+ return ERR_PTR(err);
+}
+
+/**
+ * copy_time_ns - Create timens_for_children from @old_ns
+ * @flags: Cloning flags
+ * @user_ns: User namespace which owns a new namespace.
+ * @old_ns: Namespace to clone
+ *
+ * If CLONE_NEWTIME is set in @flags, create a new timens_for_children
+ * namespace; otherwise take an additional reference on @old_ns.
+ *
+ * Return: timens_for_children namespace or ERR_PTR.
+ */
+struct time_namespace *copy_time_ns(unsigned long flags,
+ struct user_namespace *user_ns, struct time_namespace *old_ns)
+{
+ if (!(flags & CLONE_NEWTIME))
+ return get_time_ns(old_ns);
+
+ return clone_time_ns(user_ns, old_ns);
+}
+
+static struct timens_offset offset_from_ts(struct timespec64 off)
+{
+ struct timens_offset ret;
+
+ ret.sec = off.tv_sec;
+ ret.nsec = off.tv_nsec;
+
+ return ret;
+}
+
+/*
+ * A time namespace VVAR page has the same layout as the VVAR page which
+ * contains the system wide VDSO data.
+ *
+ * For a normal task the VVAR pages are installed in the normal ordering:
+ * VVAR
+ * PVCLOCK
+ * HVCLOCK
+ * TIMENS <- Not really required
+ *
+ * Now for a timens task the pages are installed in the following order:
+ * TIMENS
+ * PVCLOCK
+ * HVCLOCK
+ * VVAR
+ *
+ * The check for vdso_data->clock_mode is in the unlikely path of
+ * the seq begin magic. In the non-timens case 'seq' is even most of
+ * the time, so the branch is not taken.
+ *
+ * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check
+ * for vdso_data->clock_mode is a non-issue. The task is spin waiting for the
+ * update to finish and for 'seq' to become even anyway.
+ *
+ * The timens page has vdso_data->clock_mode set to VCLOCK_TIMENS, which
+ * forces the time namespace handling path.
+ */
+static void timens_setup_vdso_data(struct vdso_data *vdata,
+ struct time_namespace *ns)
+{
+ struct timens_offset *offset = vdata->offset;
+ struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic);
+ struct timens_offset boottime = offset_from_ts(ns->offsets.boottime);
+
+ vdata->seq = 1;
+ vdata->clock_mode = VCLOCK_TIMENS;
+ offset[CLOCK_MONOTONIC] = monotonic;
+ offset[CLOCK_MONOTONIC_RAW] = monotonic;
+ offset[CLOCK_MONOTONIC_COARSE] = monotonic;
+ offset[CLOCK_BOOTTIME] = boottime;
+ offset[CLOCK_BOOTTIME_ALARM] = boottime;
+}
+
+/*
+ * Protects possibly multiple offsets writers racing each other
+ * and tasks entering the namespace.
+ */
+static DEFINE_MUTEX(offset_lock);
+
+static void timens_set_vvar_page(struct task_struct *task,
+ struct time_namespace *ns)
+{
+ struct vdso_data *vdata;
+ unsigned int i;
+
+ if (ns == &init_time_ns)
+ return;
+
+ /* Fast-path, taken by every task in namespace except the first. */
+ if (likely(ns->frozen_offsets))
+ return;
+
+ mutex_lock(&offset_lock);
+ /* Nothing to do: vvar_page has already been initialized. */
+ if (ns->frozen_offsets)
+ goto out;
+
+ ns->frozen_offsets = true;
+ vdata = arch_get_vdso_data(page_address(ns->vvar_page));
+
+ for (i = 0; i < CS_BASES; i++)
+ timens_setup_vdso_data(&vdata[i], ns);
+
+out:
+ mutex_unlock(&offset_lock);
+}
+
+void free_time_ns(struct kref *kref)
+{
+ struct time_namespace *ns;
+
+ ns = container_of(kref, struct time_namespace, kref);
+ dec_time_namespaces(ns->ucounts);
+ put_user_ns(ns->user_ns);
+ ns_free_inum(&ns->ns);
+ __free_page(ns->vvar_page);
+ kfree(ns);
+}
+
+static struct time_namespace *to_time_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct time_namespace, ns);
+}
+
+static struct ns_common *timens_get(struct task_struct *task)
+{
+ struct time_namespace *ns = NULL;
+ struct nsproxy *nsproxy;
+
+ task_lock(task);
+ nsproxy = task->nsproxy;
+ if (nsproxy) {
+ ns = nsproxy->time_ns;
+ get_time_ns(ns);
+ }
+ task_unlock(task);
+
+ return ns ? &ns->ns : NULL;
+}
+
+static struct ns_common *timens_for_children_get(struct task_struct *task)
+{
+ struct time_namespace *ns = NULL;
+ struct nsproxy *nsproxy;
+
+ task_lock(task);
+ nsproxy = task->nsproxy;
+ if (nsproxy) {
+ ns = nsproxy->time_ns_for_children;
+ get_time_ns(ns);
+ }
+ task_unlock(task);
+
+ return ns ? &ns->ns : NULL;
+}
+
+static void timens_put(struct ns_common *ns)
+{
+ put_time_ns(to_time_ns(ns));
+}
+
+static int timens_install(struct nsproxy *nsproxy, struct ns_common *new)
+{
+ struct time_namespace *ns = to_time_ns(new);
+ int err;
+
+ if (!current_is_single_threaded())
+ return -EUSERS;
+
+ if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
+ !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ return -EPERM;
+
+ timens_set_vvar_page(current, ns);
+
+ err = vdso_join_timens(current, ns);
+ if (err)
+ return err;
+
+ get_time_ns(ns);
+ put_time_ns(nsproxy->time_ns);
+ nsproxy->time_ns = ns;
+
+ get_time_ns(ns);
+ put_time_ns(nsproxy->time_ns_for_children);
+ nsproxy->time_ns_for_children = ns;
+ return 0;
+}
+
+int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk)
+{
+ struct ns_common *nsc = &nsproxy->time_ns_for_children->ns;
+ struct time_namespace *ns = to_time_ns(nsc);
+ int err;
+
+ /* create_new_namespaces() already incremented the ref counter */
+ if (nsproxy->time_ns == nsproxy->time_ns_for_children)
+ return 0;
+
+ timens_set_vvar_page(tsk, ns);
+
+ err = vdso_join_timens(tsk, ns);
+ if (err)
+ return err;
+
+ get_time_ns(ns);
+ put_time_ns(nsproxy->time_ns);
+ nsproxy->time_ns = ns;
+
+ return 0;
+}
+
+static struct user_namespace *timens_owner(struct ns_common *ns)
+{
+ return to_time_ns(ns)->user_ns;
+}
+
+static void show_offset(struct seq_file *m, int clockid, struct timespec64 *ts)
+{
+ seq_printf(m, "%d %lld %ld\n", clockid, ts->tv_sec, ts->tv_nsec);
+}
+
+void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m)
+{
+ struct ns_common *ns;
+ struct time_namespace *time_ns;
+
+ ns = timens_for_children_get(p);
+ if (!ns)
+ return;
+ time_ns = to_time_ns(ns);
+
+ show_offset(m, CLOCK_MONOTONIC, &time_ns->offsets.monotonic);
+ show_offset(m, CLOCK_BOOTTIME, &time_ns->offsets.boottime);
+ put_time_ns(time_ns);
+}
+
+int proc_timens_set_offset(struct file *file, struct task_struct *p,
+ struct proc_timens_offset *offsets, int noffsets)
+{
+ struct ns_common *ns;
+ struct time_namespace *time_ns;
+ struct timespec64 tp;
+ int i, err;
+
+ ns = timens_for_children_get(p);
+ if (!ns)
+ return -ESRCH;
+ time_ns = to_time_ns(ns);
+
+ if (!file_ns_capable(file, time_ns->user_ns, CAP_SYS_TIME)) {
+ put_time_ns(time_ns);
+ return -EPERM;
+ }
+
+ for (i = 0; i < noffsets; i++) {
+ struct proc_timens_offset *off = &offsets[i];
+
+ switch (off->clockid) {
+ case CLOCK_MONOTONIC:
+ ktime_get_ts64(&tp);
+ break;
+ case CLOCK_BOOTTIME:
+ ktime_get_boottime_ts64(&tp);
+ break;
+ default:
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = -ERANGE;
+
+ if (off->val.tv_sec > KTIME_SEC_MAX ||
+ off->val.tv_sec < -KTIME_SEC_MAX)
+ goto out;
+
+ tp = timespec64_add(tp, off->val);
+ /*
+ * KTIME_SEC_MAX is divided by 2 to be sure that KTIME_MAX is
+ * still unreachable.
+ */
+ if (tp.tv_sec < 0 || tp.tv_sec > KTIME_SEC_MAX / 2)
+ goto out;
+ }
+
+ mutex_lock(&offset_lock);
+ if (time_ns->frozen_offsets) {
+ err = -EACCES;
+ goto out_unlock;
+ }
+
+ err = 0;
+ /* Don't report errors after this line */
+ for (i = 0; i < noffsets; i++) {
+ struct proc_timens_offset *off = &offsets[i];
+ struct timespec64 *offset = NULL;
+
+ switch (off->clockid) {
+ case CLOCK_MONOTONIC:
+ offset = &time_ns->offsets.monotonic;
+ break;
+ case CLOCK_BOOTTIME:
+ offset = &time_ns->offsets.boottime;
+ break;
+ }
+
+ *offset = off->val;
+ }
+
+out_unlock:
+ mutex_unlock(&offset_lock);
+out:
+ put_time_ns(time_ns);
+
+ return err;
+}
+
+const struct proc_ns_operations timens_operations = {
+ .name = "time",
+ .type = CLONE_NEWTIME,
+ .get = timens_get,
+ .put = timens_put,
+ .install = timens_install,
+ .owner = timens_owner,
+};
+
+const struct proc_ns_operations timens_for_children_operations = {
+ .name = "time_for_children",
+ .type = CLONE_NEWTIME,
+ .get = timens_for_children_get,
+ .put = timens_put,
+ .install = timens_install,
+ .owner = timens_owner,
+};
+
+struct time_namespace init_time_ns = {
+ .kref = KREF_INIT(3),
+ .user_ns = &init_user_ns,
+ .ns.inum = PROC_TIME_INIT_INO,
+ .ns.ops = &timens_operations,
+ .frozen_offsets = true,
+};
+
+static int __init time_ns_init(void)
+{
+ return 0;
+}
+subsys_initcall(time_ns_init);
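
Taken together with the CLONE_NEWTIME plumbing elsewhere in this series, the namespace is driven from user space by unshare()ing and then writing /proc/<pid>/timens_offsets before the first task enters it. A hedged sketch - the CLONE_NEWTIME value, the proc file format, and the CAP_SYS_TIME requirement are assumptions based on this series, not verified against this exact tree:

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/wait.h>
    #include <time.h>
    #include <unistd.h>

    #ifndef CLONE_NEWTIME
    #define CLONE_NEWTIME 0x00000080	/* assumed uapi value from this series */
    #endif

    int main(void)
    {
    	/* Only children of this process enter the new time namespace. */
    	if (unshare(CLONE_NEWTIME)) {
    		perror("unshare(CLONE_NEWTIME)");
    		return 1;
    	}

    	/* Offsets can only be written while the namespace is still unused. */
    	FILE *f = fopen("/proc/self/timens_offsets", "w");
    	if (!f || fprintf(f, "monotonic 10 0\n") < 0 || fclose(f)) {
    		perror("timens_offsets");
    		return 1;
    	}

    	pid_t pid = fork();		/* the child is the first timens task */
    	if (pid < 0) {
    		perror("fork");
    		return 1;
    	}
    	if (pid == 0) {
    		struct timespec ts;
    		clock_gettime(CLOCK_MONOTONIC, &ts);
    		printf("child CLOCK_MONOTONIC: %lld.%09ld\n",
    		       (long long)ts.tv_sec, ts.tv_nsec);
    		_exit(0);
    	}
    	waitpid(pid, NULL, 0);
    	return 0;
    }

In this sketch the child's CLOCK_MONOTONIC reading would be roughly ten seconds ahead of the host's, while the parent (which never entered the namespace) is unaffected.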
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index ec960bb939fd..77c0c2370b6d 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -14,8 +14,6 @@
#include "posix-timers.h"
-static void delete_clock(struct kref *kref);
-
/*
* Returns NULL if the posix_clock instance attached to 'fp' is old and stale.
*/
@@ -125,7 +123,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp)
err = 0;
if (!err) {
- kref_get(&clk->kref);
+ get_device(clk->dev);
fp->private_data = clk;
}
out:
@@ -141,7 +139,7 @@ static int posix_clock_release(struct inode *inode, struct file *fp)
if (clk->ops.release)
err = clk->ops.release(clk);
- kref_put(&clk->kref, delete_clock);
+ put_device(clk->dev);
fp->private_data = NULL;
@@ -161,38 +159,35 @@ static const struct file_operations posix_clock_file_operations = {
#endif
};
-int posix_clock_register(struct posix_clock *clk, dev_t devid)
+int posix_clock_register(struct posix_clock *clk, struct device *dev)
{
int err;
- kref_init(&clk->kref);
init_rwsem(&clk->rwsem);
cdev_init(&clk->cdev, &posix_clock_file_operations);
+ err = cdev_device_add(&clk->cdev, dev);
+ if (err) {
+ pr_err("%s unable to add device %d:%d\n",
+ dev_name(dev), MAJOR(dev->devt), MINOR(dev->devt));
+ return err;
+ }
clk->cdev.owner = clk->ops.owner;
- err = cdev_add(&clk->cdev, devid, 1);
+ clk->dev = dev;
- return err;
+ return 0;
}
EXPORT_SYMBOL_GPL(posix_clock_register);
-static void delete_clock(struct kref *kref)
-{
- struct posix_clock *clk = container_of(kref, struct posix_clock, kref);
-
- if (clk->release)
- clk->release(clk);
-}
-
void posix_clock_unregister(struct posix_clock *clk)
{
- cdev_del(&clk->cdev);
+ cdev_device_del(&clk->cdev, clk->dev);
down_write(&clk->rwsem);
clk->zombie = true;
up_write(&clk->rwsem);
- kref_put(&clk->kref, delete_clock);
+ put_device(clk->dev);
}
EXPORT_SYMBOL_GPL(posix_clock_unregister);
@@ -315,8 +310,8 @@ out:
}
const struct k_clock clock_posix_dynamic = {
- .clock_getres = pc_clock_getres,
- .clock_set = pc_clock_settime,
- .clock_get = pc_clock_gettime,
- .clock_adj = pc_clock_adjtime,
+ .clock_getres = pc_clock_getres,
+ .clock_set = pc_clock_settime,
+ .clock_get_timespec = pc_clock_gettime,
+ .clock_adj = pc_clock_adjtime,
};
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 42d512fcfda2..8ff6da77a01f 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1391,26 +1391,26 @@ static int thread_cpu_timer_create(struct k_itimer *timer)
}
const struct k_clock clock_posix_cpu = {
- .clock_getres = posix_cpu_clock_getres,
- .clock_set = posix_cpu_clock_set,
- .clock_get = posix_cpu_clock_get,
- .timer_create = posix_cpu_timer_create,
- .nsleep = posix_cpu_nsleep,
- .timer_set = posix_cpu_timer_set,
- .timer_del = posix_cpu_timer_del,
- .timer_get = posix_cpu_timer_get,
- .timer_rearm = posix_cpu_timer_rearm,
+ .clock_getres = posix_cpu_clock_getres,
+ .clock_set = posix_cpu_clock_set,
+ .clock_get_timespec = posix_cpu_clock_get,
+ .timer_create = posix_cpu_timer_create,
+ .nsleep = posix_cpu_nsleep,
+ .timer_set = posix_cpu_timer_set,
+ .timer_del = posix_cpu_timer_del,
+ .timer_get = posix_cpu_timer_get,
+ .timer_rearm = posix_cpu_timer_rearm,
};
const struct k_clock clock_process = {
- .clock_getres = process_cpu_clock_getres,
- .clock_get = process_cpu_clock_get,
- .timer_create = process_cpu_timer_create,
- .nsleep = process_cpu_nsleep,
+ .clock_getres = process_cpu_clock_getres,
+ .clock_get_timespec = process_cpu_clock_get,
+ .timer_create = process_cpu_timer_create,
+ .nsleep = process_cpu_nsleep,
};
const struct k_clock clock_thread = {
- .clock_getres = thread_cpu_clock_getres,
- .clock_get = thread_cpu_clock_get,
- .timer_create = thread_cpu_timer_create,
+ .clock_getres = thread_cpu_clock_getres,
+ .clock_get_timespec = thread_cpu_clock_get,
+ .timer_create = thread_cpu_timer_create,
};
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index 67df65f887ac..fcb3b21d8bdc 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -14,6 +14,7 @@
#include <linux/ktime.h>
#include <linux/timekeeping.h>
#include <linux/posix-timers.h>
+#include <linux/time_namespace.h>
#include <linux/compat.h>
#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER
@@ -77,9 +78,11 @@ int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp)
break;
case CLOCK_MONOTONIC:
ktime_get_ts64(tp);
+ timens_add_monotonic(tp);
break;
case CLOCK_BOOTTIME:
ktime_get_boottime_ts64(tp);
+ timens_add_boottime(tp);
break;
default:
return -EINVAL;
@@ -126,6 +129,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
struct __kernel_timespec __user *, rmtp)
{
struct timespec64 t;
+ ktime_t texp;
switch (which_clock) {
case CLOCK_REALTIME:
@@ -144,13 +148,19 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
rmtp = NULL;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
- return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ?
+ texp = timespec64_to_ktime(t);
+ if (flags & TIMER_ABSTIME)
+ texp = timens_ktime_to_host(which_clock, texp);
+ return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ?
HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
which_clock);
}
#ifdef CONFIG_COMPAT
COMPAT_SYS_NI(timer_create);
+#endif
+
+#if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA)
COMPAT_SYS_NI(getitimer);
COMPAT_SYS_NI(setitimer);
#endif
@@ -212,6 +222,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
struct old_timespec32 __user *, rmtp)
{
struct timespec64 t;
+ ktime_t texp;
switch (which_clock) {
case CLOCK_REALTIME:
@@ -230,7 +241,10 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
rmtp = NULL;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
- return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ?
+ texp = timespec64_to_ktime(t);
+ if (flags & TIMER_ABSTIME)
+ texp = timens_ktime_to_host(which_clock, texp);
+ return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ?
HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
which_clock);
}
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 0ec5b7a1d769..ff0eb30de346 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -30,6 +30,7 @@
#include <linux/hashtable.h>
#include <linux/compat.h>
#include <linux/nospec.h>
+#include <linux/time_namespace.h>
#include "timekeeping.h"
#include "posix-timers.h"
@@ -165,12 +166,17 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
}
/* Get clock_realtime */
-static int posix_clock_realtime_get(clockid_t which_clock, struct timespec64 *tp)
+static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_real_ts64(tp);
return 0;
}
+static ktime_t posix_get_realtime_ktime(clockid_t which_clock)
+{
+ return ktime_get_real();
+}
+
/* Set clock_realtime */
static int posix_clock_realtime_set(const clockid_t which_clock,
const struct timespec64 *tp)
@@ -187,18 +193,25 @@ static int posix_clock_realtime_adj(const clockid_t which_clock,
/*
* Get monotonic time for posix timers
*/
-static int posix_ktime_get_ts(clockid_t which_clock, struct timespec64 *tp)
+static int posix_get_monotonic_timespec(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_ts64(tp);
+ timens_add_monotonic(tp);
return 0;
}
+static ktime_t posix_get_monotonic_ktime(clockid_t which_clock)
+{
+ return ktime_get();
+}
+
/*
* Get monotonic-raw time for posix timers
*/
static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_raw_ts64(tp);
+ timens_add_monotonic(tp);
return 0;
}
@@ -213,6 +226,7 @@ static int posix_get_monotonic_coarse(clockid_t which_clock,
struct timespec64 *tp)
{
ktime_get_coarse_ts64(tp);
+ timens_add_monotonic(tp);
return 0;
}
@@ -222,18 +236,29 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 *
return 0;
}
-static int posix_get_boottime(const clockid_t which_clock, struct timespec64 *tp)
+static int posix_get_boottime_timespec(const clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_boottime_ts64(tp);
+ timens_add_boottime(tp);
return 0;
}
-static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp)
+static ktime_t posix_get_boottime_ktime(const clockid_t which_clock)
+{
+ return ktime_get_boottime();
+}
+
+static int posix_get_tai_timespec(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_clocktai_ts64(tp);
return 0;
}
+static ktime_t posix_get_tai_ktime(clockid_t which_clock)
+{
+ return ktime_get_clocktai();
+}
+
static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp)
{
tp->tv_sec = 0;
@@ -645,7 +670,6 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
{
const struct k_clock *kc = timr->kclock;
ktime_t now, remaining, iv;
- struct timespec64 ts64;
bool sig_none;
sig_none = timr->it_sigev_notify == SIGEV_NONE;
@@ -663,12 +687,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
return;
}
- /*
- * The timespec64 based conversion is suboptimal, but it's not
- * worth to implement yet another callback.
- */
- kc->clock_get(timr->it_clock, &ts64);
- now = timespec64_to_ktime(ts64);
+ now = kc->clock_get_ktime(timr->it_clock);
/*
* When a requeue is pending or this is a SIGEV_NONE timer move the
@@ -781,7 +800,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
* Posix magic: Relative CLOCK_REALTIME timers are not affected by
* clock modifications, so they become CLOCK_MONOTONIC based under the
* hood. See hrtimer_init(). Update timr->kclock, so the generic
- * functions which use timr->kclock->clock_get() work.
+ * functions which use timr->kclock->clock_get_*() work.
*
* Note: it_clock stays unmodified, because the next timer_set() might
* use ABSTIME, so it needs to switch back.
@@ -866,6 +885,8 @@ int common_timer_set(struct k_itimer *timr, int flags,
timr->it_interval = timespec64_to_ktime(new_setting->it_interval);
expires = timespec64_to_ktime(new_setting->it_value);
+ if (flags & TIMER_ABSTIME)
+ expires = timens_ktime_to_host(timr->it_clock, expires);
sigev_none = timr->it_sigev_notify == SIGEV_NONE;
kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none);
@@ -1067,7 +1088,7 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
if (!kc)
return -EINVAL;
- error = kc->clock_get(which_clock, &kernel_tp);
+ error = kc->clock_get_timespec(which_clock, &kernel_tp);
if (!error && put_timespec64(&kernel_tp, tp))
error = -EFAULT;
@@ -1149,7 +1170,7 @@ SYSCALL_DEFINE2(clock_gettime32, clockid_t, which_clock,
if (!kc)
return -EINVAL;
- err = kc->clock_get(which_clock, &ts);
+ err = kc->clock_get_timespec(which_clock, &ts);
if (!err && put_old_timespec32(&ts, tp))
err = -EFAULT;
@@ -1200,7 +1221,22 @@ SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock,
static int common_nsleep(const clockid_t which_clock, int flags,
const struct timespec64 *rqtp)
{
- return hrtimer_nanosleep(rqtp, flags & TIMER_ABSTIME ?
+ ktime_t texp = timespec64_to_ktime(*rqtp);
+
+ return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ?
+ HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
+ which_clock);
+}
+
+static int common_nsleep_timens(const clockid_t which_clock, int flags,
+ const struct timespec64 *rqtp)
+{
+ ktime_t texp = timespec64_to_ktime(*rqtp);
+
+ if (flags & TIMER_ABSTIME)
+ texp = timens_ktime_to_host(which_clock, texp);
+
+ return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ?
HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
which_clock);
}
@@ -1261,7 +1297,8 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
static const struct k_clock clock_realtime = {
.clock_getres = posix_get_hrtimer_res,
- .clock_get = posix_clock_realtime_get,
+ .clock_get_timespec = posix_get_realtime_timespec,
+ .clock_get_ktime = posix_get_realtime_ktime,
.clock_set = posix_clock_realtime_set,
.clock_adj = posix_clock_realtime_adj,
.nsleep = common_nsleep,
@@ -1279,8 +1316,9 @@ static const struct k_clock clock_realtime = {
static const struct k_clock clock_monotonic = {
.clock_getres = posix_get_hrtimer_res,
- .clock_get = posix_ktime_get_ts,
- .nsleep = common_nsleep,
+ .clock_get_timespec = posix_get_monotonic_timespec,
+ .clock_get_ktime = posix_get_monotonic_ktime,
+ .nsleep = common_nsleep_timens,
.timer_create = common_timer_create,
.timer_set = common_timer_set,
.timer_get = common_timer_get,
@@ -1295,22 +1333,23 @@ static const struct k_clock clock_monotonic = {
static const struct k_clock clock_monotonic_raw = {
.clock_getres = posix_get_hrtimer_res,
- .clock_get = posix_get_monotonic_raw,
+ .clock_get_timespec = posix_get_monotonic_raw,
};
static const struct k_clock clock_realtime_coarse = {
.clock_getres = posix_get_coarse_res,
- .clock_get = posix_get_realtime_coarse,
+ .clock_get_timespec = posix_get_realtime_coarse,
};
static const struct k_clock clock_monotonic_coarse = {
.clock_getres = posix_get_coarse_res,
- .clock_get = posix_get_monotonic_coarse,
+ .clock_get_timespec = posix_get_monotonic_coarse,
};
static const struct k_clock clock_tai = {
.clock_getres = posix_get_hrtimer_res,
- .clock_get = posix_get_tai,
+ .clock_get_ktime = posix_get_tai_ktime,
+ .clock_get_timespec = posix_get_tai_timespec,
.nsleep = common_nsleep,
.timer_create = common_timer_create,
.timer_set = common_timer_set,
@@ -1326,8 +1365,9 @@ static const struct k_clock clock_tai = {
static const struct k_clock clock_boottime = {
.clock_getres = posix_get_hrtimer_res,
- .clock_get = posix_get_boottime,
- .nsleep = common_nsleep,
+ .clock_get_ktime = posix_get_boottime_ktime,
+ .clock_get_timespec = posix_get_boottime_timespec,
+ .nsleep = common_nsleep_timens,
.timer_create = common_timer_create,
.timer_set = common_timer_set,
.timer_get = common_timer_get,
diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h
index 897c29e162b9..f32a2ebba9b8 100644
--- a/kernel/time/posix-timers.h
+++ b/kernel/time/posix-timers.h
@@ -6,8 +6,11 @@ struct k_clock {
struct timespec64 *tp);
int (*clock_set)(const clockid_t which_clock,
const struct timespec64 *tp);
- int (*clock_get)(const clockid_t which_clock,
- struct timespec64 *tp);
+ /* Returns the clock value in the current time namespace. */
+ int (*clock_get_timespec)(const clockid_t which_clock,
+ struct timespec64 *tp);
+ /* Returns the clock value in the root time namespace. */
+ ktime_t (*clock_get_ktime)(const clockid_t which_clock);
int (*clock_adj)(const clockid_t which_clock, struct __kernel_timex *tx);
int (*timer_create)(struct k_itimer *timer);
int (*nsleep)(const clockid_t which_clock, int flags,
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index dbd69052eaa6..e4332e3e2d56 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -169,14 +169,15 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
{
u64 res, wrap, new_mask, new_epoch, cyc, ns;
u32 new_mult, new_shift;
- unsigned long r;
+ unsigned long r, flags;
char r_unit;
struct clock_read_data rd;
if (cd.rate > rate)
return;
- WARN_ON(!irqs_disabled());
+ /* Cannot register a sched_clock with interrupts on */
+ local_irq_save(flags);
/* Calculate the mult/shift to convert counter ticks to ns. */
clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600);
@@ -233,6 +234,8 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
if (irqtime > 0 || (irqtime == -1 && rate >= 1000000))
enable_sched_clock_irqtime();
+ local_irq_restore(flags);
+
pr_debug("Registered %pS as sched_clock source\n", read);
}
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 59225b484e4e..7e5d3524e924 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -11,6 +11,7 @@
#include <linux/err.h>
#include <linux/hrtimer.h>
#include <linux/interrupt.h>
+#include <linux/nmi.h>
#include <linux/percpu.h>
#include <linux/profile.h>
#include <linux/sched.h>
@@ -558,6 +559,7 @@ void tick_unfreeze(void)
trace_suspend_resume(TPS("timekeeping_freeze"),
smp_processor_id(), false);
} else {
+ touch_softlockup_watchdog();
tick_resume_local();
}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 8b192e67aabc..a792d21cac64 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -58,8 +58,9 @@ static void tick_do_update_jiffies64(ktime_t now)
/*
* Do a quick check without holding jiffies_lock:
+ * The READ_ONCE() pairs with two updates done later in this function.
*/
- delta = ktime_sub(now, last_jiffies_update);
+ delta = ktime_sub(now, READ_ONCE(last_jiffies_update));
if (delta < tick_period)
return;
@@ -70,8 +71,9 @@ static void tick_do_update_jiffies64(ktime_t now)
if (delta >= tick_period) {
delta = ktime_sub(delta, tick_period);
- last_jiffies_update = ktime_add(last_jiffies_update,
- tick_period);
+ /* Pairs with the lockless read in this function. */
+ WRITE_ONCE(last_jiffies_update,
+ ktime_add(last_jiffies_update, tick_period));
/* Slow path for long timeouts */
if (unlikely(delta >= tick_period)) {
@@ -79,8 +81,10 @@ static void tick_do_update_jiffies64(ktime_t now)
ticks = ktime_divns(delta, incr);
- last_jiffies_update = ktime_add_ns(last_jiffies_update,
- incr * ticks);
+ /* Pairs with the lockless read in this function. */
+ WRITE_ONCE(last_jiffies_update,
+ ktime_add_ns(last_jiffies_update,
+ incr * ticks));
}
do_timer(++ticks);
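
The lockless quick check introduced here relies on READ_ONCE() pairing with WRITE_ONCE() performed under jiffies_lock, so the unlocked reader sees either the old or the new value but never a torn one. A rough user-space analogue of the pattern, using C11 relaxed atomics plus a mutex (names and values are illustrative, not kernel code):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    static pthread_mutex_t update_lock = PTHREAD_MUTEX_INITIALIZER;
    static _Atomic int64_t last_update;	/* plays the role of last_jiffies_update */

    static void maybe_update(int64_t now, int64_t period)
    {
    	/* Quick check without the lock: relaxed load ~ READ_ONCE(). */
    	if (now - atomic_load_explicit(&last_update, memory_order_relaxed) < period)
    		return;

    	pthread_mutex_lock(&update_lock);
    	int64_t last = atomic_load_explicit(&last_update, memory_order_relaxed);
    	if (now - last >= period)	/* re-check under the lock */
    		atomic_store_explicit(&last_update, last + period,
    				      memory_order_relaxed);	/* ~ WRITE_ONCE() */
    	pthread_mutex_unlock(&update_lock);
    }

    int main(void)
    {
    	maybe_update(10, 4);
    	printf("last_update = %lld\n", (long long)last_update);
    	return 0;
    }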
diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c
index 5ee0f7709410..9577c89179cd 100644
--- a/kernel/time/vsyscall.c
+++ b/kernel/time/vsyscall.c
@@ -28,11 +28,6 @@ static inline void update_vdso_data(struct vdso_data *vdata,
vdata[CS_RAW].mult = tk->tkr_raw.mult;
vdata[CS_RAW].shift = tk->tkr_raw.shift;
- /* CLOCK_REALTIME */
- vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME];
- vdso_ts->sec = tk->xtime_sec;
- vdso_ts->nsec = tk->tkr_mono.xtime_nsec;
-
/* CLOCK_MONOTONIC */
vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC];
vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
@@ -70,12 +65,6 @@ static inline void update_vdso_data(struct vdso_data *vdata,
vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI];
vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset;
vdso_ts->nsec = tk->tkr_mono.xtime_nsec;
-
- /*
- * Read without the seqlock held by clock_getres().
- * Note: No need to have a second copy.
- */
- WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution);
}
void update_vsyscall(struct timekeeper *tk)
@@ -84,20 +73,17 @@ void update_vsyscall(struct timekeeper *tk)
struct vdso_timestamp *vdso_ts;
u64 nsec;
- if (__arch_update_vdso_data()) {
- /*
- * Some architectures might want to skip the update of the
- * data page.
- */
- return;
- }
-
/* copy vsyscall data */
vdso_write_begin(vdata);
vdata[CS_HRES_COARSE].clock_mode = __arch_get_clock_mode(tk);
vdata[CS_RAW].clock_mode = __arch_get_clock_mode(tk);
+ /* CLOCK_REALTIME also required for time() */
+ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME];
+ vdso_ts->sec = tk->xtime_sec;
+ vdso_ts->nsec = tk->tkr_mono.xtime_nsec;
+
/* CLOCK_REALTIME_COARSE */
vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE];
vdso_ts->sec = tk->xtime_sec;
@@ -110,7 +96,18 @@ void update_vsyscall(struct timekeeper *tk)
nsec = nsec + tk->wall_to_monotonic.tv_nsec;
vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec);
- update_vdso_data(vdata, tk);
+ /*
+ * Read without the seqlock held by clock_getres().
+ * Note: No need to have a second copy.
+ */
+ WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution);
+
+ /*
+ * Architectures can opt out of updating the high resolution part
+ * of the VDSO.
+ */
+ if (__arch_update_vdso_data())
+ update_vdso_data(vdata, tk);
__arch_update_vsyscall(vdata, tk);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index e5ef4ae9edb5..19e793aa441a 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -703,6 +703,7 @@ struct send_signal_irq_work {
struct irq_work irq_work;
struct task_struct *task;
u32 sig;
+ enum pid_type type;
};
static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work);
@@ -712,10 +713,10 @@ static void do_bpf_send_signal(struct irq_work *entry)
struct send_signal_irq_work *work;
work = container_of(entry, struct send_signal_irq_work, irq_work);
- group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, PIDTYPE_TGID);
+ group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, work->type);
}
-BPF_CALL_1(bpf_send_signal, u32, sig)
+static int bpf_send_signal_common(u32 sig, enum pid_type type)
{
struct send_signal_irq_work *work = NULL;
@@ -748,11 +749,17 @@ BPF_CALL_1(bpf_send_signal, u32, sig)
*/
work->task = current;
work->sig = sig;
+ work->type = type;
irq_work_queue(&work->irq_work);
return 0;
}
- return group_send_sig_info(sig, SEND_SIG_PRIV, current, PIDTYPE_TGID);
+ return group_send_sig_info(sig, SEND_SIG_PRIV, current, type);
+}
+
+BPF_CALL_1(bpf_send_signal, u32, sig)
+{
+ return bpf_send_signal_common(sig, PIDTYPE_TGID);
}
static const struct bpf_func_proto bpf_send_signal_proto = {
@@ -762,6 +769,18 @@ static const struct bpf_func_proto bpf_send_signal_proto = {
.arg1_type = ARG_ANYTHING,
};
+BPF_CALL_1(bpf_send_signal_thread, u32, sig)
+{
+ return bpf_send_signal_common(sig, PIDTYPE_PID);
+}
+
+static const struct bpf_func_proto bpf_send_signal_thread_proto = {
+ .func = bpf_send_signal_thread,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+};
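
From a BPF program, the new helper is invoked exactly like bpf_send_signal(), but the signal is queued for the calling thread (PIDTYPE_PID) rather than the whole thread group. A hedged libbpf-style sketch - the section name, kprobe target, and header set are assumptions; only the helper itself comes from this hunk:

    // SPDX-License-Identifier: GPL-2.0
    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>	/* assumes a libbpf new enough to declare the helper */

    char LICENSE[] SEC("license") = "GPL";

    SEC("kprobe/__x64_sys_nanosleep")	/* assumed x86-64 syscall symbol */
    int notify_thread(void *ctx)
    {
    	/* 10 == SIGUSR1 on x86-64; delivered to the current thread only. */
    	bpf_send_signal_thread(10);
    	return 0;
    }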
+
static const struct bpf_func_proto *
tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -822,6 +841,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
#endif
case BPF_FUNC_send_signal:
return &bpf_send_signal_proto;
+ case BPF_FUNC_send_signal_thread:
+ return &bpf_send_signal_thread_proto;
default:
return NULL;
}
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 67e0c462b059..1af321dec0f1 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -96,11 +96,34 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func,
return 0;
}
+/*
+ * Not all archs define MCOUNT_INSN_SIZE which is used to look for direct
+ * functions. But those archs currently don't support direct functions
+ * anyway, and ftrace_find_rec_direct() is just a stub for them.
+ * Define MCOUNT_INSN_SIZE to keep those archs compiling.
+ */
+#ifndef MCOUNT_INSN_SIZE
+/* Make sure this only works without direct calls */
+# ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+# error MCOUNT_INSN_SIZE not defined with direct calls enabled
+# endif
+# define MCOUNT_INSN_SIZE 0
+#endif
+
int function_graph_enter(unsigned long ret, unsigned long func,
unsigned long frame_pointer, unsigned long *retp)
{
struct ftrace_graph_ent trace;
+ /*
+ * Skip graph tracing if the return location is served by a direct
+ * trampoline, since the call sequence and return addresses become
+ * unpredictable. For example, a BPF trampoline may call the original
+ * function and may skip frames depending on the type of BPF programs
+ * attached.
+ */
+ if (ftrace_direct_func_count &&
+ ftrace_find_rec_direct(ret - MCOUNT_INSN_SIZE))
+ return -EBUSY;
trace.func = func;
trace.depth = ++current->curr_ret_depth;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 74439ab5c2b6..9bf1f2cd515e 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -526,8 +526,7 @@ static int function_stat_show(struct seq_file *m, void *v)
}
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- avg = rec->time;
- do_div(avg, rec->counter);
+ avg = div64_ul(rec->time, rec->counter);
if (tracing_thresh && (avg < tracing_thresh))
goto out;
#endif
@@ -553,7 +552,8 @@ static int function_stat_show(struct seq_file *m, void *v)
* Divide only 1000 for ns^2 -> us^2 conversion.
* trace_print_graph_duration will divide 1000 again.
*/
- do_div(stddev, rec->counter * (rec->counter - 1) * 1000);
+ stddev = div64_ul(stddev,
+ rec->counter * (rec->counter - 1) * 1000);
}
trace_seq_init(&s);
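
The do_div() -> div64_ul() switch matters because do_div() treats its divisor as a 32-bit value: a large rec->counter, or the rec->counter * (rec->counter - 1) * 1000 product above, gets truncated - in the worst case to zero, turning the division into a crash. A stand-alone illustration of the truncation (the counter value is made up):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
    	uint64_t counter = 100000;	/* hypothetical hit count */
    	uint64_t divisor = counter * (counter - 1) * 1000;

    	printf("intended divisor: %llu\n", (unsigned long long)divisor);
    	printf("as a 32-bit value (what do_div() would use): %u\n",
    	       (uint32_t)divisor);
    	return 0;
    }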
@@ -2364,7 +2364,7 @@ int ftrace_direct_func_count;
* Search the direct_functions hash to see if the given instruction pointer
* has a direct caller attached to it.
*/
-static unsigned long find_rec_direct(unsigned long ip)
+unsigned long ftrace_find_rec_direct(unsigned long ip)
{
struct ftrace_func_entry *entry;
@@ -2380,7 +2380,7 @@ static void call_direct_funcs(unsigned long ip, unsigned long pip,
{
unsigned long addr;
- addr = find_rec_direct(ip);
+ addr = ftrace_find_rec_direct(ip);
if (!addr)
return;
@@ -2393,11 +2393,6 @@ struct ftrace_ops direct_ops = {
| FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS
| FTRACE_OPS_FL_PERMANENT,
};
-#else
-static inline unsigned long find_rec_direct(unsigned long ip)
-{
- return 0;
-}
#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
/**
@@ -2417,7 +2412,7 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)
if ((rec->flags & FTRACE_FL_DIRECT) &&
(ftrace_rec_count(rec) == 1)) {
- addr = find_rec_direct(rec->ip);
+ addr = ftrace_find_rec_direct(rec->ip);
if (addr)
return addr;
WARN_ON_ONCE(1);
@@ -2458,7 +2453,7 @@ unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec)
/* Direct calls take precedence over trampolines */
if (rec->flags & FTRACE_FL_DIRECT_EN) {
- addr = find_rec_direct(rec->ip);
+ addr = ftrace_find_rec_direct(rec->ip);
if (addr)
return addr;
WARN_ON_ONCE(1);
@@ -3604,7 +3599,7 @@ static int t_show(struct seq_file *m, void *v)
if (rec->flags & FTRACE_FL_DIRECT) {
unsigned long direct;
- direct = find_rec_direct(rec->ip);
+ direct = ftrace_find_rec_direct(rec->ip);
if (direct)
seq_printf(m, "\n\tdirect-->%pS", (void *)direct);
}
@@ -5008,7 +5003,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr)
mutex_lock(&direct_mutex);
/* See if there's a direct function at @ip already */
- if (find_rec_direct(ip))
+ if (ftrace_find_rec_direct(ip))
goto out_unlock;
ret = -ENODEV;
@@ -5027,7 +5022,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr)
if (ip != rec->ip) {
ip = rec->ip;
/* Need to check this ip for a direct. */
- if (find_rec_direct(ip))
+ if (ftrace_find_rec_direct(ip))
goto out_unlock;
}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 4bf050fcfe3b..3f655371eaf6 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5070,7 +5070,7 @@ static __init int test_ringbuffer(void)
int ret = 0;
if (security_locked_down(LOCKDOWN_TRACEFS)) {
- pr_warning("Lockdown is enabled, skipping ring buffer tests\n");
+ pr_warn("Lockdown is enabled, skipping ring buffer tests\n");
return 0;
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 23459d53d576..5b6ee4aadc26 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1889,7 +1889,7 @@ int __init register_tracer(struct tracer *type)
}
if (security_locked_down(LOCKDOWN_TRACEFS)) {
- pr_warning("Can not register tracer %s due to lockdown\n",
+ pr_warn("Can not register tracer %s due to lockdown\n",
type->name);
return -EPERM;
}
@@ -4685,6 +4685,10 @@ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set)
int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
{
+ if ((mask == TRACE_ITER_RECORD_TGID) ||
+ (mask == TRACE_ITER_RECORD_CMD))
+ lockdep_assert_held(&event_mutex);
+
/* do nothing if flag is already set */
if (!!(tr->trace_flags & mask) == !!enabled)
return 0;
@@ -4752,6 +4756,7 @@ static int trace_set_options(struct trace_array *tr, char *option)
cmp += len;
+ mutex_lock(&event_mutex);
mutex_lock(&trace_types_lock);
ret = match_string(trace_options, -1, cmp);
@@ -4762,6 +4767,7 @@ static int trace_set_options(struct trace_array *tr, char *option)
ret = set_tracer_flag(tr, 1 << ret, !neg);
mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
/*
* If the first trailing whitespace is replaced with '\0' by strstrip,
@@ -8076,9 +8082,11 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (val != 0 && val != 1)
return -EINVAL;
+ mutex_lock(&event_mutex);
mutex_lock(&trace_types_lock);
ret = set_tracer_flag(tr, 1 << index, val);
mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
if (ret < 0)
return ret;
@@ -8796,7 +8804,7 @@ struct dentry *tracing_init_dentry(void)
struct trace_array *tr = &global_trace;
if (security_locked_down(LOCKDOWN_TRACEFS)) {
- pr_warning("Tracing disabled due to lockdown\n");
+ pr_warn("Tracing disabled due to lockdown\n");
return ERR_PTR(-EPERM);
}
@@ -9244,7 +9252,7 @@ __init static int tracer_alloc_buffers(void)
if (security_locked_down(LOCKDOWN_TRACEFS)) {
- pr_warning("Tracing disabled due to lockdown\n");
+ pr_warn("Tracing disabled due to lockdown\n");
return -EPERM;
}
@@ -9412,6 +9420,11 @@ __init static int tracing_set_default_clock(void)
{
/* sched_clock_stable() is determined in late_initcall */
if (!trace_boot_clock && !sched_clock_stable()) {
+ if (security_locked_down(LOCKDOWN_TRACEFS)) {
+ pr_warn("Can not set tracing clock due to lockdown\n");
+ return -EPERM;
+ }
+
printk(KERN_WARNING
"Unstable clock detected, switching default tracing clock to \"global\"\n"
"If you want to keep using the local clock, then add:\n"
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 63bf60f79398..a98dce1b3334 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -52,6 +52,9 @@ enum trace_type {
#undef __field
#define __field(type, item) type item;
+#undef __field_fn
+#define __field_fn(type, item) type item;
+
#undef __field_struct
#define __field_struct(type, item) __field(type, item)
@@ -71,26 +74,22 @@ enum trace_type {
#define F_STRUCT(args...) args
#undef FTRACE_ENTRY
-#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \
+#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
struct struct_name { \
struct trace_entry ent; \
tstruct \
}
#undef FTRACE_ENTRY_DUP
-#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk, filter)
+#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk)
#undef FTRACE_ENTRY_REG
-#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \
- filter, regfn) \
- FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
- filter)
+#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, regfn) \
+ FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print))
#undef FTRACE_ENTRY_PACKED
-#define FTRACE_ENTRY_PACKED(name, struct_name, id, tstruct, print, \
- filter) \
- FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
- filter) __packed
+#define FTRACE_ENTRY_PACKED(name, struct_name, id, tstruct, print) \
+ FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) __packed
#include "trace_entries.h"
@@ -1917,17 +1916,15 @@ extern void tracing_log_err(struct trace_array *tr,
#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str, strlen(str))
#undef FTRACE_ENTRY
-#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \
+#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \
extern struct trace_event_call \
__aligned(4) event_##call;
#undef FTRACE_ENTRY_DUP
-#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \
- FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \
- filter)
+#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \
+ FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
#undef FTRACE_ENTRY_PACKED
-#define FTRACE_ENTRY_PACKED(call, struct_name, id, tstruct, print, filter) \
- FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \
- filter)
+#define FTRACE_ENTRY_PACKED(call, struct_name, id, tstruct, print) \
+ FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
#include "trace_entries.h"
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index fc8e97328e54..3e9d81608284 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -61,15 +61,13 @@ FTRACE_ENTRY_REG(function, ftrace_entry,
TRACE_FN,
F_STRUCT(
- __field( unsigned long, ip )
- __field( unsigned long, parent_ip )
+ __field_fn( unsigned long, ip )
+ __field_fn( unsigned long, parent_ip )
),
F_printk(" %ps <-- %ps",
(void *)__entry->ip, (void *)__entry->parent_ip),
- FILTER_TRACE_FN,
-
perf_ftrace_event_register
);
@@ -84,9 +82,7 @@ FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry,
__field_desc( int, graph_ent, depth )
),
- F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth),
-
- FILTER_OTHER
+ F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth)
);
/* Function return entry */
@@ -97,18 +93,16 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
F_STRUCT(
__field_struct( struct ftrace_graph_ret, ret )
__field_desc( unsigned long, ret, func )
+ __field_desc( unsigned long, ret, overrun )
__field_desc( unsigned long long, ret, calltime)
__field_desc( unsigned long long, ret, rettime )
- __field_desc( unsigned long, ret, overrun )
__field_desc( int, ret, depth )
),
F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d",
(void *)__entry->func, __entry->depth,
__entry->calltime, __entry->rettime,
- __entry->depth),
-
- FILTER_OTHER
+ __entry->depth)
);
/*
@@ -137,9 +131,7 @@ FTRACE_ENTRY(context_switch, ctx_switch_entry,
F_printk("%u:%u:%u ==> %u:%u:%u [%03u]",
__entry->prev_pid, __entry->prev_prio, __entry->prev_state,
__entry->next_pid, __entry->next_prio, __entry->next_state,
- __entry->next_cpu),
-
- FILTER_OTHER
+ __entry->next_cpu)
);
/*
@@ -157,9 +149,7 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]",
__entry->prev_pid, __entry->prev_prio, __entry->prev_state,
__entry->next_pid, __entry->next_prio, __entry->next_state,
- __entry->next_cpu),
-
- FILTER_OTHER
+ __entry->next_cpu)
);
/*
@@ -183,9 +173,7 @@ FTRACE_ENTRY(kernel_stack, stack_entry,
(void *)__entry->caller[0], (void *)__entry->caller[1],
(void *)__entry->caller[2], (void *)__entry->caller[3],
(void *)__entry->caller[4], (void *)__entry->caller[5],
- (void *)__entry->caller[6], (void *)__entry->caller[7]),
-
- FILTER_OTHER
+ (void *)__entry->caller[6], (void *)__entry->caller[7])
);
FTRACE_ENTRY(user_stack, userstack_entry,
@@ -203,9 +191,7 @@ FTRACE_ENTRY(user_stack, userstack_entry,
(void *)__entry->caller[0], (void *)__entry->caller[1],
(void *)__entry->caller[2], (void *)__entry->caller[3],
(void *)__entry->caller[4], (void *)__entry->caller[5],
- (void *)__entry->caller[6], (void *)__entry->caller[7]),
-
- FILTER_OTHER
+ (void *)__entry->caller[6], (void *)__entry->caller[7])
);
/*
@@ -222,9 +208,7 @@ FTRACE_ENTRY(bprint, bprint_entry,
),
F_printk("%ps: %s",
- (void *)__entry->ip, __entry->fmt),
-
- FILTER_OTHER
+ (void *)__entry->ip, __entry->fmt)
);
FTRACE_ENTRY_REG(print, print_entry,
@@ -239,8 +223,6 @@ FTRACE_ENTRY_REG(print, print_entry,
F_printk("%ps: %s",
(void *)__entry->ip, __entry->buf),
- FILTER_OTHER,
-
ftrace_event_register
);
@@ -254,9 +236,7 @@ FTRACE_ENTRY(raw_data, raw_data_entry,
),
F_printk("id:%04x %08x",
- __entry->id, (int)__entry->buf[0]),
-
- FILTER_OTHER
+ __entry->id, (int)__entry->buf[0])
);
FTRACE_ENTRY(bputs, bputs_entry,
@@ -269,9 +249,7 @@ FTRACE_ENTRY(bputs, bputs_entry,
),
F_printk("%ps: %s",
- (void *)__entry->ip, __entry->str),
-
- FILTER_OTHER
+ (void *)__entry->ip, __entry->str)
);
FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw,
@@ -283,16 +261,14 @@ FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw,
__field_desc( resource_size_t, rw, phys )
__field_desc( unsigned long, rw, value )
__field_desc( unsigned long, rw, pc )
- __field_desc( int, rw, map_id )
+ __field_desc( int, rw, map_id )
__field_desc( unsigned char, rw, opcode )
__field_desc( unsigned char, rw, width )
),
F_printk("%lx %lx %lx %d %x %x",
(unsigned long)__entry->phys, __entry->value, __entry->pc,
- __entry->map_id, __entry->opcode, __entry->width),
-
- FILTER_OTHER
+ __entry->map_id, __entry->opcode, __entry->width)
);
FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
@@ -304,15 +280,13 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
__field_desc( resource_size_t, map, phys )
__field_desc( unsigned long, map, virt )
__field_desc( unsigned long, map, len )
- __field_desc( int, map, map_id )
+ __field_desc( int, map, map_id )
__field_desc( unsigned char, map, opcode )
),
F_printk("%lx %lx %lx %d %x",
(unsigned long)__entry->phys, __entry->virt, __entry->len,
- __entry->map_id, __entry->opcode),
-
- FILTER_OTHER
+ __entry->map_id, __entry->opcode)
);
@@ -334,9 +308,7 @@ FTRACE_ENTRY(branch, trace_branch,
F_printk("%u:%s:%s (%u)%s",
__entry->line,
__entry->func, __entry->file, __entry->correct,
- __entry->constant ? " CONSTANT" : ""),
-
- FILTER_OTHER
+ __entry->constant ? " CONSTANT" : "")
);
@@ -362,7 +334,5 @@ FTRACE_ENTRY(hwlat, hwlat_entry,
__entry->duration,
__entry->outer_duration,
__entry->nmi_total_ts,
- __entry->nmi_count),
-
- FILTER_OTHER
+ __entry->nmi_count)
);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index c6de3cebc127..c8622a44d300 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -24,6 +24,7 @@
#include <linux/delay.h>
#include <trace/events/sched.h>
+#include <trace/syscall.h>
#include <asm/setup.h>
@@ -320,7 +321,8 @@ void trace_event_enable_cmd_record(bool enable)
struct trace_event_file *file;
struct trace_array *tr;
- mutex_lock(&event_mutex);
+ lockdep_assert_held(&event_mutex);
+
do_for_each_event_file(tr, file) {
if (!(file->flags & EVENT_FILE_FL_ENABLED))
@@ -334,7 +336,6 @@ void trace_event_enable_cmd_record(bool enable)
clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
}
} while_for_each_event_file();
- mutex_unlock(&event_mutex);
}
void trace_event_enable_tgid_record(bool enable)
@@ -342,7 +343,8 @@ void trace_event_enable_tgid_record(bool enable)
struct trace_event_file *file;
struct trace_array *tr;
- mutex_lock(&event_mutex);
+ lockdep_assert_held(&event_mutex);
+
do_for_each_event_file(tr, file) {
if (!(file->flags & EVENT_FILE_FL_ENABLED))
continue;
@@ -356,7 +358,6 @@ void trace_event_enable_tgid_record(bool enable)
&file->flags);
}
} while_for_each_event_file();
- mutex_unlock(&event_mutex);
}
static int __ftrace_event_enable_disable(struct trace_event_file *file,
@@ -2017,7 +2018,24 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
*/
head = trace_get_fields(call);
if (list_empty(head)) {
- ret = call->class->define_fields(call);
+ struct trace_event_fields *field = call->class->fields_array;
+ unsigned int offset = sizeof(struct trace_entry);
+
+ for (; field->type; field++) {
+ if (field->type == TRACE_FUNCTION_TYPE) {
+ ret = field->define_fields(call);
+ break;
+ }
+
+ offset = ALIGN(offset, field->align);
+ ret = trace_define_field(call, field->type, field->name,
+ offset, field->size,
+ field->is_signed, field->filter_type);
+ if (ret)
+ break;
+
+ offset += field->size;
+ }
if (ret < 0) {
pr_warn("Could not initialize trace point events/%s\n",
name);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index c9a74f82b14a..bf44f6bbd0c3 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1662,7 +1662,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir,
parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0);
return -EINVAL;
fail_mem:
- kfree(filter);
+ __free_filter(filter);
/* If any call succeeded, we still need to sync */
if (!fail)
tracepoint_synchronize_unregister();
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index f49d1a36d3ae..f2896d13001b 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -116,6 +116,7 @@ struct hist_field {
struct ftrace_event_field *field;
unsigned long flags;
hist_field_fn_t fn;
+ unsigned int ref;
unsigned int size;
unsigned int offset;
unsigned int is_signed;
@@ -911,7 +912,26 @@ static notrace void trace_event_raw_event_synth(void *__data,
strscpy(str_field, str_val, STR_VAR_LEN_MAX);
n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
} else {
- entry->fields[n_u64] = var_ref_vals[var_ref_idx + i];
+ struct synth_field *field = event->fields[i];
+ u64 val = var_ref_vals[var_ref_idx + i];
+
+ switch (field->size) {
+ case 1:
+ *(u8 *)&entry->fields[n_u64] = (u8)val;
+ break;
+
+ case 2:
+ *(u16 *)&entry->fields[n_u64] = (u16)val;
+ break;
+
+ case 4:
+ *(u32 *)&entry->fields[n_u64] = (u32)val;
+ break;
+
+ default:
+ entry->fields[n_u64] = val;
+ break;
+ }
n_u64++;
}
}
@@ -1135,6 +1155,12 @@ static struct synth_event *find_synth_event(const char *name)
return NULL;
}
+static struct trace_event_fields synth_event_fields_array[] = {
+ { .type = TRACE_FUNCTION_TYPE,
+ .define_fields = synth_event_define_fields },
+ {}
+};
+
static int register_synth_event(struct synth_event *event)
{
struct trace_event_call *call = &event->call;
@@ -1156,7 +1182,7 @@ static int register_synth_event(struct synth_event *event)
INIT_LIST_HEAD(&call->class->fields);
call->event.funcs = &synth_event_funcs;
- call->class->define_fields = synth_event_define_fields;
+ call->class->fields_array = synth_event_fields_array;
ret = register_trace_event(&call->event);
if (!ret) {
@@ -1747,11 +1773,13 @@ static struct hist_field *find_var(struct hist_trigger_data *hist_data,
struct event_trigger_data *test;
struct hist_field *hist_field;
+ lockdep_assert_held(&event_mutex);
+
hist_field = find_var_field(hist_data, var_name);
if (hist_field)
return hist_field;
- list_for_each_entry_rcu(test, &file->triggers, list) {
+ list_for_each_entry(test, &file->triggers, list) {
if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
test_data = test->private_data;
hist_field = find_var_field(test_data, var_name);
@@ -1801,7 +1829,9 @@ static struct hist_field *find_file_var(struct trace_event_file *file,
struct event_trigger_data *test;
struct hist_field *hist_field;
- list_for_each_entry_rcu(test, &file->triggers, list) {
+ lockdep_assert_held(&event_mutex);
+
+ list_for_each_entry(test, &file->triggers, list) {
if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
test_data = test->private_data;
hist_field = find_var_field(test_data, var_name);
@@ -2404,8 +2434,16 @@ static int contains_operator(char *str)
return field_op;
}
+static void get_hist_field(struct hist_field *hist_field)
+{
+ hist_field->ref++;
+}
+
static void __destroy_hist_field(struct hist_field *hist_field)
{
+ if (--hist_field->ref > 1)
+ return;
+
kfree(hist_field->var.name);
kfree(hist_field->name);
kfree(hist_field->type);
@@ -2447,6 +2485,8 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
if (!hist_field)
return NULL;
+ hist_field->ref = 1;
+
hist_field->hist_data = hist_data;
if (flags & HIST_FIELD_FL_EXPR || flags & HIST_FIELD_FL_ALIAS)
@@ -2642,6 +2682,17 @@ static struct hist_field *create_var_ref(struct hist_trigger_data *hist_data,
{
unsigned long flags = HIST_FIELD_FL_VAR_REF;
struct hist_field *ref_field;
+ int i;
+
+ /* Check if the variable already exists */
+ for (i = 0; i < hist_data->n_var_refs; i++) {
+ ref_field = hist_data->var_refs[i];
+ if (ref_field->var.idx == var_field->var.idx &&
+ ref_field->var.hist_data == var_field->hist_data) {
+ get_hist_field(ref_field);
+ return ref_field;
+ }
+ }
ref_field = create_hist_field(var_field->hist_data, NULL, flags, NULL);
if (ref_field) {
@@ -3096,7 +3147,9 @@ static char *find_trigger_filter(struct hist_trigger_data *hist_data,
{
struct event_trigger_data *test;
- list_for_each_entry_rcu(test, &file->triggers, list) {
+ lockdep_assert_held(&event_mutex);
+
+ list_for_each_entry(test, &file->triggers, list) {
if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
if (test->private_data == hist_data)
return test->filter_str;
@@ -3147,9 +3200,11 @@ find_compatible_hist(struct hist_trigger_data *target_hist_data,
struct event_trigger_data *test;
unsigned int n_keys;
+ lockdep_assert_held(&event_mutex);
+
n_keys = target_hist_data->n_fields - target_hist_data->n_vals;
- list_for_each_entry_rcu(test, &file->triggers, list) {
+ list_for_each_entry(test, &file->triggers, list) {
if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
hist_data = test->private_data;
@@ -5509,7 +5564,7 @@ static int hist_show(struct seq_file *m, void *v)
goto out_unlock;
}
- list_for_each_entry_rcu(data, &event_file->triggers, list) {
+ list_for_each_entry(data, &event_file->triggers, list) {
if (data->cmd_ops->trigger_type == ETT_EVENT_HIST)
hist_trigger_show(m, data, n++);
}
@@ -5902,7 +5957,9 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
if (hist_data->attrs->name && !named_data)
goto new;
- list_for_each_entry_rcu(test, &file->triggers, list) {
+ lockdep_assert_held(&event_mutex);
+
+ list_for_each_entry(test, &file->triggers, list) {
if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
if (!hist_trigger_match(data, test, named_data, false))
continue;
@@ -5986,10 +6043,12 @@ static bool have_hist_trigger_match(struct event_trigger_data *data,
struct event_trigger_data *test, *named_data = NULL;
bool match = false;
+ lockdep_assert_held(&event_mutex);
+
if (hist_data->attrs->name)
named_data = find_named_trigger(hist_data->attrs->name);
- list_for_each_entry_rcu(test, &file->triggers, list) {
+ list_for_each_entry(test, &file->triggers, list) {
if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
if (hist_trigger_match(data, test, named_data, false)) {
match = true;
@@ -6007,10 +6066,12 @@ static bool hist_trigger_check_refs(struct event_trigger_data *data,
struct hist_trigger_data *hist_data = data->private_data;
struct event_trigger_data *test, *named_data = NULL;
+ lockdep_assert_held(&event_mutex);
+
if (hist_data->attrs->name)
named_data = find_named_trigger(hist_data->attrs->name);
- list_for_each_entry_rcu(test, &file->triggers, list) {
+ list_for_each_entry(test, &file->triggers, list) {
if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
if (!hist_trigger_match(data, test, named_data, false))
continue;
@@ -6032,10 +6093,12 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops,
struct event_trigger_data *test, *named_data = NULL;
bool unregistered = false;
+ lockdep_assert_held(&event_mutex);
+
if (hist_data->attrs->name)
named_data = find_named_trigger(hist_data->attrs->name);
- list_for_each_entry_rcu(test, &file->triggers, list) {
+ list_for_each_entry(test, &file->triggers, list) {
if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
if (!hist_trigger_match(data, test, named_data, false))
continue;
@@ -6061,7 +6124,9 @@ static bool hist_file_check_refs(struct trace_event_file *file)
struct hist_trigger_data *hist_data;
struct event_trigger_data *test;
- list_for_each_entry_rcu(test, &file->triggers, list) {
+ lockdep_assert_held(&event_mutex);
+
+ list_for_each_entry(test, &file->triggers, list) {
if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
hist_data = test->private_data;
if (check_var_refs(hist_data))
@@ -6304,7 +6369,8 @@ hist_enable_trigger(struct event_trigger_data *data, void *rec,
struct enable_trigger_data *enable_data = data->private_data;
struct event_trigger_data *test;
- list_for_each_entry_rcu(test, &enable_data->file->triggers, list) {
+ list_for_each_entry_rcu(test, &enable_data->file->triggers, list,
+ lockdep_is_held(&event_mutex)) {
if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
if (enable_data->enable)
test->paused = false;
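
Most hunks in this file replace list_for_each_entry_rcu() with a plain list_for_each_entry() preceded by lockdep_assert_held(&event_mutex), because those walks only ever run with the mutex held; the trigger callback just above, which can also run from a tracing context, keeps the RCU traversal and passes the optional lockdep_is_held() condition instead. A small kernel-style fragment showing the two shapes, with invented demo_* names (it only builds in a kernel tree):

#include <linux/types.h>
#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(demo_mutex);	/* stand-in for event_mutex */
static LIST_HEAD(demo_triggers);

struct demo_trigger {
	struct list_head list;
	int type;
};

/* Configuration-side walk: the caller must hold demo_mutex, so a plain
 * list_for_each_entry() is enough and lockdep verifies the requirement. */
static struct demo_trigger *demo_find(int type)
{
	struct demo_trigger *t;

	lockdep_assert_held(&demo_mutex);

	list_for_each_entry(t, &demo_triggers, list)
		if (t->type == type)
			return t;
	return NULL;
}

/* Walk that may run either under rcu_read_lock() or with the mutex held:
 * the extra cond argument keeps the RCU list-debugging checks quiet when
 * the traversal is legitimately protected by the mutex instead. */
static bool demo_any(int type)
{
	struct demo_trigger *t;

	list_for_each_entry_rcu(t, &demo_triggers, list,
				lockdep_is_held(&demo_mutex))
		if (t->type == type)
			return true;
	return false;
}
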
diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c
index d43710718ee5..22bcf7c51d1e 100644
--- a/kernel/trace/trace_events_inject.c
+++ b/kernel/trace/trace_events_inject.c
@@ -17,12 +17,10 @@ static int
trace_inject_entry(struct trace_event_file *file, void *rec, int len)
{
struct trace_event_buffer fbuffer;
- struct ring_buffer *buffer;
int written = 0;
void *entry;
rcu_read_lock_sched();
- buffer = file->tr->trace_buffer.buffer;
entry = trace_event_buffer_reserve(&fbuffer, file, len);
if (entry) {
memcpy(entry, rec, len);
@@ -197,7 +195,7 @@ static int parse_entry(char *str, struct trace_event_call *call, void **pentry)
unsigned long irq_flags;
void *entry = NULL;
int entry_size;
- u64 val;
+ u64 val = 0;
int len;
entry = trace_alloc_entry(call, &entry_size);
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 2cd53ca21b51..40106fff06a4 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -501,7 +501,9 @@ void update_cond_flag(struct trace_event_file *file)
struct event_trigger_data *data;
bool set_cond = false;
- list_for_each_entry_rcu(data, &file->triggers, list) {
+ lockdep_assert_held(&event_mutex);
+
+ list_for_each_entry(data, &file->triggers, list) {
if (data->filter || event_command_post_trigger(data->cmd_ops) ||
event_command_needs_rec(data->cmd_ops)) {
set_cond = true;
@@ -536,7 +538,9 @@ static int register_trigger(char *glob, struct event_trigger_ops *ops,
struct event_trigger_data *test;
int ret = 0;
- list_for_each_entry_rcu(test, &file->triggers, list) {
+ lockdep_assert_held(&event_mutex);
+
+ list_for_each_entry(test, &file->triggers, list) {
if (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type) {
ret = -EEXIST;
goto out;
@@ -581,7 +585,9 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops,
struct event_trigger_data *data;
bool unregistered = false;
- list_for_each_entry_rcu(data, &file->triggers, list) {
+ lockdep_assert_held(&event_mutex);
+
+ list_for_each_entry(data, &file->triggers, list) {
if (data->cmd_ops->trigger_type == test->cmd_ops->trigger_type) {
unregistered = true;
list_del_rcu(&data->list);
@@ -1497,7 +1503,9 @@ int event_enable_register_trigger(char *glob,
struct event_trigger_data *test;
int ret = 0;
- list_for_each_entry_rcu(test, &file->triggers, list) {
+ lockdep_assert_held(&event_mutex);
+
+ list_for_each_entry(test, &file->triggers, list) {
test_enable_data = test->private_data;
if (test_enable_data &&
(test->cmd_ops->trigger_type ==
@@ -1537,7 +1545,9 @@ void event_enable_unregister_trigger(char *glob,
struct event_trigger_data *data;
bool unregistered = false;
- list_for_each_entry_rcu(data, &file->triggers, list) {
+ lockdep_assert_held(&event_mutex);
+
+ list_for_each_entry(data, &file->triggers, list) {
enable_data = data->private_data;
if (enable_data &&
(data->cmd_ops->trigger_type ==
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 2e6d2e9741cc..77ce5a3b6773 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -29,10 +29,8 @@ static int ftrace_event_register(struct trace_event_call *call,
* function and thus become accessible via perf.
*/
#undef FTRACE_ENTRY_REG
-#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \
- filter, regfn) \
- FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
- filter)
+#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, regfn) \
+ FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print))
/* not needed for this file */
#undef __field_struct
@@ -41,6 +39,9 @@ static int ftrace_event_register(struct trace_event_call *call,
#undef __field
#define __field(type, item) type item;
+#undef __field_fn
+#define __field_fn(type, item) type item;
+
#undef __field_desc
#define __field_desc(type, container, item) type item;
@@ -60,7 +61,7 @@ static int ftrace_event_register(struct trace_event_call *call,
#define F_printk(fmt, args...) fmt, args
#undef FTRACE_ENTRY
-#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \
+#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
struct ____ftrace_##name { \
tstruct \
}; \
@@ -73,76 +74,46 @@ static void __always_unused ____ftrace_check_##name(void) \
}
#undef FTRACE_ENTRY_DUP
-#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print, filter) \
- FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
- filter)
+#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \
+ FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print))
#include "trace_entries.h"
+#undef __field_ext
+#define __field_ext(_type, _item, _filter_type) { \
+ .type = #_type, .name = #_item, \
+ .size = sizeof(_type), .align = __alignof__(_type), \
+ is_signed_type(_type), .filter_type = _filter_type },
+
#undef __field
-#define __field(type, item) \
- ret = trace_define_field(event_call, #type, #item, \
- offsetof(typeof(field), item), \
- sizeof(field.item), \
- is_signed_type(type), filter_type); \
- if (ret) \
- return ret;
+#define __field(_type, _item) __field_ext(_type, _item, FILTER_OTHER)
+
+#undef __field_fn
+#define __field_fn(_type, _item) __field_ext(_type, _item, FILTER_TRACE_FN)
#undef __field_desc
-#define __field_desc(type, container, item) \
- ret = trace_define_field(event_call, #type, #item, \
- offsetof(typeof(field), \
- container.item), \
- sizeof(field.container.item), \
- is_signed_type(type), filter_type); \
- if (ret) \
- return ret;
+#define __field_desc(_type, _container, _item) __field_ext(_type, _item, FILTER_OTHER)
#undef __array
-#define __array(type, item, len) \
- do { \
- char *type_str = #type"["__stringify(len)"]"; \
- BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
- ret = trace_define_field(event_call, type_str, #item, \
- offsetof(typeof(field), item), \
- sizeof(field.item), \
- is_signed_type(type), filter_type); \
- if (ret) \
- return ret; \
- } while (0);
+#define __array(_type, _item, _len) { \
+ .type = #_type"["__stringify(_len)"]", .name = #_item, \
+ .size = sizeof(_type[_len]), .align = __alignof__(_type), \
+ is_signed_type(_type), .filter_type = FILTER_OTHER },
#undef __array_desc
-#define __array_desc(type, container, item, len) \
- BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
- ret = trace_define_field(event_call, #type "[" #len "]", #item, \
- offsetof(typeof(field), \
- container.item), \
- sizeof(field.container.item), \
- is_signed_type(type), filter_type); \
- if (ret) \
- return ret;
+#define __array_desc(_type, _container, _item, _len) __array(_type, _item, _len)
#undef __dynamic_array
-#define __dynamic_array(type, item) \
- ret = trace_define_field(event_call, #type "[]", #item, \
- offsetof(typeof(field), item), \
- 0, is_signed_type(type), filter_type);\
- if (ret) \
- return ret;
+#define __dynamic_array(_type, _item) { \
+ .type = #_type "[]", .name = #_item, \
+ .size = 0, .align = __alignof__(_type), \
+ is_signed_type(_type), .filter_type = FILTER_OTHER },
#undef FTRACE_ENTRY
-#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \
-static int __init \
-ftrace_define_fields_##name(struct trace_event_call *event_call) \
-{ \
- struct struct_name field; \
- int ret; \
- int filter_type = filter; \
- \
- tstruct; \
- \
- return ret; \
-}
+#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
+static struct trace_event_fields ftrace_event_fields_##name[] = { \
+ tstruct \
+ {} };
#include "trace_entries.h"
@@ -152,6 +123,9 @@ ftrace_define_fields_##name(struct trace_event_call *event_call) \
#undef __field
#define __field(type, item)
+#undef __field_fn
+#define __field_fn(type, item)
+
#undef __field_desc
#define __field_desc(type, container, item)
@@ -168,12 +142,10 @@ ftrace_define_fields_##name(struct trace_event_call *event_call) \
#define F_printk(fmt, args...) __stringify(fmt) ", " __stringify(args)
#undef FTRACE_ENTRY_REG
-#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\
- regfn) \
- \
+#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, regfn) \
static struct trace_event_class __refdata event_class_ftrace_##call = { \
.system = __stringify(TRACE_SYSTEM), \
- .define_fields = ftrace_define_fields_##call, \
+ .fields_array = ftrace_event_fields_##call, \
.fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
.reg = regfn, \
}; \
@@ -191,9 +163,9 @@ static struct trace_event_call __used \
__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
#undef FTRACE_ENTRY
-#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print, filter) \
+#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \
FTRACE_ENTRY_REG(call, struct_name, etype, \
- PARAMS(tstruct), PARAMS(print), filter, NULL)
+ PARAMS(tstruct), PARAMS(print), NULL)
bool ftrace_event_is_function(struct trace_event_call *call)
{
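
The macro rewrite above drops the generated ftrace_define_fields_*() functions, which registered each field with a trace_define_field() call at init time, in favor of static, zero-terminated tables built from designated initializers. A compact plain-C sketch of that data-driven shape, with made-up struct and field names:

#include <stdio.h>

struct field_desc {
	const char *type;
	const char *name;
	int size;
	int align;
	int is_signed;
};

#define FIELD(_type, _item) {						\
	.type = #_type, .name = #_item,					\
	.size = sizeof(_type), .align = __alignof__(_type),		\
	.is_signed = (_type)-1 < (_type)1 }

/* A static, zero-terminated table replaces an init-time function that
 * registered the fields one call at a time. */
static const struct field_desc demo_fields[] = {
	FIELD(unsigned long, ip),
	FIELD(unsigned long, parent_ip),
	{}
};

int main(void)
{
	for (const struct field_desc *f = demo_fields; f->type; f++)
		printf("%s %s: size %d align %d signed %d\n",
		       f->type, f->name, f->size, f->align, f->is_signed);
	return 0;
}
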
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 7f890262c8a3..aa515d578c5b 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -290,7 +290,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
INIT_HLIST_NODE(&tk->rp.kp.hlist);
INIT_LIST_HEAD(&tk->rp.kp.list);
- ret = trace_probe_init(&tk->tp, event, group);
+ ret = trace_probe_init(&tk->tp, event, group, false);
if (ret < 0)
goto error;
@@ -1555,16 +1555,28 @@ static struct trace_event_functions kprobe_funcs = {
.trace = print_kprobe_event
};
+static struct trace_event_fields kretprobe_fields_array[] = {
+ { .type = TRACE_FUNCTION_TYPE,
+ .define_fields = kretprobe_event_define_fields },
+ {}
+};
+
+static struct trace_event_fields kprobe_fields_array[] = {
+ { .type = TRACE_FUNCTION_TYPE,
+ .define_fields = kprobe_event_define_fields },
+ {}
+};
+
static inline void init_trace_event_call(struct trace_kprobe *tk)
{
struct trace_event_call *call = trace_probe_event_call(&tk->tp);
if (trace_kprobe_is_return(tk)) {
call->event.funcs = &kretprobe_funcs;
- call->class->define_fields = kretprobe_event_define_fields;
+ call->class->fields_array = kretprobe_fields_array;
} else {
call->event.funcs = &kprobe_funcs;
- call->class->define_fields = kprobe_event_define_fields;
+ call->class->fields_array = kprobe_fields_array;
}
call->flags = TRACE_EVENT_FL_KPROBE;
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 905b10af5d5c..ab8b6436d53f 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -876,7 +876,8 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
for (i = 0; i < tp->nr_args; i++) {
parg = tp->args + i;
if (parg->count) {
- if (strcmp(parg->type->name, "string") == 0)
+ if ((strcmp(parg->type->name, "string") == 0) ||
+ (strcmp(parg->type->name, "ustring") == 0))
fmt = ", __get_str(%s[%d])";
else
fmt = ", REC->%s[%d]";
@@ -884,7 +885,8 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
pos += snprintf(buf + pos, LEN_OR_ZERO,
fmt, parg->name, j);
} else {
- if (strcmp(parg->type->name, "string") == 0)
+ if ((strcmp(parg->type->name, "string") == 0) ||
+ (strcmp(parg->type->name, "ustring") == 0))
fmt = ", __get_str(%s)";
else
fmt = ", REC->%s";
@@ -984,15 +986,19 @@ void trace_probe_cleanup(struct trace_probe *tp)
}
int trace_probe_init(struct trace_probe *tp, const char *event,
- const char *group)
+ const char *group, bool alloc_filter)
{
struct trace_event_call *call;
+ size_t size = sizeof(struct trace_probe_event);
int ret = 0;
if (!event || !group)
return -EINVAL;
- tp->event = kzalloc(sizeof(struct trace_probe_event), GFP_KERNEL);
+ if (alloc_filter)
+ size += sizeof(struct trace_uprobe_filter);
+
+ tp->event = kzalloc(size, GFP_KERNEL);
if (!tp->event)
return -ENOMEM;
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 4ee703728aec..a0ff9e200ef6 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -223,6 +223,12 @@ struct probe_arg {
const struct fetch_type *type; /* Type of this argument */
};
+struct trace_uprobe_filter {
+ rwlock_t rwlock;
+ int nr_systemwide;
+ struct list_head perf_events;
+};
+
/* Event call and class holder */
struct trace_probe_event {
unsigned int flags; /* For TP_FLAG_* */
@@ -230,6 +236,7 @@ struct trace_probe_event {
struct trace_event_call call;
struct list_head files;
struct list_head probes;
+ struct trace_uprobe_filter filter[0];
};
struct trace_probe {
@@ -322,7 +329,7 @@ static inline bool trace_probe_has_single_file(struct trace_probe *tp)
}
int trace_probe_init(struct trace_probe *tp, const char *event,
- const char *group);
+ const char *group, bool alloc_filter);
void trace_probe_cleanup(struct trace_probe *tp);
int trace_probe_append(struct trace_probe *tp, struct trace_probe *to);
void trace_probe_unlink(struct trace_probe *tp);
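
With struct trace_uprobe_filter declared here, trace_probe_init() can append the filter to the shared trace_probe_event allocation only when the new alloc_filter argument asks for it. A plain-C sketch of that trailing-array idiom, using a C99 flexible array member where the kernel code above uses a zero-length filter[0]:

#include <stdlib.h>

struct demo_filter {
	int nr_systemwide;
};

struct demo_event {
	unsigned int flags;
	/* Present only when requested; contributes no size otherwise. */
	struct demo_filter filter[];
};

/* One allocation covers the event and, optionally, its filter, so
 * event->filter is valid exactly when alloc_filter was passed. */
struct demo_event *demo_event_alloc(int alloc_filter)
{
	size_t size = sizeof(struct demo_event);

	if (alloc_filter)
		size += sizeof(struct demo_filter);

	return calloc(1, size);
}
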
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 5e43b9664eca..617e297f46dc 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -630,7 +630,7 @@ static void start_wakeup_tracer(struct trace_array *tr)
if (ret) {
pr_info("wakeup trace: Couldn't activate tracepoint"
" probe to kernel_sched_migrate_task\n");
- return;
+ goto fail_deprobe_sched_switch;
}
wakeup_reset(tr);
@@ -648,6 +648,8 @@ static void start_wakeup_tracer(struct trace_array *tr)
printk(KERN_ERR "failed to start wakeup tracer\n");
return;
+fail_deprobe_sched_switch:
+ unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
fail_deprobe_wake_new:
unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
fail_deprobe:
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index 344e4c1aa09c..87de6edafd14 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -381,7 +381,7 @@ int trace_seq_hex_dump(struct trace_seq *s, const char *prefix_str,
int prefix_type, int rowsize, int groupsize,
const void *buf, size_t len, bool ascii)
{
- unsigned int save_len = s->seq.len;
+ unsigned int save_len = s->seq.len;
if (s->full)
return 0;
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 4df9a209f7ca..c557f42a9397 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -283,6 +283,11 @@ static void check_stack(unsigned long ip, unsigned long *stack)
local_irq_restore(flags);
}
+/* Some archs may not define MCOUNT_INSN_SIZE */
+#ifndef MCOUNT_INSN_SIZE
+# define MCOUNT_INSN_SIZE 0
+#endif
+
static void
stack_trace_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *pt_regs)
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 16fa218556fa..2978c29d87d4 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -203,11 +203,10 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
extern char *__bad_type_size(void);
-#define SYSCALL_FIELD(type, field, name) \
- sizeof(type) != sizeof(trace.field) ? \
- __bad_type_size() : \
- #type, #name, offsetof(typeof(trace), field), \
- sizeof(trace.field), is_signed_type(type)
+#define SYSCALL_FIELD(_type, _name) { \
+ .type = #_type, .name = #_name, \
+ .size = sizeof(_type), .align = __alignof__(_type), \
+ .is_signed = is_signed_type(_type), .filter_type = FILTER_OTHER }
static int __init
__set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
@@ -274,42 +273,23 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call)
{
struct syscall_trace_enter trace;
struct syscall_metadata *meta = call->data;
- int ret;
- int i;
int offset = offsetof(typeof(trace), args);
-
- ret = trace_define_field(call, SYSCALL_FIELD(int, nr, __syscall_nr),
- FILTER_OTHER);
- if (ret)
- return ret;
+ int ret = 0;
+ int i;
for (i = 0; i < meta->nb_args; i++) {
ret = trace_define_field(call, meta->types[i],
meta->args[i], offset,
sizeof(unsigned long), 0,
FILTER_OTHER);
+ if (ret)
+ break;
offset += sizeof(unsigned long);
}
return ret;
}
-static int __init syscall_exit_define_fields(struct trace_event_call *call)
-{
- struct syscall_trace_exit trace;
- int ret;
-
- ret = trace_define_field(call, SYSCALL_FIELD(int, nr, __syscall_nr),
- FILTER_OTHER);
- if (ret)
- return ret;
-
- ret = trace_define_field(call, SYSCALL_FIELD(long, ret, ret),
- FILTER_OTHER);
-
- return ret;
-}
-
static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
{
struct trace_array *tr = data;
@@ -507,6 +487,13 @@ static int __init init_syscall_trace(struct trace_event_call *call)
return id;
}
+static struct trace_event_fields __refdata syscall_enter_fields_array[] = {
+ SYSCALL_FIELD(int, __syscall_nr),
+ { .type = TRACE_FUNCTION_TYPE,
+ .define_fields = syscall_enter_define_fields },
+ {}
+};
+
struct trace_event_functions enter_syscall_print_funcs = {
.trace = print_syscall_enter,
};
@@ -518,7 +505,7 @@ struct trace_event_functions exit_syscall_print_funcs = {
struct trace_event_class __refdata event_class_syscall_enter = {
.system = "syscalls",
.reg = syscall_enter_register,
- .define_fields = syscall_enter_define_fields,
+ .fields_array = syscall_enter_fields_array,
.get_fields = syscall_get_enter_fields,
.raw_init = init_syscall_trace,
};
@@ -526,7 +513,11 @@ struct trace_event_class __refdata event_class_syscall_enter = {
struct trace_event_class __refdata event_class_syscall_exit = {
.system = "syscalls",
.reg = syscall_exit_register,
- .define_fields = syscall_exit_define_fields,
+ .fields_array = (struct trace_event_fields[]){
+ SYSCALL_FIELD(int, __syscall_nr),
+ SYSCALL_FIELD(long, ret),
+ {}
+ },
.fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),
.raw_init = init_syscall_trace,
};
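
The syscall event classes above mix literal SYSCALL_FIELD() descriptors with a TRACE_FUNCTION_TYPE entry whose define_fields callback supplies the per-syscall arguments that cannot be described statically. A toy sketch of a table whose entries carry either a static description or a callback; the sentinel value and all names here are invented:

#include <stdio.h>

typedef int (*define_fields_fn)(void);

/* A field table entry is either a literal description or, when .type
 * holds a reserved sentinel, a callback that defines the fields that
 * cannot be described statically (here: per-syscall arguments). */
struct field_desc {
	const char *type;
	union {
		const char *name;
		define_fields_fn define_fields;
	};
};

/* Invented sentinel; the kernel uses its own TRACE_FUNCTION_TYPE value. */
#define FUNCTION_TYPE ((const char *)1)

static int demo_define_dynamic_fields(void)
{
	puts("  ...callback defines the dynamic fields...");
	return 0;
}

static struct field_desc demo_fields[] = {
	{ .type = "int", .name = "__syscall_nr" },
	{ .type = FUNCTION_TYPE, .define_fields = demo_define_dynamic_fields },
	{}
};

int main(void)
{
	for (struct field_desc *f = demo_fields; f->type; f++) {
		if (f->type == FUNCTION_TYPE)
			f->define_fields();
		else
			printf("static field: %s %s\n", f->type, f->name);
	}
	return 0;
}
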
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 352073d36585..7885ebd23d0c 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -34,12 +34,6 @@ struct uprobe_trace_entry_head {
#define DATAOF_TRACE_ENTRY(entry, is_return) \
((void*)(entry) + SIZEOF_TRACE_ENTRY(is_return))
-struct trace_uprobe_filter {
- rwlock_t rwlock;
- int nr_systemwide;
- struct list_head perf_events;
-};
-
static int trace_uprobe_create(int argc, const char **argv);
static int trace_uprobe_show(struct seq_file *m, struct dyn_event *ev);
static int trace_uprobe_release(struct dyn_event *ev);
@@ -60,7 +54,6 @@ static struct dyn_event_operations trace_uprobe_ops = {
*/
struct trace_uprobe {
struct dyn_event devent;
- struct trace_uprobe_filter filter;
struct uprobe_consumer consumer;
struct path path;
struct inode *inode;
@@ -351,7 +344,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
if (!tu)
return ERR_PTR(-ENOMEM);
- ret = trace_probe_init(&tu->tp, event, group);
+ ret = trace_probe_init(&tu->tp, event, group, true);
if (ret < 0)
goto error;
@@ -359,7 +352,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
tu->consumer.handler = uprobe_dispatcher;
if (is_ret)
tu->consumer.ret_handler = uretprobe_dispatcher;
- init_trace_uprobe_filter(&tu->filter);
+ init_trace_uprobe_filter(tu->tp.event->filter);
return tu;
error:
@@ -1067,13 +1060,14 @@ static void __probe_event_disable(struct trace_probe *tp)
struct trace_probe *pos;
struct trace_uprobe *tu;
+ tu = container_of(tp, struct trace_uprobe, tp);
+ WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter));
+
list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
tu = container_of(pos, struct trace_uprobe, tp);
if (!tu->inode)
continue;
- WARN_ON(!uprobe_filter_is_empty(&tu->filter));
-
uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
tu->inode = NULL;
}
@@ -1108,7 +1102,7 @@ static int probe_event_enable(struct trace_event_call *call,
}
tu = container_of(tp, struct trace_uprobe, tp);
- WARN_ON(!uprobe_filter_is_empty(&tu->filter));
+ WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter));
if (enabled)
return 0;
@@ -1205,39 +1199,39 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
}
static inline bool
-uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event)
+trace_uprobe_filter_event(struct trace_uprobe_filter *filter,
+ struct perf_event *event)
{
- return __uprobe_perf_filter(&tu->filter, event->hw.target->mm);
+ return __uprobe_perf_filter(filter, event->hw.target->mm);
}
-static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
+static bool trace_uprobe_filter_remove(struct trace_uprobe_filter *filter,
+ struct perf_event *event)
{
bool done;
- write_lock(&tu->filter.rwlock);
+ write_lock(&filter->rwlock);
if (event->hw.target) {
list_del(&event->hw.tp_list);
- done = tu->filter.nr_systemwide ||
+ done = filter->nr_systemwide ||
(event->hw.target->flags & PF_EXITING) ||
- uprobe_filter_event(tu, event);
+ trace_uprobe_filter_event(filter, event);
} else {
- tu->filter.nr_systemwide--;
- done = tu->filter.nr_systemwide;
+ filter->nr_systemwide--;
+ done = filter->nr_systemwide;
}
- write_unlock(&tu->filter.rwlock);
+ write_unlock(&filter->rwlock);
- if (!done)
- return uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
-
- return 0;
+ return done;
}
-static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
+/* This returns true if the filter always covers target mm */
+static bool trace_uprobe_filter_add(struct trace_uprobe_filter *filter,
+ struct perf_event *event)
{
bool done;
- int err;
- write_lock(&tu->filter.rwlock);
+ write_lock(&filter->rwlock);
if (event->hw.target) {
/*
* event->parent != NULL means copy_process(), we can avoid
@@ -1247,28 +1241,21 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
* attr.enable_on_exec means that exec/mmap will install the
* breakpoints we need.
*/
- done = tu->filter.nr_systemwide ||
+ done = filter->nr_systemwide ||
event->parent || event->attr.enable_on_exec ||
- uprobe_filter_event(tu, event);
- list_add(&event->hw.tp_list, &tu->filter.perf_events);
+ trace_uprobe_filter_event(filter, event);
+ list_add(&event->hw.tp_list, &filter->perf_events);
} else {
- done = tu->filter.nr_systemwide;
- tu->filter.nr_systemwide++;
+ done = filter->nr_systemwide;
+ filter->nr_systemwide++;
}
- write_unlock(&tu->filter.rwlock);
+ write_unlock(&filter->rwlock);
- err = 0;
- if (!done) {
- err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
- if (err)
- uprobe_perf_close(tu, event);
- }
- return err;
+ return done;
}
-static int uprobe_perf_multi_call(struct trace_event_call *call,
- struct perf_event *event,
- int (*op)(struct trace_uprobe *tu, struct perf_event *event))
+static int uprobe_perf_close(struct trace_event_call *call,
+ struct perf_event *event)
{
struct trace_probe *pos, *tp;
struct trace_uprobe *tu;
@@ -1278,25 +1265,59 @@ static int uprobe_perf_multi_call(struct trace_event_call *call,
if (WARN_ON_ONCE(!tp))
return -ENODEV;
+ tu = container_of(tp, struct trace_uprobe, tp);
+ if (trace_uprobe_filter_remove(tu->tp.event->filter, event))
+ return 0;
+
list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
tu = container_of(pos, struct trace_uprobe, tp);
- ret = op(tu, event);
+ ret = uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
if (ret)
break;
}
return ret;
}
+
+static int uprobe_perf_open(struct trace_event_call *call,
+ struct perf_event *event)
+{
+ struct trace_probe *pos, *tp;
+ struct trace_uprobe *tu;
+ int err = 0;
+
+ tp = trace_probe_primary_from_call(call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENODEV;
+
+ tu = container_of(tp, struct trace_uprobe, tp);
+ if (trace_uprobe_filter_add(tu->tp.event->filter, event))
+ return 0;
+
+ list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
+ err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
+ if (err) {
+ uprobe_perf_close(call, event);
+ break;
+ }
+ }
+
+ return err;
+}
+
static bool uprobe_perf_filter(struct uprobe_consumer *uc,
enum uprobe_filter_ctx ctx, struct mm_struct *mm)
{
+ struct trace_uprobe_filter *filter;
struct trace_uprobe *tu;
int ret;
tu = container_of(uc, struct trace_uprobe, consumer);
- read_lock(&tu->filter.rwlock);
- ret = __uprobe_perf_filter(&tu->filter, mm);
- read_unlock(&tu->filter.rwlock);
+ filter = tu->tp.event->filter;
+
+ read_lock(&filter->rwlock);
+ ret = __uprobe_perf_filter(filter, mm);
+ read_unlock(&filter->rwlock);
return ret;
}
@@ -1419,10 +1440,10 @@ trace_uprobe_register(struct trace_event_call *event, enum trace_reg type,
return 0;
case TRACE_REG_PERF_OPEN:
- return uprobe_perf_multi_call(event, data, uprobe_perf_open);
+ return uprobe_perf_open(event, data);
case TRACE_REG_PERF_CLOSE:
- return uprobe_perf_multi_call(event, data, uprobe_perf_close);
+ return uprobe_perf_close(event, data);
#endif
default:
@@ -1507,12 +1528,17 @@ static struct trace_event_functions uprobe_funcs = {
.trace = print_uprobe_event
};
+static struct trace_event_fields uprobe_fields_array[] = {
+ { .type = TRACE_FUNCTION_TYPE,
+ .define_fields = uprobe_event_define_fields },
+ {}
+};
+
static inline void init_trace_event_call(struct trace_uprobe *tu)
{
struct trace_event_call *call = trace_probe_event_call(&tu->tp);
-
call->event.funcs = &uprobe_funcs;
- call->class->define_fields = uprobe_event_define_fields;
+ call->class->fields_array = uprobe_fields_array;
call->flags = TRACE_EVENT_FL_UPROBE | TRACE_EVENT_FL_CAP_ANY;
call->class->reg = trace_uprobe_register;
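
The uprobe changes above move the perf filter from each struct trace_uprobe into the shared trace_probe_event, so uprobe_perf_open() and uprobe_perf_close() update the filter bookkeeping once and only walk the probe list with uprobe_apply() when that bookkeeping says coverage actually changed. A userspace sketch of the "update under the writer lock, report whether work remains" shape for the system-wide counter, with invented names:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct demo_filter {
	pthread_rwlock_t rwlock;
	int nr_systemwide;
};

/* Returns true ("done") when the filter already covered everything,
 * i.e. the caller may skip touching the individual probes. */
static bool demo_filter_add_systemwide(struct demo_filter *f)
{
	bool done;

	pthread_rwlock_wrlock(&f->rwlock);
	done = f->nr_systemwide != 0;
	f->nr_systemwide++;
	pthread_rwlock_unlock(&f->rwlock);

	return done;
}

/* Returns true when coverage remains after the removal. */
static bool demo_filter_remove_systemwide(struct demo_filter *f)
{
	bool done;

	pthread_rwlock_wrlock(&f->rwlock);
	f->nr_systemwide--;
	done = f->nr_systemwide != 0;
	pthread_rwlock_unlock(&f->rwlock);

	return done;
}

int main(void)
{
	struct demo_filter f = { .rwlock = PTHREAD_RWLOCK_INITIALIZER };

	printf("first add done=%d\n", demo_filter_add_systemwide(&f));	/* 0: apply probes */
	printf("second add done=%d\n", demo_filter_add_systemwide(&f));	/* 1: nothing to do */
	printf("remove done=%d\n", demo_filter_remove_systemwide(&f));	/* 1: still covered */
	printf("last remove done=%d\n", demo_filter_remove_systemwide(&f));	/* 0: unapply */
	return 0;
}
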
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index 9a1c22310323..9e31bfc818ff 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -148,8 +148,8 @@ static int tracing_map_cmp_atomic64(void *val_a, void *val_b)
#define DEFINE_TRACING_MAP_CMP_FN(type) \
static int tracing_map_cmp_##type(void *val_a, void *val_b) \
{ \
- type a = *(type *)val_a; \
- type b = *(type *)val_b; \
+ type a = (type)(*(u64 *)val_a); \
+ type b = (type)(*(u64 *)val_b); \
\
return (a > b) ? 1 : ((a < b) ? -1 : 0); \
}
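
The comparator macro above now reads each value as the full 64-bit slot it is stored in and casts it back to the field's type, rather than reinterpreting the slot through a narrower pointer, which would pick up the wrong bytes on a big-endian host and is presumably the motivation for the change. A standalone sketch of the same macro using stdint types:

#include <stdint.h>
#include <stdio.h>

/* Values handed to the comparators live in 64-bit slots; read the full
 * slot and cast back to the field's type instead of reinterpreting the
 * slot through a narrower pointer. */
#define DEFINE_CMP_FN(type)						\
static int cmp_##type(void *val_a, void *val_b)			\
{									\
	type a = (type)(*(uint64_t *)val_a);				\
	type b = (type)(*(uint64_t *)val_b);				\
									\
	return (a > b) ? 1 : ((a < b) ? -1 : 0);			\
}

DEFINE_CMP_FN(int16_t)

int main(void)
{
	uint64_t a = (uint64_t)(int64_t)-5;	/* -5 widened into a u64 slot */
	uint64_t b = 3;

	printf("%d\n", cmp_int16_t(&a, &b));	/* -1 on either endianness */
	return 0;
}
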
diff --git a/kernel/up.c b/kernel/up.c
index 862b460ab97a..53144d056252 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -68,9 +68,8 @@ EXPORT_SYMBOL(on_each_cpu_mask);
* Preemption is disabled here to make sure the cond_func is called under the
* same conditions in UP and SMP.
*/
-void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
- smp_call_func_t func, void *info, bool wait,
- gfp_t gfp_flags, const struct cpumask *mask)
+void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
+ void *info, bool wait, const struct cpumask *mask)
{
unsigned long flags;
@@ -84,11 +83,10 @@ void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
}
EXPORT_SYMBOL(on_each_cpu_cond_mask);
-void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
- smp_call_func_t func, void *info, bool wait,
- gfp_t gfp_flags)
+void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func,
+ void *info, bool wait)
{
- on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags, NULL);
+ on_each_cpu_cond_mask(cond_func, func, info, wait, NULL);
}
EXPORT_SYMBOL(on_each_cpu_cond);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index f41334ef0971..b6b1f54a7837 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -161,6 +161,8 @@ static void lockup_detector_update_enable(void)
#ifdef CONFIG_SOFTLOCKUP_DETECTOR
+#define SOFTLOCKUP_RESET ULONG_MAX
+
/* Global variables, exported for sysctl */
unsigned int __read_mostly softlockup_panic =
CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
@@ -173,8 +175,6 @@ static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
static DEFINE_PER_CPU(bool, softlockup_touch_sync);
static DEFINE_PER_CPU(bool, soft_watchdog_warn);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
-static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
-static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
static unsigned long soft_lockup_nmi_warn;
@@ -274,7 +274,7 @@ notrace void touch_softlockup_watchdog_sched(void)
* Preemption can be enabled. It doesn't matter which CPU's timestamp
* gets zeroed here, so use the raw_ operation.
*/
- raw_cpu_write(watchdog_touch_ts, 0);
+ raw_cpu_write(watchdog_touch_ts, SOFTLOCKUP_RESET);
}
notrace void touch_softlockup_watchdog(void)
@@ -298,14 +298,14 @@ void touch_all_softlockup_watchdogs(void)
* the softlockup check.
*/
for_each_cpu(cpu, &watchdog_allowed_mask)
- per_cpu(watchdog_touch_ts, cpu) = 0;
+ per_cpu(watchdog_touch_ts, cpu) = SOFTLOCKUP_RESET;
wq_watchdog_touch(-1);
}
void touch_softlockup_watchdog_sync(void)
{
__this_cpu_write(softlockup_touch_sync, true);
- __this_cpu_write(watchdog_touch_ts, 0);
+ __this_cpu_write(watchdog_touch_ts, SOFTLOCKUP_RESET);
}
static int is_softlockup(unsigned long touch_ts)
@@ -350,8 +350,6 @@ static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);
*/
static int softlockup_fn(void *data)
{
- __this_cpu_write(soft_lockup_hrtimer_cnt,
- __this_cpu_read(hrtimer_interrupts));
__touch_watchdog();
complete(this_cpu_ptr(&softlockup_completion));
@@ -383,7 +381,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
/* .. and repeat */
hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
- if (touch_ts == 0) {
+ if (touch_ts == SOFTLOCKUP_RESET) {
if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
/*
* If the time stamp was touched atomically
@@ -416,22 +414,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
return HRTIMER_RESTART;
/* only warn once */
- if (__this_cpu_read(soft_watchdog_warn) == true) {
- /*
- * When multiple processes are causing softlockups the
- * softlockup detector only warns on the first one
- * because the code relies on a full quiet cycle to
- * re-arm. The second process prevents the quiet cycle
- * and never gets reported. Use task pointers to detect
- * this.
- */
- if (__this_cpu_read(softlockup_task_ptr_saved) !=
- current) {
- __this_cpu_write(soft_watchdog_warn, false);
- __touch_watchdog();
- }
+ if (__this_cpu_read(soft_watchdog_warn) == true)
return HRTIMER_RESTART;
- }
if (softlockup_all_cpu_backtrace) {
/* Prevent multiple soft-lockup reports if one cpu is already
@@ -447,7 +431,6 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
smp_processor_id(), duration,
current->comm, task_pid_nr(current));
- __this_cpu_write(softlockup_task_ptr_saved, current);
print_modules();
print_irqtrace_events(current);
if (regs)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index bc88fd939f4e..301db4406bc3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2266,7 +2266,7 @@ __acquires(&pool->lock)
* While we must be careful to not use "work" after this, the trace
* point will only record its address.
*/
- trace_workqueue_execute_end(work);
+ trace_workqueue_execute_end(work, worker->current_func);
lock_map_release(&lockdep_map);
lock_map_release(&pwq->wq->lockdep_map);
@@ -2280,7 +2280,7 @@ __acquires(&pool->lock)
}
/*
- * The following prevents a kworker from hogging CPU on !PREEMPT
+ * The following prevents a kworker from hogging CPU on !PREEMPTION
* kernels, where a requeueing work item waiting for something to
* happen could deadlock with stop_machine as such work item could
* indefinitely requeue itself while all other CPUs are trapped in
@@ -4374,8 +4374,8 @@ void destroy_workqueue(struct workqueue_struct *wq)
for_each_pwq(pwq, wq) {
spin_lock_irq(&pwq->pool->lock);
if (WARN_ON(pwq_busy(pwq))) {
- pr_warning("%s: %s has the following busy pwq\n",
- __func__, wq->name);
+ pr_warn("%s: %s has the following busy pwq\n",
+ __func__, wq->name);
show_pwq(pwq);
spin_unlock_irq(&pwq->pool->lock);
mutex_unlock(&wq->mutex);