Diffstat (limited to 'kernel')
-rw-r--r--  kernel/audit.c | 76
-rw-r--r--  kernel/audit.h | 2
-rw-r--r--  kernel/audit_tree.c | 2
-rw-r--r--  kernel/auditfilter.c | 39
-rw-r--r--  kernel/auditsc.c | 29
-rw-r--r--  kernel/bpf/Makefile | 3
-rw-r--r--  kernel/bpf/arraymap.c | 6
-rw-r--r--  kernel/bpf/cgroup.c | 570
-rw-r--r--  kernel/bpf/core.c | 186
-rw-r--r--  kernel/bpf/cpumap.c | 706
-rw-r--r--  kernel/bpf/devmap.c | 5
-rw-r--r--  kernel/bpf/disasm.c | 214
-rw-r--r--  kernel/bpf/disasm.h | 32
-rw-r--r--  kernel/bpf/hashtab.c | 5
-rw-r--r--  kernel/bpf/inode.c | 15
-rw-r--r--  kernel/bpf/lpm_trie.c | 98
-rw-r--r--  kernel/bpf/offload.c | 191
-rw-r--r--  kernel/bpf/percpu_freelist.c | 8
-rw-r--r--  kernel/bpf/sockmap.c | 9
-rw-r--r--  kernel/bpf/stackmap.c | 5
-rw-r--r--  kernel/bpf/syscall.c | 315
-rw-r--r--  kernel/bpf/verifier.c | 1511
-rw-r--r--  kernel/cgroup/Makefile | 2
-rw-r--r--  kernel/cgroup/cgroup-internal.h | 9
-rw-r--r--  kernel/cgroup/cgroup.c | 195
-rw-r--r--  kernel/cgroup/stat.c | 334
-rw-r--r--  kernel/compat.c | 77
-rw-r--r--  kernel/crash_core.c | 3
-rw-r--r--  kernel/events/core.c | 44
-rw-r--r--  kernel/events/ring_buffer.c | 4
-rw-r--r--  kernel/fork.c | 26
-rw-r--r--  kernel/irq/irqdesc.c | 1
-rw-r--r--  kernel/irq/spurious.c | 4
-rw-r--r--  kernel/kallsyms.c | 43
-rw-r--r--  kernel/kcov.c | 216
-rw-r--r--  kernel/kthread.c | 68
-rw-r--r--  kernel/livepatch/Makefile | 2
-rw-r--r--  kernel/livepatch/core.c | 52
-rw-r--r--  kernel/livepatch/core.h | 40
-rw-r--r--  kernel/livepatch/patch.c | 1
-rw-r--r--  kernel/livepatch/shadow.c | 277
-rw-r--r--  kernel/livepatch/transition.c | 45
-rw-r--r--  kernel/locking/lockdep.c | 3
-rw-r--r--  kernel/module.c | 6
-rw-r--r--  kernel/padata.c | 75
-rw-r--r--  kernel/panic.c | 47
-rw-r--r--  kernel/pid.c | 247
-rw-r--r--  kernel/pid_namespace.c | 59
-rw-r--r--  kernel/power/Kconfig | 14
-rw-r--r--  kernel/power/qos.c | 4
-rw-r--r--  kernel/power/snapshot.c | 39
-rw-r--r--  kernel/power/suspend.c | 2
-rw-r--r--  kernel/power/swap.c | 128
-rw-r--r--  kernel/printk/printk.c | 2
-rw-r--r--  kernel/printk/printk_safe.c | 17
-rw-r--r--  kernel/reboot.c | 27
-rw-r--r--  kernel/sched/core.c | 210
-rw-r--r--  kernel/sched/cpuacct.h | 18
-rw-r--r--  kernel/sched/cpufreq_schedutil.c | 6
-rw-r--r--  kernel/sched/cputime.c | 14
-rw-r--r--  kernel/sched/deadline.c | 2
-rw-r--r--  kernel/sched/fair.c | 2
-rw-r--r--  kernel/sched/rt.c | 2
-rw-r--r--  kernel/sched/sched.h | 2
-rw-r--r--  kernel/sched/stop_task.c | 2
-rw-r--r--  kernel/sched/wait_bit.c | 18
-rw-r--r--  kernel/signal.c | 98
-rw-r--r--  kernel/softirq.c | 10
-rw-r--r--  kernel/sys.c | 12
-rw-r--r--  kernel/sysctl.c | 89
-rw-r--r--  kernel/time/Kconfig | 4
-rw-r--r--  kernel/time/clocksource.c | 5
-rw-r--r--  kernel/time/timekeeping.c | 45
-rw-r--r--  kernel/time/timer.c | 40
-rw-r--r--  kernel/time/timer_list.c | 2
-rw-r--r--  kernel/trace/Kconfig | 15
-rw-r--r--  kernel/trace/Makefile | 1
-rw-r--r--  kernel/trace/blktrace.c | 90
-rw-r--r--  kernel/trace/bpf_trace.c | 177
-rw-r--r--  kernel/trace/ftrace.c | 354
-rw-r--r--  kernel/trace/ring_buffer.c | 67
-rw-r--r--  kernel/trace/trace.c | 91
-rw-r--r--  kernel/trace/trace.h | 9
-rw-r--r--  kernel/trace/trace_event_perf.c | 82
-rw-r--r--  kernel/trace/trace_events.c | 31
-rw-r--r--  kernel/trace/trace_events_hist.c | 128
-rw-r--r--  kernel/trace/trace_irqsoff.c | 133
-rw-r--r--  kernel/trace/trace_kprobe.c | 28
-rw-r--r--  kernel/trace/trace_probe.c | 86
-rw-r--r--  kernel/trace/trace_probe.h | 7
-rw-r--r--  kernel/trace/trace_selftest.c | 34
-rw-r--r--  kernel/trace/trace_syscalls.c | 38
-rw-r--r--  kernel/trace/trace_uprobe.c | 7
-rw-r--r--  kernel/trace/tracing_map.c | 3
-rw-r--r--  kernel/trace/tracing_map.h | 2
-rw-r--r--  kernel/umh.c | 4
-rw-r--r--  kernel/user.c | 30
-rw-r--r--  kernel/user_namespace.c | 349
-rw-r--r--  kernel/workqueue.c | 5
99 files changed, 6318 insertions, 2174 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index be1c28fd4d57..227db99b0f19 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -85,13 +85,13 @@ static int audit_initialized;
#define AUDIT_OFF 0
#define AUDIT_ON 1
#define AUDIT_LOCKED 2
-u32 audit_enabled;
-u32 audit_ever_enabled;
+u32 audit_enabled = AUDIT_OFF;
+bool audit_ever_enabled = !!AUDIT_OFF;
EXPORT_SYMBOL_GPL(audit_enabled);
/* Default state when kernel boots without any parameters. */
-static u32 audit_default;
+static u32 audit_default = AUDIT_OFF;
/* If auditing cannot proceed, audit_failure selects what happens. */
static u32 audit_failure = AUDIT_FAIL_PRINTK;
@@ -1197,25 +1197,28 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
pid_t auditd_pid;
struct pid *req_pid = task_tgid(current);
- /* sanity check - PID values must match */
- if (new_pid != pid_vnr(req_pid))
+ /* Sanity check - PID values must match. Setting
+ * pid to 0 is how auditd ends auditing. */
+ if (new_pid && (new_pid != pid_vnr(req_pid)))
return -EINVAL;
/* test the auditd connection */
audit_replace(req_pid);
auditd_pid = auditd_pid_vnr();
- /* only the current auditd can unregister itself */
- if ((!new_pid) && (new_pid != auditd_pid)) {
- audit_log_config_change("audit_pid", new_pid,
- auditd_pid, 0);
- return -EACCES;
- }
- /* replacing a healthy auditd is not allowed */
- if (auditd_pid && new_pid) {
- audit_log_config_change("audit_pid", new_pid,
- auditd_pid, 0);
- return -EEXIST;
+ if (auditd_pid) {
+ /* replacing a healthy auditd is not allowed */
+ if (new_pid) {
+ audit_log_config_change("audit_pid",
+ new_pid, auditd_pid, 0);
+ return -EEXIST;
+ }
+ /* only current auditd can unregister itself */
+ if (pid_vnr(req_pid) != auditd_pid) {
+ audit_log_config_change("audit_pid",
+ new_pid, auditd_pid, 0);
+ return -EACCES;
+ }
}
if (new_pid) {
@@ -1549,8 +1552,6 @@ static int __init audit_init(void)
register_pernet_subsys(&audit_net_ops);
audit_initialized = AUDIT_INITIALIZED;
- audit_enabled = audit_default;
- audit_ever_enabled |= !!audit_default;
kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
if (IS_ERR(kauditd_task)) {
@@ -1564,14 +1565,21 @@ static int __init audit_init(void)
return 0;
}
-__initcall(audit_init);
+postcore_initcall(audit_init);
/* Process kernel command-line parameter at boot time. audit=0 or audit=1. */
static int __init audit_enable(char *str)
{
- audit_default = !!simple_strtol(str, NULL, 0);
- if (!audit_default)
+ long val;
+
+ if (kstrtol(str, 0, &val))
+ panic("audit: invalid 'audit' parameter value (%s)\n", str);
+ audit_default = (val ? AUDIT_ON : AUDIT_OFF);
+
+ if (audit_default == AUDIT_OFF)
audit_initialized = AUDIT_DISABLED;
+ if (audit_set_enabled(audit_default))
+ panic("audit: error setting audit state (%d)\n", audit_default);
pr_info("%s\n", audit_default ?
"enabled (after initialization)" : "disabled (until reboot)");
@@ -2337,32 +2345,6 @@ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
}
}
-#ifdef CONFIG_SECURITY
-/**
- * audit_log_secctx - Converts and logs SELinux context
- * @ab: audit_buffer
- * @secid: security number
- *
- * This is a helper function that calls security_secid_to_secctx to convert
- * secid to secctx and then adds the (converted) SELinux context to the audit
- * log by calling audit_log_format, thus also preventing leak of internal secid
- * to userspace. If secid cannot be converted audit_panic is called.
- */
-void audit_log_secctx(struct audit_buffer *ab, u32 secid)
-{
- u32 len;
- char *secctx;
-
- if (security_secid_to_secctx(secid, &secctx, &len)) {
- audit_panic("Cannot convert secid to context");
- } else {
- audit_log_format(ab, " obj=%s", secctx);
- security_release_secctx(secctx, len);
- }
-}
-EXPORT_SYMBOL(audit_log_secctx);
-#endif
-
EXPORT_SYMBOL(audit_log_start);
EXPORT_SYMBOL(audit_log_end);
EXPORT_SYMBOL(audit_log_format);
diff --git a/kernel/audit.h b/kernel/audit.h
index 9b110ae17ee3..af5bc59487ed 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -208,7 +208,7 @@ struct audit_context {
struct audit_proctitle proctitle;
};
-extern u32 audit_ever_enabled;
+extern bool audit_ever_enabled;
extern void audit_copy_inode(struct audit_names *name,
const struct dentry *dentry,
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index d4b050d9a66e..fd353120e0d9 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -1008,7 +1008,7 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify
* We are guaranteed to have at least one reference to the mark from
* either the inode or the caller of fsnotify_destroy_mark().
*/
- BUG_ON(atomic_read(&entry->refcnt) < 1);
+ BUG_ON(refcount_read(&entry->refcnt) < 1);
}
static const struct fsnotify_ops audit_tree_ops = {
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 0b0aa5854dac..4a1758adb222 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -56,7 +56,8 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
LIST_HEAD_INIT(audit_filter_list[3]),
LIST_HEAD_INIT(audit_filter_list[4]),
LIST_HEAD_INIT(audit_filter_list[5]),
-#if AUDIT_NR_FILTERS != 6
+ LIST_HEAD_INIT(audit_filter_list[6]),
+#if AUDIT_NR_FILTERS != 7
#error Fix audit_filter_list initialiser
#endif
};
@@ -67,6 +68,7 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
LIST_HEAD_INIT(audit_rules_list[3]),
LIST_HEAD_INIT(audit_rules_list[4]),
LIST_HEAD_INIT(audit_rules_list[5]),
+ LIST_HEAD_INIT(audit_rules_list[6]),
};
DEFINE_MUTEX(audit_filter_mutex);
@@ -263,6 +265,7 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *
#endif
case AUDIT_FILTER_USER:
case AUDIT_FILTER_TYPE:
+ case AUDIT_FILTER_FS:
;
}
if (unlikely(rule->action == AUDIT_POSSIBLE)) {
@@ -338,6 +341,21 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
entry->rule.listnr != AUDIT_FILTER_USER)
return -EINVAL;
break;
+ case AUDIT_FSTYPE:
+ if (entry->rule.listnr != AUDIT_FILTER_FS)
+ return -EINVAL;
+ break;
+ }
+
+ switch(entry->rule.listnr) {
+ case AUDIT_FILTER_FS:
+ switch(f->type) {
+ case AUDIT_FSTYPE:
+ case AUDIT_FILTERKEY:
+ break;
+ default:
+ return -EINVAL;
+ }
}
switch(f->type) {
@@ -391,6 +409,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
return -EINVAL;
/* FALL THROUGH */
case AUDIT_ARCH:
+ case AUDIT_FSTYPE:
if (f->op != Audit_not_equal && f->op != Audit_equal)
return -EINVAL;
break;
@@ -910,10 +929,13 @@ static inline int audit_add_rule(struct audit_entry *entry)
#ifdef CONFIG_AUDITSYSCALL
int dont_count = 0;
- /* If either of these, don't count towards total */
- if (entry->rule.listnr == AUDIT_FILTER_USER ||
- entry->rule.listnr == AUDIT_FILTER_TYPE)
+ /* If any of these, don't count towards total */
+ switch(entry->rule.listnr) {
+ case AUDIT_FILTER_USER:
+ case AUDIT_FILTER_TYPE:
+ case AUDIT_FILTER_FS:
dont_count = 1;
+ }
#endif
mutex_lock(&audit_filter_mutex);
@@ -989,10 +1011,13 @@ int audit_del_rule(struct audit_entry *entry)
#ifdef CONFIG_AUDITSYSCALL
int dont_count = 0;
- /* If either of these, don't count towards total */
- if (entry->rule.listnr == AUDIT_FILTER_USER ||
- entry->rule.listnr == AUDIT_FILTER_TYPE)
+ /* If any of these, don't count towards total */
+ switch(entry->rule.listnr) {
+ case AUDIT_FILTER_USER:
+ case AUDIT_FILTER_TYPE:
+ case AUDIT_FILTER_FS:
dont_count = 1;
+ }
#endif
mutex_lock(&audit_filter_mutex);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index ecc23e25c9eb..e80459f7e132 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1869,10 +1869,33 @@ void __audit_inode_child(struct inode *parent,
struct inode *inode = d_backing_inode(dentry);
const char *dname = dentry->d_name.name;
struct audit_names *n, *found_parent = NULL, *found_child = NULL;
+ struct audit_entry *e;
+ struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS];
+ int i;
if (!context->in_syscall)
return;
+ rcu_read_lock();
+ if (!list_empty(list)) {
+ list_for_each_entry_rcu(e, list, list) {
+ for (i = 0; i < e->rule.field_count; i++) {
+ struct audit_field *f = &e->rule.fields[i];
+
+ if (f->type == AUDIT_FSTYPE) {
+ if (audit_comparator(parent->i_sb->s_magic,
+ f->op, f->val)) {
+ if (e->rule.action == AUDIT_NEVER) {
+ rcu_read_unlock();
+ return;
+ }
+ }
+ }
+ }
+ }
+ }
+ rcu_read_unlock();
+
if (inode)
handle_one(inode);
@@ -2390,6 +2413,12 @@ void __audit_log_kern_module(char *name)
context->type = AUDIT_KERN_MODULE;
}
+void __audit_fanotify(unsigned int response)
+{
+ audit_log(current->audit_context, GFP_KERNEL,
+ AUDIT_FANOTIFY, "resp=%u", response);
+}
+
static void audit_log_task(struct audit_buffer *ab)
{
kuid_t auid, uid;
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index af3ab6164ff5..e691da0b3bab 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -3,8 +3,11 @@ obj-y := core.o
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
+obj-$(CONFIG_BPF_SYSCALL) += disasm.o
ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_BPF_SYSCALL) += devmap.o
+obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
+obj-$(CONFIG_BPF_SYSCALL) += offload.o
ifeq ($(CONFIG_STREAM_PARSER),y)
obj-$(CONFIG_BPF_SYSCALL) += sockmap.o
endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index c4b9ab01bba5..7c25426d3cf5 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -19,6 +19,9 @@
#include "map_in_map.h"
+#define ARRAY_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
static void bpf_array_free_percpu(struct bpf_array *array)
{
int i;
@@ -56,7 +59,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
- attr->value_size == 0 || attr->map_flags & ~BPF_F_NUMA_NODE ||
+ attr->value_size == 0 ||
+ attr->map_flags & ~ARRAY_CREATE_FLAG_MASK ||
(percpu && numa_node != NUMA_NO_NODE))
return ERR_PTR(-EINVAL);
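
The arraymap hunk above widens the accepted map_flags mask to the new BPF_F_RDONLY and BPF_F_WRONLY access flags. What follows is a rough userspace sketch (not taken from this patch) of requesting a read-only array map through the raw bpf(2) syscall; it assumes uapi headers that already carry these flags, and the create_rdonly_array wrapper and its parameters are purely illustrative.

/* Illustrative sketch only: create a syscall-side read-only BPF array map,
 * exercising the BPF_F_RDONLY flag that array_map_alloc() now accepts.
 */
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int create_rdonly_array(uint32_t value_size, uint32_t max_entries)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_type    = BPF_MAP_TYPE_ARRAY;
	attr.key_size    = 4;             /* array maps require 4-byte keys */
	attr.value_size  = value_size;
	attr.max_entries = max_entries;
	attr.map_flags   = BPF_F_RDONLY;  /* restricts syscall-side writes */

	/* returns a map fd on success, -1 with errno set on failure */
	return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}
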
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 546113430049..b789ab78d28f 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -27,129 +27,405 @@ void cgroup_bpf_put(struct cgroup *cgrp)
{
unsigned int type;
- for (type = 0; type < ARRAY_SIZE(cgrp->bpf.prog); type++) {
- struct bpf_prog *prog = cgrp->bpf.prog[type];
-
- if (prog) {
- bpf_prog_put(prog);
+ for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
+ struct list_head *progs = &cgrp->bpf.progs[type];
+ struct bpf_prog_list *pl, *tmp;
+
+ list_for_each_entry_safe(pl, tmp, progs, node) {
+ list_del(&pl->node);
+ bpf_prog_put(pl->prog);
+ kfree(pl);
static_branch_dec(&cgroup_bpf_enabled_key);
}
+ bpf_prog_array_free(cgrp->bpf.effective[type]);
+ }
+}
+
+/* count number of elements in the list.
+ * it's slow but the list cannot be long
+ */
+static u32 prog_list_length(struct list_head *head)
+{
+ struct bpf_prog_list *pl;
+ u32 cnt = 0;
+
+ list_for_each_entry(pl, head, node) {
+ if (!pl->prog)
+ continue;
+ cnt++;
}
+ return cnt;
+}
+
+/* if parent has non-overridable prog attached,
+ * disallow attaching new programs to the descendant cgroup.
+ * if parent has overridable or multi-prog, allow attaching
+ */
+static bool hierarchy_allows_attach(struct cgroup *cgrp,
+ enum bpf_attach_type type,
+ u32 new_flags)
+{
+ struct cgroup *p;
+
+ p = cgroup_parent(cgrp);
+ if (!p)
+ return true;
+ do {
+ u32 flags = p->bpf.flags[type];
+ u32 cnt;
+
+ if (flags & BPF_F_ALLOW_MULTI)
+ return true;
+ cnt = prog_list_length(&p->bpf.progs[type]);
+ WARN_ON_ONCE(cnt > 1);
+ if (cnt == 1)
+ return !!(flags & BPF_F_ALLOW_OVERRIDE);
+ p = cgroup_parent(p);
+ } while (p);
+ return true;
+}
+
+/* compute a chain of effective programs for a given cgroup:
+ * start from the list of programs in this cgroup and add
+ * all parent programs.
+ * Note that parent's F_ALLOW_OVERRIDE-type program is yielding
+ * to programs in this cgroup
+ */
+static int compute_effective_progs(struct cgroup *cgrp,
+ enum bpf_attach_type type,
+ struct bpf_prog_array __rcu **array)
+{
+ struct bpf_prog_array __rcu *progs;
+ struct bpf_prog_list *pl;
+ struct cgroup *p = cgrp;
+ int cnt = 0;
+
+ /* count number of effective programs by walking parents */
+ do {
+ if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+ cnt += prog_list_length(&p->bpf.progs[type]);
+ p = cgroup_parent(p);
+ } while (p);
+
+ progs = bpf_prog_array_alloc(cnt, GFP_KERNEL);
+ if (!progs)
+ return -ENOMEM;
+
+ /* populate the array with effective progs */
+ cnt = 0;
+ p = cgrp;
+ do {
+ if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+ list_for_each_entry(pl,
+ &p->bpf.progs[type], node) {
+ if (!pl->prog)
+ continue;
+ rcu_dereference_protected(progs, 1)->
+ progs[cnt++] = pl->prog;
+ }
+ p = cgroup_parent(p);
+ } while (p);
+
+ *array = progs;
+ return 0;
+}
+
+static void activate_effective_progs(struct cgroup *cgrp,
+ enum bpf_attach_type type,
+ struct bpf_prog_array __rcu *array)
+{
+ struct bpf_prog_array __rcu *old_array;
+
+ old_array = xchg(&cgrp->bpf.effective[type], array);
+ /* free prog array after grace period, since __cgroup_bpf_run_*()
+ * might be still walking the array
+ */
+ bpf_prog_array_free(old_array);
}
/**
* cgroup_bpf_inherit() - inherit effective programs from parent
* @cgrp: the cgroup to modify
- * @parent: the parent to inherit from
*/
-void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent)
+int cgroup_bpf_inherit(struct cgroup *cgrp)
{
- unsigned int type;
+/* has to use a macro instead of const int, since the compiler thinks
+ * that array below is variable length
+ */
+#define NR ARRAY_SIZE(cgrp->bpf.effective)
+ struct bpf_prog_array __rcu *arrays[NR] = {};
+ int i;
- for (type = 0; type < ARRAY_SIZE(cgrp->bpf.effective); type++) {
- struct bpf_prog *e;
+ for (i = 0; i < NR; i++)
+ INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
- e = rcu_dereference_protected(parent->bpf.effective[type],
- lockdep_is_held(&cgroup_mutex));
- rcu_assign_pointer(cgrp->bpf.effective[type], e);
- cgrp->bpf.disallow_override[type] = parent->bpf.disallow_override[type];
- }
+ for (i = 0; i < NR; i++)
+ if (compute_effective_progs(cgrp, i, &arrays[i]))
+ goto cleanup;
+
+ for (i = 0; i < NR; i++)
+ activate_effective_progs(cgrp, i, arrays[i]);
+
+ return 0;
+cleanup:
+ for (i = 0; i < NR; i++)
+ bpf_prog_array_free(arrays[i]);
+ return -ENOMEM;
}
+#define BPF_CGROUP_MAX_PROGS 64
+
/**
- * __cgroup_bpf_update() - Update the pinned program of a cgroup, and
+ * __cgroup_bpf_attach() - Attach the program to a cgroup, and
* propagate the change to descendants
* @cgrp: The cgroup which descendants to traverse
- * @parent: The parent of @cgrp, or %NULL if @cgrp is the root
- * @prog: A new program to pin
- * @type: Type of pinning operation (ingress/egress)
- *
- * Each cgroup has a set of two pointers for bpf programs; one for eBPF
- * programs it owns, and which is effective for execution.
- *
- * If @prog is not %NULL, this function attaches a new program to the cgroup
- * and releases the one that is currently attached, if any. @prog is then made
- * the effective program of type @type in that cgroup.
- *
- * If @prog is %NULL, the currently attached program of type @type is released,
- * and the effective program of the parent cgroup (if any) is inherited to
- * @cgrp.
- *
- * Then, the descendants of @cgrp are walked and the effective program for
- * each of them is set to the effective program of @cgrp unless the
- * descendant has its own program attached, in which case the subbranch is
- * skipped. This ensures that delegated subcgroups with own programs are left
- * untouched.
+ * @prog: A program to attach
+ * @type: Type of attach operation
*
* Must be called with cgroup_mutex held.
*/
-int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent,
- struct bpf_prog *prog, enum bpf_attach_type type,
- bool new_overridable)
+int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+ enum bpf_attach_type type, u32 flags)
{
- struct bpf_prog *old_prog, *effective = NULL;
- struct cgroup_subsys_state *pos;
- bool overridable = true;
-
- if (parent) {
- overridable = !parent->bpf.disallow_override[type];
- effective = rcu_dereference_protected(parent->bpf.effective[type],
- lockdep_is_held(&cgroup_mutex));
- }
-
- if (prog && effective && !overridable)
- /* if parent has non-overridable prog attached, disallow
- * attaching new programs to descendent cgroup
- */
+ struct list_head *progs = &cgrp->bpf.progs[type];
+ struct bpf_prog *old_prog = NULL;
+ struct cgroup_subsys_state *css;
+ struct bpf_prog_list *pl;
+ bool pl_was_allocated;
+ int err;
+
+ if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI))
+ /* invalid combination */
+ return -EINVAL;
+
+ if (!hierarchy_allows_attach(cgrp, type, flags))
return -EPERM;
- if (prog && effective && overridable != new_overridable)
- /* if parent has overridable prog attached, only
- * allow overridable programs in descendent cgroup
+ if (!list_empty(progs) && cgrp->bpf.flags[type] != flags)
+ /* Disallow attaching non-overridable on top
+ * of existing overridable in this cgroup.
+ * Disallow attaching multi-prog if overridable or none
*/
return -EPERM;
- old_prog = cgrp->bpf.prog[type];
-
- if (prog) {
- overridable = new_overridable;
- effective = prog;
- if (old_prog &&
- cgrp->bpf.disallow_override[type] == new_overridable)
- /* disallow attaching non-overridable on top
- * of existing overridable in this cgroup
- * and vice versa
- */
- return -EPERM;
+ if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
+ return -E2BIG;
+
+ if (flags & BPF_F_ALLOW_MULTI) {
+ list_for_each_entry(pl, progs, node)
+ if (pl->prog == prog)
+ /* disallow attaching the same prog twice */
+ return -EINVAL;
+
+ pl = kmalloc(sizeof(*pl), GFP_KERNEL);
+ if (!pl)
+ return -ENOMEM;
+ pl_was_allocated = true;
+ pl->prog = prog;
+ list_add_tail(&pl->node, progs);
+ } else {
+ if (list_empty(progs)) {
+ pl = kmalloc(sizeof(*pl), GFP_KERNEL);
+ if (!pl)
+ return -ENOMEM;
+ pl_was_allocated = true;
+ list_add_tail(&pl->node, progs);
+ } else {
+ pl = list_first_entry(progs, typeof(*pl), node);
+ old_prog = pl->prog;
+ pl_was_allocated = false;
+ }
+ pl->prog = prog;
}
- if (!prog && !old_prog)
- /* report error when trying to detach and nothing is attached */
- return -ENOENT;
+ cgrp->bpf.flags[type] = flags;
- cgrp->bpf.prog[type] = prog;
+ /* allocate and recompute effective prog arrays */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
- css_for_each_descendant_pre(pos, &cgrp->self) {
- struct cgroup *desc = container_of(pos, struct cgroup, self);
-
- /* skip the subtree if the descendant has its own program */
- if (desc->bpf.prog[type] && desc != cgrp) {
- pos = css_rightmost_descendant(pos);
- } else {
- rcu_assign_pointer(desc->bpf.effective[type],
- effective);
- desc->bpf.disallow_override[type] = !overridable;
- }
+ err = compute_effective_progs(desc, type, &desc->bpf.inactive);
+ if (err)
+ goto cleanup;
}
- if (prog)
- static_branch_inc(&cgroup_bpf_enabled_key);
+ /* all allocations were successful. Activate all prog arrays */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+ activate_effective_progs(desc, type, desc->bpf.inactive);
+ desc->bpf.inactive = NULL;
+ }
+
+ static_branch_inc(&cgroup_bpf_enabled_key);
if (old_prog) {
bpf_prog_put(old_prog);
static_branch_dec(&cgroup_bpf_enabled_key);
}
return 0;
+
+cleanup:
+ /* oom while computing effective. Free all computed effective arrays
+ * since they were not activated
+ */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ bpf_prog_array_free(desc->bpf.inactive);
+ desc->bpf.inactive = NULL;
+ }
+
+ /* and cleanup the prog list */
+ pl->prog = old_prog;
+ if (pl_was_allocated) {
+ list_del(&pl->node);
+ kfree(pl);
+ }
+ return err;
+}
+
+/**
+ * __cgroup_bpf_detach() - Detach the program from a cgroup, and
+ * propagate the change to descendants
+ * @cgrp: The cgroup which descendants to traverse
+ * @prog: A program to detach or NULL
+ * @type: Type of detach operation
+ *
+ * Must be called with cgroup_mutex held.
+ */
+int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+ enum bpf_attach_type type, u32 unused_flags)
+{
+ struct list_head *progs = &cgrp->bpf.progs[type];
+ u32 flags = cgrp->bpf.flags[type];
+ struct bpf_prog *old_prog = NULL;
+ struct cgroup_subsys_state *css;
+ struct bpf_prog_list *pl;
+ int err;
+
+ if (flags & BPF_F_ALLOW_MULTI) {
+ if (!prog)
+ /* to detach MULTI prog the user has to specify valid FD
+ * of the program to be detached
+ */
+ return -EINVAL;
+ } else {
+ if (list_empty(progs))
+ /* report error when trying to detach and nothing is attached */
+ return -ENOENT;
+ }
+
+ if (flags & BPF_F_ALLOW_MULTI) {
+ /* find the prog and detach it */
+ list_for_each_entry(pl, progs, node) {
+ if (pl->prog != prog)
+ continue;
+ old_prog = prog;
+ /* mark it deleted, so it's ignored while
+ * recomputing effective
+ */
+ pl->prog = NULL;
+ break;
+ }
+ if (!old_prog)
+ return -ENOENT;
+ } else {
+ /* to maintain backward compatibility NONE and OVERRIDE cgroups
+ * allow detaching with invalid FD (prog==NULL)
+ */
+ pl = list_first_entry(progs, typeof(*pl), node);
+ old_prog = pl->prog;
+ pl->prog = NULL;
+ }
+
+ /* allocate and recompute effective prog arrays */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ err = compute_effective_progs(desc, type, &desc->bpf.inactive);
+ if (err)
+ goto cleanup;
+ }
+
+ /* all allocations were successful. Activate all prog arrays */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ activate_effective_progs(desc, type, desc->bpf.inactive);
+ desc->bpf.inactive = NULL;
+ }
+
+ /* now can actually delete it from this cgroup list */
+ list_del(&pl->node);
+ kfree(pl);
+ if (list_empty(progs))
+ /* last program was detached, reset flags to zero */
+ cgrp->bpf.flags[type] = 0;
+
+ bpf_prog_put(old_prog);
+ static_branch_dec(&cgroup_bpf_enabled_key);
+ return 0;
+
+cleanup:
+ /* oom while computing effective. Free all computed effective arrays
+ * since they were not activated
+ */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ bpf_prog_array_free(desc->bpf.inactive);
+ desc->bpf.inactive = NULL;
+ }
+
+ /* and restore back old_prog */
+ pl->prog = old_prog;
+ return err;
+}
+
+/* Must be called with cgroup_mutex held to avoid races. */
+int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
+ enum bpf_attach_type type = attr->query.attach_type;
+ struct list_head *progs = &cgrp->bpf.progs[type];
+ u32 flags = cgrp->bpf.flags[type];
+ int cnt, ret = 0, i;
+
+ if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
+ cnt = bpf_prog_array_length(cgrp->bpf.effective[type]);
+ else
+ cnt = prog_list_length(progs);
+
+ if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
+ return -EFAULT;
+ if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt)))
+ return -EFAULT;
+ if (attr->query.prog_cnt == 0 || !prog_ids || !cnt)
+ /* return early if user requested only program count + flags */
+ return 0;
+ if (attr->query.prog_cnt < cnt) {
+ cnt = attr->query.prog_cnt;
+ ret = -ENOSPC;
+ }
+
+ if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
+ return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type],
+ prog_ids, cnt);
+ } else {
+ struct bpf_prog_list *pl;
+ u32 id;
+
+ i = 0;
+ list_for_each_entry(pl, progs, node) {
+ id = pl->prog->aux->id;
+ if (copy_to_user(prog_ids + i, &id, sizeof(id)))
+ return -EFAULT;
+ if (++i == cnt)
+ break;
+ }
+ }
+ return ret;
}
/**
@@ -171,36 +447,26 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
struct sk_buff *skb,
enum bpf_attach_type type)
{
- struct bpf_prog *prog;
+ unsigned int offset = skb->data - skb_network_header(skb);
+ struct sock *save_sk;
struct cgroup *cgrp;
- int ret = 0;
+ int ret;
if (!sk || !sk_fullsock(sk))
return 0;
- if (sk->sk_family != AF_INET &&
- sk->sk_family != AF_INET6)
+ if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
return 0;
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-
- rcu_read_lock();
-
- prog = rcu_dereference(cgrp->bpf.effective[type]);
- if (prog) {
- unsigned int offset = skb->data - skb_network_header(skb);
- struct sock *save_sk = skb->sk;
-
- skb->sk = sk;
- __skb_push(skb, offset);
- ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM;
- __skb_pull(skb, offset);
- skb->sk = save_sk;
- }
-
- rcu_read_unlock();
-
- return ret;
+ save_sk = skb->sk;
+ skb->sk = sk;
+ __skb_push(skb, offset);
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
+ bpf_prog_run_save_cb);
+ __skb_pull(skb, offset);
+ skb->sk = save_sk;
+ return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
@@ -221,19 +487,10 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
enum bpf_attach_type type)
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- struct bpf_prog *prog;
- int ret = 0;
+ int ret;
-
- rcu_read_lock();
-
- prog = rcu_dereference(cgrp->bpf.effective[type]);
- if (prog)
- ret = BPF_PROG_RUN(prog, sk) == 1 ? 0 : -EPERM;
-
- rcu_read_unlock();
-
- return ret;
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN);
+ return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
@@ -258,18 +515,77 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
enum bpf_attach_type type)
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- struct bpf_prog *prog;
- int ret = 0;
+ int ret;
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops,
+ BPF_PROG_RUN);
+ return ret == 1 ? 0 : -EPERM;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
+
+int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
+ short access, enum bpf_attach_type type)
+{
+ struct cgroup *cgrp;
+ struct bpf_cgroup_dev_ctx ctx = {
+ .access_type = (access << 16) | dev_type,
+ .major = major,
+ .minor = minor,
+ };
+ int allow = 1;
rcu_read_lock();
+ cgrp = task_dfl_cgroup(current);
+ allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx,
+ BPF_PROG_RUN);
+ rcu_read_unlock();
- prog = rcu_dereference(cgrp->bpf.effective[type]);
- if (prog)
- ret = BPF_PROG_RUN(prog, sock_ops) == 1 ? 0 : -EPERM;
+ return !allow;
+}
+EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission);
- rcu_read_unlock();
+static const struct bpf_func_proto *
+cgroup_dev_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_map_lookup_elem:
+ return &bpf_map_lookup_elem_proto;
+ case BPF_FUNC_map_update_elem:
+ return &bpf_map_update_elem_proto;
+ case BPF_FUNC_map_delete_elem:
+ return &bpf_map_delete_elem_proto;
+ case BPF_FUNC_get_current_uid_gid:
+ return &bpf_get_current_uid_gid_proto;
+ case BPF_FUNC_trace_printk:
+ if (capable(CAP_SYS_ADMIN))
+ return bpf_get_trace_printk_proto();
+ default:
+ return NULL;
+ }
+}
- return ret;
+static bool cgroup_dev_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ struct bpf_insn_access_aux *info)
+{
+ if (type == BPF_WRITE)
+ return false;
+
+ if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx))
+ return false;
+ /* The verifier guarantees that size > 0. */
+ if (off % size != 0)
+ return false;
+ if (size != sizeof(__u32))
+ return false;
+
+ return true;
}
-EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
+
+const struct bpf_prog_ops cg_dev_prog_ops = {
+};
+
+const struct bpf_verifier_ops cg_dev_verifier_ops = {
+ .get_func_proto = cgroup_dev_func_proto,
+ .is_valid_access = cgroup_dev_is_valid_access,
+};
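
The cgroup.c rework above replaces the single pinned program per attach type with program lists, adds the BPF_F_ALLOW_MULTI attach mode, and introduces a BPF_PROG_QUERY handler (__cgroup_bpf_query). The sketch below shows roughly how userspace could drive both through the raw bpf(2) syscall; the attach_multi/query_progs helpers and the choice of BPF_CGROUP_INET_EGRESS are illustrative assumptions, not part of the patch.

/* Illustrative sketch only: attach a cgroup program with BPF_F_ALLOW_MULTI
 * and list attached program IDs with BPF_PROG_QUERY. cgroup_fd and prog_fd
 * are assumed to come from the caller.
 */
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int attach_multi(int cgroup_fd, int prog_fd)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.target_fd     = cgroup_fd;
	attr.attach_bpf_fd = prog_fd;
	attr.attach_type   = BPF_CGROUP_INET_EGRESS;
	attr.attach_flags  = BPF_F_ALLOW_MULTI;  /* co-exist with other progs */

	return syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));
}

static int query_progs(int cgroup_fd, uint32_t *ids, uint32_t *cnt)
{
	union bpf_attr attr;
	int err;

	memset(&attr, 0, sizeof(attr));
	attr.query.target_fd   = cgroup_fd;
	attr.query.attach_type = BPF_CGROUP_INET_EGRESS;
	attr.query.prog_ids    = (uint64_t)(unsigned long)ids;
	attr.query.prog_cnt    = *cnt;   /* in: capacity, out: count */

	err = syscall(__NR_bpf, BPF_PROG_QUERY, &attr, sizeof(attr));
	if (!err)
		*cnt = attr.query.prog_cnt;
	return err;
}

As in __cgroup_bpf_query() above, prog_cnt is an in/out value: when the caller's buffer is smaller than the number of attached programs, the kernel copies what fits and returns -ENOSPC.
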
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7b62df86be1d..b9f8686a84cf 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -85,8 +85,6 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
if (fp == NULL)
return NULL;
- kmemcheck_annotate_bitfield(fp, meta);
-
aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags);
if (aux == NULL) {
vfree(fp);
@@ -127,8 +125,6 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
if (fp == NULL) {
__bpf_prog_uncharge(fp_old->aux->user, delta);
} else {
- kmemcheck_annotate_bitfield(fp, meta);
-
memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
fp->pages = pages;
fp->aux->prog = fp;
@@ -309,12 +305,25 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog,
static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
{
+ const char *end = sym + KSYM_NAME_LEN;
+
BUILD_BUG_ON(sizeof("bpf_prog_") +
- sizeof(prog->tag) * 2 + 1 > KSYM_NAME_LEN);
+ sizeof(prog->tag) * 2 +
+ /* name has been null terminated.
+ * We should need +1 for the '_' preceding
+ * the name. However, the null character
+ * is double counted between the name and the
+ * sizeof("bpf_prog_") above, so we omit
+ * the +1 here.
+ */
+ sizeof(prog->aux->name) > KSYM_NAME_LEN);
sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_");
sym = bin2hex(sym, prog->tag, sizeof(prog->tag));
- *sym = 0;
+ if (prog->aux->name[0])
+ snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name);
+ else
+ *sym = 0;
}
static __always_inline unsigned long
@@ -662,8 +671,6 @@ static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other,
fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL);
if (fp != NULL) {
- kmemcheck_annotate_bitfield(fp, meta);
-
/* aux->prog still points to the fp_other one, so
* when promoting the clone to the real program,
* this still needs to be adapted.
@@ -1367,7 +1374,13 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
* valid program, which in this case would simply not
* be JITed, but falls back to the interpreter.
*/
- fp = bpf_int_jit_compile(fp);
+ if (!bpf_prog_is_dev_bound(fp->aux)) {
+ fp = bpf_int_jit_compile(fp);
+ } else {
+ *err = bpf_prog_offload_compile(fp);
+ if (*err)
+ return fp;
+ }
bpf_prog_lock_ro(fp);
/* The tail call compatibility check can only be done at
@@ -1381,11 +1394,163 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
}
EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
+static unsigned int __bpf_prog_ret1(const void *ctx,
+ const struct bpf_insn *insn)
+{
+ return 1;
+}
+
+static struct bpf_prog_dummy {
+ struct bpf_prog prog;
+} dummy_bpf_prog = {
+ .prog = {
+ .bpf_func = __bpf_prog_ret1,
+ },
+};
+
+/* To avoid allocating an empty bpf_prog_array for cgroups that
+ * don't have a bpf program attached, use one global 'empty_prog_array'.
+ * It will not be modified by the caller of bpf_prog_array_alloc()
+ * (since the caller requested prog_cnt == 0), but that pointer should
+ * still be 'freed' by bpf_prog_array_free()
+ */
+static struct {
+ struct bpf_prog_array hdr;
+ struct bpf_prog *null_prog;
+} empty_prog_array = {
+ .null_prog = NULL,
+};
+
+struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
+{
+ if (prog_cnt)
+ return kzalloc(sizeof(struct bpf_prog_array) +
+ sizeof(struct bpf_prog *) * (prog_cnt + 1),
+ flags);
+
+ return &empty_prog_array.hdr;
+}
+
+void bpf_prog_array_free(struct bpf_prog_array __rcu *progs)
+{
+ if (!progs ||
+ progs == (struct bpf_prog_array __rcu *)&empty_prog_array.hdr)
+ return;
+ kfree_rcu(progs, rcu);
+}
+
+int bpf_prog_array_length(struct bpf_prog_array __rcu *progs)
+{
+ struct bpf_prog **prog;
+ u32 cnt = 0;
+
+ rcu_read_lock();
+ prog = rcu_dereference(progs)->progs;
+ for (; *prog; prog++)
+ cnt++;
+ rcu_read_unlock();
+ return cnt;
+}
+
+int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
+ __u32 __user *prog_ids, u32 cnt)
+{
+ struct bpf_prog **prog;
+ u32 i = 0, id;
+
+ rcu_read_lock();
+ prog = rcu_dereference(progs)->progs;
+ for (; *prog; prog++) {
+ id = (*prog)->aux->id;
+ if (copy_to_user(prog_ids + i, &id, sizeof(id))) {
+ rcu_read_unlock();
+ return -EFAULT;
+ }
+ if (++i == cnt) {
+ prog++;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ if (*prog)
+ return -ENOSPC;
+ return 0;
+}
+
+void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs,
+ struct bpf_prog *old_prog)
+{
+ struct bpf_prog **prog = progs->progs;
+
+ for (; *prog; prog++)
+ if (*prog == old_prog) {
+ WRITE_ONCE(*prog, &dummy_bpf_prog.prog);
+ break;
+ }
+}
+
+int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
+ struct bpf_prog *exclude_prog,
+ struct bpf_prog *include_prog,
+ struct bpf_prog_array **new_array)
+{
+ int new_prog_cnt, carry_prog_cnt = 0;
+ struct bpf_prog **existing_prog;
+ struct bpf_prog_array *array;
+ int new_prog_idx = 0;
+
+ /* Figure out how many existing progs we need to carry over to
+ * the new array.
+ */
+ if (old_array) {
+ existing_prog = old_array->progs;
+ for (; *existing_prog; existing_prog++) {
+ if (*existing_prog != exclude_prog &&
+ *existing_prog != &dummy_bpf_prog.prog)
+ carry_prog_cnt++;
+ if (*existing_prog == include_prog)
+ return -EEXIST;
+ }
+ }
+
+ /* How many progs (not NULL) will be in the new array? */
+ new_prog_cnt = carry_prog_cnt;
+ if (include_prog)
+ new_prog_cnt += 1;
+
+ /* Do we have any prog (not NULL) in the new array? */
+ if (!new_prog_cnt) {
+ *new_array = NULL;
+ return 0;
+ }
+
+ /* +1 as the end of prog_array is marked with NULL */
+ array = bpf_prog_array_alloc(new_prog_cnt + 1, GFP_KERNEL);
+ if (!array)
+ return -ENOMEM;
+
+ /* Fill in the new prog array */
+ if (carry_prog_cnt) {
+ existing_prog = old_array->progs;
+ for (; *existing_prog; existing_prog++)
+ if (*existing_prog != exclude_prog &&
+ *existing_prog != &dummy_bpf_prog.prog)
+ array->progs[new_prog_idx++] = *existing_prog;
+ }
+ if (include_prog)
+ array->progs[new_prog_idx++] = include_prog;
+ array->progs[new_prog_idx] = NULL;
+ *new_array = array;
+ return 0;
+}
+
static void bpf_prog_free_deferred(struct work_struct *work)
{
struct bpf_prog_aux *aux;
aux = container_of(work, struct bpf_prog_aux, work);
+ if (bpf_prog_is_dev_bound(aux))
+ bpf_prog_offload_destroy(aux->prog);
bpf_jit_free(aux->prog);
}
@@ -1498,5 +1663,8 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
+/* These are only used within the BPF_SYSCALL code */
+#ifdef CONFIG_BPF_SYSCALL
EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_get_type);
EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_put_rcu);
+#endif
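
core.c above introduces NULL-terminated bpf_prog_array objects plus helpers to allocate, measure, and copy them. The following is a minimal kernel-side sketch, not the actual BPF_PROG_RUN_ARRAY macro, of how such an array is meant to be walked by the cgroup hooks: under RCU, with an empty array behaving as "allow".

/* Sketch of walking a NULL-terminated bpf_prog_array under RCU,
 * mirroring what the BPF_PROG_RUN_ARRAY macro expands to.
 */
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/rcupdate.h>

static u32 run_prog_array(struct bpf_prog_array __rcu *array, void *ctx)
{
	struct bpf_prog **prog;
	u32 ret = 1;                  /* empty array means "allow" */

	rcu_read_lock();
	prog = rcu_dereference(array)->progs;
	for (; *prog; prog++)
		ret &= BPF_PROG_RUN(*prog, ctx);  /* stays 1 only if every prog returns 1 */
	rcu_read_unlock();
	return ret;
}
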
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
new file mode 100644
index 000000000000..ce5b669003b2
--- /dev/null
+++ b/kernel/bpf/cpumap.c
@@ -0,0 +1,706 @@
+/* bpf/cpumap.c
+ *
+ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
+ * Released under terms in GPL version 2. See COPYING.
+ */
+
+/* The 'cpumap' is primarily used as a backend map for XDP BPF helper
+ * call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'.
+ *
+ * Unlike devmap which redirects XDP frames out another NIC device,
+ * this map type redirects raw XDP frames to another CPU. The remote
+ * CPU will do SKB-allocation and call the normal network stack.
+ *
+ * This is a scalability and isolation mechanism that allows
+ * separating the early driver network XDP layer from the rest of the
+ * netstack, and assigning dedicated CPUs for this stage. This
+ * basically allows for 10G wirespeed pre-filtering via bpf.
+ */
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/ptr_ring.h>
+
+#include <linux/sched.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#include <linux/capability.h>
+#include <trace/events/xdp.h>
+
+#include <linux/netdevice.h> /* netif_receive_skb_core */
+#include <linux/etherdevice.h> /* eth_type_trans */
+
+/* General idea: XDP packets getting XDP redirected to another CPU
+ * will at most be stored/queued for one driver ->poll() call. It is
+ * guaranteed that setting the flush bit and the flush operation happen on
+ * the same CPU. Thus, the cpu_map_flush operation can deduce via this_cpu_ptr()
+ * which queue in bpf_cpu_map_entry contains packets.
+ */
+
+#define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */
+struct xdp_bulk_queue {
+ void *q[CPU_MAP_BULK_SIZE];
+ unsigned int count;
+};
+
+/* Struct for every remote "destination" CPU in map */
+struct bpf_cpu_map_entry {
+ u32 cpu; /* kthread CPU and map index */
+ int map_id; /* Back reference to map */
+ u32 qsize; /* Queue size placeholder for map lookup */
+
+ /* XDP can run multiple RX-ring queues, need __percpu enqueue store */
+ struct xdp_bulk_queue __percpu *bulkq;
+
+ /* Queue with potential multi-producers, and single-consumer kthread */
+ struct ptr_ring *queue;
+ struct task_struct *kthread;
+ struct work_struct kthread_stop_wq;
+
+ atomic_t refcnt; /* Control when this struct can be free'ed */
+ struct rcu_head rcu;
+};
+
+struct bpf_cpu_map {
+ struct bpf_map map;
+ /* Below members specific for map type */
+ struct bpf_cpu_map_entry **cpu_map;
+ unsigned long __percpu *flush_needed;
+};
+
+static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
+ struct xdp_bulk_queue *bq);
+
+static u64 cpu_map_bitmap_size(const union bpf_attr *attr)
+{
+ return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
+}
+
+static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_cpu_map *cmap;
+ int err = -ENOMEM;
+ u64 cost;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 || attr->key_size != 4 ||
+ attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+ return ERR_PTR(-EINVAL);
+
+ cmap = kzalloc(sizeof(*cmap), GFP_USER);
+ if (!cmap)
+ return ERR_PTR(-ENOMEM);
+
+ /* mandatory map attributes */
+ cmap->map.map_type = attr->map_type;
+ cmap->map.key_size = attr->key_size;
+ cmap->map.value_size = attr->value_size;
+ cmap->map.max_entries = attr->max_entries;
+ cmap->map.map_flags = attr->map_flags;
+ cmap->map.numa_node = bpf_map_attr_numa_node(attr);
+
+ /* Pre-limit array size based on NR_CPUS, not final CPU check */
+ if (cmap->map.max_entries > NR_CPUS) {
+ err = -E2BIG;
+ goto free_cmap;
+ }
+
+ /* make sure page count doesn't overflow */
+ cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *);
+ cost += cpu_map_bitmap_size(attr) * num_possible_cpus();
+ if (cost >= U32_MAX - PAGE_SIZE)
+ goto free_cmap;
+ cmap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+ /* Notice, returns -EPERM if map size is larger than memlock limit */
+ ret = bpf_map_precharge_memlock(cmap->map.pages);
+ if (ret) {
+ err = ret;
+ goto free_cmap;
+ }
+
+ /* A per cpu bitfield with a bit per possible CPU in map */
+ cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr),
+ __alignof__(unsigned long));
+ if (!cmap->flush_needed)
+ goto free_cmap;
+
+ /* Alloc array for possible remote "destination" CPUs */
+ cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
+ sizeof(struct bpf_cpu_map_entry *),
+ cmap->map.numa_node);
+ if (!cmap->cpu_map)
+ goto free_percpu;
+
+ return &cmap->map;
+free_percpu:
+ free_percpu(cmap->flush_needed);
+free_cmap:
+ kfree(cmap);
+ return ERR_PTR(err);
+}
+
+void __cpu_map_queue_destructor(void *ptr)
+{
+ /* The tear-down procedure should have made sure that queue is
+ * empty. See __cpu_map_entry_replace() and work-queue
+ * invoked cpu_map_kthread_stop(). Catch any broken behaviour
+ * gracefully and warn once.
+ */
+ if (WARN_ON_ONCE(ptr))
+ page_frag_free(ptr);
+}
+
+static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
+{
+ if (atomic_dec_and_test(&rcpu->refcnt)) {
+ /* The queue should be empty at this point */
+ ptr_ring_cleanup(rcpu->queue, __cpu_map_queue_destructor);
+ kfree(rcpu->queue);
+ kfree(rcpu);
+ }
+}
+
+static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
+{
+ atomic_inc(&rcpu->refcnt);
+}
+
+/* called from workqueue, to workaround syscall using preempt_disable */
+static void cpu_map_kthread_stop(struct work_struct *work)
+{
+ struct bpf_cpu_map_entry *rcpu;
+
+ rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq);
+
+ /* Wait for flush in __cpu_map_entry_free(), via full RCU barrier,
+ * as it waits until all in-flight call_rcu() callbacks complete.
+ */
+ rcu_barrier();
+
+ /* kthread_stop will wake_up_process and wait for it to complete */
+ kthread_stop(rcpu->kthread);
+}
+
+/* For now, xdp_pkt is a cpumap internal data structure, with info
+ * carried from enqueue to dequeue. It is mapped into the top
+ * headroom of the packet, to avoid allocating separate mem.
+ */
+struct xdp_pkt {
+ void *data;
+ u16 len;
+ u16 headroom;
+ u16 metasize;
+ struct net_device *dev_rx;
+};
+
+/* Convert xdp_buff to xdp_pkt */
+static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp)
+{
+ struct xdp_pkt *xdp_pkt;
+ int metasize;
+ int headroom;
+
+ /* Assure headroom is available for storing info */
+ headroom = xdp->data - xdp->data_hard_start;
+ metasize = xdp->data - xdp->data_meta;
+ metasize = metasize > 0 ? metasize : 0;
+ if (unlikely((headroom - metasize) < sizeof(*xdp_pkt)))
+ return NULL;
+
+ /* Store info in top of packet */
+ xdp_pkt = xdp->data_hard_start;
+
+ xdp_pkt->data = xdp->data;
+ xdp_pkt->len = xdp->data_end - xdp->data;
+ xdp_pkt->headroom = headroom - sizeof(*xdp_pkt);
+ xdp_pkt->metasize = metasize;
+
+ return xdp_pkt;
+}
+
+struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
+ struct xdp_pkt *xdp_pkt)
+{
+ unsigned int frame_size;
+ void *pkt_data_start;
+ struct sk_buff *skb;
+
+ * build_skb needs to place skb_shared_info after the SKB end, and
+ * also wants to know the memory "truesize". Thus, we need to
+ * know the memory frame size backing the xdp_buff.
+ *
+ * XDP was designed to have PAGE_SIZE frames, but this
+ * assumption is no longer true with ixgbe and i40e. It
+ * would be preferred to set frame_size to 2048 or 4096
+ * depending on the driver.
+ * frame_size = 2048;
+ * frame_len = frame_size - sizeof(*xdp_pkt);
+ *
+ * Instead, with the info available, skb_shared_info is placed after
+ * the packet len. This unfortunately fakes the truesize.
+ * Another disadvantage of this approach is that the skb_shared_info
+ * is not at a fixed memory location, which with mixed length
+ * packets is bad for cache-line hotness.
+ */
+ frame_size = SKB_DATA_ALIGN(xdp_pkt->len) + xdp_pkt->headroom +
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+ pkt_data_start = xdp_pkt->data - xdp_pkt->headroom;
+ skb = build_skb(pkt_data_start, frame_size);
+ if (!skb)
+ return NULL;
+
+ skb_reserve(skb, xdp_pkt->headroom);
+ __skb_put(skb, xdp_pkt->len);
+ if (xdp_pkt->metasize)
+ skb_metadata_set(skb, xdp_pkt->metasize);
+
+ /* Essential SKB info: protocol and skb->dev */
+ skb->protocol = eth_type_trans(skb, xdp_pkt->dev_rx);
+
+ /* Optional SKB info, currently missing:
+ * - HW checksum info (skb->ip_summed)
+ * - HW RX hash (skb_set_hash)
+ * - RX ring dev queue index (skb_record_rx_queue)
+ */
+
+ return skb;
+}
+
+static int cpu_map_kthread_run(void *data)
+{
+ struct bpf_cpu_map_entry *rcpu = data;
+
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ /* When the kthread is given a stop order, rcpu has been disconnected
+ * from the map, thus no new packets can enter. Remaining in-flight
+ * per-CPU stored packets are flushed to this queue. Wait, honoring the
+ * kthread_stop signal, until the queue is empty.
+ */
+ while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
+ unsigned int processed = 0, drops = 0, sched = 0;
+ struct xdp_pkt *xdp_pkt;
+
+ /* Release CPU reschedule checks */
+ if (__ptr_ring_empty(rcpu->queue)) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ /* Recheck to avoid lost wake-up */
+ if (__ptr_ring_empty(rcpu->queue)) {
+ schedule();
+ sched = 1;
+ } else {
+ __set_current_state(TASK_RUNNING);
+ }
+ } else {
+ sched = cond_resched();
+ }
+
+ /* Process packets in rcpu->queue */
+ local_bh_disable();
+ /*
+ * The bpf_cpu_map_entry is single consumer, with this
+ * kthread pinned to a CPU. Lockless access to the ptr_ring
+ * consume side is valid, as no resize of the queue is allowed.
+ */
+ while ((xdp_pkt = __ptr_ring_consume(rcpu->queue))) {
+ struct sk_buff *skb;
+ int ret;
+
+ skb = cpu_map_build_skb(rcpu, xdp_pkt);
+ if (!skb) {
+ page_frag_free(xdp_pkt);
+ continue;
+ }
+
+ /* Inject into network stack */
+ ret = netif_receive_skb_core(skb);
+ if (ret == NET_RX_DROP)
+ drops++;
+
+ /* Limit BH-disable period */
+ if (++processed == 8)
+ break;
+ }
+ /* Feedback loop via tracepoint */
+ trace_xdp_cpumap_kthread(rcpu->map_id, processed, drops, sched);
+
+ local_bh_enable(); /* resched point, may call do_softirq() */
+ }
+ __set_current_state(TASK_RUNNING);
+
+ put_cpu_map_entry(rcpu);
+ return 0;
+}
+
+struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id)
+{
+ gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN;
+ struct bpf_cpu_map_entry *rcpu;
+ int numa, err;
+
+ /* Have map->numa_node, but choose node of redirect target CPU */
+ numa = cpu_to_node(cpu);
+
+ rcpu = kzalloc_node(sizeof(*rcpu), gfp, numa);
+ if (!rcpu)
+ return NULL;
+
+ /* Alloc percpu bulkq */
+ rcpu->bulkq = __alloc_percpu_gfp(sizeof(*rcpu->bulkq),
+ sizeof(void *), gfp);
+ if (!rcpu->bulkq)
+ goto free_rcu;
+
+ /* Alloc queue */
+ rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa);
+ if (!rcpu->queue)
+ goto free_bulkq;
+
+ err = ptr_ring_init(rcpu->queue, qsize, gfp);
+ if (err)
+ goto free_queue;
+
+ rcpu->cpu = cpu;
+ rcpu->map_id = map_id;
+ rcpu->qsize = qsize;
+
+ /* Setup kthread */
+ rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
+ "cpumap/%d/map:%d", cpu, map_id);
+ if (IS_ERR(rcpu->kthread))
+ goto free_ptr_ring;
+
+ get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */
+ get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */
+
+ /* Make sure kthread runs on a single CPU */
+ kthread_bind(rcpu->kthread, cpu);
+ wake_up_process(rcpu->kthread);
+
+ return rcpu;
+
+free_ptr_ring:
+ ptr_ring_cleanup(rcpu->queue, NULL);
+free_queue:
+ kfree(rcpu->queue);
+free_bulkq:
+ free_percpu(rcpu->bulkq);
+free_rcu:
+ kfree(rcpu);
+ return NULL;
+}
+
+void __cpu_map_entry_free(struct rcu_head *rcu)
+{
+ struct bpf_cpu_map_entry *rcpu;
+ int cpu;
+
+ /* This cpu_map_entry has been disconnected from the map and one
+ * RCU grace period has elapsed. Thus, XDP cannot queue any
+ * new packets and cannot change/set flush_needed that can
+ * find this entry.
+ */
+ rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu);
+
+ /* Flush remaining packets in percpu bulkq */
+ for_each_online_cpu(cpu) {
+ struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu);
+
+ /* No concurrent bq_enqueue can run at this point */
+ bq_flush_to_queue(rcpu, bq);
+ }
+ free_percpu(rcpu->bulkq);
+ /* Cannot kthread_stop() here, last put free rcpu resources */
+ put_cpu_map_entry(rcpu);
+}
+
+/* After xchg of the pointer to bpf_cpu_map_entry, use call_rcu() to
+ * ensure any driver rcu critical sections have completed; this
+ * does not guarantee a flush has happened yet, because driver-side
+ * rcu_read_lock/unlock only protects the running XDP program. The
+ * atomic xchg and NULL-ptr check in __cpu_map_flush() make sure a
+ * pending flush op doesn't fail.
+ *
+ * The bpf_cpu_map_entry is still used by the kthread, and there can
+ * still be pending packets (in queue and percpu bulkq). A refcnt
+ * makes sure the last user (kthread_stop vs. call_rcu) frees the memory
+ * resources.
+ *
+ * The rcu callback __cpu_map_entry_free flushes remaining packets in
+ * the percpu bulkq to the queue. Because the caller map_delete_elem()
+ * disables preemption, we cannot call kthread_stop() to make sure the
+ * queue is empty. Instead a work_queue is started for stopping the
+ * kthread, cpu_map_kthread_stop, which waits for an RCU grace period
+ * before stopping the kthread, emptying the queue.
+ */
+void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
+ u32 key_cpu, struct bpf_cpu_map_entry *rcpu)
+{
+ struct bpf_cpu_map_entry *old_rcpu;
+
+ old_rcpu = xchg(&cmap->cpu_map[key_cpu], rcpu);
+ if (old_rcpu) {
+ call_rcu(&old_rcpu->rcu, __cpu_map_entry_free);
+ INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop);
+ schedule_work(&old_rcpu->kthread_stop_wq);
+ }
+}
+
+int cpu_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ u32 key_cpu = *(u32 *)key;
+
+ if (key_cpu >= map->max_entries)
+ return -EINVAL;
+
+ /* notice caller map_delete_elem() use preempt_disable() */
+ __cpu_map_entry_replace(cmap, key_cpu, NULL);
+ return 0;
+}
+
+int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ struct bpf_cpu_map_entry *rcpu;
+
+ /* Array index key correspond to CPU number */
+ u32 key_cpu = *(u32 *)key;
+ /* Value is the queue size */
+ u32 qsize = *(u32 *)value;
+
+ if (unlikely(map_flags > BPF_EXIST))
+ return -EINVAL;
+ if (unlikely(key_cpu >= cmap->map.max_entries))
+ return -E2BIG;
+ if (unlikely(map_flags == BPF_NOEXIST))
+ return -EEXIST;
+ if (unlikely(qsize > 16384)) /* sanity limit on qsize */
+ return -EOVERFLOW;
+
+ /* Make sure CPU is a valid possible cpu */
+ if (!cpu_possible(key_cpu))
+ return -ENODEV;
+
+ if (qsize == 0) {
+ rcpu = NULL; /* Same as deleting */
+ } else {
+ /* Updating qsize cause re-allocation of bpf_cpu_map_entry */
+ rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id);
+ if (!rcpu)
+ return -ENOMEM;
+ }
+ rcu_read_lock();
+ __cpu_map_entry_replace(cmap, key_cpu, rcpu);
+ rcu_read_unlock();
+ return 0;
+}
+
+void cpu_map_free(struct bpf_map *map)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ int cpu;
+ u32 i;
+
+ /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+ * so the bpf programs (can be more than one that used this map) were
+ * disconnected from events. Wait for outstanding critical sections in
+ * these programs to complete. The rcu critical section only guarantees
+ * no further "XDP/bpf-side" reads against bpf_cpu_map->cpu_map.
+ * It does __not__ ensure pending flush operations (if any) are
+ * complete.
+ */
+ synchronize_rcu();
+
+ /* To ensure all pending flush operations have completed wait for flush
+ * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
+ * Because the above synchronize_rcu() ensures the map is disconnected
+ * from the program we can assume no new bits will be set.
+ */
+ for_each_online_cpu(cpu) {
+ unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu);
+
+ while (!bitmap_empty(bitmap, cmap->map.max_entries))
+ cond_resched();
+ }
+
+ /* For cpu_map the remote CPUs can still be using the entries
+ * (struct bpf_cpu_map_entry).
+ */
+ for (i = 0; i < cmap->map.max_entries; i++) {
+ struct bpf_cpu_map_entry *rcpu;
+
+ rcpu = READ_ONCE(cmap->cpu_map[i]);
+ if (!rcpu)
+ continue;
+
+ /* bq flush and cleanup happen after an RCU grace period */
+ __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
+ }
+ free_percpu(cmap->flush_needed);
+ bpf_map_area_free(cmap->cpu_map);
+ kfree(cmap);
+}
+
+struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ struct bpf_cpu_map_entry *rcpu;
+
+ if (key >= map->max_entries)
+ return NULL;
+
+ rcpu = READ_ONCE(cmap->cpu_map[key]);
+ return rcpu;
+}
+
+static void *cpu_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_cpu_map_entry *rcpu =
+ __cpu_map_lookup_elem(map, *(u32 *)key);
+
+ return rcpu ? &rcpu->qsize : NULL;
+}
+
+static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ u32 index = key ? *(u32 *)key : U32_MAX;
+ u32 *next = next_key;
+
+ if (index >= cmap->map.max_entries) {
+ *next = 0;
+ return 0;
+ }
+
+ if (index == cmap->map.max_entries - 1)
+ return -ENOENT;
+ *next = index + 1;
+ return 0;
+}
+
+const struct bpf_map_ops cpu_map_ops = {
+ .map_alloc = cpu_map_alloc,
+ .map_free = cpu_map_free,
+ .map_delete_elem = cpu_map_delete_elem,
+ .map_update_elem = cpu_map_update_elem,
+ .map_lookup_elem = cpu_map_lookup_elem,
+ .map_get_next_key = cpu_map_get_next_key,
+};
+
+static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
+ struct xdp_bulk_queue *bq)
+{
+ unsigned int processed = 0, drops = 0;
+ const int to_cpu = rcpu->cpu;
+ struct ptr_ring *q;
+ int i;
+
+ if (unlikely(!bq->count))
+ return 0;
+
+ q = rcpu->queue;
+ spin_lock(&q->producer_lock);
+
+ for (i = 0; i < bq->count; i++) {
+ void *xdp_pkt = bq->q[i];
+ int err;
+
+ err = __ptr_ring_produce(q, xdp_pkt);
+ if (err) {
+ drops++;
+ page_frag_free(xdp_pkt); /* Free xdp_pkt */
+ }
+ processed++;
+ }
+ bq->count = 0;
+ spin_unlock(&q->producer_lock);
+
+ /* Feedback loop via tracepoints */
+ trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu);
+ return 0;
+}
+
+/* Runs under RCU-read-side, plus in softirq under NAPI protection.
+ * Thus, safe percpu variable access.
+ */
+static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt)
+{
+ struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
+
+ if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
+ bq_flush_to_queue(rcpu, bq);
+
+ /* Notice, the xdp_buff/page MUST be queued here, long enough for
+ * the driver code invoking us to finish, due to driver
+ * (e.g. ixgbe) recycle tricks based on page-refcnt.
+ *
+ * Thus, the incoming xdp_pkt is always queued here (else we race
+ * with another CPU on page-refcnt and remaining driver code).
+ * Queue time is very short, as the driver will invoke the flush
+ * operation when completing the napi->poll call.
+ */
+ bq->q[bq->count++] = xdp_pkt;
+ return 0;
+}
+
+int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
+ struct net_device *dev_rx)
+{
+ struct xdp_pkt *xdp_pkt;
+
+ xdp_pkt = convert_to_xdp_pkt(xdp);
+ if (unlikely(!xdp_pkt))
+ return -EOVERFLOW;
+
+ /* Info needed when constructing SKB on remote CPU */
+ xdp_pkt->dev_rx = dev_rx;
+
+ bq_enqueue(rcpu, xdp_pkt);
+ return 0;
+}
+
+void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
+
+ __set_bit(bit, bitmap);
+}
+
+void __cpu_map_flush(struct bpf_map *map)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
+ u32 bit;
+
+ /* The napi->poll softirq makes sure __cpu_map_insert_ctx()
+ * and __cpu_map_flush() happen on the same CPU. Thus, the percpu
+ * bitmap indicates which percpu bulkqs have packets.
+ */
+ for_each_set_bit(bit, bitmap, map->max_entries) {
+ struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]);
+ struct xdp_bulk_queue *bq;
+
+ /* This is possible if entry is removed by user space
+ * between xdp redirect and flush op.
+ */
+ if (unlikely(!rcpu))
+ continue;
+
+ __clear_bit(bit, bitmap);
+
+ /* Flush all frames in bulkq to real queue */
+ bq = this_cpu_ptr(rcpu->bulkq);
+ bq_flush_to_queue(rcpu, bq);
+
+ /* If already running, costs spin_lock_irqsave + smp_mb */
+ wake_up_process(rcpu->kthread);
+ }
+}
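The enqueue/flush comments above describe an ordering contract rather than an API a driver open-codes, so here is a minimal sketch of that contract. It assumes kernel context (linux/bpf.h); napi_poll_sketch(), idx and bufs are hypothetical names, and in the tree the per-packet steps are actually driven by the XDP redirect core.

	#include <linux/bpf.h>
	#include <linux/netdevice.h>

	/* Sketch only: per-packet enqueue and the once-per-poll flush must run
	 * in the same napi->poll invocation on the same CPU, which is what keeps
	 * the percpu flush_needed bitmap and bulk queues consistent.
	 */
	static int napi_poll_sketch(struct bpf_map *map, u32 idx,
				    struct xdp_buff **bufs, int n,
				    struct net_device *dev_rx)
	{
		struct bpf_cpu_map_entry *rcpu;
		int i;

		rcpu = __cpu_map_lookup_elem(map, idx);
		if (!rcpu)
			return 0;	/* entry removed by user space */

		for (i = 0; i < n; i++) {
			/* per packet: mark this map index dirty and bulk-enqueue */
			__cpu_map_insert_ctx(map, idx);
			cpu_map_enqueue(rcpu, bufs[i], dev_rx);
		}

		/* once per poll: drain the dirty bulk queues and wake the kthreads */
		__cpu_map_flush(map);
		return n;
	}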
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index e745d6a88224..ebdef54bf7df 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -50,6 +50,9 @@
#include <linux/bpf.h>
#include <linux/filter.h>
+#define DEV_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
struct bpf_dtab_netdev {
struct net_device *dev;
struct bpf_dtab *dtab;
@@ -83,7 +86,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
- attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+ attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
dtab = kzalloc(sizeof(*dtab), GFP_USER);
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
new file mode 100644
index 000000000000..e682850c9715
--- /dev/null
+++ b/kernel/bpf/disasm.c
@@ -0,0 +1,214 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/bpf.h>
+
+#include "disasm.h"
+
+#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x)
+static const char * const func_id_str[] = {
+ __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN)
+};
+#undef __BPF_FUNC_STR_FN
+
+const char *func_id_name(int id)
+{
+ BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID);
+
+ if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id])
+ return func_id_str[id];
+ else
+ return "unknown";
+}
+
+const char *const bpf_class_string[8] = {
+ [BPF_LD] = "ld",
+ [BPF_LDX] = "ldx",
+ [BPF_ST] = "st",
+ [BPF_STX] = "stx",
+ [BPF_ALU] = "alu",
+ [BPF_JMP] = "jmp",
+ [BPF_RET] = "BUG",
+ [BPF_ALU64] = "alu64",
+};
+
+const char *const bpf_alu_string[16] = {
+ [BPF_ADD >> 4] = "+=",
+ [BPF_SUB >> 4] = "-=",
+ [BPF_MUL >> 4] = "*=",
+ [BPF_DIV >> 4] = "/=",
+ [BPF_OR >> 4] = "|=",
+ [BPF_AND >> 4] = "&=",
+ [BPF_LSH >> 4] = "<<=",
+ [BPF_RSH >> 4] = ">>=",
+ [BPF_NEG >> 4] = "neg",
+ [BPF_MOD >> 4] = "%=",
+ [BPF_XOR >> 4] = "^=",
+ [BPF_MOV >> 4] = "=",
+ [BPF_ARSH >> 4] = "s>>=",
+ [BPF_END >> 4] = "endian",
+};
+
+static const char *const bpf_ldst_string[] = {
+ [BPF_W >> 3] = "u32",
+ [BPF_H >> 3] = "u16",
+ [BPF_B >> 3] = "u8",
+ [BPF_DW >> 3] = "u64",
+};
+
+static const char *const bpf_jmp_string[16] = {
+ [BPF_JA >> 4] = "jmp",
+ [BPF_JEQ >> 4] = "==",
+ [BPF_JGT >> 4] = ">",
+ [BPF_JLT >> 4] = "<",
+ [BPF_JGE >> 4] = ">=",
+ [BPF_JLE >> 4] = "<=",
+ [BPF_JSET >> 4] = "&",
+ [BPF_JNE >> 4] = "!=",
+ [BPF_JSGT >> 4] = "s>",
+ [BPF_JSLT >> 4] = "s<",
+ [BPF_JSGE >> 4] = "s>=",
+ [BPF_JSLE >> 4] = "s<=",
+ [BPF_CALL >> 4] = "call",
+ [BPF_EXIT >> 4] = "exit",
+};
+
+static void print_bpf_end_insn(bpf_insn_print_cb verbose,
+ struct bpf_verifier_env *env,
+ const struct bpf_insn *insn)
+{
+ verbose(env, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg,
+ BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le",
+ insn->imm, insn->dst_reg);
+}
+
+void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env,
+ const struct bpf_insn *insn, bool allow_ptr_leaks)
+{
+ u8 class = BPF_CLASS(insn->code);
+
+ if (class == BPF_ALU || class == BPF_ALU64) {
+ if (BPF_OP(insn->code) == BPF_END) {
+ if (class == BPF_ALU64)
+ verbose(env, "BUG_alu64_%02x\n", insn->code);
+ else
+ print_bpf_end_insn(verbose, env, insn);
+ } else if (BPF_OP(insn->code) == BPF_NEG) {
+ verbose(env, "(%02x) r%d = %s-r%d\n",
+ insn->code, insn->dst_reg,
+ class == BPF_ALU ? "(u32) " : "",
+ insn->dst_reg);
+ } else if (BPF_SRC(insn->code) == BPF_X) {
+ verbose(env, "(%02x) %sr%d %s %sr%d\n",
+ insn->code, class == BPF_ALU ? "(u32) " : "",
+ insn->dst_reg,
+ bpf_alu_string[BPF_OP(insn->code) >> 4],
+ class == BPF_ALU ? "(u32) " : "",
+ insn->src_reg);
+ } else {
+ verbose(env, "(%02x) %sr%d %s %s%d\n",
+ insn->code, class == BPF_ALU ? "(u32) " : "",
+ insn->dst_reg,
+ bpf_alu_string[BPF_OP(insn->code) >> 4],
+ class == BPF_ALU ? "(u32) " : "",
+ insn->imm);
+ }
+ } else if (class == BPF_STX) {
+ if (BPF_MODE(insn->code) == BPF_MEM)
+ verbose(env, "(%02x) *(%s *)(r%d %+d) = r%d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg,
+ insn->off, insn->src_reg);
+ else if (BPF_MODE(insn->code) == BPF_XADD)
+ verbose(env, "(%02x) lock *(%s *)(r%d %+d) += r%d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg, insn->off,
+ insn->src_reg);
+ else
+ verbose(env, "BUG_%02x\n", insn->code);
+ } else if (class == BPF_ST) {
+ if (BPF_MODE(insn->code) != BPF_MEM) {
+ verbose(env, "BUG_st_%02x\n", insn->code);
+ return;
+ }
+ verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg,
+ insn->off, insn->imm);
+ } else if (class == BPF_LDX) {
+ if (BPF_MODE(insn->code) != BPF_MEM) {
+ verbose(env, "BUG_ldx_%02x\n", insn->code);
+ return;
+ }
+ verbose(env, "(%02x) r%d = *(%s *)(r%d %+d)\n",
+ insn->code, insn->dst_reg,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->src_reg, insn->off);
+ } else if (class == BPF_LD) {
+ if (BPF_MODE(insn->code) == BPF_ABS) {
+ verbose(env, "(%02x) r0 = *(%s *)skb[%d]\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->imm);
+ } else if (BPF_MODE(insn->code) == BPF_IND) {
+ verbose(env, "(%02x) r0 = *(%s *)skb[r%d + %d]\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->src_reg, insn->imm);
+ } else if (BPF_MODE(insn->code) == BPF_IMM &&
+ BPF_SIZE(insn->code) == BPF_DW) {
+ /* At this point, we already made sure that the second
+ * part of the ldimm64 insn is accessible.
+ */
+ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
+ bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD;
+
+ if (map_ptr && !allow_ptr_leaks)
+ imm = 0;
+
+ verbose(env, "(%02x) r%d = 0x%llx\n", insn->code,
+ insn->dst_reg, (unsigned long long)imm);
+ } else {
+ verbose(env, "BUG_ld_%02x\n", insn->code);
+ return;
+ }
+ } else if (class == BPF_JMP) {
+ u8 opcode = BPF_OP(insn->code);
+
+ if (opcode == BPF_CALL) {
+ verbose(env, "(%02x) call %s#%d\n", insn->code,
+ func_id_name(insn->imm), insn->imm);
+ } else if (insn->code == (BPF_JMP | BPF_JA)) {
+ verbose(env, "(%02x) goto pc%+d\n",
+ insn->code, insn->off);
+ } else if (insn->code == (BPF_JMP | BPF_EXIT)) {
+ verbose(env, "(%02x) exit\n", insn->code);
+ } else if (BPF_SRC(insn->code) == BPF_X) {
+ verbose(env, "(%02x) if r%d %s r%d goto pc%+d\n",
+ insn->code, insn->dst_reg,
+ bpf_jmp_string[BPF_OP(insn->code) >> 4],
+ insn->src_reg, insn->off);
+ } else {
+ verbose(env, "(%02x) if r%d %s 0x%x goto pc%+d\n",
+ insn->code, insn->dst_reg,
+ bpf_jmp_string[BPF_OP(insn->code) >> 4],
+ insn->imm, insn->off);
+ }
+ } else {
+ verbose(env, "(%02x) %s\n",
+ insn->code, bpf_class_string[class]);
+ }
+}
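For reference, a few expected log lines, worked out by hand from the format strings in print_bpf_insn() above rather than captured from a running verifier; the BPF_* instruction macros are the usual ones from linux/filter.h.

	#include <linux/filter.h>

	/* Hand-derived from the format strings above:
	 * BPF_MOV64_IMM  -> code 0xb7, printed as "(b7) r1 = 2"
	 * BPF_LDX_MEM    -> code 0x61, printed as "(61) r2 = *(u32 *)(r1 +4)"
	 * BPF_EXIT_INSN  -> code 0x95, printed as "(95) exit"
	 */
	static const struct bpf_insn disasm_example[] = {
		BPF_MOV64_IMM(BPF_REG_1, 2),
		BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 4),
		BPF_EXIT_INSN(),
	};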
diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h
new file mode 100644
index 000000000000..8de977e420b6
--- /dev/null
+++ b/kernel/bpf/disasm.h
@@ -0,0 +1,32 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef __BPF_DISASM_H__
+#define __BPF_DISASM_H__
+
+#include <linux/bpf.h>
+#include <linux/kernel.h>
+#include <linux/stringify.h>
+
+extern const char *const bpf_alu_string[16];
+extern const char *const bpf_class_string[8];
+
+const char *func_id_name(int id);
+
+struct bpf_verifier_env;
+typedef void (*bpf_insn_print_cb)(struct bpf_verifier_env *env,
+ const char *, ...);
+void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env,
+ const struct bpf_insn *insn, bool allow_ptr_leaks);
+
+#endif
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 6533f08d1238..e469e05c8e83 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -18,8 +18,9 @@
#include "bpf_lru_list.h"
#include "map_in_map.h"
-#define HTAB_CREATE_FLAG_MASK \
- (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE)
+#define HTAB_CREATE_FLAG_MASK \
+ (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \
+ BPF_F_RDONLY | BPF_F_WRONLY)
struct bucket {
struct hlist_nulls_head head;
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index be1dde967208..01aaef1a77c5 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -295,7 +295,7 @@ out:
}
static void *bpf_obj_do_get(const struct filename *pathname,
- enum bpf_type *type)
+ enum bpf_type *type, int flags)
{
struct inode *inode;
struct path path;
@@ -307,7 +307,7 @@ static void *bpf_obj_do_get(const struct filename *pathname,
return ERR_PTR(ret);
inode = d_backing_inode(path.dentry);
- ret = inode_permission(inode, MAY_WRITE);
+ ret = inode_permission(inode, ACC_MODE(flags));
if (ret)
goto out;
@@ -326,18 +326,23 @@ out:
return ERR_PTR(ret);
}
-int bpf_obj_get_user(const char __user *pathname)
+int bpf_obj_get_user(const char __user *pathname, int flags)
{
enum bpf_type type = BPF_TYPE_UNSPEC;
struct filename *pname;
int ret = -ENOENT;
+ int f_flags;
void *raw;
+ f_flags = bpf_get_file_flag(flags);
+ if (f_flags < 0)
+ return f_flags;
+
pname = getname(pathname);
if (IS_ERR(pname))
return PTR_ERR(pname);
- raw = bpf_obj_do_get(pname, &type);
+ raw = bpf_obj_do_get(pname, &type, f_flags);
if (IS_ERR(raw)) {
ret = PTR_ERR(raw);
goto out;
@@ -346,7 +351,7 @@ int bpf_obj_get_user(const char __user *pathname)
if (type == BPF_TYPE_PROG)
ret = bpf_prog_new_fd(raw);
else if (type == BPF_TYPE_MAP)
- ret = bpf_map_new_fd(raw);
+ ret = bpf_map_new_fd(raw, f_flags);
else
goto out;
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 1b767844a76f..885e45479680 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -389,10 +389,99 @@ out:
return ret;
}
-static int trie_delete_elem(struct bpf_map *map, void *key)
+/* Called from syscall or from eBPF program */
+static int trie_delete_elem(struct bpf_map *map, void *_key)
{
- /* TODO */
- return -ENOSYS;
+ struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+ struct bpf_lpm_trie_key *key = _key;
+ struct lpm_trie_node __rcu **trim, **trim2;
+ struct lpm_trie_node *node, *parent;
+ unsigned long irq_flags;
+ unsigned int next_bit;
+ size_t matchlen = 0;
+ int ret = 0;
+
+ if (key->prefixlen > trie->max_prefixlen)
+ return -EINVAL;
+
+ raw_spin_lock_irqsave(&trie->lock, irq_flags);
+
+ /* Walk the tree looking for an exact key/length match and keeping
+ * track of the path we traverse. We will need to know the node
+ * we wish to delete, and the slot that points to the node we want
+ * to delete. We may also need to know the node's parent and the
+ * slot that contains it.
+ */
+ trim = &trie->root;
+ trim2 = trim;
+ parent = NULL;
+ while ((node = rcu_dereference_protected(
+ *trim, lockdep_is_held(&trie->lock)))) {
+ matchlen = longest_prefix_match(trie, node, key);
+
+ if (node->prefixlen != matchlen ||
+ node->prefixlen == key->prefixlen)
+ break;
+
+ parent = node;
+ trim2 = trim;
+ next_bit = extract_bit(key->data, node->prefixlen);
+ trim = &node->child[next_bit];
+ }
+
+ if (!node || node->prefixlen != key->prefixlen ||
+ (node->flags & LPM_TREE_NODE_FLAG_IM)) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ trie->n_entries--;
+
+ /* If the node we are removing has two children, simply mark it
+ * as intermediate and we are done.
+ */
+ if (rcu_access_pointer(node->child[0]) &&
+ rcu_access_pointer(node->child[1])) {
+ node->flags |= LPM_TREE_NODE_FLAG_IM;
+ goto out;
+ }
+
+ /* If the parent of the node we are about to delete is an intermediate
+ * node, and the deleted node doesn't have any children, we can delete
+ * the intermediate parent as well and promote its other child
+ * up the tree. Doing this maintains the invariant that all
+ * intermediate nodes have exactly 2 children and that there are no
+ * unnecessary intermediate nodes in the tree.
+ */
+ if (parent && (parent->flags & LPM_TREE_NODE_FLAG_IM) &&
+ !node->child[0] && !node->child[1]) {
+ if (node == rcu_access_pointer(parent->child[0]))
+ rcu_assign_pointer(
+ *trim2, rcu_access_pointer(parent->child[1]));
+ else
+ rcu_assign_pointer(
+ *trim2, rcu_access_pointer(parent->child[0]));
+ kfree_rcu(parent, rcu);
+ kfree_rcu(node, rcu);
+ goto out;
+ }
+
+ /* The node we are removing has either zero or one child. If there
+ * is a child, move it into the removed node's slot then delete
+ * the node. Otherwise just clear the slot and delete the node.
+ */
+ if (node->child[0])
+ rcu_assign_pointer(*trim, rcu_access_pointer(node->child[0]));
+ else if (node->child[1])
+ rcu_assign_pointer(*trim, rcu_access_pointer(node->child[1]));
+ else
+ RCU_INIT_POINTER(*trim, NULL);
+ kfree_rcu(node, rcu);
+
+out:
+ raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
+
+ return ret;
}
#define LPM_DATA_SIZE_MAX 256
@@ -406,7 +495,8 @@ static int trie_delete_elem(struct bpf_map *map, void *key)
#define LPM_KEY_SIZE_MAX LPM_KEY_SIZE(LPM_DATA_SIZE_MAX)
#define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN)
-#define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE)
+#define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE | \
+ BPF_F_RDONLY | BPF_F_WRONLY)
static struct bpf_map *trie_alloc(union bpf_attr *attr)
{
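Since trie_delete_elem() is now reachable from the syscall, a short user-space sketch of exercising it: the key layout (prefixlen followed by address bytes) follows the uapi struct bpf_lpm_trie_key, while the map fd and the bpf_map_delete_elem() wrapper from tools/lib/bpf are assumed to be available.

	#include <linux/types.h>
	#include <errno.h>
	#include "bpf.h"	/* tools/lib/bpf wrapper, assumed available */

	struct lpm_key_v4 {
		__u32 prefixlen;
		__u8  data[4];	/* IPv4 address bytes, matching bpf_lpm_trie_key */
	};

	static int delete_prefix(int lpm_map_fd)
	{
		struct lpm_key_v4 key = {
			.prefixlen = 24,
			.data = { 192, 168, 0, 0 },	/* delete 192.168.0.0/24 */
		};

		/* 0 on success; -1 with errno == ENOENT when there is no exact
		 * prefix/length match, mirroring the -ENOENT path above.
		 */
		return bpf_map_delete_elem(lpm_map_fd, &key);
	}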
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
new file mode 100644
index 000000000000..68ec884440b7
--- /dev/null
+++ b/kernel/bpf/offload.c
@@ -0,0 +1,191 @@
+#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bug.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/printk.h>
+#include <linux/rtnetlink.h>
+
+/* protected by RTNL */
+static LIST_HEAD(bpf_prog_offload_devs);
+
+int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
+{
+ struct net *net = current->nsproxy->net_ns;
+ struct bpf_dev_offload *offload;
+
+ if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS &&
+ attr->prog_type != BPF_PROG_TYPE_XDP)
+ return -EINVAL;
+
+ if (attr->prog_flags)
+ return -EINVAL;
+
+ offload = kzalloc(sizeof(*offload), GFP_USER);
+ if (!offload)
+ return -ENOMEM;
+
+ offload->prog = prog;
+ init_waitqueue_head(&offload->verifier_done);
+
+ rtnl_lock();
+ offload->netdev = __dev_get_by_index(net, attr->prog_ifindex);
+ if (!offload->netdev) {
+ rtnl_unlock();
+ kfree(offload);
+ return -EINVAL;
+ }
+
+ prog->aux->offload = offload;
+ list_add_tail(&offload->offloads, &bpf_prog_offload_devs);
+ rtnl_unlock();
+
+ return 0;
+}
+
+static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd,
+ struct netdev_bpf *data)
+{
+ struct net_device *netdev = prog->aux->offload->netdev;
+
+ ASSERT_RTNL();
+
+ if (!netdev)
+ return -ENODEV;
+ if (!netdev->netdev_ops->ndo_bpf)
+ return -EOPNOTSUPP;
+
+ data->command = cmd;
+
+ return netdev->netdev_ops->ndo_bpf(netdev, data);
+}
+
+int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env)
+{
+ struct netdev_bpf data = {};
+ int err;
+
+ data.verifier.prog = env->prog;
+
+ rtnl_lock();
+ err = __bpf_offload_ndo(env->prog, BPF_OFFLOAD_VERIFIER_PREP, &data);
+ if (err)
+ goto exit_unlock;
+
+ env->dev_ops = data.verifier.ops;
+
+ env->prog->aux->offload->dev_state = true;
+ env->prog->aux->offload->verifier_running = true;
+exit_unlock:
+ rtnl_unlock();
+ return err;
+}
+
+static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
+{
+ struct bpf_dev_offload *offload = prog->aux->offload;
+ struct netdev_bpf data = {};
+
+ /* Caution - if netdev is destroyed before the program, this function
+ * will be called twice.
+ */
+
+ data.offload.prog = prog;
+
+ if (offload->verifier_running)
+ wait_event(offload->verifier_done, !offload->verifier_running);
+
+ if (offload->dev_state)
+ WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data));
+
+ offload->dev_state = false;
+ list_del_init(&offload->offloads);
+ offload->netdev = NULL;
+}
+
+void bpf_prog_offload_destroy(struct bpf_prog *prog)
+{
+ struct bpf_dev_offload *offload = prog->aux->offload;
+
+ offload->verifier_running = false;
+ wake_up(&offload->verifier_done);
+
+ rtnl_lock();
+ __bpf_prog_offload_destroy(prog);
+ rtnl_unlock();
+
+ kfree(offload);
+}
+
+static int bpf_prog_offload_translate(struct bpf_prog *prog)
+{
+ struct bpf_dev_offload *offload = prog->aux->offload;
+ struct netdev_bpf data = {};
+ int ret;
+
+ data.offload.prog = prog;
+
+ offload->verifier_running = false;
+ wake_up(&offload->verifier_done);
+
+ rtnl_lock();
+ ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_TRANSLATE, &data);
+ rtnl_unlock();
+
+ return ret;
+}
+
+static unsigned int bpf_prog_warn_on_exec(const void *ctx,
+ const struct bpf_insn *insn)
+{
+ WARN(1, "attempt to execute device eBPF program on the host!");
+ return 0;
+}
+
+int bpf_prog_offload_compile(struct bpf_prog *prog)
+{
+ prog->bpf_func = bpf_prog_warn_on_exec;
+
+ return bpf_prog_offload_translate(prog);
+}
+
+const struct bpf_prog_ops bpf_offload_prog_ops = {
+};
+
+static int bpf_offload_notification(struct notifier_block *notifier,
+ ulong event, void *ptr)
+{
+ struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+ struct bpf_dev_offload *offload, *tmp;
+
+ ASSERT_RTNL();
+
+ switch (event) {
+ case NETDEV_UNREGISTER:
+ /* ignore namespace changes */
+ if (netdev->reg_state != NETREG_UNREGISTERING)
+ break;
+
+ list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs,
+ offloads) {
+ if (offload->netdev == netdev)
+ __bpf_prog_offload_destroy(offload->prog);
+ }
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block bpf_offload_notifier = {
+ .notifier_call = bpf_offload_notification,
+};
+
+static int __init bpf_offload_init(void)
+{
+ register_netdevice_notifier(&bpf_offload_notifier);
+ return 0;
+}
+
+subsys_initcall(bpf_offload_init);
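The ndo_bpf commands issued above are easiest to read next to a driver-side counterpart, so here is a hedged sketch of one: the command values and netdev_bpf fields are the ones used by __bpf_offload_ndo(), while all foo_* names are purely hypothetical.

	#include <linux/netdevice.h>
	#include <linux/bpf_verifier.h>

	/* Hypothetical driver callback wired up as netdev_ops->ndo_bpf. */
	static int foo_ndo_bpf(struct net_device *dev, struct netdev_bpf *bpf)
	{
		switch (bpf->command) {
		case BPF_OFFLOAD_VERIFIER_PREP:
			/* hand back per-device verifier callbacks (env->dev_ops) */
			bpf->verifier.ops = &foo_dev_verifier_ops;	/* hypothetical */
			return 0;
		case BPF_OFFLOAD_TRANSLATE:
			/* translate/JIT bpf->offload.prog for the device */
			return foo_translate(dev, bpf->offload.prog);	/* hypothetical */
		case BPF_OFFLOAD_DESTROY:
			/* drop device state tied to bpf->offload.prog */
			foo_destroy(dev, bpf->offload.prog);		/* hypothetical */
			return 0;
		default:
			return -EINVAL;
		}
	}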
diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index 5c51d1985b51..673fa6fe2d73 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
@@ -78,8 +78,10 @@ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
{
struct pcpu_freelist_head *head;
struct pcpu_freelist_node *node;
+ unsigned long flags;
int orig_cpu, cpu;
+ local_irq_save(flags);
orig_cpu = cpu = raw_smp_processor_id();
while (1) {
head = per_cpu_ptr(s->freelist, cpu);
@@ -87,14 +89,16 @@ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
node = head->first;
if (node) {
head->first = node->next;
- raw_spin_unlock(&head->lock);
+ raw_spin_unlock_irqrestore(&head->lock, flags);
return node;
}
raw_spin_unlock(&head->lock);
cpu = cpumask_next(cpu, cpu_possible_mask);
if (cpu >= nr_cpu_ids)
cpu = 0;
- if (cpu == orig_cpu)
+ if (cpu == orig_cpu) {
+ local_irq_restore(flags);
return NULL;
+ }
}
}
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index dbd7b322a86b..5ee2e41893d9 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -41,6 +41,9 @@
#include <net/strparser.h>
#include <net/tcp.h>
+#define SOCK_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
struct bpf_stab {
struct bpf_map map;
struct sock **sock_map;
@@ -122,7 +125,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
*/
TCP_SKB_CB(skb)->bpf.map = NULL;
skb->sk = psock->sock;
- bpf_compute_data_end_sk_skb(skb);
+ bpf_compute_data_pointers(skb);
preempt_disable();
rc = (*prog->bpf_func)(skb, prog->insnsi);
preempt_enable();
@@ -385,7 +388,7 @@ static int smap_parse_func_strparser(struct strparser *strp,
* any socket yet.
*/
skb->sk = psock->sock;
- bpf_compute_data_end_sk_skb(skb);
+ bpf_compute_data_pointers(skb);
rc = (*prog->bpf_func)(skb, prog->insnsi);
skb->sk = NULL;
rcu_read_unlock();
@@ -508,7 +511,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
- attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+ attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
if (attr->value_size > KMALLOC_MAX_SIZE)
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 135be433e9a0..a15bc636cc98 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -11,6 +11,9 @@
#include <linux/perf_event.h>
#include "percpu_freelist.h"
+#define STACK_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
struct stack_map_bucket {
struct pcpu_freelist_node fnode;
u32 hash;
@@ -60,7 +63,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
if (!capable(CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
- if (attr->map_flags & ~BPF_F_NUMA_NODE)
+ if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
/* check sanity of attributes */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 25d074920a00..2c4cfeaa8d5e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -23,6 +23,9 @@
#include <linux/version.h>
#include <linux/kernel.h>
#include <linux/idr.h>
+#include <linux/cred.h>
+#include <linux/timekeeping.h>
+#include <linux/ctype.h>
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
@@ -31,6 +34,8 @@
#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map))
+#define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY)
+
DEFINE_PER_CPU(int, bpf_prog_active);
static DEFINE_IDR(prog_idr);
static DEFINE_SPINLOCK(prog_idr_lock);
@@ -207,6 +212,7 @@ static void bpf_map_free_deferred(struct work_struct *work)
struct bpf_map *map = container_of(work, struct bpf_map, work);
bpf_map_uncharge_memlock(map);
+ security_bpf_map_free(map);
/* implementation dependent freeing */
map->ops->map_free(map);
}
@@ -291,17 +297,54 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
}
#endif
-static const struct file_operations bpf_map_fops = {
+static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz,
+ loff_t *ppos)
+{
+ /* We need this handler such that alloc_file() enables
+ * f_mode with FMODE_CAN_READ.
+ */
+ return -EINVAL;
+}
+
+static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf,
+ size_t siz, loff_t *ppos)
+{
+ /* We need this handler such that alloc_file() enables
+ * f_mode with FMODE_CAN_WRITE.
+ */
+ return -EINVAL;
+}
+
+const struct file_operations bpf_map_fops = {
#ifdef CONFIG_PROC_FS
.show_fdinfo = bpf_map_show_fdinfo,
#endif
.release = bpf_map_release,
+ .read = bpf_dummy_read,
+ .write = bpf_dummy_write,
};
-int bpf_map_new_fd(struct bpf_map *map)
+int bpf_map_new_fd(struct bpf_map *map, int flags)
{
+ int ret;
+
+ ret = security_bpf_map(map, OPEN_FMODE(flags));
+ if (ret < 0)
+ return ret;
+
return anon_inode_getfd("bpf-map", &bpf_map_fops, map,
- O_RDWR | O_CLOEXEC);
+ flags | O_CLOEXEC);
+}
+
+int bpf_get_file_flag(int flags)
+{
+ if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY))
+ return -EINVAL;
+ if (flags & BPF_F_RDONLY)
+ return O_RDONLY;
+ if (flags & BPF_F_WRONLY)
+ return O_WRONLY;
+ return O_RDWR;
}
/* helper macro to check that unused fields 'union bpf_attr' are zero */
@@ -312,18 +355,46 @@ int bpf_map_new_fd(struct bpf_map *map)
offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
sizeof(attr->CMD##_LAST_FIELD)) != NULL
-#define BPF_MAP_CREATE_LAST_FIELD numa_node
+/* dst and src must have at least BPF_OBJ_NAME_LEN bytes.
+ * Return 0 on success and < 0 on error.
+ */
+static int bpf_obj_name_cpy(char *dst, const char *src)
+{
+ const char *end = src + BPF_OBJ_NAME_LEN;
+
+ memset(dst, 0, BPF_OBJ_NAME_LEN);
+
+ /* Copy all isalnum() and '_' chars */
+ while (src < end && *src) {
+ if (!isalnum(*src) && *src != '_')
+ return -EINVAL;
+ *dst++ = *src++;
+ }
+
+ /* No '\0' found in BPF_OBJ_NAME_LEN bytes */
+ if (src == end)
+ return -EINVAL;
+
+ return 0;
+}
+
+#define BPF_MAP_CREATE_LAST_FIELD map_name
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
int numa_node = bpf_map_attr_numa_node(attr);
struct bpf_map *map;
+ int f_flags;
int err;
err = CHECK_ATTR(BPF_MAP_CREATE);
if (err)
return -EINVAL;
+ f_flags = bpf_get_file_flag(attr->map_flags);
+ if (f_flags < 0)
+ return f_flags;
+
if (numa_node != NUMA_NO_NODE &&
((unsigned int)numa_node >= nr_node_ids ||
!node_online(numa_node)))
@@ -334,18 +405,26 @@ static int map_create(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
+ err = bpf_obj_name_cpy(map->name, attr->map_name);
+ if (err)
+ goto free_map_nouncharge;
+
atomic_set(&map->refcnt, 1);
atomic_set(&map->usercnt, 1);
- err = bpf_map_charge_memlock(map);
+ err = security_bpf_map_alloc(map);
if (err)
goto free_map_nouncharge;
+ err = bpf_map_charge_memlock(map);
+ if (err)
+ goto free_map_sec;
+
err = bpf_map_alloc_id(map);
if (err)
goto free_map;
- err = bpf_map_new_fd(map);
+ err = bpf_map_new_fd(map, f_flags);
if (err < 0) {
/* failed to allocate fd.
* bpf_map_put() is needed because the above
@@ -362,6 +441,8 @@ static int map_create(union bpf_attr *attr)
free_map:
bpf_map_uncharge_memlock(map);
+free_map_sec:
+ security_bpf_map_free(map);
free_map_nouncharge:
map->ops->map_free(map);
return err;
@@ -460,6 +541,11 @@ static int map_lookup_elem(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
+ if (!(f.file->f_mode & FMODE_CAN_READ)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
key = memdup_user(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
@@ -540,6 +626,11 @@ static int map_update_elem(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
+ if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
key = memdup_user(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
@@ -562,6 +653,12 @@ static int map_update_elem(union bpf_attr *attr)
if (copy_from_user(value, uvalue, value_size) != 0)
goto free_value;
+ /* Need to create a kthread, thus must support schedule */
+ if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
+ err = map->ops->map_update_elem(map, key, value, attr->flags);
+ goto out;
+ }
+
/* must increment bpf_prog_active to avoid kprobe+bpf triggering from
* inside bpf map update or delete otherwise deadlocks are possible
*/
@@ -592,7 +689,7 @@ static int map_update_elem(union bpf_attr *attr)
}
__this_cpu_dec(bpf_prog_active);
preempt_enable();
-
+out:
if (!err)
trace_bpf_map_update_elem(map, ufd, key, value);
free_value:
@@ -623,6 +720,11 @@ static int map_delete_elem(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
+ if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
key = memdup_user(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
@@ -666,6 +768,11 @@ static int map_get_next_key(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
+ if (!(f.file->f_mode & FMODE_CAN_READ)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
if (ukey) {
key = memdup_user(ukey, map->key_size);
if (IS_ERR(key)) {
@@ -703,9 +810,9 @@ err_put:
return err;
}
-static const struct bpf_verifier_ops * const bpf_prog_types[] = {
-#define BPF_PROG_TYPE(_id, _ops) \
- [_id] = &_ops,
+static const struct bpf_prog_ops * const bpf_prog_types[] = {
+#define BPF_PROG_TYPE(_id, _name) \
+ [_id] = & _name ## _prog_ops,
#define BPF_MAP_TYPE(_id, _ops)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
@@ -717,7 +824,10 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type])
return -EINVAL;
- prog->aux->ops = bpf_prog_types[type];
+ if (!bpf_prog_is_dev_bound(prog->aux))
+ prog->aux->ops = bpf_prog_types[type];
+ else
+ prog->aux->ops = &bpf_offload_prog_ops;
prog->type = type;
return 0;
}
@@ -820,6 +930,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
free_used_maps(aux);
bpf_prog_uncharge_memlock(aux->prog);
+ security_bpf_prog_free(aux);
bpf_prog_free(aux->prog);
}
@@ -867,15 +978,23 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
}
#endif
-static const struct file_operations bpf_prog_fops = {
+const struct file_operations bpf_prog_fops = {
#ifdef CONFIG_PROC_FS
.show_fdinfo = bpf_prog_show_fdinfo,
#endif
.release = bpf_prog_release,
+ .read = bpf_dummy_read,
+ .write = bpf_dummy_write,
};
int bpf_prog_new_fd(struct bpf_prog *prog)
{
+ int ret;
+
+ ret = security_bpf_prog(prog);
+ if (ret < 0)
+ return ret;
+
return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog,
O_RDWR | O_CLOEXEC);
}
@@ -938,7 +1057,23 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
}
EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
-static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
+static bool bpf_prog_get_ok(struct bpf_prog *prog,
+ enum bpf_prog_type *attach_type, bool attach_drv)
+{
+ /* not an attachment, just a refcount inc, always allow */
+ if (!attach_type)
+ return true;
+
+ if (prog->type != *attach_type)
+ return false;
+ if (bpf_prog_is_dev_bound(prog->aux) && !attach_drv)
+ return false;
+
+ return true;
+}
+
+static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type,
+ bool attach_drv)
{
struct fd f = fdget(ufd);
struct bpf_prog *prog;
@@ -946,7 +1081,7 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
prog = ____bpf_prog_get(f);
if (IS_ERR(prog))
return prog;
- if (type && prog->type != *type) {
+ if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) {
prog = ERR_PTR(-EINVAL);
goto out;
}
@@ -959,21 +1094,22 @@ out:
struct bpf_prog *bpf_prog_get(u32 ufd)
{
- return __bpf_prog_get(ufd, NULL);
+ return __bpf_prog_get(ufd, NULL, false);
}
-struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
+struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
+ bool attach_drv)
{
- struct bpf_prog *prog = __bpf_prog_get(ufd, &type);
+ struct bpf_prog *prog = __bpf_prog_get(ufd, &type, attach_drv);
if (!IS_ERR(prog))
trace_bpf_prog_get_type(prog);
return prog;
}
-EXPORT_SYMBOL_GPL(bpf_prog_get_type);
+EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev);
/* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD prog_flags
+#define BPF_PROG_LOAD_LAST_FIELD prog_ifindex
static int bpf_prog_load(union bpf_attr *attr)
{
@@ -1015,10 +1151,14 @@ static int bpf_prog_load(union bpf_attr *attr)
if (!prog)
return -ENOMEM;
- err = bpf_prog_charge_memlock(prog);
+ err = security_bpf_prog_alloc(prog->aux);
if (err)
goto free_prog_nouncharge;
+ err = bpf_prog_charge_memlock(prog);
+ if (err)
+ goto free_prog_sec;
+
prog->len = attr->insn_cnt;
err = -EFAULT;
@@ -1032,11 +1172,22 @@ static int bpf_prog_load(union bpf_attr *attr)
atomic_set(&prog->aux->refcnt, 1);
prog->gpl_compatible = is_gpl ? 1 : 0;
+ if (attr->prog_ifindex) {
+ err = bpf_prog_offload_init(prog, attr);
+ if (err)
+ goto free_prog;
+ }
+
/* find program type: socket_filter vs tracing_filter */
err = find_prog_type(type, prog);
if (err < 0)
goto free_prog;
+ prog->aux->load_time = ktime_get_boot_ns();
+ err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name);
+ if (err)
+ goto free_prog;
+
/* run eBPF verifier */
err = bpf_check(&prog, attr);
if (err < 0)
@@ -1071,16 +1222,18 @@ free_used_maps:
free_used_maps(prog->aux);
free_prog:
bpf_prog_uncharge_memlock(prog);
+free_prog_sec:
+ security_bpf_prog_free(prog->aux);
free_prog_nouncharge:
bpf_prog_free(prog);
return err;
}
-#define BPF_OBJ_LAST_FIELD bpf_fd
+#define BPF_OBJ_LAST_FIELD file_flags
static int bpf_obj_pin(const union bpf_attr *attr)
{
- if (CHECK_ATTR(BPF_OBJ))
+ if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0)
return -EINVAL;
return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname));
@@ -1088,10 +1241,12 @@ static int bpf_obj_pin(const union bpf_attr *attr)
static int bpf_obj_get(const union bpf_attr *attr)
{
- if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0)
+ if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 ||
+ attr->file_flags & ~BPF_OBJ_FLAG_MASK)
return -EINVAL;
- return bpf_obj_get_user(u64_to_user_ptr(attr->pathname));
+ return bpf_obj_get_user(u64_to_user_ptr(attr->pathname),
+ attr->file_flags);
}
#ifdef CONFIG_CGROUP_BPF
@@ -1132,6 +1287,9 @@ static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach)
return 0;
}
+#define BPF_F_ATTACH_MASK \
+ (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)
+
static int bpf_prog_attach(const union bpf_attr *attr)
{
enum bpf_prog_type ptype;
@@ -1145,7 +1303,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
if (CHECK_ATTR(BPF_PROG_ATTACH))
return -EINVAL;
- if (attr->attach_flags & ~BPF_F_ALLOW_OVERRIDE)
+ if (attr->attach_flags & ~BPF_F_ATTACH_MASK)
return -EINVAL;
switch (attr->attach_type) {
@@ -1159,6 +1317,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
case BPF_CGROUP_SOCK_OPS:
ptype = BPF_PROG_TYPE_SOCK_OPS;
break;
+ case BPF_CGROUP_DEVICE:
+ ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
+ break;
case BPF_SK_SKB_STREAM_PARSER:
case BPF_SK_SKB_STREAM_VERDICT:
return sockmap_get_from_fd(attr, true);
@@ -1176,8 +1337,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
return PTR_ERR(cgrp);
}
- ret = cgroup_bpf_update(cgrp, prog, attr->attach_type,
- attr->attach_flags & BPF_F_ALLOW_OVERRIDE);
+ ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
+ attr->attach_flags);
if (ret)
bpf_prog_put(prog);
cgroup_put(cgrp);
@@ -1189,6 +1350,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
static int bpf_prog_detach(const union bpf_attr *attr)
{
+ enum bpf_prog_type ptype;
+ struct bpf_prog *prog;
struct cgroup *cgrp;
int ret;
@@ -1201,26 +1364,71 @@ static int bpf_prog_detach(const union bpf_attr *attr)
switch (attr->attach_type) {
case BPF_CGROUP_INET_INGRESS:
case BPF_CGROUP_INET_EGRESS:
+ ptype = BPF_PROG_TYPE_CGROUP_SKB;
+ break;
case BPF_CGROUP_INET_SOCK_CREATE:
+ ptype = BPF_PROG_TYPE_CGROUP_SOCK;
+ break;
case BPF_CGROUP_SOCK_OPS:
- cgrp = cgroup_get_from_fd(attr->target_fd);
- if (IS_ERR(cgrp))
- return PTR_ERR(cgrp);
-
- ret = cgroup_bpf_update(cgrp, NULL, attr->attach_type, false);
- cgroup_put(cgrp);
+ ptype = BPF_PROG_TYPE_SOCK_OPS;
+ break;
+ case BPF_CGROUP_DEVICE:
+ ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
break;
case BPF_SK_SKB_STREAM_PARSER:
case BPF_SK_SKB_STREAM_VERDICT:
- ret = sockmap_get_from_fd(attr, false);
- break;
+ return sockmap_get_from_fd(attr, false);
default:
return -EINVAL;
}
+ cgrp = cgroup_get_from_fd(attr->target_fd);
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+
+ prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
+ if (IS_ERR(prog))
+ prog = NULL;
+
+ ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0);
+ if (prog)
+ bpf_prog_put(prog);
+ cgroup_put(cgrp);
return ret;
}
+#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt
+
+static int bpf_prog_query(const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct cgroup *cgrp;
+ int ret;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ if (CHECK_ATTR(BPF_PROG_QUERY))
+ return -EINVAL;
+ if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE)
+ return -EINVAL;
+
+ switch (attr->query.attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ case BPF_CGROUP_INET_SOCK_CREATE:
+ case BPF_CGROUP_SOCK_OPS:
+ case BPF_CGROUP_DEVICE:
+ break;
+ default:
+ return -EINVAL;
+ }
+ cgrp = cgroup_get_from_fd(attr->query.target_fd);
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+ ret = cgroup_bpf_query(cgrp, attr, uattr);
+ cgroup_put(cgrp);
+ return ret;
+}
#endif /* CONFIG_CGROUP_BPF */
#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration
@@ -1305,20 +1513,26 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
return fd;
}
-#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD map_id
+#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags
static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
{
struct bpf_map *map;
u32 id = attr->map_id;
+ int f_flags;
int fd;
- if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID))
+ if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) ||
+ attr->open_flags & ~BPF_OBJ_FLAG_MASK)
return -EINVAL;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ f_flags = bpf_get_file_flag(attr->open_flags);
+ if (f_flags < 0)
+ return f_flags;
+
spin_lock_bh(&map_idr_lock);
map = idr_find(&map_idr, id);
if (map)
@@ -1330,7 +1544,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
- fd = bpf_map_new_fd(map);
+ fd = bpf_map_new_fd(map, f_flags);
if (fd < 0)
bpf_map_put(map);
@@ -1358,8 +1572,25 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
info.type = prog->type;
info.id = prog->aux->id;
+ info.load_time = prog->aux->load_time;
+ info.created_by_uid = from_kuid_munged(current_user_ns(),
+ prog->aux->user->uid);
memcpy(info.tag, prog->tag, sizeof(prog->tag));
+ memcpy(info.name, prog->aux->name, sizeof(prog->aux->name));
+
+ ulen = info.nr_map_ids;
+ info.nr_map_ids = prog->aux->used_map_cnt;
+ ulen = min_t(u32, info.nr_map_ids, ulen);
+ if (ulen) {
+ u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids);
+ u32 i;
+
+ for (i = 0; i < ulen; i++)
+ if (put_user(prog->aux->used_maps[i]->id,
+ &user_map_ids[i]))
+ return -EFAULT;
+ }
if (!capable(CAP_SYS_ADMIN)) {
info.jited_prog_len = 0;
@@ -1413,6 +1644,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
info.value_size = map->value_size;
info.max_entries = map->max_entries;
info.map_flags = map->map_flags;
+ memcpy(info.name, map->name, sizeof(map->name));
if (copy_to_user(uinfo, &info, info_len) ||
put_user(info_len, &uattr->info.info_len))
@@ -1467,6 +1699,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
if (copy_from_user(&attr, uattr, size) != 0)
return -EFAULT;
+ err = security_bpf(cmd, &attr, size);
+ if (err < 0)
+ return err;
+
switch (cmd) {
case BPF_MAP_CREATE:
err = map_create(&attr);
@@ -1499,6 +1735,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_PROG_DETACH:
err = bpf_prog_detach(&attr);
break;
+ case BPF_PROG_QUERY:
+ err = bpf_prog_query(&attr, uattr);
+ break;
#endif
case BPF_PROG_TEST_RUN:
err = bpf_prog_test_run(&attr, uattr);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c48ca2a34b5e..d4593571c404 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -21,6 +21,17 @@
#include <linux/vmalloc.h>
#include <linux/stringify.h>
+#include "disasm.h"
+
+static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
+#define BPF_PROG_TYPE(_id, _name) \
+ [_id] = & _name ## _verifier_ops,
+#define BPF_MAP_TYPE(_id, _ops)
+#include <linux/bpf_types.h>
+#undef BPF_PROG_TYPE
+#undef BPF_MAP_TYPE
+};
+
/* bpf_check() is a static code analyzer that walks eBPF program
* instruction by instruction and updates register/stack state.
* All paths of conditional branches are analyzed until 'bpf_exit' insn.
@@ -153,28 +164,42 @@ struct bpf_call_arg_meta {
int access_size;
};
-/* verbose verifier prints what it's seeing
- * bpf_check() is called under lock, so no race to access these global vars
- */
-static u32 log_level, log_size, log_len;
-static char *log_buf;
-
static DEFINE_MUTEX(bpf_verifier_lock);
/* log_level controls verbosity level of eBPF verifier.
* verbose() is used to dump the verification trace to the log, so the user
* can figure out what's wrong with the program
*/
-static __printf(1, 2) void verbose(const char *fmt, ...)
+static __printf(2, 3) void verbose(struct bpf_verifier_env *env,
+ const char *fmt, ...)
{
+ struct bpf_verifer_log *log = &env->log;
+ unsigned int n;
va_list args;
- if (log_level == 0 || log_len >= log_size - 1)
+ if (!log->level || !log->ubuf || bpf_verifier_log_full(log))
return;
va_start(args, fmt);
- log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args);
+ n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
va_end(args);
+
+ WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1,
+ "verifier log line truncated - local buffer too short\n");
+
+ n = min(log->len_total - log->len_used - 1, n);
+ log->kbuf[n] = '\0';
+
+ if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1))
+ log->len_used += n;
+ else
+ log->ubuf = NULL;
+}
+
+static bool type_is_pkt_pointer(enum bpf_reg_type type)
+{
+ return type == PTR_TO_PACKET ||
+ type == PTR_TO_PACKET_META;
}
/* string representation of 'enum bpf_reg_type' */
@@ -187,26 +212,12 @@ static const char * const reg_type_str[] = {
[PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
[PTR_TO_STACK] = "fp",
[PTR_TO_PACKET] = "pkt",
+ [PTR_TO_PACKET_META] = "pkt_meta",
[PTR_TO_PACKET_END] = "pkt_end",
};
-#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x)
-static const char * const func_id_str[] = {
- __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN)
-};
-#undef __BPF_FUNC_STR_FN
-
-static const char *func_id_name(int id)
-{
- BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID);
-
- if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id])
- return func_id_str[id];
- else
- return "unknown";
-}
-
-static void print_verifier_state(struct bpf_verifier_state *state)
+static void print_verifier_state(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *state)
{
struct bpf_reg_state *reg;
enum bpf_reg_type t;
@@ -217,21 +228,21 @@ static void print_verifier_state(struct bpf_verifier_state *state)
t = reg->type;
if (t == NOT_INIT)
continue;
- verbose(" R%d=%s", i, reg_type_str[t]);
+ verbose(env, " R%d=%s", i, reg_type_str[t]);
if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
tnum_is_const(reg->var_off)) {
/* reg->off should be 0 for SCALAR_VALUE */
- verbose("%lld", reg->var_off.value + reg->off);
+ verbose(env, "%lld", reg->var_off.value + reg->off);
} else {
- verbose("(id=%d", reg->id);
+ verbose(env, "(id=%d", reg->id);
if (t != SCALAR_VALUE)
- verbose(",off=%d", reg->off);
- if (t == PTR_TO_PACKET)
- verbose(",r=%d", reg->range);
+ verbose(env, ",off=%d", reg->off);
+ if (type_is_pkt_pointer(t))
+ verbose(env, ",r=%d", reg->range);
else if (t == CONST_PTR_TO_MAP ||
t == PTR_TO_MAP_VALUE ||
t == PTR_TO_MAP_VALUE_OR_NULL)
- verbose(",ks=%d,vs=%d",
+ verbose(env, ",ks=%d,vs=%d",
reg->map_ptr->key_size,
reg->map_ptr->value_size);
if (tnum_is_const(reg->var_off)) {
@@ -239,243 +250,174 @@ static void print_verifier_state(struct bpf_verifier_state *state)
* could be a pointer whose offset is too big
* for reg->off
*/
- verbose(",imm=%llx", reg->var_off.value);
+ verbose(env, ",imm=%llx", reg->var_off.value);
} else {
if (reg->smin_value != reg->umin_value &&
reg->smin_value != S64_MIN)
- verbose(",smin_value=%lld",
+ verbose(env, ",smin_value=%lld",
(long long)reg->smin_value);
if (reg->smax_value != reg->umax_value &&
reg->smax_value != S64_MAX)
- verbose(",smax_value=%lld",
+ verbose(env, ",smax_value=%lld",
(long long)reg->smax_value);
if (reg->umin_value != 0)
- verbose(",umin_value=%llu",
+ verbose(env, ",umin_value=%llu",
(unsigned long long)reg->umin_value);
if (reg->umax_value != U64_MAX)
- verbose(",umax_value=%llu",
+ verbose(env, ",umax_value=%llu",
(unsigned long long)reg->umax_value);
if (!tnum_is_unknown(reg->var_off)) {
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(",var_off=%s", tn_buf);
+ verbose(env, ",var_off=%s", tn_buf);
}
}
- verbose(")");
+ verbose(env, ")");
}
}
- for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
- if (state->stack_slot_type[i] == STACK_SPILL)
- verbose(" fp%d=%s", -MAX_BPF_STACK + i,
- reg_type_str[state->spilled_regs[i / BPF_REG_SIZE].type]);
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].slot_type[0] == STACK_SPILL)
+ verbose(env, " fp%d=%s",
+ -MAX_BPF_STACK + i * BPF_REG_SIZE,
+ reg_type_str[state->stack[i].spilled_ptr.type]);
}
- verbose("\n");
+ verbose(env, "\n");
}
-static const char *const bpf_class_string[] = {
- [BPF_LD] = "ld",
- [BPF_LDX] = "ldx",
- [BPF_ST] = "st",
- [BPF_STX] = "stx",
- [BPF_ALU] = "alu",
- [BPF_JMP] = "jmp",
- [BPF_RET] = "BUG",
- [BPF_ALU64] = "alu64",
-};
-
-static const char *const bpf_alu_string[16] = {
- [BPF_ADD >> 4] = "+=",
- [BPF_SUB >> 4] = "-=",
- [BPF_MUL >> 4] = "*=",
- [BPF_DIV >> 4] = "/=",
- [BPF_OR >> 4] = "|=",
- [BPF_AND >> 4] = "&=",
- [BPF_LSH >> 4] = "<<=",
- [BPF_RSH >> 4] = ">>=",
- [BPF_NEG >> 4] = "neg",
- [BPF_MOD >> 4] = "%=",
- [BPF_XOR >> 4] = "^=",
- [BPF_MOV >> 4] = "=",
- [BPF_ARSH >> 4] = "s>>=",
- [BPF_END >> 4] = "endian",
-};
-
-static const char *const bpf_ldst_string[] = {
- [BPF_W >> 3] = "u32",
- [BPF_H >> 3] = "u16",
- [BPF_B >> 3] = "u8",
- [BPF_DW >> 3] = "u64",
-};
-
-static const char *const bpf_jmp_string[16] = {
- [BPF_JA >> 4] = "jmp",
- [BPF_JEQ >> 4] = "==",
- [BPF_JGT >> 4] = ">",
- [BPF_JLT >> 4] = "<",
- [BPF_JGE >> 4] = ">=",
- [BPF_JLE >> 4] = "<=",
- [BPF_JSET >> 4] = "&",
- [BPF_JNE >> 4] = "!=",
- [BPF_JSGT >> 4] = "s>",
- [BPF_JSLT >> 4] = "s<",
- [BPF_JSGE >> 4] = "s>=",
- [BPF_JSLE >> 4] = "s<=",
- [BPF_CALL >> 4] = "call",
- [BPF_EXIT >> 4] = "exit",
-};
+static int copy_stack_state(struct bpf_verifier_state *dst,
+ const struct bpf_verifier_state *src)
+{
+ if (!src->stack)
+ return 0;
+ if (WARN_ON_ONCE(dst->allocated_stack < src->allocated_stack)) {
+ /* internal bug, make state invalid to reject the program */
+ memset(dst, 0, sizeof(*dst));
+ return -EFAULT;
+ }
+ memcpy(dst->stack, src->stack,
+ sizeof(*src->stack) * (src->allocated_stack / BPF_REG_SIZE));
+ return 0;
+}
-static void print_bpf_insn(const struct bpf_verifier_env *env,
- const struct bpf_insn *insn)
+/* do_check() starts with a zero-sized stack in struct bpf_verifier_state to
+ * make it consume a minimal amount of memory. A stack write by the program
+ * makes check_stack_write() call into realloc_verifier_state() to grow the
+ * stack size. Note there is a non-zero 'parent' pointer inside
+ * bpf_verifier_state which this function copies over. It points to the
+ * previous bpf_verifier_state, which is never reallocated.
+ */
+static int realloc_verifier_state(struct bpf_verifier_state *state, int size,
+ bool copy_old)
{
- u8 class = BPF_CLASS(insn->code);
-
- if (class == BPF_ALU || class == BPF_ALU64) {
- if (BPF_SRC(insn->code) == BPF_X)
- verbose("(%02x) %sr%d %s %sr%d\n",
- insn->code, class == BPF_ALU ? "(u32) " : "",
- insn->dst_reg,
- bpf_alu_string[BPF_OP(insn->code) >> 4],
- class == BPF_ALU ? "(u32) " : "",
- insn->src_reg);
- else
- verbose("(%02x) %sr%d %s %s%d\n",
- insn->code, class == BPF_ALU ? "(u32) " : "",
- insn->dst_reg,
- bpf_alu_string[BPF_OP(insn->code) >> 4],
- class == BPF_ALU ? "(u32) " : "",
- insn->imm);
- } else if (class == BPF_STX) {
- if (BPF_MODE(insn->code) == BPF_MEM)
- verbose("(%02x) *(%s *)(r%d %+d) = r%d\n",
- insn->code,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->dst_reg,
- insn->off, insn->src_reg);
- else if (BPF_MODE(insn->code) == BPF_XADD)
- verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n",
- insn->code,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->dst_reg, insn->off,
- insn->src_reg);
- else
- verbose("BUG_%02x\n", insn->code);
- } else if (class == BPF_ST) {
- if (BPF_MODE(insn->code) != BPF_MEM) {
- verbose("BUG_st_%02x\n", insn->code);
- return;
- }
- verbose("(%02x) *(%s *)(r%d %+d) = %d\n",
- insn->code,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->dst_reg,
- insn->off, insn->imm);
- } else if (class == BPF_LDX) {
- if (BPF_MODE(insn->code) != BPF_MEM) {
- verbose("BUG_ldx_%02x\n", insn->code);
- return;
+ u32 old_size = state->allocated_stack;
+ struct bpf_stack_state *new_stack;
+ int slot = size / BPF_REG_SIZE;
+
+ if (size <= old_size || !size) {
+ if (copy_old)
+ return 0;
+ state->allocated_stack = slot * BPF_REG_SIZE;
+ if (!size && old_size) {
+ kfree(state->stack);
+ state->stack = NULL;
}
- verbose("(%02x) r%d = *(%s *)(r%d %+d)\n",
- insn->code, insn->dst_reg,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->src_reg, insn->off);
- } else if (class == BPF_LD) {
- if (BPF_MODE(insn->code) == BPF_ABS) {
- verbose("(%02x) r0 = *(%s *)skb[%d]\n",
- insn->code,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->imm);
- } else if (BPF_MODE(insn->code) == BPF_IND) {
- verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n",
- insn->code,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->src_reg, insn->imm);
- } else if (BPF_MODE(insn->code) == BPF_IMM &&
- BPF_SIZE(insn->code) == BPF_DW) {
- /* At this point, we already made sure that the second
- * part of the ldimm64 insn is accessible.
- */
- u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
- bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD;
+ return 0;
+ }
+ new_stack = kmalloc_array(slot, sizeof(struct bpf_stack_state),
+ GFP_KERNEL);
+ if (!new_stack)
+ return -ENOMEM;
+ if (copy_old) {
+ if (state->stack)
+ memcpy(new_stack, state->stack,
+ sizeof(*new_stack) * (old_size / BPF_REG_SIZE));
+ memset(new_stack + old_size / BPF_REG_SIZE, 0,
+ sizeof(*new_stack) * (size - old_size) / BPF_REG_SIZE);
+ }
+ state->allocated_stack = slot * BPF_REG_SIZE;
+ kfree(state->stack);
+ state->stack = new_stack;
+ return 0;
+}
- if (map_ptr && !env->allow_ptr_leaks)
- imm = 0;
+static void free_verifier_state(struct bpf_verifier_state *state,
+ bool free_self)
+{
+ kfree(state->stack);
+ if (free_self)
+ kfree(state);
+}
- verbose("(%02x) r%d = 0x%llx\n", insn->code,
- insn->dst_reg, (unsigned long long)imm);
- } else {
- verbose("BUG_ld_%02x\n", insn->code);
- return;
- }
- } else if (class == BPF_JMP) {
- u8 opcode = BPF_OP(insn->code);
+/* copy verifier state from src to dst growing dst stack space
+ * when necessary to accommodate larger src stack
+ */
+static int copy_verifier_state(struct bpf_verifier_state *dst,
+ const struct bpf_verifier_state *src)
+{
+ int err;
- if (opcode == BPF_CALL) {
- verbose("(%02x) call %s#%d\n", insn->code,
- func_id_name(insn->imm), insn->imm);
- } else if (insn->code == (BPF_JMP | BPF_JA)) {
- verbose("(%02x) goto pc%+d\n",
- insn->code, insn->off);
- } else if (insn->code == (BPF_JMP | BPF_EXIT)) {
- verbose("(%02x) exit\n", insn->code);
- } else if (BPF_SRC(insn->code) == BPF_X) {
- verbose("(%02x) if r%d %s r%d goto pc%+d\n",
- insn->code, insn->dst_reg,
- bpf_jmp_string[BPF_OP(insn->code) >> 4],
- insn->src_reg, insn->off);
- } else {
- verbose("(%02x) if r%d %s 0x%x goto pc%+d\n",
- insn->code, insn->dst_reg,
- bpf_jmp_string[BPF_OP(insn->code) >> 4],
- insn->imm, insn->off);
- }
- } else {
- verbose("(%02x) %s\n", insn->code, bpf_class_string[class]);
- }
+ err = realloc_verifier_state(dst, src->allocated_stack, false);
+ if (err)
+ return err;
+ memcpy(dst, src, offsetof(struct bpf_verifier_state, allocated_stack));
+ return copy_stack_state(dst, src);
}
-static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx)
+static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
+ int *insn_idx)
{
- struct bpf_verifier_stack_elem *elem;
- int insn_idx;
+ struct bpf_verifier_state *cur = env->cur_state;
+ struct bpf_verifier_stack_elem *elem, *head = env->head;
+ int err;
if (env->head == NULL)
- return -1;
+ return -ENOENT;
- memcpy(&env->cur_state, &env->head->st, sizeof(env->cur_state));
- insn_idx = env->head->insn_idx;
+ if (cur) {
+ err = copy_verifier_state(cur, &head->st);
+ if (err)
+ return err;
+ }
+ if (insn_idx)
+ *insn_idx = head->insn_idx;
if (prev_insn_idx)
- *prev_insn_idx = env->head->prev_insn_idx;
- elem = env->head->next;
- kfree(env->head);
+ *prev_insn_idx = head->prev_insn_idx;
+ elem = head->next;
+ free_verifier_state(&head->st, false);
+ kfree(head);
env->head = elem;
env->stack_size--;
- return insn_idx;
+ return 0;
}
static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
int insn_idx, int prev_insn_idx)
{
+ struct bpf_verifier_state *cur = env->cur_state;
struct bpf_verifier_stack_elem *elem;
+ int err;
- elem = kmalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);
+ elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);
if (!elem)
goto err;
- memcpy(&elem->st, &env->cur_state, sizeof(env->cur_state));
elem->insn_idx = insn_idx;
elem->prev_insn_idx = prev_insn_idx;
elem->next = env->head;
env->head = elem;
env->stack_size++;
+ err = copy_verifier_state(&elem->st, cur);
+ if (err)
+ goto err;
if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) {
- verbose("BPF program is too complex\n");
+ verbose(env, "BPF program is too complex\n");
goto err;
}
return &elem->st;
err:
/* pop all elements and return */
- while (pop_stack(env, NULL) >= 0);
+ while (!pop_stack(env, NULL, NULL));
return NULL;
}
@@ -507,10 +449,11 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg)
__mark_reg_known(reg, 0);
}
-static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno)
+static void mark_reg_known_zero(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs, u32 regno)
{
if (WARN_ON(regno >= MAX_BPF_REG)) {
- verbose("mark_reg_known_zero(regs, %u)\n", regno);
+ verbose(env, "mark_reg_known_zero(regs, %u)\n", regno);
/* Something bad happened, let's kill all regs */
for (regno = 0; regno < MAX_BPF_REG; regno++)
__mark_reg_not_init(regs + regno);
@@ -519,6 +462,31 @@ static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno)
__mark_reg_known_zero(regs + regno);
}
+static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg)
+{
+ return type_is_pkt_pointer(reg->type);
+}
+
+static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg)
+{
+ return reg_is_pkt_pointer(reg) ||
+ reg->type == PTR_TO_PACKET_END;
+}
+
+/* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */
+static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg,
+ enum bpf_reg_type which)
+{
+ /* The register can already have a range from prior markings.
+ * This is fine as long as it hasn't been advanced from its
+ * origin.
+ */
+ return reg->type == which &&
+ reg->id == 0 &&
+ reg->off == 0 &&
+ tnum_equals_const(reg->var_off, 0);
+}
+
/* Attempts to improve min/max values based on var_off information */
static void __update_reg_bounds(struct bpf_reg_state *reg)
{
@@ -595,10 +563,11 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg)
__mark_reg_unbounded(reg);
}
-static void mark_reg_unknown(struct bpf_reg_state *regs, u32 regno)
+static void mark_reg_unknown(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs, u32 regno)
{
if (WARN_ON(regno >= MAX_BPF_REG)) {
- verbose("mark_reg_unknown(regs, %u)\n", regno);
+ verbose(env, "mark_reg_unknown(regs, %u)\n", regno);
/* Something bad happened, let's kill all regs */
for (regno = 0; regno < MAX_BPF_REG; regno++)
__mark_reg_not_init(regs + regno);
@@ -613,10 +582,11 @@ static void __mark_reg_not_init(struct bpf_reg_state *reg)
reg->type = NOT_INIT;
}
-static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno)
+static void mark_reg_not_init(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs, u32 regno)
{
if (WARN_ON(regno >= MAX_BPF_REG)) {
- verbose("mark_reg_not_init(regs, %u)\n", regno);
+ verbose(env, "mark_reg_not_init(regs, %u)\n", regno);
/* Something bad happened, let's kill all regs */
for (regno = 0; regno < MAX_BPF_REG; regno++)
__mark_reg_not_init(regs + regno);
@@ -625,22 +595,23 @@ static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno)
__mark_reg_not_init(regs + regno);
}
-static void init_reg_state(struct bpf_reg_state *regs)
+static void init_reg_state(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs)
{
int i;
for (i = 0; i < MAX_BPF_REG; i++) {
- mark_reg_not_init(regs, i);
+ mark_reg_not_init(env, regs, i);
regs[i].live = REG_LIVE_NONE;
}
/* frame pointer */
regs[BPF_REG_FP].type = PTR_TO_STACK;
- mark_reg_known_zero(regs, BPF_REG_FP);
+ mark_reg_known_zero(env, regs, BPF_REG_FP);
/* 1st arg to a function */
regs[BPF_REG_1].type = PTR_TO_CTX;
- mark_reg_known_zero(regs, BPF_REG_1);
+ mark_reg_known_zero(env, regs, BPF_REG_1);
}
enum reg_arg_type {
@@ -671,29 +642,29 @@ static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno)
static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
enum reg_arg_type t)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = env->cur_state->regs;
if (regno >= MAX_BPF_REG) {
- verbose("R%d is invalid\n", regno);
+ verbose(env, "R%d is invalid\n", regno);
return -EINVAL;
}
if (t == SRC_OP) {
/* check whether register used as source operand can be read */
if (regs[regno].type == NOT_INIT) {
- verbose("R%d !read_ok\n", regno);
+ verbose(env, "R%d !read_ok\n", regno);
return -EACCES;
}
- mark_reg_read(&env->cur_state, regno);
+ mark_reg_read(env->cur_state, regno);
} else {
/* check whether register used as dest operand can be written to */
if (regno == BPF_REG_FP) {
- verbose("frame pointer is read only\n");
+ verbose(env, "frame pointer is read only\n");
return -EACCES;
}
regs[regno].live |= REG_LIVE_WRITTEN;
if (t == DST_OP)
- mark_reg_unknown(regs, regno);
+ mark_reg_unknown(env, regs, regno);
}
return 0;
}
@@ -706,6 +677,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case PTR_TO_STACK:
case PTR_TO_CTX:
case PTR_TO_PACKET:
+ case PTR_TO_PACKET_META:
case PTR_TO_PACKET_END:
case CONST_PTR_TO_MAP:
return true;
@@ -717,35 +689,48 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
/* check_stack_read/write functions track spill/fill of registers,
* stack boundary and alignment are checked in check_mem_access()
*/
-static int check_stack_write(struct bpf_verifier_state *state, int off,
+static int check_stack_write(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *state, int off,
int size, int value_regno)
{
- int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE;
+ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
+
+ err = realloc_verifier_state(state, round_up(slot + 1, BPF_REG_SIZE),
+ true);
+ if (err)
+ return err;
/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
* so it's aligned access and [off, off + size) are within stack limits
*/
+ if (!env->allow_ptr_leaks &&
+ state->stack[spi].slot_type[0] == STACK_SPILL &&
+ size != BPF_REG_SIZE) {
+ verbose(env, "attempt to corrupt spilled pointer on stack\n");
+ return -EACCES;
+ }
if (value_regno >= 0 &&
is_spillable_regtype(state->regs[value_regno].type)) {
/* register containing pointer is being spilled into stack */
if (size != BPF_REG_SIZE) {
- verbose("invalid size of register spill\n");
+ verbose(env, "invalid size of register spill\n");
return -EACCES;
}
/* save register state */
- state->spilled_regs[spi] = state->regs[value_regno];
- state->spilled_regs[spi].live |= REG_LIVE_WRITTEN;
+ state->stack[spi].spilled_ptr = state->regs[value_regno];
+ state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
for (i = 0; i < BPF_REG_SIZE; i++)
- state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL;
+ state->stack[spi].slot_type[i] = STACK_SPILL;
} else {
/* regular write of data into stack */
- state->spilled_regs[spi] = (struct bpf_reg_state) {};
+ state->stack[spi].spilled_ptr = (struct bpf_reg_state) {};
for (i = 0; i < size; i++)
- state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC;
+ state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] =
+ STACK_MISC;
}
return 0;
}
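
The slot indexing above replaces the old fixed MAX_BPF_STACK-sized arrays with on-demand state->stack[] entries. A small standalone sketch of just that arithmetic (BPF_REG_SIZE taken as 8 here, plain C rather than the kernel structs):

#include <stdio.h>

#define BPF_REG_SIZE 8   /* assumed; matches sizeof(u64) in the kernel */

/* map an access at negative stack offset 'off' of 'size' bytes to the
 * slot index (spi) and per-slot byte indices, using the same arithmetic
 * as check_stack_write()/check_stack_read() above */
static void show_slots(int off, int size)
{
	int slot = -off - 1;
	int spi = slot / BPF_REG_SIZE;
	int i;

	printf("off=%d size=%d -> spi=%d\n", off, size, spi);
	for (i = 0; i < size; i++)
		printf("  byte %d -> slot_type[%d]\n",
		       i, (slot - i) % BPF_REG_SIZE);
}

int main(void)
{
	show_slots(-8, 8);    /* full register spill at fp-8: spi 0, bytes 7..0 */
	show_slots(-12, 4);   /* 4-byte scalar write at fp-12: spi 1, bytes 3..0 */
	return 0;
}
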
@@ -756,66 +741,72 @@ static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slo
while (parent) {
/* if read wasn't screened by an earlier write ... */
- if (state->spilled_regs[slot].live & REG_LIVE_WRITTEN)
+ if (state->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN)
break;
/* ... then we depend on parent's value */
- parent->spilled_regs[slot].live |= REG_LIVE_READ;
+ parent->stack[slot].spilled_ptr.live |= REG_LIVE_READ;
state = parent;
parent = state->parent;
}
}
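
mark_stack_slot_read() above propagates a read up the chain of parent states until a state that already wrote the slot screens it. A toy standalone version of that walk (toy names, a single slot, not the kernel structures):

#include <stddef.h>

#define TOY_LIVE_NONE    0
#define TOY_LIVE_READ    1
#define TOY_LIVE_WRITTEN 2

struct toy_slot { int live; };

struct toy_state {
	struct toy_slot slot;
	struct toy_state *parent;
};

/* a read in a child state marks the slot live in every ancestor,
 * stopping as soon as some state on the way wrote the slot itself */
static void toy_mark_read(struct toy_state *state)
{
	struct toy_state *parent = state->parent;

	while (parent) {
		if (state->slot.live & TOY_LIVE_WRITTEN)
			break;
		parent->slot.live |= TOY_LIVE_READ;
		state = parent;
		parent = state->parent;
	}
}

int main(void)
{
	struct toy_state grandparent = { { TOY_LIVE_NONE }, NULL };
	struct toy_state parent = { { TOY_LIVE_WRITTEN }, &grandparent };
	struct toy_state child = { { TOY_LIVE_NONE }, &parent };

	toy_mark_read(&child);
	/* parent gets LIVE_READ; grandparent does not, because the parent
	 * state wrote the slot and therefore screens the read */
	return (parent.slot.live & TOY_LIVE_READ) &&
	       !(grandparent.slot.live & TOY_LIVE_READ) ? 0 : 1;
}
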
-static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
+static int check_stack_read(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *state, int off, int size,
int value_regno)
{
- u8 *slot_type;
- int i, spi;
+ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE;
+ u8 *stype;
- slot_type = &state->stack_slot_type[MAX_BPF_STACK + off];
+ if (state->allocated_stack <= slot) {
+ verbose(env, "invalid read from stack off %d+0 size %d\n",
+ off, size);
+ return -EACCES;
+ }
+ stype = state->stack[spi].slot_type;
- if (slot_type[0] == STACK_SPILL) {
+ if (stype[0] == STACK_SPILL) {
if (size != BPF_REG_SIZE) {
- verbose("invalid size of register spill\n");
+ verbose(env, "invalid size of register spill\n");
return -EACCES;
}
for (i = 1; i < BPF_REG_SIZE; i++) {
- if (slot_type[i] != STACK_SPILL) {
- verbose("corrupted spill memory\n");
+ if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) {
+ verbose(env, "corrupted spill memory\n");
return -EACCES;
}
}
- spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE;
-
if (value_regno >= 0) {
/* restore register state from stack */
- state->regs[value_regno] = state->spilled_regs[spi];
+ state->regs[value_regno] = state->stack[spi].spilled_ptr;
mark_stack_slot_read(state, spi);
}
return 0;
} else {
for (i = 0; i < size; i++) {
- if (slot_type[i] != STACK_MISC) {
- verbose("invalid read from stack off %d+%d size %d\n",
+ if (stype[(slot - i) % BPF_REG_SIZE] != STACK_MISC) {
+ verbose(env, "invalid read from stack off %d+%d size %d\n",
off, i, size);
return -EACCES;
}
}
if (value_regno >= 0)
/* have read misc data from the stack */
- mark_reg_unknown(state->regs, value_regno);
+ mark_reg_unknown(env, state->regs, value_regno);
return 0;
}
}
/* check read/write into map element returned by bpf_map_lookup_elem() */
static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
- int size)
+ int size, bool zero_size_allowed)
{
- struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
+ struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_map *map = regs[regno].map_ptr;
- if (off < 0 || size <= 0 || off + size > map->value_size) {
- verbose("invalid access to map value, value_size=%d off=%d size=%d\n",
+ if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) ||
+ off + size > map->value_size) {
+ verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n",
map->value_size, off, size);
return -EACCES;
}
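
The reworked bounds test above (and the matching one in __check_packet_access() further down) only tolerates a zero-sized access when the caller passes zero_size_allowed. A minimal standalone rendering of that predicate with made-up example values, not the kernel code itself:

#include <stdbool.h>
#include <stdio.h>

/* mirrors the patched condition: reject negative offsets or sizes,
 * reject size == 0 unless explicitly allowed, reject anything that
 * runs past the end of the value */
static bool access_ok(int off, int size, bool zero_size_allowed, int value_size)
{
	return !(off < 0 || size < 0 || (size == 0 && !zero_size_allowed) ||
		 off + size > value_size);
}

int main(void)
{
	printf("%d\n", access_ok(0, 0, false, 64));  /* 0: zero size rejected */
	printf("%d\n", access_ok(0, 0, true, 64));   /* 1: zero size allowed  */
	printf("%d\n", access_ok(60, 8, true, 64));  /* 0: runs past the end  */
	return 0;
}
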
@@ -824,9 +815,9 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
/* check read/write into a map element with possible variable offset */
static int check_map_access(struct bpf_verifier_env *env, u32 regno,
- int off, int size)
+ int off, int size, bool zero_size_allowed)
{
- struct bpf_verifier_state *state = &env->cur_state;
+ struct bpf_verifier_state *state = env->cur_state;
struct bpf_reg_state *reg = &state->regs[regno];
int err;
@@ -834,8 +825,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
* need to try adding each of min_value and max_value to off
* to make sure our theoretical access will be safe.
*/
- if (log_level)
- print_verifier_state(state);
+ if (env->log.level)
+ print_verifier_state(env, state);
/* The minimum value is only important with signed
* comparisons where we can't assume the floor of a
* value is 0. If we are using signed variables for our
@@ -843,13 +834,15 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
* will have a set floor within our range.
*/
if (reg->smin_value < 0) {
- verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+ verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
regno);
return -EACCES;
}
- err = __check_map_access(env, regno, reg->smin_value + off, size);
+ err = __check_map_access(env, regno, reg->smin_value + off, size,
+ zero_size_allowed);
if (err) {
- verbose("R%d min value is outside of the array range\n", regno);
+ verbose(env, "R%d min value is outside of the array range\n",
+ regno);
return err;
}
@@ -858,13 +851,15 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
* If reg->umax_value + off could overflow, treat that as unbounded too.
*/
if (reg->umax_value >= BPF_MAX_VAR_OFF) {
- verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n",
+ verbose(env, "R%d unbounded memory access, make sure to bounds check any array access into a map\n",
regno);
return -EACCES;
}
- err = __check_map_access(env, regno, reg->umax_value + off, size);
+ err = __check_map_access(env, regno, reg->umax_value + off, size,
+ zero_size_allowed);
if (err)
- verbose("R%d max value is outside of the array range\n", regno);
+ verbose(env, "R%d max value is outside of the array range\n",
+ regno);
return err;
}
@@ -897,13 +892,14 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
}
static int __check_packet_access(struct bpf_verifier_env *env, u32 regno,
- int off, int size)
+ int off, int size, bool zero_size_allowed)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = cur_regs(env);
struct bpf_reg_state *reg = &regs[regno];
- if (off < 0 || size <= 0 || (u64)off + size > reg->range) {
- verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
+ if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) ||
+ (u64)off + size > reg->range) {
+ verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
off, size, regno, reg->id, reg->off, reg->range);
return -EACCES;
}
@@ -911,9 +907,9 @@ static int __check_packet_access(struct bpf_verifier_env *env, u32 regno,
}
static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
- int size)
+ int size, bool zero_size_allowed)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = cur_regs(env);
struct bpf_reg_state *reg = &regs[regno];
int err;
@@ -926,13 +922,13 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
* detail to prove they're safe.
*/
if (reg->smin_value < 0) {
- verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+ verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
regno);
return -EACCES;
}
- err = __check_packet_access(env, regno, off, size);
+ err = __check_packet_access(env, regno, off, size, zero_size_allowed);
if (err) {
- verbose("R%d offset is outside of the packet\n", regno);
+ verbose(env, "R%d offset is outside of the packet\n", regno);
return err;
}
return err;
@@ -946,12 +942,8 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
.reg_type = *reg_type,
};
- /* for analyzer ctx accesses are already validated and converted */
- if (env->analyzer_ops)
- return 0;
-
- if (env->prog->aux->ops->is_valid_access &&
- env->prog->aux->ops->is_valid_access(off, size, t, &info)) {
+ if (env->ops->is_valid_access &&
+ env->ops->is_valid_access(off, size, t, &info)) {
/* A non zero info.ctx_field_size indicates that this field is a
* candidate for later verifier transformation to load the whole
* field and then apply a mask when accessed with a narrower
@@ -959,16 +951,16 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
* will only allow for whole field access and rejects any other
* type of narrower access.
*/
- env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
*reg_type = info.reg_type;
+ env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
/* remember the offset of last byte accessed in ctx */
if (env->prog->aux->max_ctx_offset < off + size)
env->prog->aux->max_ctx_offset = off + size;
return 0;
}
- verbose("invalid bpf_context access off=%d size=%d\n", off, size);
+ verbose(env, "invalid bpf_context access off=%d size=%d\n", off, size);
return -EACCES;
}
@@ -983,10 +975,11 @@ static bool __is_pointer_value(bool allow_ptr_leaks,
static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
{
- return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]);
+ return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno);
}
-static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
+static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
int off, int size, bool strict)
{
struct tnum reg_off;
@@ -1011,7 +1004,8 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose("misaligned packet access off %d+%s+%d+%d size %d\n",
+ verbose(env,
+ "misaligned packet access off %d+%s+%d+%d size %d\n",
ip_align, tn_buf, reg->off, off, size);
return -EACCES;
}
@@ -1019,7 +1013,8 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
return 0;
}
-static int check_generic_ptr_alignment(const struct bpf_reg_state *reg,
+static int check_generic_ptr_alignment(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
const char *pointer_desc,
int off, int size, bool strict)
{
@@ -1034,7 +1029,7 @@ static int check_generic_ptr_alignment(const struct bpf_reg_state *reg,
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose("misaligned %saccess off %s+%d+%d size %d\n",
+ verbose(env, "misaligned %saccess off %s+%d+%d size %d\n",
pointer_desc, tn_buf, reg->off, off, size);
return -EACCES;
}
@@ -1051,8 +1046,11 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
switch (reg->type) {
case PTR_TO_PACKET:
- /* special case, because of NET_IP_ALIGN */
- return check_pkt_ptr_alignment(reg, off, size, strict);
+ case PTR_TO_PACKET_META:
+ /* Special case, because of NET_IP_ALIGN. Given metadata sits
+ * right in front, treat it the very same way.
+ */
+ return check_pkt_ptr_alignment(env, reg, off, size, strict);
case PTR_TO_MAP_VALUE:
pointer_desc = "value ";
break;
@@ -1065,7 +1063,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
default:
break;
}
- return check_generic_ptr_alignment(reg, pointer_desc, off, size, strict);
+ return check_generic_ptr_alignment(env, reg, pointer_desc, off, size,
+ strict);
}
/* check whether memory at (regno + off) is accessible for t = (read | write)
@@ -1078,8 +1077,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
int bpf_size, enum bpf_access_type t,
int value_regno)
{
- struct bpf_verifier_state *state = &env->cur_state;
- struct bpf_reg_state *reg = &state->regs[regno];
+ struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_reg_state *reg = regs + regno;
int size, err = 0;
size = bpf_size_to_bytes(bpf_size);
@@ -1097,27 +1097,28 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (reg->type == PTR_TO_MAP_VALUE) {
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
- verbose("R%d leaks addr into map\n", value_regno);
+ verbose(env, "R%d leaks addr into map\n", value_regno);
return -EACCES;
}
- err = check_map_access(env, regno, off, size);
+ err = check_map_access(env, regno, off, size, false);
if (!err && t == BPF_READ && value_regno >= 0)
- mark_reg_unknown(state->regs, value_regno);
+ mark_reg_unknown(env, regs, value_regno);
} else if (reg->type == PTR_TO_CTX) {
enum bpf_reg_type reg_type = SCALAR_VALUE;
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
- verbose("R%d leaks addr into ctx\n", value_regno);
+ verbose(env, "R%d leaks addr into ctx\n", value_regno);
return -EACCES;
}
/* ctx accesses must be at a fixed offset, so that we can
* determine what type of data were returned.
*/
if (reg->off) {
- verbose("dereference of modified ctx ptr R%d off=%d+%d, ctx+const is allowed, ctx+const+const is not\n",
+ verbose(env,
+ "dereference of modified ctx ptr R%d off=%d+%d, ctx+const is allowed, ctx+const+const is not\n",
regno, reg->off, off - reg->off);
return -EACCES;
}
@@ -1125,24 +1126,26 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose("variable ctx access var_off=%s off=%d size=%d",
+ verbose(env,
+ "variable ctx access var_off=%s off=%d size=%d",
tn_buf, off, size);
return -EACCES;
}
err = check_ctx_access(env, insn_idx, off, size, t, &reg_type);
if (!err && t == BPF_READ && value_regno >= 0) {
/* ctx access returns either a scalar, or a
- * PTR_TO_PACKET[_END]. In the latter case, we know
- * the offset is zero.
+ * PTR_TO_PACKET[_META,_END]. In the latter
+ * case, we know the offset is zero.
*/
if (reg_type == SCALAR_VALUE)
- mark_reg_unknown(state->regs, value_regno);
+ mark_reg_unknown(env, regs, value_regno);
else
- mark_reg_known_zero(state->regs, value_regno);
- state->regs[value_regno].id = 0;
- state->regs[value_regno].off = 0;
- state->regs[value_regno].range = 0;
- state->regs[value_regno].type = reg_type;
+ mark_reg_known_zero(env, regs,
+ value_regno);
+ regs[value_regno].id = 0;
+ regs[value_regno].off = 0;
+ regs[value_regno].range = 0;
+ regs[value_regno].type = reg_type;
}
} else if (reg->type == PTR_TO_STACK) {
@@ -1154,55 +1157,52 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose("variable stack access var_off=%s off=%d size=%d",
+ verbose(env, "variable stack access var_off=%s off=%d size=%d",
tn_buf, off, size);
return -EACCES;
}
off += reg->var_off.value;
if (off >= 0 || off < -MAX_BPF_STACK) {
- verbose("invalid stack off=%d size=%d\n", off, size);
+ verbose(env, "invalid stack off=%d size=%d\n", off,
+ size);
return -EACCES;
}
if (env->prog->aux->stack_depth < -off)
env->prog->aux->stack_depth = -off;
- if (t == BPF_WRITE) {
- if (!env->allow_ptr_leaks &&
- state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL &&
- size != BPF_REG_SIZE) {
- verbose("attempt to corrupt spilled pointer on stack\n");
- return -EACCES;
- }
- err = check_stack_write(state, off, size, value_regno);
- } else {
- err = check_stack_read(state, off, size, value_regno);
- }
- } else if (reg->type == PTR_TO_PACKET) {
+ if (t == BPF_WRITE)
+ err = check_stack_write(env, state, off, size,
+ value_regno);
+ else
+ err = check_stack_read(env, state, off, size,
+ value_regno);
+ } else if (reg_is_pkt_pointer(reg)) {
if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
- verbose("cannot write into packet\n");
+ verbose(env, "cannot write into packet\n");
return -EACCES;
}
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
- verbose("R%d leaks addr into packet\n", value_regno);
+ verbose(env, "R%d leaks addr into packet\n",
+ value_regno);
return -EACCES;
}
- err = check_packet_access(env, regno, off, size);
+ err = check_packet_access(env, regno, off, size, false);
if (!err && t == BPF_READ && value_regno >= 0)
- mark_reg_unknown(state->regs, value_regno);
+ mark_reg_unknown(env, regs, value_regno);
} else {
- verbose("R%d invalid mem access '%s'\n",
- regno, reg_type_str[reg->type]);
+ verbose(env, "R%d invalid mem access '%s'\n", regno,
+ reg_type_str[reg->type]);
return -EACCES;
}
if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ &&
- state->regs[value_regno].type == SCALAR_VALUE) {
+ regs[value_regno].type == SCALAR_VALUE) {
/* b/h/w load zero-extends, mark upper bits as known 0 */
- state->regs[value_regno].var_off = tnum_cast(
- state->regs[value_regno].var_off, size);
- __update_reg_bounds(&state->regs[value_regno]);
+ regs[value_regno].var_off =
+ tnum_cast(regs[value_regno].var_off, size);
+ __update_reg_bounds(&regs[value_regno]);
}
return err;
}
@@ -1213,7 +1213,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) ||
insn->imm != 0) {
- verbose("BPF_XADD uses reserved fields\n");
+ verbose(env, "BPF_XADD uses reserved fields\n");
return -EINVAL;
}
@@ -1228,7 +1228,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
return err;
if (is_pointer_value(env, insn->src_reg)) {
- verbose("R%d leaks addr into mem\n", insn->src_reg);
+ verbose(env, "R%d leaks addr into mem\n", insn->src_reg);
return -EACCES;
}
@@ -1259,9 +1259,9 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
int access_size, bool zero_size_allowed,
struct bpf_call_arg_meta *meta)
{
- struct bpf_verifier_state *state = &env->cur_state;
+ struct bpf_verifier_state *state = env->cur_state;
struct bpf_reg_state *regs = state->regs;
- int off, i;
+ int off, i, slot, spi;
if (regs[regno].type != PTR_TO_STACK) {
/* Allow zero-byte read from NULL, regardless of pointer type */
@@ -1269,7 +1269,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
register_is_null(regs[regno]))
return 0;
- verbose("R%d type=%s expected=%s\n", regno,
+ verbose(env, "R%d type=%s expected=%s\n", regno,
reg_type_str[regs[regno].type],
reg_type_str[PTR_TO_STACK]);
return -EACCES;
@@ -1280,13 +1280,13 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off);
- verbose("invalid variable stack read R%d var_off=%s\n",
+ verbose(env, "invalid variable stack read R%d var_off=%s\n",
regno, tn_buf);
}
off = regs[regno].off + regs[regno].var_off.value;
if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
- access_size <= 0) {
- verbose("invalid stack type R%d off=%d access_size=%d\n",
+ access_size < 0 || (access_size == 0 && !zero_size_allowed)) {
+ verbose(env, "invalid stack type R%d off=%d access_size=%d\n",
regno, off, access_size);
return -EACCES;
}
@@ -1301,8 +1301,12 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
}
for (i = 0; i < access_size; i++) {
- if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) {
- verbose("invalid indirect read from stack off %d+%d size %d\n",
+ slot = -(off + i) - 1;
+ spi = slot / BPF_REG_SIZE;
+ if (state->allocated_stack <= slot ||
+ state->stack[spi].slot_type[slot % BPF_REG_SIZE] !=
+ STACK_MISC) {
+ verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
off, i, access_size);
return -EACCES;
}
@@ -1314,13 +1318,16 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
int access_size, bool zero_size_allowed,
struct bpf_call_arg_meta *meta)
{
- struct bpf_reg_state *regs = env->cur_state.regs, *reg = &regs[regno];
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
switch (reg->type) {
case PTR_TO_PACKET:
- return check_packet_access(env, regno, reg->off, access_size);
+ case PTR_TO_PACKET_META:
+ return check_packet_access(env, regno, reg->off, access_size,
+ zero_size_allowed);
case PTR_TO_MAP_VALUE:
- return check_map_access(env, regno, reg->off, access_size);
+ return check_map_access(env, regno, reg->off, access_size,
+ zero_size_allowed);
default: /* scalar_value|ptr_to_stack or invalid ptr */
return check_stack_boundary(env, regno, access_size,
zero_size_allowed, meta);
@@ -1331,7 +1338,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
enum bpf_arg_type arg_type,
struct bpf_call_arg_meta *meta)
{
- struct bpf_reg_state *regs = env->cur_state.regs, *reg = &regs[regno];
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
enum bpf_reg_type expected_type, type = reg->type;
int err = 0;
@@ -1344,22 +1351,24 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
if (arg_type == ARG_ANYTHING) {
if (is_pointer_value(env, regno)) {
- verbose("R%d leaks addr into helper function\n", regno);
+ verbose(env, "R%d leaks addr into helper function\n",
+ regno);
return -EACCES;
}
return 0;
}
- if (type == PTR_TO_PACKET &&
+ if (type_is_pkt_pointer(type) &&
!may_access_direct_pkt_data(env, meta, BPF_READ)) {
- verbose("helper access to the packet is not allowed\n");
+ verbose(env, "helper access to the packet is not allowed\n");
return -EACCES;
}
if (arg_type == ARG_PTR_TO_MAP_KEY ||
arg_type == ARG_PTR_TO_MAP_VALUE) {
expected_type = PTR_TO_STACK;
- if (type != PTR_TO_PACKET && type != expected_type)
+ if (!type_is_pkt_pointer(type) &&
+ type != expected_type)
goto err_type;
} else if (arg_type == ARG_CONST_SIZE ||
arg_type == ARG_CONST_SIZE_OR_ZERO) {
@@ -1375,20 +1384,23 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
if (type != expected_type)
goto err_type;
} else if (arg_type == ARG_PTR_TO_MEM ||
+ arg_type == ARG_PTR_TO_MEM_OR_NULL ||
arg_type == ARG_PTR_TO_UNINIT_MEM) {
expected_type = PTR_TO_STACK;
/* One exception here. In case function allows for NULL to be
* passed in as argument, it's a SCALAR_VALUE type. Final test
* happens during stack boundary checking.
*/
- if (register_is_null(*reg))
+ if (register_is_null(*reg) &&
+ arg_type == ARG_PTR_TO_MEM_OR_NULL)
/* final test in check_stack_boundary() */;
- else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE &&
+ else if (!type_is_pkt_pointer(type) &&
+ type != PTR_TO_MAP_VALUE &&
type != expected_type)
goto err_type;
meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
} else {
- verbose("unsupported arg_type %d\n", arg_type);
+ verbose(env, "unsupported arg_type %d\n", arg_type);
return -EFAULT;
}
@@ -1406,12 +1418,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
* we have to check map_key here. Otherwise it means
* that kernel subsystem misconfigured verifier
*/
- verbose("invalid map_ptr to access map->key\n");
+ verbose(env, "invalid map_ptr to access map->key\n");
return -EACCES;
}
- if (type == PTR_TO_PACKET)
+ if (type_is_pkt_pointer(type))
err = check_packet_access(env, regno, reg->off,
- meta->map_ptr->key_size);
+ meta->map_ptr->key_size,
+ false);
else
err = check_stack_boundary(env, regno,
meta->map_ptr->key_size,
@@ -1422,12 +1435,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
*/
if (!meta->map_ptr) {
/* kernel subsystem misconfigured verifier */
- verbose("invalid map_ptr to access map->value\n");
+ verbose(env, "invalid map_ptr to access map->value\n");
return -EACCES;
}
- if (type == PTR_TO_PACKET)
+ if (type_is_pkt_pointer(type))
err = check_packet_access(env, regno, reg->off,
- meta->map_ptr->value_size);
+ meta->map_ptr->value_size,
+ false);
else
err = check_stack_boundary(env, regno,
meta->map_ptr->value_size,
@@ -1442,7 +1456,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
*/
if (regno == 0) {
/* kernel subsystem misconfigured verifier */
- verbose("ARG_CONST_SIZE cannot be first argument\n");
+ verbose(env,
+ "ARG_CONST_SIZE cannot be first argument\n");
return -EACCES;
}
@@ -1459,7 +1474,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
meta = NULL;
if (reg->smin_value < 0) {
- verbose("R%d min value is negative, either use unsigned or 'var &= const'\n",
+ verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n",
regno);
return -EACCES;
}
@@ -1473,7 +1488,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
}
if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
- verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
+ verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
regno);
return -EACCES;
}
@@ -1484,12 +1499,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
return err;
err_type:
- verbose("R%d type=%s expected=%s\n", regno,
+ verbose(env, "R%d type=%s expected=%s\n", regno,
reg_type_str[type], reg_type_str[expected_type]);
return -EACCES;
}
-static int check_map_func_compatibility(struct bpf_map *map, int func_id)
+static int check_map_func_compatibility(struct bpf_verifier_env *env,
+ struct bpf_map *map, int func_id)
{
if (!map)
return 0;
@@ -1502,7 +1518,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
break;
case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
if (func_id != BPF_FUNC_perf_event_read &&
- func_id != BPF_FUNC_perf_event_output)
+ func_id != BPF_FUNC_perf_event_output &&
+ func_id != BPF_FUNC_perf_event_read_value)
goto error;
break;
case BPF_MAP_TYPE_STACK_TRACE:
@@ -1522,6 +1539,11 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
if (func_id != BPF_FUNC_redirect_map)
goto error;
break;
+ /* Restrict bpf side of cpumap, open when use-cases appear */
+ case BPF_MAP_TYPE_CPUMAP:
+ if (func_id != BPF_FUNC_redirect_map)
+ goto error;
+ break;
case BPF_MAP_TYPE_ARRAY_OF_MAPS:
case BPF_MAP_TYPE_HASH_OF_MAPS:
if (func_id != BPF_FUNC_map_lookup_elem)
@@ -1545,6 +1567,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
break;
case BPF_FUNC_perf_event_read:
case BPF_FUNC_perf_event_output:
+ case BPF_FUNC_perf_event_read_value:
if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
goto error;
break;
@@ -1558,7 +1581,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
goto error;
break;
case BPF_FUNC_redirect_map:
- if (map->map_type != BPF_MAP_TYPE_DEVMAP)
+ if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
+ map->map_type != BPF_MAP_TYPE_CPUMAP)
goto error;
break;
case BPF_FUNC_sk_redirect_map:
@@ -1575,7 +1599,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
return 0;
error:
- verbose("cannot pass map_type %d into func %s#%d\n",
+ verbose(env, "cannot pass map_type %d into func %s#%d\n",
map->map_type, func_id_name(func_id), func_id);
return -EINVAL;
}
@@ -1598,57 +1622,55 @@ static int check_raw_mode(const struct bpf_func_proto *fn)
return count > 1 ? -EINVAL : 0;
}
-/* Packet data might have moved, any old PTR_TO_PACKET[_END] are now invalid,
- * so turn them into unknown SCALAR_VALUE.
+/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
+ * are now invalid, so turn them into unknown SCALAR_VALUE.
*/
static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
{
- struct bpf_verifier_state *state = &env->cur_state;
+ struct bpf_verifier_state *state = env->cur_state;
struct bpf_reg_state *regs = state->regs, *reg;
int i;
for (i = 0; i < MAX_BPF_REG; i++)
- if (regs[i].type == PTR_TO_PACKET ||
- regs[i].type == PTR_TO_PACKET_END)
- mark_reg_unknown(regs, i);
+ if (reg_is_pkt_pointer_any(&regs[i]))
+ mark_reg_unknown(env, regs, i);
- for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
- if (state->stack_slot_type[i] != STACK_SPILL)
- continue;
- reg = &state->spilled_regs[i / BPF_REG_SIZE];
- if (reg->type != PTR_TO_PACKET &&
- reg->type != PTR_TO_PACKET_END)
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].slot_type[0] != STACK_SPILL)
continue;
- __mark_reg_unknown(reg);
+ reg = &state->stack[i].spilled_ptr;
+ if (reg_is_pkt_pointer_any(reg))
+ __mark_reg_unknown(reg);
}
}
static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
{
- struct bpf_verifier_state *state = &env->cur_state;
const struct bpf_func_proto *fn = NULL;
- struct bpf_reg_state *regs = state->regs;
+ struct bpf_reg_state *regs;
struct bpf_call_arg_meta meta;
bool changes_data;
int i, err;
/* find function prototype */
if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
- verbose("invalid func %s#%d\n", func_id_name(func_id), func_id);
+ verbose(env, "invalid func %s#%d\n", func_id_name(func_id),
+ func_id);
return -EINVAL;
}
- if (env->prog->aux->ops->get_func_proto)
- fn = env->prog->aux->ops->get_func_proto(func_id);
+ if (env->ops->get_func_proto)
+ fn = env->ops->get_func_proto(func_id);
if (!fn) {
- verbose("unknown func %s#%d\n", func_id_name(func_id), func_id);
+ verbose(env, "unknown func %s#%d\n", func_id_name(func_id),
+ func_id);
return -EINVAL;
}
/* eBPF programs must be GPL compatible to use GPL-ed functions */
if (!env->prog->gpl_compatible && fn->gpl_only) {
- verbose("cannot call GPL only function from proprietary program\n");
+ verbose(env, "cannot call GPL only function from proprietary program\n");
return -EINVAL;
}
@@ -1662,7 +1684,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
*/
err = check_raw_mode(fn);
if (err) {
- verbose("kernel subsystem misconfigured func %s#%d\n",
+ verbose(env, "kernel subsystem misconfigured func %s#%d\n",
func_id_name(func_id), func_id);
return err;
}
@@ -1693,16 +1715,17 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
return err;
}
+ regs = cur_regs(env);
/* reset caller saved regs */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
- mark_reg_not_init(regs, caller_saved[i]);
+ mark_reg_not_init(env, regs, caller_saved[i]);
check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
}
/* update return register (already marked as written above) */
if (fn->ret_type == RET_INTEGER) {
/* sets type to SCALAR_VALUE */
- mark_reg_unknown(regs, BPF_REG_0);
+ mark_reg_unknown(env, regs, BPF_REG_0);
} else if (fn->ret_type == RET_VOID) {
regs[BPF_REG_0].type = NOT_INIT;
} else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
@@ -1710,14 +1733,15 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
/* There is no offset yet applied, variable or fixed */
- mark_reg_known_zero(regs, BPF_REG_0);
+ mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].off = 0;
/* remember map_ptr, so that check_map_access()
* can check 'value_size' boundary of memory access
* to map element returned from bpf_map_lookup_elem()
*/
if (meta.map_ptr == NULL) {
- verbose("kernel subsystem misconfigured verifier\n");
+ verbose(env,
+ "kernel subsystem misconfigured verifier\n");
return -EINVAL;
}
regs[BPF_REG_0].map_ptr = meta.map_ptr;
@@ -1728,12 +1752,12 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
else if (insn_aux->map_ptr != meta.map_ptr)
insn_aux->map_ptr = BPF_MAP_PTR_POISON;
} else {
- verbose("unknown return type %d of func %s#%d\n",
+ verbose(env, "unknown return type %d of func %s#%d\n",
fn->ret_type, func_id_name(func_id), func_id);
return -EINVAL;
}
- err = check_map_func_compatibility(meta.map_ptr, func_id);
+ err = check_map_func_compatibility(env, meta.map_ptr, func_id);
if (err)
return err;
@@ -1780,7 +1804,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
const struct bpf_reg_state *ptr_reg,
const struct bpf_reg_state *off_reg)
{
- struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg;
+ struct bpf_reg_state *regs = cur_regs(env), *dst_reg;
bool known = tnum_is_const(off_reg->var_off);
s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value,
smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value;
@@ -1792,39 +1816,42 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
dst_reg = &regs[dst];
if (WARN_ON_ONCE(known && (smin_val != smax_val))) {
- print_verifier_state(&env->cur_state);
- verbose("verifier internal error: known but bad sbounds\n");
+ print_verifier_state(env, env->cur_state);
+ verbose(env,
+ "verifier internal error: known but bad sbounds\n");
return -EINVAL;
}
if (WARN_ON_ONCE(known && (umin_val != umax_val))) {
- print_verifier_state(&env->cur_state);
- verbose("verifier internal error: known but bad ubounds\n");
+ print_verifier_state(env, env->cur_state);
+ verbose(env,
+ "verifier internal error: known but bad ubounds\n");
return -EINVAL;
}
if (BPF_CLASS(insn->code) != BPF_ALU64) {
/* 32-bit ALU ops on pointers produce (meaningless) scalars */
if (!env->allow_ptr_leaks)
- verbose("R%d 32-bit pointer arithmetic prohibited\n",
+ verbose(env,
+ "R%d 32-bit pointer arithmetic prohibited\n",
dst);
return -EACCES;
}
if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
if (!env->allow_ptr_leaks)
- verbose("R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
+ verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
dst);
return -EACCES;
}
if (ptr_reg->type == CONST_PTR_TO_MAP) {
if (!env->allow_ptr_leaks)
- verbose("R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
+ verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
dst);
return -EACCES;
}
if (ptr_reg->type == PTR_TO_PACKET_END) {
if (!env->allow_ptr_leaks)
- verbose("R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
+ verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
dst);
return -EACCES;
}
@@ -1879,7 +1906,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
}
dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off);
dst_reg->off = ptr_reg->off;
- if (ptr_reg->type == PTR_TO_PACKET) {
+ if (reg_is_pkt_pointer(ptr_reg)) {
dst_reg->id = ++env->id_gen;
/* something was added to pkt_ptr, set range to zero */
dst_reg->range = 0;
@@ -1889,7 +1916,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
if (dst_reg == off_reg) {
/* scalar -= pointer. Creates an unknown scalar */
if (!env->allow_ptr_leaks)
- verbose("R%d tried to subtract pointer from scalar\n",
+ verbose(env, "R%d tried to subtract pointer from scalar\n",
dst);
return -EACCES;
}
@@ -1899,7 +1926,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
*/
if (ptr_reg->type == PTR_TO_STACK) {
if (!env->allow_ptr_leaks)
- verbose("R%d subtraction from stack pointer prohibited\n",
+ verbose(env, "R%d subtraction from stack pointer prohibited\n",
dst);
return -EACCES;
}
@@ -1939,7 +1966,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
}
dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off);
dst_reg->off = ptr_reg->off;
- if (ptr_reg->type == PTR_TO_PACKET) {
+ if (reg_is_pkt_pointer(ptr_reg)) {
dst_reg->id = ++env->id_gen;
/* something was added to pkt_ptr, set range to zero */
if (smin_val < 0)
@@ -1954,13 +1981,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
* ptr &= ~3 which would reduce min_value by 3.)
*/
if (!env->allow_ptr_leaks)
- verbose("R%d bitwise operator %s on pointer prohibited\n",
+ verbose(env, "R%d bitwise operator %s on pointer prohibited\n",
dst, bpf_alu_string[opcode >> 4]);
return -EACCES;
default:
/* other operators (e.g. MUL,LSH) produce non-pointer results */
if (!env->allow_ptr_leaks)
- verbose("R%d pointer arithmetic with %s operator prohibited\n",
+ verbose(env, "R%d pointer arithmetic with %s operator prohibited\n",
dst, bpf_alu_string[opcode >> 4]);
return -EACCES;
}
@@ -1976,7 +2003,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
struct bpf_reg_state *dst_reg,
struct bpf_reg_state src_reg)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = cur_regs(env);
u8 opcode = BPF_OP(insn->code);
bool src_known, dst_known;
s64 smin_val, smax_val;
@@ -2126,7 +2153,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
/* Shifts greater than 63 are undefined. This includes
* shifts by a negative number.
*/
- mark_reg_unknown(regs, insn->dst_reg);
+ mark_reg_unknown(env, regs, insn->dst_reg);
break;
}
/* We lose all sign bit information (except what we can pick
@@ -2154,7 +2181,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
/* Shifts greater than 63 are undefined. This includes
* shifts by a negative number.
*/
- mark_reg_unknown(regs, insn->dst_reg);
+ mark_reg_unknown(env, regs, insn->dst_reg);
break;
}
/* BPF_RSH is an unsigned shift, so make the appropriate casts */
@@ -2182,7 +2209,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
__update_reg_bounds(dst_reg);
break;
default:
- mark_reg_unknown(regs, insn->dst_reg);
+ mark_reg_unknown(env, regs, insn->dst_reg);
break;
}
@@ -2197,7 +2224,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
struct bpf_insn *insn)
{
- struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg, *src_reg;
+ struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg;
struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
u8 opcode = BPF_OP(insn->code);
int rc;
@@ -2214,12 +2241,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
* an arbitrary scalar.
*/
if (!env->allow_ptr_leaks) {
- verbose("R%d pointer %s pointer prohibited\n",
+ verbose(env, "R%d pointer %s pointer prohibited\n",
insn->dst_reg,
bpf_alu_string[opcode >> 4]);
return -EACCES;
}
- mark_reg_unknown(regs, insn->dst_reg);
+ mark_reg_unknown(env, regs, insn->dst_reg);
return 0;
} else {
/* scalar += pointer
@@ -2271,13 +2298,13 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
/* Got here implies adding two SCALAR_VALUEs */
if (WARN_ON_ONCE(ptr_reg)) {
- print_verifier_state(&env->cur_state);
- verbose("verifier internal error: unexpected ptr_reg\n");
+ print_verifier_state(env, env->cur_state);
+ verbose(env, "verifier internal error: unexpected ptr_reg\n");
return -EINVAL;
}
if (WARN_ON(!src_reg)) {
- print_verifier_state(&env->cur_state);
- verbose("verifier internal error: no src_reg\n");
+ print_verifier_state(env, env->cur_state);
+ verbose(env, "verifier internal error: no src_reg\n");
return -EINVAL;
}
return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg);
@@ -2286,7 +2313,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
/* check validity of 32-bit and 64-bit arithmetic operations */
static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = cur_regs(env);
u8 opcode = BPF_OP(insn->code);
int err;
@@ -2295,14 +2322,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (BPF_SRC(insn->code) != 0 ||
insn->src_reg != BPF_REG_0 ||
insn->off != 0 || insn->imm != 0) {
- verbose("BPF_NEG uses reserved fields\n");
+ verbose(env, "BPF_NEG uses reserved fields\n");
return -EINVAL;
}
} else {
if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
(insn->imm != 16 && insn->imm != 32 && insn->imm != 64) ||
BPF_CLASS(insn->code) == BPF_ALU64) {
- verbose("BPF_END uses reserved fields\n");
+ verbose(env, "BPF_END uses reserved fields\n");
return -EINVAL;
}
}
@@ -2313,7 +2340,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
if (is_pointer_value(env, insn->dst_reg)) {
- verbose("R%d pointer arithmetic prohibited\n",
+ verbose(env, "R%d pointer arithmetic prohibited\n",
insn->dst_reg);
return -EACCES;
}
@@ -2327,7 +2354,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (BPF_SRC(insn->code) == BPF_X) {
if (insn->imm != 0 || insn->off != 0) {
- verbose("BPF_MOV uses reserved fields\n");
+ verbose(env, "BPF_MOV uses reserved fields\n");
return -EINVAL;
}
@@ -2337,7 +2364,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
} else {
if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
- verbose("BPF_MOV uses reserved fields\n");
+ verbose(env, "BPF_MOV uses reserved fields\n");
return -EINVAL;
}
}
@@ -2357,11 +2384,12 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
} else {
/* R1 = (u32) R2 */
if (is_pointer_value(env, insn->src_reg)) {
- verbose("R%d partial copy of pointer\n",
+ verbose(env,
+ "R%d partial copy of pointer\n",
insn->src_reg);
return -EACCES;
}
- mark_reg_unknown(regs, insn->dst_reg);
+ mark_reg_unknown(env, regs, insn->dst_reg);
/* high 32 bits are known zero. */
regs[insn->dst_reg].var_off = tnum_cast(
regs[insn->dst_reg].var_off, 4);
@@ -2376,14 +2404,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
} else if (opcode > BPF_END) {
- verbose("invalid BPF_ALU opcode %x\n", opcode);
+ verbose(env, "invalid BPF_ALU opcode %x\n", opcode);
return -EINVAL;
} else { /* all other ALU ops: and, sub, xor, add, ... */
if (BPF_SRC(insn->code) == BPF_X) {
if (insn->imm != 0 || insn->off != 0) {
- verbose("BPF_ALU uses reserved fields\n");
+ verbose(env, "BPF_ALU uses reserved fields\n");
return -EINVAL;
}
/* check src1 operand */
@@ -2392,7 +2420,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
} else {
if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
- verbose("BPF_ALU uses reserved fields\n");
+ verbose(env, "BPF_ALU uses reserved fields\n");
return -EINVAL;
}
}
@@ -2404,7 +2432,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if ((opcode == BPF_MOD || opcode == BPF_DIV) &&
BPF_SRC(insn->code) == BPF_K && insn->imm == 0) {
- verbose("div by zero\n");
+ verbose(env, "div by zero\n");
return -EINVAL;
}
@@ -2413,7 +2441,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
if (insn->imm < 0 || insn->imm >= size) {
- verbose("invalid shift %d\n", insn->imm);
+ verbose(env, "invalid shift %d\n", insn->imm);
return -EINVAL;
}
}
@@ -2431,6 +2459,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
static void find_good_pkt_pointers(struct bpf_verifier_state *state,
struct bpf_reg_state *dst_reg,
+ enum bpf_reg_type type,
bool range_right_open)
{
struct bpf_reg_state *regs = state->regs, *reg;
@@ -2501,15 +2530,15 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
* dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
*/
for (i = 0; i < MAX_BPF_REG; i++)
- if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id)
+ if (regs[i].type == type && regs[i].id == dst_reg->id)
/* keep the maximum range already checked */
regs[i].range = max(regs[i].range, new_range);
- for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
- if (state->stack_slot_type[i] != STACK_SPILL)
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].slot_type[0] != STACK_SPILL)
continue;
- reg = &state->spilled_regs[i / BPF_REG_SIZE];
- if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id)
+ reg = &state->stack[i].spilled_ptr;
+ if (reg->type == type && reg->id == dst_reg->id)
reg->range = max(reg->range, new_range);
}
}
@@ -2758,29 +2787,122 @@ static void mark_map_regs(struct bpf_verifier_state *state, u32 regno,
for (i = 0; i < MAX_BPF_REG; i++)
mark_map_reg(regs, i, id, is_null);
- for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
- if (state->stack_slot_type[i] != STACK_SPILL)
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].slot_type[0] != STACK_SPILL)
continue;
- mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, id, is_null);
+ mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null);
+ }
+}
+
+static bool try_match_pkt_pointers(const struct bpf_insn *insn,
+ struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg,
+ struct bpf_verifier_state *this_branch,
+ struct bpf_verifier_state *other_branch)
+{
+ if (BPF_SRC(insn->code) != BPF_X)
+ return false;
+
+ switch (BPF_OP(insn->code)) {
+ case BPF_JGT:
+ if ((dst_reg->type == PTR_TO_PACKET &&
+ src_reg->type == PTR_TO_PACKET_END) ||
+ (dst_reg->type == PTR_TO_PACKET_META &&
+ reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
+ /* pkt_data' > pkt_end, pkt_meta' > pkt_data */
+ find_good_pkt_pointers(this_branch, dst_reg,
+ dst_reg->type, false);
+ } else if ((dst_reg->type == PTR_TO_PACKET_END &&
+ src_reg->type == PTR_TO_PACKET) ||
+ (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+ src_reg->type == PTR_TO_PACKET_META)) {
+ /* pkt_end > pkt_data', pkt_data > pkt_meta' */
+ find_good_pkt_pointers(other_branch, src_reg,
+ src_reg->type, true);
+ } else {
+ return false;
+ }
+ break;
+ case BPF_JLT:
+ if ((dst_reg->type == PTR_TO_PACKET &&
+ src_reg->type == PTR_TO_PACKET_END) ||
+ (dst_reg->type == PTR_TO_PACKET_META &&
+ reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
+ /* pkt_data' < pkt_end, pkt_meta' < pkt_data */
+ find_good_pkt_pointers(other_branch, dst_reg,
+ dst_reg->type, true);
+ } else if ((dst_reg->type == PTR_TO_PACKET_END &&
+ src_reg->type == PTR_TO_PACKET) ||
+ (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+ src_reg->type == PTR_TO_PACKET_META)) {
+ /* pkt_end < pkt_data', pkt_data < pkt_meta' */
+ find_good_pkt_pointers(this_branch, src_reg,
+ src_reg->type, false);
+ } else {
+ return false;
+ }
+ break;
+ case BPF_JGE:
+ if ((dst_reg->type == PTR_TO_PACKET &&
+ src_reg->type == PTR_TO_PACKET_END) ||
+ (dst_reg->type == PTR_TO_PACKET_META &&
+ reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
+ /* pkt_data' >= pkt_end, pkt_meta' >= pkt_data */
+ find_good_pkt_pointers(this_branch, dst_reg,
+ dst_reg->type, true);
+ } else if ((dst_reg->type == PTR_TO_PACKET_END &&
+ src_reg->type == PTR_TO_PACKET) ||
+ (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+ src_reg->type == PTR_TO_PACKET_META)) {
+ /* pkt_end >= pkt_data', pkt_data >= pkt_meta' */
+ find_good_pkt_pointers(other_branch, src_reg,
+ src_reg->type, false);
+ } else {
+ return false;
+ }
+ break;
+ case BPF_JLE:
+ if ((dst_reg->type == PTR_TO_PACKET &&
+ src_reg->type == PTR_TO_PACKET_END) ||
+ (dst_reg->type == PTR_TO_PACKET_META &&
+ reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
+ /* pkt_data' <= pkt_end, pkt_meta' <= pkt_data */
+ find_good_pkt_pointers(other_branch, dst_reg,
+ dst_reg->type, false);
+ } else if ((dst_reg->type == PTR_TO_PACKET_END &&
+ src_reg->type == PTR_TO_PACKET) ||
+ (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+ src_reg->type == PTR_TO_PACKET_META)) {
+ /* pkt_end <= pkt_data', pkt_data <= pkt_meta' */
+ find_good_pkt_pointers(this_branch, src_reg,
+ src_reg->type, true);
+ } else {
+ return false;
+ }
+ break;
+ default:
+ return false;
}
+
+ return true;
}
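
The comparisons this helper matches are the ones generated by the usual bounds checks in BPF C programs against ctx->data_end and, with this series, ctx->data_meta. A hedged sketch of that source-level pattern (assumes <linux/bpf.h> plus libbpf's <bpf/bpf_helpers.h> for SEC() and the bpf_xdp_adjust_meta() declaration; the UAPI names used here are standard and not taken from this diff):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("xdp")
int xdp_meta_sketch(struct xdp_md *ctx)
{
	void *data, *data_end, *data_meta;

	/* reserve 4 bytes of metadata in front of the packet; this moves
	 * packet data, so previously derived pointers are invalidated
	 * (clear_all_pkt_pointers() above) and must be re-loaded */
	if (bpf_xdp_adjust_meta(ctx, -(int)sizeof(__u32)))
		return XDP_PASS;

	data      = (void *)(long)ctx->data;
	data_end  = (void *)(long)ctx->data_end;
	data_meta = (void *)(long)ctx->data_meta;

	/* pkt_meta' > pkt_data: the new PTR_TO_PACKET_META cases above */
	if (data_meta + sizeof(__u32) > data)
		return XDP_PASS;
	*(__u32 *)data_meta = 0xcafe;

	/* pkt_data' > pkt_end: the pre-existing PTR_TO_PACKET case */
	if (data + 14 > data_end)
		return XDP_PASS;

	return ((__u8 *)data)[12] == 0x08 ? XDP_PASS : XDP_DROP;
}

char LICENSE[] SEC("license") = "GPL";
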
static int check_cond_jmp_op(struct bpf_verifier_env *env,
struct bpf_insn *insn, int *insn_idx)
{
- struct bpf_verifier_state *other_branch, *this_branch = &env->cur_state;
+ struct bpf_verifier_state *other_branch, *this_branch = env->cur_state;
struct bpf_reg_state *regs = this_branch->regs, *dst_reg;
u8 opcode = BPF_OP(insn->code);
int err;
if (opcode > BPF_JSLE) {
- verbose("invalid BPF_JMP opcode %x\n", opcode);
+ verbose(env, "invalid BPF_JMP opcode %x\n", opcode);
return -EINVAL;
}
if (BPF_SRC(insn->code) == BPF_X) {
if (insn->imm != 0) {
- verbose("BPF_JMP uses reserved fields\n");
+ verbose(env, "BPF_JMP uses reserved fields\n");
return -EINVAL;
}
@@ -2790,13 +2912,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
return err;
if (is_pointer_value(env, insn->src_reg)) {
- verbose("R%d pointer comparison prohibited\n",
+ verbose(env, "R%d pointer comparison prohibited\n",
insn->src_reg);
return -EACCES;
}
} else {
if (insn->src_reg != BPF_REG_0) {
- verbose("BPF_JMP uses reserved fields\n");
+ verbose(env, "BPF_JMP uses reserved fields\n");
return -EINVAL;
}
}
@@ -2871,52 +2993,15 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
*/
mark_map_regs(this_branch, insn->dst_reg, opcode == BPF_JNE);
mark_map_regs(other_branch, insn->dst_reg, opcode == BPF_JEQ);
- } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
- dst_reg->type == PTR_TO_PACKET &&
- regs[insn->src_reg].type == PTR_TO_PACKET_END) {
- /* pkt_data' > pkt_end */
- find_good_pkt_pointers(this_branch, dst_reg, false);
- } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
- dst_reg->type == PTR_TO_PACKET_END &&
- regs[insn->src_reg].type == PTR_TO_PACKET) {
- /* pkt_end > pkt_data' */
- find_good_pkt_pointers(other_branch, &regs[insn->src_reg], true);
- } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
- dst_reg->type == PTR_TO_PACKET &&
- regs[insn->src_reg].type == PTR_TO_PACKET_END) {
- /* pkt_data' < pkt_end */
- find_good_pkt_pointers(other_branch, dst_reg, true);
- } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
- dst_reg->type == PTR_TO_PACKET_END &&
- regs[insn->src_reg].type == PTR_TO_PACKET) {
- /* pkt_end < pkt_data' */
- find_good_pkt_pointers(this_branch, &regs[insn->src_reg], false);
- } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
- dst_reg->type == PTR_TO_PACKET &&
- regs[insn->src_reg].type == PTR_TO_PACKET_END) {
- /* pkt_data' >= pkt_end */
- find_good_pkt_pointers(this_branch, dst_reg, true);
- } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
- dst_reg->type == PTR_TO_PACKET_END &&
- regs[insn->src_reg].type == PTR_TO_PACKET) {
- /* pkt_end >= pkt_data' */
- find_good_pkt_pointers(other_branch, &regs[insn->src_reg], false);
- } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
- dst_reg->type == PTR_TO_PACKET &&
- regs[insn->src_reg].type == PTR_TO_PACKET_END) {
- /* pkt_data' <= pkt_end */
- find_good_pkt_pointers(other_branch, dst_reg, false);
- } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
- dst_reg->type == PTR_TO_PACKET_END &&
- regs[insn->src_reg].type == PTR_TO_PACKET) {
- /* pkt_end <= pkt_data' */
- find_good_pkt_pointers(this_branch, &regs[insn->src_reg], true);
- } else if (is_pointer_value(env, insn->dst_reg)) {
- verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
+ } else if (!try_match_pkt_pointers(insn, dst_reg, &regs[insn->src_reg],
+ this_branch, other_branch) &&
+ is_pointer_value(env, insn->dst_reg)) {
+ verbose(env, "R%d pointer comparison prohibited\n",
+ insn->dst_reg);
return -EACCES;
}
- if (log_level)
- print_verifier_state(this_branch);
+ if (env->log.level)
+ print_verifier_state(env, this_branch);
return 0;
}
@@ -2931,15 +3016,15 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
/* verify BPF_LD_IMM64 instruction */
static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = cur_regs(env);
int err;
if (BPF_SIZE(insn->code) != BPF_DW) {
- verbose("invalid BPF_LD_IMM insn\n");
+ verbose(env, "invalid BPF_LD_IMM insn\n");
return -EINVAL;
}
if (insn->off != 0) {
- verbose("BPF_LD_IMM64 uses reserved fields\n");
+ verbose(env, "BPF_LD_IMM64 uses reserved fields\n");
return -EINVAL;
}
@@ -2992,19 +3077,19 @@ static bool may_access_skb(enum bpf_prog_type type)
*/
static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = cur_regs(env);
u8 mode = BPF_MODE(insn->code);
int i, err;
if (!may_access_skb(env->prog->type)) {
- verbose("BPF_LD_[ABS|IND] instructions not allowed for this program type\n");
+ verbose(env, "BPF_LD_[ABS|IND] instructions not allowed for this program type\n");
return -EINVAL;
}
if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
BPF_SIZE(insn->code) == BPF_DW ||
(mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
- verbose("BPF_LD_[ABS|IND] uses reserved fields\n");
+ verbose(env, "BPF_LD_[ABS|IND] uses reserved fields\n");
return -EINVAL;
}
@@ -3014,7 +3099,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
if (regs[BPF_REG_6].type != PTR_TO_CTX) {
- verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
+ verbose(env,
+ "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
return -EINVAL;
}
@@ -3027,7 +3113,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
/* reset caller saved regs to unreadable */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
- mark_reg_not_init(regs, caller_saved[i]);
+ mark_reg_not_init(env, regs, caller_saved[i]);
check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
}
@@ -3035,7 +3121,45 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
* the value fetched from the packet.
* Already marked as written above.
*/
- mark_reg_unknown(regs, BPF_REG_0);
+ mark_reg_unknown(env, regs, BPF_REG_0);
+ return 0;
+}
+
+static int check_return_code(struct bpf_verifier_env *env)
+{
+ struct bpf_reg_state *reg;
+ struct tnum range = tnum_range(0, 1);
+
+ switch (env->prog->type) {
+ case BPF_PROG_TYPE_CGROUP_SKB:
+ case BPF_PROG_TYPE_CGROUP_SOCK:
+ case BPF_PROG_TYPE_SOCK_OPS:
+ case BPF_PROG_TYPE_CGROUP_DEVICE:
+ break;
+ default:
+ return 0;
+ }
+
+ reg = cur_regs(env) + BPF_REG_0;
+ if (reg->type != SCALAR_VALUE) {
+ verbose(env, "At program exit the register R0 is not a known value (%s)\n",
+ reg_type_str[reg->type]);
+ return -EINVAL;
+ }
+
+ if (!tnum_in(range, reg->var_off)) {
+ verbose(env, "At program exit the register R0 ");
+ if (!tnum_is_unknown(reg->var_off)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env, "has value %s", tn_buf);
+ } else {
+ verbose(env, "has unknown scalar value");
+ }
+ verbose(env, " should have been 0 or 1\n");
+ return -EINVAL;
+ }
return 0;
}
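The 0/1 check in check_return_code() above is performed on the verifier's tnum ("tracked number") representation of R0. The stand-alone user-space sketch below models that subset test in the spirit of tnum_range()/tnum_in() from kernel/bpf/tnum.c; the helper names and the main() harness are local to the sketch rather than kernel API, and the kernel's own helpers remain the authoritative semantics.

/* Minimal model: a tnum tracks which bits of a value are known. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct tnum {
	uint64_t value;	/* value of the known bits */
	uint64_t mask;	/* set bits are unknown */
};

/* tnum_range(0, 1): only bit 0 may vary, every other bit is known zero. */
static struct tnum tnum_range_0_1(void)
{
	return (struct tnum){ .value = 0, .mask = 1 };
}

/* "b fits inside a": b may not be unknown where a is known, and the
 * known bits outside a's unknown region must agree.
 */
static bool tnum_in(struct tnum a, struct tnum b)
{
	if (b.mask & ~a.mask)
		return false;
	b.value &= ~a.mask;
	return a.value == b.value;
}

int main(void)
{
	struct tnum range = tnum_range_0_1();
	struct tnum r0_one = { .value = 1, .mask = 0 };		/* R0 == 1 */
	struct tnum r0_two = { .value = 2, .mask = 0 };		/* R0 == 2 */
	struct tnum r0_unk = { .value = 0, .mask = ~0ULL };	/* unknown */

	printf("R0 == 1    -> %s\n", tnum_in(range, r0_one) ? "accepted" : "rejected");
	printf("R0 == 2    -> %s\n", tnum_in(range, r0_two) ? "accepted" : "rejected");
	printf("R0 unknown -> %s\n", tnum_in(range, r0_unk) ? "accepted" : "rejected");
	return 0;
}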
@@ -3099,7 +3223,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
return 0;
if (w < 0 || w >= env->prog->len) {
- verbose("jump out of range from insn %d to %d\n", t, w);
+ verbose(env, "jump out of range from insn %d to %d\n", t, w);
return -EINVAL;
}
@@ -3116,13 +3240,13 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
insn_stack[cur_stack++] = w;
return 1;
} else if ((insn_state[w] & 0xF0) == DISCOVERED) {
- verbose("back-edge from insn %d to %d\n", t, w);
+ verbose(env, "back-edge from insn %d to %d\n", t, w);
return -EINVAL;
} else if (insn_state[w] == EXPLORED) {
/* forward- or cross-edge */
insn_state[t] = DISCOVERED | e;
} else {
- verbose("insn state internal bug\n");
+ verbose(env, "insn state internal bug\n");
return -EFAULT;
}
return 0;
@@ -3216,7 +3340,7 @@ peek_stack:
mark_explored:
insn_state[t] = EXPLORED;
if (cur_stack-- <= 0) {
- verbose("pop stack internal bug\n");
+ verbose(env, "pop stack internal bug\n");
ret = -EFAULT;
goto err_free;
}
@@ -3225,7 +3349,7 @@ mark_explored:
check_state:
for (i = 0; i < insn_cnt; i++) {
if (insn_state[i] != EXPLORED) {
- verbose("unreachable insn %d\n", i);
+ verbose(env, "unreachable insn %d\n", i);
ret = -EINVAL;
goto err_free;
}
@@ -3340,8 +3464,9 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
return false;
/* Check our ids match any regs they're supposed to */
return check_ids(rold->id, rcur->id, idmap);
+ case PTR_TO_PACKET_META:
case PTR_TO_PACKET:
- if (rcur->type != PTR_TO_PACKET)
+ if (rcur->type != rold->type)
return false;
/* We must have at least as much range as the old ptr
* did, so that any accesses which were safe before are
@@ -3379,6 +3504,57 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
return false;
}
+static bool stacksafe(struct bpf_verifier_state *old,
+ struct bpf_verifier_state *cur,
+ struct idpair *idmap)
+{
+ int i, spi;
+
+ /* if explored stack has more populated slots than current stack
+ * such stacks are not equivalent
+ */
+ if (old->allocated_stack > cur->allocated_stack)
+ return false;
+
+ /* walk slots of the explored stack and ignore any additional
+ * slots in the current stack, since explored(safe) state
+ * didn't use them
+ */
+ for (i = 0; i < old->allocated_stack; i++) {
+ spi = i / BPF_REG_SIZE;
+
+ if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID)
+ continue;
+ if (old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
+ cur->stack[spi].slot_type[i % BPF_REG_SIZE])
+ /* Ex: old explored (safe) state has STACK_SPILL in
+ * this stack slot, but current has STACK_MISC ->
+ * these verifier states are not equivalent,
+ * return false to continue verification of this path
+ */
+ return false;
+ if (i % BPF_REG_SIZE)
+ continue;
+ if (old->stack[spi].slot_type[0] != STACK_SPILL)
+ continue;
+ if (!regsafe(&old->stack[spi].spilled_ptr,
+ &cur->stack[spi].spilled_ptr,
+ idmap))
+ /* when explored and current stack slot are both storing
+ * spilled registers, check that stored pointer types
+ * are the same as well.
+ * Ex: explored safe path could have stored
+ * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8}
+ * but current path has stored:
+ * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16}
+ * such verifier states are not equivalent.
+ * return false to continue verification of this path
+ */
+ return false;
+ }
+ return true;
+}
+
/* compare two verifier states
*
* all states stored in state_list are known to be valid, since
@@ -3423,37 +3599,8 @@ static bool states_equal(struct bpf_verifier_env *env,
goto out_free;
}
- for (i = 0; i < MAX_BPF_STACK; i++) {
- if (old->stack_slot_type[i] == STACK_INVALID)
- continue;
- if (old->stack_slot_type[i] != cur->stack_slot_type[i])
- /* Ex: old explored (safe) state has STACK_SPILL in
- * this stack slot, but current has has STACK_MISC ->
- * this verifier states are not equivalent,
- * return false to continue verification of this path
- */
- goto out_free;
- if (i % BPF_REG_SIZE)
- continue;
- if (old->stack_slot_type[i] != STACK_SPILL)
- continue;
- if (!regsafe(&old->spilled_regs[i / BPF_REG_SIZE],
- &cur->spilled_regs[i / BPF_REG_SIZE],
- idmap))
- /* when explored and current stack slot are both storing
- * spilled registers, check that stored pointers types
- * are the same as well.
- * Ex: explored safe path could have stored
- * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8}
- * but current path has stored:
- * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16}
- * such verifier states are not equivalent.
- * return false to continue verification of this path
- */
- goto out_free;
- else
- continue;
- }
+ if (!stacksafe(old, cur, idmap))
+ goto out_free;
ret = true;
out_free:
kfree(idmap);
@@ -3489,17 +3636,19 @@ static bool do_propagate_liveness(const struct bpf_verifier_state *state,
}
}
/* ... and stack slots */
- for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++) {
- if (parent->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL)
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE &&
+ i < parent->allocated_stack / BPF_REG_SIZE; i++) {
+ if (parent->stack[i].slot_type[0] != STACK_SPILL)
continue;
- if (state->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL)
+ if (state->stack[i].slot_type[0] != STACK_SPILL)
continue;
- if (parent->spilled_regs[i].live & REG_LIVE_READ)
+ if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ)
continue;
- if (writes && (state->spilled_regs[i].live & REG_LIVE_WRITTEN))
+ if (writes &&
+ (state->stack[i].spilled_ptr.live & REG_LIVE_WRITTEN))
continue;
- if (state->spilled_regs[i].live & REG_LIVE_READ) {
- parent->spilled_regs[i].live |= REG_LIVE_READ;
+ if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) {
+ parent->stack[i].spilled_ptr.live |= REG_LIVE_READ;
touched = true;
}
}
@@ -3529,7 +3678,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
{
struct bpf_verifier_state_list *new_sl;
struct bpf_verifier_state_list *sl;
- int i;
+ struct bpf_verifier_state *cur = env->cur_state;
+ int i, err;
sl = env->explored_states[insn_idx];
if (!sl)
@@ -3539,7 +3689,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
return 0;
while (sl != STATE_LIST_MARK) {
- if (states_equal(env, &sl->state, &env->cur_state)) {
+ if (states_equal(env, &sl->state, cur)) {
/* reached equivalent register/stack state,
* prune the search.
* Registers read by the continuation are read by us.
@@ -3550,7 +3700,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* they'll be immediately forgotten as we're pruning
* this state and will pop a new one.
*/
- propagate_liveness(&sl->state, &env->cur_state);
+ propagate_liveness(&sl->state, cur);
return 1;
}
sl = sl->next;
@@ -3562,16 +3712,21 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* it will be rejected. Since there are no loops, we won't be
* seeing this 'insn_idx' instruction again on the way to bpf_exit
*/
- new_sl = kmalloc(sizeof(struct bpf_verifier_state_list), GFP_USER);
+ new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL);
if (!new_sl)
return -ENOMEM;
/* add new state to the head of linked list */
- memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state));
+ err = copy_verifier_state(&new_sl->state, cur);
+ if (err) {
+ free_verifier_state(&new_sl->state, false);
+ kfree(new_sl);
+ return err;
+ }
new_sl->next = env->explored_states[insn_idx];
env->explored_states[insn_idx] = new_sl;
/* connect new state to parentage chain */
- env->cur_state.parent = &new_sl->state;
+ cur->parent = &new_sl->state;
/* clear write marks in current state: the writes we did are not writes
* our child did, so they don't screen off its reads from us.
* (There are no read marks in current state, because reads always mark
@@ -3579,33 +3734,37 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* explored_states can get read marks.)
*/
for (i = 0; i < BPF_REG_FP; i++)
- env->cur_state.regs[i].live = REG_LIVE_NONE;
- for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++)
- if (env->cur_state.stack_slot_type[i * BPF_REG_SIZE] == STACK_SPILL)
- env->cur_state.spilled_regs[i].live = REG_LIVE_NONE;
+ cur->regs[i].live = REG_LIVE_NONE;
+ for (i = 0; i < cur->allocated_stack / BPF_REG_SIZE; i++)
+ if (cur->stack[i].slot_type[0] == STACK_SPILL)
+ cur->stack[i].spilled_ptr.live = REG_LIVE_NONE;
return 0;
}
static int ext_analyzer_insn_hook(struct bpf_verifier_env *env,
int insn_idx, int prev_insn_idx)
{
- if (!env->analyzer_ops || !env->analyzer_ops->insn_hook)
- return 0;
+ if (env->dev_ops && env->dev_ops->insn_hook)
+ return env->dev_ops->insn_hook(env, insn_idx, prev_insn_idx);
- return env->analyzer_ops->insn_hook(env, insn_idx, prev_insn_idx);
+ return 0;
}
static int do_check(struct bpf_verifier_env *env)
{
- struct bpf_verifier_state *state = &env->cur_state;
+ struct bpf_verifier_state *state;
struct bpf_insn *insns = env->prog->insnsi;
- struct bpf_reg_state *regs = state->regs;
+ struct bpf_reg_state *regs;
int insn_cnt = env->prog->len;
int insn_idx, prev_insn_idx = 0;
int insn_processed = 0;
bool do_print_state = false;
- init_reg_state(regs);
+ state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL);
+ if (!state)
+ return -ENOMEM;
+ env->cur_state = state;
+ init_reg_state(env, state->regs);
state->parent = NULL;
insn_idx = 0;
for (;;) {
@@ -3614,7 +3773,7 @@ static int do_check(struct bpf_verifier_env *env)
int err;
if (insn_idx >= insn_cnt) {
- verbose("invalid insn idx %d insn_cnt %d\n",
+ verbose(env, "invalid insn idx %d insn_cnt %d\n",
insn_idx, insn_cnt);
return -EFAULT;
}
@@ -3623,7 +3782,8 @@ static int do_check(struct bpf_verifier_env *env)
class = BPF_CLASS(insn->code);
if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) {
- verbose("BPF program is too large. Processed %d insn\n",
+ verbose(env,
+ "BPF program is too large. Processed %d insn\n",
insn_processed);
return -E2BIG;
}
@@ -3633,12 +3793,12 @@ static int do_check(struct bpf_verifier_env *env)
return err;
if (err == 1) {
/* found equivalent state, can prune the search */
- if (log_level) {
+ if (env->log.level) {
if (do_print_state)
- verbose("\nfrom %d to %d: safe\n",
+ verbose(env, "\nfrom %d to %d: safe\n",
prev_insn_idx, insn_idx);
else
- verbose("%d: safe\n", insn_idx);
+ verbose(env, "%d: safe\n", insn_idx);
}
goto process_bpf_exit;
}
@@ -3646,25 +3806,28 @@ static int do_check(struct bpf_verifier_env *env)
if (need_resched())
cond_resched();
- if (log_level > 1 || (log_level && do_print_state)) {
- if (log_level > 1)
- verbose("%d:", insn_idx);
+ if (env->log.level > 1 || (env->log.level && do_print_state)) {
+ if (env->log.level > 1)
+ verbose(env, "%d:", insn_idx);
else
- verbose("\nfrom %d to %d:",
+ verbose(env, "\nfrom %d to %d:",
prev_insn_idx, insn_idx);
- print_verifier_state(&env->cur_state);
+ print_verifier_state(env, state);
do_print_state = false;
}
- if (log_level) {
- verbose("%d: ", insn_idx);
- print_bpf_insn(env, insn);
+ if (env->log.level) {
+ verbose(env, "%d: ", insn_idx);
+ print_bpf_insn(verbose, env, insn,
+ env->allow_ptr_leaks);
}
err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx);
if (err)
return err;
+ regs = cur_regs(env);
+ env->insn_aux_data[insn_idx].seen = true;
if (class == BPF_ALU || class == BPF_ALU64) {
err = check_alu_op(env, insn);
if (err)
@@ -3714,7 +3877,7 @@ static int do_check(struct bpf_verifier_env *env)
* src_reg == stack|map in some other branch.
* Reject it.
*/
- verbose("same insn cannot be used with different pointers\n");
+ verbose(env, "same insn cannot be used with different pointers\n");
return -EINVAL;
}
@@ -3754,14 +3917,14 @@ static int do_check(struct bpf_verifier_env *env)
} else if (dst_reg_type != *prev_dst_type &&
(dst_reg_type == PTR_TO_CTX ||
*prev_dst_type == PTR_TO_CTX)) {
- verbose("same insn cannot be used with different pointers\n");
+ verbose(env, "same insn cannot be used with different pointers\n");
return -EINVAL;
}
} else if (class == BPF_ST) {
if (BPF_MODE(insn->code) != BPF_MEM ||
insn->src_reg != BPF_REG_0) {
- verbose("BPF_ST uses reserved fields\n");
+ verbose(env, "BPF_ST uses reserved fields\n");
return -EINVAL;
}
/* check src operand */
@@ -3784,7 +3947,7 @@ static int do_check(struct bpf_verifier_env *env)
insn->off != 0 ||
insn->src_reg != BPF_REG_0 ||
insn->dst_reg != BPF_REG_0) {
- verbose("BPF_CALL uses reserved fields\n");
+ verbose(env, "BPF_CALL uses reserved fields\n");
return -EINVAL;
}
@@ -3797,7 +3960,7 @@ static int do_check(struct bpf_verifier_env *env)
insn->imm != 0 ||
insn->src_reg != BPF_REG_0 ||
insn->dst_reg != BPF_REG_0) {
- verbose("BPF_JA uses reserved fields\n");
+ verbose(env, "BPF_JA uses reserved fields\n");
return -EINVAL;
}
@@ -3809,7 +3972,7 @@ static int do_check(struct bpf_verifier_env *env)
insn->imm != 0 ||
insn->src_reg != BPF_REG_0 ||
insn->dst_reg != BPF_REG_0) {
- verbose("BPF_EXIT uses reserved fields\n");
+ verbose(env, "BPF_EXIT uses reserved fields\n");
return -EINVAL;
}
@@ -3824,13 +3987,18 @@ static int do_check(struct bpf_verifier_env *env)
return err;
if (is_pointer_value(env, BPF_REG_0)) {
- verbose("R0 leaks addr as return value\n");
+ verbose(env, "R0 leaks addr as return value\n");
return -EACCES;
}
+ err = check_return_code(env);
+ if (err)
+ return err;
process_bpf_exit:
- insn_idx = pop_stack(env, &prev_insn_idx);
- if (insn_idx < 0) {
+ err = pop_stack(env, &prev_insn_idx, &insn_idx);
+ if (err < 0) {
+ if (err != -ENOENT)
+ return err;
break;
} else {
do_print_state = true;
@@ -3855,20 +4023,21 @@ process_bpf_exit:
return err;
insn_idx++;
+ env->insn_aux_data[insn_idx].seen = true;
} else {
- verbose("invalid BPF_LD mode\n");
+ verbose(env, "invalid BPF_LD mode\n");
return -EINVAL;
}
} else {
- verbose("unknown insn class %d\n", class);
+ verbose(env, "unknown insn class %d\n", class);
return -EINVAL;
}
insn_idx++;
}
- verbose("processed %d insns, stack depth %d\n",
- insn_processed, env->prog->aux->stack_depth);
+ verbose(env, "processed %d insns, stack depth %d\n", insn_processed,
+ env->prog->aux->stack_depth);
return 0;
}
@@ -3880,7 +4049,8 @@ static int check_map_prealloc(struct bpf_map *map)
!(map->map_flags & BPF_F_NO_PREALLOC);
}
-static int check_map_prog_compatibility(struct bpf_map *map,
+static int check_map_prog_compatibility(struct bpf_verifier_env *env,
+ struct bpf_map *map,
struct bpf_prog *prog)
{
@@ -3891,12 +4061,12 @@ static int check_map_prog_compatibility(struct bpf_map *map,
*/
if (prog->type == BPF_PROG_TYPE_PERF_EVENT) {
if (!check_map_prealloc(map)) {
- verbose("perf_event programs can only use preallocated hash map\n");
+ verbose(env, "perf_event programs can only use preallocated hash map\n");
return -EINVAL;
}
if (map->inner_map_meta &&
!check_map_prealloc(map->inner_map_meta)) {
- verbose("perf_event programs can only use preallocated inner hash map\n");
+ verbose(env, "perf_event programs can only use preallocated inner hash map\n");
return -EINVAL;
}
}
@@ -3919,14 +4089,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
for (i = 0; i < insn_cnt; i++, insn++) {
if (BPF_CLASS(insn->code) == BPF_LDX &&
(BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) {
- verbose("BPF_LDX uses reserved fields\n");
+ verbose(env, "BPF_LDX uses reserved fields\n");
return -EINVAL;
}
if (BPF_CLASS(insn->code) == BPF_STX &&
((BPF_MODE(insn->code) != BPF_MEM &&
BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) {
- verbose("BPF_STX uses reserved fields\n");
+ verbose(env, "BPF_STX uses reserved fields\n");
return -EINVAL;
}
@@ -3937,7 +4107,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
if (i == insn_cnt - 1 || insn[1].code != 0 ||
insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
insn[1].off != 0) {
- verbose("invalid bpf_ld_imm64 insn\n");
+ verbose(env, "invalid bpf_ld_imm64 insn\n");
return -EINVAL;
}
@@ -3946,19 +4116,20 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
goto next_insn;
if (insn->src_reg != BPF_PSEUDO_MAP_FD) {
- verbose("unrecognized bpf_ld_imm64 insn\n");
+ verbose(env,
+ "unrecognized bpf_ld_imm64 insn\n");
return -EINVAL;
}
f = fdget(insn->imm);
map = __bpf_map_get(f);
if (IS_ERR(map)) {
- verbose("fd %d is not pointing to valid bpf_map\n",
+ verbose(env, "fd %d is not pointing to valid bpf_map\n",
insn->imm);
return PTR_ERR(map);
}
- err = check_map_prog_compatibility(map, env->prog);
+ err = check_map_prog_compatibility(env, map, env->prog);
if (err) {
fdput(f);
return err;
@@ -4035,6 +4206,7 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len,
u32 off, u32 cnt)
{
struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data;
+ int i;
if (cnt == 1)
return 0;
@@ -4044,6 +4216,8 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len,
memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off);
memcpy(new_data + off + cnt - 1, old_data + off,
sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
+ for (i = off; i < off + cnt - 1; i++)
+ new_data[i].seen = true;
env->insn_aux_data = new_data;
vfree(old_data);
return 0;
@@ -4062,12 +4236,31 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
return new_prog;
}
+/* The verifier does more data flow analysis than llvm and will not explore
+ * branches that are dead at run time. Malicious programs can have dead code
+ * too. Therefore replace all dead at-run-time code with nops.
+ */
+static void sanitize_dead_code(struct bpf_verifier_env *env)
+{
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+ struct bpf_insn nop = BPF_MOV64_REG(BPF_REG_0, BPF_REG_0);
+ struct bpf_insn *insn = env->prog->insnsi;
+ const int insn_cnt = env->prog->len;
+ int i;
+
+ for (i = 0; i < insn_cnt; i++) {
+ if (aux_data[i].seen)
+ continue;
+ memcpy(insn + i, &nop, sizeof(nop));
+ }
+}
+
/* convert load instructions that access fields of 'struct __sk_buff'
* into sequence of instructions that access fields of 'struct sk_buff'
*/
static int convert_ctx_accesses(struct bpf_verifier_env *env)
{
- const struct bpf_verifier_ops *ops = env->prog->aux->ops;
+ const struct bpf_verifier_ops *ops = env->ops;
int i, cnt, size, ctx_field_size, delta = 0;
const int insn_cnt = env->prog->len;
struct bpf_insn insn_buf[16], *insn;
@@ -4080,7 +4273,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
env->prog);
if (cnt >= ARRAY_SIZE(insn_buf)) {
- verbose("bpf verifier is misconfigured\n");
+ verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
} else if (cnt) {
new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
@@ -4128,7 +4321,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
u8 size_code;
if (type == BPF_WRITE) {
- verbose("bpf verifier narrow ctx access misconfigured\n");
+ verbose(env, "bpf verifier narrow ctx access misconfigured\n");
return -EINVAL;
}
@@ -4147,7 +4340,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
&target_size);
if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) ||
(ctx_field_size && !target_size)) {
- verbose("bpf verifier is misconfigured\n");
+ verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
}
@@ -4229,7 +4422,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf);
if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
- verbose("bpf verifier is misconfigured\n");
+ verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
}
@@ -4268,12 +4461,13 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
insn = new_prog->insnsi + i + delta;
}
patch_call_imm:
- fn = prog->aux->ops->get_func_proto(insn->imm);
+ fn = env->ops->get_func_proto(insn->imm);
/* all functions that have prototype and verifier allowed
* programs to call them, must be real in-kernel functions
*/
if (!fn->func) {
- verbose("kernel subsystem misconfigured func %s#%d\n",
+ verbose(env,
+ "kernel subsystem misconfigured func %s#%d\n",
func_id_name(insn->imm), insn->imm);
return -EFAULT;
}
@@ -4297,6 +4491,7 @@ static void free_states(struct bpf_verifier_env *env)
if (sl)
while (sl != STATE_LIST_MARK) {
sln = sl->next;
+ free_verifier_state(&sl->state, false);
kfree(sl);
sl = sln;
}
@@ -4307,16 +4502,21 @@ static void free_states(struct bpf_verifier_env *env)
int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
{
- char __user *log_ubuf = NULL;
struct bpf_verifier_env *env;
+ struct bpf_verifer_log *log;
int ret = -EINVAL;
+ /* no program is valid */
+ if (ARRAY_SIZE(bpf_verifier_ops) == 0)
+ return -EINVAL;
+
/* 'struct bpf_verifier_env' can be global, but since it's not small,
* allocate/free it every time bpf_check() is called
*/
env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
if (!env)
return -ENOMEM;
+ log = &env->log;
env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) *
(*prog)->len);
@@ -4324,6 +4524,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
if (!env->insn_aux_data)
goto err_free_env;
env->prog = *prog;
+ env->ops = bpf_verifier_ops[env->prog->type];
/* grab the mutex to protect few globals used by verifier */
mutex_lock(&bpf_verifier_lock);
@@ -4332,29 +4533,27 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
/* user requested verbose verifier output
* and supplied buffer to store the verification trace
*/
- log_level = attr->log_level;
- log_ubuf = (char __user *) (unsigned long) attr->log_buf;
- log_size = attr->log_size;
- log_len = 0;
+ log->level = attr->log_level;
+ log->ubuf = (char __user *) (unsigned long) attr->log_buf;
+ log->len_total = attr->log_size;
ret = -EINVAL;
- /* log_* values have to be sane */
- if (log_size < 128 || log_size > UINT_MAX >> 8 ||
- log_level == 0 || log_ubuf == NULL)
- goto err_unlock;
-
- ret = -ENOMEM;
- log_buf = vmalloc(log_size);
- if (!log_buf)
+ /* log attributes have to be sane */
+ if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 ||
+ !log->level || !log->ubuf)
goto err_unlock;
- } else {
- log_level = 0;
}
env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
env->strict_alignment = true;
+ if (env->prog->aux->offload) {
+ ret = bpf_prog_offload_verifier_prep(env);
+ if (ret)
+ goto err_unlock;
+ }
+
ret = replace_map_fd_with_map_ptr(env);
if (ret < 0)
goto skip_full_check;
@@ -4373,29 +4572,30 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
ret = do_check(env);
+ if (env->cur_state) {
+ free_verifier_state(env->cur_state, true);
+ env->cur_state = NULL;
+ }
skip_full_check:
- while (pop_stack(env, NULL) >= 0);
+ while (!pop_stack(env, NULL, NULL));
free_states(env);
if (ret == 0)
+ sanitize_dead_code(env);
+
+ if (ret == 0)
/* program is valid, convert *(u32*)(ctx + off) accesses */
ret = convert_ctx_accesses(env);
if (ret == 0)
ret = fixup_bpf_calls(env);
- if (log_level && log_len >= log_size - 1) {
- BUG_ON(log_len >= log_size);
- /* verifier log exceeded user supplied buffer */
+ if (log->level && bpf_verifier_log_full(log))
ret = -ENOSPC;
- /* fall through to return what was recorded */
- }
-
- /* copy verifier log back to user space including trailing zero */
- if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) {
+ if (log->level && !log->ubuf) {
ret = -EFAULT;
- goto free_log_buf;
+ goto err_release_maps;
}
if (ret == 0 && env->used_map_cnt) {
@@ -4406,7 +4606,7 @@ skip_full_check:
if (!env->prog->aux->used_maps) {
ret = -ENOMEM;
- goto free_log_buf;
+ goto err_release_maps;
}
memcpy(env->prog->aux->used_maps, env->used_maps,
@@ -4419,9 +4619,7 @@ skip_full_check:
convert_pseudo_ld_imm64(env);
}
-free_log_buf:
- if (log_level)
- vfree(log_buf);
+err_release_maps:
if (!env->prog->aux->used_maps)
/* if we didn't copy map pointers into bpf_prog_info, release
* them now. Otherwise free_bpf_prog_info() will release them.
@@ -4435,58 +4633,3 @@ err_free_env:
kfree(env);
return ret;
}
-
-int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
- void *priv)
-{
- struct bpf_verifier_env *env;
- int ret;
-
- env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
- if (!env)
- return -ENOMEM;
-
- env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) *
- prog->len);
- ret = -ENOMEM;
- if (!env->insn_aux_data)
- goto err_free_env;
- env->prog = prog;
- env->analyzer_ops = ops;
- env->analyzer_priv = priv;
-
- /* grab the mutex to protect few globals used by verifier */
- mutex_lock(&bpf_verifier_lock);
-
- log_level = 0;
-
- env->strict_alignment = false;
- if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
- env->strict_alignment = true;
-
- env->explored_states = kcalloc(env->prog->len,
- sizeof(struct bpf_verifier_state_list *),
- GFP_KERNEL);
- ret = -ENOMEM;
- if (!env->explored_states)
- goto skip_full_check;
-
- ret = check_cfg(env);
- if (ret < 0)
- goto skip_full_check;
-
- env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
-
- ret = do_check(env);
-
-skip_full_check:
- while (pop_stack(env, NULL) >= 0);
- free_states(env);
-
- mutex_unlock(&bpf_verifier_lock);
- vfree(env->insn_aux_data);
-err_free_env:
- kfree(env);
- return ret;
-}
-EXPORT_SYMBOL_GPL(bpf_analyzer);
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index ae448f7632cc..2be89a003185 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
-obj-y := cgroup.o namespace.o cgroup-v1.o
+obj-y := cgroup.o stat.o namespace.o cgroup-v1.o
obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
obj-$(CONFIG_CGROUP_PIDS) += pids.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index bf54ade001be..b928b27050c6 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -201,6 +201,15 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
int cgroup_task_count(const struct cgroup *cgrp);
/*
+ * stat.c
+ */
+void cgroup_stat_flush(struct cgroup *cgrp);
+int cgroup_stat_init(struct cgroup *cgrp);
+void cgroup_stat_exit(struct cgroup *cgrp);
+void cgroup_stat_show_cputime(struct seq_file *seq);
+void cgroup_stat_boot(void);
+
+/*
* namespace.c
*/
extern const struct proc_ns_operations cgroupns_operations;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 44857278eb8a..0b1ffe147f24 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -142,12 +142,14 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
};
#undef SUBSYS
+static DEFINE_PER_CPU(struct cgroup_cpu_stat, cgrp_dfl_root_cpu_stat);
+
/*
* The default hierarchy, reserved for the subsystems that are otherwise
* unattached - it never has more than a single cgroup, and all tasks are
* part of that cgroup.
*/
-struct cgroup_root cgrp_dfl_root;
+struct cgroup_root cgrp_dfl_root = { .cgrp.cpu_stat = &cgrp_dfl_root_cpu_stat };
EXPORT_SYMBOL_GPL(cgrp_dfl_root);
/*
@@ -462,6 +464,28 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
}
/**
+ * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest
+ *
+ * Find and get @cgrp's css associated with @ss. If the css doesn't exist
+ * or is offline, %NULL is returned.
+ */
+static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ struct cgroup_subsys_state *css;
+
+ rcu_read_lock();
+ css = cgroup_css(cgrp, ss);
+ if (!css || !css_tryget_online(css))
+ css = NULL;
+ rcu_read_unlock();
+
+ return css;
+}
+
+/**
* cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest (%NULL returns @cgrp->self)
@@ -647,6 +671,14 @@ struct css_set init_css_set = {
.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
.mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
+
+ /*
+ * The following field is re-initialized when this cset gets linked
+ * in cgroup_init(). However, let's initialize the field
+ * statically too so that the default cgroup can be accessed safely
+ * early during boot.
+ */
+ .dfl_cgrp = &cgrp_dfl_root.cgrp,
};
static int css_set_count = 1; /* 1 for init_css_set */
@@ -1896,6 +1928,9 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
if (ret)
goto destroy_root;
+ ret = cgroup_bpf_inherit(root_cgrp);
+ WARN_ON_ONCE(ret);
+
trace_cgroup_setup_root(root);
/*
@@ -3312,6 +3347,37 @@ static int cgroup_stat_show(struct seq_file *seq, void *v)
return 0;
}
+static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
+ struct cgroup *cgrp, int ssid)
+{
+ struct cgroup_subsys *ss = cgroup_subsys[ssid];
+ struct cgroup_subsys_state *css;
+ int ret;
+
+ if (!ss->css_extra_stat_show)
+ return 0;
+
+ css = cgroup_tryget_css(cgrp, ss);
+ if (!css)
+ return 0;
+
+ ret = ss->css_extra_stat_show(seq, css);
+ css_put(css);
+ return ret;
+}
+
+static int cpu_stat_show(struct seq_file *seq, void *v)
+{
+ struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
+ int ret = 0;
+
+ cgroup_stat_show_cputime(seq);
+#ifdef CONFIG_CGROUP_SCHED
+ ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
+#endif
+ return ret;
+}
+
static int cgroup_file_open(struct kernfs_open_file *of)
{
struct cftype *cft = of->kn->priv;
@@ -4419,6 +4485,11 @@ static struct cftype cgroup_base_files[] = {
.name = "cgroup.stat",
.seq_show = cgroup_stat_show,
},
+ {
+ .name = "cpu.stat",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cpu_stat_show,
+ },
{ } /* terminate */
};
@@ -4479,6 +4550,8 @@ static void css_free_work_fn(struct work_struct *work)
*/
cgroup_put(cgroup_parent(cgrp));
kernfs_put(cgrp->kn);
+ if (cgroup_on_dfl(cgrp))
+ cgroup_stat_exit(cgrp);
kfree(cgrp);
} else {
/*
@@ -4523,6 +4596,9 @@ static void css_release_work_fn(struct work_struct *work)
/* cgroup release path */
trace_cgroup_release(cgrp);
+ if (cgroup_on_dfl(cgrp))
+ cgroup_stat_flush(cgrp);
+
for (tcgrp = cgroup_parent(cgrp); tcgrp;
tcgrp = cgroup_parent(tcgrp))
tcgrp->nr_dying_descendants--;
@@ -4706,6 +4782,12 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
if (ret)
goto out_free_cgrp;
+ if (cgroup_on_dfl(parent)) {
+ ret = cgroup_stat_init(cgrp);
+ if (ret)
+ goto out_cancel_ref;
+ }
+
/*
* Temporarily set the pointer to NULL, so idr_find() won't return
* a half-baked cgroup.
@@ -4713,7 +4795,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
if (cgrp->id < 0) {
ret = -ENOMEM;
- goto out_cancel_ref;
+ goto out_stat_exit;
}
init_cgroup_housekeeping(cgrp);
@@ -4721,6 +4803,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
cgrp->self.parent = &parent->self;
cgrp->root = root;
cgrp->level = level;
+ ret = cgroup_bpf_inherit(cgrp);
+ if (ret)
+ goto out_idr_free;
for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
@@ -4755,13 +4840,15 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
if (!cgroup_on_dfl(cgrp))
cgrp->subtree_control = cgroup_control(cgrp);
- if (parent)
- cgroup_bpf_inherit(cgrp, parent);
-
cgroup_propagate_control(cgrp);
return cgrp;
+out_idr_free:
+ cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
+out_stat_exit:
+ if (cgroup_on_dfl(parent))
+ cgroup_stat_exit(cgrp);
out_cancel_ref:
percpu_ref_exit(&cgrp->self.refcnt);
out_free_cgrp:
@@ -5156,6 +5243,8 @@ int __init cgroup_init(void)
BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
+ cgroup_stat_boot();
+
/*
* The latency of the synchronize_sched() is too high for cgroups,
* avoid it at the cost of forcing all readers into the slow path.
@@ -5744,15 +5833,103 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
#endif /* CONFIG_SOCK_CGROUP_DATA */
#ifdef CONFIG_CGROUP_BPF
-int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
- enum bpf_attach_type type, bool overridable)
+int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+ enum bpf_attach_type type, u32 flags)
{
- struct cgroup *parent = cgroup_parent(cgrp);
int ret;
mutex_lock(&cgroup_mutex);
- ret = __cgroup_bpf_update(cgrp, parent, prog, type, overridable);
+ ret = __cgroup_bpf_attach(cgrp, prog, type, flags);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+ enum bpf_attach_type type, u32 flags)
+{
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+ ret = __cgroup_bpf_detach(cgrp, prog, type, flags);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+ ret = __cgroup_bpf_query(cgrp, attr, uattr);
mutex_unlock(&cgroup_mutex);
return ret;
}
#endif /* CONFIG_CGROUP_BPF */
+
+#ifdef CONFIG_SYSFS
+static ssize_t show_delegatable_files(struct cftype *files, char *buf,
+ ssize_t size, const char *prefix)
+{
+ struct cftype *cft;
+ ssize_t ret = 0;
+
+ for (cft = files; cft && cft->name[0] != '\0'; cft++) {
+ if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
+ continue;
+
+ if (prefix)
+ ret += snprintf(buf + ret, size - ret, "%s.", prefix);
+
+ ret += snprintf(buf + ret, size - ret, "%s\n", cft->name);
+
+ if (unlikely(ret >= size)) {
+ WARN_ON(1);
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct cgroup_subsys *ss;
+ int ssid;
+ ssize_t ret = 0;
+
+ ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret,
+ NULL);
+
+ for_each_subsys(ss, ssid)
+ ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
+ PAGE_SIZE - ret,
+ cgroup_subsys_name[ssid]);
+
+ return ret;
+}
+static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);
+
+static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "nsdelegate\n");
+}
+static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
+
+static struct attribute *cgroup_sysfs_attrs[] = {
+ &cgroup_delegate_attr.attr,
+ &cgroup_features_attr.attr,
+ NULL,
+};
+
+static const struct attribute_group cgroup_sysfs_attr_group = {
+ .attrs = cgroup_sysfs_attrs,
+ .name = "cgroup",
+};
+
+static int __init cgroup_sysfs_init(void)
+{
+ return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group);
+}
+subsys_initcall(cgroup_sysfs_init);
+#endif /* CONFIG_SYSFS */
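Once cgroup_sysfs_init() has run, the attribute group above appears as /sys/kernel/cgroup/delegate and /sys/kernel/cgroup/features. Illustrative contents only: the delegate list depends on which cftypes carry CFTYPE_NS_DELEGATABLE in a given build, while the features line is fixed by features_show() above.

$ cat /sys/kernel/cgroup/delegate
cgroup.procs
cgroup.subtree_control
$ cat /sys/kernel/cgroup/features
nsdelegate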
diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c
new file mode 100644
index 000000000000..133b465691d6
--- /dev/null
+++ b/kernel/cgroup/stat.c
@@ -0,0 +1,334 @@
+#include "cgroup-internal.h"
+
+#include <linux/sched/cputime.h>
+
+static DEFINE_MUTEX(cgroup_stat_mutex);
+static DEFINE_PER_CPU(raw_spinlock_t, cgroup_cpu_stat_lock);
+
+static struct cgroup_cpu_stat *cgroup_cpu_stat(struct cgroup *cgrp, int cpu)
+{
+ return per_cpu_ptr(cgrp->cpu_stat, cpu);
+}
+
+/**
+ * cgroup_cpu_stat_updated - keep track of updated cpu_stat
+ * @cgrp: target cgroup
+ * @cpu: cpu on which cpu_stat was updated
+ *
+ * @cgrp's cpu_stat on @cpu was updated. Put it on the parent's matching
+ * cpu_stat->updated_children list. See the comment on top of
+ * cgroup_cpu_stat definition for details.
+ */
+static void cgroup_cpu_stat_updated(struct cgroup *cgrp, int cpu)
+{
+ raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
+ struct cgroup *parent;
+ unsigned long flags;
+
+ /*
+ * Speculative already-on-list test. This may race leading to
+ * temporary inaccuracies, which is fine.
+ *
+ * Because @parent's updated_children is terminated with @parent
+ * instead of NULL, we can tell whether @cgrp is on the list by
+ * testing the next pointer for NULL.
+ */
+ if (cgroup_cpu_stat(cgrp, cpu)->updated_next)
+ return;
+
+ raw_spin_lock_irqsave(cpu_lock, flags);
+
+ /* put @cgrp and all ancestors on the corresponding updated lists */
+ for (parent = cgroup_parent(cgrp); parent;
+ cgrp = parent, parent = cgroup_parent(cgrp)) {
+ struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+ struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
+
+ /*
+ * Both additions and removals are bottom-up. If a cgroup
+ * is already in the tree, all ancestors are.
+ */
+ if (cstat->updated_next)
+ break;
+
+ cstat->updated_next = pcstat->updated_children;
+ pcstat->updated_children = cgrp;
+ }
+
+ raw_spin_unlock_irqrestore(cpu_lock, flags);
+}
+
+/**
+ * cgroup_cpu_stat_pop_updated - iterate and dismantle cpu_stat updated tree
+ * @pos: current position
+ * @root: root of the tree to traverse
+ * @cpu: target cpu
+ *
+ * Walks the updated cpu_stat tree on @cpu from @root. %NULL @pos starts
+ * the traversal and %NULL return indicates the end. During traversal,
+ * each returned cgroup is unlinked from the tree. Must be called with the
+ * matching cgroup_cpu_stat_lock held.
+ *
+ * The only ordering guarantee is that, for a parent and a child pair
+ * covered by a given traversal, if a child is visited, its parent is
+ * guaranteed to be visited afterwards.
+ */
+static struct cgroup *cgroup_cpu_stat_pop_updated(struct cgroup *pos,
+ struct cgroup *root, int cpu)
+{
+ struct cgroup_cpu_stat *cstat;
+ struct cgroup *parent;
+
+ if (pos == root)
+ return NULL;
+
+ /*
+ * We're gonna walk down to the first leaf and visit/remove it. We
+ * can pick whatever unvisited node as the starting point.
+ */
+ if (!pos)
+ pos = root;
+ else
+ pos = cgroup_parent(pos);
+
+ /* walk down to the first leaf */
+ while (true) {
+ cstat = cgroup_cpu_stat(pos, cpu);
+ if (cstat->updated_children == pos)
+ break;
+ pos = cstat->updated_children;
+ }
+
+ /*
+ * Unlink @pos from the tree. As the updated_children list is
+ * singly linked, we have to walk it to find the removal point.
+ * However, due to the way we traverse, @pos will be the first
+ * child in most cases. The only exception is @root.
+ */
+ parent = cgroup_parent(pos);
+ if (parent && cstat->updated_next) {
+ struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
+ struct cgroup_cpu_stat *ncstat;
+ struct cgroup **nextp;
+
+ nextp = &pcstat->updated_children;
+ while (true) {
+ ncstat = cgroup_cpu_stat(*nextp, cpu);
+ if (*nextp == pos)
+ break;
+
+ WARN_ON_ONCE(*nextp == parent);
+ nextp = &ncstat->updated_next;
+ }
+
+ *nextp = cstat->updated_next;
+ cstat->updated_next = NULL;
+ }
+
+ return pos;
+}
+
+static void cgroup_stat_accumulate(struct cgroup_stat *dst_stat,
+ struct cgroup_stat *src_stat)
+{
+ dst_stat->cputime.utime += src_stat->cputime.utime;
+ dst_stat->cputime.stime += src_stat->cputime.stime;
+ dst_stat->cputime.sum_exec_runtime += src_stat->cputime.sum_exec_runtime;
+}
+
+static void cgroup_cpu_stat_flush_one(struct cgroup *cgrp, int cpu)
+{
+ struct cgroup *parent = cgroup_parent(cgrp);
+ struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+ struct task_cputime *last_cputime = &cstat->last_cputime;
+ struct task_cputime cputime;
+ struct cgroup_stat delta;
+ unsigned seq;
+
+ lockdep_assert_held(&cgroup_stat_mutex);
+
+ /* fetch the current per-cpu values */
+ do {
+ seq = __u64_stats_fetch_begin(&cstat->sync);
+ cputime = cstat->cputime;
+ } while (__u64_stats_fetch_retry(&cstat->sync, seq));
+
+ /* accumulate the deltas to propagate */
+ delta.cputime.utime = cputime.utime - last_cputime->utime;
+ delta.cputime.stime = cputime.stime - last_cputime->stime;
+ delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime -
+ last_cputime->sum_exec_runtime;
+ *last_cputime = cputime;
+
+ /* transfer the pending stat into delta */
+ cgroup_stat_accumulate(&delta, &cgrp->pending_stat);
+ memset(&cgrp->pending_stat, 0, sizeof(cgrp->pending_stat));
+
+ /* propagate delta into the global stat and the parent's pending */
+ cgroup_stat_accumulate(&cgrp->stat, &delta);
+ if (parent)
+ cgroup_stat_accumulate(&parent->pending_stat, &delta);
+}
+
+/* see cgroup_stat_flush() */
+static void cgroup_stat_flush_locked(struct cgroup *cgrp)
+{
+ int cpu;
+
+ lockdep_assert_held(&cgroup_stat_mutex);
+
+ for_each_possible_cpu(cpu) {
+ raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
+ struct cgroup *pos = NULL;
+
+ raw_spin_lock_irq(cpu_lock);
+ while ((pos = cgroup_cpu_stat_pop_updated(pos, cgrp, cpu)))
+ cgroup_cpu_stat_flush_one(pos, cpu);
+ raw_spin_unlock_irq(cpu_lock);
+ }
+}
+
+/**
+ * cgroup_stat_flush - flush stats in @cgrp's subtree
+ * @cgrp: target cgroup
+ *
+ * Collect all per-cpu stats in @cgrp's subtree into the global counters
+ * and propagate them upwards. After this function returns, all cgroups in
+ * the subtree have up-to-date ->stat.
+ *
+ * This also gets all cgroups in the subtree including @cgrp off the
+ * ->updated_children lists.
+ */
+void cgroup_stat_flush(struct cgroup *cgrp)
+{
+ mutex_lock(&cgroup_stat_mutex);
+ cgroup_stat_flush_locked(cgrp);
+ mutex_unlock(&cgroup_stat_mutex);
+}
+
+static struct cgroup_cpu_stat *cgroup_cpu_stat_account_begin(struct cgroup *cgrp)
+{
+ struct cgroup_cpu_stat *cstat;
+
+ cstat = get_cpu_ptr(cgrp->cpu_stat);
+ u64_stats_update_begin(&cstat->sync);
+ return cstat;
+}
+
+static void cgroup_cpu_stat_account_end(struct cgroup *cgrp,
+ struct cgroup_cpu_stat *cstat)
+{
+ u64_stats_update_end(&cstat->sync);
+ cgroup_cpu_stat_updated(cgrp, smp_processor_id());
+ put_cpu_ptr(cstat);
+}
+
+void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
+{
+ struct cgroup_cpu_stat *cstat;
+
+ cstat = cgroup_cpu_stat_account_begin(cgrp);
+ cstat->cputime.sum_exec_runtime += delta_exec;
+ cgroup_cpu_stat_account_end(cgrp, cstat);
+}
+
+void __cgroup_account_cputime_field(struct cgroup *cgrp,
+ enum cpu_usage_stat index, u64 delta_exec)
+{
+ struct cgroup_cpu_stat *cstat;
+
+ cstat = cgroup_cpu_stat_account_begin(cgrp);
+
+ switch (index) {
+ case CPUTIME_USER:
+ case CPUTIME_NICE:
+ cstat->cputime.utime += delta_exec;
+ break;
+ case CPUTIME_SYSTEM:
+ case CPUTIME_IRQ:
+ case CPUTIME_SOFTIRQ:
+ cstat->cputime.stime += delta_exec;
+ break;
+ default:
+ break;
+ }
+
+ cgroup_cpu_stat_account_end(cgrp, cstat);
+}
+
+void cgroup_stat_show_cputime(struct seq_file *seq)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ u64 usage, utime, stime;
+
+ if (!cgroup_parent(cgrp))
+ return;
+
+ mutex_lock(&cgroup_stat_mutex);
+
+ cgroup_stat_flush_locked(cgrp);
+
+ usage = cgrp->stat.cputime.sum_exec_runtime;
+ cputime_adjust(&cgrp->stat.cputime, &cgrp->stat.prev_cputime,
+ &utime, &stime);
+
+ mutex_unlock(&cgroup_stat_mutex);
+
+ do_div(usage, NSEC_PER_USEC);
+ do_div(utime, NSEC_PER_USEC);
+ do_div(stime, NSEC_PER_USEC);
+
+ seq_printf(seq, "usage_usec %llu\n"
+ "user_usec %llu\n"
+ "system_usec %llu\n",
+ usage, utime, stime);
+}
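The resulting cpu.stat file on a non-root cgroup therefore carries three microsecond counters in the order printed above; the values below are purely illustrative:

usage_usec 10529
user_usec 7042
system_usec 3487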
+
+int cgroup_stat_init(struct cgroup *cgrp)
+{
+ int cpu;
+
+ /* the root cgrp has cpu_stat preallocated */
+ if (!cgrp->cpu_stat) {
+ cgrp->cpu_stat = alloc_percpu(struct cgroup_cpu_stat);
+ if (!cgrp->cpu_stat)
+ return -ENOMEM;
+ }
+
+ /* ->updated_children list is self terminated */
+ for_each_possible_cpu(cpu)
+ cgroup_cpu_stat(cgrp, cpu)->updated_children = cgrp;
+
+ prev_cputime_init(&cgrp->stat.prev_cputime);
+
+ return 0;
+}
+
+void cgroup_stat_exit(struct cgroup *cgrp)
+{
+ int cpu;
+
+ cgroup_stat_flush(cgrp);
+
+ /* sanity check */
+ for_each_possible_cpu(cpu) {
+ struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+
+ if (WARN_ON_ONCE(cstat->updated_children != cgrp) ||
+ WARN_ON_ONCE(cstat->updated_next))
+ return;
+ }
+
+ free_percpu(cgrp->cpu_stat);
+ cgrp->cpu_stat = NULL;
+}
+
+void __init cgroup_stat_boot(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ raw_spin_lock_init(per_cpu_ptr(&cgroup_cpu_stat_lock, cpu));
+
+ BUG_ON(cgroup_stat_init(&cgrp_dfl_root.cgrp));
+}
diff --git a/kernel/compat.c b/kernel/compat.c
index 772e038d04d9..d1cee656a7ed 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -367,24 +367,6 @@ COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len,
return ret;
}
-int get_compat_itimerspec(struct itimerspec *dst,
- const struct compat_itimerspec __user *src)
-{
- if (__compat_get_timespec(&dst->it_interval, &src->it_interval) ||
- __compat_get_timespec(&dst->it_value, &src->it_value))
- return -EFAULT;
- return 0;
-}
-
-int put_compat_itimerspec(struct compat_itimerspec __user *dst,
- const struct itimerspec *src)
-{
- if (__compat_put_timespec(&src->it_interval, &dst->it_interval) ||
- __compat_put_timespec(&src->it_value, &dst->it_value))
- return -EFAULT;
- return 0;
-}
-
int get_compat_itimerspec64(struct itimerspec64 *its,
const struct compat_itimerspec __user *uits)
{
@@ -485,27 +467,44 @@ Efault:
return -EFAULT;
}
-void
-sigset_from_compat(sigset_t *set, const compat_sigset_t *compat)
+int
+get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat)
{
+#ifdef __BIG_ENDIAN
+ compat_sigset_t v;
+ if (copy_from_user(&v, compat, sizeof(compat_sigset_t)))
+ return -EFAULT;
switch (_NSIG_WORDS) {
- case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 );
- case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 );
- case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 );
- case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 );
+ case 4: set->sig[3] = v.sig[6] | (((long)v.sig[7]) << 32 );
+ case 3: set->sig[2] = v.sig[4] | (((long)v.sig[5]) << 32 );
+ case 2: set->sig[1] = v.sig[2] | (((long)v.sig[3]) << 32 );
+ case 1: set->sig[0] = v.sig[0] | (((long)v.sig[1]) << 32 );
}
+#else
+ if (copy_from_user(set, compat, sizeof(compat_sigset_t)))
+ return -EFAULT;
+#endif
+ return 0;
}
-EXPORT_SYMBOL_GPL(sigset_from_compat);
+EXPORT_SYMBOL_GPL(get_compat_sigset);
-void
-sigset_to_compat(compat_sigset_t *compat, const sigset_t *set)
+int
+put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set,
+ unsigned int size)
{
+ /* size <= sizeof(compat_sigset_t) <= sizeof(sigset_t) */
+#ifdef __BIG_ENDIAN
+ compat_sigset_t v;
switch (_NSIG_WORDS) {
- case 4: compat->sig[7] = (set->sig[3] >> 32); compat->sig[6] = set->sig[3];
- case 3: compat->sig[5] = (set->sig[2] >> 32); compat->sig[4] = set->sig[2];
- case 2: compat->sig[3] = (set->sig[1] >> 32); compat->sig[2] = set->sig[1];
- case 1: compat->sig[1] = (set->sig[0] >> 32); compat->sig[0] = set->sig[0];
+ case 4: v.sig[7] = (set->sig[3] >> 32); v.sig[6] = set->sig[3];
+ case 3: v.sig[5] = (set->sig[2] >> 32); v.sig[4] = set->sig[2];
+ case 2: v.sig[3] = (set->sig[1] >> 32); v.sig[2] = set->sig[1];
+ case 1: v.sig[1] = (set->sig[0] >> 32); v.sig[0] = set->sig[0];
}
+ return copy_to_user(compat, &v, size) ? -EFAULT : 0;
+#else
+ return copy_to_user(compat, set, size) ? -EFAULT : 0;
+#endif
}
#ifdef CONFIG_NUMA
@@ -563,22 +562,6 @@ COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
}
#endif
-COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
- compat_pid_t, pid,
- struct compat_timespec __user *, interval)
-{
- struct timespec t;
- int ret;
- mm_segment_t old_fs = get_fs();
-
- set_fs(KERNEL_DS);
- ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t);
- set_fs(old_fs);
- if (compat_put_timespec(&t, interval))
- return -EFAULT;
- return ret;
-}
-
/*
* Allocate user-space memory for the duration of a single system call,
* in order to marshall parameters inside a compat thunk.
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 6db80fc0810b..b3663896278e 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -108,7 +108,8 @@ static int __init parse_crashkernel_mem(char *cmdline,
return -EINVAL;
}
}
- }
+ } else
+ pr_info("crashkernel size resulted in zero bytes\n");
return 0;
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4c39c05e029a..9404c631bd3f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3601,7 +3601,6 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
goto out;
}
-
/*
* If the event is currently on this CPU, its either a per-task event,
* or local to this CPU. Furthermore it means its ACTIVE (otherwise
@@ -7867,25 +7866,24 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
struct pt_regs *regs, struct hlist_head *head,
struct task_struct *task)
{
- struct bpf_prog *prog = call->prog;
-
- if (prog) {
+ if (bpf_prog_array_valid(call)) {
*(struct pt_regs **)raw_data = regs;
- if (!trace_call_bpf(prog, raw_data) || hlist_empty(head)) {
+ if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) {
perf_swevent_put_recursion_context(rctx);
return;
}
}
perf_tp_event(call->event.type, count, raw_data, size, regs, head,
- rctx, task, NULL);
+ rctx, task);
}
EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
struct pt_regs *regs, struct hlist_head *head, int rctx,
- struct task_struct *task, struct perf_event *event)
+ struct task_struct *task)
{
struct perf_sample_data data;
+ struct perf_event *event;
struct perf_raw_record raw = {
.frag = {
@@ -7899,15 +7897,9 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
perf_trace_buf_update(record, event_type);
- /* Use the given event instead of the hlist */
- if (event) {
+ hlist_for_each_entry_rcu(event, head, hlist_entry) {
if (perf_tp_event_match(event, &data, regs))
perf_swevent_event(event, count, &data, regs);
- } else {
- hlist_for_each_entry_rcu(event, head, hlist_entry) {
- if (perf_tp_event_match(event, &data, regs))
- perf_swevent_event(event, count, &data, regs);
- }
}
/*
@@ -8060,13 +8052,11 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
{
bool is_kprobe, is_tracepoint, is_syscall_tp;
struct bpf_prog *prog;
+ int ret;
if (event->attr.type != PERF_TYPE_TRACEPOINT)
return perf_event_set_bpf_handler(event, prog_fd);
- if (event->tp_event->prog)
- return -EEXIST;
-
is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
is_syscall_tp = is_syscall_trace_event(event->tp_event);
@@ -8094,26 +8084,20 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
return -EACCES;
}
}
- event->tp_event->prog = prog;
- event->tp_event->bpf_prog_owner = event;
- return 0;
+ ret = perf_event_attach_bpf_prog(event, prog);
+ if (ret)
+ bpf_prog_put(prog);
+ return ret;
}
static void perf_event_free_bpf_prog(struct perf_event *event)
{
- struct bpf_prog *prog;
-
- perf_event_free_bpf_handler(event);
-
- if (!event->tp_event)
+ if (event->attr.type != PERF_TYPE_TRACEPOINT) {
+ perf_event_free_bpf_handler(event);
return;
-
- prog = event->tp_event->prog;
- if (prog && event->tp_event->bpf_prog_owner == event) {
- event->tp_event->prog = NULL;
- bpf_prog_put(prog);
}
+ perf_event_detach_bpf_prog(event);
}
#else
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index f3e37971c842..141aa2ca8728 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -411,6 +411,7 @@ err:
return NULL;
}
+EXPORT_SYMBOL_GPL(perf_aux_output_begin);
static bool __always_inline rb_need_aux_wakeup(struct ring_buffer *rb)
{
@@ -480,6 +481,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
rb_free_aux(rb);
ring_buffer_put(rb);
}
+EXPORT_SYMBOL_GPL(perf_aux_output_end);
/*
* Skip over a given number of bytes in the AUX buffer, due to, for example,
@@ -505,6 +507,7 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
return 0;
}
+EXPORT_SYMBOL_GPL(perf_aux_output_skip);
void *perf_get_aux(struct perf_output_handle *handle)
{
@@ -514,6 +517,7 @@ void *perf_get_aux(struct perf_output_handle *handle)
return handle->rb->aux_priv;
}
+EXPORT_SYMBOL_GPL(perf_get_aux);
#define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
diff --git a/kernel/fork.c b/kernel/fork.c
index 07cc743698d3..432eadf6b58c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -469,7 +469,7 @@ void __init fork_init(void)
/* create a slab on which task_structs can be allocated */
task_struct_cachep = kmem_cache_create("task_struct",
arch_task_struct_size, align,
- SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL);
+ SLAB_PANIC|SLAB_ACCOUNT, NULL);
#endif
/* do the arch specific task caches init */
@@ -817,8 +817,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
init_rwsem(&mm->mmap_sem);
INIT_LIST_HEAD(&mm->mmlist);
mm->core_state = NULL;
- atomic_long_set(&mm->nr_ptes, 0);
- mm_nr_pmds_init(mm);
+ mm_pgtables_bytes_init(mm);
mm->map_count = 0;
mm->locked_vm = 0;
mm->pinned_vm = 0;
@@ -872,12 +871,9 @@ static void check_mm(struct mm_struct *mm)
"mm:%p idx:%d val:%ld\n", mm, i, x);
}
- if (atomic_long_read(&mm->nr_ptes))
- pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n",
- atomic_long_read(&mm->nr_ptes));
- if (mm_nr_pmds(mm))
- pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n",
- mm_nr_pmds(mm));
+ if (mm_pgtables_bytes(mm))
+ pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
+ mm_pgtables_bytes(mm));
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
@@ -1875,7 +1871,7 @@ static __latent_entropy struct task_struct *copy_process(
retval = -ERESTARTNOINTR;
goto bad_fork_cancel_cgroup;
}
- if (unlikely(!(ns_of_pid(pid)->nr_hashed & PIDNS_HASH_ADDING))) {
+ if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
retval = -ENOMEM;
goto bad_fork_cancel_cgroup;
}
@@ -2209,18 +2205,18 @@ void __init proc_caches_init(void)
sighand_cachep = kmem_cache_create("sighand_cache",
sizeof(struct sighand_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
- SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor);
+ SLAB_ACCOUNT, sighand_ctor);
signal_cachep = kmem_cache_create("signal_cache",
sizeof(struct signal_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
NULL);
files_cachep = kmem_cache_create("files_cache",
sizeof(struct files_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
NULL);
fs_cachep = kmem_cache_create("fs_cache",
sizeof(struct fs_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
NULL);
/*
* FIXME! The "sizeof(struct mm_struct)" currently includes the
@@ -2231,7 +2227,7 @@ void __init proc_caches_init(void)
*/
mm_cachep = kmem_cache_create("mm_struct",
sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
NULL);
vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
mmap_init();
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index f2edcf85780d..49b54e9979cc 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -862,6 +862,7 @@ int irq_get_percpu_devid_partition(unsigned int irq, struct cpumask *affinity)
return 0;
}
+EXPORT_SYMBOL_GPL(irq_get_percpu_devid_partition);
void kstat_incr_irq_this_cpu(unsigned int irq)
{
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 1215229d1c12..ef2a47e0eab6 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -20,7 +20,7 @@
static int irqfixup __read_mostly;
#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
-static void poll_spurious_irqs(unsigned long dummy);
+static void poll_spurious_irqs(struct timer_list *unused);
static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs);
static int irq_poll_cpu;
static atomic_t irq_poll_active;
@@ -143,7 +143,7 @@ out:
return ok;
}
-static void poll_spurious_irqs(unsigned long dummy)
+static void poll_spurious_irqs(struct timer_list *unused)
{
struct irq_desc *desc;
int i;
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 1e6ae66c6244..531ffa984bc2 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -24,6 +24,7 @@
#include <linux/ctype.h>
#include <linux/slab.h>
#include <linux/filter.h>
+#include <linux/ftrace.h>
#include <linux/compiler.h>
#include <asm/sections.h>
@@ -337,6 +338,10 @@ const char *kallsyms_lookup(unsigned long addr,
if (!ret)
ret = bpf_address_lookup(addr, symbolsize,
offset, modname, namebuf);
+
+ if (!ret)
+ ret = ftrace_mod_address_lookup(addr, symbolsize,
+ offset, modname, namebuf);
return ret;
}
@@ -474,6 +479,7 @@ EXPORT_SYMBOL(__print_symbol);
struct kallsym_iter {
loff_t pos;
loff_t pos_mod_end;
+ loff_t pos_ftrace_mod_end;
unsigned long value;
unsigned int nameoff; /* If iterating in core kernel symbols. */
char type;
@@ -497,11 +503,25 @@ static int get_ksymbol_mod(struct kallsym_iter *iter)
return 1;
}
+static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter)
+{
+ int ret = ftrace_mod_get_kallsym(iter->pos - iter->pos_mod_end,
+ &iter->value, &iter->type,
+ iter->name, iter->module_name,
+ &iter->exported);
+ if (ret < 0) {
+ iter->pos_ftrace_mod_end = iter->pos;
+ return 0;
+ }
+
+ return 1;
+}
+
static int get_ksymbol_bpf(struct kallsym_iter *iter)
{
iter->module_name[0] = '\0';
iter->exported = 0;
- return bpf_get_kallsym(iter->pos - iter->pos_mod_end,
+ return bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end,
&iter->value, &iter->type,
iter->name) < 0 ? 0 : 1;
}
@@ -526,20 +546,31 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos)
iter->name[0] = '\0';
iter->nameoff = get_symbol_offset(new_pos);
iter->pos = new_pos;
- if (new_pos == 0)
+ if (new_pos == 0) {
iter->pos_mod_end = 0;
+ iter->pos_ftrace_mod_end = 0;
+ }
}
static int update_iter_mod(struct kallsym_iter *iter, loff_t pos)
{
iter->pos = pos;
- if (iter->pos_mod_end > 0 &&
- iter->pos_mod_end < iter->pos)
+ if (iter->pos_ftrace_mod_end > 0 &&
+ iter->pos_ftrace_mod_end < iter->pos)
return get_ksymbol_bpf(iter);
- if (!get_ksymbol_mod(iter))
- return get_ksymbol_bpf(iter);
+ if (iter->pos_mod_end > 0 &&
+ iter->pos_mod_end < iter->pos) {
+ if (!get_ksymbol_ftrace_mod(iter))
+ return get_ksymbol_bpf(iter);
+ return 1;
+ }
+
+ if (!get_ksymbol_mod(iter)) {
+ if (!get_ksymbol_ftrace_mod(iter))
+ return get_ksymbol_bpf(iter);
+ }
return 1;
}
diff --git a/kernel/kcov.c b/kernel/kcov.c
index fc6af9e1308b..15f33faf4013 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -22,13 +22,21 @@
#include <linux/kcov.h>
#include <asm/setup.h>
+/* Number of 64-bit words written per one comparison: */
+#define KCOV_WORDS_PER_CMP 4
+
/*
* kcov descriptor (one per opened debugfs file).
* State transitions of the descriptor:
* - initial state after open()
* - then there must be a single ioctl(KCOV_INIT_TRACE) call
* - then, mmap() call (several calls are allowed but not useful)
- * - then, repeated enable/disable for a task (only one task a time allowed)
+ * - then, ioctl(KCOV_ENABLE, arg), where arg is
+ * KCOV_TRACE_PC - to trace only the PCs
+ * or
+ * KCOV_TRACE_CMP - to trace only the comparison operands
+ * - then, ioctl(KCOV_DISABLE) to disable the task.
+ * Enabling/disabling ioctls can be repeated (only one task at a time is allowed).
*/
struct kcov {
/*
@@ -48,51 +56,176 @@ struct kcov {
struct task_struct *t;
};
-/*
- * Entry point from instrumented code.
- * This is called once per basic-block/edge.
- */
-void notrace __sanitizer_cov_trace_pc(void)
+static bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t)
{
- struct task_struct *t;
enum kcov_mode mode;
- t = current;
/*
* We are interested in code coverage as a function of a syscall inputs,
* so we ignore code executed in interrupts.
*/
- if (!t || !in_task())
- return;
+ if (!in_task())
+ return false;
mode = READ_ONCE(t->kcov_mode);
- if (mode == KCOV_MODE_TRACE) {
- unsigned long *area;
- unsigned long pos;
- unsigned long ip = _RET_IP_;
+ /*
+ * There is some code that runs in interrupts but for which
+ * in_interrupt() returns false (e.g. preempt_schedule_irq()).
+ * READ_ONCE()/barrier() effectively provides load-acquire wrt
+ * interrupts, there are paired barrier()/WRITE_ONCE() in
+ * kcov_ioctl_locked().
+ */
+ barrier();
+ return mode == needed_mode;
+}
+static unsigned long canonicalize_ip(unsigned long ip)
+{
#ifdef CONFIG_RANDOMIZE_BASE
- ip -= kaslr_offset();
+ ip -= kaslr_offset();
#endif
+ return ip;
+}
- /*
- * There is some code that runs in interrupts but for which
- * in_interrupt() returns false (e.g. preempt_schedule_irq()).
- * READ_ONCE()/barrier() effectively provides load-acquire wrt
- * interrupts, there are paired barrier()/WRITE_ONCE() in
- * kcov_ioctl_locked().
- */
- barrier();
- area = t->kcov_area;
- /* The first word is number of subsequent PCs. */
- pos = READ_ONCE(area[0]) + 1;
- if (likely(pos < t->kcov_size)) {
- area[pos] = ip;
- WRITE_ONCE(area[0], pos);
- }
+/*
+ * Entry point from instrumented code.
+ * This is called once per basic-block/edge.
+ */
+void notrace __sanitizer_cov_trace_pc(void)
+{
+ struct task_struct *t;
+ unsigned long *area;
+ unsigned long ip = canonicalize_ip(_RET_IP_);
+ unsigned long pos;
+
+ t = current;
+ if (!check_kcov_mode(KCOV_MODE_TRACE_PC, t))
+ return;
+
+ area = t->kcov_area;
+ /* The first 64-bit word is the number of subsequent PCs. */
+ pos = READ_ONCE(area[0]) + 1;
+ if (likely(pos < t->kcov_size)) {
+ area[pos] = ip;
+ WRITE_ONCE(area[0], pos);
}
}
EXPORT_SYMBOL(__sanitizer_cov_trace_pc);
+#ifdef CONFIG_KCOV_ENABLE_COMPARISONS
+static void write_comp_data(u64 type, u64 arg1, u64 arg2, u64 ip)
+{
+ struct task_struct *t;
+ u64 *area;
+ u64 count, start_index, end_pos, max_pos;
+
+ t = current;
+ if (!check_kcov_mode(KCOV_MODE_TRACE_CMP, t))
+ return;
+
+ ip = canonicalize_ip(ip);
+
+ /*
+ * We write all comparison arguments and types as u64.
+ * The buffer was allocated for t->kcov_size unsigned longs.
+ */
+ area = (u64 *)t->kcov_area;
+ max_pos = t->kcov_size * sizeof(unsigned long);
+
+ count = READ_ONCE(area[0]);
+
+ /* Every record is KCOV_WORDS_PER_CMP 64-bit words. */
+ start_index = 1 + count * KCOV_WORDS_PER_CMP;
+ end_pos = (start_index + KCOV_WORDS_PER_CMP) * sizeof(u64);
+ if (likely(end_pos <= max_pos)) {
+ area[start_index] = type;
+ area[start_index + 1] = arg1;
+ area[start_index + 2] = arg2;
+ area[start_index + 3] = ip;
+ WRITE_ONCE(area[0], count + 1);
+ }
+}
+
+void notrace __sanitizer_cov_trace_cmp1(u8 arg1, u8 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(0), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp1);
+
+void notrace __sanitizer_cov_trace_cmp2(u16 arg1, u16 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(1), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp2);
+
+void notrace __sanitizer_cov_trace_cmp4(u32 arg1, u32 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(2), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp4);
+
+void notrace __sanitizer_cov_trace_cmp8(u64 arg1, u64 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(3), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp8);
+
+void notrace __sanitizer_cov_trace_const_cmp1(u8 arg1, u8 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(0) | KCOV_CMP_CONST, arg1, arg2,
+ _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp1);
+
+void notrace __sanitizer_cov_trace_const_cmp2(u16 arg1, u16 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(1) | KCOV_CMP_CONST, arg1, arg2,
+ _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp2);
+
+void notrace __sanitizer_cov_trace_const_cmp4(u32 arg1, u32 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(2) | KCOV_CMP_CONST, arg1, arg2,
+ _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp4);
+
+void notrace __sanitizer_cov_trace_const_cmp8(u64 arg1, u64 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(3) | KCOV_CMP_CONST, arg1, arg2,
+ _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp8);
+
+void notrace __sanitizer_cov_trace_switch(u64 val, u64 *cases)
+{
+ u64 i;
+ u64 count = cases[0];
+ u64 size = cases[1];
+ u64 type = KCOV_CMP_CONST;
+
+ switch (size) {
+ case 8:
+ type |= KCOV_CMP_SIZE(0);
+ break;
+ case 16:
+ type |= KCOV_CMP_SIZE(1);
+ break;
+ case 32:
+ type |= KCOV_CMP_SIZE(2);
+ break;
+ case 64:
+ type |= KCOV_CMP_SIZE(3);
+ break;
+ default:
+ return;
+ }
+ for (i = 0; i < count; i++)
+ write_comp_data(type, cases[i + 2], val, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_switch);
+#endif /* ifdef CONFIG_KCOV_ENABLE_COMPARISONS */
+
static void kcov_get(struct kcov *kcov)
{
atomic_inc(&kcov->refcount);
@@ -129,6 +262,7 @@ void kcov_task_exit(struct task_struct *t)
/* Just to not leave dangling references behind. */
kcov_task_init(t);
kcov->t = NULL;
+ kcov->mode = KCOV_MODE_INIT;
spin_unlock(&kcov->lock);
kcov_put(kcov);
}
@@ -147,7 +281,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
spin_lock(&kcov->lock);
size = kcov->size * sizeof(unsigned long);
- if (kcov->mode == KCOV_MODE_DISABLED || vma->vm_pgoff != 0 ||
+ if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 ||
vma->vm_end - vma->vm_start != size) {
res = -EINVAL;
goto exit;
@@ -176,6 +310,7 @@ static int kcov_open(struct inode *inode, struct file *filep)
kcov = kzalloc(sizeof(*kcov), GFP_KERNEL);
if (!kcov)
return -ENOMEM;
+ kcov->mode = KCOV_MODE_DISABLED;
atomic_set(&kcov->refcount, 1);
spin_lock_init(&kcov->lock);
filep->private_data = kcov;
@@ -211,7 +346,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
if (size < 2 || size > INT_MAX / sizeof(unsigned long))
return -EINVAL;
kcov->size = size;
- kcov->mode = KCOV_MODE_TRACE;
+ kcov->mode = KCOV_MODE_INIT;
return 0;
case KCOV_ENABLE:
/*
@@ -221,17 +356,25 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
* at task exit or voluntary by KCOV_DISABLE. After that it can
* be enabled for another task.
*/
- unused = arg;
- if (unused != 0 || kcov->mode == KCOV_MODE_DISABLED ||
- kcov->area == NULL)
+ if (kcov->mode != KCOV_MODE_INIT || !kcov->area)
return -EINVAL;
if (kcov->t != NULL)
return -EBUSY;
+ if (arg == KCOV_TRACE_PC)
+ kcov->mode = KCOV_MODE_TRACE_PC;
+ else if (arg == KCOV_TRACE_CMP)
+#ifdef CONFIG_KCOV_ENABLE_COMPARISONS
+ kcov->mode = KCOV_MODE_TRACE_CMP;
+#else
+ return -ENOTSUPP;
+#endif
+ else
+ return -EINVAL;
t = current;
/* Cache in task struct for performance. */
t->kcov_size = kcov->size;
t->kcov_area = kcov->area;
- /* See comment in __sanitizer_cov_trace_pc(). */
+ /* See comment in check_kcov_mode(). */
barrier();
WRITE_ONCE(t->kcov_mode, kcov->mode);
t->kcov = kcov;
@@ -249,6 +392,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
return -EINVAL;
kcov_task_init(t);
kcov->t = NULL;
+ kcov->mode = KCOV_MODE_INIT;
kcov_put(kcov);
return 0;
default:
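
Editor's note: with KCOV_TRACE_CMP enabled, the shared buffer holds KCOV_WORDS_PER_CMP (four) 64-bit words per record after the leading counter: type, both operands, and the PC. A hedged userspace sketch of the open/init/mmap/enable sequence described in the state-transition comment above; constants come from <linux/kcov.h>, error handling is omitted, and the read() call is only a placeholder for the syscall being traced:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <linux/kcov.h>

#define COVER_SIZE (64 << 10)	/* buffer size in unsigned longs, as for KCOV_INIT_TRACE */

int main(void)
{
	int fd = open("/sys/kernel/debug/kcov", O_RDWR);

	ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE);
	uint64_t *cover = mmap(NULL, COVER_SIZE * sizeof(unsigned long),
			       PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	ioctl(fd, KCOV_ENABLE, KCOV_TRACE_CMP);
	__atomic_store_n(&cover[0], 0, __ATOMIC_RELAXED);

	read(-1, NULL, 0);	/* the traced syscall */

	uint64_t n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED);
	for (uint64_t i = 0; i < n; i++) {
		uint64_t *rec = &cover[1 + i * 4];	/* 4 == KCOV_WORDS_PER_CMP */
		printf("type %#llx: %#llx vs %#llx at %#llx\n",
		       (unsigned long long)rec[0], (unsigned long long)rec[1],
		       (unsigned long long)rec[2], (unsigned long long)rec[3]);
	}

	ioctl(fd, KCOV_DISABLE, 0);
	return 0;
}
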
diff --git a/kernel/kthread.c b/kernel/kthread.c
index ba3992c8c375..cd50e99202b0 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -20,7 +20,6 @@
#include <linux/freezer.h>
#include <linux/ptrace.h>
#include <linux/uaccess.h>
-#include <linux/cgroup.h>
#include <trace/events/sched.h>
static DEFINE_SPINLOCK(kthread_create_lock);
@@ -47,6 +46,9 @@ struct kthread {
void *data;
struct completion parked;
struct completion exited;
+#ifdef CONFIG_BLK_CGROUP
+ struct cgroup_subsys_state *blkcg_css;
+#endif
};
enum KTHREAD_BITS {
@@ -74,11 +76,17 @@ static inline struct kthread *to_kthread(struct task_struct *k)
void free_kthread_struct(struct task_struct *k)
{
+ struct kthread *kthread;
+
/*
* Can be NULL if this kthread was created by kernel_thread()
* or if kmalloc() in kthread() failed.
*/
- kfree(to_kthread(k));
+ kthread = to_kthread(k);
+#ifdef CONFIG_BLK_CGROUP
+ WARN_ON_ONCE(kthread && kthread->blkcg_css);
+#endif
+ kfree(kthread);
}
/**
@@ -196,7 +204,7 @@ static int kthread(void *_create)
struct kthread *self;
int ret;
- self = kmalloc(sizeof(*self), GFP_KERNEL);
+ self = kzalloc(sizeof(*self), GFP_KERNEL);
set_kthread_struct(self);
/* If user was SIGKILLed, I release the structure. */
@@ -212,7 +220,6 @@ static int kthread(void *_create)
do_exit(-ENOMEM);
}
- self->flags = 0;
self->data = data;
init_completion(&self->exited);
init_completion(&self->parked);
@@ -836,7 +843,7 @@ void __kthread_queue_delayed_work(struct kthread_worker *worker,
struct timer_list *timer = &dwork->timer;
struct kthread_work *work = &dwork->work;
- WARN_ON_ONCE(timer->function != (TIMER_FUNC_TYPE)kthread_delayed_work_timer_fn);
+ WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn);
/*
* If @delay is 0, queue @dwork->work immediately. This is for
@@ -1152,3 +1159,54 @@ void kthread_destroy_worker(struct kthread_worker *worker)
kfree(worker);
}
EXPORT_SYMBOL(kthread_destroy_worker);
+
+#ifdef CONFIG_BLK_CGROUP
+/**
+ * kthread_associate_blkcg - associate blkcg to current kthread
+ * @css: the cgroup info
+ *
+ * The current thread must be a kthread. The kthread runs jobs on behalf of
+ * other threads, and in some cases those jobs should carry the cgroup info
+ * of the originating thread rather than that of the kthread itself. This
+ * function stores the originating thread's cgroup info in the current
+ * kthread's context for later retrieval.
+ */
+void kthread_associate_blkcg(struct cgroup_subsys_state *css)
+{
+ struct kthread *kthread;
+
+ if (!(current->flags & PF_KTHREAD))
+ return;
+ kthread = to_kthread(current);
+ if (!kthread)
+ return;
+
+ if (kthread->blkcg_css) {
+ css_put(kthread->blkcg_css);
+ kthread->blkcg_css = NULL;
+ }
+ if (css) {
+ css_get(css);
+ kthread->blkcg_css = css;
+ }
+}
+EXPORT_SYMBOL(kthread_associate_blkcg);
+
+/**
+ * kthread_blkcg - get associated blkcg css of current kthread
+ *
+ * Current thread must be a kthread.
+ */
+struct cgroup_subsys_state *kthread_blkcg(void)
+{
+ struct kthread *kthread;
+
+ if (current->flags & PF_KTHREAD) {
+ kthread = to_kthread(current);
+ if (kthread)
+ return kthread->blkcg_css;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(kthread_blkcg);
+#endif
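
Editor's note: a hedged sketch of how the two new helpers are meant to be used together. issue_io_for() and the origin of the css are illustrative only; the intended real users are block-layer paths where a kthread issues I/O on another task's behalf:

#include <linux/kthread.h>
#include <linux/cgroup.h>

/* Runs in a kthread that does work on behalf of another task. */
static void issue_io_for(struct cgroup_subsys_state *origin_css)
{
	/* Attribute subsequent block I/O to the originating blkcg. */
	kthread_associate_blkcg(origin_css);

	/* ... submit I/O here; kthread_blkcg() now returns origin_css ... */

	/* Drop the association (and the css reference taken above). */
	kthread_associate_blkcg(NULL);
}
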
diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile
index 2b8bdb1925da..b36ceda6488e 100644
--- a/kernel/livepatch/Makefile
+++ b/kernel/livepatch/Makefile
@@ -1,3 +1,3 @@
obj-$(CONFIG_LIVEPATCH) += livepatch.o
-livepatch-objs := core.o patch.o transition.o
+livepatch-objs := core.o patch.o shadow.o transition.o
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index bf8c8fd72589..de9e45dca70f 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -54,11 +54,6 @@ static bool klp_is_module(struct klp_object *obj)
return obj->name;
}
-static bool klp_is_object_loaded(struct klp_object *obj)
-{
- return !obj->name || obj->mod;
-}
-
/* sets obj->mod if object is not vmlinux and module is found */
static void klp_find_object_module(struct klp_object *obj)
{
@@ -285,6 +280,11 @@ static int klp_write_object_relocations(struct module *pmod,
static int __klp_disable_patch(struct klp_patch *patch)
{
+ struct klp_object *obj;
+
+ if (WARN_ON(!patch->enabled))
+ return -EINVAL;
+
if (klp_transition_patch)
return -EBUSY;
@@ -295,6 +295,10 @@ static int __klp_disable_patch(struct klp_patch *patch)
klp_init_transition(patch, KLP_UNPATCHED);
+ klp_for_each_object(patch, obj)
+ if (obj->patched)
+ klp_pre_unpatch_callback(obj);
+
/*
* Enforce the order of the func->transition writes in
* klp_init_transition() and the TIF_PATCH_PENDING writes in
@@ -388,13 +392,18 @@ static int __klp_enable_patch(struct klp_patch *patch)
if (!klp_is_object_loaded(obj))
continue;
- ret = klp_patch_object(obj);
+ ret = klp_pre_patch_callback(obj);
if (ret) {
- pr_warn("failed to enable patch '%s'\n",
- patch->mod->name);
+ pr_warn("pre-patch callback failed for object '%s'\n",
+ klp_is_module(obj) ? obj->name : "vmlinux");
+ goto err;
+ }
- klp_cancel_transition();
- return ret;
+ ret = klp_patch_object(obj);
+ if (ret) {
+ pr_warn("failed to patch object '%s'\n",
+ klp_is_module(obj) ? obj->name : "vmlinux");
+ goto err;
}
}
@@ -403,6 +412,11 @@ static int __klp_enable_patch(struct klp_patch *patch)
patch->enabled = true;
return 0;
+err:
+ pr_warn("failed to enable patch '%s'\n", patch->mod->name);
+
+ klp_cancel_transition();
+ return ret;
}
/**
@@ -854,9 +868,15 @@ static void klp_cleanup_module_patches_limited(struct module *mod,
* is in transition.
*/
if (patch->enabled || patch == klp_transition_patch) {
+
+ if (patch != klp_transition_patch)
+ klp_pre_unpatch_callback(obj);
+
pr_notice("reverting patch '%s' on unloading module '%s'\n",
patch->mod->name, obj->mod->name);
klp_unpatch_object(obj);
+
+ klp_post_unpatch_callback(obj);
}
klp_free_object_loaded(obj);
@@ -906,13 +926,25 @@ int klp_module_coming(struct module *mod)
pr_notice("applying patch '%s' to loading module '%s'\n",
patch->mod->name, obj->mod->name);
+ ret = klp_pre_patch_callback(obj);
+ if (ret) {
+ pr_warn("pre-patch callback failed for object '%s'\n",
+ obj->name);
+ goto err;
+ }
+
ret = klp_patch_object(obj);
if (ret) {
pr_warn("failed to apply patch '%s' to module '%s' (%d)\n",
patch->mod->name, obj->mod->name, ret);
+
+ klp_post_unpatch_callback(obj);
goto err;
}
+ if (patch != klp_transition_patch)
+ klp_post_patch_callback(obj);
+
break;
}
}
diff --git a/kernel/livepatch/core.h b/kernel/livepatch/core.h
index a351601d7f76..48a83d4364cf 100644
--- a/kernel/livepatch/core.h
+++ b/kernel/livepatch/core.h
@@ -2,6 +2,46 @@
#ifndef _LIVEPATCH_CORE_H
#define _LIVEPATCH_CORE_H
+#include <linux/livepatch.h>
+
extern struct mutex klp_mutex;
+static inline bool klp_is_object_loaded(struct klp_object *obj)
+{
+ return !obj->name || obj->mod;
+}
+
+static inline int klp_pre_patch_callback(struct klp_object *obj)
+{
+ int ret = 0;
+
+ if (obj->callbacks.pre_patch)
+ ret = (*obj->callbacks.pre_patch)(obj);
+
+ obj->callbacks.post_unpatch_enabled = !ret;
+
+ return ret;
+}
+
+static inline void klp_post_patch_callback(struct klp_object *obj)
+{
+ if (obj->callbacks.post_patch)
+ (*obj->callbacks.post_patch)(obj);
+}
+
+static inline void klp_pre_unpatch_callback(struct klp_object *obj)
+{
+ if (obj->callbacks.pre_unpatch)
+ (*obj->callbacks.pre_unpatch)(obj);
+}
+
+static inline void klp_post_unpatch_callback(struct klp_object *obj)
+{
+ if (obj->callbacks.post_unpatch_enabled &&
+ obj->callbacks.post_unpatch)
+ (*obj->callbacks.post_unpatch)(obj);
+
+ obj->callbacks.post_unpatch_enabled = false;
+}
+
#endif /* _LIVEPATCH_CORE_H */
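
Editor's note: the helpers above invoke the optional per-object callbacks that a livepatch module can now supply. A hedged sketch of wiring them up; it assumes the klp_callbacks member added to struct klp_object by this series and the klp_register_patch()/klp_enable_patch() API as it existed at the time, and every demo_/livepatch_ name is hypothetical:

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/livepatch.h>
#include <linux/seq_file.h>

static int livepatch_cmdline_proc_show(struct seq_file *m, void *v)
{
	seq_printf(m, "%s\n", "patched cmdline");
	return 0;
}

static int demo_pre_patch(struct klp_object *obj)
{
	pr_info("pre-patch callback: %s\n", obj->name ? obj->name : "vmlinux");
	return 0;	/* a non-zero return aborts patching of this object */
}

static void demo_post_unpatch(struct klp_object *obj)
{
	pr_info("post-unpatch callback: %s\n", obj->name ? obj->name : "vmlinux");
}

static struct klp_func funcs[] = {
	{
		.old_name = "cmdline_proc_show",
		.new_func = livepatch_cmdline_proc_show,
	},
	{ }
};

static struct klp_object objs[] = {
	{
		.name = NULL,		/* NULL means vmlinux */
		.funcs = funcs,
		.callbacks = {
			.pre_patch = demo_pre_patch,
			.post_unpatch = demo_post_unpatch,
		},
	},
	{ }
};

static struct klp_patch patch = {
	.mod = THIS_MODULE,
	.objs = objs,
};

static int demo_init(void)
{
	int ret;

	ret = klp_register_patch(&patch);
	if (ret)
		return ret;
	ret = klp_enable_patch(&patch);
	if (ret) {
		WARN_ON(klp_unregister_patch(&patch));
		return ret;
	}
	return 0;
}

static void demo_exit(void)
{
	WARN_ON(klp_unregister_patch(&patch));
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
MODULE_INFO(livepatch, "Y");
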
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
index 52c4e907c14b..82d584225dc6 100644
--- a/kernel/livepatch/patch.c
+++ b/kernel/livepatch/patch.c
@@ -28,6 +28,7 @@
#include <linux/slab.h>
#include <linux/bug.h>
#include <linux/printk.h>
+#include "core.h"
#include "patch.h"
#include "transition.h"
diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c
new file mode 100644
index 000000000000..fdac27588d60
--- /dev/null
+++ b/kernel/livepatch/shadow.c
@@ -0,0 +1,277 @@
+/*
+ * shadow.c - Shadow Variables
+ *
+ * Copyright (C) 2014 Josh Poimboeuf <jpoimboe@redhat.com>
+ * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
+ * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * DOC: Shadow variable API concurrency notes:
+ *
+ * The shadow variable API provides a simple relationship between an
+ * <obj, id> pair and a pointer value. It is the responsibility of the
+ * caller to provide any mutual exclusion required of the shadow data.
+ *
+ * Once a shadow variable is attached to its parent object via the
+ * klp_shadow_*alloc() API calls, it is considered live: any subsequent
+ * call to klp_shadow_get() may then return the shadow variable's data
+ * pointer. Callers of klp_shadow_*alloc() should prepare shadow data
+ * accordingly.
+ *
+ * The klp_shadow_*alloc() API calls may allocate memory for new shadow
+ * variable structures. Their implementation does not call kmalloc
+ * inside any spinlocks, but API callers should pass GFP flags according
+ * to their specific needs.
+ *
+ * The klp_shadow_hash is an RCU-enabled hashtable and is safe against
+ * concurrent klp_shadow_free() and klp_shadow_get() operations.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/hashtable.h>
+#include <linux/slab.h>
+#include <linux/livepatch.h>
+
+static DEFINE_HASHTABLE(klp_shadow_hash, 12);
+
+/*
+ * klp_shadow_lock provides exclusive access to the klp_shadow_hash and
+ * the shadow variables it references.
+ */
+static DEFINE_SPINLOCK(klp_shadow_lock);
+
+/**
+ * struct klp_shadow - shadow variable structure
+ * @node: klp_shadow_hash hash table node
+ * @rcu_head: RCU is used to safely free this structure
+ * @obj: pointer to parent object
+ * @id: data identifier
+ * @data: data area
+ */
+struct klp_shadow {
+ struct hlist_node node;
+ struct rcu_head rcu_head;
+ void *obj;
+ unsigned long id;
+ char data[];
+};
+
+/**
+ * klp_shadow_match() - verify a shadow variable matches given <obj, id>
+ * @shadow: shadow variable to match
+ * @obj: pointer to parent object
+ * @id: data identifier
+ *
+ * Return: true if the shadow variable matches.
+ */
+static inline bool klp_shadow_match(struct klp_shadow *shadow, void *obj,
+ unsigned long id)
+{
+ return shadow->obj == obj && shadow->id == id;
+}
+
+/**
+ * klp_shadow_get() - retrieve a shadow variable data pointer
+ * @obj: pointer to parent object
+ * @id: data identifier
+ *
+ * Return: the shadow variable data element, NULL on failure.
+ */
+void *klp_shadow_get(void *obj, unsigned long id)
+{
+ struct klp_shadow *shadow;
+
+ rcu_read_lock();
+
+ hash_for_each_possible_rcu(klp_shadow_hash, shadow, node,
+ (unsigned long)obj) {
+
+ if (klp_shadow_match(shadow, obj, id)) {
+ rcu_read_unlock();
+ return shadow->data;
+ }
+ }
+
+ rcu_read_unlock();
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(klp_shadow_get);
+
+static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data,
+ size_t size, gfp_t gfp_flags, bool warn_on_exist)
+{
+ struct klp_shadow *new_shadow;
+ void *shadow_data;
+ unsigned long flags;
+
+ /* Check if the shadow variable already exists */
+ shadow_data = klp_shadow_get(obj, id);
+ if (shadow_data)
+ goto exists;
+
+ /* Allocate a new shadow variable for use inside the lock below */
+ new_shadow = kzalloc(size + sizeof(*new_shadow), gfp_flags);
+ if (!new_shadow)
+ return NULL;
+
+ new_shadow->obj = obj;
+ new_shadow->id = id;
+
+ /* Initialize the shadow variable if data provided */
+ if (data)
+ memcpy(new_shadow->data, data, size);
+
+ /* Look for <obj, id> again under the lock */
+ spin_lock_irqsave(&klp_shadow_lock, flags);
+ shadow_data = klp_shadow_get(obj, id);
+ if (unlikely(shadow_data)) {
+ /*
+ * Shadow variable was found, throw away speculative
+ * allocation.
+ */
+ spin_unlock_irqrestore(&klp_shadow_lock, flags);
+ kfree(new_shadow);
+ goto exists;
+ }
+
+ /* No <obj, id> found, so attach the newly allocated one */
+ hash_add_rcu(klp_shadow_hash, &new_shadow->node,
+ (unsigned long)new_shadow->obj);
+ spin_unlock_irqrestore(&klp_shadow_lock, flags);
+
+ return new_shadow->data;
+
+exists:
+ if (warn_on_exist) {
+ WARN(1, "Duplicate shadow variable <%p, %lx>\n", obj, id);
+ return NULL;
+ }
+
+ return shadow_data;
+}
+
+/**
+ * klp_shadow_alloc() - allocate and add a new shadow variable
+ * @obj: pointer to parent object
+ * @id: data identifier
+ * @data: pointer to data to attach to parent
+ * @size: size of attached data
+ * @gfp_flags: GFP mask for allocation
+ *
+ * Allocates @size bytes for new shadow variable data using @gfp_flags
+ * and copies @size bytes from @data into the new shadow variable's own
+ * data space. If @data is NULL, @size bytes are still allocated, but
+ * no copy is performed. The new shadow variable is then added to the
+ * global hashtable.
+ *
+ * If an existing <obj, id> shadow variable can be found, this routine
+ * will issue a WARN, exit early and return NULL.
+ *
+ * Return: the shadow variable data element, NULL on duplicate or
+ * failure.
+ */
+void *klp_shadow_alloc(void *obj, unsigned long id, void *data,
+ size_t size, gfp_t gfp_flags)
+{
+ return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, true);
+}
+EXPORT_SYMBOL_GPL(klp_shadow_alloc);
+
+/**
+ * klp_shadow_get_or_alloc() - get existing or allocate a new shadow variable
+ * @obj: pointer to parent object
+ * @id: data identifier
+ * @data: pointer to data to attach to parent
+ * @size: size of attached data
+ * @gfp_flags: GFP mask for allocation
+ *
+ * Returns a pointer to existing shadow data if an <obj, id> shadow
+ * variable is already present. Otherwise, it creates a new shadow
+ * variable like klp_shadow_alloc().
+ *
+ * This function guarantees that only one shadow variable exists with
+ * the given @id for the given @obj. It also guarantees that the shadow
+ * variable will be initialized by the given @data only when it did not
+ * exist before.
+ *
+ * Return: the shadow variable data element, NULL on failure.
+ */
+void *klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data,
+ size_t size, gfp_t gfp_flags)
+{
+ return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, false);
+}
+EXPORT_SYMBOL_GPL(klp_shadow_get_or_alloc);
+
+/**
+ * klp_shadow_free() - detach and free a <obj, id> shadow variable
+ * @obj: pointer to parent object
+ * @id: data identifier
+ *
+ * This function releases the memory for this <obj, id> shadow variable
+ * instance; callers should stop referencing it accordingly.
+ */
+void klp_shadow_free(void *obj, unsigned long id)
+{
+ struct klp_shadow *shadow;
+ unsigned long flags;
+
+ spin_lock_irqsave(&klp_shadow_lock, flags);
+
+ /* Delete <obj, id> from hash */
+ hash_for_each_possible(klp_shadow_hash, shadow, node,
+ (unsigned long)obj) {
+
+ if (klp_shadow_match(shadow, obj, id)) {
+ hash_del_rcu(&shadow->node);
+ kfree_rcu(shadow, rcu_head);
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&klp_shadow_lock, flags);
+}
+EXPORT_SYMBOL_GPL(klp_shadow_free);
+
+/**
+ * klp_shadow_free_all() - detach and free all <*, id> shadow variables
+ * @id: data identifier
+ *
+ * This function releases the memory for all <*, id> shadow variable
+ * instances; callers should stop referencing them accordingly.
+ */
+void klp_shadow_free_all(unsigned long id)
+{
+ struct klp_shadow *shadow;
+ unsigned long flags;
+ int i;
+
+ spin_lock_irqsave(&klp_shadow_lock, flags);
+
+ /* Delete all <*, id> from hash */
+ hash_for_each(klp_shadow_hash, i, shadow, node) {
+ if (klp_shadow_match(shadow, shadow->obj, id)) {
+ hash_del_rcu(&shadow->node);
+ kfree_rcu(shadow, rcu_head);
+ }
+ }
+
+ spin_unlock_irqrestore(&klp_shadow_lock, flags);
+}
+EXPORT_SYMBOL_GPL(klp_shadow_free_all);
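
Editor's note: a hedged usage sketch of the API added above, attaching extra per-object state without changing the object's struct layout. SHADOW_ID_EXTRA and struct extra are made-up names; the call signatures match the functions in this file:

#include <linux/livepatch.h>
#include <linux/slab.h>

#define SHADOW_ID_EXTRA	1	/* arbitrary id half of the <obj, id> key */

struct extra {
	unsigned long flags;
};

/* Look up, or create and attach, the shadow data for @obj. */
static struct extra *extra_get(void *obj)
{
	struct extra init = { .flags = 0 };

	return klp_shadow_get_or_alloc(obj, SHADOW_ID_EXTRA,
				       &init, sizeof(init), GFP_KERNEL);
}

/* Detach and free the shadow data when @obj goes away. */
static void extra_put(void *obj)
{
	klp_shadow_free(obj, SHADOW_ID_EXTRA);
}

On patch unload, klp_shadow_free_all(SHADOW_ID_EXTRA) would drop whatever instances remain.
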
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index b004a1fb6032..56add6327736 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -82,6 +82,10 @@ static void klp_complete_transition(void)
unsigned int cpu;
bool immediate_func = false;
+ pr_debug("'%s': completing %s transition\n",
+ klp_transition_patch->mod->name,
+ klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
+
if (klp_target_state == KLP_UNPATCHED) {
/*
* All tasks have transitioned to KLP_UNPATCHED so we can now
@@ -109,9 +113,6 @@ static void klp_complete_transition(void)
}
}
- if (klp_target_state == KLP_UNPATCHED && !immediate_func)
- module_put(klp_transition_patch->mod);
-
/* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */
if (klp_target_state == KLP_PATCHED)
klp_synchronize_transition();
@@ -130,6 +131,27 @@ static void klp_complete_transition(void)
}
done:
+ klp_for_each_object(klp_transition_patch, obj) {
+ if (!klp_is_object_loaded(obj))
+ continue;
+ if (klp_target_state == KLP_PATCHED)
+ klp_post_patch_callback(obj);
+ else if (klp_target_state == KLP_UNPATCHED)
+ klp_post_unpatch_callback(obj);
+ }
+
+ pr_notice("'%s': %s complete\n", klp_transition_patch->mod->name,
+ klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
+
+ /*
+ * See complementary comment in __klp_enable_patch() for why we
+ * keep the module reference for immediate patches.
+ */
+ if (!klp_transition_patch->immediate && !immediate_func &&
+ klp_target_state == KLP_UNPATCHED) {
+ module_put(klp_transition_patch->mod);
+ }
+
klp_target_state = KLP_UNDEFINED;
klp_transition_patch = NULL;
}
@@ -145,6 +167,9 @@ void klp_cancel_transition(void)
if (WARN_ON_ONCE(klp_target_state != KLP_PATCHED))
return;
+ pr_debug("'%s': canceling patching transition, going to unpatch\n",
+ klp_transition_patch->mod->name);
+
klp_target_state = KLP_UNPATCHED;
klp_complete_transition();
}
@@ -408,9 +433,6 @@ void klp_try_complete_transition(void)
}
success:
- pr_notice("'%s': %s complete\n", klp_transition_patch->mod->name,
- klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
-
/* we're done, now cleanup the data structures */
klp_complete_transition();
}
@@ -426,7 +448,8 @@ void klp_start_transition(void)
WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED);
- pr_notice("'%s': %s...\n", klp_transition_patch->mod->name,
+ pr_notice("'%s': starting %s transition\n",
+ klp_transition_patch->mod->name,
klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
/*
@@ -482,6 +505,9 @@ void klp_init_transition(struct klp_patch *patch, int state)
*/
klp_target_state = state;
+ pr_debug("'%s': initializing %s transition\n", patch->mod->name,
+ klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
+
/*
* If the patch can be applied or reverted immediately, skip the
* per-task transitions.
@@ -547,6 +573,11 @@ void klp_reverse_transition(void)
unsigned int cpu;
struct task_struct *g, *task;
+ pr_debug("'%s': reversing transition from %s\n",
+ klp_transition_patch->mod->name,
+ klp_target_state == KLP_PATCHED ? "patching to unpatching" :
+ "unpatching to patching");
+
klp_transition_patch->enabled = !klp_transition_patch->enabled;
klp_target_state = !klp_target_state;
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index db933d063bfc..9776da8db180 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -47,7 +47,6 @@
#include <linux/stringify.h>
#include <linux/bitops.h>
#include <linux/gfp.h>
-#include <linux/kmemcheck.h>
#include <linux/random.h>
#include <linux/jhash.h>
@@ -3238,8 +3237,6 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name,
{
int i;
- kmemcheck_mark_initialized(lock, sizeof(*lock));
-
for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
lock->class_cache[i] = NULL;
diff --git a/kernel/module.c b/kernel/module.c
index 32c2cdaccd93..f0411a271765 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -847,10 +847,8 @@ static int add_module_usage(struct module *a, struct module *b)
pr_debug("Allocating new usage for %s.\n", a->name);
use = kmalloc(sizeof(*use), GFP_ATOMIC);
- if (!use) {
- pr_warn("%s: out of memory loading\n", a->name);
+ if (!use)
return -ENOMEM;
- }
use->source = a;
use->target = b;
@@ -3483,6 +3481,8 @@ static noinline int do_init_module(struct module *mod)
if (!mod->async_probe_requested && (current->flags & PF_USED_ASYNC))
async_synchronize_full();
+ ftrace_free_mem(mod, mod->init_layout.base, mod->init_layout.base +
+ mod->init_layout.size);
mutex_lock(&module_mutex);
/* Drop initial reference. */
module_put(mod);
diff --git a/kernel/padata.c b/kernel/padata.c
index 868f947166d7..57c0074d50cc 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -131,6 +131,7 @@ int padata_do_parallel(struct padata_instance *pinst,
padata->cb_cpu = cb_cpu;
target_cpu = padata_cpu_hash(pd);
+ padata->cpu = target_cpu;
queue = per_cpu_ptr(pd->pqueue, target_cpu);
spin_lock(&queue->parallel.lock);
@@ -275,11 +276,51 @@ static void padata_reorder(struct parallel_data *pd)
return;
}
-static void padata_reorder_timer(unsigned long arg)
+static void invoke_padata_reorder(struct work_struct *work)
{
- struct parallel_data *pd = (struct parallel_data *)arg;
+ struct padata_parallel_queue *pqueue;
+ struct parallel_data *pd;
+ local_bh_disable();
+ pqueue = container_of(work, struct padata_parallel_queue, reorder_work);
+ pd = pqueue->pd;
padata_reorder(pd);
+ local_bh_enable();
+}
+
+static void padata_reorder_timer(struct timer_list *t)
+{
+ struct parallel_data *pd = from_timer(pd, t, timer);
+ unsigned int weight;
+ int target_cpu, cpu;
+
+ cpu = get_cpu();
+
+ /* We don't lock pd here so as not to interfere with parallel
+ * padata_reorder() calls running on other CPUs. We just need any CPU
+ * out of the cpumask.pcpu set; it would be nice if it were the right
+ * one, but it doesn't matter if we end up on the next one due to an
+ * outdated pd->processed value.
+ */
+ weight = cpumask_weight(pd->cpumask.pcpu);
+ target_cpu = padata_index_to_cpu(pd, pd->processed % weight);
+
+ /* ensure to call the reorder callback on the correct CPU */
+ if (cpu != target_cpu) {
+ struct padata_parallel_queue *pqueue;
+ struct padata_instance *pinst;
+
+ /* The timer function is serialized wrt itself -- no locking
+ * needed.
+ */
+ pinst = pd->pinst;
+ pqueue = per_cpu_ptr(pd->pqueue, target_cpu);
+ queue_work_on(target_cpu, pinst->wq, &pqueue->reorder_work);
+ } else {
+ padata_reorder(pd);
+ }
+
+ put_cpu();
}
static void padata_serial_worker(struct work_struct *serial_work)
@@ -323,10 +364,21 @@ void padata_do_serial(struct padata_priv *padata)
int cpu;
struct padata_parallel_queue *pqueue;
struct parallel_data *pd;
+ int reorder_via_wq = 0;
pd = padata->pd;
cpu = get_cpu();
+
+ /* We need to run on the same CPU padata_do_parallel(.., padata, ..)
+ * was called on -- or, at least, enqueue the padata object into the
+ * correct per-cpu queue.
+ */
+ if (cpu != padata->cpu) {
+ reorder_via_wq = 1;
+ cpu = padata->cpu;
+ }
+
pqueue = per_cpu_ptr(pd->pqueue, cpu);
spin_lock(&pqueue->reorder.lock);
@@ -336,7 +388,13 @@ void padata_do_serial(struct padata_priv *padata)
put_cpu();
- padata_reorder(pd);
+ /* If we're running on the wrong CPU, call padata_reorder() via a
+ * kernel worker.
+ */
+ if (reorder_via_wq)
+ queue_work_on(cpu, pd->pinst->wq, &pqueue->reorder_work);
+ else
+ padata_reorder(pd);
}
EXPORT_SYMBOL(padata_do_serial);
@@ -384,8 +442,14 @@ static void padata_init_pqueues(struct parallel_data *pd)
struct padata_parallel_queue *pqueue;
cpu_index = 0;
- for_each_cpu(cpu, pd->cpumask.pcpu) {
+ for_each_possible_cpu(cpu) {
pqueue = per_cpu_ptr(pd->pqueue, cpu);
+
+ if (!cpumask_test_cpu(cpu, pd->cpumask.pcpu)) {
+ pqueue->cpu_index = -1;
+ continue;
+ }
+
pqueue->pd = pd;
pqueue->cpu_index = cpu_index;
cpu_index++;
@@ -393,6 +457,7 @@ static void padata_init_pqueues(struct parallel_data *pd)
__padata_list_init(&pqueue->reorder);
__padata_list_init(&pqueue->parallel);
INIT_WORK(&pqueue->work, padata_parallel_worker);
+ INIT_WORK(&pqueue->reorder_work, invoke_padata_reorder);
atomic_set(&pqueue->num_obj, 0);
}
}
@@ -420,7 +485,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
padata_init_pqueues(pd);
padata_init_squeues(pd);
- setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
+ timer_setup(&pd->timer, padata_reorder_timer, 0);
atomic_set(&pd->seq_nr, -1);
atomic_set(&pd->reorder_objects, 0);
atomic_set(&pd->refcnt, 0);
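
Editor's note: the padata hunk also converts a timer embedded in a structure to the new API: timer_setup() replaces setup_timer(), and the callback recovers its container with from_timer() instead of casting an unsigned long. A generic sketch with a hypothetical payload structure:

#include <linux/timer.h>

struct my_data {
	struct timer_list timer;
	int pending;
};

static void my_timer_fn(struct timer_list *t)
{
	/* Recover the containing structure from the timer pointer. */
	struct my_data *d = from_timer(d, t, timer);

	d->pending = 0;
}

static void my_data_init(struct my_data *d)
{
	d->pending = 0;
	timer_setup(&d->timer, my_timer_fn, 0);
}
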
diff --git a/kernel/panic.c b/kernel/panic.c
index bdd18afa19a4..2cfef408fec9 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -27,6 +27,8 @@
#include <linux/console.h>
#include <linux/bug.h>
#include <linux/ratelimit.h>
+#include <linux/debugfs.h>
+#include <asm/sections.h>
#define PANIC_TIMER_STEP 100
#define PANIC_BLINK_SPD 18
@@ -322,6 +324,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
{ 'E', ' ', true }, /* TAINT_UNSIGNED_MODULE */
{ 'L', ' ', false }, /* TAINT_SOFTLOCKUP */
{ 'K', ' ', true }, /* TAINT_LIVEPATCH */
+ { 'X', ' ', true }, /* TAINT_AUX */
};
/**
@@ -343,6 +346,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
* 'E' - Unsigned module has been loaded.
* 'L' - A soft lockup has previously occurred.
* 'K' - Kernel has been live patched.
+ * 'X' - Auxiliary taint, for distros' use.
*
* The string is overwritten by the next call to print_tainted().
*/
@@ -518,7 +522,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
{
disable_trace_on_warning();
- pr_warn("------------[ cut here ]------------\n");
+ if (args)
+ pr_warn(CUT_HERE);
if (file)
pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n",
@@ -582,9 +587,49 @@ EXPORT_SYMBOL(warn_slowpath_fmt_taint);
void warn_slowpath_null(const char *file, int line)
{
+ pr_warn(CUT_HERE);
__warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, NULL);
}
EXPORT_SYMBOL(warn_slowpath_null);
+#else
+void __warn_printk(const char *fmt, ...)
+{
+ va_list args;
+
+ pr_warn(CUT_HERE);
+
+ va_start(args, fmt);
+ vprintk(fmt, args);
+ va_end(args);
+}
+EXPORT_SYMBOL(__warn_printk);
+#endif
+
+#ifdef CONFIG_BUG
+
+/* Support resetting WARN*_ONCE state */
+
+static int clear_warn_once_set(void *data, u64 val)
+{
+ generic_bug_clear_once();
+ memset(__start_once, 0, __end_once - __start_once);
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(clear_warn_once_fops,
+ NULL,
+ clear_warn_once_set,
+ "%lld\n");
+
+static __init int register_warn_debugfs(void)
+{
+ /* Don't care about failure */
+ debugfs_create_file("clear_warn_once", 0200, NULL,
+ NULL, &clear_warn_once_fops);
+ return 0;
+}
+
+device_initcall(register_warn_debugfs);
#endif
#ifdef CONFIG_CC_STACKPROTECTOR
diff --git a/kernel/pid.c b/kernel/pid.c
index 020dedbdf066..b13b624e2c49 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -39,11 +39,8 @@
#include <linux/proc_ns.h>
#include <linux/proc_fs.h>
#include <linux/sched/task.h>
+#include <linux/idr.h>
-#define pid_hashfn(nr, ns) \
- hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
-static struct hlist_head *pid_hash;
-static unsigned int pidhash_shift = 4;
struct pid init_struct_pid = INIT_STRUCT_PID;
int pid_max = PID_MAX_DEFAULT;
@@ -53,15 +50,6 @@ int pid_max = PID_MAX_DEFAULT;
int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;
-static inline int mk_pid(struct pid_namespace *pid_ns,
- struct pidmap *map, int off)
-{
- return (map - pid_ns->pidmap)*BITS_PER_PAGE + off;
-}
-
-#define find_next_offset(map, off) \
- find_next_zero_bit((map)->page, BITS_PER_PAGE, off)
-
/*
* PID-map pages start out as NULL, they get allocated upon
* first use and are never deallocated. This way a low pid_max
@@ -70,11 +58,8 @@ static inline int mk_pid(struct pid_namespace *pid_ns,
*/
struct pid_namespace init_pid_ns = {
.kref = KREF_INIT(2),
- .pidmap = {
- [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
- },
- .last_pid = 0,
- .nr_hashed = PIDNS_HASH_ADDING,
+ .idr = IDR_INIT,
+ .pid_allocated = PIDNS_ADDING,
.level = 0,
.child_reaper = &init_task,
.user_ns = &init_user_ns,
@@ -101,138 +86,6 @@ EXPORT_SYMBOL_GPL(init_pid_ns);
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
-static void free_pidmap(struct upid *upid)
-{
- int nr = upid->nr;
- struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE;
- int offset = nr & BITS_PER_PAGE_MASK;
-
- clear_bit(offset, map->page);
- atomic_inc(&map->nr_free);
-}
-
-/*
- * If we started walking pids at 'base', is 'a' seen before 'b'?
- */
-static int pid_before(int base, int a, int b)
-{
- /*
- * This is the same as saying
- *
- * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT
- * and that mapping orders 'a' and 'b' with respect to 'base'.
- */
- return (unsigned)(a - base) < (unsigned)(b - base);
-}
-
-/*
- * We might be racing with someone else trying to set pid_ns->last_pid
- * at the pid allocation time (there's also a sysctl for this, but racing
- * with this one is OK, see comment in kernel/pid_namespace.c about it).
- * We want the winner to have the "later" value, because if the
- * "earlier" value prevails, then a pid may get reused immediately.
- *
- * Since pids rollover, it is not sufficient to just pick the bigger
- * value. We have to consider where we started counting from.
- *
- * 'base' is the value of pid_ns->last_pid that we observed when
- * we started looking for a pid.
- *
- * 'pid' is the pid that we eventually found.
- */
-static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid)
-{
- int prev;
- int last_write = base;
- do {
- prev = last_write;
- last_write = cmpxchg(&pid_ns->last_pid, prev, pid);
- } while ((prev != last_write) && (pid_before(base, last_write, pid)));
-}
-
-static int alloc_pidmap(struct pid_namespace *pid_ns)
-{
- int i, offset, max_scan, pid, last = pid_ns->last_pid;
- struct pidmap *map;
-
- pid = last + 1;
- if (pid >= pid_max)
- pid = RESERVED_PIDS;
- offset = pid & BITS_PER_PAGE_MASK;
- map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
- /*
- * If last_pid points into the middle of the map->page we
- * want to scan this bitmap block twice, the second time
- * we start with offset == 0 (or RESERVED_PIDS).
- */
- max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset;
- for (i = 0; i <= max_scan; ++i) {
- if (unlikely(!map->page)) {
- void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
- /*
- * Free the page if someone raced with us
- * installing it:
- */
- spin_lock_irq(&pidmap_lock);
- if (!map->page) {
- map->page = page;
- page = NULL;
- }
- spin_unlock_irq(&pidmap_lock);
- kfree(page);
- if (unlikely(!map->page))
- return -ENOMEM;
- }
- if (likely(atomic_read(&map->nr_free))) {
- for ( ; ; ) {
- if (!test_and_set_bit(offset, map->page)) {
- atomic_dec(&map->nr_free);
- set_last_pid(pid_ns, last, pid);
- return pid;
- }
- offset = find_next_offset(map, offset);
- if (offset >= BITS_PER_PAGE)
- break;
- pid = mk_pid(pid_ns, map, offset);
- if (pid >= pid_max)
- break;
- }
- }
- if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
- ++map;
- offset = 0;
- } else {
- map = &pid_ns->pidmap[0];
- offset = RESERVED_PIDS;
- if (unlikely(last == offset))
- break;
- }
- pid = mk_pid(pid_ns, map, offset);
- }
- return -EAGAIN;
-}
-
-int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
-{
- int offset;
- struct pidmap *map, *end;
-
- if (last >= PID_MAX_LIMIT)
- return -1;
-
- offset = (last + 1) & BITS_PER_PAGE_MASK;
- map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
- end = &pid_ns->pidmap[PIDMAP_ENTRIES];
- for (; map < end; map++, offset = 0) {
- if (unlikely(!map->page))
- continue;
- offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
- if (offset < BITS_PER_PAGE)
- return mk_pid(pid_ns, map, offset);
- }
- return -1;
-}
-
void put_pid(struct pid *pid)
{
struct pid_namespace *ns;
@@ -265,8 +118,7 @@ void free_pid(struct pid *pid)
for (i = 0; i <= pid->level; i++) {
struct upid *upid = pid->numbers + i;
struct pid_namespace *ns = upid->ns;
- hlist_del_rcu(&upid->pid_chain);
- switch(--ns->nr_hashed) {
+ switch (--ns->pid_allocated) {
case 2:
case 1:
/* When all that is left in the pid namespace
@@ -275,21 +127,20 @@ void free_pid(struct pid *pid)
*/
wake_up_process(ns->child_reaper);
break;
- case PIDNS_HASH_ADDING:
+ case PIDNS_ADDING:
/* Handle a fork failure of the first process */
WARN_ON(ns->child_reaper);
- ns->nr_hashed = 0;
+ ns->pid_allocated = 0;
/* fall through */
case 0:
schedule_work(&ns->proc_work);
break;
}
+
+ idr_remove(&ns->idr, upid->nr);
}
spin_unlock_irqrestore(&pidmap_lock, flags);
- for (i = 0; i <= pid->level; i++)
- free_pidmap(pid->numbers + i);
-
call_rcu(&pid->rcu, delayed_put_pid);
}
@@ -308,8 +159,29 @@ struct pid *alloc_pid(struct pid_namespace *ns)
tmp = ns;
pid->level = ns->level;
+
for (i = ns->level; i >= 0; i--) {
- nr = alloc_pidmap(tmp);
+ int pid_min = 1;
+
+ idr_preload(GFP_KERNEL);
+ spin_lock_irq(&pidmap_lock);
+
+ /*
+ * init really needs pid 1, but after reaching the maximum
+ * wrap back to RESERVED_PIDS
+ */
+ if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
+ pid_min = RESERVED_PIDS;
+
+ /*
+ * Store a null pointer so find_pid_ns does not find
+ * a partially initialized PID (see below).
+ */
+ nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
+ pid_max, GFP_ATOMIC);
+ spin_unlock_irq(&pidmap_lock);
+ idr_preload_end();
+
if (nr < 0) {
retval = nr;
goto out_free;
@@ -334,12 +206,12 @@ struct pid *alloc_pid(struct pid_namespace *ns)
upid = pid->numbers + ns->level;
spin_lock_irq(&pidmap_lock);
- if (!(ns->nr_hashed & PIDNS_HASH_ADDING))
+ if (!(ns->pid_allocated & PIDNS_ADDING))
goto out_unlock;
for ( ; upid >= pid->numbers; --upid) {
- hlist_add_head_rcu(&upid->pid_chain,
- &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
- upid->ns->nr_hashed++;
+ /* Make the PID visible to find_pid_ns. */
+ idr_replace(&upid->ns->idr, pid, upid->nr);
+ upid->ns->pid_allocated++;
}
spin_unlock_irq(&pidmap_lock);
@@ -350,8 +222,11 @@ out_unlock:
put_pid_ns(ns);
out_free:
+ spin_lock_irq(&pidmap_lock);
while (++i <= ns->level)
- free_pidmap(pid->numbers + i);
+ idr_remove(&ns->idr, (pid->numbers + i)->nr);
+
+ spin_unlock_irq(&pidmap_lock);
kmem_cache_free(ns->pid_cachep, pid);
return ERR_PTR(retval);
@@ -360,21 +235,13 @@ out_free:
void disable_pid_allocation(struct pid_namespace *ns)
{
spin_lock_irq(&pidmap_lock);
- ns->nr_hashed &= ~PIDNS_HASH_ADDING;
+ ns->pid_allocated &= ~PIDNS_ADDING;
spin_unlock_irq(&pidmap_lock);
}
struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
{
- struct upid *pnr;
-
- hlist_for_each_entry_rcu(pnr,
- &pid_hash[pid_hashfn(nr, ns)], pid_chain)
- if (pnr->nr == nr && pnr->ns == ns)
- return container_of(pnr, struct pid,
- numbers[ns->level]);
-
- return NULL;
+ return idr_find(&ns->idr, nr);
}
EXPORT_SYMBOL_GPL(find_pid_ns);
@@ -530,6 +397,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
if (type != PIDTYPE_PID) {
if (type == __PIDTYPE_TGID)
type = PIDTYPE_PID;
+
task = task->group_leader;
}
nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns);
@@ -553,35 +421,13 @@ EXPORT_SYMBOL_GPL(task_active_pid_ns);
*/
struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
{
- struct pid *pid;
-
- do {
- pid = find_pid_ns(nr, ns);
- if (pid)
- break;
- nr = next_pidmap(ns, nr);
- } while (nr > 0);
-
- return pid;
-}
-
-/*
- * The pid hash table is scaled according to the amount of memory in the
- * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
- * more.
- */
-void __init pidhash_init(void)
-{
- pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
- HASH_EARLY | HASH_SMALL | HASH_ZERO,
- &pidhash_shift, NULL,
- 0, 4096);
+ return idr_get_next(&ns->idr, &nr);
}
-void __init pidmap_init(void)
+void __init pid_idr_init(void)
{
/* Verify no one has done anything silly: */
- BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);
+ BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
/* bump default and minimum pid_max based on number of cpus */
pid_max = min(pid_max_max, max_t(int, pid_max,
@@ -590,10 +436,7 @@ void __init pidmap_init(void)
PIDS_PER_CPU_MIN * num_possible_cpus());
pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
- init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
- /* Reserve PID 0. We never call free_pidmap(0) */
- set_bit(0, init_pid_ns.pidmap[0].page);
- atomic_dec(&init_pid_ns.pidmap[0].nr_free);
+ idr_init(&init_pid_ns.idr);
init_pid_ns.pid_cachep = KMEM_CACHE(pid,
SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
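
Editor's note: the replacement allocator reserves an id first and publishes the struct pid only after it is fully initialized, so idr_find()/find_pid_ns() never sees a half-built entry. A condensed sketch of that allocate-then-publish pattern using the same IDR calls as the hunks above; the lock, bounds, and object here are placeholders, and the real code does this once per namespace level under pidmap_lock:

#include <linux/idr.h>
#include <linux/gfp.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(lock);
static DEFINE_IDR(idr);

/* Reserve an id now, publish the object only once it is fully set up. */
static int publish(void *object, int min, int max)
{
	int id;

	idr_preload(GFP_KERNEL);		/* preallocate outside the lock */
	spin_lock_irq(&lock);
	/* A NULL pointer keeps lookups from seeing a half-initialized object. */
	id = idr_alloc_cyclic(&idr, NULL, min, max, GFP_ATOMIC);
	spin_unlock_irq(&lock);
	idr_preload_end();
	if (id < 0)
		return id;

	/* ... finish initializing the object using 'id' ... */

	spin_lock_irq(&lock);
	idr_replace(&idr, object, id);		/* now visible to idr_find() */
	spin_unlock_irq(&lock);
	return id;
}
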
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 4918314893bc..0b53eef7d34b 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -21,6 +21,7 @@
#include <linux/export.h>
#include <linux/sched/task.h>
#include <linux/sched/signal.h>
+#include <linux/idr.h>
struct pid_cache {
int nr_ids;
@@ -98,7 +99,6 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
struct pid_namespace *ns;
unsigned int level = parent_pid_ns->level + 1;
struct ucounts *ucounts;
- int i;
int err;
err = -EINVAL;
@@ -117,17 +117,15 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
if (ns == NULL)
goto out_dec;
- ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
- if (!ns->pidmap[0].page)
- goto out_free;
+ idr_init(&ns->idr);
ns->pid_cachep = create_pid_cachep(level + 1);
if (ns->pid_cachep == NULL)
- goto out_free_map;
+ goto out_free_idr;
err = ns_alloc_inum(&ns->ns);
if (err)
- goto out_free_map;
+ goto out_free_idr;
ns->ns.ops = &pidns_operations;
kref_init(&ns->kref);
@@ -135,20 +133,13 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
ns->parent = get_pid_ns(parent_pid_ns);
ns->user_ns = get_user_ns(user_ns);
ns->ucounts = ucounts;
- ns->nr_hashed = PIDNS_HASH_ADDING;
+ ns->pid_allocated = PIDNS_ADDING;
INIT_WORK(&ns->proc_work, proc_cleanup_work);
- set_bit(0, ns->pidmap[0].page);
- atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
-
- for (i = 1; i < PIDMAP_ENTRIES; i++)
- atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
-
return ns;
-out_free_map:
- kfree(ns->pidmap[0].page);
-out_free:
+out_free_idr:
+ idr_destroy(&ns->idr);
kmem_cache_free(pid_ns_cachep, ns);
out_dec:
dec_pid_namespaces(ucounts);
@@ -168,11 +159,9 @@ static void delayed_free_pidns(struct rcu_head *p)
static void destroy_pid_namespace(struct pid_namespace *ns)
{
- int i;
-
ns_free_inum(&ns->ns);
- for (i = 0; i < PIDMAP_ENTRIES; i++)
- kfree(ns->pidmap[i].page);
+
+ idr_destroy(&ns->idr);
call_rcu(&ns->rcu, delayed_free_pidns);
}
@@ -213,6 +202,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
int rc;
struct task_struct *task, *me = current;
int init_pids = thread_group_leader(me) ? 1 : 2;
+ struct pid *pid;
/* Don't allow any more processes into the pid namespace */
disable_pid_allocation(pid_ns);
@@ -239,20 +229,16 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
* maintain a tasklist for each pid namespace.
*
*/
+ rcu_read_lock();
read_lock(&tasklist_lock);
- nr = next_pidmap(pid_ns, 1);
- while (nr > 0) {
- rcu_read_lock();
-
- task = pid_task(find_vpid(nr), PIDTYPE_PID);
+ nr = 2;
+ idr_for_each_entry_continue(&pid_ns->idr, pid, nr) {
+ task = pid_task(pid, PIDTYPE_PID);
if (task && !__fatal_signal_pending(task))
send_sig_info(SIGKILL, SEND_SIG_FORCED, task);
-
- rcu_read_unlock();
-
- nr = next_pidmap(pid_ns, nr);
}
read_unlock(&tasklist_lock);
+ rcu_read_unlock();
/*
* Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD.
@@ -268,7 +254,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
* sys_wait4() above can't reap the EXIT_DEAD children but we do not
* really care, we could reparent them to the global init. We could
* exit and reap ->child_reaper even if it is not the last thread in
- * this pid_ns, free_pid(nr_hashed == 0) calls proc_cleanup_work(),
+ * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(),
* pid_ns can not go away until proc_kill_sb() drops the reference.
*
* But this ns can also have other tasks injected by setns()+fork().
@@ -282,7 +268,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
*/
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
- if (pid_ns->nr_hashed == init_pids)
+ if (pid_ns->pid_allocated == init_pids)
break;
schedule();
}
@@ -301,6 +287,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
{
struct pid_namespace *pid_ns = task_active_pid_ns(current);
struct ctl_table tmp = *table;
+ int ret, next;
if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
@@ -311,8 +298,14 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
* it should synchronize its usage with external means.
*/
- tmp.data = &pid_ns->last_pid;
- return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ next = idr_get_cursor(&pid_ns->idr) - 1;
+
+ tmp.data = &next;
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ if (!ret && write)
+ idr_set_cursor(&pid_ns->idr, next + 1);
+
+ return ret;
}
extern int pid_max;
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index e8517b63eb37..e880ca22c5a5 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -259,20 +259,6 @@ config APM_EMULATION
anything, try disabling/enabling this option (or disabling/enabling
APM in your BIOS).
-config PM_OPP
- bool
- select SRCU
- ---help---
- SOCs have a standard set of tuples consisting of frequency and
- voltage pairs that the device will support per voltage domain. This
- is called Operating Performance Point or OPP. The actual definitions
- of OPP varies over silicon within the same family of devices.
-
- OPP layer organizes the data internally using device pointers
- representing individual voltage domains and provides SOC
- implementations a ready to use framework to manage OPPs.
- For more information, read <file:Documentation/power/opp.txt>
-
config PM_CLK
def_bool y
depends on PM && HAVE_CLK
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 97b0df71303e..9d7503910ce2 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -701,8 +701,8 @@ static int __init pm_qos_power_init(void)
for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) {
ret = register_pm_qos_misc(pm_qos_array[i], d);
if (ret < 0) {
- printk(KERN_ERR "pm_qos_param: %s setup failed\n",
- pm_qos_array[i]->name);
+ pr_err("%s: %s setup failed\n",
+ __func__, pm_qos_array[i]->name);
return ret;
}
}
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 0972a8e09d08..bce0464524d8 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -10,6 +10,8 @@
*
*/
+#define pr_fmt(fmt) "PM: " fmt
+
#include <linux/version.h>
#include <linux/module.h>
#include <linux/mm.h>
@@ -967,7 +969,7 @@ void __init __register_nosave_region(unsigned long start_pfn,
region->end_pfn = end_pfn;
list_add_tail(&region->list, &nosave_regions);
Report:
- printk(KERN_INFO "PM: Registered nosave memory: [mem %#010llx-%#010llx]\n",
+ pr_info("Registered nosave memory: [mem %#010llx-%#010llx]\n",
(unsigned long long) start_pfn << PAGE_SHIFT,
((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
}
@@ -1039,7 +1041,7 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
list_for_each_entry(region, &nosave_regions, list) {
unsigned long pfn;
- pr_debug("PM: Marking nosave pages: [mem %#010llx-%#010llx]\n",
+ pr_debug("Marking nosave pages: [mem %#010llx-%#010llx]\n",
(unsigned long long) region->start_pfn << PAGE_SHIFT,
((unsigned long long) region->end_pfn << PAGE_SHIFT)
- 1);
@@ -1095,7 +1097,7 @@ int create_basic_memory_bitmaps(void)
free_pages_map = bm2;
mark_nosave_pages(forbidden_pages_map);
- pr_debug("PM: Basic memory bitmaps created\n");
+ pr_debug("Basic memory bitmaps created\n");
return 0;
@@ -1131,7 +1133,7 @@ void free_basic_memory_bitmaps(void)
memory_bm_free(bm2, PG_UNSAFE_CLEAR);
kfree(bm2);
- pr_debug("PM: Basic memory bitmaps freed\n");
+ pr_debug("Basic memory bitmaps freed\n");
}
void clear_free_pages(void)
@@ -1152,7 +1154,7 @@ void clear_free_pages(void)
pfn = memory_bm_next_pfn(bm);
}
memory_bm_position_reset(bm);
- pr_info("PM: free pages cleared after restore\n");
+ pr_info("free pages cleared after restore\n");
#endif /* PAGE_POISONING_ZERO */
}
@@ -1690,7 +1692,7 @@ int hibernate_preallocate_memory(void)
ktime_t start, stop;
int error;
- printk(KERN_INFO "PM: Preallocating image memory... ");
+ pr_info("Preallocating image memory... ");
start = ktime_get();
error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY);
@@ -1821,13 +1823,13 @@ int hibernate_preallocate_memory(void)
out:
stop = ktime_get();
- printk(KERN_CONT "done (allocated %lu pages)\n", pages);
+ pr_cont("done (allocated %lu pages)\n", pages);
swsusp_show_speed(start, stop, pages, "Allocated");
return 0;
err_out:
- printk(KERN_CONT "\n");
+ pr_cont("\n");
swsusp_free();
return -ENOMEM;
}
@@ -1867,8 +1869,8 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
free += zone_page_state(zone, NR_FREE_PAGES);
nr_pages += count_pages_for_highmem(nr_highmem);
- pr_debug("PM: Normal pages needed: %u + %u, available pages: %u\n",
- nr_pages, PAGES_FOR_IO, free);
+ pr_debug("Normal pages needed: %u + %u, available pages: %u\n",
+ nr_pages, PAGES_FOR_IO, free);
return free > nr_pages + PAGES_FOR_IO;
}
@@ -1882,7 +1884,7 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
*/
static inline int get_highmem_buffer(int safe_needed)
{
- buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed);
+ buffer = get_image_page(GFP_ATOMIC, safe_needed);
return buffer ? 0 : -ENOMEM;
}
@@ -1943,7 +1945,7 @@ static int swsusp_alloc(struct memory_bitmap *copy_bm,
while (nr_pages-- > 0) {
struct page *page;
- page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
+ page = alloc_image_page(GFP_ATOMIC);
if (!page)
goto err_out;
memory_bm_set_bit(copy_bm, page_to_pfn(page));
@@ -1961,20 +1963,20 @@ asmlinkage __visible int swsusp_save(void)
{
unsigned int nr_pages, nr_highmem;
- printk(KERN_INFO "PM: Creating hibernation image:\n");
+ pr_info("Creating hibernation image:\n");
drain_local_pages(NULL);
nr_pages = count_data_pages();
nr_highmem = count_highmem_pages();
- printk(KERN_INFO "PM: Need to copy %u pages\n", nr_pages + nr_highmem);
+ pr_info("Need to copy %u pages\n", nr_pages + nr_highmem);
if (!enough_free_mem(nr_pages, nr_highmem)) {
- printk(KERN_ERR "PM: Not enough free memory\n");
+ pr_err("Not enough free memory\n");
return -ENOMEM;
}
if (swsusp_alloc(&copy_bm, nr_pages, nr_highmem)) {
- printk(KERN_ERR "PM: Memory allocation failed\n");
+ pr_err("Memory allocation failed\n");
return -ENOMEM;
}
@@ -1995,8 +1997,7 @@ asmlinkage __visible int swsusp_save(void)
nr_copy_pages = nr_pages;
nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE);
- printk(KERN_INFO "PM: Hibernation image created (%d pages copied)\n",
- nr_pages);
+ pr_info("Hibernation image created (%d pages copied)\n", nr_pages);
return 0;
}
@@ -2170,7 +2171,7 @@ static int check_header(struct swsusp_info *info)
if (!reason && info->num_physpages != get_num_physpages())
reason = "memory size";
if (reason) {
- printk(KERN_ERR "PM: Image mismatch: %s\n", reason);
+ pr_err("Image mismatch: %s\n", reason);
return -EPERM;
}
return 0;
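
The hunks above rely on the pr_fmt() definition added at the top of snapshot.c: because it appears before any include that pulls in <linux/printk.h>, every pr_info()/pr_err()/pr_debug() in the file gets the "PM: " prefix automatically, so the literal prefix can be dropped from the format strings (pr_cont() is left unprefixed, which is why the continuation lines still join up). A minimal sketch of the mechanism, with a hypothetical helper that is not part of this diff:

#define pr_fmt(fmt) "PM: " fmt

#include <linux/printk.h>

static void report_nosave(unsigned long long start, unsigned long long end)
{
	/* expands to printk(KERN_INFO "PM: Registered nosave memory: ...") */
	pr_info("Registered nosave memory: [mem %#010llx-%#010llx]\n",
		start, end);
}
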
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index ccd2d20e6b06..0685c4499431 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -437,7 +437,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
error = suspend_ops->enter(state);
trace_suspend_resume(TPS("machine_suspend"),
state, false);
- events_check_enabled = false;
} else if (*wakeup) {
error = -EBUSY;
}
@@ -582,6 +581,7 @@ static int enter_state(suspend_state_t state)
pm_restore_gfp_mask();
Finish:
+ events_check_enabled = false;
pm_pr_dbg("Finishing wakeup.\n");
suspend_finish();
Unlock:
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index d7cdc426ee38..293ead59eccc 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -12,6 +12,8 @@
*
*/
+#define pr_fmt(fmt) "PM: " fmt
+
#include <linux/module.h>
#include <linux/file.h>
#include <linux/delay.h>
@@ -241,9 +243,9 @@ static void hib_end_io(struct bio *bio)
struct page *page = bio->bi_io_vec[0].bv_page;
if (bio->bi_status) {
- printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
- MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
- (unsigned long long)bio->bi_iter.bi_sector);
+ pr_alert("Read-error on swap-device (%u:%u:%Lu)\n",
+ MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
+ (unsigned long long)bio->bi_iter.bi_sector);
}
if (bio_data_dir(bio) == WRITE)
@@ -273,8 +275,8 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,
bio_set_op_attrs(bio, op, op_flags);
if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
- printk(KERN_ERR "PM: Adding page to bio failed at %llu\n",
- (unsigned long long)bio->bi_iter.bi_sector);
+ pr_err("Adding page to bio failed at %llu\n",
+ (unsigned long long)bio->bi_iter.bi_sector);
bio_put(bio);
return -EFAULT;
}
@@ -319,7 +321,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC,
swsusp_resume_block, swsusp_header, NULL);
} else {
- printk(KERN_ERR "PM: Swap header not found!\n");
+ pr_err("Swap header not found!\n");
error = -ENODEV;
}
return error;
@@ -413,8 +415,7 @@ static int get_swap_writer(struct swap_map_handle *handle)
ret = swsusp_swap_check();
if (ret) {
if (ret != -ENOSPC)
- printk(KERN_ERR "PM: Cannot find swap device, try "
- "swapon -a.\n");
+ pr_err("Cannot find swap device, try swapon -a\n");
return ret;
}
handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
@@ -491,9 +492,9 @@ static int swap_writer_finish(struct swap_map_handle *handle,
{
if (!error) {
flush_swap_writer(handle);
- printk(KERN_INFO "PM: S");
+ pr_info("S");
error = mark_swapfiles(handle, flags);
- printk("|\n");
+ pr_cont("|\n");
}
if (error)
@@ -542,7 +543,7 @@ static int save_image(struct swap_map_handle *handle,
hib_init_batch(&hb);
- printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n",
+ pr_info("Saving image data pages (%u pages)...\n",
nr_to_write);
m = nr_to_write / 10;
if (!m)
@@ -557,8 +558,8 @@ static int save_image(struct swap_map_handle *handle,
if (ret)
break;
if (!(nr_pages % m))
- printk(KERN_INFO "PM: Image saving progress: %3d%%\n",
- nr_pages / m * 10);
+ pr_info("Image saving progress: %3d%%\n",
+ nr_pages / m * 10);
nr_pages++;
}
err2 = hib_wait_io(&hb);
@@ -566,7 +567,7 @@ static int save_image(struct swap_map_handle *handle,
if (!ret)
ret = err2;
if (!ret)
- printk(KERN_INFO "PM: Image saving done.\n");
+ pr_info("Image saving done\n");
swsusp_show_speed(start, stop, nr_to_write, "Wrote");
return ret;
}
@@ -692,14 +693,14 @@ static int save_image_lzo(struct swap_map_handle *handle,
page = (void *)__get_free_page(__GFP_RECLAIM | __GFP_HIGH);
if (!page) {
- printk(KERN_ERR "PM: Failed to allocate LZO page\n");
+ pr_err("Failed to allocate LZO page\n");
ret = -ENOMEM;
goto out_clean;
}
data = vmalloc(sizeof(*data) * nr_threads);
if (!data) {
- printk(KERN_ERR "PM: Failed to allocate LZO data\n");
+ pr_err("Failed to allocate LZO data\n");
ret = -ENOMEM;
goto out_clean;
}
@@ -708,7 +709,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
crc = kmalloc(sizeof(*crc), GFP_KERNEL);
if (!crc) {
- printk(KERN_ERR "PM: Failed to allocate crc\n");
+ pr_err("Failed to allocate crc\n");
ret = -ENOMEM;
goto out_clean;
}
@@ -726,8 +727,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
"image_compress/%u", thr);
if (IS_ERR(data[thr].thr)) {
data[thr].thr = NULL;
- printk(KERN_ERR
- "PM: Cannot start compression threads\n");
+ pr_err("Cannot start compression threads\n");
ret = -ENOMEM;
goto out_clean;
}
@@ -749,7 +749,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32");
if (IS_ERR(crc->thr)) {
crc->thr = NULL;
- printk(KERN_ERR "PM: Cannot start CRC32 thread\n");
+ pr_err("Cannot start CRC32 thread\n");
ret = -ENOMEM;
goto out_clean;
}
@@ -760,10 +760,9 @@ static int save_image_lzo(struct swap_map_handle *handle,
*/
handle->reqd_free_pages = reqd_free_pages();
- printk(KERN_INFO
- "PM: Using %u thread(s) for compression.\n"
- "PM: Compressing and saving image data (%u pages)...\n",
- nr_threads, nr_to_write);
+ pr_info("Using %u thread(s) for compression\n", nr_threads);
+ pr_info("Compressing and saving image data (%u pages)...\n",
+ nr_to_write);
m = nr_to_write / 10;
if (!m)
m = 1;
@@ -783,10 +782,8 @@ static int save_image_lzo(struct swap_map_handle *handle,
data_of(*snapshot), PAGE_SIZE);
if (!(nr_pages % m))
- printk(KERN_INFO
- "PM: Image saving progress: "
- "%3d%%\n",
- nr_pages / m * 10);
+ pr_info("Image saving progress: %3d%%\n",
+ nr_pages / m * 10);
nr_pages++;
}
if (!off)
@@ -813,15 +810,14 @@ static int save_image_lzo(struct swap_map_handle *handle,
ret = data[thr].ret;
if (ret < 0) {
- printk(KERN_ERR "PM: LZO compression failed\n");
+ pr_err("LZO compression failed\n");
goto out_finish;
}
if (unlikely(!data[thr].cmp_len ||
data[thr].cmp_len >
lzo1x_worst_compress(data[thr].unc_len))) {
- printk(KERN_ERR
- "PM: Invalid LZO compressed length\n");
+ pr_err("Invalid LZO compressed length\n");
ret = -1;
goto out_finish;
}
@@ -857,7 +853,7 @@ out_finish:
if (!ret)
ret = err2;
if (!ret)
- printk(KERN_INFO "PM: Image saving done.\n");
+ pr_info("Image saving done\n");
swsusp_show_speed(start, stop, nr_to_write, "Wrote");
out_clean:
if (crc) {
@@ -888,7 +884,7 @@ static int enough_swap(unsigned int nr_pages, unsigned int flags)
unsigned int free_swap = count_swap_pages(root_swap, 1);
unsigned int required;
- pr_debug("PM: Free swap pages: %u\n", free_swap);
+ pr_debug("Free swap pages: %u\n", free_swap);
required = PAGES_FOR_IO + nr_pages;
return free_swap > required;
@@ -915,12 +911,12 @@ int swsusp_write(unsigned int flags)
pages = snapshot_get_image_size();
error = get_swap_writer(&handle);
if (error) {
- printk(KERN_ERR "PM: Cannot get swap writer\n");
+ pr_err("Cannot get swap writer\n");
return error;
}
if (flags & SF_NOCOMPRESS_MODE) {
if (!enough_swap(pages, flags)) {
- printk(KERN_ERR "PM: Not enough free swap\n");
+ pr_err("Not enough free swap\n");
error = -ENOSPC;
goto out_finish;
}
@@ -1068,8 +1064,7 @@ static int load_image(struct swap_map_handle *handle,
hib_init_batch(&hb);
clean_pages_on_read = true;
- printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n",
- nr_to_read);
+ pr_info("Loading image data pages (%u pages)...\n", nr_to_read);
m = nr_to_read / 10;
if (!m)
m = 1;
@@ -1087,8 +1082,8 @@ static int load_image(struct swap_map_handle *handle,
if (ret)
break;
if (!(nr_pages % m))
- printk(KERN_INFO "PM: Image loading progress: %3d%%\n",
- nr_pages / m * 10);
+ pr_info("Image loading progress: %3d%%\n",
+ nr_pages / m * 10);
nr_pages++;
}
err2 = hib_wait_io(&hb);
@@ -1096,7 +1091,7 @@ static int load_image(struct swap_map_handle *handle,
if (!ret)
ret = err2;
if (!ret) {
- printk(KERN_INFO "PM: Image loading done.\n");
+ pr_info("Image loading done\n");
snapshot_write_finalize(snapshot);
if (!snapshot_image_loaded(snapshot))
ret = -ENODATA;
@@ -1190,14 +1185,14 @@ static int load_image_lzo(struct swap_map_handle *handle,
page = vmalloc(sizeof(*page) * LZO_MAX_RD_PAGES);
if (!page) {
- printk(KERN_ERR "PM: Failed to allocate LZO page\n");
+ pr_err("Failed to allocate LZO page\n");
ret = -ENOMEM;
goto out_clean;
}
data = vmalloc(sizeof(*data) * nr_threads);
if (!data) {
- printk(KERN_ERR "PM: Failed to allocate LZO data\n");
+ pr_err("Failed to allocate LZO data\n");
ret = -ENOMEM;
goto out_clean;
}
@@ -1206,7 +1201,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
crc = kmalloc(sizeof(*crc), GFP_KERNEL);
if (!crc) {
- printk(KERN_ERR "PM: Failed to allocate crc\n");
+ pr_err("Failed to allocate crc\n");
ret = -ENOMEM;
goto out_clean;
}
@@ -1226,8 +1221,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
"image_decompress/%u", thr);
if (IS_ERR(data[thr].thr)) {
data[thr].thr = NULL;
- printk(KERN_ERR
- "PM: Cannot start decompression threads\n");
+ pr_err("Cannot start decompression threads\n");
ret = -ENOMEM;
goto out_clean;
}
@@ -1249,7 +1243,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32");
if (IS_ERR(crc->thr)) {
crc->thr = NULL;
- printk(KERN_ERR "PM: Cannot start CRC32 thread\n");
+ pr_err("Cannot start CRC32 thread\n");
ret = -ENOMEM;
goto out_clean;
}
@@ -1274,8 +1268,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
if (!page[i]) {
if (i < LZO_CMP_PAGES) {
ring_size = i;
- printk(KERN_ERR
- "PM: Failed to allocate LZO pages\n");
+ pr_err("Failed to allocate LZO pages\n");
ret = -ENOMEM;
goto out_clean;
} else {
@@ -1285,10 +1278,9 @@ static int load_image_lzo(struct swap_map_handle *handle,
}
want = ring_size = i;
- printk(KERN_INFO
- "PM: Using %u thread(s) for decompression.\n"
- "PM: Loading and decompressing image data (%u pages)...\n",
- nr_threads, nr_to_read);
+ pr_info("Using %u thread(s) for decompression\n", nr_threads);
+ pr_info("Loading and decompressing image data (%u pages)...\n",
+ nr_to_read);
m = nr_to_read / 10;
if (!m)
m = 1;
@@ -1348,8 +1340,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
if (unlikely(!data[thr].cmp_len ||
data[thr].cmp_len >
lzo1x_worst_compress(LZO_UNC_SIZE))) {
- printk(KERN_ERR
- "PM: Invalid LZO compressed length\n");
+ pr_err("Invalid LZO compressed length\n");
ret = -1;
goto out_finish;
}
@@ -1400,16 +1391,14 @@ static int load_image_lzo(struct swap_map_handle *handle,
ret = data[thr].ret;
if (ret < 0) {
- printk(KERN_ERR
- "PM: LZO decompression failed\n");
+ pr_err("LZO decompression failed\n");
goto out_finish;
}
if (unlikely(!data[thr].unc_len ||
data[thr].unc_len > LZO_UNC_SIZE ||
data[thr].unc_len & (PAGE_SIZE - 1))) {
- printk(KERN_ERR
- "PM: Invalid LZO uncompressed length\n");
+ pr_err("Invalid LZO uncompressed length\n");
ret = -1;
goto out_finish;
}
@@ -1420,10 +1409,8 @@ static int load_image_lzo(struct swap_map_handle *handle,
data[thr].unc + off, PAGE_SIZE);
if (!(nr_pages % m))
- printk(KERN_INFO
- "PM: Image loading progress: "
- "%3d%%\n",
- nr_pages / m * 10);
+ pr_info("Image loading progress: %3d%%\n",
+ nr_pages / m * 10);
nr_pages++;
ret = snapshot_write_next(snapshot);
@@ -1448,15 +1435,14 @@ out_finish:
}
stop = ktime_get();
if (!ret) {
- printk(KERN_INFO "PM: Image loading done.\n");
+ pr_info("Image loading done\n");
snapshot_write_finalize(snapshot);
if (!snapshot_image_loaded(snapshot))
ret = -ENODATA;
if (!ret) {
if (swsusp_header->flags & SF_CRC32_MODE) {
if(handle->crc32 != swsusp_header->crc32) {
- printk(KERN_ERR
- "PM: Invalid image CRC32!\n");
+ pr_err("Invalid image CRC32!\n");
ret = -ENODATA;
}
}
@@ -1513,9 +1499,9 @@ int swsusp_read(unsigned int *flags_p)
swap_reader_finish(&handle);
end:
if (!error)
- pr_debug("PM: Image successfully loaded\n");
+ pr_debug("Image successfully loaded\n");
else
- pr_debug("PM: Error %d resuming\n", error);
+ pr_debug("Error %d resuming\n", error);
return error;
}
@@ -1552,13 +1538,13 @@ put:
if (error)
blkdev_put(hib_resume_bdev, FMODE_READ);
else
- pr_debug("PM: Image signature found, resuming\n");
+ pr_debug("Image signature found, resuming\n");
} else {
error = PTR_ERR(hib_resume_bdev);
}
if (error)
- pr_debug("PM: Image not found (code %d)\n", error);
+ pr_debug("Image not found (code %d)\n", error);
return error;
}
@@ -1570,7 +1556,7 @@ put:
void swsusp_close(fmode_t mode)
{
if (IS_ERR(hib_resume_bdev)) {
- pr_debug("PM: Image device not initialised\n");
+ pr_debug("Image device not initialised\n");
return;
}
@@ -1594,7 +1580,7 @@ int swsusp_unmark(void)
swsusp_resume_block,
swsusp_header, NULL);
} else {
- printk(KERN_ERR "PM: Cannot find swsusp signature!\n");
+ pr_err("Cannot find swsusp signature!\n");
error = -ENODEV;
}
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 512f7c2baedd..5d81206a572d 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2190,7 +2190,7 @@ again:
}
if (console_seq < log_first_seq) {
- len = sprintf(text, "** %u printk messages dropped ** ",
+ len = sprintf(text, "** %u printk messages dropped **\n",
(unsigned)(log_first_seq - console_seq));
/* messages are gone, move to first one */
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index 3cdaeaef9ce1..3e3c2004bb23 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -39,7 +39,7 @@
* There are situations when we want to make sure that all buffers
* were handled or when IRQs are blocked.
*/
-static int printk_safe_irq_ready;
+static int printk_safe_irq_ready __read_mostly;
#define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) - \
sizeof(atomic_t) - \
@@ -63,11 +63,8 @@ static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq);
/* Get flushed in a more safe context. */
static void queue_flush_work(struct printk_safe_seq_buf *s)
{
- if (printk_safe_irq_ready) {
- /* Make sure that IRQ work is really initialized. */
- smp_rmb();
+ if (printk_safe_irq_ready)
irq_work_queue(&s->work);
- }
}
/*
@@ -75,7 +72,7 @@ static void queue_flush_work(struct printk_safe_seq_buf *s)
* have dedicated buffers, because otherwise printk-safe preempted by
* NMI-printk would have overwritten the NMI messages.
*
- * The messages are fushed from irq work (or from panic()), possibly,
+ * The messages are flushed from irq work (or from panic()), possibly,
* from other CPU, concurrently with printk_safe_log_store(). Should this
* happen, printk_safe_log_store() will notice the buffer->len mismatch
* and repeat the write.
@@ -398,8 +395,12 @@ void __init printk_safe_init(void)
#endif
}
- /* Make sure that IRQ works are initialized before enabling. */
- smp_wmb();
+	/*
+	 * In the highly unlikely event that an NMI triggers at this
+	 * moment, make sure the IRQ work is set up before this
+	 * variable is set.
+	 */
+ barrier();
printk_safe_irq_ready = 1;
/* Flush pending messages that did not have scheduled IRQ works. */
diff --git a/kernel/reboot.c b/kernel/reboot.c
index bd30a973fe94..e4ced883d8de 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -104,6 +104,33 @@ int unregister_reboot_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL(unregister_reboot_notifier);
+static void devm_unregister_reboot_notifier(struct device *dev, void *res)
+{
+ WARN_ON(unregister_reboot_notifier(*(struct notifier_block **)res));
+}
+
+int devm_register_reboot_notifier(struct device *dev, struct notifier_block *nb)
+{
+ struct notifier_block **rcnb;
+ int ret;
+
+ rcnb = devres_alloc(devm_unregister_reboot_notifier,
+ sizeof(*rcnb), GFP_KERNEL);
+ if (!rcnb)
+ return -ENOMEM;
+
+ ret = register_reboot_notifier(nb);
+ if (!ret) {
+ *rcnb = nb;
+ devres_add(dev, rcnb);
+ } else {
+ devres_free(rcnb);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(devm_register_reboot_notifier);
+
/*
* Notifier list for kernel code which wants to be called
* to restart the system.
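
The devm_register_reboot_notifier() helper added above ties a reboot notifier's lifetime to a struct device, so a driver needs no explicit unregister in its remove path. A minimal usage sketch; the driver, callback and notifier names are hypothetical and only illustrate the call:

#include <linux/notifier.h>
#include <linux/platform_device.h>
#include <linux/reboot.h>

static int example_reboot_cb(struct notifier_block *nb,
			     unsigned long action, void *data)
{
	/* quiesce the (hypothetical) hardware before restart/poweroff */
	return NOTIFY_DONE;
}

static struct notifier_block example_reboot_nb = {
	.notifier_call = example_reboot_cb,
};

static int example_probe(struct platform_device *pdev)
{
	/* devres unregisters the notifier automatically when the device is unbound */
	return devm_register_reboot_notifier(&pdev->dev, &example_reboot_nb);
}
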
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5b82a0073532..75554f366fd3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -16,6 +16,7 @@
#include <linux/init_task.h>
#include <linux/context_tracking.h>
#include <linux/rcupdate_wait.h>
+#include <linux/compat.h>
#include <linux/blkdev.h>
#include <linux/kprobes.h>
@@ -5107,13 +5108,11 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
* Return: On success, 0 and the timeslice is in @interval. Otherwise,
* an error code.
*/
-SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
- struct timespec __user *, interval)
+static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
{
struct task_struct *p;
unsigned int time_slice;
struct rq_flags rf;
- struct timespec t;
struct rq *rq;
int retval;
@@ -5137,15 +5136,40 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
task_rq_unlock(rq, p, &rf);
rcu_read_unlock();
- jiffies_to_timespec(time_slice, &t);
- retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
- return retval;
+ jiffies_to_timespec64(time_slice, t);
+ return 0;
out_unlock:
rcu_read_unlock();
return retval;
}
+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
+ struct timespec __user *, interval)
+{
+ struct timespec64 t;
+ int retval = sched_rr_get_interval(pid, &t);
+
+ if (retval == 0)
+ retval = put_timespec64(&t, interval);
+
+ return retval;
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
+ compat_pid_t, pid,
+ struct compat_timespec __user *, interval)
+{
+ struct timespec64 t;
+ int retval = sched_rr_get_interval(pid, &t);
+
+ if (retval == 0)
+ retval = compat_put_timespec64(&t, interval);
+ return retval;
+}
+#endif
+
void sched_show_task(struct task_struct *p)
{
unsigned long free = 0;
@@ -6620,7 +6644,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
return ret;
}
-static int cpu_stats_show(struct seq_file *sf, void *v)
+static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
{
struct task_group *tg = css_tg(seq_css(sf));
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
@@ -6660,7 +6684,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
}
#endif /* CONFIG_RT_GROUP_SCHED */
-static struct cftype cpu_files[] = {
+static struct cftype cpu_legacy_files[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED
{
.name = "shares",
@@ -6681,7 +6705,7 @@ static struct cftype cpu_files[] = {
},
{
.name = "stat",
- .seq_show = cpu_stats_show,
+ .seq_show = cpu_cfs_stat_show,
},
#endif
#ifdef CONFIG_RT_GROUP_SCHED
@@ -6699,16 +6723,182 @@ static struct cftype cpu_files[] = {
{ } /* Terminate */
};
+static int cpu_extra_stat_show(struct seq_file *sf,
+ struct cgroup_subsys_state *css)
+{
+#ifdef CONFIG_CFS_BANDWIDTH
+ {
+ struct task_group *tg = css_tg(css);
+ struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+ u64 throttled_usec;
+
+ throttled_usec = cfs_b->throttled_time;
+ do_div(throttled_usec, NSEC_PER_USEC);
+
+ seq_printf(sf, "nr_periods %d\n"
+ "nr_throttled %d\n"
+ "throttled_usec %llu\n",
+ cfs_b->nr_periods, cfs_b->nr_throttled,
+ throttled_usec);
+ }
+#endif
+ return 0;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct task_group *tg = css_tg(css);
+ u64 weight = scale_load_down(tg->shares);
+
+ return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
+}
+
+static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 weight)
+{
+ /*
+ * cgroup weight knobs should use the common MIN, DFL and MAX
+ * values which are 1, 100 and 10000 respectively. While it loses
+ * a bit of range on both ends, it maps pretty well onto the shares
+ * value used by scheduler and the round-trip conversions preserve
+ * the original value over the entire range.
+ */
+ if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
+ return -ERANGE;
+
+ weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
+
+ return sched_group_set_shares(css_tg(css), scale_load(weight));
+}
+
+static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ unsigned long weight = scale_load_down(css_tg(css)->shares);
+ int last_delta = INT_MAX;
+ int prio, delta;
+
+ /* find the closest nice value to the current weight */
+ for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) {
+ delta = abs(sched_prio_to_weight[prio] - weight);
+ if (delta >= last_delta)
+ break;
+ last_delta = delta;
+ }
+
+ return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO);
+}
+
+static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft, s64 nice)
+{
+ unsigned long weight;
+
+ if (nice < MIN_NICE || nice > MAX_NICE)
+ return -ERANGE;
+
+ weight = sched_prio_to_weight[NICE_TO_PRIO(nice) - MAX_RT_PRIO];
+ return sched_group_set_shares(css_tg(css), scale_load(weight));
+}
+#endif
+
+static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
+ long period, long quota)
+{
+ if (quota < 0)
+ seq_puts(sf, "max");
+ else
+ seq_printf(sf, "%ld", quota);
+
+ seq_printf(sf, " %ld\n", period);
+}
+
+/* caller should put the current value in *@periodp before calling */
+static int __maybe_unused cpu_period_quota_parse(char *buf,
+ u64 *periodp, u64 *quotap)
+{
+ char tok[21]; /* U64_MAX */
+
+ if (!sscanf(buf, "%s %llu", tok, periodp))
+ return -EINVAL;
+
+ *periodp *= NSEC_PER_USEC;
+
+ if (sscanf(tok, "%llu", quotap))
+ *quotap *= NSEC_PER_USEC;
+ else if (!strcmp(tok, "max"))
+ *quotap = RUNTIME_INF;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+static int cpu_max_show(struct seq_file *sf, void *v)
+{
+ struct task_group *tg = css_tg(seq_css(sf));
+
+ cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
+ return 0;
+}
+
+static ssize_t cpu_max_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct task_group *tg = css_tg(of_css(of));
+ u64 period = tg_get_cfs_period(tg);
+ u64 quota;
+ int ret;
+
+ ret = cpu_period_quota_parse(buf, &period, &quota);
+ if (!ret)
+ ret = tg_set_cfs_bandwidth(tg, period, quota);
+ return ret ?: nbytes;
+}
+#endif
+
+static struct cftype cpu_files[] = {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ {
+ .name = "weight",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = cpu_weight_read_u64,
+ .write_u64 = cpu_weight_write_u64,
+ },
+ {
+ .name = "weight.nice",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_s64 = cpu_weight_nice_read_s64,
+ .write_s64 = cpu_weight_nice_write_s64,
+ },
+#endif
+#ifdef CONFIG_CFS_BANDWIDTH
+ {
+ .name = "max",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cpu_max_show,
+ .write = cpu_max_write,
+ },
+#endif
+ { } /* terminate */
+};
+
struct cgroup_subsys cpu_cgrp_subsys = {
.css_alloc = cpu_cgroup_css_alloc,
.css_online = cpu_cgroup_css_online,
.css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
+ .css_extra_stat_show = cpu_extra_stat_show,
.fork = cpu_cgroup_fork,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
- .legacy_cftypes = cpu_files,
+ .legacy_cftypes = cpu_legacy_files,
+ .dfl_cftypes = cpu_files,
.early_init = true,
+ .threaded = true,
};
#endif /* CONFIG_CGROUP_SCHED */
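
The new cpu.weight file maps cgroup v2's 1..10000 range (default 100) onto the legacy shares scale (default 1024) with DIV_ROUND_CLOSEST_ULL in both directions, so common values survive the round trip. A standalone sketch of just the arithmetic, written as plain user-space C and ignoring the scale_load()/scale_load_down() fixed-point shifting:

#include <stdio.h>

#define CGROUP_WEIGHT_DFL 100ULL

/* DIV_ROUND_CLOSEST_ULL(x, d) is (x + d / 2) / d */
static unsigned long long shares_to_weight(unsigned long long shares)
{
	return (shares * CGROUP_WEIGHT_DFL + 512) / 1024;
}

static unsigned long long weight_to_shares(unsigned long long weight)
{
	return (weight * 1024 + CGROUP_WEIGHT_DFL / 2) / CGROUP_WEIGHT_DFL;
}

int main(void)
{
	/* defaults: 1024 shares <-> weight 100 */
	printf("1024 shares -> weight %llu\n", shares_to_weight(1024));
	printf("weight 100  -> %llu shares\n", weight_to_shares(100));
	return 0;
}
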
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
deleted file mode 100644
index a8358a57a316..000000000000
--- a/kernel/sched/cpuacct.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifdef CONFIG_CGROUP_CPUACCT
-
-extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
-extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
-
-#else
-
-static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime)
-{
-}
-
-static inline void
-cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
-{
-}
-
-#endif
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index ba0da243fdd8..2f52ec0f1539 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -282,8 +282,12 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
* Do not reduce the frequency if the CPU has not been idle
* recently, as the reduction is likely to be premature then.
*/
- if (busy && next_f < sg_policy->next_freq)
+ if (busy && next_f < sg_policy->next_freq) {
next_f = sg_policy->next_freq;
+
+ /* Reset cached freq as next_freq has changed */
+ sg_policy->cached_raw_freq = 0;
+ }
}
sugov_update_commit(sg_policy, time, next_f);
}
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 9be8b68a66da..bac6ac9a4ec7 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -109,7 +109,7 @@ static inline void task_group_account_field(struct task_struct *p, int index,
*/
__this_cpu_add(kernel_cpustat.cpustat[index], tmp);
- cpuacct_account_field(p, index, tmp);
+ cgroup_account_cputime_field(p, index, tmp);
}
/*
@@ -446,6 +446,13 @@ void vtime_account_irq_enter(struct task_struct *tsk)
EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
#endif /* __ARCH_HAS_VTIME_ACCOUNT */
+void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
+ u64 *ut, u64 *st)
+{
+ *ut = curr->utime;
+ *st = curr->stime;
+}
+
void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
*ut = p->utime;
@@ -584,9 +591,8 @@ drop_precision:
*
* Assuming that rtime_i+1 >= rtime_i.
*/
-static void cputime_adjust(struct task_cputime *curr,
- struct prev_cputime *prev,
- u64 *ut, u64 *st)
+void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
+ u64 *ut, u64 *st)
{
u64 rtime, stime, utime;
unsigned long flags;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index f349f7e98dec..2473736c7616 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1144,7 +1144,7 @@ static void update_curr_dl(struct rq *rq)
account_group_exec_runtime(curr, delta_exec);
curr->se.exec_start = rq_clock_task(rq);
- cpuacct_charge(curr, delta_exec);
+ cgroup_account_cputime(curr, delta_exec);
sched_rt_avg_update(rq, delta_exec);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0989676c50e9..4037e19bbca2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -844,7 +844,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
struct task_struct *curtask = task_of(curr);
trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
- cpuacct_charge(curtask, delta_exec);
+ cgroup_account_cputime(curtask, delta_exec);
account_group_exec_runtime(curtask, delta_exec);
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index d8c43d73e078..4056c19ca3f0 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -969,7 +969,7 @@ static void update_curr_rt(struct rq *rq)
account_group_exec_runtime(curr, delta_exec);
curr->se.exec_start = rq_clock_task(rq);
- cpuacct_charge(curr, delta_exec);
+ cgroup_account_cputime(curr, delta_exec);
sched_rt_avg_update(rq, delta_exec);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 45ab0bf564e7..b19552a212de 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -30,6 +30,7 @@
#include <linux/irq_work.h>
#include <linux/tick.h>
#include <linux/slab.h>
+#include <linux/cgroup.h>
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
@@ -37,7 +38,6 @@
#include "cpupri.h"
#include "cpudeadline.h"
-#include "cpuacct.h"
#ifdef CONFIG_SCHED_DEBUG
# define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 45caf90b24cd..210b1f2146ff 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -72,7 +72,7 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
account_group_exec_runtime(curr, delta_exec);
curr->se.exec_start = rq_clock_task(rq);
- cpuacct_charge(curr, delta_exec);
+ cgroup_account_cputime(curr, delta_exec);
}
static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index f8159698aa4d..84cb3acd9260 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -183,7 +183,7 @@ static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mo
*/
static __sched
int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry,
- int (*action)(atomic_t *), unsigned mode)
+ wait_atomic_t_action_f action, unsigned int mode)
{
atomic_t *val;
int ret = 0;
@@ -193,7 +193,7 @@ int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_en
val = wbq_entry->key.flags;
if (atomic_read(val) == 0)
break;
- ret = (*action)(val);
+ ret = (*action)(val, mode);
} while (!ret && atomic_read(val) != 0);
finish_wait(wq_head, &wbq_entry->wq_entry);
return ret;
@@ -210,8 +210,9 @@ int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_en
}, \
}
-__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *),
- unsigned mode)
+__sched int out_of_line_wait_on_atomic_t(atomic_t *p,
+ wait_atomic_t_action_f action,
+ unsigned int mode)
{
struct wait_queue_head *wq_head = atomic_t_waitqueue(p);
DEFINE_WAIT_ATOMIC_T(wq_entry, p);
@@ -220,6 +221,15 @@ __sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *),
}
EXPORT_SYMBOL(out_of_line_wait_on_atomic_t);
+__sched int atomic_t_wait(atomic_t *counter, unsigned int mode)
+{
+ schedule();
+ if (signal_pending_state(mode, current))
+ return -EINTR;
+ return 0;
+}
+EXPORT_SYMBOL(atomic_t_wait);
+
/**
* wake_up_atomic_t - Wake up a waiter on a atomic_t
* @p: The atomic_t being waited on, a kernel virtual address
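
Since the wait action now receives the sleep mode, the exported atomic_t_wait() above can serve as a stock action for wait_on_atomic_t() callers that only need to schedule and honour signals. A minimal caller sketch; the counter and function are hypothetical:

#include <linux/atomic.h>
#include <linux/sched.h>
#include <linux/wait_bit.h>

static int example_wait_for_users(atomic_t *users)
{
	/* sleeps until *users reaches zero; returns -EINTR on a fatal signal */
	return wait_on_atomic_t(users, atomic_t_wait, TASK_KILLABLE);
}
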
diff --git a/kernel/signal.c b/kernel/signal.c
index 8dcd8825b2de..9558664bd9ec 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -78,7 +78,7 @@ static int sig_task_ignored(struct task_struct *t, int sig, bool force)
handler = sig_handler(t, sig);
if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) &&
- handler == SIG_DFL && !force)
+ handler == SIG_DFL && !(force && sig_kernel_only(sig)))
return 1;
return sig_handler_ignored(handler, sig);
@@ -94,13 +94,15 @@ static int sig_ignored(struct task_struct *t, int sig, bool force)
if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
return 0;
- if (!sig_task_ignored(t, sig, force))
- return 0;
-
/*
- * Tracers may want to know about even ignored signals.
+	 * Tracers may want to know about even ignored signals, unless it
+	 * is SIGKILL, which can't be reported anyway but can be ignored
+	 * by a SIGNAL_UNKILLABLE task.
*/
- return !t->ptrace;
+ if (t->ptrace && sig != SIGKILL)
+ return 0;
+
+ return sig_task_ignored(t, sig, force);
}
/*
@@ -929,9 +931,9 @@ static void complete_signal(int sig, struct task_struct *p, int group)
* then start taking the whole group down immediately.
*/
if (sig_fatal(p, sig) &&
- !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
+ !(signal->flags & SIGNAL_GROUP_EXIT) &&
!sigismember(&t->real_blocked, sig) &&
- (sig == SIGKILL || !t->ptrace)) {
+ (sig == SIGKILL || !p->ptrace)) {
/*
* This signal will be fatal to the whole group.
*/
@@ -1036,8 +1038,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
else
override_rlimit = 0;
- q = __sigqueue_alloc(sig, t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE,
- override_rlimit);
+ q = __sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit);
if (q) {
list_add_tail(&q->list, &pending->list);
switch ((unsigned long) info) {
@@ -2600,7 +2601,6 @@ SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset,
COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset,
compat_sigset_t __user *, oset, compat_size_t, sigsetsize)
{
-#ifdef __BIG_ENDIAN
sigset_t old_set = current->blocked;
/* XXX: Don't preclude handling different sized sigset_t's. */
@@ -2608,38 +2608,22 @@ COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset,
return -EINVAL;
if (nset) {
- compat_sigset_t new32;
sigset_t new_set;
int error;
- if (copy_from_user(&new32, nset, sizeof(compat_sigset_t)))
+ if (get_compat_sigset(&new_set, nset))
return -EFAULT;
-
- sigset_from_compat(&new_set, &new32);
sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
error = sigprocmask(how, &new_set, NULL);
if (error)
return error;
}
- if (oset) {
- compat_sigset_t old32;
- sigset_to_compat(&old32, &old_set);
- if (copy_to_user(oset, &old32, sizeof(compat_sigset_t)))
- return -EFAULT;
- }
- return 0;
-#else
- return sys_rt_sigprocmask(how, (sigset_t __user *)nset,
- (sigset_t __user *)oset, sigsetsize);
-#endif
+ return oset ? put_compat_sigset(oset, &old_set, sizeof(*oset)) : 0;
}
#endif
-static int do_sigpending(void *set, unsigned long sigsetsize)
+static int do_sigpending(sigset_t *set)
{
- if (sigsetsize > sizeof(sigset_t))
- return -EINVAL;
-
spin_lock_irq(&current->sighand->siglock);
sigorsets(set, &current->pending.signal,
&current->signal->shared_pending.signal);
@@ -2659,7 +2643,12 @@ static int do_sigpending(void *set, unsigned long sigsetsize)
SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize)
{
sigset_t set;
- int err = do_sigpending(&set, sigsetsize);
+ int err;
+
+ if (sigsetsize > sizeof(*uset))
+ return -EINVAL;
+
+ err = do_sigpending(&set);
if (!err && copy_to_user(uset, &set, sigsetsize))
err = -EFAULT;
return err;
@@ -2669,20 +2658,16 @@ SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize)
COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset,
compat_size_t, sigsetsize)
{
-#ifdef __BIG_ENDIAN
sigset_t set;
- int err = do_sigpending(&set, sigsetsize);
- if (!err) {
- compat_sigset_t set32;
- sigset_to_compat(&set32, &set);
- /* we can get here only if sigsetsize <= sizeof(set) */
- if (copy_to_user(uset, &set32, sigsetsize))
- err = -EFAULT;
- }
+ int err;
+
+ if (sigsetsize > sizeof(*uset))
+ return -EINVAL;
+
+ err = do_sigpending(&set);
+ if (!err)
+ err = put_compat_sigset(uset, &set, sigsetsize);
return err;
-#else
- return sys_rt_sigpending((sigset_t __user *)uset, sigsetsize);
-#endif
}
#endif
@@ -2916,7 +2901,6 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
struct compat_siginfo __user *, uinfo,
struct compat_timespec __user *, uts, compat_size_t, sigsetsize)
{
- compat_sigset_t s32;
sigset_t s;
struct timespec t;
siginfo_t info;
@@ -2925,9 +2909,8 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
- if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
+ if (get_compat_sigset(&s, uthese))
return -EFAULT;
- sigset_from_compat(&s, &s32);
if (uts) {
if (compat_get_timespec(&t, uts))
@@ -3345,15 +3328,11 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32)
{
-#ifdef __BIG_ENDIAN
sigset_t set;
- int err = do_sigpending(&set, sizeof(set.sig[0]));
+ int err = do_sigpending(&set);
if (!err)
err = put_user(set.sig[0], set32);
return err;
-#else
- return sys_rt_sigpending((sigset_t __user *)set32, sizeof(*set32));
-#endif
}
#endif
@@ -3451,7 +3430,6 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
compat_size_t, sigsetsize)
{
struct k_sigaction new_ka, old_ka;
- compat_sigset_t mask;
#ifdef __ARCH_HAS_SA_RESTORER
compat_uptr_t restorer;
#endif
@@ -3469,19 +3447,18 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
ret |= get_user(restorer, &act->sa_restorer);
new_ka.sa.sa_restorer = compat_ptr(restorer);
#endif
- ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask));
+ ret |= get_compat_sigset(&new_ka.sa.sa_mask, &act->sa_mask);
ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags);
if (ret)
return -EFAULT;
- sigset_from_compat(&new_ka.sa.sa_mask, &mask);
}
ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
if (!ret && oact) {
- sigset_to_compat(&mask, &old_ka.sa.sa_mask);
ret = put_user(ptr_to_compat(old_ka.sa.sa_handler),
&oact->sa_handler);
- ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask));
+ ret |= put_compat_sigset(&oact->sa_mask, &old_ka.sa.sa_mask,
+ sizeof(oact->sa_mask));
ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags);
#ifdef __ARCH_HAS_SA_RESTORER
ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer),
@@ -3661,22 +3638,15 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize)
{
-#ifdef __BIG_ENDIAN
sigset_t newset;
- compat_sigset_t newset32;
/* XXX: Don't preclude handling different sized sigset_t's. */
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
- if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
+ if (get_compat_sigset(&newset, unewset))
return -EFAULT;
- sigset_from_compat(&newset, &newset32);
return sigsuspend(&newset);
-#else
- /* on little-endian bitmaps don't care about granularity */
- return sys_rt_sigsuspend((sigset_t __user *)unewset, sigsetsize);
-#endif
}
#endif
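
The signal.c hunks above all apply the same transformation: the open-coded copy_from_user()/sigset_from_compat() and sigset_to_compat()/copy_to_user() pairs, together with the big-endian special casing, collapse into get_compat_sigset() and put_compat_sigset(). A condensed sketch of the resulting shape in a hypothetical compat syscall, not one of the syscalls touched in this diff:

#include <linux/compat.h>
#include <linux/signal.h>
#include <linux/syscalls.h>

COMPAT_SYSCALL_DEFINE2(example_sigmask, compat_sigset_t __user *, uset,
		       compat_size_t, sigsetsize)
{
	sigset_t set;

	if (sigsetsize != sizeof(sigset_t))
		return -EINVAL;

	if (get_compat_sigset(&set, uset))		/* endian-aware copy in */
		return -EFAULT;

	/* ... operate on 'set' ... */

	return put_compat_sigset(uset, &set, sizeof(*uset));	/* copy out */
}
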
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 662f7b1b7a78..2f5e87f1bae2 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -486,16 +486,6 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
}
EXPORT_SYMBOL(__tasklet_hi_schedule);
-void __tasklet_hi_schedule_first(struct tasklet_struct *t)
-{
- lockdep_assert_irqs_disabled();
-
- t->next = __this_cpu_read(tasklet_hi_vec.head);
- __this_cpu_write(tasklet_hi_vec.head, t);
- __raise_softirq_irqoff(HI_SOFTIRQ);
-}
-EXPORT_SYMBOL(__tasklet_hi_schedule_first);
-
static __latent_entropy void tasklet_action(struct softirq_action *a)
{
struct tasklet_struct *list;
diff --git a/kernel/sys.c b/kernel/sys.c
index 524a4cb9bbe2..83ffd7dccf23 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -111,6 +111,12 @@
#ifndef SET_FP_MODE
# define SET_FP_MODE(a,b) (-EINVAL)
#endif
+#ifndef SVE_SET_VL
+# define SVE_SET_VL(a) (-EINVAL)
+#endif
+#ifndef SVE_GET_VL
+# define SVE_GET_VL() (-EINVAL)
+#endif
/*
* this is where the system-wide overflow UID and GID are defined, for
@@ -2386,6 +2392,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_GET_FP_MODE:
error = GET_FP_MODE(me);
break;
+ case PR_SVE_SET_VL:
+ error = SVE_SET_VL(arg2);
+ break;
+ case PR_SVE_GET_VL:
+ error = SVE_GET_VL();
+ break;
default:
error = -EINVAL;
break;
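
PR_SVE_SET_VL and PR_SVE_GET_VL are the arm64 Scalable Vector Extension vector-length controls; the SVE_SET_VL()/SVE_GET_VL() fallbacks added above keep returning -EINVAL on architectures that do not provide them. A user-space sketch of the calls, assuming an SVE-capable arm64 kernel and headers that define the PR_SVE_* constants (the requested length is only an example):

#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
	int ret;

	/* request a 256-bit (32-byte) vector length for this thread */
	if (prctl(PR_SVE_SET_VL, 32) == -1)
		perror("PR_SVE_SET_VL");

	ret = prctl(PR_SVE_GET_VL);
	if (ret == -1)
		perror("PR_SVE_GET_VL");
	else
		printf("vector length: %d bytes\n", ret & PR_SVE_VL_LEN_MASK);

	return 0;
}
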
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d9c31bc2eaea..557d46728577 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -30,7 +30,6 @@
#include <linux/proc_fs.h>
#include <linux/security.h>
#include <linux/ctype.h>
-#include <linux/kmemcheck.h>
#include <linux/kmemleak.h>
#include <linux/fs.h>
#include <linux/init.h>
@@ -67,6 +66,7 @@
#include <linux/kexec.h>
#include <linux/bpf.h>
#include <linux/mount.h>
+#include <linux/pipe_fs_i.h>
#include <linux/uaccess.h>
#include <asm/processor.h>
@@ -1174,15 +1174,6 @@ static struct ctl_table kern_table[] = {
.extra2 = &one_thousand,
},
#endif
-#ifdef CONFIG_KMEMCHECK
- {
- .procname = "kmemcheck",
- .data = &kmemcheck_enabled,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
{
.procname = "panic_on_warn",
.data = &panic_on_warn,
@@ -1342,11 +1333,6 @@ static struct ctl_table vm_table[] = {
.extra1 = &zero,
},
{
- .procname = "nr_pdflush_threads",
- .mode = 0444 /* read-only */,
- .proc_handler = pdflush_proc_obsolete,
- },
- {
.procname = "swappiness",
.data = &vm_swappiness,
.maxlen = sizeof(vm_swappiness),
@@ -1371,6 +1357,15 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = &hugetlb_mempolicy_sysctl_handler,
},
+ {
+ .procname = "numa_stat",
+ .data = &sysctl_vm_numa_stat,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = sysctl_vm_numa_stat_handler,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
#endif
{
.procname = "hugetlb_shm_group",
@@ -1822,7 +1817,7 @@ static struct ctl_table fs_table[] = {
{
.procname = "pipe-max-size",
.data = &pipe_max_size,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(pipe_max_size),
.mode = 0644,
.proc_handler = &pipe_proc_fn,
.extra1 = &pipe_min_size,
@@ -2581,12 +2576,13 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp,
if (write) {
unsigned int val = *lvalp;
+ if (*lvalp > UINT_MAX)
+ return -EINVAL;
+
if ((param->min && *param->min > val) ||
(param->max && *param->max < val))
return -ERANGE;
- if (*lvalp > UINT_MAX)
- return -EINVAL;
*valp = val;
} else {
unsigned int val = *valp;
@@ -2626,6 +2622,48 @@ int proc_douintvec_minmax(struct ctl_table *table, int write,
do_proc_douintvec_minmax_conv, &param);
}
+struct do_proc_dopipe_max_size_conv_param {
+ unsigned int *min;
+};
+
+static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data)
+{
+ struct do_proc_dopipe_max_size_conv_param *param = data;
+
+ if (write) {
+ unsigned int val;
+
+ if (*lvalp > UINT_MAX)
+ return -EINVAL;
+
+ val = round_pipe_size(*lvalp);
+ if (val == 0)
+ return -EINVAL;
+
+ if (param->min && *param->min > val)
+ return -ERANGE;
+
+ *valp = val;
+ } else {
+ unsigned int val = *valp;
+ *lvalp = (unsigned long) val;
+ }
+
+ return 0;
+}
+
+int proc_dopipe_max_size(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct do_proc_dopipe_max_size_conv_param param = {
+ .min = (unsigned int *) table->extra1,
+ };
+ return do_proc_douintvec(table, write, buffer, lenp, ppos,
+ do_proc_dopipe_max_size_conv, &param);
+}
+
static void validate_coredump_safety(void)
{
#ifdef CONFIG_COREDUMP
@@ -3089,14 +3127,12 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
else
bitmap_copy(bitmap, tmp_bitmap, bitmap_len);
}
- kfree(tmp_bitmap);
*lenp -= left;
*ppos += *lenp;
- return 0;
- } else {
- kfree(tmp_bitmap);
- return err;
}
+
+ kfree(tmp_bitmap);
+ return err;
}
#else /* CONFIG_PROC_SYSCTL */
@@ -3131,6 +3167,12 @@ int proc_douintvec_minmax(struct ctl_table *table, int write,
return -ENOSYS;
}
+int proc_dopipe_max_size(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
int proc_dointvec_jiffies(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
@@ -3174,6 +3216,7 @@ EXPORT_SYMBOL(proc_douintvec);
EXPORT_SYMBOL(proc_dointvec_jiffies);
EXPORT_SYMBOL(proc_dointvec_minmax);
EXPORT_SYMBOL_GPL(proc_douintvec_minmax);
+EXPORT_SYMBOL_GPL(proc_dopipe_max_size);
EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
EXPORT_SYMBOL(proc_dostring);
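
proc_dopipe_max_size() above folds the UINT_MAX check, the round_pipe_size() rounding and the lower bound taken from ->extra1 into do_proc_douintvec(); it exists for the pipe-max-size knob, but any table entry wanting that behaviour would be wired up roughly as below (the table and variables are hypothetical, not the pipe-max-size entry earlier in this patch):

#include <linux/sysctl.h>

static unsigned int example_size_min = 4096;
static unsigned int example_max_size = 1048576;

static struct ctl_table example_table[] = {
	{
		.procname	= "example-max-size",
		.data		= &example_max_size,
		.maxlen		= sizeof(example_max_size),
		.mode		= 0644,
		.proc_handler	= proc_dopipe_max_size,
		.extra1		= &example_size_min,	/* minimum accepted value */
	},
	{ }	/* terminate */
};
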
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index d689a9557e17..e776fc8cc1df 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -21,10 +21,6 @@ config CLOCKSOURCE_VALIDATE_LAST_CYCLE
config GENERIC_TIME_VSYSCALL
bool
-# Timekeeping vsyscall support
-config GENERIC_TIME_VSYSCALL_OLD
- bool
-
# Old style timekeeping
config ARCH_USES_GETTIMEOFFSET
bool
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 03918a19cf2d..65f9e3f24dde 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -171,7 +171,7 @@ void clocksource_mark_unstable(struct clocksource *cs)
spin_unlock_irqrestore(&watchdog_lock, flags);
}
-static void clocksource_watchdog(unsigned long data)
+static void clocksource_watchdog(struct timer_list *unused)
{
struct clocksource *cs;
u64 csnow, wdnow, cslast, wdlast, delta;
@@ -290,8 +290,7 @@ static inline void clocksource_start_watchdog(void)
{
if (watchdog_running || !watchdog || list_empty(&watchdog_list))
return;
- init_timer(&watchdog_timer);
- watchdog_timer.function = clocksource_watchdog;
+ timer_setup(&watchdog_timer, clocksource_watchdog, 0);
watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
watchdog_running = 1;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 198afa78bf69..cd03317e7b57 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -557,45 +557,6 @@ static void halt_fast_timekeeper(struct timekeeper *tk)
update_fast_timekeeper(&tkr_dummy, &tk_fast_raw);
}
-#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
-#warning Please contact your maintainers, as GENERIC_TIME_VSYSCALL_OLD compatibity will disappear soon.
-
-static inline void update_vsyscall(struct timekeeper *tk)
-{
- struct timespec xt, wm;
-
- xt = timespec64_to_timespec(tk_xtime(tk));
- wm = timespec64_to_timespec(tk->wall_to_monotonic);
- update_vsyscall_old(&xt, &wm, tk->tkr_mono.clock, tk->tkr_mono.mult,
- tk->tkr_mono.cycle_last);
-}
-
-static inline void old_vsyscall_fixup(struct timekeeper *tk)
-{
- s64 remainder;
-
- /*
- * Store only full nanoseconds into xtime_nsec after rounding
- * it up and add the remainder to the error difference.
- * XXX - This is necessary to avoid small 1ns inconsistnecies caused
- * by truncating the remainder in vsyscalls. However, it causes
- * additional work to be done in timekeeping_adjust(). Once
- * the vsyscall implementations are converted to use xtime_nsec
- * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD
- * users are removed, this can be killed.
- */
- remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1);
- if (remainder != 0) {
- tk->tkr_mono.xtime_nsec -= remainder;
- tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift;
- tk->ntp_error += remainder << tk->ntp_error_shift;
- tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift;
- }
-}
-#else
-#define old_vsyscall_fixup(tk)
-#endif
-
static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);
static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)
@@ -2164,12 +2125,6 @@ void update_wall_time(void)
timekeeping_adjust(tk, offset);
/*
- * XXX This can be killed once everyone converts
- * to the new update_vsyscall.
- */
- old_vsyscall_fixup(tk);
-
- /*
* Finally, make sure that after the rounding
* xtime_nsec isn't larger than NSEC_PER_SEC
*/
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index af0b8bae4502..ffebcf878fba 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -707,14 +707,18 @@ static inline void debug_timer_assert_init(struct timer_list *timer)
debug_object_assert_init(timer, &timer_debug_descr);
}
-static void do_init_timer(struct timer_list *timer, unsigned int flags,
+static void do_init_timer(struct timer_list *timer,
+ void (*func)(struct timer_list *),
+ unsigned int flags,
const char *name, struct lock_class_key *key);
-void init_timer_on_stack_key(struct timer_list *timer, unsigned int flags,
+void init_timer_on_stack_key(struct timer_list *timer,
+ void (*func)(struct timer_list *),
+ unsigned int flags,
const char *name, struct lock_class_key *key)
{
debug_object_init_on_stack(timer, &timer_debug_descr);
- do_init_timer(timer, flags, name, key);
+ do_init_timer(timer, func, flags, name, key);
}
EXPORT_SYMBOL_GPL(init_timer_on_stack_key);
@@ -755,10 +759,13 @@ static inline void debug_assert_init(struct timer_list *timer)
debug_timer_assert_init(timer);
}
-static void do_init_timer(struct timer_list *timer, unsigned int flags,
+static void do_init_timer(struct timer_list *timer,
+ void (*func)(struct timer_list *),
+ unsigned int flags,
const char *name, struct lock_class_key *key)
{
timer->entry.pprev = NULL;
+ timer->function = func;
timer->flags = flags | raw_smp_processor_id();
lockdep_init_map(&timer->lockdep_map, name, key, 0);
}
@@ -766,6 +773,7 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags,
/**
* init_timer_key - initialize a timer
* @timer: the timer to be initialized
+ * @func: timer callback function
* @flags: timer flags
* @name: name of the timer
* @key: lockdep class key of the fake lock used for tracking timer
@@ -774,11 +782,12 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags,
* init_timer_key() must be done to a timer prior calling *any* of the
* other timer functions.
*/
-void init_timer_key(struct timer_list *timer, unsigned int flags,
+void init_timer_key(struct timer_list *timer,
+ void (*func)(struct timer_list *), unsigned int flags,
const char *name, struct lock_class_key *key)
{
debug_init(timer);
- do_init_timer(timer, flags, name, key);
+ do_init_timer(timer, func, flags, name, key);
}
EXPORT_SYMBOL(init_timer_key);
@@ -1107,12 +1116,12 @@ EXPORT_SYMBOL(timer_reduce);
* add_timer - start a timer
* @timer: the timer to be added
*
- * The kernel will do a ->function(->data) callback from the
+ * The kernel will do a ->function(@timer) callback from the
* timer interrupt at the ->expires point in the future. The
* current time is 'jiffies'.
*
- * The timer's ->expires, ->function (and if the handler uses it, ->data)
- * fields must be set prior calling this function.
+ * The timer's ->expires, ->function fields must be set prior calling this
+ * function.
*
* Timers with an ->expires field in the past will be executed in the next
* timer tick.
@@ -1284,8 +1293,7 @@ int del_timer_sync(struct timer_list *timer)
EXPORT_SYMBOL(del_timer_sync);
#endif
-static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
- unsigned long data)
+static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list *))
{
int count = preempt_count();
@@ -1309,7 +1317,7 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
lock_map_acquire(&lockdep_map);
trace_timer_expire_entry(timer);
- fn(data);
+ fn(timer);
trace_timer_expire_exit(timer);
lock_map_release(&lockdep_map);
@@ -1331,8 +1339,7 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
{
while (!hlist_empty(head)) {
struct timer_list *timer;
- void (*fn)(unsigned long);
- unsigned long data;
+ void (*fn)(struct timer_list *);
timer = hlist_entry(head->first, struct timer_list, entry);
@@ -1340,15 +1347,14 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
detach_timer(timer, true);
fn = timer->function;
- data = timer->data;
if (timer->flags & TIMER_IRQSAFE) {
raw_spin_unlock(&base->lock);
- call_timer_fn(timer, fn, data);
+ call_timer_fn(timer, fn);
raw_spin_lock(&base->lock);
} else {
raw_spin_unlock_irq(&base->lock);
- call_timer_fn(timer, fn, data);
+ call_timer_fn(timer, fn);
raw_spin_lock_irq(&base->lock);
}
}
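
With expire_timers() now handing the timer itself to the callback, a timer owner embeds the struct timer_list in its own object and recovers that object with from_timer() (a container_of() wrapper) instead of carrying a cast pointer in the old ->data field. A minimal conversion sketch around a hypothetical structure:

#include <linux/jiffies.h>
#include <linux/timer.h>

struct example_dev {
	struct timer_list poll_timer;
	int pending;
};

static void example_poll(struct timer_list *t)
{
	/* from_timer() recovers the container from the timer member */
	struct example_dev *dev = from_timer(dev, t, poll_timer);

	dev->pending = 0;
}

static void example_start(struct example_dev *dev)
{
	/* replaces init_timer() plus manual ->function/->data assignment */
	timer_setup(&dev->poll_timer, example_poll, 0);
	mod_timer(&dev->poll_timer, jiffies + HZ);
}
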
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 0e7f5428a148..0ed768b56c60 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -389,7 +389,7 @@ static int __init init_timer_list_procfs(void)
{
struct proc_dir_entry *pe;
- pe = proc_create("timer_list", 0444, NULL, &timer_list_fops);
+ pe = proc_create("timer_list", 0400, NULL, &timer_list_fops);
if (!pe)
return -ENOMEM;
return 0;
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 434c840e2d82..af7dad126c13 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -160,6 +160,17 @@ config FUNCTION_GRAPH_TRACER
address on the current task structure into a stack of calls.
+config PREEMPTIRQ_EVENTS
+ bool "Enable trace events for preempt and irq disable/enable"
+ select TRACE_IRQFLAGS
+ depends on DEBUG_PREEMPT || !PROVE_LOCKING
+ default n
+ help
+ Enable tracing of disable and enable events for preemption and irqs.
+ For tracing preempt disable/enable events, DEBUG_PREEMPT must be
+ enabled. For tracing irq disable/enable events, PROVE_LOCKING must
+ be disabled.
+
config IRQSOFF_TRACER
bool "Interrupts-off Latency Tracer"
default n
@@ -224,7 +235,7 @@ config HWLAT_TRACER
select GENERIC_TRACER
help
This tracer, when enabled will create one or more kernel threads,
- depening on what the cpumask file is set to, which each thread
+ depending on what the cpumask file is set to, which each thread
spinning in a loop looking for interruptions caused by
something other than the kernel. For example, if a
System Management Interrupt (SMI) takes a noticeable amount of
@@ -239,7 +250,7 @@ config HWLAT_TRACER
iteration
A kernel thread is created that will spin with interrupts disabled
- for "width" microseconds in every "widow" cycle. It will not spin
+ for "width" microseconds in every "window" cycle. It will not spin
for "window - width" microseconds, where the system can
continue to operate.
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 19a15b2f1190..e2538c7638d4 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -35,6 +35,7 @@ obj-$(CONFIG_TRACING) += trace_printk.o
obj-$(CONFIG_TRACING_MAP) += tracing_map.o
obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
+obj-$(CONFIG_PREEMPTIRQ_EVENTS) += trace_irqsoff.o
obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 45a3928544ce..206e0e2ace53 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -66,7 +66,8 @@ static struct tracer_flags blk_tracer_flags = {
};
/* Global reference count of probes */
-static atomic_t blk_probes_ref = ATOMIC_INIT(0);
+static DEFINE_MUTEX(blk_probe_mutex);
+static int blk_probes_ref;
static void blk_register_tracepoints(void);
static void blk_unregister_tracepoints(void);
@@ -329,14 +330,29 @@ static void blk_trace_free(struct blk_trace *bt)
kfree(bt);
}
+static void get_probe_ref(void)
+{
+ mutex_lock(&blk_probe_mutex);
+ if (++blk_probes_ref == 1)
+ blk_register_tracepoints();
+ mutex_unlock(&blk_probe_mutex);
+}
+
+static void put_probe_ref(void)
+{
+ mutex_lock(&blk_probe_mutex);
+ if (!--blk_probes_ref)
+ blk_unregister_tracepoints();
+ mutex_unlock(&blk_probe_mutex);
+}
+
static void blk_trace_cleanup(struct blk_trace *bt)
{
blk_trace_free(bt);
- if (atomic_dec_and_test(&blk_probes_ref))
- blk_unregister_tracepoints();
+ put_probe_ref();
}
-int blk_trace_remove(struct request_queue *q)
+static int __blk_trace_remove(struct request_queue *q)
{
struct blk_trace *bt;
@@ -349,6 +365,17 @@ int blk_trace_remove(struct request_queue *q)
return 0;
}
+
+int blk_trace_remove(struct request_queue *q)
+{
+ int ret;
+
+ mutex_lock(&q->blk_trace_mutex);
+ ret = __blk_trace_remove(q);
+ mutex_unlock(&q->blk_trace_mutex);
+
+ return ret;
+}
EXPORT_SYMBOL_GPL(blk_trace_remove);
static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
@@ -538,8 +565,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
if (cmpxchg(&q->blk_trace, NULL, bt))
goto err;
- if (atomic_inc_return(&blk_probes_ref) == 1)
- blk_register_tracepoints();
+ get_probe_ref();
ret = 0;
err:
@@ -550,9 +576,8 @@ err:
return ret;
}
-int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
- struct block_device *bdev,
- char __user *arg)
+static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+ struct block_device *bdev, char __user *arg)
{
struct blk_user_trace_setup buts;
int ret;
@@ -571,6 +596,19 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
}
return 0;
}
+
+int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+ struct block_device *bdev,
+ char __user *arg)
+{
+ int ret;
+
+ mutex_lock(&q->blk_trace_mutex);
+ ret = __blk_trace_setup(q, name, dev, bdev, arg);
+ mutex_unlock(&q->blk_trace_mutex);
+
+ return ret;
+}
EXPORT_SYMBOL_GPL(blk_trace_setup);
#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
@@ -607,7 +645,7 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
}
#endif
-int blk_trace_startstop(struct request_queue *q, int start)
+static int __blk_trace_startstop(struct request_queue *q, int start)
{
int ret;
struct blk_trace *bt = q->blk_trace;
@@ -646,6 +684,17 @@ int blk_trace_startstop(struct request_queue *q, int start)
return ret;
}
+
+int blk_trace_startstop(struct request_queue *q, int start)
+{
+ int ret;
+
+ mutex_lock(&q->blk_trace_mutex);
+ ret = __blk_trace_startstop(q, start);
+ mutex_unlock(&q->blk_trace_mutex);
+
+ return ret;
+}
EXPORT_SYMBOL_GPL(blk_trace_startstop);
/*
@@ -676,7 +725,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
switch (cmd) {
case BLKTRACESETUP:
bdevname(bdev, b);
- ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
+ ret = __blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
break;
#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
case BLKTRACESETUP32:
@@ -687,10 +736,10 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
case BLKTRACESTART:
start = 1;
case BLKTRACESTOP:
- ret = blk_trace_startstop(q, start);
+ ret = __blk_trace_startstop(q, start);
break;
case BLKTRACETEARDOWN:
- ret = blk_trace_remove(q);
+ ret = __blk_trace_remove(q);
break;
default:
ret = -ENOTTY;
@@ -708,10 +757,14 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
**/
void blk_trace_shutdown(struct request_queue *q)
{
+ mutex_lock(&q->blk_trace_mutex);
+
if (q->blk_trace) {
- blk_trace_startstop(q, 0);
- blk_trace_remove(q);
+ __blk_trace_startstop(q, 0);
+ __blk_trace_remove(q);
}
+
+ mutex_unlock(&q->blk_trace_mutex);
}
#ifdef CONFIG_BLK_CGROUP
@@ -1558,9 +1611,7 @@ static int blk_trace_remove_queue(struct request_queue *q)
if (bt == NULL)
return -EINVAL;
- if (atomic_dec_and_test(&blk_probes_ref))
- blk_unregister_tracepoints();
-
+ put_probe_ref();
blk_trace_free(bt);
return 0;
}
@@ -1591,8 +1642,7 @@ static int blk_trace_setup_queue(struct request_queue *q,
if (cmpxchg(&q->blk_trace, NULL, bt))
goto free_bt;
- if (atomic_inc_return(&blk_probes_ref) == 1)
- blk_register_tracepoints();
+ get_probe_ref();
return 0;
free_bt:
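
Taken together, the get_probe_ref()/put_probe_ref() conversion above replaces the atomic counter with a plain integer serialized by blk_probe_mutex, so registering and unregistering the tracepoints can never race with the counter transitions. A small userspace sketch of the same get/put idea, assuming pthreads and with register_probes()/unregister_probes() as stand-ins for blk_register_tracepoints()/blk_unregister_tracepoints():

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t probe_mutex = PTHREAD_MUTEX_INITIALIZER;
static int probes_ref;

/* Stand-ins for the tracepoint registration calls. */
static void register_probes(void)   { puts("probes registered"); }
static void unregister_probes(void) { puts("probes unregistered"); }

static void get_probe_ref(void)
{
	pthread_mutex_lock(&probe_mutex);
	if (++probes_ref == 1)		/* first user sets everything up */
		register_probes();
	pthread_mutex_unlock(&probe_mutex);
}

static void put_probe_ref(void)
{
	pthread_mutex_lock(&probe_mutex);
	if (!--probes_ref)		/* last user tears everything down */
		unregister_probes();
	pthread_mutex_unlock(&probe_mutex);
}

int main(void)
{
	get_probe_ref();	/* registers */
	get_probe_ref();	/* refcount only */
	put_probe_ref();	/* refcount only */
	put_probe_ref();	/* unregisters */
	return 0;
}
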
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 95888ae6c263..27d1f4ffa3de 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -15,9 +15,11 @@
#include <linux/ctype.h>
#include "trace.h"
+u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
+
/**
* trace_call_bpf - invoke BPF program
- * @prog: BPF program
+ * @call: tracepoint event
* @ctx: opaque context pointer
*
* kprobe handlers execute BPF programs via this helper.
@@ -29,7 +31,7 @@
* 1 - store kprobe event into ring buffer
* Other values are reserved and currently alias to 1
*/
-unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
+unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
{
unsigned int ret;
@@ -49,9 +51,22 @@ unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
goto out;
}
- rcu_read_lock();
- ret = BPF_PROG_RUN(prog, ctx);
- rcu_read_unlock();
+ /*
+ * Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock
+ * to all call sites, we do a bpf_prog_array_valid() there to check
+ * whether call->prog_array is empty or not, which is
+ * a heuristic to speed up execution.
+ *
+ * If the prog_array fetched by bpf_prog_array_valid() was
+ * non-NULL, we go into trace_call_bpf() and do the actual
+ * proper rcu_dereference() under the RCU lock.
+ * If it turns out that the prog_array is NULL, we bail out.
+ * Conversely, if the fetched pointer was NULL, we skip the
+ * prog_array and risk missing events that were attached between
+ * that check and the rcu_dereference(), which is an accepted risk.
+ */
+ ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, BPF_PROG_RUN);
out:
__this_cpu_dec(bpf_prog_active);
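
The comment above describes a cheap, possibly stale check at the call sites (bpf_prog_array_valid()) followed by the authoritative rcu_dereference() inside trace_call_bpf(). The kernel relies on RCU for the readers; the sketch below only mimics the shape of that fast-path/slow-path split in userspace C11, with an atomic pointer for the racy check and a mutex standing in for the read-side protection. All names here (prog_array_valid, run_progs, maybe_run) are invented for illustration:

#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>

struct prog_array { int nprogs; /* ... programs would live here ... */ };

static _Atomic(struct prog_array *) prog_array;   /* published array, may be NULL */
static pthread_mutex_t update_lock = PTHREAD_MUTEX_INITIALIZER;

/* Cheap check done at the call site: may race with updates. */
static int prog_array_valid(void)
{
	return atomic_load_explicit(&prog_array, memory_order_acquire) != NULL;
}

/* Slow path: re-read the pointer under the lock and run the programs. */
static int run_progs(void *ctx)
{
	struct prog_array *arr;
	int ret = 1;		/* 1 means "store the event", as in trace_call_bpf() */

	(void)ctx;		/* the programs would consume ctx here */
	pthread_mutex_lock(&update_lock);
	arr = atomic_load_explicit(&prog_array, memory_order_acquire);
	if (arr) {
		/* iterate arr->nprogs programs against ctx here */
	}
	pthread_mutex_unlock(&update_lock);
	return ret;
}

int maybe_run(void *ctx)
{
	if (!prog_array_valid())	/* fast path: nothing attached, skip */
		return 1;
	return run_progs(ctx);		/* slow path: proper re-check */
}

int main(void)
{
	struct prog_array arr = { .nprogs = 1 };

	maybe_run(NULL);					/* fast path: skipped */
	atomic_store_explicit(&prog_array, &arr, memory_order_release);
	maybe_run(NULL);					/* slow path: runs */
	return 0;
}
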
@@ -77,7 +92,7 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
- .arg2_type = ARG_CONST_SIZE,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
};
@@ -255,14 +270,14 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
return &bpf_trace_printk_proto;
}
-BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
+static __always_inline int
+get_map_perf_counter(struct bpf_map *map, u64 flags,
+ u64 *value, u64 *enabled, u64 *running)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
unsigned int cpu = smp_processor_id();
u64 index = flags & BPF_F_INDEX_MASK;
struct bpf_event_entry *ee;
- u64 value = 0;
- int err;
if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
return -EINVAL;
@@ -275,7 +290,15 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
if (!ee)
return -ENOENT;
- err = perf_event_read_local(ee->event, &value, NULL, NULL);
+ return perf_event_read_local(ee->event, value, enabled, running);
+}
+
+BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
+{
+ u64 value = 0;
+ int err;
+
+ err = get_map_perf_counter(map, flags, &value, NULL, NULL);
/*
* this api is ugly since we miss [-22..-2] range of valid
* counter values, but that's uapi
@@ -293,6 +316,33 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = {
.arg2_type = ARG_ANYTHING,
};
+BPF_CALL_4(bpf_perf_event_read_value, struct bpf_map *, map, u64, flags,
+ struct bpf_perf_event_value *, buf, u32, size)
+{
+ int err = -EINVAL;
+
+ if (unlikely(size != sizeof(struct bpf_perf_event_value)))
+ goto clear;
+ err = get_map_perf_counter(map, flags, &buf->counter, &buf->enabled,
+ &buf->running);
+ if (unlikely(err))
+ goto clear;
+ return 0;
+clear:
+ memset(buf, 0, size);
+ return err;
+}
+
+static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
+ .func = bpf_perf_event_read_value,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
+};
+
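
bpf_perf_event_read_value() above follows a pattern repeated later for the tracepoint variant: the caller-supplied buffer size must match the structure exactly, and every error path zeroes the buffer so BPF programs never read uninitialized memory. A hypothetical userspace sketch of that contract, with read_counter() as a stand-in for perf_event_read_local():

#include <string.h>
#include <errno.h>
#include <stdint.h>
#include <stdio.h>

struct event_value {
	uint64_t counter;
	uint64_t enabled;
	uint64_t running;
};

/* Stand-in for perf_event_read_local(). */
static int read_counter(uint64_t *counter, uint64_t *enabled, uint64_t *running)
{
	*counter = 1234;
	*enabled = 100;
	*running = 80;
	return 0;
}

/*
 * Fill a caller-supplied buffer of @size bytes. The size must match the
 * struct exactly, and every error path zeroes the buffer so the caller
 * never sees uninitialized memory.
 */
int read_event_value(void *buf, size_t size)
{
	struct event_value *v = buf;
	int err = -EINVAL;

	if (size != sizeof(*v))
		goto clear;
	err = read_counter(&v->counter, &v->enabled, &v->running);
	if (err)
		goto clear;
	return 0;
clear:
	memset(buf, 0, size);
	return err;
}

int main(void)
{
	struct event_value v;

	if (!read_event_value(&v, sizeof(v)))
		printf("counter=%llu\n", (unsigned long long)v.counter);
	return 0;
}
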
static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd);
static __always_inline u64
@@ -353,7 +403,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_PTR_TO_MEM,
- .arg5_type = ARG_CONST_SIZE,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);
@@ -444,7 +494,7 @@ static const struct bpf_func_proto bpf_probe_read_str_proto = {
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
- .arg2_type = ARG_CONST_SIZE,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
};
@@ -499,6 +549,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
return &bpf_perf_event_output_proto;
case BPF_FUNC_get_stackid:
return &bpf_get_stackid_proto;
+ case BPF_FUNC_perf_event_read_value:
+ return &bpf_perf_event_read_value_proto;
default:
return tracing_func_proto(func_id);
}
@@ -524,11 +576,14 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type
return true;
}
-const struct bpf_verifier_ops kprobe_prog_ops = {
+const struct bpf_verifier_ops kprobe_verifier_ops = {
.get_func_proto = kprobe_prog_func_proto,
.is_valid_access = kprobe_prog_is_valid_access,
};
+const struct bpf_prog_ops kprobe_prog_ops = {
+};
+
BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map,
u64, flags, void *, data, u64, size)
{
@@ -550,7 +605,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_PTR_TO_MEM,
- .arg5_type = ARG_CONST_SIZE,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map,
@@ -576,6 +631,32 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
.arg3_type = ARG_ANYTHING,
};
+BPF_CALL_3(bpf_perf_prog_read_value_tp, struct bpf_perf_event_data_kern *, ctx,
+ struct bpf_perf_event_value *, buf, u32, size)
+{
+ int err = -EINVAL;
+
+ if (unlikely(size != sizeof(struct bpf_perf_event_value)))
+ goto clear;
+ err = perf_event_read_local(ctx->event, &buf->counter, &buf->enabled,
+ &buf->running);
+ if (unlikely(err))
+ goto clear;
+ return 0;
+clear:
+ memset(buf, 0, size);
+ return err;
+}
+
+static const struct bpf_func_proto bpf_perf_prog_read_value_proto_tp = {
+ .func = bpf_perf_prog_read_value_tp,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+};
+
static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
@@ -583,6 +664,8 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
return &bpf_perf_event_output_proto_tp;
case BPF_FUNC_get_stackid:
return &bpf_get_stackid_proto_tp;
+ case BPF_FUNC_perf_prog_read_value:
+ return &bpf_perf_prog_read_value_proto_tp;
default:
return tracing_func_proto(func_id);
}
@@ -602,11 +685,14 @@ static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type
return true;
}
-const struct bpf_verifier_ops tracepoint_prog_ops = {
+const struct bpf_verifier_ops tracepoint_verifier_ops = {
.get_func_proto = tp_prog_func_proto,
.is_valid_access = tp_prog_is_valid_access,
};
+const struct bpf_prog_ops tracepoint_prog_ops = {
+};
+
static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
struct bpf_insn_access_aux *info)
{
@@ -662,8 +748,67 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type,
return insn - insn_buf;
}
-const struct bpf_verifier_ops perf_event_prog_ops = {
+const struct bpf_verifier_ops perf_event_verifier_ops = {
.get_func_proto = tp_prog_func_proto,
.is_valid_access = pe_prog_is_valid_access,
.convert_ctx_access = pe_prog_convert_ctx_access,
};
+
+const struct bpf_prog_ops perf_event_prog_ops = {
+};
+
+static DEFINE_MUTEX(bpf_event_mutex);
+
+int perf_event_attach_bpf_prog(struct perf_event *event,
+ struct bpf_prog *prog)
+{
+ struct bpf_prog_array __rcu *old_array;
+ struct bpf_prog_array *new_array;
+ int ret = -EEXIST;
+
+ mutex_lock(&bpf_event_mutex);
+
+ if (event->prog)
+ goto unlock;
+
+ old_array = event->tp_event->prog_array;
+ ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array);
+ if (ret < 0)
+ goto unlock;
+
+ /* set the new array to event->tp_event and set event->prog */
+ event->prog = prog;
+ rcu_assign_pointer(event->tp_event->prog_array, new_array);
+ bpf_prog_array_free(old_array);
+
+unlock:
+ mutex_unlock(&bpf_event_mutex);
+ return ret;
+}
+
+void perf_event_detach_bpf_prog(struct perf_event *event)
+{
+ struct bpf_prog_array __rcu *old_array;
+ struct bpf_prog_array *new_array;
+ int ret;
+
+ mutex_lock(&bpf_event_mutex);
+
+ if (!event->prog)
+ goto unlock;
+
+ old_array = event->tp_event->prog_array;
+ ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array);
+ if (ret < 0) {
+ bpf_prog_array_delete_safe(old_array, event->prog);
+ } else {
+ rcu_assign_pointer(event->tp_event->prog_array, new_array);
+ bpf_prog_array_free(old_array);
+ }
+
+ bpf_prog_put(event->prog);
+ event->prog = NULL;
+
+unlock:
+ mutex_unlock(&bpf_event_mutex);
+}
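
perf_event_attach_bpf_prog() and perf_event_detach_bpf_prog() above update the per-tracepoint program list by building a new array under bpf_event_mutex, publishing it with rcu_assign_pointer(), and only then freeing the old one. The sketch below shows just the copy-then-publish shape in plain userspace C, with a mutex serializing both readers and writers instead of RCU, and with struct prog_array/attach_prog() as made-up names:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct prog { int id; };

struct prog_array {
	size_t cnt;
	struct prog *progs[];	/* flexible array of attached programs */
};

static pthread_mutex_t event_mutex = PTHREAD_MUTEX_INITIALIZER;
static struct prog_array *active;	/* currently published array */

/* Build a copy of @old with @add appended (the update is done on a copy). */
static struct prog_array *array_copy_add(struct prog_array *old, struct prog *add)
{
	size_t cnt = (old ? old->cnt : 0) + 1;
	struct prog_array *new = malloc(sizeof(*new) + cnt * sizeof(new->progs[0]));

	if (!new)
		return NULL;
	new->cnt = cnt;
	if (old)
		memcpy(new->progs, old->progs, old->cnt * sizeof(new->progs[0]));
	new->progs[cnt - 1] = add;
	return new;
}

int attach_prog(struct prog *prog)
{
	struct prog_array *old, *new;

	pthread_mutex_lock(&event_mutex);
	old = active;
	new = array_copy_add(old, prog);
	if (!new) {
		pthread_mutex_unlock(&event_mutex);
		return -1;
	}
	active = new;		/* publish the new array ... */
	free(old);		/* ... then drop the old one */
	pthread_mutex_unlock(&event_mutex);
	return 0;
}

int main(void)
{
	struct prog p1 = { 1 }, p2 = { 2 };

	attach_prog(&p1);
	attach_prog(&p2);
	printf("%zu programs attached\n", active->cnt);	/* prints 2 */
	return 0;
}
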
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 8319e09e15b9..ccdf3664e4a9 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -203,30 +203,6 @@ void clear_ftrace_function(void)
ftrace_trace_function = ftrace_stub;
}
-static void per_cpu_ops_disable_all(struct ftrace_ops *ops)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- *per_cpu_ptr(ops->disabled, cpu) = 1;
-}
-
-static int per_cpu_ops_alloc(struct ftrace_ops *ops)
-{
- int __percpu *disabled;
-
- if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU)))
- return -EINVAL;
-
- disabled = alloc_percpu(int);
- if (!disabled)
- return -ENOMEM;
-
- ops->disabled = disabled;
- per_cpu_ops_disable_all(ops);
- return 0;
-}
-
static void ftrace_sync(struct work_struct *work)
{
/*
@@ -262,8 +238,8 @@ static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops)
* If this is a dynamic, RCU, or per CPU ops, or we force list func,
* then it needs to call the list anyway.
*/
- if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU |
- FTRACE_OPS_FL_RCU) || FTRACE_FORCE_LIST_FUNC)
+ if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_RCU) ||
+ FTRACE_FORCE_LIST_FUNC)
return ftrace_ops_list_func;
return ftrace_ops_get_func(ops);
@@ -422,11 +398,6 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
if (!core_kernel_data((unsigned long)ops))
ops->flags |= FTRACE_OPS_FL_DYNAMIC;
- if (ops->flags & FTRACE_OPS_FL_PER_CPU) {
- if (per_cpu_ops_alloc(ops))
- return -ENOMEM;
- }
-
add_ftrace_ops(&ftrace_ops_list, ops);
/* Always save the function, and reset at unregistering */
@@ -2727,11 +2698,6 @@ void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops)
{
}
-static void per_cpu_ops_free(struct ftrace_ops *ops)
-{
- free_percpu(ops->disabled);
-}
-
static void ftrace_startup_enable(int command)
{
if (saved_ftrace_func != ftrace_trace_function) {
@@ -2833,7 +2799,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
* not currently active, we can just free them
* without synchronizing all CPUs.
*/
- if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU))
+ if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
goto free_ops;
return 0;
@@ -2880,7 +2846,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
* The same goes for freeing the per_cpu data of the per_cpu
* ops.
*/
- if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) {
+ if (ops->flags & FTRACE_OPS_FL_DYNAMIC) {
/*
* We need to do a hard force of sched synchronization.
* This is because we use preempt_disable() to do RCU, but
@@ -2903,9 +2869,6 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
free_ops:
arch_ftrace_trampoline_free(ops);
-
- if (ops->flags & FTRACE_OPS_FL_PER_CPU)
- per_cpu_ops_free(ops);
}
return 0;
@@ -5672,10 +5635,29 @@ static int ftrace_process_locs(struct module *mod,
return ret;
}
+struct ftrace_mod_func {
+ struct list_head list;
+ char *name;
+ unsigned long ip;
+ unsigned int size;
+};
+
+struct ftrace_mod_map {
+ struct rcu_head rcu;
+ struct list_head list;
+ struct module *mod;
+ unsigned long start_addr;
+ unsigned long end_addr;
+ struct list_head funcs;
+ unsigned int num_funcs;
+};
+
#ifdef CONFIG_MODULES
#define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next)
+static LIST_HEAD(ftrace_mod_maps);
+
static int referenced_filters(struct dyn_ftrace *rec)
{
struct ftrace_ops *ops;
@@ -5729,8 +5711,26 @@ static void clear_mod_from_hashes(struct ftrace_page *pg)
mutex_unlock(&trace_types_lock);
}
+static void ftrace_free_mod_map(struct rcu_head *rcu)
+{
+ struct ftrace_mod_map *mod_map = container_of(rcu, struct ftrace_mod_map, rcu);
+ struct ftrace_mod_func *mod_func;
+ struct ftrace_mod_func *n;
+
+ /* All the contents of mod_map are no longer visible to readers */
+ list_for_each_entry_safe(mod_func, n, &mod_map->funcs, list) {
+ kfree(mod_func->name);
+ list_del(&mod_func->list);
+ kfree(mod_func);
+ }
+
+ kfree(mod_map);
+}
+
void ftrace_release_mod(struct module *mod)
{
+ struct ftrace_mod_map *mod_map;
+ struct ftrace_mod_map *n;
struct dyn_ftrace *rec;
struct ftrace_page **last_pg;
struct ftrace_page *tmp_page = NULL;
@@ -5742,6 +5742,14 @@ void ftrace_release_mod(struct module *mod)
if (ftrace_disabled)
goto out_unlock;
+ list_for_each_entry_safe(mod_map, n, &ftrace_mod_maps, list) {
+ if (mod_map->mod == mod) {
+ list_del_rcu(&mod_map->list);
+ call_rcu_sched(&mod_map->rcu, ftrace_free_mod_map);
+ break;
+ }
+ }
+
/*
* Each module has its own ftrace_pages, remove
* them from the list.
@@ -5749,7 +5757,8 @@ void ftrace_release_mod(struct module *mod)
last_pg = &ftrace_pages_start;
for (pg = ftrace_pages_start; pg; pg = *last_pg) {
rec = &pg->records[0];
- if (within_module_core(rec->ip, mod)) {
+ if (within_module_core(rec->ip, mod) ||
+ within_module_init(rec->ip, mod)) {
/*
* As core pages are first, the first
* page should never be a module page.
@@ -5818,7 +5827,8 @@ void ftrace_module_enable(struct module *mod)
* not part of this module, then skip this pg,
* which the "break" will do.
*/
- if (!within_module_core(rec->ip, mod))
+ if (!within_module_core(rec->ip, mod) &&
+ !within_module_init(rec->ip, mod))
break;
cnt = 0;
@@ -5863,23 +5873,245 @@ void ftrace_module_init(struct module *mod)
ftrace_process_locs(mod, mod->ftrace_callsites,
mod->ftrace_callsites + mod->num_ftrace_callsites);
}
+
+static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map,
+ struct dyn_ftrace *rec)
+{
+ struct ftrace_mod_func *mod_func;
+ unsigned long symsize;
+ unsigned long offset;
+ char str[KSYM_SYMBOL_LEN];
+ char *modname;
+ const char *ret;
+
+ ret = kallsyms_lookup(rec->ip, &symsize, &offset, &modname, str);
+ if (!ret)
+ return;
+
+ mod_func = kmalloc(sizeof(*mod_func), GFP_KERNEL);
+ if (!mod_func)
+ return;
+
+ mod_func->name = kstrdup(str, GFP_KERNEL);
+ if (!mod_func->name) {
+ kfree(mod_func);
+ return;
+ }
+
+ mod_func->ip = rec->ip - offset;
+ mod_func->size = symsize;
+
+ mod_map->num_funcs++;
+
+ list_add_rcu(&mod_func->list, &mod_map->funcs);
+}
+
+static struct ftrace_mod_map *
+allocate_ftrace_mod_map(struct module *mod,
+ unsigned long start, unsigned long end)
+{
+ struct ftrace_mod_map *mod_map;
+
+ mod_map = kmalloc(sizeof(*mod_map), GFP_KERNEL);
+ if (!mod_map)
+ return NULL;
+
+ mod_map->mod = mod;
+ mod_map->start_addr = start;
+ mod_map->end_addr = end;
+ mod_map->num_funcs = 0;
+
+ INIT_LIST_HEAD_RCU(&mod_map->funcs);
+
+ list_add_rcu(&mod_map->list, &ftrace_mod_maps);
+
+ return mod_map;
+}
+
+static const char *
+ftrace_func_address_lookup(struct ftrace_mod_map *mod_map,
+ unsigned long addr, unsigned long *size,
+ unsigned long *off, char *sym)
+{
+ struct ftrace_mod_func *found_func = NULL;
+ struct ftrace_mod_func *mod_func;
+
+ list_for_each_entry_rcu(mod_func, &mod_map->funcs, list) {
+ if (addr >= mod_func->ip &&
+ addr < mod_func->ip + mod_func->size) {
+ found_func = mod_func;
+ break;
+ }
+ }
+
+ if (found_func) {
+ if (size)
+ *size = found_func->size;
+ if (off)
+ *off = addr - found_func->ip;
+ if (sym)
+ strlcpy(sym, found_func->name, KSYM_NAME_LEN);
+
+ return found_func->name;
+ }
+
+ return NULL;
+}
+
+const char *
+ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
+ unsigned long *off, char **modname, char *sym)
+{
+ struct ftrace_mod_map *mod_map;
+ const char *ret = NULL;
+
+ /* mod_map is freed via call_rcu_sched() */
+ preempt_disable();
+ list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) {
+ ret = ftrace_func_address_lookup(mod_map, addr, size, off, sym);
+ if (ret) {
+ if (modname)
+ *modname = mod_map->mod->name;
+ break;
+ }
+ }
+ preempt_enable();
+
+ return ret;
+}
+
+int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
+ char *type, char *name,
+ char *module_name, int *exported)
+{
+ struct ftrace_mod_map *mod_map;
+ struct ftrace_mod_func *mod_func;
+
+ preempt_disable();
+ list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) {
+
+ if (symnum >= mod_map->num_funcs) {
+ symnum -= mod_map->num_funcs;
+ continue;
+ }
+
+ list_for_each_entry_rcu(mod_func, &mod_map->funcs, list) {
+ if (symnum > 1) {
+ symnum--;
+ continue;
+ }
+
+ *value = mod_func->ip;
+ *type = 'T';
+ strlcpy(name, mod_func->name, KSYM_NAME_LEN);
+ strlcpy(module_name, mod_map->mod->name, MODULE_NAME_LEN);
+ *exported = 1;
+ preempt_enable();
+ return 0;
+ }
+ WARN_ON(1);
+ break;
+ }
+ preempt_enable();
+ return -ERANGE;
+}
+
+#else
+static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map,
+ struct dyn_ftrace *rec) { }
+static inline struct ftrace_mod_map *
+allocate_ftrace_mod_map(struct module *mod,
+ unsigned long start, unsigned long end)
+{
+ return NULL;
+}
#endif /* CONFIG_MODULES */
-void __init ftrace_free_init_mem(void)
+struct ftrace_init_func {
+ struct list_head list;
+ unsigned long ip;
+};
+
+/* Clear any init ips from hashes */
+static void
+clear_func_from_hash(struct ftrace_init_func *func, struct ftrace_hash *hash)
+{
+ struct ftrace_func_entry *entry;
+
+ if (ftrace_hash_empty(hash))
+ return;
+
+ entry = __ftrace_lookup_ip(hash, func->ip);
+
+ /*
+ * Do not allow this rec to match again.
+ * Yeah, it may waste some memory, but will be removed
+ * if/when the hash is modified again.
+ */
+ if (entry)
+ entry->ip = 0;
+}
+
+static void
+clear_func_from_hashes(struct ftrace_init_func *func)
+{
+ struct trace_array *tr;
+
+ mutex_lock(&trace_types_lock);
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ if (!tr->ops || !tr->ops->func_hash)
+ continue;
+ mutex_lock(&tr->ops->func_hash->regex_lock);
+ clear_func_from_hash(func, tr->ops->func_hash->filter_hash);
+ clear_func_from_hash(func, tr->ops->func_hash->notrace_hash);
+ mutex_unlock(&tr->ops->func_hash->regex_lock);
+ }
+ mutex_unlock(&trace_types_lock);
+}
+
+static void add_to_clear_hash_list(struct list_head *clear_list,
+ struct dyn_ftrace *rec)
+{
+ struct ftrace_init_func *func;
+
+ func = kmalloc(sizeof(*func), GFP_KERNEL);
+ if (!func) {
+ WARN_ONCE(1, "alloc failure, ftrace filter could be stale\n");
+ return;
+ }
+
+ func->ip = rec->ip;
+ list_add(&func->list, clear_list);
+}
+
+void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
{
- unsigned long start = (unsigned long)(&__init_begin);
- unsigned long end = (unsigned long)(&__init_end);
+ unsigned long start = (unsigned long)(start_ptr);
+ unsigned long end = (unsigned long)(end_ptr);
struct ftrace_page **last_pg = &ftrace_pages_start;
struct ftrace_page *pg;
struct dyn_ftrace *rec;
struct dyn_ftrace key;
+ struct ftrace_mod_map *mod_map = NULL;
+ struct ftrace_init_func *func, *func_next;
+ struct list_head clear_hash;
int order;
+ INIT_LIST_HEAD(&clear_hash);
+
key.ip = start;
key.flags = end; /* overload flags, as it is unsigned long */
mutex_lock(&ftrace_lock);
+ /*
+ * If we are freeing module init memory, then check if
+ * any tracer is active. If so, we need to save a mapping of
+ * the module functions being freed with the address.
+ */
+ if (mod && ftrace_ops_list != &ftrace_list_end)
+ mod_map = allocate_ftrace_mod_map(mod, start, end);
+
for (pg = ftrace_pages_start; pg; last_pg = &pg->next, pg = *last_pg) {
if (end < pg->records[0].ip ||
start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE))
@@ -5890,6 +6122,13 @@ void __init ftrace_free_init_mem(void)
ftrace_cmp_recs);
if (!rec)
continue;
+
+ /* rec will be cleared from hashes after ftrace_lock unlock */
+ add_to_clear_hash_list(&clear_hash, rec);
+
+ if (mod_map)
+ save_ftrace_mod_rec(mod_map, rec);
+
pg->index--;
ftrace_update_tot_cnt--;
if (!pg->index) {
@@ -5908,6 +6147,19 @@ void __init ftrace_free_init_mem(void)
goto again;
}
mutex_unlock(&ftrace_lock);
+
+ list_for_each_entry_safe(func, func_next, &clear_hash, list) {
+ clear_func_from_hashes(func);
+ kfree(func);
+ }
+}
+
+void __init ftrace_free_init_mem(void)
+{
+ void *start = (void *)(&__init_begin);
+ void *end = (void *)(&__init_end);
+
+ ftrace_free_mem(NULL, start, end);
}
void __init ftrace_init(void)
@@ -6063,10 +6315,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
* If any of the above fails then the op->func() is not executed.
*/
if ((!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) &&
- (!(op->flags & FTRACE_OPS_FL_PER_CPU) ||
- !ftrace_function_local_disabled(op)) &&
ftrace_ops_test(op, ip, regs)) {
-
if (FTRACE_WARN_ON(!op->func)) {
pr_warn("op=%p %pS\n", op, op);
goto out;
@@ -6124,10 +6373,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip,
preempt_disable_notrace();
- if (!(op->flags & FTRACE_OPS_FL_PER_CPU) ||
- !ftrace_function_local_disabled(op)) {
- op->func(ip, parent_ip, op, regs);
- }
+ op->func(ip, parent_ip, op, regs);
preempt_enable_notrace();
trace_clear_recursion(bit);
@@ -6151,7 +6397,7 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
* or does per cpu logic, then we need to call the assist handler.
*/
if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE) ||
- ops->flags & (FTRACE_OPS_FL_RCU | FTRACE_OPS_FL_PER_CPU))
+ ops->flags & FTRACE_OPS_FL_RCU)
return ftrace_ops_assist_func;
return ops->func;
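
The new ftrace_mod_map machinery above records, for each module whose init text is freed, a list of {name, address, size} entries so kallsyms can still resolve those addresses later. The essence of ftrace_func_address_lookup() is a range test per saved function; a self-contained sketch with invented names and addresses:

#include <stdio.h>

/* One saved function: mirrors the idea of struct ftrace_mod_func. */
struct saved_func {
	const char *name;
	unsigned long ip;	/* start address */
	unsigned long size;	/* length of the function */
};

/* Resolve @addr to a saved function, filling the offset within it. */
static const char *lookup(const struct saved_func *funcs, int nfuncs,
			  unsigned long addr, unsigned long *off)
{
	int i;

	for (i = 0; i < nfuncs; i++) {
		if (addr >= funcs[i].ip && addr < funcs[i].ip + funcs[i].size) {
			*off = addr - funcs[i].ip;
			return funcs[i].name;
		}
	}
	return NULL;
}

int main(void)
{
	struct saved_func funcs[] = {
		{ "my_mod_init", 0x1000, 0x80 },
		{ "my_mod_setup", 0x1080, 0x40 },
	};
	unsigned long off;
	const char *name = lookup(funcs, 2, 0x10a4, &off);

	if (name)
		printf("%s+0x%lx\n", name, off);	/* my_mod_setup+0x24 */
	return 0;
}

The kernel keeps these records per module on an RCU-protected list and walks them with preemption disabled, as shown in ftrace_mod_address_lookup() above.
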
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 845f3805c73d..91874a95060d 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -13,7 +13,6 @@
#include <linux/uaccess.h>
#include <linux/hardirq.h>
#include <linux/kthread.h> /* for self test */
-#include <linux/kmemcheck.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/mutex.h>
@@ -2055,7 +2054,6 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
}
event = __rb_page_index(tail_page, tail);
- kmemcheck_annotate_bitfield(event, bitfield);
/* account for padding bytes */
local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
@@ -2538,61 +2536,29 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
* The lock and unlock are done within a preempt disable section.
* The current_context per_cpu variable can only be modified
* by the current task between lock and unlock. But it can
- * be modified more than once via an interrupt. To pass this
- * information from the lock to the unlock without having to
- * access the 'in_interrupt()' functions again (which do show
- * a bit of overhead in something as critical as function tracing,
- * we use a bitmask trick.
+ * be modified more than once via an interrupt. There are four
+ * different contexts that we need to consider.
*
- * bit 0 = NMI context
- * bit 1 = IRQ context
- * bit 2 = SoftIRQ context
- * bit 3 = normal context.
+ * Normal context.
+ * SoftIRQ context
+ * IRQ context
+ * NMI context
*
- * This works because this is the order of contexts that can
- * preempt other contexts. A SoftIRQ never preempts an IRQ
- * context.
- *
- * When the context is determined, the corresponding bit is
- * checked and set (if it was set, then a recursion of that context
- * happened).
- *
- * On unlock, we need to clear this bit. To do so, just subtract
- * 1 from the current_context and AND it to itself.
- *
- * (binary)
- * 101 - 1 = 100
- * 101 & 100 = 100 (clearing bit zero)
- *
- * 1010 - 1 = 1001
- * 1010 & 1001 = 1000 (clearing bit 1)
- *
- * The least significant bit can be cleared this way, and it
- * just so happens that it is the same bit corresponding to
- * the current context.
+ * If for some reason the ring buffer starts to recurse, we
+ * only allow that to happen at most 4 times (one for each
+ * context). If it happens 5 times, then we consider this a
+ * recursive loop and do not let it go further.
*/
static __always_inline int
trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
{
- unsigned int val = cpu_buffer->current_context;
- int bit;
-
- if (in_interrupt()) {
- if (in_nmi())
- bit = RB_CTX_NMI;
- else if (in_irq())
- bit = RB_CTX_IRQ;
- else
- bit = RB_CTX_SOFTIRQ;
- } else
- bit = RB_CTX_NORMAL;
-
- if (unlikely(val & (1 << bit)))
+ if (cpu_buffer->current_context >= 4)
return 1;
- val |= (1 << bit);
- cpu_buffer->current_context = val;
+ cpu_buffer->current_context++;
+ /* Interrupts must see this update */
+ barrier();
return 0;
}
@@ -2600,7 +2566,9 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
static __always_inline void
trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
{
- cpu_buffer->current_context &= cpu_buffer->current_context - 1;
+ /* Don't let the dec leak out */
+ barrier();
+ cpu_buffer->current_context--;
}
/**
@@ -2686,7 +2654,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
/* We reserved something on the buffer */
event = __rb_page_index(tail_page, tail);
- kmemcheck_annotate_bitfield(event, bitfield);
rb_update_event(cpu_buffer, event, info);
local_inc(&tail_page->entries);
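
The ring-buffer change above drops the per-context bitmask and simply counts nesting, refusing a write once the counter reaches the number of contexts that can legitimately stack (normal, softirq, irq, NMI). A userspace approximation of that guard, using recursion in place of a real interrupt and a thread-local counter; the kernel version additionally needs the compiler barriers shown in the hunk because its re-entry is truly asynchronous:

#include <stdio.h>

#define MAX_NESTING 4

/* One counter per buffer; in the kernel it is per ring_buffer_per_cpu. */
static _Thread_local int current_context;

static int recursive_lock(void)
{
	if (current_context >= MAX_NESTING)	/* deeper than the contexts can stack */
		return 1;			/* refuse: looks like a loop */
	current_context++;
	return 0;
}

static void recursive_unlock(void)
{
	current_context--;
}

static void do_write(int depth)
{
	if (recursive_lock()) {
		printf("dropped at depth %d\n", depth);
		return;
	}
	if (depth < 6)			/* pretend an interrupt re-entered us */
		do_write(depth + 1);
	recursive_unlock();
}

int main(void)
{
	do_write(1);	/* depths 1..4 succeed, depth 5 is dropped */
	return 0;
}
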
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 752e5daf0896..73e67b68c53b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7687,6 +7687,7 @@ static int instance_mkdir(const char *name)
struct trace_array *tr;
int ret;
+ mutex_lock(&event_mutex);
mutex_lock(&trace_types_lock);
ret = -EEXIST;
@@ -7742,6 +7743,7 @@ static int instance_mkdir(const char *name)
list_add(&tr->list, &ftrace_trace_arrays);
mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
return 0;
@@ -7753,6 +7755,7 @@ static int instance_mkdir(const char *name)
out_unlock:
mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
return ret;
@@ -7765,6 +7768,7 @@ static int instance_rmdir(const char *name)
int ret;
int i;
+ mutex_lock(&event_mutex);
mutex_lock(&trace_types_lock);
ret = -ENODEV;
@@ -7810,6 +7814,7 @@ static int instance_rmdir(const char *name)
out_unlock:
mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
return ret;
}
@@ -8276,6 +8281,92 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
}
EXPORT_SYMBOL_GPL(ftrace_dump);
+int trace_run_command(const char *buf, int (*createfn)(int, char **))
+{
+ char **argv;
+ int argc, ret;
+
+ argc = 0;
+ ret = 0;
+ argv = argv_split(GFP_KERNEL, buf, &argc);
+ if (!argv)
+ return -ENOMEM;
+
+ if (argc)
+ ret = createfn(argc, argv);
+
+ argv_free(argv);
+
+ return ret;
+}
+
+#define WRITE_BUFSIZE 4096
+
+ssize_t trace_parse_run_command(struct file *file, const char __user *buffer,
+ size_t count, loff_t *ppos,
+ int (*createfn)(int, char **))
+{
+ char *kbuf, *buf, *tmp;
+ int ret = 0;
+ size_t done = 0;
+ size_t size;
+
+ kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
+ if (!kbuf)
+ return -ENOMEM;
+
+ while (done < count) {
+ size = count - done;
+
+ if (size >= WRITE_BUFSIZE)
+ size = WRITE_BUFSIZE - 1;
+
+ if (copy_from_user(kbuf, buffer + done, size)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ kbuf[size] = '\0';
+ buf = kbuf;
+ do {
+ tmp = strchr(buf, '\n');
+ if (tmp) {
+ *tmp = '\0';
+ size = tmp - buf + 1;
+ } else {
+ size = strlen(buf);
+ if (done + size < count) {
+ if (buf != kbuf)
+ break;
+ /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */
+ pr_warn("Line length is too long: Should be less than %d\n",
+ WRITE_BUFSIZE - 2);
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+ done += size;
+
+ /* Remove comments */
+ tmp = strchr(buf, '#');
+
+ if (tmp)
+ *tmp = '\0';
+
+ ret = trace_run_command(buf, createfn);
+ if (ret)
+ goto out;
+ buf += size;
+
+ } while (done < count);
+ }
+ ret = done;
+
+out:
+ kfree(kbuf);
+
+ return ret;
+}
+
__init static int tracer_alloc_buffers(void)
{
int ring_buf_size;
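
trace_run_command() and trace_parse_run_command() generalize the old traceprobe helpers: split user input into lines, strip '#' comments, split each line into argv and hand it to a create callback. A rough userspace analogue is sketched below; it skips the chunked copy_from_user handling and the line-length limit, and create_cmd() is only a demo callback, not anything from the kernel:

#include <stdio.h>
#include <string.h>

#define MAX_ARGS 32

/* Callback in the style of create_trace_kprobe(): gets argc/argv per line. */
static int create_cmd(int argc, char **argv)
{
	int i;

	printf("command with %d args:", argc);
	for (i = 0; i < argc; i++)
		printf(" [%s]", argv[i]);
	putchar('\n');
	return 0;
}

/* Run one command line: split on whitespace and call @createfn. */
static int run_command(char *line, int (*createfn)(int, char **))
{
	char *argv[MAX_ARGS];
	int argc = 0;
	char *tok = strtok(line, " \t");

	while (tok && argc < MAX_ARGS) {
		argv[argc++] = tok;
		tok = strtok(NULL, " \t");
	}
	return argc ? createfn(argc, argv) : 0;
}

/* Parse a whole buffer: one command per line, '#' starts a comment. */
static int parse_commands(char *buf, int (*createfn)(int, char **))
{
	char *line, *save = NULL;

	for (line = strtok_r(buf, "\n", &save); line;
	     line = strtok_r(NULL, "\n", &save)) {
		char *hash = strchr(line, '#');
		int ret;

		if (hash)
			*hash = '\0';		/* drop trailing comment */
		ret = run_command(line, createfn);
		if (ret)
			return ret;
	}
	return 0;
}

int main(void)
{
	char buf[] = "p:myprobe do_sys_open dfd=$arg1  # a kprobe definition\n"
		     "-:myprobe\n";

	return parse_commands(buf, create_cmd);
}
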
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6b0b343a36a2..2a6d0325a761 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -739,8 +739,6 @@ extern int trace_selftest_startup_wakeup(struct tracer *trace,
struct trace_array *tr);
extern int trace_selftest_startup_nop(struct tracer *trace,
struct trace_array *tr);
-extern int trace_selftest_startup_sched_switch(struct tracer *trace,
- struct trace_array *tr);
extern int trace_selftest_startup_branch(struct tracer *trace,
struct trace_array *tr);
/*
@@ -1755,6 +1753,13 @@ void trace_printk_start_comm(void);
int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
+#define MAX_EVENT_NAME_LEN 64
+
+extern int trace_run_command(const char *buf, int (*createfn)(int, char**));
+extern ssize_t trace_parse_run_command(struct file *file,
+ const char __user *buffer, size_t count, loff_t *ppos,
+ int (*createfn)(int, char**));
+
/*
* Normal trace_printk() and friends allocates special buffers
* to do the manipulation, as well as saves the print formats
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 13ba2d3f6a91..55d6dff37daf 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -240,27 +240,41 @@ void perf_trace_destroy(struct perf_event *p_event)
int perf_trace_add(struct perf_event *p_event, int flags)
{
struct trace_event_call *tp_event = p_event->tp_event;
- struct hlist_head __percpu *pcpu_list;
- struct hlist_head *list;
-
- pcpu_list = tp_event->perf_events;
- if (WARN_ON_ONCE(!pcpu_list))
- return -EINVAL;
if (!(flags & PERF_EF_START))
p_event->hw.state = PERF_HES_STOPPED;
- list = this_cpu_ptr(pcpu_list);
- hlist_add_head_rcu(&p_event->hlist_entry, list);
+ /*
+ * If TRACE_REG_PERF_ADD returns false, no custom action was performed
+ * and we need to take the default action of enqueueing our event on
+ * the right per-cpu hlist.
+ */
+ if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event)) {
+ struct hlist_head __percpu *pcpu_list;
+ struct hlist_head *list;
+
+ pcpu_list = tp_event->perf_events;
+ if (WARN_ON_ONCE(!pcpu_list))
+ return -EINVAL;
+
+ list = this_cpu_ptr(pcpu_list);
+ hlist_add_head_rcu(&p_event->hlist_entry, list);
+ }
- return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);
+ return 0;
}
void perf_trace_del(struct perf_event *p_event, int flags)
{
struct trace_event_call *tp_event = p_event->tp_event;
- hlist_del_rcu(&p_event->hlist_entry);
- tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
+
+ /*
+ * If TRACE_REG_PERF_DEL returns false, no custom action was performed
+ * and we need to take the default action of dequeueing our event from
+ * the right per-cpu hlist.
+ */
+ if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event))
+ hlist_del_rcu(&p_event->hlist_entry);
}
void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
@@ -306,16 +320,25 @@ static void
perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ops, struct pt_regs *pt_regs)
{
- struct perf_event *event;
struct ftrace_entry *entry;
- struct hlist_head *head;
+ struct perf_event *event;
+ struct hlist_head head;
struct pt_regs regs;
int rctx;
- head = this_cpu_ptr(event_function.perf_events);
- if (hlist_empty(head))
+ if ((unsigned long)ops->private != smp_processor_id())
return;
+ event = container_of(ops, struct perf_event, ftrace_ops);
+
+ /*
+ * @event->hlist entry is NULL (per INIT_HLIST_NODE), and all
+ * the perf code does is hlist_for_each_entry_rcu(), so we can
+ * get away with simply setting the @head.first pointer in order
+ * to create a singular list.
+ */
+ head.first = &event->hlist_entry;
+
#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
sizeof(u64)) - sizeof(u32))
@@ -330,9 +353,8 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
entry->ip = ip;
entry->parent_ip = parent_ip;
- event = container_of(ops, struct perf_event, ftrace_ops);
perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
- 1, &regs, head, NULL, event);
+ 1, &regs, &head, NULL);
#undef ENTRY_SIZE
}
@@ -341,8 +363,10 @@ static int perf_ftrace_function_register(struct perf_event *event)
{
struct ftrace_ops *ops = &event->ftrace_ops;
- ops->flags |= FTRACE_OPS_FL_PER_CPU | FTRACE_OPS_FL_RCU;
- ops->func = perf_ftrace_function_call;
+ ops->flags = FTRACE_OPS_FL_RCU;
+ ops->func = perf_ftrace_function_call;
+ ops->private = (void *)(unsigned long)nr_cpu_ids;
+
return register_ftrace_function(ops);
}
@@ -354,19 +378,11 @@ static int perf_ftrace_function_unregister(struct perf_event *event)
return ret;
}
-static void perf_ftrace_function_enable(struct perf_event *event)
-{
- ftrace_function_local_enable(&event->ftrace_ops);
-}
-
-static void perf_ftrace_function_disable(struct perf_event *event)
-{
- ftrace_function_local_disable(&event->ftrace_ops);
-}
-
int perf_ftrace_event_register(struct trace_event_call *call,
enum trace_reg type, void *data)
{
+ struct perf_event *event = data;
+
switch (type) {
case TRACE_REG_REGISTER:
case TRACE_REG_UNREGISTER:
@@ -379,11 +395,11 @@ int perf_ftrace_event_register(struct trace_event_call *call,
case TRACE_REG_PERF_CLOSE:
return perf_ftrace_function_unregister(data);
case TRACE_REG_PERF_ADD:
- perf_ftrace_function_enable(data);
- return 0;
+ event->ftrace_ops.private = (void *)(unsigned long)smp_processor_id();
+ return 1;
case TRACE_REG_PERF_DEL:
- perf_ftrace_function_disable(data);
- return 0;
+ event->ftrace_ops.private = (void *)(unsigned long)nr_cpu_ids;
+ return 1;
}
return -EINVAL;
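
The perf_ftrace_function_call() change relies on the trick spelled out in its comment: since the perf code only ever walks the list forward from the head, a one-element list can be conjured on the stack by pointing head.first at the event's unlinked node. A minimal sketch with hand-rolled hlist types (simplified stand-ins for the real ones in the kernel headers):

#include <stdio.h>
#include <stddef.h>

/* Minimal hlist types in the style of <linux/types.h>. */
struct hlist_node { struct hlist_node *next, **pprev; };
struct hlist_head { struct hlist_node *first; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct event {
	int id;
	struct hlist_node hlist_entry;
};

/* Walk a list the way the perf code does: only follows ->next from head. */
static void visit_all(struct hlist_head *head)
{
	struct hlist_node *n;

	for (n = head->first; n; n = n->next) {
		struct event *e = container_of(n, struct event, hlist_entry);
		printf("event %d\n", e->id);
	}
}

int main(void)
{
	struct event ev = { .id = 42, .hlist_entry = { NULL, NULL } };
	struct hlist_head head;

	/* Build a singular list on the stack: head points at the one node. */
	head.first = &ev.hlist_entry;

	visit_all(&head);	/* prints "event 42" */
	return 0;
}
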
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 87468398b9ed..ec0f9aa4e151 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1406,8 +1406,8 @@ static int subsystem_open(struct inode *inode, struct file *filp)
return -ENODEV;
/* Make sure the system still exists */
- mutex_lock(&trace_types_lock);
mutex_lock(&event_mutex);
+ mutex_lock(&trace_types_lock);
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
list_for_each_entry(dir, &tr->systems, list) {
if (dir == inode->i_private) {
@@ -1421,8 +1421,8 @@ static int subsystem_open(struct inode *inode, struct file *filp)
}
}
exit_loop:
- mutex_unlock(&event_mutex);
mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
if (!system)
return -ENODEV;
@@ -2294,15 +2294,15 @@ static void __add_event_to_tracers(struct trace_event_call *call);
int trace_add_event_call(struct trace_event_call *call)
{
int ret;
- mutex_lock(&trace_types_lock);
mutex_lock(&event_mutex);
+ mutex_lock(&trace_types_lock);
ret = __register_event(call, NULL);
if (ret >= 0)
__add_event_to_tracers(call);
- mutex_unlock(&event_mutex);
mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
return ret;
}
@@ -2356,13 +2356,13 @@ int trace_remove_event_call(struct trace_event_call *call)
{
int ret;
- mutex_lock(&trace_types_lock);
mutex_lock(&event_mutex);
+ mutex_lock(&trace_types_lock);
down_write(&trace_event_sem);
ret = probe_remove_event_call(call);
up_write(&trace_event_sem);
- mutex_unlock(&event_mutex);
mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
return ret;
}
@@ -2424,8 +2424,8 @@ static int trace_module_notify(struct notifier_block *self,
{
struct module *mod = data;
- mutex_lock(&trace_types_lock);
mutex_lock(&event_mutex);
+ mutex_lock(&trace_types_lock);
switch (val) {
case MODULE_STATE_COMING:
trace_module_add_events(mod);
@@ -2434,8 +2434,8 @@ static int trace_module_notify(struct notifier_block *self,
trace_module_remove_events(mod);
break;
}
- mutex_unlock(&event_mutex);
mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
return 0;
}
@@ -2950,24 +2950,24 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
 * creates the event hierarchy in the @parent/events directory.
*
* Returns 0 on success.
+ *
+ * Must be called with event_mutex held.
*/
int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr)
{
int ret;
- mutex_lock(&event_mutex);
+ lockdep_assert_held(&event_mutex);
ret = create_event_toplevel_files(parent, tr);
if (ret)
- goto out_unlock;
+ goto out;
down_write(&trace_event_sem);
__trace_add_event_dirs(tr);
up_write(&trace_event_sem);
- out_unlock:
- mutex_unlock(&event_mutex);
-
+ out:
return ret;
}
@@ -2996,9 +2996,10 @@ early_event_add_tracer(struct dentry *parent, struct trace_array *tr)
return ret;
}
+/* Must be called with event_mutex held */
int event_trace_del_tracer(struct trace_array *tr)
{
- mutex_lock(&event_mutex);
+ lockdep_assert_held(&event_mutex);
/* Disable any event triggers and associated soft-disabled events */
clear_event_triggers(tr);
@@ -3019,8 +3020,6 @@ int event_trace_del_tracer(struct trace_array *tr)
tr->event_dir = NULL;
- mutex_unlock(&event_mutex);
-
return 0;
}
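
The trace_events.c hunks all make the same change: event_mutex is now taken before trace_types_lock everywhere, and the helpers that used to take event_mutex themselves now only assert it, so the instance mkdir/rmdir paths and the event registration paths agree on one lock order. A toy illustration of that invariant, using pthread mutexes and placeholder function names rather than the kernel paths:

#include <pthread.h>

static pthread_mutex_t event_mutex      = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t trace_types_lock = PTHREAD_MUTEX_INITIALIZER;

/* Every path that needs both locks takes them in this order ... */
static void add_call(void)
{
	pthread_mutex_lock(&event_mutex);
	pthread_mutex_lock(&trace_types_lock);
	/* register the event and add it to the tracers */
	pthread_mutex_unlock(&trace_types_lock);
	pthread_mutex_unlock(&event_mutex);
}

/* ... including paths that used to take trace_types_lock first. */
static void make_instance(void)
{
	pthread_mutex_lock(&event_mutex);
	pthread_mutex_lock(&trace_types_lock);
	/* create the instance directory and its event files */
	pthread_mutex_unlock(&trace_types_lock);
	pthread_mutex_unlock(&event_mutex);
}

int main(void)
{
	add_call();
	make_instance();
	return 0;
}

If one path took event_mutex then trace_types_lock while another took them in the opposite order, the two could deadlock; fixing the order everywhere removes that possibility.
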
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 1c21d0e2a145..1e1558c99d56 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -28,12 +28,16 @@ struct hist_field;
typedef u64 (*hist_field_fn_t) (struct hist_field *field, void *event);
+#define HIST_FIELD_OPERANDS_MAX 2
+
struct hist_field {
struct ftrace_event_field *field;
unsigned long flags;
hist_field_fn_t fn;
unsigned int size;
unsigned int offset;
+ unsigned int is_signed;
+ struct hist_field *operands[HIST_FIELD_OPERANDS_MAX];
};
static u64 hist_field_none(struct hist_field *field, void *event)
@@ -71,7 +75,9 @@ static u64 hist_field_pstring(struct hist_field *hist_field, void *event)
static u64 hist_field_log2(struct hist_field *hist_field, void *event)
{
- u64 val = *(u64 *)(event + hist_field->field->offset);
+ struct hist_field *operand = hist_field->operands[0];
+
+ u64 val = operand->fn(operand, event);
return (u64) ilog2(roundup_pow_of_two(val));
}
@@ -110,16 +116,16 @@ DEFINE_HIST_FIELD_FN(u8);
#define HIST_KEY_SIZE_MAX (MAX_FILTER_STR_VAL + HIST_STACKTRACE_SIZE)
enum hist_field_flags {
- HIST_FIELD_FL_HITCOUNT = 1,
- HIST_FIELD_FL_KEY = 2,
- HIST_FIELD_FL_STRING = 4,
- HIST_FIELD_FL_HEX = 8,
- HIST_FIELD_FL_SYM = 16,
- HIST_FIELD_FL_SYM_OFFSET = 32,
- HIST_FIELD_FL_EXECNAME = 64,
- HIST_FIELD_FL_SYSCALL = 128,
- HIST_FIELD_FL_STACKTRACE = 256,
- HIST_FIELD_FL_LOG2 = 512,
+ HIST_FIELD_FL_HITCOUNT = 1 << 0,
+ HIST_FIELD_FL_KEY = 1 << 1,
+ HIST_FIELD_FL_STRING = 1 << 2,
+ HIST_FIELD_FL_HEX = 1 << 3,
+ HIST_FIELD_FL_SYM = 1 << 4,
+ HIST_FIELD_FL_SYM_OFFSET = 1 << 5,
+ HIST_FIELD_FL_EXECNAME = 1 << 6,
+ HIST_FIELD_FL_SYSCALL = 1 << 7,
+ HIST_FIELD_FL_STACKTRACE = 1 << 8,
+ HIST_FIELD_FL_LOG2 = 1 << 9,
};
struct hist_trigger_attrs {
@@ -146,6 +152,25 @@ struct hist_trigger_data {
struct tracing_map *map;
};
+static const char *hist_field_name(struct hist_field *field,
+ unsigned int level)
+{
+ const char *field_name = "";
+
+ if (level > 1)
+ return field_name;
+
+ if (field->field)
+ field_name = field->field->name;
+ else if (field->flags & HIST_FIELD_FL_LOG2)
+ field_name = hist_field_name(field->operands[0], ++level);
+
+ if (field_name == NULL)
+ field_name = "";
+
+ return field_name;
+}
+
static hist_field_fn_t select_value_fn(int field_size, int field_is_signed)
{
hist_field_fn_t fn = NULL;
@@ -340,8 +365,20 @@ static const struct tracing_map_ops hist_trigger_elt_comm_ops = {
.elt_init = hist_trigger_elt_comm_init,
};
-static void destroy_hist_field(struct hist_field *hist_field)
+static void destroy_hist_field(struct hist_field *hist_field,
+ unsigned int level)
{
+ unsigned int i;
+
+ if (level > 2)
+ return;
+
+ if (!hist_field)
+ return;
+
+ for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++)
+ destroy_hist_field(hist_field->operands[i], level + 1);
+
kfree(hist_field);
}
@@ -368,7 +405,10 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
}
if (flags & HIST_FIELD_FL_LOG2) {
+ unsigned long fl = flags & ~HIST_FIELD_FL_LOG2;
hist_field->fn = hist_field_log2;
+ hist_field->operands[0] = create_hist_field(field, fl);
+ hist_field->size = hist_field->operands[0]->size;
goto out;
}
@@ -388,7 +428,7 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
hist_field->fn = select_value_fn(field->size,
field->is_signed);
if (!hist_field->fn) {
- destroy_hist_field(hist_field);
+ destroy_hist_field(hist_field, 0);
return NULL;
}
}
@@ -405,7 +445,7 @@ static void destroy_hist_fields(struct hist_trigger_data *hist_data)
for (i = 0; i < TRACING_MAP_FIELDS_MAX; i++) {
if (hist_data->fields[i]) {
- destroy_hist_field(hist_data->fields[i]);
+ destroy_hist_field(hist_data->fields[i], 0);
hist_data->fields[i] = NULL;
}
}
@@ -450,7 +490,7 @@ static int create_val_field(struct hist_trigger_data *hist_data,
}
field = trace_find_event_field(file->event_call, field_name);
- if (!field) {
+ if (!field || !field->size) {
ret = -EINVAL;
goto out;
}
@@ -548,7 +588,7 @@ static int create_key_field(struct hist_trigger_data *hist_data,
}
field = trace_find_event_field(file->event_call, field_name);
- if (!field) {
+ if (!field || !field->size) {
ret = -EINVAL;
goto out;
}
@@ -653,7 +693,6 @@ static int is_descending(const char *str)
static int create_sort_keys(struct hist_trigger_data *hist_data)
{
char *fields_str = hist_data->attrs->sort_key_str;
- struct ftrace_event_field *field = NULL;
struct tracing_map_sort_key *sort_key;
int descending, ret = 0;
unsigned int i, j;
@@ -670,7 +709,9 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
}
for (i = 0; i < TRACING_MAP_SORT_KEYS_MAX; i++) {
+ struct hist_field *hist_field;
char *field_str, *field_name;
+ const char *test_name;
sort_key = &hist_data->sort_keys[i];
@@ -703,8 +744,10 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
}
for (j = 1; j < hist_data->n_fields; j++) {
- field = hist_data->fields[j]->field;
- if (field && (strcmp(field_name, field->name) == 0)) {
+ hist_field = hist_data->fields[j];
+ test_name = hist_field_name(hist_field, 0);
+
+ if (strcmp(field_name, test_name) == 0) {
sort_key->field_idx = j;
descending = is_descending(field_str);
if (descending < 0) {
@@ -952,6 +995,7 @@ hist_trigger_entry_print(struct seq_file *m,
struct hist_field *key_field;
char str[KSYM_SYMBOL_LEN];
bool multiline = false;
+ const char *field_name;
unsigned int i;
u64 uval;
@@ -963,26 +1007,27 @@ hist_trigger_entry_print(struct seq_file *m,
if (i > hist_data->n_vals)
seq_puts(m, ", ");
+ field_name = hist_field_name(key_field, 0);
+
if (key_field->flags & HIST_FIELD_FL_HEX) {
uval = *(u64 *)(key + key_field->offset);
- seq_printf(m, "%s: %llx",
- key_field->field->name, uval);
+ seq_printf(m, "%s: %llx", field_name, uval);
} else if (key_field->flags & HIST_FIELD_FL_SYM) {
uval = *(u64 *)(key + key_field->offset);
sprint_symbol_no_offset(str, uval);
- seq_printf(m, "%s: [%llx] %-45s",
- key_field->field->name, uval, str);
+ seq_printf(m, "%s: [%llx] %-45s", field_name,
+ uval, str);
} else if (key_field->flags & HIST_FIELD_FL_SYM_OFFSET) {
uval = *(u64 *)(key + key_field->offset);
sprint_symbol(str, uval);
- seq_printf(m, "%s: [%llx] %-55s",
- key_field->field->name, uval, str);
+ seq_printf(m, "%s: [%llx] %-55s", field_name,
+ uval, str);
} else if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
char *comm = elt->private_data;
uval = *(u64 *)(key + key_field->offset);
- seq_printf(m, "%s: %-16s[%10llu]",
- key_field->field->name, comm, uval);
+ seq_printf(m, "%s: %-16s[%10llu]", field_name,
+ comm, uval);
} else if (key_field->flags & HIST_FIELD_FL_SYSCALL) {
const char *syscall_name;
@@ -991,8 +1036,8 @@ hist_trigger_entry_print(struct seq_file *m,
if (!syscall_name)
syscall_name = "unknown_syscall";
- seq_printf(m, "%s: %-30s[%3llu]",
- key_field->field->name, syscall_name, uval);
+ seq_printf(m, "%s: %-30s[%3llu]", field_name,
+ syscall_name, uval);
} else if (key_field->flags & HIST_FIELD_FL_STACKTRACE) {
seq_puts(m, "stacktrace:\n");
hist_trigger_stacktrace_print(m,
@@ -1000,15 +1045,14 @@ hist_trigger_entry_print(struct seq_file *m,
HIST_STACKTRACE_DEPTH);
multiline = true;
} else if (key_field->flags & HIST_FIELD_FL_LOG2) {
- seq_printf(m, "%s: ~ 2^%-2llu", key_field->field->name,
+ seq_printf(m, "%s: ~ 2^%-2llu", field_name,
*(u64 *)(key + key_field->offset));
} else if (key_field->flags & HIST_FIELD_FL_STRING) {
- seq_printf(m, "%s: %-50s", key_field->field->name,
+ seq_printf(m, "%s: %-50s", field_name,
(char *)(key + key_field->offset));
} else {
uval = *(u64 *)(key + key_field->offset);
- seq_printf(m, "%s: %10llu", key_field->field->name,
- uval);
+ seq_printf(m, "%s: %10llu", field_name, uval);
}
}
@@ -1021,13 +1065,13 @@ hist_trigger_entry_print(struct seq_file *m,
tracing_map_read_sum(elt, HITCOUNT_IDX));
for (i = 1; i < hist_data->n_vals; i++) {
+ field_name = hist_field_name(hist_data->fields[i], 0);
+
if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) {
- seq_printf(m, " %s: %10llx",
- hist_data->fields[i]->field->name,
+ seq_printf(m, " %s: %10llx", field_name,
tracing_map_read_sum(elt, i));
} else {
- seq_printf(m, " %s: %10llu",
- hist_data->fields[i]->field->name,
+ seq_printf(m, " %s: %10llu", field_name,
tracing_map_read_sum(elt, i));
}
}
@@ -1062,7 +1106,7 @@ static void hist_trigger_show(struct seq_file *m,
struct event_trigger_data *data, int n)
{
struct hist_trigger_data *hist_data;
- int n_entries, ret = 0;
+ int n_entries;
if (n > 0)
seq_puts(m, "\n\n");
@@ -1073,10 +1117,8 @@ static void hist_trigger_show(struct seq_file *m,
hist_data = data->private_data;
n_entries = print_entries(m, hist_data);
- if (n_entries < 0) {
- ret = n_entries;
+ if (n_entries < 0)
n_entries = 0;
- }
seq_printf(m, "\nTotals:\n Hits: %llu\n Entries: %u\n Dropped: %llu\n",
(u64)atomic64_read(&hist_data->map->hits),
@@ -1142,7 +1184,9 @@ static const char *get_hist_field_flags(struct hist_field *hist_field)
static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
{
- seq_printf(m, "%s", hist_field->field->name);
+ const char *field_name = hist_field_name(hist_field, 0);
+
+ seq_printf(m, "%s", field_name);
if (hist_field->flags) {
const char *flags_str = get_hist_field_flags(hist_field);
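
hist_field_name() gives derived fields such as the log2 variant a printable name by following the new operands[] link one level down, with a hard cap on the recursion depth. A simplified sketch of that resolution, with struct field and the example names invented for illustration:

#include <stdio.h>

#define OPERANDS_MAX 2

/* Simplified stand-in for struct hist_field. */
struct field {
	const char *name;			/* NULL for derived fields */
	int is_log2;
	struct field *operands[OPERANDS_MAX];
};

/* Resolve a printable name; recurse one level at most, like hist_field_name(). */
static const char *field_name(struct field *f, unsigned int level)
{
	const char *name = "";

	if (level > 1)
		return name;

	if (f->name)
		name = f->name;
	else if (f->is_log2)
		name = field_name(f->operands[0], level + 1);

	return name ? name : "";
}

int main(void)
{
	struct field bytes = { .name = "bytes_req" };
	struct field log2_bytes = { .is_log2 = 1, .operands = { &bytes } };

	printf("%s\n", field_name(&log2_bytes, 0));	/* prints bytes_req */
	return 0;
}
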
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 7758bc0617cb..03ecb4465ee4 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -16,6 +16,10 @@
#include "trace.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/preemptirq.h>
+
+#if defined(CONFIG_IRQSOFF_TRACER) || defined(CONFIG_PREEMPT_TRACER)
static struct trace_array *irqsoff_trace __read_mostly;
static int tracer_enabled __read_mostly;
@@ -463,63 +467,43 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)
#else /* !CONFIG_PROVE_LOCKING */
/*
- * Stubs:
- */
-
-void trace_softirqs_on(unsigned long ip)
-{
-}
-
-void trace_softirqs_off(unsigned long ip)
-{
-}
-
-inline void print_irqtrace_events(struct task_struct *curr)
-{
-}
-
-/*
* We are only interested in hardirq on/off events:
*/
-void trace_hardirqs_on(void)
+static inline void tracer_hardirqs_on(void)
{
if (!preempt_trace() && irq_trace())
stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
}
-EXPORT_SYMBOL(trace_hardirqs_on);
-void trace_hardirqs_off(void)
+static inline void tracer_hardirqs_off(void)
{
if (!preempt_trace() && irq_trace())
start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
}
-EXPORT_SYMBOL(trace_hardirqs_off);
-__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
+static inline void tracer_hardirqs_on_caller(unsigned long caller_addr)
{
if (!preempt_trace() && irq_trace())
stop_critical_timing(CALLER_ADDR0, caller_addr);
}
-EXPORT_SYMBOL(trace_hardirqs_on_caller);
-__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
+static inline void tracer_hardirqs_off_caller(unsigned long caller_addr)
{
if (!preempt_trace() && irq_trace())
start_critical_timing(CALLER_ADDR0, caller_addr);
}
-EXPORT_SYMBOL(trace_hardirqs_off_caller);
#endif /* CONFIG_PROVE_LOCKING */
#endif /* CONFIG_IRQSOFF_TRACER */
#ifdef CONFIG_PREEMPT_TRACER
-void trace_preempt_on(unsigned long a0, unsigned long a1)
+static inline void tracer_preempt_on(unsigned long a0, unsigned long a1)
{
if (preempt_trace() && !irq_trace())
stop_critical_timing(a0, a1);
}
-void trace_preempt_off(unsigned long a0, unsigned long a1)
+static inline void tracer_preempt_off(unsigned long a0, unsigned long a1)
{
if (preempt_trace() && !irq_trace())
start_critical_timing(a0, a1);
@@ -781,3 +765,100 @@ __init static int init_irqsoff_tracer(void)
return 0;
}
core_initcall(init_irqsoff_tracer);
+#endif /* CONFIG_IRQSOFF_TRACER || CONFIG_PREEMPT_TRACER */
+
+#ifndef CONFIG_IRQSOFF_TRACER
+static inline void tracer_hardirqs_on(void) { }
+static inline void tracer_hardirqs_off(void) { }
+static inline void tracer_hardirqs_on_caller(unsigned long caller_addr) { }
+static inline void tracer_hardirqs_off_caller(unsigned long caller_addr) { }
+#endif
+
+#ifndef CONFIG_PREEMPT_TRACER
+static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { }
+static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { }
+#endif
+
+#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PROVE_LOCKING)
+/* Per-cpu variable to prevent redundant calls when IRQs already off */
+static DEFINE_PER_CPU(int, tracing_irq_cpu);
+
+void trace_hardirqs_on(void)
+{
+ if (!this_cpu_read(tracing_irq_cpu))
+ return;
+
+ trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+ tracer_hardirqs_on();
+
+ this_cpu_write(tracing_irq_cpu, 0);
+}
+EXPORT_SYMBOL(trace_hardirqs_on);
+
+void trace_hardirqs_off(void)
+{
+ if (this_cpu_read(tracing_irq_cpu))
+ return;
+
+ this_cpu_write(tracing_irq_cpu, 1);
+
+ trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+ tracer_hardirqs_off();
+}
+EXPORT_SYMBOL(trace_hardirqs_off);
+
+__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
+{
+ if (!this_cpu_read(tracing_irq_cpu))
+ return;
+
+ trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr);
+ tracer_hardirqs_on_caller(caller_addr);
+
+ this_cpu_write(tracing_irq_cpu, 0);
+}
+EXPORT_SYMBOL(trace_hardirqs_on_caller);
+
+__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
+{
+ if (this_cpu_read(tracing_irq_cpu))
+ return;
+
+ this_cpu_write(tracing_irq_cpu, 1);
+
+ trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr);
+ tracer_hardirqs_off_caller(caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_off_caller);
+
+/*
+ * Stubs:
+ */
+
+void trace_softirqs_on(unsigned long ip)
+{
+}
+
+void trace_softirqs_off(unsigned long ip)
+{
+}
+
+inline void print_irqtrace_events(struct task_struct *curr)
+{
+}
+#endif
+
+#if defined(CONFIG_PREEMPT_TRACER) || \
+ (defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPTIRQ_EVENTS))
+void trace_preempt_on(unsigned long a0, unsigned long a1)
+{
+ trace_preempt_enable_rcuidle(a0, a1);
+ tracer_preempt_on(a0, a1);
+}
+
+void trace_preempt_off(unsigned long a0, unsigned long a1)
+{
+ trace_preempt_disable_rcuidle(a0, a1);
+ tracer_preempt_off(a0, a1);
+}
+#endif
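
The new trace_hardirqs_on/off() wrappers use the per-cpu tracing_irq_cpu flag so the irq_enable/irq_disable tracepoints and the latency tracer hooks fire only on genuine state transitions, not on every redundant call. A userspace sketch of that edge-triggered pattern, with a thread-local flag in place of the per-cpu variable and emit() standing in for the tracepoints:

#include <stdio.h>

/* Per-CPU in the kernel; per-thread here. 1 means "IRQs traced as off". */
static _Thread_local int tracing_irq_off;

static void emit(const char *what) { printf("%s\n", what); }

void my_hardirqs_off(void)
{
	if (tracing_irq_off)	/* already recorded as off: nothing to do */
		return;
	tracing_irq_off = 1;
	emit("irq_disable event");
}

void my_hardirqs_on(void)
{
	if (!tracing_irq_off)	/* never recorded as off: nothing to do */
		return;
	emit("irq_enable event");
	tracing_irq_off = 0;
}

int main(void)
{
	my_hardirqs_off();	/* emits irq_disable */
	my_hardirqs_off();	/* redundant: suppressed */
	my_hardirqs_on();	/* emits irq_enable */
	my_hardirqs_on();	/* redundant: suppressed */
	return 0;
}
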
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 8a907e12b6b9..492700c5fb4d 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -907,8 +907,8 @@ static int probes_open(struct inode *inode, struct file *file)
static ssize_t probes_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
- return traceprobe_probes_write(file, buffer, count, ppos,
- create_trace_kprobe);
+ return trace_parse_run_command(file, buffer, count, ppos,
+ create_trace_kprobe);
}
static const struct file_operations kprobe_events_ops = {
@@ -1174,13 +1174,12 @@ static void
kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
{
struct trace_event_call *call = &tk->tp.call;
- struct bpf_prog *prog = call->prog;
struct kprobe_trace_entry_head *entry;
struct hlist_head *head;
int size, __size, dsize;
int rctx;
- if (prog && !trace_call_bpf(prog, regs))
+ if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs))
return;
head = this_cpu_ptr(call->perf_events);
@@ -1200,7 +1199,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
memset(&entry[1], 0, dsize);
store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
- head, NULL, NULL);
+ head, NULL);
}
NOKPROBE_SYMBOL(kprobe_perf_func);
@@ -1210,13 +1209,12 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
struct pt_regs *regs)
{
struct trace_event_call *call = &tk->tp.call;
- struct bpf_prog *prog = call->prog;
struct kretprobe_trace_entry_head *entry;
struct hlist_head *head;
int size, __size, dsize;
int rctx;
- if (prog && !trace_call_bpf(prog, regs))
+ if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs))
return;
head = this_cpu_ptr(call->perf_events);
@@ -1236,7 +1234,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
entry->ret_ip = (unsigned long)ri->ret_addr;
store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
- head, NULL, NULL);
+ head, NULL);
}
NOKPROBE_SYMBOL(kretprobe_perf_func);
#endif /* CONFIG_PERF_EVENTS */
@@ -1433,9 +1431,9 @@ static __init int kprobe_trace_self_tests_init(void)
pr_info("Testing kprobe tracing: ");
- ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target "
- "$stack $stack0 +0($stack)",
- create_trace_kprobe);
+ ret = trace_run_command("p:testprobe kprobe_trace_selftest_target "
+ "$stack $stack0 +0($stack)",
+ create_trace_kprobe);
if (WARN_ON_ONCE(ret)) {
pr_warn("error on probing function entry.\n");
warn++;
@@ -1455,8 +1453,8 @@ static __init int kprobe_trace_self_tests_init(void)
}
}
- ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target "
- "$retval", create_trace_kprobe);
+ ret = trace_run_command("r:testprobe2 kprobe_trace_selftest_target "
+ "$retval", create_trace_kprobe);
if (WARN_ON_ONCE(ret)) {
pr_warn("error on probing function return.\n");
warn++;
@@ -1526,13 +1524,13 @@ static __init int kprobe_trace_self_tests_init(void)
disable_trace_kprobe(tk, file);
}
- ret = traceprobe_command("-:testprobe", create_trace_kprobe);
+ ret = trace_run_command("-:testprobe", create_trace_kprobe);
if (WARN_ON_ONCE(ret)) {
pr_warn("error on deleting a probe.\n");
warn++;
}
- ret = traceprobe_command("-:testprobe2", create_trace_kprobe);
+ ret = trace_run_command("-:testprobe2", create_trace_kprobe);
if (WARN_ON_ONCE(ret)) {
pr_warn("error on deleting a probe.\n");
warn++;
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 52478f033f88..d59357308677 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -623,92 +623,6 @@ void traceprobe_free_probe_arg(struct probe_arg *arg)
kfree(arg->comm);
}
-int traceprobe_command(const char *buf, int (*createfn)(int, char **))
-{
- char **argv;
- int argc, ret;
-
- argc = 0;
- ret = 0;
- argv = argv_split(GFP_KERNEL, buf, &argc);
- if (!argv)
- return -ENOMEM;
-
- if (argc)
- ret = createfn(argc, argv);
-
- argv_free(argv);
-
- return ret;
-}
-
-#define WRITE_BUFSIZE 4096
-
-ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer,
- size_t count, loff_t *ppos,
- int (*createfn)(int, char **))
-{
- char *kbuf, *buf, *tmp;
- int ret = 0;
- size_t done = 0;
- size_t size;
-
- kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
- if (!kbuf)
- return -ENOMEM;
-
- while (done < count) {
- size = count - done;
-
- if (size >= WRITE_BUFSIZE)
- size = WRITE_BUFSIZE - 1;
-
- if (copy_from_user(kbuf, buffer + done, size)) {
- ret = -EFAULT;
- goto out;
- }
- kbuf[size] = '\0';
- buf = kbuf;
- do {
- tmp = strchr(buf, '\n');
- if (tmp) {
- *tmp = '\0';
- size = tmp - buf + 1;
- } else {
- size = strlen(buf);
- if (done + size < count) {
- if (buf != kbuf)
- break;
- /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */
- pr_warn("Line length is too long: Should be less than %d\n",
- WRITE_BUFSIZE - 2);
- ret = -EINVAL;
- goto out;
- }
- }
- done += size;
-
- /* Remove comments */
- tmp = strchr(buf, '#');
-
- if (tmp)
- *tmp = '\0';
-
- ret = traceprobe_command(buf, createfn);
- if (ret)
- goto out;
- buf += size;
-
- } while (done < count);
- }
- ret = done;
-
-out:
- kfree(kbuf);
-
- return ret;
-}
-
static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
bool is_return)
{
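
The helpers removed above (traceprobe_command() and traceprobe_probes_write()) are not dropped outright; the call sites in trace_kprobe.c and trace_uprobe.c now go through the generic trace_run_command()/trace_parse_run_command() equivalents. A minimal userspace sketch of what the command helper does — split one command line into argv[] and hand it to a createfn(argc, argv) callback — assuming a strtok()-based stand-in for the kernel's argv_split():

/*
 * Userspace sketch only: split a probe command into argv[] and dispatch
 * it to a createfn() callback, as the removed traceprobe_command() did.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int create_probe(int argc, char **argv)
{
	printf("would create probe from %d args, starting with '%s'\n",
	       argc, argv[0]);
	return 0;
}

static int run_command(const char *buf, int (*createfn)(int, char **))
{
	char *copy = strdup(buf);
	char *argv[32];
	int argc = 0, ret = 0;

	if (!copy)
		return -1;   /* -ENOMEM in the kernel version */

	for (char *tok = strtok(copy, " \t"); tok && argc < 32;
	     tok = strtok(NULL, " \t"))
		argv[argc++] = tok;

	if (argc)
		ret = createfn(argc, argv);

	free(copy);
	return ret;
}

int main(void)
{
	return run_command("p:testprobe kprobe_trace_selftest_target $stack",
			   create_probe);
}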
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 903273c93e61..fb66e3eaa192 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -42,7 +42,6 @@
#define MAX_TRACE_ARGS 128
#define MAX_ARGSTR_LEN 63
-#define MAX_EVENT_NAME_LEN 64
#define MAX_STRING_SIZE PATH_MAX
/* Reserved field names */
@@ -356,12 +355,6 @@ extern void traceprobe_free_probe_arg(struct probe_arg *arg);
extern int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset);
-extern ssize_t traceprobe_probes_write(struct file *file,
- const char __user *buffer, size_t count, loff_t *ppos,
- int (*createfn)(int, char**));
-
-extern int traceprobe_command(const char *buf, int (*createfn)(int, char**));
-
/* Sum up total data length for dynamic arrays (strings) */
static nokprobe_inline int
__get_data_size(struct trace_probe *tp, struct pt_regs *regs)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index cd70eb5df38e..11e9daa4a568 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -60,7 +60,7 @@ static int trace_test_buffer_cpu(struct trace_buffer *buf, int cpu)
* Test the trace buffer to see if all the elements
* are still sane.
*/
-static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count)
+static int __maybe_unused trace_test_buffer(struct trace_buffer *buf, unsigned long *count)
{
unsigned long flags, cnt = 0;
int cpu, ret = 0;
@@ -1151,38 +1151,6 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
}
#endif /* CONFIG_SCHED_TRACER */
-#ifdef CONFIG_CONTEXT_SWITCH_TRACER
-int
-trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr)
-{
- unsigned long count;
- int ret;
-
- /* start the tracing */
- ret = tracer_init(trace, tr);
- if (ret) {
- warn_failed_init_tracer(trace, ret);
- return ret;
- }
-
- /* Sleep for a 1/10 of a second */
- msleep(100);
- /* stop the tracing. */
- tracing_stop();
- /* check the trace buffer */
- ret = trace_test_buffer(&tr->trace_buffer, &count);
- trace->reset(tr);
- tracing_start();
-
- if (!ret && !count) {
- printk(KERN_CONT ".. no entries found ..");
- ret = -1;
- }
-
- return ret;
-}
-#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
-
#ifdef CONFIG_BRANCH_TRACER
int
trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index a2a642f2c64f..f93a56d2db27 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -560,9 +560,10 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit;
-static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs,
- struct syscall_metadata *sys_data,
- struct syscall_trace_enter *rec) {
+static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *regs,
+ struct syscall_metadata *sys_data,
+ struct syscall_trace_enter *rec)
+{
struct syscall_tp_t {
unsigned long long regs;
unsigned long syscall_nr;
@@ -574,7 +575,7 @@ static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs,
param.syscall_nr = rec->nr;
for (i = 0; i < sys_data->nb_args; i++)
param.args[i] = rec->args[i];
- return trace_call_bpf(prog, &param);
+ return trace_call_bpf(call, &param);
}
static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
@@ -582,7 +583,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
struct syscall_metadata *sys_data;
struct syscall_trace_enter *rec;
struct hlist_head *head;
- struct bpf_prog *prog;
+ bool valid_prog_array;
int syscall_nr;
int rctx;
int size;
@@ -597,9 +598,9 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
if (!sys_data)
return;
- prog = READ_ONCE(sys_data->enter_event->prog);
head = this_cpu_ptr(sys_data->enter_event->perf_events);
- if (!prog && hlist_empty(head))
+ valid_prog_array = bpf_prog_array_valid(sys_data->enter_event);
+ if (!valid_prog_array && hlist_empty(head))
return;
/* get the size after alignment with the u32 buffer size field */
@@ -615,7 +616,8 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
syscall_get_arguments(current, regs, 0, sys_data->nb_args,
(unsigned long *)&rec->args);
- if ((prog && !perf_call_bpf_enter(prog, regs, sys_data, rec)) ||
+ if ((valid_prog_array &&
+ !perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) ||
hlist_empty(head)) {
perf_swevent_put_recursion_context(rctx);
return;
@@ -623,7 +625,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
perf_trace_buf_submit(rec, size, rctx,
sys_data->enter_event->event.type, 1, regs,
- head, NULL, NULL);
+ head, NULL);
}
static int perf_sysenter_enable(struct trace_event_call *call)
@@ -660,8 +662,9 @@ static void perf_sysenter_disable(struct trace_event_call *call)
mutex_unlock(&syscall_trace_lock);
}
-static int perf_call_bpf_exit(struct bpf_prog *prog, struct pt_regs *regs,
- struct syscall_trace_exit *rec) {
+static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs,
+ struct syscall_trace_exit *rec)
+{
struct syscall_tp_t {
unsigned long long regs;
unsigned long syscall_nr;
@@ -671,7 +674,7 @@ static int perf_call_bpf_exit(struct bpf_prog *prog, struct pt_regs *regs,
*(struct pt_regs **)&param = regs;
param.syscall_nr = rec->nr;
param.ret = rec->ret;
- return trace_call_bpf(prog, &param);
+ return trace_call_bpf(call, &param);
}
static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
@@ -679,7 +682,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
struct syscall_metadata *sys_data;
struct syscall_trace_exit *rec;
struct hlist_head *head;
- struct bpf_prog *prog;
+ bool valid_prog_array;
int syscall_nr;
int rctx;
int size;
@@ -694,9 +697,9 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
if (!sys_data)
return;
- prog = READ_ONCE(sys_data->exit_event->prog);
head = this_cpu_ptr(sys_data->exit_event->perf_events);
- if (!prog && hlist_empty(head))
+ valid_prog_array = bpf_prog_array_valid(sys_data->exit_event);
+ if (!valid_prog_array && hlist_empty(head))
return;
/* We can probably do that at build time */
@@ -710,14 +713,15 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
rec->nr = syscall_nr;
rec->ret = syscall_get_return_value(current, regs);
- if ((prog && !perf_call_bpf_exit(prog, regs, rec)) ||
+ if ((valid_prog_array &&
+ !perf_call_bpf_exit(sys_data->exit_event, regs, rec)) ||
hlist_empty(head)) {
perf_swevent_put_recursion_context(rctx);
return;
}
perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
- 1, regs, head, NULL, NULL);
+ 1, regs, head, NULL);
}
static int perf_sysexit_enable(struct trace_event_call *call)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 4525e0271a53..40592e7b3568 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -651,7 +651,7 @@ static int probes_open(struct inode *inode, struct file *file)
static ssize_t probes_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
- return traceprobe_probes_write(file, buffer, count, ppos, create_trace_uprobe);
+ return trace_parse_run_command(file, buffer, count, ppos, create_trace_uprobe);
}
static const struct file_operations uprobe_events_ops = {
@@ -1113,13 +1113,12 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
{
struct trace_event_call *call = &tu->tp.call;
struct uprobe_trace_entry_head *entry;
- struct bpf_prog *prog = call->prog;
struct hlist_head *head;
void *data;
int size, esize;
int rctx;
- if (prog && !trace_call_bpf(prog, regs))
+ if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs))
return;
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
@@ -1156,7 +1155,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
}
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
- head, NULL, NULL);
+ head, NULL);
out:
preempt_enable();
}
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index 305039b122fa..07e75344725b 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -428,7 +428,8 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
if (test_key && test_key == key_hash && entry->val &&
keys_match(key, entry->val->key, map->key_size)) {
- atomic64_inc(&map->hits);
+ if (!lookup_only)
+ atomic64_inc(&map->hits);
return entry->val;
}
diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h
index ab0ca77331d0..5b5bbf8ae550 100644
--- a/kernel/trace/tracing_map.h
+++ b/kernel/trace/tracing_map.h
@@ -6,7 +6,7 @@
#define TRACING_MAP_BITS_MAX 17
#define TRACING_MAP_BITS_MIN 7
-#define TRACING_MAP_KEYS_MAX 2
+#define TRACING_MAP_KEYS_MAX 3
#define TRACING_MAP_VALS_MAX 3
#define TRACING_MAP_FIELDS_MAX (TRACING_MAP_KEYS_MAX + \
TRACING_MAP_VALS_MAX)
diff --git a/kernel/umh.c b/kernel/umh.c
index 6ff9905250ff..18e5fa4b0e71 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -537,14 +537,14 @@ static int proc_cap_handler(struct ctl_table *table, int write,
/*
* Drop everything not in the new_cap (but don't add things)
*/
- spin_lock(&umh_sysctl_lock);
if (write) {
+ spin_lock(&umh_sysctl_lock);
if (table->data == CAP_BSET)
usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap);
if (table->data == CAP_PI)
usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap);
+ spin_unlock(&umh_sysctl_lock);
}
- spin_unlock(&umh_sysctl_lock);
return 0;
}
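
The umh.c hunk above narrows the critical section: the spinlock is now taken only on the write path that actually updates the capability sets, not around the read-only case as well. A minimal pthread-based sketch of the same idea (names and data are illustrative, not the kernel code):

/*
 * Sketch of the locking change above: take the lock only on the path
 * that modifies shared state. A pthread mutex stands in for the
 * kernel spinlock.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t cap_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long bset = ~0UL;

static int cap_handler(int write, unsigned long new_cap)
{
	if (write) {
		pthread_mutex_lock(&cap_lock);
		bset &= new_cap;            /* drop bits, never add them */
		pthread_mutex_unlock(&cap_lock);
	}
	return 0;
}

int main(void)
{
	cap_handler(1, 0x0f);
	printf("bset now %#lx\n", bset);
	return 0;
}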
diff --git a/kernel/user.c b/kernel/user.c
index 00281add65b2..9a20acce460d 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -26,26 +26,32 @@
struct user_namespace init_user_ns = {
.uid_map = {
.nr_extents = 1,
- .extent[0] = {
- .first = 0,
- .lower_first = 0,
- .count = 4294967295U,
+ {
+ .extent[0] = {
+ .first = 0,
+ .lower_first = 0,
+ .count = 4294967295U,
+ },
},
},
.gid_map = {
.nr_extents = 1,
- .extent[0] = {
- .first = 0,
- .lower_first = 0,
- .count = 4294967295U,
+ {
+ .extent[0] = {
+ .first = 0,
+ .lower_first = 0,
+ .count = 4294967295U,
+ },
},
},
.projid_map = {
.nr_extents = 1,
- .extent[0] = {
- .first = 0,
- .lower_first = 0,
- .count = 4294967295U,
+ {
+ .extent[0] = {
+ .first = 0,
+ .lower_first = 0,
+ .count = 4294967295U,
+ },
},
},
.count = ATOMIC_INIT(3),
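
The extra level of braces added to the init_user_ns initializers above reflects a layout change in struct uid_gid_map: the inline extent[] array now sits inside an anonymous union next to the forward/reverse pointers used for large maps, so a designated initializer has to open that inner aggregate first. A simplified, compilable stand-in (field names and the MAX_BASE_EXTENTS value are illustrative, not the kernel definitions):

/*
 * Simplified stand-in for the new uid_gid_map layout; requires C11
 * anonymous struct/union members.
 */
#include <stdio.h>

#define MAX_BASE_EXTENTS 5

struct extent { unsigned first, lower_first, count; };

struct id_map {
	unsigned nr_extents;
	union {
		struct extent extent[MAX_BASE_EXTENTS];
		struct {
			struct extent *forward;
			struct extent *reverse;
		};
	};
};

static struct id_map uid_map = {
	.nr_extents = 1,
	{                                 /* inner anonymous union */
		.extent[0] = {
			.first = 0,
			.lower_first = 0,
			.count = 4294967295U,
		},
	},
};

int main(void)
{
	printf("extent 0 covers %u ids\n", uid_map.extent[0].count);
	return 0;
}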
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index d32b45662fb6..246d4d4ce5c7 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -23,6 +23,8 @@
#include <linux/ctype.h>
#include <linux/projid.h>
#include <linux/fs_struct.h>
+#include <linux/bsearch.h>
+#include <linux/sort.h>
static struct kmem_cache *user_ns_cachep __read_mostly;
static DEFINE_MUTEX(userns_state_mutex);
@@ -181,6 +183,18 @@ static void free_user_ns(struct work_struct *work)
do {
struct ucounts *ucounts = ns->ucounts;
parent = ns->parent;
+ if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
+ kfree(ns->gid_map.forward);
+ kfree(ns->gid_map.reverse);
+ }
+ if (ns->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
+ kfree(ns->uid_map.forward);
+ kfree(ns->uid_map.reverse);
+ }
+ if (ns->projid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
+ kfree(ns->projid_map.forward);
+ kfree(ns->projid_map.reverse);
+ }
retire_userns_sysctls(ns);
#ifdef CONFIG_PERSISTENT_KEYRINGS
key_put(ns->persistent_keyring_register);
@@ -198,26 +212,101 @@ void __put_user_ns(struct user_namespace *ns)
}
EXPORT_SYMBOL(__put_user_ns);
-static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
+/**
+ * idmap_key struct holds the information necessary to find an idmapping in a
+ * sorted idmap array. It is passed to cmp_map_id() as the first argument.
+ */
+struct idmap_key {
+ bool map_up; /* true -> id from kid; false -> kid from id */
+ u32 id; /* id to find */
+ u32 count; /* == 0 unless used with map_id_range_down() */
+};
+
+/**
+ * cmp_map_id - Function to be passed to bsearch() to find the requested
+ * idmapping. Expects struct idmap_key to be passed via @k.
+ */
+static int cmp_map_id(const void *k, const void *e)
{
- unsigned idx, extents;
+ u32 first, last, id2;
+ const struct idmap_key *key = k;
+ const struct uid_gid_extent *el = e;
+
+ id2 = key->id + key->count - 1;
+
+ /* handle map_id_{down,up}() */
+ if (key->map_up)
+ first = el->lower_first;
+ else
+ first = el->first;
+
+ last = first + el->count - 1;
+
+ if (key->id >= first && key->id <= last &&
+ (id2 >= first && id2 <= last))
+ return 0;
+
+ if (key->id < first || id2 < first)
+ return -1;
+
+ return 1;
+}
+
+/**
+ * map_id_range_down_max - Find idmap via binary search in ordered idmap array.
+ * Can only be called if the number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
+ */
+static struct uid_gid_extent *
+map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count)
+{
+ struct idmap_key key;
+
+ key.map_up = false;
+ key.count = count;
+ key.id = id;
+
+ return bsearch(&key, map->forward, extents,
+ sizeof(struct uid_gid_extent), cmp_map_id);
+}
+
+/**
+ * map_id_range_down_base - Find idmap via linear search in the static extent array.
+ * Can only be called if the number of mappings is less than or equal to
+ * UID_GID_MAP_MAX_BASE_EXTENTS.
+ */
+static struct uid_gid_extent *
+map_id_range_down_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count)
+{
+ unsigned idx;
u32 first, last, id2;
id2 = id + count - 1;
/* Find the matching extent */
- extents = map->nr_extents;
- smp_rmb();
for (idx = 0; idx < extents; idx++) {
first = map->extent[idx].first;
last = first + map->extent[idx].count - 1;
if (id >= first && id <= last &&
(id2 >= first && id2 <= last))
- break;
+ return &map->extent[idx];
}
+ return NULL;
+}
+
+static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
+{
+ struct uid_gid_extent *extent;
+ unsigned extents = map->nr_extents;
+ smp_rmb();
+
+ if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+ extent = map_id_range_down_base(extents, map, id, count);
+ else
+ extent = map_id_range_down_max(extents, map, id, count);
+
/* Map the id or note failure */
- if (idx < extents)
- id = (id - first) + map->extent[idx].lower_first;
+ if (extent)
+ id = (id - extent->first) + extent->lower_first;
else
id = (u32) -1;
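
The hunk above switches large idmaps to a bsearch() over an array kept sorted by extent->first, with cmp_map_id() treating the key as a whole [id, id + count - 1] range. A self-contained userspace sketch of that lookup (the extents, values, and names below are illustrative):

/*
 * Userspace sketch of the bsearch()-based range lookup added above.
 */
#include <stdio.h>
#include <stdlib.h>

struct extent { unsigned first, lower_first, count; };

struct key { unsigned id, count; };

static int cmp_extent(const void *k, const void *e)
{
	const struct key *key = k;
	const struct extent *el = e;
	unsigned last = el->first + el->count - 1;
	unsigned id2 = key->id + key->count - 1;

	if (key->id >= el->first && id2 <= last)
		return 0;                        /* range fully inside extent */
	if (key->id < el->first)
		return -1;
	return 1;
}

int main(void)
{
	/* Must stay sorted by .first for bsearch() to work. */
	struct extent map[] = {
		{ .first = 0,      .lower_first = 100000, .count = 1000 },
		{ .first = 2000,   .lower_first = 200000, .count = 500  },
		{ .first = 100000, .lower_first = 0,      .count = 65536 },
	};
	struct key key = { .id = 2100, .count = 10 };
	struct extent *e = bsearch(&key, map, sizeof(map) / sizeof(map[0]),
				   sizeof(map[0]), cmp_extent);

	if (e)
		printf("id %u maps down to %u\n", key.id,
		       key.id - e->first + e->lower_first);
	else
		printf("no mapping\n");
	return 0;
}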
@@ -226,44 +315,61 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
static u32 map_id_down(struct uid_gid_map *map, u32 id)
{
- unsigned idx, extents;
+ return map_id_range_down(map, id, 1);
+}
+
+/**
+ * map_id_up_base - Find idmap via linear search in the static extent array.
+ * Can only be called if the number of mappings is less than or equal to
+ * UID_GID_MAP_MAX_BASE_EXTENTS.
+ */
+static struct uid_gid_extent *
+map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id)
+{
+ unsigned idx;
u32 first, last;
/* Find the matching extent */
- extents = map->nr_extents;
- smp_rmb();
for (idx = 0; idx < extents; idx++) {
- first = map->extent[idx].first;
+ first = map->extent[idx].lower_first;
last = first + map->extent[idx].count - 1;
if (id >= first && id <= last)
- break;
+ return &map->extent[idx];
}
- /* Map the id or note failure */
- if (idx < extents)
- id = (id - first) + map->extent[idx].lower_first;
- else
- id = (u32) -1;
+ return NULL;
+}
- return id;
+/**
+ * map_id_up_max - Find idmap via binary search in ordered idmap array.
+ * Can only be called if the number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
+ */
+static struct uid_gid_extent *
+map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id)
+{
+ struct idmap_key key;
+
+ key.map_up = true;
+ key.count = 1;
+ key.id = id;
+
+ return bsearch(&key, map->reverse, extents,
+ sizeof(struct uid_gid_extent), cmp_map_id);
}
static u32 map_id_up(struct uid_gid_map *map, u32 id)
{
- unsigned idx, extents;
- u32 first, last;
-
- /* Find the matching extent */
- extents = map->nr_extents;
+ struct uid_gid_extent *extent;
+ unsigned extents = map->nr_extents;
smp_rmb();
- for (idx = 0; idx < extents; idx++) {
- first = map->extent[idx].lower_first;
- last = first + map->extent[idx].count - 1;
- if (id >= first && id <= last)
- break;
- }
+
+ if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+ extent = map_id_up_base(extents, map, id);
+ else
+ extent = map_id_up_max(extents, map, id);
+
/* Map the id or note failure */
- if (idx < extents)
- id = (id - first) + map->extent[idx].first;
+ if (extent)
+ id = (id - extent->lower_first) + extent->first;
else
id = (u32) -1;
@@ -540,13 +646,17 @@ static int projid_m_show(struct seq_file *seq, void *v)
static void *m_start(struct seq_file *seq, loff_t *ppos,
struct uid_gid_map *map)
{
- struct uid_gid_extent *extent = NULL;
loff_t pos = *ppos;
+ unsigned extents = map->nr_extents;
+ smp_rmb();
- if (pos < map->nr_extents)
- extent = &map->extent[pos];
+ if (pos >= extents)
+ return NULL;
- return extent;
+ if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+ return &map->extent[pos];
+
+ return &map->forward[pos];
}
static void *uid_m_start(struct seq_file *seq, loff_t *ppos)
@@ -618,7 +728,10 @@ static bool mappings_overlap(struct uid_gid_map *new_map,
u32 prev_upper_last, prev_lower_last;
struct uid_gid_extent *prev;
- prev = &new_map->extent[idx];
+ if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+ prev = &new_map->extent[idx];
+ else
+ prev = &new_map->forward[idx];
prev_upper_first = prev->first;
prev_lower_first = prev->lower_first;
@@ -638,6 +751,101 @@ static bool mappings_overlap(struct uid_gid_map *new_map,
return false;
}
+/**
+ * insert_extent - Safely insert a new idmap extent into struct uid_gid_map.
+ * Takes care to allocate a 4K block of memory if the number of mappings exceeds
+ * UID_GID_MAP_MAX_BASE_EXTENTS.
+ */
+static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent)
+{
+ struct uid_gid_extent *dest;
+
+ if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) {
+ struct uid_gid_extent *forward;
+
+ /* Allocate memory for 340 mappings. */
+ forward = kmalloc(sizeof(struct uid_gid_extent) *
+ UID_GID_MAP_MAX_EXTENTS, GFP_KERNEL);
+ if (!forward)
+ return -ENOMEM;
+
+ /* Copy over memory. Only set up memory for the forward pointer.
+ * Defer the memory setup for the reverse pointer.
+ */
+ memcpy(forward, map->extent,
+ map->nr_extents * sizeof(map->extent[0]));
+
+ map->forward = forward;
+ map->reverse = NULL;
+ }
+
+ if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS)
+ dest = &map->extent[map->nr_extents];
+ else
+ dest = &map->forward[map->nr_extents];
+
+ *dest = *extent;
+ map->nr_extents++;
+ return 0;
+}
+
+/* cmp function to sort() forward mappings */
+static int cmp_extents_forward(const void *a, const void *b)
+{
+ const struct uid_gid_extent *e1 = a;
+ const struct uid_gid_extent *e2 = b;
+
+ if (e1->first < e2->first)
+ return -1;
+
+ if (e1->first > e2->first)
+ return 1;
+
+ return 0;
+}
+
+/* cmp function to sort() reverse mappings */
+static int cmp_extents_reverse(const void *a, const void *b)
+{
+ const struct uid_gid_extent *e1 = a;
+ const struct uid_gid_extent *e2 = b;
+
+ if (e1->lower_first < e2->lower_first)
+ return -1;
+
+ if (e1->lower_first > e2->lower_first)
+ return 1;
+
+ return 0;
+}
+
+/**
+ * sort_idmaps - Sorts an array of idmap entries.
+ * Does nothing unless the number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
+ */
+static int sort_idmaps(struct uid_gid_map *map)
+{
+ if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+ return 0;
+
+ /* Sort forward array. */
+ sort(map->forward, map->nr_extents, sizeof(struct uid_gid_extent),
+ cmp_extents_forward, NULL);
+
+ /* Only copy the memory from forward that we actually need. */
+ map->reverse = kmemdup(map->forward,
+ map->nr_extents * sizeof(struct uid_gid_extent),
+ GFP_KERNEL);
+ if (!map->reverse)
+ return -ENOMEM;
+
+ /* Sort reverse array. */
+ sort(map->reverse, map->nr_extents, sizeof(struct uid_gid_extent),
+ cmp_extents_reverse, NULL);
+
+ return 0;
+}
+
static ssize_t map_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos,
int cap_setid,
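
insert_extent() and sort_idmaps() above implement the "spill to a heap array once the inline extents are full" scheme: the forward copy is sorted by ->first for mapping ids down, and a second reverse copy sorted by ->lower_first serves map_id_up(). A userspace analogue using qsort() in place of the kernel's sort() (data and names are illustrative):

/*
 * Userspace sketch of sort_idmaps(): keep a forward copy sorted by
 * .first and a reverse copy sorted by .lower_first so both lookup
 * directions can binary-search.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct extent { unsigned first, lower_first, count; };

static int cmp_forward(const void *a, const void *b)
{
	const struct extent *e1 = a, *e2 = b;
	return (e1->first > e2->first) - (e1->first < e2->first);
}

static int cmp_reverse(const void *a, const void *b)
{
	const struct extent *e1 = a, *e2 = b;
	return (e1->lower_first > e2->lower_first) - (e1->lower_first < e2->lower_first);
}

int main(void)
{
	struct extent forward[] = {
		{ .first = 1000, .lower_first = 0,    .count = 100 },
		{ .first = 0,    .lower_first = 5000, .count = 100 },
	};
	size_t n = sizeof(forward) / sizeof(forward[0]);
	struct extent *reverse = malloc(n * sizeof(*reverse));

	if (!reverse)
		return 1;
	memcpy(reverse, forward, n * sizeof(*reverse));

	qsort(forward, n, sizeof(*forward), cmp_forward);
	qsort(reverse, n, sizeof(*reverse), cmp_reverse);

	printf("forward starts at id %u, reverse starts at lower id %u\n",
	       forward[0].first, reverse[0].lower_first);
	free(reverse);
	return 0;
}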
@@ -648,7 +856,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
struct user_namespace *ns = seq->private;
struct uid_gid_map new_map;
unsigned idx;
- struct uid_gid_extent *extent = NULL;
+ struct uid_gid_extent extent;
char *kbuf = NULL, *pos, *next_line;
ssize_t ret = -EINVAL;
@@ -673,6 +881,8 @@ static ssize_t map_write(struct file *file, const char __user *buf,
*/
mutex_lock(&userns_state_mutex);
+ memset(&new_map, 0, sizeof(struct uid_gid_map));
+
ret = -EPERM;
/* Only allow one successful write to the map */
if (map->nr_extents != 0)
@@ -700,9 +910,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
/* Parse the user data */
ret = -EINVAL;
pos = kbuf;
- new_map.nr_extents = 0;
for (; pos; pos = next_line) {
- extent = &new_map.extent[new_map.nr_extents];
/* Find the end of line and ensure I don't look past it */
next_line = strchr(pos, '\n');
@@ -714,17 +922,17 @@ static ssize_t map_write(struct file *file, const char __user *buf,
}
pos = skip_spaces(pos);
- extent->first = simple_strtoul(pos, &pos, 10);
+ extent.first = simple_strtoul(pos, &pos, 10);
if (!isspace(*pos))
goto out;
pos = skip_spaces(pos);
- extent->lower_first = simple_strtoul(pos, &pos, 10);
+ extent.lower_first = simple_strtoul(pos, &pos, 10);
if (!isspace(*pos))
goto out;
pos = skip_spaces(pos);
- extent->count = simple_strtoul(pos, &pos, 10);
+ extent.count = simple_strtoul(pos, &pos, 10);
if (*pos && !isspace(*pos))
goto out;
@@ -734,29 +942,31 @@ static ssize_t map_write(struct file *file, const char __user *buf,
goto out;
/* Verify we have been given valid starting values */
- if ((extent->first == (u32) -1) ||
- (extent->lower_first == (u32) -1))
+ if ((extent.first == (u32) -1) ||
+ (extent.lower_first == (u32) -1))
goto out;
/* Verify count is not zero and does not cause the
* extent to wrap
*/
- if ((extent->first + extent->count) <= extent->first)
+ if ((extent.first + extent.count) <= extent.first)
goto out;
- if ((extent->lower_first + extent->count) <=
- extent->lower_first)
+ if ((extent.lower_first + extent.count) <=
+ extent.lower_first)
goto out;
/* Do the ranges in extent overlap any previous extents? */
- if (mappings_overlap(&new_map, extent))
+ if (mappings_overlap(&new_map, &extent))
goto out;
- new_map.nr_extents++;
-
- /* Fail if the file contains too many extents */
- if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) &&
+ if ((new_map.nr_extents + 1) == UID_GID_MAP_MAX_EXTENTS &&
(next_line != NULL))
goto out;
+
+ ret = insert_extent(&new_map, &extent);
+ if (ret < 0)
+ goto out;
+ ret = -EINVAL;
}
/* Be very certain the new map actually exists */
if (new_map.nr_extents == 0)
@@ -767,16 +977,26 @@ static ssize_t map_write(struct file *file, const char __user *buf,
if (!new_idmap_permitted(file, ns, cap_setid, &new_map))
goto out;
+ ret = sort_idmaps(&new_map);
+ if (ret < 0)
+ goto out;
+
+ ret = -EPERM;
/* Map the lower ids from the parent user namespace to the
* kernel global id space.
*/
for (idx = 0; idx < new_map.nr_extents; idx++) {
+ struct uid_gid_extent *e;
u32 lower_first;
- extent = &new_map.extent[idx];
+
+ if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+ e = &new_map.extent[idx];
+ else
+ e = &new_map.forward[idx];
lower_first = map_id_range_down(parent_map,
- extent->lower_first,
- extent->count);
+ e->lower_first,
+ e->count);
/* Fail if we can not map the specified extent to
* the kernel global id space.
@@ -784,18 +1004,31 @@ static ssize_t map_write(struct file *file, const char __user *buf,
if (lower_first == (u32) -1)
goto out;
- extent->lower_first = lower_first;
+ e->lower_first = lower_first;
}
/* Install the map */
- memcpy(map->extent, new_map.extent,
- new_map.nr_extents*sizeof(new_map.extent[0]));
+ if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) {
+ memcpy(map->extent, new_map.extent,
+ new_map.nr_extents * sizeof(new_map.extent[0]));
+ } else {
+ map->forward = new_map.forward;
+ map->reverse = new_map.reverse;
+ }
smp_wmb();
map->nr_extents = new_map.nr_extents;
*ppos = count;
ret = count;
out:
+ if (ret < 0 && new_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
+ kfree(new_map.forward);
+ kfree(new_map.reverse);
+ map->forward = NULL;
+ map->reverse = NULL;
+ map->nr_extents = 0;
+ }
+
mutex_unlock(&userns_state_mutex);
kfree(kbuf);
return ret;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 7368b57842ea..8fdb710bfdd7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1509,7 +1509,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
struct work_struct *work = &dwork->work;
WARN_ON_ONCE(!wq);
- WARN_ON_ONCE(timer->function != (TIMER_FUNC_TYPE)delayed_work_timer_fn);
+ WARN_ON_ONCE(timer->function != delayed_work_timer_fn);
WARN_ON_ONCE(timer_pending(timer));
WARN_ON_ONCE(!list_empty(&work->entry));
@@ -4990,9 +4990,10 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
*
* Unbound workqueues have the following extra attributes.
*
- * id RO int : the associated pool ID
+ * pool_ids RO int : the associated pool IDs for each node
* nice RW int : nice value of the workers
* cpumask RW mask : bitmask of allowed CPUs for the workers
+ * numa RW bool : whether to enable NUMA affinity
*/
struct wq_device {
struct workqueue_struct *wq;