summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/acct.c5
-rw-r--r--kernel/audit.c76
-rw-r--r--kernel/audit.h2
-rw-r--r--kernel/audit_tree.c3
-rw-r--r--kernel/auditfilter.c39
-rw-r--r--kernel/auditsc.c29
-rw-r--r--kernel/bounds.c1
-rw-r--r--kernel/bpf/Makefile4
-rw-r--r--kernel/bpf/arraymap.c10
-rw-r--r--kernel/bpf/cgroup.c570
-rw-r--r--kernel/bpf/core.c188
-rw-r--r--kernel/bpf/cpumap.c706
-rw-r--r--kernel/bpf/devmap.c21
-rw-r--r--kernel/bpf/disasm.c214
-rw-r--r--kernel/bpf/disasm.h32
-rw-r--r--kernel/bpf/hashtab.c9
-rw-r--r--kernel/bpf/inode.c16
-rw-r--r--kernel/bpf/lpm_trie.c98
-rw-r--r--kernel/bpf/offload.c191
-rw-r--r--kernel/bpf/percpu_freelist.c8
-rw-r--r--kernel/bpf/sockmap.c62
-rw-r--r--kernel/bpf/stackmap.c5
-rw-r--r--kernel/bpf/syscall.c321
-rw-r--r--kernel/bpf/verifier.c1530
-rw-r--r--kernel/capability.c1
-rw-r--r--kernel/cgroup/Makefile3
-rw-r--r--kernel/cgroup/cgroup-internal.h10
-rw-r--r--kernel/cgroup/cgroup.c203
-rw-r--r--kernel/cgroup/cpuset.c15
-rw-r--r--kernel/cgroup/debug.c1
-rw-r--r--kernel/cgroup/namespace.c1
-rw-r--r--kernel/cgroup/stat.c334
-rw-r--r--kernel/compat.c77
-rw-r--r--kernel/cpu.c513
-rw-r--r--kernel/crash_core.c3
-rw-r--r--kernel/dma.c1
-rw-r--r--kernel/elfcore.c1
-rw-r--r--kernel/events/Makefile1
-rw-r--r--kernel/events/core.c550
-rw-r--r--kernel/events/internal.h1
-rw-r--r--kernel/events/ring_buffer.c26
-rw-r--r--kernel/exec_domain.c1
-rw-r--r--kernel/exit.c31
-rw-r--r--kernel/extable.c47
-rw-r--r--kernel/fork.c48
-rw-r--r--kernel/futex.c66
-rw-r--r--kernel/futex_compat.c1
-rw-r--r--kernel/gcov/Makefile1
-rw-r--r--kernel/gcov/base.c1
-rw-r--r--kernel/gcov/fs.c1
-rw-r--r--kernel/gcov/gcc_3_4.c1
-rw-r--r--kernel/gcov/gcc_4_7.c1
-rw-r--r--kernel/gcov/gcov.h1
-rw-r--r--kernel/groups.c1
-rw-r--r--kernel/irq/Kconfig6
-rw-r--r--kernel/irq/Makefile2
-rw-r--r--kernel/irq/affinity.c1
-rw-r--r--kernel/irq/autoprobe.c3
-rw-r--r--kernel/irq/chip.c39
-rw-r--r--kernel/irq/cpuhotplug.c28
-rw-r--r--kernel/irq/debug.h1
-rw-r--r--kernel/irq/debugfs.c12
-rw-r--r--kernel/irq/generic-chip.c16
-rw-r--r--kernel/irq/internals.h20
-rw-r--r--kernel/irq/irqdesc.c10
-rw-r--r--kernel/irq/irqdomain.c64
-rw-r--r--kernel/irq/manage.c57
-rw-r--r--kernel/irq/matrix.c443
-rw-r--r--kernel/irq/migration.c1
-rw-r--r--kernel/irq/msi.c32
-rw-r--r--kernel/irq/proc.c6
-rw-r--r--kernel/irq/resend.c1
-rw-r--r--kernel/irq/settings.h1
-rw-r--r--kernel/irq/spurious.c7
-rw-r--r--kernel/irq/timings.c2
-rw-r--r--kernel/irq_work.c21
-rw-r--r--kernel/jump_label.c16
-rw-r--r--kernel/kallsyms.c86
-rw-r--r--kernel/kcmp.c3
-rw-r--r--kernel/kcov.c217
-rw-r--r--kernel/kexec_file.c5
-rw-r--r--kernel/kexec_internal.h1
-rw-r--r--kernel/kprobes.c18
-rw-r--r--kernel/kthread.c76
-rw-r--r--kernel/livepatch/Makefile2
-rw-r--r--kernel/livepatch/core.c112
-rw-r--r--kernel/livepatch/core.h41
-rw-r--r--kernel/livepatch/patch.c1
-rw-r--r--kernel/livepatch/patch.h1
-rw-r--r--kernel/livepatch/shadow.c277
-rw-r--r--kernel/livepatch/transition.c45
-rw-r--r--kernel/livepatch/transition.h1
-rw-r--r--kernel/locking/Makefile1
-rw-r--r--kernel/locking/lockdep.c74
-rw-r--r--kernel/locking/lockdep_internals.h1
-rw-r--r--kernel/locking/lockdep_proc.c1
-rw-r--r--kernel/locking/mcs_spinlock.h1
-rw-r--r--kernel/locking/mutex-debug.h1
-rw-r--r--kernel/locking/mutex.h1
-rw-r--r--kernel/locking/osq_lock.c1
-rw-r--r--kernel/locking/qrwlock.c86
-rw-r--r--kernel/locking/qspinlock_paravirt.h48
-rw-r--r--kernel/locking/rtmutex-debug.c1
-rw-r--r--kernel/locking/rtmutex-debug.h1
-rw-r--r--kernel/locking/rtmutex.h1
-rw-r--r--kernel/locking/rtmutex_common.h1
-rw-r--r--kernel/locking/rwsem-spinlock.c1
-rw-r--r--kernel/locking/rwsem-xadd.c28
-rw-r--r--kernel/locking/rwsem.c17
-rw-r--r--kernel/locking/rwsem.h1
-rw-r--r--kernel/locking/spinlock.c10
-rw-r--r--kernel/memremap.c4
-rw-r--r--kernel/module.c38
-rw-r--r--kernel/padata.c75
-rw-r--r--kernel/panic.c47
-rw-r--r--kernel/params.c35
-rw-r--r--kernel/pid.c247
-rw-r--r--kernel/pid_namespace.c59
-rw-r--r--kernel/power/Kconfig14
-rw-r--r--kernel/power/Makefile1
-rw-r--r--kernel/power/autosleep.c1
-rw-r--r--kernel/power/console.c1
-rw-r--r--kernel/power/power.h1
-rw-r--r--kernel/power/process.c1
-rw-r--r--kernel/power/qos.c4
-rw-r--r--kernel/power/snapshot.c39
-rw-r--r--kernel/power/suspend.c20
-rw-r--r--kernel/power/swap.c128
-rw-r--r--kernel/power/wakelock.c1
-rw-r--r--kernel/printk/braille.c1
-rw-r--r--kernel/printk/braille.h1
-rw-r--r--kernel/printk/console_cmdline.h1
-rw-r--r--kernel/printk/printk.c2
-rw-r--r--kernel/printk/printk_safe.c17
-rw-r--r--kernel/range.c1
-rw-r--r--kernel/rcu/Makefile1
-rw-r--r--kernel/rcu/rcu.h21
-rw-r--r--kernel/rcu/rcu_segcblist.c1
-rw-r--r--kernel/rcu/rcutorture.c28
-rw-r--r--kernel/rcu/srcutree.c2
-rw-r--r--kernel/rcu/sync.c9
-rw-r--r--kernel/rcu/tree.c203
-rw-r--r--kernel/rcu/tree.h5
-rw-r--r--kernel/rcu/tree_plugin.h36
-rw-r--r--kernel/rcu/update.c28
-rw-r--r--kernel/reboot.c27
-rw-r--r--kernel/resource.c76
-rw-r--r--kernel/sched/Makefile2
-rw-r--r--kernel/sched/autogroup.c1
-rw-r--r--kernel/sched/autogroup.h1
-rw-r--r--kernel/sched/clock.c2
-rw-r--r--kernel/sched/completion.c1
-rw-r--r--kernel/sched/core.c295
-rw-r--r--kernel/sched/cpuacct.c1
-rw-r--r--kernel/sched/cpuacct.h17
-rw-r--r--kernel/sched/cpudeadline.h1
-rw-r--r--kernel/sched/cpufreq_schedutil.c12
-rw-r--r--kernel/sched/cpupri.h1
-rw-r--r--kernel/sched/cputime.c17
-rw-r--r--kernel/sched/deadline.c24
-rw-r--r--kernel/sched/debug.c20
-rw-r--r--kernel/sched/fair.c1192
-rw-r--r--kernel/sched/features.h4
-rw-r--r--kernel/sched/idle.c4
-rw-r--r--kernel/sched/idle_task.c1
-rw-r--r--kernel/sched/isolation.c155
-rw-r--r--kernel/sched/loadavg.c1
-rw-r--r--kernel/sched/membarrier.c34
-rw-r--r--kernel/sched/rt.c319
-rw-r--r--kernel/sched/sched-pelt.h1
-rw-r--r--kernel/sched/sched.h76
-rw-r--r--kernel/sched/stats.c1
-rw-r--r--kernel/sched/stats.h1
-rw-r--r--kernel/sched/stop_task.c3
-rw-r--r--kernel/sched/swait.c1
-rw-r--r--kernel/sched/topology.c50
-rw-r--r--kernel/sched/wait_bit.c18
-rw-r--r--kernel/seccomp.c347
-rw-r--r--kernel/signal.c100
-rw-r--r--kernel/smp.c2
-rw-r--r--kernel/smpboot.c25
-rw-r--r--kernel/smpboot.h1
-rw-r--r--kernel/softirq.c18
-rw-r--r--kernel/sys.c13
-rw-r--r--kernel/sys_ni.c1
-rw-r--r--kernel/sysctl.c116
-rw-r--r--kernel/sysctl_binary.c1
-rw-r--r--kernel/task_work.c3
-rw-r--r--kernel/test_kprobes.c29
-rw-r--r--kernel/time/Kconfig6
-rw-r--r--kernel/time/Makefile1
-rw-r--r--kernel/time/clockevents.c21
-rw-r--r--kernel/time/clocksource.c5
-rw-r--r--kernel/time/hrtimer.c4
-rw-r--r--kernel/time/itimer.c1
-rw-r--r--kernel/time/ntp.c228
-rw-r--r--kernel/time/ntp_internal.h2
-rw-r--r--kernel/time/posix-cpu-timers.c7
-rw-r--r--kernel/time/posix-stubs.c20
-rw-r--r--kernel/time/posix-timers.h1
-rw-r--r--kernel/time/tick-broadcast-hrtimer.c1
-rw-r--r--kernel/time/tick-internal.h1
-rw-r--r--kernel/time/tick-oneshot.c1
-rw-r--r--kernel/time/tick-sched.c38
-rw-r--r--kernel/time/tick-sched.h1
-rw-r--r--kernel/time/time.c71
-rw-r--r--kernel/time/timekeeping.c227
-rw-r--r--kernel/time/timekeeping.h3
-rw-r--r--kernel/time/timekeeping_internal.h1
-rw-r--r--kernel/time/timer.c122
-rw-r--r--kernel/time/timer_list.c2
-rw-r--r--kernel/trace/Kconfig15
-rw-r--r--kernel/trace/Makefile2
-rw-r--r--kernel/trace/blktrace.c108
-rw-r--r--kernel/trace/bpf_trace.c177
-rw-r--r--kernel/trace/ftrace.c368
-rw-r--r--kernel/trace/power-traces.c1
-rw-r--r--kernel/trace/ring_buffer.c69
-rw-r--r--kernel/trace/rpm-traces.c1
-rw-r--r--kernel/trace/trace.c110
-rw-r--r--kernel/trace/trace.h14
-rw-r--r--kernel/trace/trace_benchmark.c1
-rw-r--r--kernel/trace/trace_benchmark.h1
-rw-r--r--kernel/trace/trace_branch.c1
-rw-r--r--kernel/trace/trace_entries.h1
-rw-r--r--kernel/trace/trace_event_perf.c82
-rw-r--r--kernel/trace/trace_events.c31
-rw-r--r--kernel/trace/trace_events_filter_test.h1
-rw-r--r--kernel/trace/trace_events_hist.c128
-rw-r--r--kernel/trace/trace_export.c1
-rw-r--r--kernel/trace/trace_functions.c1
-rw-r--r--kernel/trace/trace_functions_graph.c1
-rw-r--r--kernel/trace/trace_irqsoff.c133
-rw-r--r--kernel/trace/trace_kdb.c1
-rw-r--r--kernel/trace/trace_kprobe.c28
-rw-r--r--kernel/trace/trace_mmiotrace.c2
-rw-r--r--kernel/trace/trace_nop.c1
-rw-r--r--kernel/trace/trace_output.c21
-rw-r--r--kernel/trace/trace_output.h1
-rw-r--r--kernel/trace/trace_probe.c86
-rw-r--r--kernel/trace/trace_probe.h7
-rw-r--r--kernel/trace/trace_sched_switch.c1
-rw-r--r--kernel/trace/trace_sched_wakeup.c9
-rw-r--r--kernel/trace/trace_selftest.c35
-rw-r--r--kernel/trace/trace_selftest_dynamic.c1
-rw-r--r--kernel/trace/trace_stack.c18
-rw-r--r--kernel/trace/trace_stat.c1
-rw-r--r--kernel/trace/trace_stat.h1
-rw-r--r--kernel/trace/trace_syscalls.c39
-rw-r--r--kernel/trace/trace_uprobe.c7
-rw-r--r--kernel/trace/tracing_map.c3
-rw-r--r--kernel/trace/tracing_map.h3
-rw-r--r--kernel/uid16.c1
-rw-r--r--kernel/umh.c4
-rw-r--r--kernel/user.c30
-rw-r--r--kernel/user_namespace.c351
-rw-r--r--kernel/watchdog.c657
-rw-r--r--kernel/watchdog_hld.c204
-rw-r--r--kernel/workqueue.c94
-rw-r--r--kernel/workqueue_internal.h4
261 files changed, 10973 insertions, 4770 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index ed470aac53da..172d151d429c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the linux kernel.
#
diff --git a/kernel/acct.c b/kernel/acct.c
index 5e72af29ab73..d15c0ee4d955 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/kernel/acct.c
*
@@ -146,7 +147,7 @@ static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
again:
smp_rmb();
rcu_read_lock();
- res = to_acct(ACCESS_ONCE(ns->bacct));
+ res = to_acct(READ_ONCE(ns->bacct));
if (!res) {
rcu_read_unlock();
return NULL;
@@ -158,7 +159,7 @@ again:
}
rcu_read_unlock();
mutex_lock(&res->lock);
- if (res != to_acct(ACCESS_ONCE(ns->bacct))) {
+ if (res != to_acct(READ_ONCE(ns->bacct))) {
mutex_unlock(&res->lock);
acct_put(res);
goto again;
diff --git a/kernel/audit.c b/kernel/audit.c
index be1c28fd4d57..227db99b0f19 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -85,13 +85,13 @@ static int audit_initialized;
#define AUDIT_OFF 0
#define AUDIT_ON 1
#define AUDIT_LOCKED 2
-u32 audit_enabled;
-u32 audit_ever_enabled;
+u32 audit_enabled = AUDIT_OFF;
+bool audit_ever_enabled = !!AUDIT_OFF;
EXPORT_SYMBOL_GPL(audit_enabled);
/* Default state when kernel boots without any parameters. */
-static u32 audit_default;
+static u32 audit_default = AUDIT_OFF;
/* If auditing cannot proceed, audit_failure selects what happens. */
static u32 audit_failure = AUDIT_FAIL_PRINTK;
@@ -1197,25 +1197,28 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
pid_t auditd_pid;
struct pid *req_pid = task_tgid(current);
- /* sanity check - PID values must match */
- if (new_pid != pid_vnr(req_pid))
+ /* Sanity check - PID values must match. Setting
+ * pid to 0 is how auditd ends auditing. */
+ if (new_pid && (new_pid != pid_vnr(req_pid)))
return -EINVAL;
/* test the auditd connection */
audit_replace(req_pid);
auditd_pid = auditd_pid_vnr();
- /* only the current auditd can unregister itself */
- if ((!new_pid) && (new_pid != auditd_pid)) {
- audit_log_config_change("audit_pid", new_pid,
- auditd_pid, 0);
- return -EACCES;
- }
- /* replacing a healthy auditd is not allowed */
- if (auditd_pid && new_pid) {
- audit_log_config_change("audit_pid", new_pid,
- auditd_pid, 0);
- return -EEXIST;
+ if (auditd_pid) {
+ /* replacing a healthy auditd is not allowed */
+ if (new_pid) {
+ audit_log_config_change("audit_pid",
+ new_pid, auditd_pid, 0);
+ return -EEXIST;
+ }
+ /* only current auditd can unregister itself */
+ if (pid_vnr(req_pid) != auditd_pid) {
+ audit_log_config_change("audit_pid",
+ new_pid, auditd_pid, 0);
+ return -EACCES;
+ }
}
if (new_pid) {
@@ -1549,8 +1552,6 @@ static int __init audit_init(void)
register_pernet_subsys(&audit_net_ops);
audit_initialized = AUDIT_INITIALIZED;
- audit_enabled = audit_default;
- audit_ever_enabled |= !!audit_default;
kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
if (IS_ERR(kauditd_task)) {
@@ -1564,14 +1565,21 @@ static int __init audit_init(void)
return 0;
}
-__initcall(audit_init);
+postcore_initcall(audit_init);
/* Process kernel command-line parameter at boot time. audit=0 or audit=1. */
static int __init audit_enable(char *str)
{
- audit_default = !!simple_strtol(str, NULL, 0);
- if (!audit_default)
+ long val;
+
+ if (kstrtol(str, 0, &val))
+ panic("audit: invalid 'audit' parameter value (%s)\n", str);
+ audit_default = (val ? AUDIT_ON : AUDIT_OFF);
+
+ if (audit_default == AUDIT_OFF)
audit_initialized = AUDIT_DISABLED;
+ if (audit_set_enabled(audit_default))
+ panic("audit: error setting audit state (%d)\n", audit_default);
pr_info("%s\n", audit_default ?
"enabled (after initialization)" : "disabled (until reboot)");
@@ -2337,32 +2345,6 @@ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
}
}
-#ifdef CONFIG_SECURITY
-/**
- * audit_log_secctx - Converts and logs SELinux context
- * @ab: audit_buffer
- * @secid: security number
- *
- * This is a helper function that calls security_secid_to_secctx to convert
- * secid to secctx and then adds the (converted) SELinux context to the audit
- * log by calling audit_log_format, thus also preventing leak of internal secid
- * to userspace. If secid cannot be converted audit_panic is called.
- */
-void audit_log_secctx(struct audit_buffer *ab, u32 secid)
-{
- u32 len;
- char *secctx;
-
- if (security_secid_to_secctx(secid, &secctx, &len)) {
- audit_panic("Cannot convert secid to context");
- } else {
- audit_log_format(ab, " obj=%s", secctx);
- security_release_secctx(secctx, len);
- }
-}
-EXPORT_SYMBOL(audit_log_secctx);
-#endif
-
EXPORT_SYMBOL(audit_log_start);
EXPORT_SYMBOL(audit_log_end);
EXPORT_SYMBOL(audit_log_format);
diff --git a/kernel/audit.h b/kernel/audit.h
index 9b110ae17ee3..af5bc59487ed 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -208,7 +208,7 @@ struct audit_context {
struct audit_proctitle proctitle;
};
-extern u32 audit_ever_enabled;
+extern bool audit_ever_enabled;
extern void audit_copy_inode(struct audit_names *name,
const struct dentry *dentry,
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 011d46e5f73f..fd353120e0d9 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include "audit.h"
#include <linux/fsnotify_backend.h>
#include <linux/namei.h>
@@ -1007,7 +1008,7 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify
* We are guaranteed to have at least one reference to the mark from
* either the inode or the caller of fsnotify_destroy_mark().
*/
- BUG_ON(atomic_read(&entry->refcnt) < 1);
+ BUG_ON(refcount_read(&entry->refcnt) < 1);
}
static const struct fsnotify_ops audit_tree_ops = {
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 0b0aa5854dac..4a1758adb222 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -56,7 +56,8 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
LIST_HEAD_INIT(audit_filter_list[3]),
LIST_HEAD_INIT(audit_filter_list[4]),
LIST_HEAD_INIT(audit_filter_list[5]),
-#if AUDIT_NR_FILTERS != 6
+ LIST_HEAD_INIT(audit_filter_list[6]),
+#if AUDIT_NR_FILTERS != 7
#error Fix audit_filter_list initialiser
#endif
};
@@ -67,6 +68,7 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
LIST_HEAD_INIT(audit_rules_list[3]),
LIST_HEAD_INIT(audit_rules_list[4]),
LIST_HEAD_INIT(audit_rules_list[5]),
+ LIST_HEAD_INIT(audit_rules_list[6]),
};
DEFINE_MUTEX(audit_filter_mutex);
@@ -263,6 +265,7 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *
#endif
case AUDIT_FILTER_USER:
case AUDIT_FILTER_TYPE:
+ case AUDIT_FILTER_FS:
;
}
if (unlikely(rule->action == AUDIT_POSSIBLE)) {
@@ -338,6 +341,21 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
entry->rule.listnr != AUDIT_FILTER_USER)
return -EINVAL;
break;
+ case AUDIT_FSTYPE:
+ if (entry->rule.listnr != AUDIT_FILTER_FS)
+ return -EINVAL;
+ break;
+ }
+
+ switch(entry->rule.listnr) {
+ case AUDIT_FILTER_FS:
+ switch(f->type) {
+ case AUDIT_FSTYPE:
+ case AUDIT_FILTERKEY:
+ break;
+ default:
+ return -EINVAL;
+ }
}
switch(f->type) {
@@ -391,6 +409,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
return -EINVAL;
/* FALL THROUGH */
case AUDIT_ARCH:
+ case AUDIT_FSTYPE:
if (f->op != Audit_not_equal && f->op != Audit_equal)
return -EINVAL;
break;
@@ -910,10 +929,13 @@ static inline int audit_add_rule(struct audit_entry *entry)
#ifdef CONFIG_AUDITSYSCALL
int dont_count = 0;
- /* If either of these, don't count towards total */
- if (entry->rule.listnr == AUDIT_FILTER_USER ||
- entry->rule.listnr == AUDIT_FILTER_TYPE)
+ /* If any of these, don't count towards total */
+ switch(entry->rule.listnr) {
+ case AUDIT_FILTER_USER:
+ case AUDIT_FILTER_TYPE:
+ case AUDIT_FILTER_FS:
dont_count = 1;
+ }
#endif
mutex_lock(&audit_filter_mutex);
@@ -989,10 +1011,13 @@ int audit_del_rule(struct audit_entry *entry)
#ifdef CONFIG_AUDITSYSCALL
int dont_count = 0;
- /* If either of these, don't count towards total */
- if (entry->rule.listnr == AUDIT_FILTER_USER ||
- entry->rule.listnr == AUDIT_FILTER_TYPE)
+ /* If any of these, don't count towards total */
+ switch(entry->rule.listnr) {
+ case AUDIT_FILTER_USER:
+ case AUDIT_FILTER_TYPE:
+ case AUDIT_FILTER_FS:
dont_count = 1;
+ }
#endif
mutex_lock(&audit_filter_mutex);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index ecc23e25c9eb..e80459f7e132 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1869,10 +1869,33 @@ void __audit_inode_child(struct inode *parent,
struct inode *inode = d_backing_inode(dentry);
const char *dname = dentry->d_name.name;
struct audit_names *n, *found_parent = NULL, *found_child = NULL;
+ struct audit_entry *e;
+ struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS];
+ int i;
if (!context->in_syscall)
return;
+ rcu_read_lock();
+ if (!list_empty(list)) {
+ list_for_each_entry_rcu(e, list, list) {
+ for (i = 0; i < e->rule.field_count; i++) {
+ struct audit_field *f = &e->rule.fields[i];
+
+ if (f->type == AUDIT_FSTYPE) {
+ if (audit_comparator(parent->i_sb->s_magic,
+ f->op, f->val)) {
+ if (e->rule.action == AUDIT_NEVER) {
+ rcu_read_unlock();
+ return;
+ }
+ }
+ }
+ }
+ }
+ }
+ rcu_read_unlock();
+
if (inode)
handle_one(inode);
@@ -2390,6 +2413,12 @@ void __audit_log_kern_module(char *name)
context->type = AUDIT_KERN_MODULE;
}
+void __audit_fanotify(unsigned int response)
+{
+ audit_log(current->audit_context, GFP_KERNEL,
+ AUDIT_FANOTIFY, "resp=%u", response);
+}
+
static void audit_log_task(struct audit_buffer *ab)
{
kuid_t auid, uid;
diff --git a/kernel/bounds.c b/kernel/bounds.c
index e1d1d1952bfa..c373e887c066 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Generate definitions needed by the preprocessor.
* This code generates raw asm output which is post-processed
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 897daa005b23..e691da0b3bab 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,9 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0
obj-y := core.o
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
+obj-$(CONFIG_BPF_SYSCALL) += disasm.o
ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_BPF_SYSCALL) += devmap.o
+obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
+obj-$(CONFIG_BPF_SYSCALL) += offload.o
ifeq ($(CONFIG_STREAM_PARSER),y)
obj-$(CONFIG_BPF_SYSCALL) += sockmap.o
endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 98c0f00c3f5e..7c25426d3cf5 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -19,6 +19,9 @@
#include "map_in_map.h"
+#define ARRAY_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
static void bpf_array_free_percpu(struct bpf_array *array)
{
int i;
@@ -56,7 +59,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
- attr->value_size == 0 || attr->map_flags & ~BPF_F_NUMA_NODE ||
+ attr->value_size == 0 ||
+ attr->map_flags & ~ARRAY_CREATE_FLAG_MASK ||
(percpu && numa_node != NUMA_NO_NODE))
return ERR_PTR(-EINVAL);
@@ -98,7 +102,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
array_size += (u64) attr->max_entries * elem_size * num_possible_cpus();
if (array_size >= U32_MAX - PAGE_SIZE ||
- elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) {
+ bpf_array_alloc_percpu(array)) {
bpf_map_area_free(array);
return ERR_PTR(-ENOMEM);
}
@@ -492,7 +496,7 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
ee = ERR_PTR(-EOPNOTSUPP);
event = perf_file->private_data;
- if (perf_event_read_local(event, &value) == -EOPNOTSUPP)
+ if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP)
goto err_out;
ee = bpf_event_entry_gen(perf_file, map_file);
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 546113430049..b789ab78d28f 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -27,129 +27,405 @@ void cgroup_bpf_put(struct cgroup *cgrp)
{
unsigned int type;
- for (type = 0; type < ARRAY_SIZE(cgrp->bpf.prog); type++) {
- struct bpf_prog *prog = cgrp->bpf.prog[type];
-
- if (prog) {
- bpf_prog_put(prog);
+ for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
+ struct list_head *progs = &cgrp->bpf.progs[type];
+ struct bpf_prog_list *pl, *tmp;
+
+ list_for_each_entry_safe(pl, tmp, progs, node) {
+ list_del(&pl->node);
+ bpf_prog_put(pl->prog);
+ kfree(pl);
static_branch_dec(&cgroup_bpf_enabled_key);
}
+ bpf_prog_array_free(cgrp->bpf.effective[type]);
+ }
+}
+
+/* count number of elements in the list.
+ * it's slow but the list cannot be long
+ */
+static u32 prog_list_length(struct list_head *head)
+{
+ struct bpf_prog_list *pl;
+ u32 cnt = 0;
+
+ list_for_each_entry(pl, head, node) {
+ if (!pl->prog)
+ continue;
+ cnt++;
}
+ return cnt;
+}
+
+/* if parent has non-overridable prog attached,
+ * disallow attaching new programs to the descendent cgroup.
+ * if parent has overridable or multi-prog, allow attaching
+ */
+static bool hierarchy_allows_attach(struct cgroup *cgrp,
+ enum bpf_attach_type type,
+ u32 new_flags)
+{
+ struct cgroup *p;
+
+ p = cgroup_parent(cgrp);
+ if (!p)
+ return true;
+ do {
+ u32 flags = p->bpf.flags[type];
+ u32 cnt;
+
+ if (flags & BPF_F_ALLOW_MULTI)
+ return true;
+ cnt = prog_list_length(&p->bpf.progs[type]);
+ WARN_ON_ONCE(cnt > 1);
+ if (cnt == 1)
+ return !!(flags & BPF_F_ALLOW_OVERRIDE);
+ p = cgroup_parent(p);
+ } while (p);
+ return true;
+}
+
+/* compute a chain of effective programs for a given cgroup:
+ * start from the list of programs in this cgroup and add
+ * all parent programs.
+ * Note that parent's F_ALLOW_OVERRIDE-type program is yielding
+ * to programs in this cgroup
+ */
+static int compute_effective_progs(struct cgroup *cgrp,
+ enum bpf_attach_type type,
+ struct bpf_prog_array __rcu **array)
+{
+ struct bpf_prog_array __rcu *progs;
+ struct bpf_prog_list *pl;
+ struct cgroup *p = cgrp;
+ int cnt = 0;
+
+ /* count number of effective programs by walking parents */
+ do {
+ if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+ cnt += prog_list_length(&p->bpf.progs[type]);
+ p = cgroup_parent(p);
+ } while (p);
+
+ progs = bpf_prog_array_alloc(cnt, GFP_KERNEL);
+ if (!progs)
+ return -ENOMEM;
+
+ /* populate the array with effective progs */
+ cnt = 0;
+ p = cgrp;
+ do {
+ if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+ list_for_each_entry(pl,
+ &p->bpf.progs[type], node) {
+ if (!pl->prog)
+ continue;
+ rcu_dereference_protected(progs, 1)->
+ progs[cnt++] = pl->prog;
+ }
+ p = cgroup_parent(p);
+ } while (p);
+
+ *array = progs;
+ return 0;
+}
+
+static void activate_effective_progs(struct cgroup *cgrp,
+ enum bpf_attach_type type,
+ struct bpf_prog_array __rcu *array)
+{
+ struct bpf_prog_array __rcu *old_array;
+
+ old_array = xchg(&cgrp->bpf.effective[type], array);
+ /* free prog array after grace period, since __cgroup_bpf_run_*()
+ * might be still walking the array
+ */
+ bpf_prog_array_free(old_array);
}
/**
* cgroup_bpf_inherit() - inherit effective programs from parent
* @cgrp: the cgroup to modify
- * @parent: the parent to inherit from
*/
-void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent)
+int cgroup_bpf_inherit(struct cgroup *cgrp)
{
- unsigned int type;
+/* has to use marco instead of const int, since compiler thinks
+ * that array below is variable length
+ */
+#define NR ARRAY_SIZE(cgrp->bpf.effective)
+ struct bpf_prog_array __rcu *arrays[NR] = {};
+ int i;
- for (type = 0; type < ARRAY_SIZE(cgrp->bpf.effective); type++) {
- struct bpf_prog *e;
+ for (i = 0; i < NR; i++)
+ INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
- e = rcu_dereference_protected(parent->bpf.effective[type],
- lockdep_is_held(&cgroup_mutex));
- rcu_assign_pointer(cgrp->bpf.effective[type], e);
- cgrp->bpf.disallow_override[type] = parent->bpf.disallow_override[type];
- }
+ for (i = 0; i < NR; i++)
+ if (compute_effective_progs(cgrp, i, &arrays[i]))
+ goto cleanup;
+
+ for (i = 0; i < NR; i++)
+ activate_effective_progs(cgrp, i, arrays[i]);
+
+ return 0;
+cleanup:
+ for (i = 0; i < NR; i++)
+ bpf_prog_array_free(arrays[i]);
+ return -ENOMEM;
}
+#define BPF_CGROUP_MAX_PROGS 64
+
/**
- * __cgroup_bpf_update() - Update the pinned program of a cgroup, and
+ * __cgroup_bpf_attach() - Attach the program to a cgroup, and
* propagate the change to descendants
* @cgrp: The cgroup which descendants to traverse
- * @parent: The parent of @cgrp, or %NULL if @cgrp is the root
- * @prog: A new program to pin
- * @type: Type of pinning operation (ingress/egress)
- *
- * Each cgroup has a set of two pointers for bpf programs; one for eBPF
- * programs it owns, and which is effective for execution.
- *
- * If @prog is not %NULL, this function attaches a new program to the cgroup
- * and releases the one that is currently attached, if any. @prog is then made
- * the effective program of type @type in that cgroup.
- *
- * If @prog is %NULL, the currently attached program of type @type is released,
- * and the effective program of the parent cgroup (if any) is inherited to
- * @cgrp.
- *
- * Then, the descendants of @cgrp are walked and the effective program for
- * each of them is set to the effective program of @cgrp unless the
- * descendant has its own program attached, in which case the subbranch is
- * skipped. This ensures that delegated subcgroups with own programs are left
- * untouched.
+ * @prog: A program to attach
+ * @type: Type of attach operation
*
* Must be called with cgroup_mutex held.
*/
-int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent,
- struct bpf_prog *prog, enum bpf_attach_type type,
- bool new_overridable)
+int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+ enum bpf_attach_type type, u32 flags)
{
- struct bpf_prog *old_prog, *effective = NULL;
- struct cgroup_subsys_state *pos;
- bool overridable = true;
-
- if (parent) {
- overridable = !parent->bpf.disallow_override[type];
- effective = rcu_dereference_protected(parent->bpf.effective[type],
- lockdep_is_held(&cgroup_mutex));
- }
-
- if (prog && effective && !overridable)
- /* if parent has non-overridable prog attached, disallow
- * attaching new programs to descendent cgroup
- */
+ struct list_head *progs = &cgrp->bpf.progs[type];
+ struct bpf_prog *old_prog = NULL;
+ struct cgroup_subsys_state *css;
+ struct bpf_prog_list *pl;
+ bool pl_was_allocated;
+ int err;
+
+ if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI))
+ /* invalid combination */
+ return -EINVAL;
+
+ if (!hierarchy_allows_attach(cgrp, type, flags))
return -EPERM;
- if (prog && effective && overridable != new_overridable)
- /* if parent has overridable prog attached, only
- * allow overridable programs in descendent cgroup
+ if (!list_empty(progs) && cgrp->bpf.flags[type] != flags)
+ /* Disallow attaching non-overridable on top
+ * of existing overridable in this cgroup.
+ * Disallow attaching multi-prog if overridable or none
*/
return -EPERM;
- old_prog = cgrp->bpf.prog[type];
-
- if (prog) {
- overridable = new_overridable;
- effective = prog;
- if (old_prog &&
- cgrp->bpf.disallow_override[type] == new_overridable)
- /* disallow attaching non-overridable on top
- * of existing overridable in this cgroup
- * and vice versa
- */
- return -EPERM;
+ if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
+ return -E2BIG;
+
+ if (flags & BPF_F_ALLOW_MULTI) {
+ list_for_each_entry(pl, progs, node)
+ if (pl->prog == prog)
+ /* disallow attaching the same prog twice */
+ return -EINVAL;
+
+ pl = kmalloc(sizeof(*pl), GFP_KERNEL);
+ if (!pl)
+ return -ENOMEM;
+ pl_was_allocated = true;
+ pl->prog = prog;
+ list_add_tail(&pl->node, progs);
+ } else {
+ if (list_empty(progs)) {
+ pl = kmalloc(sizeof(*pl), GFP_KERNEL);
+ if (!pl)
+ return -ENOMEM;
+ pl_was_allocated = true;
+ list_add_tail(&pl->node, progs);
+ } else {
+ pl = list_first_entry(progs, typeof(*pl), node);
+ old_prog = pl->prog;
+ pl_was_allocated = false;
+ }
+ pl->prog = prog;
}
- if (!prog && !old_prog)
- /* report error when trying to detach and nothing is attached */
- return -ENOENT;
+ cgrp->bpf.flags[type] = flags;
- cgrp->bpf.prog[type] = prog;
+ /* allocate and recompute effective prog arrays */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
- css_for_each_descendant_pre(pos, &cgrp->self) {
- struct cgroup *desc = container_of(pos, struct cgroup, self);
-
- /* skip the subtree if the descendant has its own program */
- if (desc->bpf.prog[type] && desc != cgrp) {
- pos = css_rightmost_descendant(pos);
- } else {
- rcu_assign_pointer(desc->bpf.effective[type],
- effective);
- desc->bpf.disallow_override[type] = !overridable;
- }
+ err = compute_effective_progs(desc, type, &desc->bpf.inactive);
+ if (err)
+ goto cleanup;
}
- if (prog)
- static_branch_inc(&cgroup_bpf_enabled_key);
+ /* all allocations were successful. Activate all prog arrays */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+ activate_effective_progs(desc, type, desc->bpf.inactive);
+ desc->bpf.inactive = NULL;
+ }
+
+ static_branch_inc(&cgroup_bpf_enabled_key);
if (old_prog) {
bpf_prog_put(old_prog);
static_branch_dec(&cgroup_bpf_enabled_key);
}
return 0;
+
+cleanup:
+ /* oom while computing effective. Free all computed effective arrays
+ * since they were not activated
+ */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ bpf_prog_array_free(desc->bpf.inactive);
+ desc->bpf.inactive = NULL;
+ }
+
+ /* and cleanup the prog list */
+ pl->prog = old_prog;
+ if (pl_was_allocated) {
+ list_del(&pl->node);
+ kfree(pl);
+ }
+ return err;
+}
+
+/**
+ * __cgroup_bpf_detach() - Detach the program from a cgroup, and
+ * propagate the change to descendants
+ * @cgrp: The cgroup which descendants to traverse
+ * @prog: A program to detach or NULL
+ * @type: Type of detach operation
+ *
+ * Must be called with cgroup_mutex held.
+ */
+int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+ enum bpf_attach_type type, u32 unused_flags)
+{
+ struct list_head *progs = &cgrp->bpf.progs[type];
+ u32 flags = cgrp->bpf.flags[type];
+ struct bpf_prog *old_prog = NULL;
+ struct cgroup_subsys_state *css;
+ struct bpf_prog_list *pl;
+ int err;
+
+ if (flags & BPF_F_ALLOW_MULTI) {
+ if (!prog)
+ /* to detach MULTI prog the user has to specify valid FD
+ * of the program to be detached
+ */
+ return -EINVAL;
+ } else {
+ if (list_empty(progs))
+ /* report error when trying to detach and nothing is attached */
+ return -ENOENT;
+ }
+
+ if (flags & BPF_F_ALLOW_MULTI) {
+ /* find the prog and detach it */
+ list_for_each_entry(pl, progs, node) {
+ if (pl->prog != prog)
+ continue;
+ old_prog = prog;
+ /* mark it deleted, so it's ignored while
+ * recomputing effective
+ */
+ pl->prog = NULL;
+ break;
+ }
+ if (!old_prog)
+ return -ENOENT;
+ } else {
+ /* to maintain backward compatibility NONE and OVERRIDE cgroups
+ * allow detaching with invalid FD (prog==NULL)
+ */
+ pl = list_first_entry(progs, typeof(*pl), node);
+ old_prog = pl->prog;
+ pl->prog = NULL;
+ }
+
+ /* allocate and recompute effective prog arrays */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ err = compute_effective_progs(desc, type, &desc->bpf.inactive);
+ if (err)
+ goto cleanup;
+ }
+
+ /* all allocations were successful. Activate all prog arrays */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ activate_effective_progs(desc, type, desc->bpf.inactive);
+ desc->bpf.inactive = NULL;
+ }
+
+ /* now can actually delete it from this cgroup list */
+ list_del(&pl->node);
+ kfree(pl);
+ if (list_empty(progs))
+ /* last program was detached, reset flags to zero */
+ cgrp->bpf.flags[type] = 0;
+
+ bpf_prog_put(old_prog);
+ static_branch_dec(&cgroup_bpf_enabled_key);
+ return 0;
+
+cleanup:
+ /* oom while computing effective. Free all computed effective arrays
+ * since they were not activated
+ */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ bpf_prog_array_free(desc->bpf.inactive);
+ desc->bpf.inactive = NULL;
+ }
+
+ /* and restore back old_prog */
+ pl->prog = old_prog;
+ return err;
+}
+
+/* Must be called with cgroup_mutex held to avoid races. */
+int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
+ enum bpf_attach_type type = attr->query.attach_type;
+ struct list_head *progs = &cgrp->bpf.progs[type];
+ u32 flags = cgrp->bpf.flags[type];
+ int cnt, ret = 0, i;
+
+ if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
+ cnt = bpf_prog_array_length(cgrp->bpf.effective[type]);
+ else
+ cnt = prog_list_length(progs);
+
+ if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
+ return -EFAULT;
+ if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt)))
+ return -EFAULT;
+ if (attr->query.prog_cnt == 0 || !prog_ids || !cnt)
+ /* return early if user requested only program count + flags */
+ return 0;
+ if (attr->query.prog_cnt < cnt) {
+ cnt = attr->query.prog_cnt;
+ ret = -ENOSPC;
+ }
+
+ if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
+ return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type],
+ prog_ids, cnt);
+ } else {
+ struct bpf_prog_list *pl;
+ u32 id;
+
+ i = 0;
+ list_for_each_entry(pl, progs, node) {
+ id = pl->prog->aux->id;
+ if (copy_to_user(prog_ids + i, &id, sizeof(id)))
+ return -EFAULT;
+ if (++i == cnt)
+ break;
+ }
+ }
+ return ret;
}
/**
@@ -171,36 +447,26 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
struct sk_buff *skb,
enum bpf_attach_type type)
{
- struct bpf_prog *prog;
+ unsigned int offset = skb->data - skb_network_header(skb);
+ struct sock *save_sk;
struct cgroup *cgrp;
- int ret = 0;
+ int ret;
if (!sk || !sk_fullsock(sk))
return 0;
- if (sk->sk_family != AF_INET &&
- sk->sk_family != AF_INET6)
+ if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
return 0;
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-
- rcu_read_lock();
-
- prog = rcu_dereference(cgrp->bpf.effective[type]);
- if (prog) {
- unsigned int offset = skb->data - skb_network_header(skb);
- struct sock *save_sk = skb->sk;
-
- skb->sk = sk;
- __skb_push(skb, offset);
- ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM;
- __skb_pull(skb, offset);
- skb->sk = save_sk;
- }
-
- rcu_read_unlock();
-
- return ret;
+ save_sk = skb->sk;
+ skb->sk = sk;
+ __skb_push(skb, offset);
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
+ bpf_prog_run_save_cb);
+ __skb_pull(skb, offset);
+ skb->sk = save_sk;
+ return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
@@ -221,19 +487,10 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
enum bpf_attach_type type)
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- struct bpf_prog *prog;
- int ret = 0;
+ int ret;
-
- rcu_read_lock();
-
- prog = rcu_dereference(cgrp->bpf.effective[type]);
- if (prog)
- ret = BPF_PROG_RUN(prog, sk) == 1 ? 0 : -EPERM;
-
- rcu_read_unlock();
-
- return ret;
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN);
+ return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
@@ -258,18 +515,77 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
enum bpf_attach_type type)
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- struct bpf_prog *prog;
- int ret = 0;
+ int ret;
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops,
+ BPF_PROG_RUN);
+ return ret == 1 ? 0 : -EPERM;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
+
+int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
+ short access, enum bpf_attach_type type)
+{
+ struct cgroup *cgrp;
+ struct bpf_cgroup_dev_ctx ctx = {
+ .access_type = (access << 16) | dev_type,
+ .major = major,
+ .minor = minor,
+ };
+ int allow = 1;
rcu_read_lock();
+ cgrp = task_dfl_cgroup(current);
+ allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx,
+ BPF_PROG_RUN);
+ rcu_read_unlock();
- prog = rcu_dereference(cgrp->bpf.effective[type]);
- if (prog)
- ret = BPF_PROG_RUN(prog, sock_ops) == 1 ? 0 : -EPERM;
+ return !allow;
+}
+EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission);
- rcu_read_unlock();
+static const struct bpf_func_proto *
+cgroup_dev_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_map_lookup_elem:
+ return &bpf_map_lookup_elem_proto;
+ case BPF_FUNC_map_update_elem:
+ return &bpf_map_update_elem_proto;
+ case BPF_FUNC_map_delete_elem:
+ return &bpf_map_delete_elem_proto;
+ case BPF_FUNC_get_current_uid_gid:
+ return &bpf_get_current_uid_gid_proto;
+ case BPF_FUNC_trace_printk:
+ if (capable(CAP_SYS_ADMIN))
+ return bpf_get_trace_printk_proto();
+ default:
+ return NULL;
+ }
+}
- return ret;
+static bool cgroup_dev_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ struct bpf_insn_access_aux *info)
+{
+ if (type == BPF_WRITE)
+ return false;
+
+ if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx))
+ return false;
+ /* The verifier guarantees that size > 0. */
+ if (off % size != 0)
+ return false;
+ if (size != sizeof(__u32))
+ return false;
+
+ return true;
}
-EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
+
+const struct bpf_prog_ops cg_dev_prog_ops = {
+};
+
+const struct bpf_verifier_ops cg_dev_verifier_ops = {
+ .get_func_proto = cgroup_dev_func_proto,
+ .is_valid_access = cgroup_dev_is_valid_access,
+};
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 917cc04a0a94..b9f8686a84cf 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -85,8 +85,6 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
if (fp == NULL)
return NULL;
- kmemcheck_annotate_bitfield(fp, meta);
-
aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags);
if (aux == NULL) {
vfree(fp);
@@ -127,8 +125,6 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
if (fp == NULL) {
__bpf_prog_uncharge(fp_old->aux->user, delta);
} else {
- kmemcheck_annotate_bitfield(fp, meta);
-
memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
fp->pages = pages;
fp->aux->prog = fp;
@@ -309,12 +305,25 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog,
static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
{
+ const char *end = sym + KSYM_NAME_LEN;
+
BUILD_BUG_ON(sizeof("bpf_prog_") +
- sizeof(prog->tag) * 2 + 1 > KSYM_NAME_LEN);
+ sizeof(prog->tag) * 2 +
+ /* name has been null terminated.
+ * We should need +1 for the '_' preceding
+ * the name. However, the null character
+ * is double counted between the name and the
+ * sizeof("bpf_prog_") above, so we omit
+ * the +1 here.
+ */
+ sizeof(prog->aux->name) > KSYM_NAME_LEN);
sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_");
sym = bin2hex(sym, prog->tag, sizeof(prog->tag));
- *sym = 0;
+ if (prog->aux->name[0])
+ snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name);
+ else
+ *sym = 0;
}
static __always_inline unsigned long
@@ -662,8 +671,6 @@ static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other,
fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL);
if (fp != NULL) {
- kmemcheck_annotate_bitfield(fp, meta);
-
/* aux->prog still points to the fp_other one, so
* when promoting the clone to the real program,
* this still needs to be adapted.
@@ -1022,7 +1029,7 @@ select_insn:
struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_prog *prog;
- u64 index = BPF_R3;
+ u32 index = BPF_R3;
if (unlikely(index >= array->map.max_entries))
goto out;
@@ -1367,7 +1374,13 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
* valid program, which in this case would simply not
* be JITed, but falls back to the interpreter.
*/
- fp = bpf_int_jit_compile(fp);
+ if (!bpf_prog_is_dev_bound(fp->aux)) {
+ fp = bpf_int_jit_compile(fp);
+ } else {
+ *err = bpf_prog_offload_compile(fp);
+ if (*err)
+ return fp;
+ }
bpf_prog_lock_ro(fp);
/* The tail call compatibility check can only be done at
@@ -1381,11 +1394,163 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
}
EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
+static unsigned int __bpf_prog_ret1(const void *ctx,
+ const struct bpf_insn *insn)
+{
+ return 1;
+}
+
+static struct bpf_prog_dummy {
+ struct bpf_prog prog;
+} dummy_bpf_prog = {
+ .prog = {
+ .bpf_func = __bpf_prog_ret1,
+ },
+};
+
+/* to avoid allocating empty bpf_prog_array for cgroups that
+ * don't have bpf program attached use one global 'empty_prog_array'
+ * It will not be modified the caller of bpf_prog_array_alloc()
+ * (since caller requested prog_cnt == 0)
+ * that pointer should be 'freed' by bpf_prog_array_free()
+ */
+static struct {
+ struct bpf_prog_array hdr;
+ struct bpf_prog *null_prog;
+} empty_prog_array = {
+ .null_prog = NULL,
+};
+
+struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
+{
+ if (prog_cnt)
+ return kzalloc(sizeof(struct bpf_prog_array) +
+ sizeof(struct bpf_prog *) * (prog_cnt + 1),
+ flags);
+
+ return &empty_prog_array.hdr;
+}
+
+void bpf_prog_array_free(struct bpf_prog_array __rcu *progs)
+{
+ if (!progs ||
+ progs == (struct bpf_prog_array __rcu *)&empty_prog_array.hdr)
+ return;
+ kfree_rcu(progs, rcu);
+}
+
+int bpf_prog_array_length(struct bpf_prog_array __rcu *progs)
+{
+ struct bpf_prog **prog;
+ u32 cnt = 0;
+
+ rcu_read_lock();
+ prog = rcu_dereference(progs)->progs;
+ for (; *prog; prog++)
+ cnt++;
+ rcu_read_unlock();
+ return cnt;
+}
+
+int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
+ __u32 __user *prog_ids, u32 cnt)
+{
+ struct bpf_prog **prog;
+ u32 i = 0, id;
+
+ rcu_read_lock();
+ prog = rcu_dereference(progs)->progs;
+ for (; *prog; prog++) {
+ id = (*prog)->aux->id;
+ if (copy_to_user(prog_ids + i, &id, sizeof(id))) {
+ rcu_read_unlock();
+ return -EFAULT;
+ }
+ if (++i == cnt) {
+ prog++;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ if (*prog)
+ return -ENOSPC;
+ return 0;
+}
+
+void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs,
+ struct bpf_prog *old_prog)
+{
+ struct bpf_prog **prog = progs->progs;
+
+ for (; *prog; prog++)
+ if (*prog == old_prog) {
+ WRITE_ONCE(*prog, &dummy_bpf_prog.prog);
+ break;
+ }
+}
+
+int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
+ struct bpf_prog *exclude_prog,
+ struct bpf_prog *include_prog,
+ struct bpf_prog_array **new_array)
+{
+ int new_prog_cnt, carry_prog_cnt = 0;
+ struct bpf_prog **existing_prog;
+ struct bpf_prog_array *array;
+ int new_prog_idx = 0;
+
+ /* Figure out how many existing progs we need to carry over to
+ * the new array.
+ */
+ if (old_array) {
+ existing_prog = old_array->progs;
+ for (; *existing_prog; existing_prog++) {
+ if (*existing_prog != exclude_prog &&
+ *existing_prog != &dummy_bpf_prog.prog)
+ carry_prog_cnt++;
+ if (*existing_prog == include_prog)
+ return -EEXIST;
+ }
+ }
+
+ /* How many progs (not NULL) will be in the new array? */
+ new_prog_cnt = carry_prog_cnt;
+ if (include_prog)
+ new_prog_cnt += 1;
+
+ /* Do we have any prog (not NULL) in the new array? */
+ if (!new_prog_cnt) {
+ *new_array = NULL;
+ return 0;
+ }
+
+ /* +1 as the end of prog_array is marked with NULL */
+ array = bpf_prog_array_alloc(new_prog_cnt + 1, GFP_KERNEL);
+ if (!array)
+ return -ENOMEM;
+
+ /* Fill in the new prog array */
+ if (carry_prog_cnt) {
+ existing_prog = old_array->progs;
+ for (; *existing_prog; existing_prog++)
+ if (*existing_prog != exclude_prog &&
+ *existing_prog != &dummy_bpf_prog.prog)
+ array->progs[new_prog_idx++] = *existing_prog;
+ }
+ if (include_prog)
+ array->progs[new_prog_idx++] = include_prog;
+ array->progs[new_prog_idx] = NULL;
+ *new_array = array;
+ return 0;
+}
+
static void bpf_prog_free_deferred(struct work_struct *work)
{
struct bpf_prog_aux *aux;
aux = container_of(work, struct bpf_prog_aux, work);
+ if (bpf_prog_is_dev_bound(aux))
+ bpf_prog_offload_destroy(aux->prog);
bpf_jit_free(aux->prog);
}
@@ -1498,5 +1663,8 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
+/* These are only used within the BPF_SYSCALL code */
+#ifdef CONFIG_BPF_SYSCALL
EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_get_type);
EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_put_rcu);
+#endif
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
new file mode 100644
index 000000000000..ce5b669003b2
--- /dev/null
+++ b/kernel/bpf/cpumap.c
@@ -0,0 +1,706 @@
+/* bpf/cpumap.c
+ *
+ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
+ * Released under terms in GPL version 2. See COPYING.
+ */
+
+/* The 'cpumap' is primarily used as a backend map for XDP BPF helper
+ * call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'.
+ *
+ * Unlike devmap which redirects XDP frames out another NIC device,
+ * this map type redirects raw XDP frames to another CPU. The remote
+ * CPU will do SKB-allocation and call the normal network stack.
+ *
+ * This is a scalability and isolation mechanism, that allow
+ * separating the early driver network XDP layer, from the rest of the
+ * netstack, and assigning dedicated CPUs for this stage. This
+ * basically allows for 10G wirespeed pre-filtering via bpf.
+ */
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/ptr_ring.h>
+
+#include <linux/sched.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#include <linux/capability.h>
+#include <trace/events/xdp.h>
+
+#include <linux/netdevice.h> /* netif_receive_skb_core */
+#include <linux/etherdevice.h> /* eth_type_trans */
+
+/* General idea: XDP packets getting XDP redirected to another CPU,
+ * will maximum be stored/queued for one driver ->poll() call. It is
+ * guaranteed that setting flush bit and flush operation happen on
+ * same CPU. Thus, cpu_map_flush operation can deduct via this_cpu_ptr()
+ * which queue in bpf_cpu_map_entry contains packets.
+ */
+
+#define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */
+struct xdp_bulk_queue {
+ void *q[CPU_MAP_BULK_SIZE];
+ unsigned int count;
+};
+
+/* Struct for every remote "destination" CPU in map */
+struct bpf_cpu_map_entry {
+ u32 cpu; /* kthread CPU and map index */
+ int map_id; /* Back reference to map */
+ u32 qsize; /* Queue size placeholder for map lookup */
+
+ /* XDP can run multiple RX-ring queues, need __percpu enqueue store */
+ struct xdp_bulk_queue __percpu *bulkq;
+
+ /* Queue with potential multi-producers, and single-consumer kthread */
+ struct ptr_ring *queue;
+ struct task_struct *kthread;
+ struct work_struct kthread_stop_wq;
+
+ atomic_t refcnt; /* Control when this struct can be free'ed */
+ struct rcu_head rcu;
+};
+
+struct bpf_cpu_map {
+ struct bpf_map map;
+ /* Below members specific for map type */
+ struct bpf_cpu_map_entry **cpu_map;
+ unsigned long __percpu *flush_needed;
+};
+
+static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
+ struct xdp_bulk_queue *bq);
+
+static u64 cpu_map_bitmap_size(const union bpf_attr *attr)
+{
+ return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
+}
+
+static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_cpu_map *cmap;
+ int err = -ENOMEM;
+ u64 cost;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 || attr->key_size != 4 ||
+ attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+ return ERR_PTR(-EINVAL);
+
+ cmap = kzalloc(sizeof(*cmap), GFP_USER);
+ if (!cmap)
+ return ERR_PTR(-ENOMEM);
+
+ /* mandatory map attributes */
+ cmap->map.map_type = attr->map_type;
+ cmap->map.key_size = attr->key_size;
+ cmap->map.value_size = attr->value_size;
+ cmap->map.max_entries = attr->max_entries;
+ cmap->map.map_flags = attr->map_flags;
+ cmap->map.numa_node = bpf_map_attr_numa_node(attr);
+
+ /* Pre-limit array size based on NR_CPUS, not final CPU check */
+ if (cmap->map.max_entries > NR_CPUS) {
+ err = -E2BIG;
+ goto free_cmap;
+ }
+
+ /* make sure page count doesn't overflow */
+ cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *);
+ cost += cpu_map_bitmap_size(attr) * num_possible_cpus();
+ if (cost >= U32_MAX - PAGE_SIZE)
+ goto free_cmap;
+ cmap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+ /* Notice returns -EPERM on if map size is larger than memlock limit */
+ ret = bpf_map_precharge_memlock(cmap->map.pages);
+ if (ret) {
+ err = ret;
+ goto free_cmap;
+ }
+
+ /* A per cpu bitfield with a bit per possible CPU in map */
+ cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr),
+ __alignof__(unsigned long));
+ if (!cmap->flush_needed)
+ goto free_cmap;
+
+ /* Alloc array for possible remote "destination" CPUs */
+ cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
+ sizeof(struct bpf_cpu_map_entry *),
+ cmap->map.numa_node);
+ if (!cmap->cpu_map)
+ goto free_percpu;
+
+ return &cmap->map;
+free_percpu:
+ free_percpu(cmap->flush_needed);
+free_cmap:
+ kfree(cmap);
+ return ERR_PTR(err);
+}
+
+void __cpu_map_queue_destructor(void *ptr)
+{
+ /* The tear-down procedure should have made sure that queue is
+ * empty. See __cpu_map_entry_replace() and work-queue
+ * invoked cpu_map_kthread_stop(). Catch any broken behaviour
+ * gracefully and warn once.
+ */
+ if (WARN_ON_ONCE(ptr))
+ page_frag_free(ptr);
+}
+
+static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
+{
+ if (atomic_dec_and_test(&rcpu->refcnt)) {
+ /* The queue should be empty at this point */
+ ptr_ring_cleanup(rcpu->queue, __cpu_map_queue_destructor);
+ kfree(rcpu->queue);
+ kfree(rcpu);
+ }
+}
+
+static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
+{
+ atomic_inc(&rcpu->refcnt);
+}
+
+/* called from workqueue, to workaround syscall using preempt_disable */
+static void cpu_map_kthread_stop(struct work_struct *work)
+{
+ struct bpf_cpu_map_entry *rcpu;
+
+ rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq);
+
+ /* Wait for flush in __cpu_map_entry_free(), via full RCU barrier,
+ * as it waits until all in-flight call_rcu() callbacks complete.
+ */
+ rcu_barrier();
+
+ /* kthread_stop will wake_up_process and wait for it to complete */
+ kthread_stop(rcpu->kthread);
+}
+
+/* For now, xdp_pkt is a cpumap internal data structure, with info
+ * carried between enqueue to dequeue. It is mapped into the top
+ * headroom of the packet, to avoid allocating separate mem.
+ */
+struct xdp_pkt {
+ void *data;
+ u16 len;
+ u16 headroom;
+ u16 metasize;
+ struct net_device *dev_rx;
+};
+
+/* Convert xdp_buff to xdp_pkt */
+static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp)
+{
+ struct xdp_pkt *xdp_pkt;
+ int metasize;
+ int headroom;
+
+ /* Assure headroom is available for storing info */
+ headroom = xdp->data - xdp->data_hard_start;
+ metasize = xdp->data - xdp->data_meta;
+ metasize = metasize > 0 ? metasize : 0;
+ if (unlikely((headroom - metasize) < sizeof(*xdp_pkt)))
+ return NULL;
+
+ /* Store info in top of packet */
+ xdp_pkt = xdp->data_hard_start;
+
+ xdp_pkt->data = xdp->data;
+ xdp_pkt->len = xdp->data_end - xdp->data;
+ xdp_pkt->headroom = headroom - sizeof(*xdp_pkt);
+ xdp_pkt->metasize = metasize;
+
+ return xdp_pkt;
+}
+
+struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
+ struct xdp_pkt *xdp_pkt)
+{
+ unsigned int frame_size;
+ void *pkt_data_start;
+ struct sk_buff *skb;
+
+ /* build_skb need to place skb_shared_info after SKB end, and
+ * also want to know the memory "truesize". Thus, need to
+ * know the memory frame size backing xdp_buff.
+ *
+ * XDP was designed to have PAGE_SIZE frames, but this
+ * assumption is not longer true with ixgbe and i40e. It
+ * would be preferred to set frame_size to 2048 or 4096
+ * depending on the driver.
+ * frame_size = 2048;
+ * frame_len = frame_size - sizeof(*xdp_pkt);
+ *
+ * Instead, with info avail, skb_shared_info in placed after
+ * packet len. This, unfortunately fakes the truesize.
+ * Another disadvantage of this approach, the skb_shared_info
+ * is not at a fixed memory location, with mixed length
+ * packets, which is bad for cache-line hotness.
+ */
+ frame_size = SKB_DATA_ALIGN(xdp_pkt->len) + xdp_pkt->headroom +
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+ pkt_data_start = xdp_pkt->data - xdp_pkt->headroom;
+ skb = build_skb(pkt_data_start, frame_size);
+ if (!skb)
+ return NULL;
+
+ skb_reserve(skb, xdp_pkt->headroom);
+ __skb_put(skb, xdp_pkt->len);
+ if (xdp_pkt->metasize)
+ skb_metadata_set(skb, xdp_pkt->metasize);
+
+ /* Essential SKB info: protocol and skb->dev */
+ skb->protocol = eth_type_trans(skb, xdp_pkt->dev_rx);
+
+ /* Optional SKB info, currently missing:
+ * - HW checksum info (skb->ip_summed)
+ * - HW RX hash (skb_set_hash)
+ * - RX ring dev queue index (skb_record_rx_queue)
+ */
+
+ return skb;
+}
+
+static int cpu_map_kthread_run(void *data)
+{
+ struct bpf_cpu_map_entry *rcpu = data;
+
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ /* When kthread gives stop order, then rcpu have been disconnected
+ * from map, thus no new packets can enter. Remaining in-flight
+ * per CPU stored packets are flushed to this queue. Wait honoring
+ * kthread_stop signal until queue is empty.
+ */
+ while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
+ unsigned int processed = 0, drops = 0, sched = 0;
+ struct xdp_pkt *xdp_pkt;
+
+ /* Release CPU reschedule checks */
+ if (__ptr_ring_empty(rcpu->queue)) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ /* Recheck to avoid lost wake-up */
+ if (__ptr_ring_empty(rcpu->queue)) {
+ schedule();
+ sched = 1;
+ } else {
+ __set_current_state(TASK_RUNNING);
+ }
+ } else {
+ sched = cond_resched();
+ }
+
+ /* Process packets in rcpu->queue */
+ local_bh_disable();
+ /*
+ * The bpf_cpu_map_entry is single consumer, with this
+ * kthread CPU pinned. Lockless access to ptr_ring
+ * consume side valid as no-resize allowed of queue.
+ */
+ while ((xdp_pkt = __ptr_ring_consume(rcpu->queue))) {
+ struct sk_buff *skb;
+ int ret;
+
+ skb = cpu_map_build_skb(rcpu, xdp_pkt);
+ if (!skb) {
+ page_frag_free(xdp_pkt);
+ continue;
+ }
+
+ /* Inject into network stack */
+ ret = netif_receive_skb_core(skb);
+ if (ret == NET_RX_DROP)
+ drops++;
+
+ /* Limit BH-disable period */
+ if (++processed == 8)
+ break;
+ }
+ /* Feedback loop via tracepoint */
+ trace_xdp_cpumap_kthread(rcpu->map_id, processed, drops, sched);
+
+ local_bh_enable(); /* resched point, may call do_softirq() */
+ }
+ __set_current_state(TASK_RUNNING);
+
+ put_cpu_map_entry(rcpu);
+ return 0;
+}
+
+struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id)
+{
+ gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN;
+ struct bpf_cpu_map_entry *rcpu;
+ int numa, err;
+
+ /* Have map->numa_node, but choose node of redirect target CPU */
+ numa = cpu_to_node(cpu);
+
+ rcpu = kzalloc_node(sizeof(*rcpu), gfp, numa);
+ if (!rcpu)
+ return NULL;
+
+ /* Alloc percpu bulkq */
+ rcpu->bulkq = __alloc_percpu_gfp(sizeof(*rcpu->bulkq),
+ sizeof(void *), gfp);
+ if (!rcpu->bulkq)
+ goto free_rcu;
+
+ /* Alloc queue */
+ rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa);
+ if (!rcpu->queue)
+ goto free_bulkq;
+
+ err = ptr_ring_init(rcpu->queue, qsize, gfp);
+ if (err)
+ goto free_queue;
+
+ rcpu->cpu = cpu;
+ rcpu->map_id = map_id;
+ rcpu->qsize = qsize;
+
+ /* Setup kthread */
+ rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
+ "cpumap/%d/map:%d", cpu, map_id);
+ if (IS_ERR(rcpu->kthread))
+ goto free_ptr_ring;
+
+ get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */
+ get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */
+
+ /* Make sure kthread runs on a single CPU */
+ kthread_bind(rcpu->kthread, cpu);
+ wake_up_process(rcpu->kthread);
+
+ return rcpu;
+
+free_ptr_ring:
+ ptr_ring_cleanup(rcpu->queue, NULL);
+free_queue:
+ kfree(rcpu->queue);
+free_bulkq:
+ free_percpu(rcpu->bulkq);
+free_rcu:
+ kfree(rcpu);
+ return NULL;
+}
+
+void __cpu_map_entry_free(struct rcu_head *rcu)
+{
+ struct bpf_cpu_map_entry *rcpu;
+ int cpu;
+
+ /* This cpu_map_entry have been disconnected from map and one
+ * RCU graze-period have elapsed. Thus, XDP cannot queue any
+ * new packets and cannot change/set flush_needed that can
+ * find this entry.
+ */
+ rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu);
+
+ /* Flush remaining packets in percpu bulkq */
+ for_each_online_cpu(cpu) {
+ struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu);
+
+ /* No concurrent bq_enqueue can run at this point */
+ bq_flush_to_queue(rcpu, bq);
+ }
+ free_percpu(rcpu->bulkq);
+ /* Cannot kthread_stop() here, last put free rcpu resources */
+ put_cpu_map_entry(rcpu);
+}
+
+/* After xchg pointer to bpf_cpu_map_entry, use the call_rcu() to
+ * ensure any driver rcu critical sections have completed, but this
+ * does not guarantee a flush has happened yet. Because driver side
+ * rcu_read_lock/unlock only protects the running XDP program. The
+ * atomic xchg and NULL-ptr check in __cpu_map_flush() makes sure a
+ * pending flush op doesn't fail.
+ *
+ * The bpf_cpu_map_entry is still used by the kthread, and there can
+ * still be pending packets (in queue and percpu bulkq). A refcnt
+ * makes sure to last user (kthread_stop vs. call_rcu) free memory
+ * resources.
+ *
+ * The rcu callback __cpu_map_entry_free flush remaining packets in
+ * percpu bulkq to queue. Due to caller map_delete_elem() disable
+ * preemption, cannot call kthread_stop() to make sure queue is empty.
+ * Instead a work_queue is started for stopping kthread,
+ * cpu_map_kthread_stop, which waits for an RCU graze period before
+ * stopping kthread, emptying the queue.
+ */
+void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
+ u32 key_cpu, struct bpf_cpu_map_entry *rcpu)
+{
+ struct bpf_cpu_map_entry *old_rcpu;
+
+ old_rcpu = xchg(&cmap->cpu_map[key_cpu], rcpu);
+ if (old_rcpu) {
+ call_rcu(&old_rcpu->rcu, __cpu_map_entry_free);
+ INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop);
+ schedule_work(&old_rcpu->kthread_stop_wq);
+ }
+}
+
+int cpu_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ u32 key_cpu = *(u32 *)key;
+
+ if (key_cpu >= map->max_entries)
+ return -EINVAL;
+
+ /* notice caller map_delete_elem() use preempt_disable() */
+ __cpu_map_entry_replace(cmap, key_cpu, NULL);
+ return 0;
+}
+
+int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ struct bpf_cpu_map_entry *rcpu;
+
+ /* Array index key correspond to CPU number */
+ u32 key_cpu = *(u32 *)key;
+ /* Value is the queue size */
+ u32 qsize = *(u32 *)value;
+
+ if (unlikely(map_flags > BPF_EXIST))
+ return -EINVAL;
+ if (unlikely(key_cpu >= cmap->map.max_entries))
+ return -E2BIG;
+ if (unlikely(map_flags == BPF_NOEXIST))
+ return -EEXIST;
+ if (unlikely(qsize > 16384)) /* sanity limit on qsize */
+ return -EOVERFLOW;
+
+ /* Make sure CPU is a valid possible cpu */
+ if (!cpu_possible(key_cpu))
+ return -ENODEV;
+
+ if (qsize == 0) {
+ rcpu = NULL; /* Same as deleting */
+ } else {
+ /* Updating qsize cause re-allocation of bpf_cpu_map_entry */
+ rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id);
+ if (!rcpu)
+ return -ENOMEM;
+ }
+ rcu_read_lock();
+ __cpu_map_entry_replace(cmap, key_cpu, rcpu);
+ rcu_read_unlock();
+ return 0;
+}
+
+void cpu_map_free(struct bpf_map *map)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ int cpu;
+ u32 i;
+
+ /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+ * so the bpf programs (can be more than one that used this map) were
+ * disconnected from events. Wait for outstanding critical sections in
+ * these programs to complete. The rcu critical section only guarantees
+ * no further "XDP/bpf-side" reads against bpf_cpu_map->cpu_map.
+ * It does __not__ ensure pending flush operations (if any) are
+ * complete.
+ */
+ synchronize_rcu();
+
+ /* To ensure all pending flush operations have completed wait for flush
+ * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
+ * Because the above synchronize_rcu() ensures the map is disconnected
+ * from the program we can assume no new bits will be set.
+ */
+ for_each_online_cpu(cpu) {
+ unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu);
+
+ while (!bitmap_empty(bitmap, cmap->map.max_entries))
+ cond_resched();
+ }
+
+ /* For cpu_map the remote CPUs can still be using the entries
+ * (struct bpf_cpu_map_entry).
+ */
+ for (i = 0; i < cmap->map.max_entries; i++) {
+ struct bpf_cpu_map_entry *rcpu;
+
+ rcpu = READ_ONCE(cmap->cpu_map[i]);
+ if (!rcpu)
+ continue;
+
+ /* bq flush and cleanup happens after RCU graze-period */
+ __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
+ }
+ free_percpu(cmap->flush_needed);
+ bpf_map_area_free(cmap->cpu_map);
+ kfree(cmap);
+}
+
+struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ struct bpf_cpu_map_entry *rcpu;
+
+ if (key >= map->max_entries)
+ return NULL;
+
+ rcpu = READ_ONCE(cmap->cpu_map[key]);
+ return rcpu;
+}
+
+static void *cpu_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_cpu_map_entry *rcpu =
+ __cpu_map_lookup_elem(map, *(u32 *)key);
+
+ return rcpu ? &rcpu->qsize : NULL;
+}
+
+static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ u32 index = key ? *(u32 *)key : U32_MAX;
+ u32 *next = next_key;
+
+ if (index >= cmap->map.max_entries) {
+ *next = 0;
+ return 0;
+ }
+
+ if (index == cmap->map.max_entries - 1)
+ return -ENOENT;
+ *next = index + 1;
+ return 0;
+}
+
+const struct bpf_map_ops cpu_map_ops = {
+ .map_alloc = cpu_map_alloc,
+ .map_free = cpu_map_free,
+ .map_delete_elem = cpu_map_delete_elem,
+ .map_update_elem = cpu_map_update_elem,
+ .map_lookup_elem = cpu_map_lookup_elem,
+ .map_get_next_key = cpu_map_get_next_key,
+};
+
+static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
+ struct xdp_bulk_queue *bq)
+{
+ unsigned int processed = 0, drops = 0;
+ const int to_cpu = rcpu->cpu;
+ struct ptr_ring *q;
+ int i;
+
+ if (unlikely(!bq->count))
+ return 0;
+
+ q = rcpu->queue;
+ spin_lock(&q->producer_lock);
+
+ for (i = 0; i < bq->count; i++) {
+ void *xdp_pkt = bq->q[i];
+ int err;
+
+ err = __ptr_ring_produce(q, xdp_pkt);
+ if (err) {
+ drops++;
+ page_frag_free(xdp_pkt); /* Free xdp_pkt */
+ }
+ processed++;
+ }
+ bq->count = 0;
+ spin_unlock(&q->producer_lock);
+
+ /* Feedback loop via tracepoints */
+ trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu);
+ return 0;
+}
+
+/* Runs under RCU-read-side, plus in softirq under NAPI protection.
+ * Thus, safe percpu variable access.
+ */
+static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt)
+{
+ struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
+
+ if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
+ bq_flush_to_queue(rcpu, bq);
+
+ /* Notice, xdp_buff/page MUST be queued here, long enough for
+ * driver to code invoking us to finished, due to driver
+ * (e.g. ixgbe) recycle tricks based on page-refcnt.
+ *
+ * Thus, incoming xdp_pkt is always queued here (else we race
+ * with another CPU on page-refcnt and remaining driver code).
+ * Queue time is very short, as driver will invoke flush
+ * operation, when completing napi->poll call.
+ */
+ bq->q[bq->count++] = xdp_pkt;
+ return 0;
+}
+
+int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
+ struct net_device *dev_rx)
+{
+ struct xdp_pkt *xdp_pkt;
+
+ xdp_pkt = convert_to_xdp_pkt(xdp);
+ if (unlikely(!xdp_pkt))
+ return -EOVERFLOW;
+
+ /* Info needed when constructing SKB on remote CPU */
+ xdp_pkt->dev_rx = dev_rx;
+
+ bq_enqueue(rcpu, xdp_pkt);
+ return 0;
+}
+
+void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
+
+ __set_bit(bit, bitmap);
+}
+
+void __cpu_map_flush(struct bpf_map *map)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
+ u32 bit;
+
+ /* The napi->poll softirq makes sure __cpu_map_insert_ctx()
+ * and __cpu_map_flush() happen on same CPU. Thus, the percpu
+ * bitmap indicate which percpu bulkq have packets.
+ */
+ for_each_set_bit(bit, bitmap, map->max_entries) {
+ struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]);
+ struct xdp_bulk_queue *bq;
+
+ /* This is possible if entry is removed by user space
+ * between xdp redirect and flush op.
+ */
+ if (unlikely(!rcpu))
+ continue;
+
+ __clear_bit(bit, bitmap);
+
+ /* Flush all frames in bulkq to real queue */
+ bq = this_cpu_ptr(rcpu->bulkq);
+ bq_flush_to_queue(rcpu, bq);
+
+ /* If already running, costs spin_lock_irqsave + smb_mb */
+ wake_up_process(rcpu->kthread);
+ }
+}
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 959c9a07f318..ebdef54bf7df 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -50,6 +50,9 @@
#include <linux/bpf.h>
#include <linux/filter.h>
+#define DEV_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
struct bpf_dtab_netdev {
struct net_device *dev;
struct bpf_dtab *dtab;
@@ -69,18 +72,21 @@ static LIST_HEAD(dev_map_list);
static u64 dev_map_bitmap_size(const union bpf_attr *attr)
{
- return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
+ return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
}
static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
{
struct bpf_dtab *dtab;
+ int err = -EINVAL;
u64 cost;
- int err;
+
+ if (!capable(CAP_NET_ADMIN))
+ return ERR_PTR(-EPERM);
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
- attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+ attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
dtab = kzalloc(sizeof(*dtab), GFP_USER);
@@ -108,9 +114,12 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
if (err)
goto free_dtab;
+ err = -ENOMEM;
+
/* A per cpu bitfield with a bit per possible net device */
- dtab->flush_needed = __alloc_percpu(dev_map_bitmap_size(attr),
- __alignof__(unsigned long));
+ dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
+ __alignof__(unsigned long),
+ GFP_KERNEL | __GFP_NOWARN);
if (!dtab->flush_needed)
goto free_dtab;
@@ -128,7 +137,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
free_dtab:
free_percpu(dtab->flush_needed);
kfree(dtab);
- return ERR_PTR(-ENOMEM);
+ return ERR_PTR(err);
}
static void dev_map_free(struct bpf_map *map)
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
new file mode 100644
index 000000000000..e682850c9715
--- /dev/null
+++ b/kernel/bpf/disasm.c
@@ -0,0 +1,214 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/bpf.h>
+
+#include "disasm.h"
+
+#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x)
+static const char * const func_id_str[] = {
+ __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN)
+};
+#undef __BPF_FUNC_STR_FN
+
+const char *func_id_name(int id)
+{
+ BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID);
+
+ if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id])
+ return func_id_str[id];
+ else
+ return "unknown";
+}
+
+const char *const bpf_class_string[8] = {
+ [BPF_LD] = "ld",
+ [BPF_LDX] = "ldx",
+ [BPF_ST] = "st",
+ [BPF_STX] = "stx",
+ [BPF_ALU] = "alu",
+ [BPF_JMP] = "jmp",
+ [BPF_RET] = "BUG",
+ [BPF_ALU64] = "alu64",
+};
+
+const char *const bpf_alu_string[16] = {
+ [BPF_ADD >> 4] = "+=",
+ [BPF_SUB >> 4] = "-=",
+ [BPF_MUL >> 4] = "*=",
+ [BPF_DIV >> 4] = "/=",
+ [BPF_OR >> 4] = "|=",
+ [BPF_AND >> 4] = "&=",
+ [BPF_LSH >> 4] = "<<=",
+ [BPF_RSH >> 4] = ">>=",
+ [BPF_NEG >> 4] = "neg",
+ [BPF_MOD >> 4] = "%=",
+ [BPF_XOR >> 4] = "^=",
+ [BPF_MOV >> 4] = "=",
+ [BPF_ARSH >> 4] = "s>>=",
+ [BPF_END >> 4] = "endian",
+};
+
+static const char *const bpf_ldst_string[] = {
+ [BPF_W >> 3] = "u32",
+ [BPF_H >> 3] = "u16",
+ [BPF_B >> 3] = "u8",
+ [BPF_DW >> 3] = "u64",
+};
+
+static const char *const bpf_jmp_string[16] = {
+ [BPF_JA >> 4] = "jmp",
+ [BPF_JEQ >> 4] = "==",
+ [BPF_JGT >> 4] = ">",
+ [BPF_JLT >> 4] = "<",
+ [BPF_JGE >> 4] = ">=",
+ [BPF_JLE >> 4] = "<=",
+ [BPF_JSET >> 4] = "&",
+ [BPF_JNE >> 4] = "!=",
+ [BPF_JSGT >> 4] = "s>",
+ [BPF_JSLT >> 4] = "s<",
+ [BPF_JSGE >> 4] = "s>=",
+ [BPF_JSLE >> 4] = "s<=",
+ [BPF_CALL >> 4] = "call",
+ [BPF_EXIT >> 4] = "exit",
+};
+
+static void print_bpf_end_insn(bpf_insn_print_cb verbose,
+ struct bpf_verifier_env *env,
+ const struct bpf_insn *insn)
+{
+ verbose(env, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg,
+ BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le",
+ insn->imm, insn->dst_reg);
+}
+
+void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env,
+ const struct bpf_insn *insn, bool allow_ptr_leaks)
+{
+ u8 class = BPF_CLASS(insn->code);
+
+ if (class == BPF_ALU || class == BPF_ALU64) {
+ if (BPF_OP(insn->code) == BPF_END) {
+ if (class == BPF_ALU64)
+ verbose(env, "BUG_alu64_%02x\n", insn->code);
+ else
+ print_bpf_end_insn(verbose, env, insn);
+ } else if (BPF_OP(insn->code) == BPF_NEG) {
+ verbose(env, "(%02x) r%d = %s-r%d\n",
+ insn->code, insn->dst_reg,
+ class == BPF_ALU ? "(u32) " : "",
+ insn->dst_reg);
+ } else if (BPF_SRC(insn->code) == BPF_X) {
+ verbose(env, "(%02x) %sr%d %s %sr%d\n",
+ insn->code, class == BPF_ALU ? "(u32) " : "",
+ insn->dst_reg,
+ bpf_alu_string[BPF_OP(insn->code) >> 4],
+ class == BPF_ALU ? "(u32) " : "",
+ insn->src_reg);
+ } else {
+ verbose(env, "(%02x) %sr%d %s %s%d\n",
+ insn->code, class == BPF_ALU ? "(u32) " : "",
+ insn->dst_reg,
+ bpf_alu_string[BPF_OP(insn->code) >> 4],
+ class == BPF_ALU ? "(u32) " : "",
+ insn->imm);
+ }
+ } else if (class == BPF_STX) {
+ if (BPF_MODE(insn->code) == BPF_MEM)
+ verbose(env, "(%02x) *(%s *)(r%d %+d) = r%d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg,
+ insn->off, insn->src_reg);
+ else if (BPF_MODE(insn->code) == BPF_XADD)
+ verbose(env, "(%02x) lock *(%s *)(r%d %+d) += r%d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg, insn->off,
+ insn->src_reg);
+ else
+ verbose(env, "BUG_%02x\n", insn->code);
+ } else if (class == BPF_ST) {
+ if (BPF_MODE(insn->code) != BPF_MEM) {
+ verbose(env, "BUG_st_%02x\n", insn->code);
+ return;
+ }
+ verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg,
+ insn->off, insn->imm);
+ } else if (class == BPF_LDX) {
+ if (BPF_MODE(insn->code) != BPF_MEM) {
+ verbose(env, "BUG_ldx_%02x\n", insn->code);
+ return;
+ }
+ verbose(env, "(%02x) r%d = *(%s *)(r%d %+d)\n",
+ insn->code, insn->dst_reg,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->src_reg, insn->off);
+ } else if (class == BPF_LD) {
+ if (BPF_MODE(insn->code) == BPF_ABS) {
+ verbose(env, "(%02x) r0 = *(%s *)skb[%d]\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->imm);
+ } else if (BPF_MODE(insn->code) == BPF_IND) {
+ verbose(env, "(%02x) r0 = *(%s *)skb[r%d + %d]\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->src_reg, insn->imm);
+ } else if (BPF_MODE(insn->code) == BPF_IMM &&
+ BPF_SIZE(insn->code) == BPF_DW) {
+ /* At this point, we already made sure that the second
+ * part of the ldimm64 insn is accessible.
+ */
+ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
+ bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD;
+
+ if (map_ptr && !allow_ptr_leaks)
+ imm = 0;
+
+ verbose(env, "(%02x) r%d = 0x%llx\n", insn->code,
+ insn->dst_reg, (unsigned long long)imm);
+ } else {
+ verbose(env, "BUG_ld_%02x\n", insn->code);
+ return;
+ }
+ } else if (class == BPF_JMP) {
+ u8 opcode = BPF_OP(insn->code);
+
+ if (opcode == BPF_CALL) {
+ verbose(env, "(%02x) call %s#%d\n", insn->code,
+ func_id_name(insn->imm), insn->imm);
+ } else if (insn->code == (BPF_JMP | BPF_JA)) {
+ verbose(env, "(%02x) goto pc%+d\n",
+ insn->code, insn->off);
+ } else if (insn->code == (BPF_JMP | BPF_EXIT)) {
+ verbose(env, "(%02x) exit\n", insn->code);
+ } else if (BPF_SRC(insn->code) == BPF_X) {
+ verbose(env, "(%02x) if r%d %s r%d goto pc%+d\n",
+ insn->code, insn->dst_reg,
+ bpf_jmp_string[BPF_OP(insn->code) >> 4],
+ insn->src_reg, insn->off);
+ } else {
+ verbose(env, "(%02x) if r%d %s 0x%x goto pc%+d\n",
+ insn->code, insn->dst_reg,
+ bpf_jmp_string[BPF_OP(insn->code) >> 4],
+ insn->imm, insn->off);
+ }
+ } else {
+ verbose(env, "(%02x) %s\n",
+ insn->code, bpf_class_string[class]);
+ }
+}
diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h
new file mode 100644
index 000000000000..8de977e420b6
--- /dev/null
+++ b/kernel/bpf/disasm.h
@@ -0,0 +1,32 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef __BPF_DISASM_H__
+#define __BPF_DISASM_H__
+
+#include <linux/bpf.h>
+#include <linux/kernel.h>
+#include <linux/stringify.h>
+
+extern const char *const bpf_alu_string[16];
+extern const char *const bpf_class_string[8];
+
+const char *func_id_name(int id);
+
+struct bpf_verifier_env;
+typedef void (*bpf_insn_print_cb)(struct bpf_verifier_env *env,
+ const char *, ...);
+void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env,
+ const struct bpf_insn *insn, bool allow_ptr_leaks);
+
+#endif
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 431126f31ea3..e469e05c8e83 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -18,8 +18,9 @@
#include "bpf_lru_list.h"
#include "map_in_map.h"
-#define HTAB_CREATE_FLAG_MASK \
- (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE)
+#define HTAB_CREATE_FLAG_MASK \
+ (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \
+ BPF_F_RDONLY | BPF_F_WRONLY)
struct bucket {
struct hlist_nulls_head head;
@@ -317,10 +318,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
*/
goto free_htab;
- if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE)
- /* make sure the size for pcpu_alloc() is reasonable */
- goto free_htab;
-
htab->elem_size = sizeof(struct htab_elem) +
round_up(htab->map.key_size, 8);
if (percpu)
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index e833ed914358..01aaef1a77c5 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -295,7 +295,7 @@ out:
}
static void *bpf_obj_do_get(const struct filename *pathname,
- enum bpf_type *type)
+ enum bpf_type *type, int flags)
{
struct inode *inode;
struct path path;
@@ -307,7 +307,7 @@ static void *bpf_obj_do_get(const struct filename *pathname,
return ERR_PTR(ret);
inode = d_backing_inode(path.dentry);
- ret = inode_permission(inode, MAY_WRITE);
+ ret = inode_permission(inode, ACC_MODE(flags));
if (ret)
goto out;
@@ -326,18 +326,23 @@ out:
return ERR_PTR(ret);
}
-int bpf_obj_get_user(const char __user *pathname)
+int bpf_obj_get_user(const char __user *pathname, int flags)
{
enum bpf_type type = BPF_TYPE_UNSPEC;
struct filename *pname;
int ret = -ENOENT;
+ int f_flags;
void *raw;
+ f_flags = bpf_get_file_flag(flags);
+ if (f_flags < 0)
+ return f_flags;
+
pname = getname(pathname);
if (IS_ERR(pname))
return PTR_ERR(pname);
- raw = bpf_obj_do_get(pname, &type);
+ raw = bpf_obj_do_get(pname, &type, f_flags);
if (IS_ERR(raw)) {
ret = PTR_ERR(raw);
goto out;
@@ -346,7 +351,7 @@ int bpf_obj_get_user(const char __user *pathname)
if (type == BPF_TYPE_PROG)
ret = bpf_prog_new_fd(raw);
else if (type == BPF_TYPE_MAP)
- ret = bpf_map_new_fd(raw);
+ ret = bpf_map_new_fd(raw, f_flags);
else
goto out;
@@ -363,6 +368,7 @@ out:
putname(pname);
return ret;
}
+EXPORT_SYMBOL_GPL(bpf_obj_get_user);
static void bpf_evict_inode(struct inode *inode)
{
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 1b767844a76f..885e45479680 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -389,10 +389,99 @@ out:
return ret;
}
-static int trie_delete_elem(struct bpf_map *map, void *key)
+/* Called from syscall or from eBPF program */
+static int trie_delete_elem(struct bpf_map *map, void *_key)
{
- /* TODO */
- return -ENOSYS;
+ struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+ struct bpf_lpm_trie_key *key = _key;
+ struct lpm_trie_node __rcu **trim, **trim2;
+ struct lpm_trie_node *node, *parent;
+ unsigned long irq_flags;
+ unsigned int next_bit;
+ size_t matchlen = 0;
+ int ret = 0;
+
+ if (key->prefixlen > trie->max_prefixlen)
+ return -EINVAL;
+
+ raw_spin_lock_irqsave(&trie->lock, irq_flags);
+
+ /* Walk the tree looking for an exact key/length match and keeping
+ * track of the path we traverse. We will need to know the node
+ * we wish to delete, and the slot that points to the node we want
+ * to delete. We may also need to know the nodes parent and the
+ * slot that contains it.
+ */
+ trim = &trie->root;
+ trim2 = trim;
+ parent = NULL;
+ while ((node = rcu_dereference_protected(
+ *trim, lockdep_is_held(&trie->lock)))) {
+ matchlen = longest_prefix_match(trie, node, key);
+
+ if (node->prefixlen != matchlen ||
+ node->prefixlen == key->prefixlen)
+ break;
+
+ parent = node;
+ trim2 = trim;
+ next_bit = extract_bit(key->data, node->prefixlen);
+ trim = &node->child[next_bit];
+ }
+
+ if (!node || node->prefixlen != key->prefixlen ||
+ (node->flags & LPM_TREE_NODE_FLAG_IM)) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ trie->n_entries--;
+
+ /* If the node we are removing has two children, simply mark it
+ * as intermediate and we are done.
+ */
+ if (rcu_access_pointer(node->child[0]) &&
+ rcu_access_pointer(node->child[1])) {
+ node->flags |= LPM_TREE_NODE_FLAG_IM;
+ goto out;
+ }
+
+ /* If the parent of the node we are about to delete is an intermediate
+ * node, and the deleted node doesn't have any children, we can delete
+ * the intermediate parent as well and promote its other child
+ * up the tree. Doing this maintains the invariant that all
+ * intermediate nodes have exactly 2 children and that there are no
+ * unnecessary intermediate nodes in the tree.
+ */
+ if (parent && (parent->flags & LPM_TREE_NODE_FLAG_IM) &&
+ !node->child[0] && !node->child[1]) {
+ if (node == rcu_access_pointer(parent->child[0]))
+ rcu_assign_pointer(
+ *trim2, rcu_access_pointer(parent->child[1]));
+ else
+ rcu_assign_pointer(
+ *trim2, rcu_access_pointer(parent->child[0]));
+ kfree_rcu(parent, rcu);
+ kfree_rcu(node, rcu);
+ goto out;
+ }
+
+ /* The node we are removing has either zero or one child. If there
+ * is a child, move it into the removed node's slot then delete
+ * the node. Otherwise just clear the slot and delete the node.
+ */
+ if (node->child[0])
+ rcu_assign_pointer(*trim, rcu_access_pointer(node->child[0]));
+ else if (node->child[1])
+ rcu_assign_pointer(*trim, rcu_access_pointer(node->child[1]));
+ else
+ RCU_INIT_POINTER(*trim, NULL);
+ kfree_rcu(node, rcu);
+
+out:
+ raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
+
+ return ret;
}
#define LPM_DATA_SIZE_MAX 256
@@ -406,7 +495,8 @@ static int trie_delete_elem(struct bpf_map *map, void *key)
#define LPM_KEY_SIZE_MAX LPM_KEY_SIZE(LPM_DATA_SIZE_MAX)
#define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN)
-#define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE)
+#define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE | \
+ BPF_F_RDONLY | BPF_F_WRONLY)
static struct bpf_map *trie_alloc(union bpf_attr *attr)
{
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
new file mode 100644
index 000000000000..68ec884440b7
--- /dev/null
+++ b/kernel/bpf/offload.c
@@ -0,0 +1,191 @@
+#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bug.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/printk.h>
+#include <linux/rtnetlink.h>
+
+/* protected by RTNL */
+static LIST_HEAD(bpf_prog_offload_devs);
+
+int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
+{
+ struct net *net = current->nsproxy->net_ns;
+ struct bpf_dev_offload *offload;
+
+ if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS &&
+ attr->prog_type != BPF_PROG_TYPE_XDP)
+ return -EINVAL;
+
+ if (attr->prog_flags)
+ return -EINVAL;
+
+ offload = kzalloc(sizeof(*offload), GFP_USER);
+ if (!offload)
+ return -ENOMEM;
+
+ offload->prog = prog;
+ init_waitqueue_head(&offload->verifier_done);
+
+ rtnl_lock();
+ offload->netdev = __dev_get_by_index(net, attr->prog_ifindex);
+ if (!offload->netdev) {
+ rtnl_unlock();
+ kfree(offload);
+ return -EINVAL;
+ }
+
+ prog->aux->offload = offload;
+ list_add_tail(&offload->offloads, &bpf_prog_offload_devs);
+ rtnl_unlock();
+
+ return 0;
+}
+
+static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd,
+ struct netdev_bpf *data)
+{
+ struct net_device *netdev = prog->aux->offload->netdev;
+
+ ASSERT_RTNL();
+
+ if (!netdev)
+ return -ENODEV;
+ if (!netdev->netdev_ops->ndo_bpf)
+ return -EOPNOTSUPP;
+
+ data->command = cmd;
+
+ return netdev->netdev_ops->ndo_bpf(netdev, data);
+}
+
+int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env)
+{
+ struct netdev_bpf data = {};
+ int err;
+
+ data.verifier.prog = env->prog;
+
+ rtnl_lock();
+ err = __bpf_offload_ndo(env->prog, BPF_OFFLOAD_VERIFIER_PREP, &data);
+ if (err)
+ goto exit_unlock;
+
+ env->dev_ops = data.verifier.ops;
+
+ env->prog->aux->offload->dev_state = true;
+ env->prog->aux->offload->verifier_running = true;
+exit_unlock:
+ rtnl_unlock();
+ return err;
+}
+
+static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
+{
+ struct bpf_dev_offload *offload = prog->aux->offload;
+ struct netdev_bpf data = {};
+
+ /* Caution - if netdev is destroyed before the program, this function
+ * will be called twice.
+ */
+
+ data.offload.prog = prog;
+
+ if (offload->verifier_running)
+ wait_event(offload->verifier_done, !offload->verifier_running);
+
+ if (offload->dev_state)
+ WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data));
+
+ offload->dev_state = false;
+ list_del_init(&offload->offloads);
+ offload->netdev = NULL;
+}
+
+void bpf_prog_offload_destroy(struct bpf_prog *prog)
+{
+ struct bpf_dev_offload *offload = prog->aux->offload;
+
+ offload->verifier_running = false;
+ wake_up(&offload->verifier_done);
+
+ rtnl_lock();
+ __bpf_prog_offload_destroy(prog);
+ rtnl_unlock();
+
+ kfree(offload);
+}
+
+static int bpf_prog_offload_translate(struct bpf_prog *prog)
+{
+ struct bpf_dev_offload *offload = prog->aux->offload;
+ struct netdev_bpf data = {};
+ int ret;
+
+ data.offload.prog = prog;
+
+ offload->verifier_running = false;
+ wake_up(&offload->verifier_done);
+
+ rtnl_lock();
+ ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_TRANSLATE, &data);
+ rtnl_unlock();
+
+ return ret;
+}
+
+static unsigned int bpf_prog_warn_on_exec(const void *ctx,
+ const struct bpf_insn *insn)
+{
+ WARN(1, "attempt to execute device eBPF program on the host!");
+ return 0;
+}
+
+int bpf_prog_offload_compile(struct bpf_prog *prog)
+{
+ prog->bpf_func = bpf_prog_warn_on_exec;
+
+ return bpf_prog_offload_translate(prog);
+}
+
+const struct bpf_prog_ops bpf_offload_prog_ops = {
+};
+
+static int bpf_offload_notification(struct notifier_block *notifier,
+ ulong event, void *ptr)
+{
+ struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+ struct bpf_dev_offload *offload, *tmp;
+
+ ASSERT_RTNL();
+
+ switch (event) {
+ case NETDEV_UNREGISTER:
+ /* ignore namespace changes */
+ if (netdev->reg_state != NETREG_UNREGISTERING)
+ break;
+
+ list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs,
+ offloads) {
+ if (offload->netdev == netdev)
+ __bpf_prog_offload_destroy(offload->prog);
+ }
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block bpf_offload_notifier = {
+ .notifier_call = bpf_offload_notification,
+};
+
+static int __init bpf_offload_init(void)
+{
+ register_netdevice_notifier(&bpf_offload_notifier);
+ return 0;
+}
+
+subsys_initcall(bpf_offload_init);
diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index 5c51d1985b51..673fa6fe2d73 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
@@ -78,8 +78,10 @@ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
{
struct pcpu_freelist_head *head;
struct pcpu_freelist_node *node;
+ unsigned long flags;
int orig_cpu, cpu;
+ local_irq_save(flags);
orig_cpu = cpu = raw_smp_processor_id();
while (1) {
head = per_cpu_ptr(s->freelist, cpu);
@@ -87,14 +89,16 @@ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
node = head->first;
if (node) {
head->first = node->next;
- raw_spin_unlock(&head->lock);
+ raw_spin_unlock_irqrestore(&head->lock, flags);
return node;
}
raw_spin_unlock(&head->lock);
cpu = cpumask_next(cpu, cpu_possible_mask);
if (cpu >= nr_cpu_ids)
cpu = 0;
- if (cpu == orig_cpu)
+ if (cpu == orig_cpu) {
+ local_irq_restore(flags);
return NULL;
+ }
}
}
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 6424ce0e4969..5ee2e41893d9 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -39,6 +39,10 @@
#include <linux/workqueue.h>
#include <linux/list.h>
#include <net/strparser.h>
+#include <net/tcp.h>
+
+#define SOCK_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
struct bpf_stab {
struct bpf_map map;
@@ -92,21 +96,45 @@ static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
return rcu_dereference_sk_user_data(sk);
}
+/* compute the linear packet data range [data, data_end) for skb when
+ * sk_skb type programs are in use.
+ */
+static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb)
+{
+ TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb);
+}
+
+enum __sk_action {
+ __SK_DROP = 0,
+ __SK_PASS,
+ __SK_REDIRECT,
+};
+
static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
{
struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict);
int rc;
if (unlikely(!prog))
- return SK_DROP;
+ return __SK_DROP;
skb_orphan(skb);
+ /* We need to ensure that BPF metadata for maps is also cleared
+ * when we orphan the skb so that we don't have the possibility
+ * to reference a stale map.
+ */
+ TCP_SKB_CB(skb)->bpf.map = NULL;
skb->sk = psock->sock;
- bpf_compute_data_end(skb);
+ bpf_compute_data_pointers(skb);
+ preempt_disable();
rc = (*prog->bpf_func)(skb, prog->insnsi);
+ preempt_enable();
skb->sk = NULL;
- return rc;
+ /* Moving return codes from UAPI namespace into internal namespace */
+ return rc == SK_PASS ?
+ (TCP_SKB_CB(skb)->bpf.map ? __SK_REDIRECT : __SK_PASS) :
+ __SK_DROP;
}
static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
@@ -114,17 +142,10 @@ static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
struct sock *sk;
int rc;
- /* Because we use per cpu values to feed input from sock redirect
- * in BPF program to do_sk_redirect_map() call we need to ensure we
- * are not preempted. RCU read lock is not sufficient in this case
- * with CONFIG_PREEMPT_RCU enabled so we must be explicit here.
- */
- preempt_disable();
rc = smap_verdict_func(psock, skb);
switch (rc) {
- case SK_REDIRECT:
- sk = do_sk_redirect_map();
- preempt_enable();
+ case __SK_REDIRECT:
+ sk = do_sk_redirect_map(skb);
if (likely(sk)) {
struct smap_psock *peer = smap_psock_sk(sk);
@@ -139,10 +160,8 @@ static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
}
}
/* Fall through and free skb otherwise */
- case SK_DROP:
+ case __SK_DROP:
default:
- if (rc != SK_REDIRECT)
- preempt_enable();
kfree_skb(skb);
}
}
@@ -369,7 +388,7 @@ static int smap_parse_func_strparser(struct strparser *strp,
* any socket yet.
*/
skb->sk = psock->sock;
- bpf_compute_data_end(skb);
+ bpf_compute_data_pointers(skb);
rc = (*prog->bpf_func)(skb, prog->insnsi);
skb->sk = NULL;
rcu_read_unlock();
@@ -487,9 +506,12 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
int err = -EINVAL;
u64 cost;
+ if (!capable(CAP_NET_ADMIN))
+ return ERR_PTR(-EPERM);
+
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
- attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+ attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
if (attr->value_size > KMALLOC_MAX_SIZE)
@@ -840,6 +862,12 @@ static int sock_map_update_elem(struct bpf_map *map,
return -EINVAL;
}
+ if (skops.sk->sk_type != SOCK_STREAM ||
+ skops.sk->sk_protocol != IPPROTO_TCP) {
+ fput(socket->file);
+ return -EOPNOTSUPP;
+ }
+
err = sock_map_ctx_update_elem(&skops, map, key, flags);
fput(socket->file);
return err;
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 135be433e9a0..a15bc636cc98 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -11,6 +11,9 @@
#include <linux/perf_event.h>
#include "percpu_freelist.h"
+#define STACK_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
struct stack_map_bucket {
struct pcpu_freelist_node fnode;
u32 hash;
@@ -60,7 +63,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
if (!capable(CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
- if (attr->map_flags & ~BPF_F_NUMA_NODE)
+ if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
/* check sanity of attributes */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cb17e1cd1d43..2c4cfeaa8d5e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -23,6 +23,9 @@
#include <linux/version.h>
#include <linux/kernel.h>
#include <linux/idr.h>
+#include <linux/cred.h>
+#include <linux/timekeeping.h>
+#include <linux/ctype.h>
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
@@ -31,6 +34,8 @@
#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map))
+#define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY)
+
DEFINE_PER_CPU(int, bpf_prog_active);
static DEFINE_IDR(prog_idr);
static DEFINE_SPINLOCK(prog_idr_lock);
@@ -186,15 +191,17 @@ static int bpf_map_alloc_id(struct bpf_map *map)
static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
{
+ unsigned long flags;
+
if (do_idr_lock)
- spin_lock_bh(&map_idr_lock);
+ spin_lock_irqsave(&map_idr_lock, flags);
else
__acquire(&map_idr_lock);
idr_remove(&map_idr, map->id);
if (do_idr_lock)
- spin_unlock_bh(&map_idr_lock);
+ spin_unlock_irqrestore(&map_idr_lock, flags);
else
__release(&map_idr_lock);
}
@@ -205,6 +212,7 @@ static void bpf_map_free_deferred(struct work_struct *work)
struct bpf_map *map = container_of(work, struct bpf_map, work);
bpf_map_uncharge_memlock(map);
+ security_bpf_map_free(map);
/* implementation dependent freeing */
map->ops->map_free(map);
}
@@ -289,17 +297,54 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
}
#endif
-static const struct file_operations bpf_map_fops = {
+static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz,
+ loff_t *ppos)
+{
+ /* We need this handler such that alloc_file() enables
+ * f_mode with FMODE_CAN_READ.
+ */
+ return -EINVAL;
+}
+
+static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf,
+ size_t siz, loff_t *ppos)
+{
+ /* We need this handler such that alloc_file() enables
+ * f_mode with FMODE_CAN_WRITE.
+ */
+ return -EINVAL;
+}
+
+const struct file_operations bpf_map_fops = {
#ifdef CONFIG_PROC_FS
.show_fdinfo = bpf_map_show_fdinfo,
#endif
.release = bpf_map_release,
+ .read = bpf_dummy_read,
+ .write = bpf_dummy_write,
};
-int bpf_map_new_fd(struct bpf_map *map)
+int bpf_map_new_fd(struct bpf_map *map, int flags)
{
+ int ret;
+
+ ret = security_bpf_map(map, OPEN_FMODE(flags));
+ if (ret < 0)
+ return ret;
+
return anon_inode_getfd("bpf-map", &bpf_map_fops, map,
- O_RDWR | O_CLOEXEC);
+ flags | O_CLOEXEC);
+}
+
+int bpf_get_file_flag(int flags)
+{
+ if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY))
+ return -EINVAL;
+ if (flags & BPF_F_RDONLY)
+ return O_RDONLY;
+ if (flags & BPF_F_WRONLY)
+ return O_WRONLY;
+ return O_RDWR;
}
/* helper macro to check that unused fields 'union bpf_attr' are zero */
@@ -310,18 +355,46 @@ int bpf_map_new_fd(struct bpf_map *map)
offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
sizeof(attr->CMD##_LAST_FIELD)) != NULL
-#define BPF_MAP_CREATE_LAST_FIELD numa_node
+/* dst and src must have at least BPF_OBJ_NAME_LEN number of bytes.
+ * Return 0 on success and < 0 on error.
+ */
+static int bpf_obj_name_cpy(char *dst, const char *src)
+{
+ const char *end = src + BPF_OBJ_NAME_LEN;
+
+ memset(dst, 0, BPF_OBJ_NAME_LEN);
+
+ /* Copy all isalnum() and '_' char */
+ while (src < end && *src) {
+ if (!isalnum(*src) && *src != '_')
+ return -EINVAL;
+ *dst++ = *src++;
+ }
+
+ /* No '\0' found in BPF_OBJ_NAME_LEN number of bytes */
+ if (src == end)
+ return -EINVAL;
+
+ return 0;
+}
+
+#define BPF_MAP_CREATE_LAST_FIELD map_name
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
int numa_node = bpf_map_attr_numa_node(attr);
struct bpf_map *map;
+ int f_flags;
int err;
err = CHECK_ATTR(BPF_MAP_CREATE);
if (err)
return -EINVAL;
+ f_flags = bpf_get_file_flag(attr->map_flags);
+ if (f_flags < 0)
+ return f_flags;
+
if (numa_node != NUMA_NO_NODE &&
((unsigned int)numa_node >= nr_node_ids ||
!node_online(numa_node)))
@@ -332,18 +405,26 @@ static int map_create(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
+ err = bpf_obj_name_cpy(map->name, attr->map_name);
+ if (err)
+ goto free_map_nouncharge;
+
atomic_set(&map->refcnt, 1);
atomic_set(&map->usercnt, 1);
- err = bpf_map_charge_memlock(map);
+ err = security_bpf_map_alloc(map);
if (err)
goto free_map_nouncharge;
+ err = bpf_map_charge_memlock(map);
+ if (err)
+ goto free_map_sec;
+
err = bpf_map_alloc_id(map);
if (err)
goto free_map;
- err = bpf_map_new_fd(map);
+ err = bpf_map_new_fd(map, f_flags);
if (err < 0) {
/* failed to allocate fd.
* bpf_map_put() is needed because the above
@@ -360,6 +441,8 @@ static int map_create(union bpf_attr *attr)
free_map:
bpf_map_uncharge_memlock(map);
+free_map_sec:
+ security_bpf_map_free(map);
free_map_nouncharge:
map->ops->map_free(map);
return err;
@@ -458,6 +541,11 @@ static int map_lookup_elem(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
+ if (!(f.file->f_mode & FMODE_CAN_READ)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
key = memdup_user(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
@@ -538,6 +626,11 @@ static int map_update_elem(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
+ if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
key = memdup_user(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
@@ -560,6 +653,12 @@ static int map_update_elem(union bpf_attr *attr)
if (copy_from_user(value, uvalue, value_size) != 0)
goto free_value;
+ /* Need to create a kthread, thus must support schedule */
+ if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
+ err = map->ops->map_update_elem(map, key, value, attr->flags);
+ goto out;
+ }
+
/* must increment bpf_prog_active to avoid kprobe+bpf triggering from
* inside bpf map update or delete otherwise deadlocks are possible
*/
@@ -590,7 +689,7 @@ static int map_update_elem(union bpf_attr *attr)
}
__this_cpu_dec(bpf_prog_active);
preempt_enable();
-
+out:
if (!err)
trace_bpf_map_update_elem(map, ufd, key, value);
free_value:
@@ -621,6 +720,11 @@ static int map_delete_elem(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
+ if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
key = memdup_user(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
@@ -664,6 +768,11 @@ static int map_get_next_key(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
+ if (!(f.file->f_mode & FMODE_CAN_READ)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
if (ukey) {
key = memdup_user(ukey, map->key_size);
if (IS_ERR(key)) {
@@ -701,9 +810,9 @@ err_put:
return err;
}
-static const struct bpf_verifier_ops * const bpf_prog_types[] = {
-#define BPF_PROG_TYPE(_id, _ops) \
- [_id] = &_ops,
+static const struct bpf_prog_ops * const bpf_prog_types[] = {
+#define BPF_PROG_TYPE(_id, _name) \
+ [_id] = & _name ## _prog_ops,
#define BPF_MAP_TYPE(_id, _ops)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
@@ -715,7 +824,10 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type])
return -EINVAL;
- prog->aux->ops = bpf_prog_types[type];
+ if (!bpf_prog_is_dev_bound(prog->aux))
+ prog->aux->ops = bpf_prog_types[type];
+ else
+ prog->aux->ops = &bpf_offload_prog_ops;
prog->type = type;
return 0;
}
@@ -818,6 +930,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
free_used_maps(aux);
bpf_prog_uncharge_memlock(aux->prog);
+ security_bpf_prog_free(aux);
bpf_prog_free(aux->prog);
}
@@ -865,15 +978,23 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
}
#endif
-static const struct file_operations bpf_prog_fops = {
+const struct file_operations bpf_prog_fops = {
#ifdef CONFIG_PROC_FS
.show_fdinfo = bpf_prog_show_fdinfo,
#endif
.release = bpf_prog_release,
+ .read = bpf_dummy_read,
+ .write = bpf_dummy_write,
};
int bpf_prog_new_fd(struct bpf_prog *prog)
{
+ int ret;
+
+ ret = security_bpf_prog(prog);
+ if (ret < 0)
+ return ret;
+
return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog,
O_RDWR | O_CLOEXEC);
}
@@ -936,7 +1057,23 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
}
EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
-static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
+static bool bpf_prog_get_ok(struct bpf_prog *prog,
+ enum bpf_prog_type *attach_type, bool attach_drv)
+{
+ /* not an attachment, just a refcount inc, always allow */
+ if (!attach_type)
+ return true;
+
+ if (prog->type != *attach_type)
+ return false;
+ if (bpf_prog_is_dev_bound(prog->aux) && !attach_drv)
+ return false;
+
+ return true;
+}
+
+static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type,
+ bool attach_drv)
{
struct fd f = fdget(ufd);
struct bpf_prog *prog;
@@ -944,7 +1081,7 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
prog = ____bpf_prog_get(f);
if (IS_ERR(prog))
return prog;
- if (type && prog->type != *type) {
+ if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) {
prog = ERR_PTR(-EINVAL);
goto out;
}
@@ -957,21 +1094,22 @@ out:
struct bpf_prog *bpf_prog_get(u32 ufd)
{
- return __bpf_prog_get(ufd, NULL);
+ return __bpf_prog_get(ufd, NULL, false);
}
-struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
+struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
+ bool attach_drv)
{
- struct bpf_prog *prog = __bpf_prog_get(ufd, &type);
+ struct bpf_prog *prog = __bpf_prog_get(ufd, &type, attach_drv);
if (!IS_ERR(prog))
trace_bpf_prog_get_type(prog);
return prog;
}
-EXPORT_SYMBOL_GPL(bpf_prog_get_type);
+EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev);
/* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD prog_flags
+#define BPF_PROG_LOAD_LAST_FIELD prog_ifindex
static int bpf_prog_load(union bpf_attr *attr)
{
@@ -1013,10 +1151,14 @@ static int bpf_prog_load(union bpf_attr *attr)
if (!prog)
return -ENOMEM;
- err = bpf_prog_charge_memlock(prog);
+ err = security_bpf_prog_alloc(prog->aux);
if (err)
goto free_prog_nouncharge;
+ err = bpf_prog_charge_memlock(prog);
+ if (err)
+ goto free_prog_sec;
+
prog->len = attr->insn_cnt;
err = -EFAULT;
@@ -1030,11 +1172,22 @@ static int bpf_prog_load(union bpf_attr *attr)
atomic_set(&prog->aux->refcnt, 1);
prog->gpl_compatible = is_gpl ? 1 : 0;
+ if (attr->prog_ifindex) {
+ err = bpf_prog_offload_init(prog, attr);
+ if (err)
+ goto free_prog;
+ }
+
/* find program type: socket_filter vs tracing_filter */
err = find_prog_type(type, prog);
if (err < 0)
goto free_prog;
+ prog->aux->load_time = ktime_get_boot_ns();
+ err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name);
+ if (err)
+ goto free_prog;
+
/* run eBPF verifier */
err = bpf_check(&prog, attr);
if (err < 0)
@@ -1069,16 +1222,18 @@ free_used_maps:
free_used_maps(prog->aux);
free_prog:
bpf_prog_uncharge_memlock(prog);
+free_prog_sec:
+ security_bpf_prog_free(prog->aux);
free_prog_nouncharge:
bpf_prog_free(prog);
return err;
}
-#define BPF_OBJ_LAST_FIELD bpf_fd
+#define BPF_OBJ_LAST_FIELD file_flags
static int bpf_obj_pin(const union bpf_attr *attr)
{
- if (CHECK_ATTR(BPF_OBJ))
+ if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0)
return -EINVAL;
return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname));
@@ -1086,10 +1241,12 @@ static int bpf_obj_pin(const union bpf_attr *attr)
static int bpf_obj_get(const union bpf_attr *attr)
{
- if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0)
+ if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 ||
+ attr->file_flags & ~BPF_OBJ_FLAG_MASK)
return -EINVAL;
- return bpf_obj_get_user(u64_to_user_ptr(attr->pathname));
+ return bpf_obj_get_user(u64_to_user_ptr(attr->pathname),
+ attr->file_flags);
}
#ifdef CONFIG_CGROUP_BPF
@@ -1130,6 +1287,9 @@ static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach)
return 0;
}
+#define BPF_F_ATTACH_MASK \
+ (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)
+
static int bpf_prog_attach(const union bpf_attr *attr)
{
enum bpf_prog_type ptype;
@@ -1143,7 +1303,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
if (CHECK_ATTR(BPF_PROG_ATTACH))
return -EINVAL;
- if (attr->attach_flags & ~BPF_F_ALLOW_OVERRIDE)
+ if (attr->attach_flags & ~BPF_F_ATTACH_MASK)
return -EINVAL;
switch (attr->attach_type) {
@@ -1157,6 +1317,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
case BPF_CGROUP_SOCK_OPS:
ptype = BPF_PROG_TYPE_SOCK_OPS;
break;
+ case BPF_CGROUP_DEVICE:
+ ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
+ break;
case BPF_SK_SKB_STREAM_PARSER:
case BPF_SK_SKB_STREAM_VERDICT:
return sockmap_get_from_fd(attr, true);
@@ -1174,8 +1337,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
return PTR_ERR(cgrp);
}
- ret = cgroup_bpf_update(cgrp, prog, attr->attach_type,
- attr->attach_flags & BPF_F_ALLOW_OVERRIDE);
+ ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
+ attr->attach_flags);
if (ret)
bpf_prog_put(prog);
cgroup_put(cgrp);
@@ -1187,6 +1350,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
static int bpf_prog_detach(const union bpf_attr *attr)
{
+ enum bpf_prog_type ptype;
+ struct bpf_prog *prog;
struct cgroup *cgrp;
int ret;
@@ -1199,26 +1364,71 @@ static int bpf_prog_detach(const union bpf_attr *attr)
switch (attr->attach_type) {
case BPF_CGROUP_INET_INGRESS:
case BPF_CGROUP_INET_EGRESS:
+ ptype = BPF_PROG_TYPE_CGROUP_SKB;
+ break;
case BPF_CGROUP_INET_SOCK_CREATE:
+ ptype = BPF_PROG_TYPE_CGROUP_SOCK;
+ break;
case BPF_CGROUP_SOCK_OPS:
- cgrp = cgroup_get_from_fd(attr->target_fd);
- if (IS_ERR(cgrp))
- return PTR_ERR(cgrp);
-
- ret = cgroup_bpf_update(cgrp, NULL, attr->attach_type, false);
- cgroup_put(cgrp);
+ ptype = BPF_PROG_TYPE_SOCK_OPS;
+ break;
+ case BPF_CGROUP_DEVICE:
+ ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
break;
case BPF_SK_SKB_STREAM_PARSER:
case BPF_SK_SKB_STREAM_VERDICT:
- ret = sockmap_get_from_fd(attr, false);
- break;
+ return sockmap_get_from_fd(attr, false);
default:
return -EINVAL;
}
+ cgrp = cgroup_get_from_fd(attr->target_fd);
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+
+ prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
+ if (IS_ERR(prog))
+ prog = NULL;
+
+ ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0);
+ if (prog)
+ bpf_prog_put(prog);
+ cgroup_put(cgrp);
return ret;
}
+#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt
+
+static int bpf_prog_query(const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct cgroup *cgrp;
+ int ret;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ if (CHECK_ATTR(BPF_PROG_QUERY))
+ return -EINVAL;
+ if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE)
+ return -EINVAL;
+
+ switch (attr->query.attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ case BPF_CGROUP_INET_SOCK_CREATE:
+ case BPF_CGROUP_SOCK_OPS:
+ case BPF_CGROUP_DEVICE:
+ break;
+ default:
+ return -EINVAL;
+ }
+ cgrp = cgroup_get_from_fd(attr->query.target_fd);
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+ ret = cgroup_bpf_query(cgrp, attr, uattr);
+ cgroup_put(cgrp);
+ return ret;
+}
#endif /* CONFIG_CGROUP_BPF */
#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration
@@ -1303,20 +1513,26 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
return fd;
}
-#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD map_id
+#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags
static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
{
struct bpf_map *map;
u32 id = attr->map_id;
+ int f_flags;
int fd;
- if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID))
+ if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) ||
+ attr->open_flags & ~BPF_OBJ_FLAG_MASK)
return -EINVAL;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ f_flags = bpf_get_file_flag(attr->open_flags);
+ if (f_flags < 0)
+ return f_flags;
+
spin_lock_bh(&map_idr_lock);
map = idr_find(&map_idr, id);
if (map)
@@ -1328,7 +1544,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
- fd = bpf_map_new_fd(map);
+ fd = bpf_map_new_fd(map, f_flags);
if (fd < 0)
bpf_map_put(map);
@@ -1356,8 +1572,25 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
info.type = prog->type;
info.id = prog->aux->id;
+ info.load_time = prog->aux->load_time;
+ info.created_by_uid = from_kuid_munged(current_user_ns(),
+ prog->aux->user->uid);
memcpy(info.tag, prog->tag, sizeof(prog->tag));
+ memcpy(info.name, prog->aux->name, sizeof(prog->aux->name));
+
+ ulen = info.nr_map_ids;
+ info.nr_map_ids = prog->aux->used_map_cnt;
+ ulen = min_t(u32, info.nr_map_ids, ulen);
+ if (ulen) {
+ u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids);
+ u32 i;
+
+ for (i = 0; i < ulen; i++)
+ if (put_user(prog->aux->used_maps[i]->id,
+ &user_map_ids[i]))
+ return -EFAULT;
+ }
if (!capable(CAP_SYS_ADMIN)) {
info.jited_prog_len = 0;
@@ -1411,6 +1644,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
info.value_size = map->value_size;
info.max_entries = map->max_entries;
info.map_flags = map->map_flags;
+ memcpy(info.name, map->name, sizeof(map->name));
if (copy_to_user(uinfo, &info, info_len) ||
put_user(info_len, &uattr->info.info_len))
@@ -1465,6 +1699,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
if (copy_from_user(&attr, uattr, size) != 0)
return -EFAULT;
+ err = security_bpf(cmd, &attr, size);
+ if (err < 0)
+ return err;
+
switch (cmd) {
case BPF_MAP_CREATE:
err = map_create(&attr);
@@ -1497,6 +1735,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_PROG_DETACH:
err = bpf_prog_detach(&attr);
break;
+ case BPF_PROG_QUERY:
+ err = bpf_prog_query(&attr, uattr);
+ break;
#endif
case BPF_PROG_TEST_RUN:
err = bpf_prog_test_run(&attr, uattr);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 799b2451ef2d..d4593571c404 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -21,6 +21,17 @@
#include <linux/vmalloc.h>
#include <linux/stringify.h>
+#include "disasm.h"
+
+static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
+#define BPF_PROG_TYPE(_id, _name) \
+ [_id] = & _name ## _verifier_ops,
+#define BPF_MAP_TYPE(_id, _ops)
+#include <linux/bpf_types.h>
+#undef BPF_PROG_TYPE
+#undef BPF_MAP_TYPE
+};
+
/* bpf_check() is a static code analyzer that walks eBPF program
* instruction by instruction and updates register/stack state.
* All paths of conditional branches are analyzed until 'bpf_exit' insn.
@@ -153,28 +164,42 @@ struct bpf_call_arg_meta {
int access_size;
};
-/* verbose verifier prints what it's seeing
- * bpf_check() is called under lock, so no race to access these global vars
- */
-static u32 log_level, log_size, log_len;
-static char *log_buf;
-
static DEFINE_MUTEX(bpf_verifier_lock);
/* log_level controls verbosity level of eBPF verifier.
* verbose() is used to dump the verification trace to the log, so the user
* can figure out what's wrong with the program
*/
-static __printf(1, 2) void verbose(const char *fmt, ...)
+static __printf(2, 3) void verbose(struct bpf_verifier_env *env,
+ const char *fmt, ...)
{
+ struct bpf_verifer_log *log = &env->log;
+ unsigned int n;
va_list args;
- if (log_level == 0 || log_len >= log_size - 1)
+ if (!log->level || !log->ubuf || bpf_verifier_log_full(log))
return;
va_start(args, fmt);
- log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args);
+ n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
va_end(args);
+
+ WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1,
+ "verifier log line truncated - local buffer too short\n");
+
+ n = min(log->len_total - log->len_used - 1, n);
+ log->kbuf[n] = '\0';
+
+ if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1))
+ log->len_used += n;
+ else
+ log->ubuf = NULL;
+}
+
+static bool type_is_pkt_pointer(enum bpf_reg_type type)
+{
+ return type == PTR_TO_PACKET ||
+ type == PTR_TO_PACKET_META;
}
/* string representation of 'enum bpf_reg_type' */
@@ -187,26 +212,12 @@ static const char * const reg_type_str[] = {
[PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
[PTR_TO_STACK] = "fp",
[PTR_TO_PACKET] = "pkt",
+ [PTR_TO_PACKET_META] = "pkt_meta",
[PTR_TO_PACKET_END] = "pkt_end",
};
-#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x)
-static const char * const func_id_str[] = {
- __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN)
-};
-#undef __BPF_FUNC_STR_FN
-
-static const char *func_id_name(int id)
-{
- BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID);
-
- if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id])
- return func_id_str[id];
- else
- return "unknown";
-}
-
-static void print_verifier_state(struct bpf_verifier_state *state)
+static void print_verifier_state(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *state)
{
struct bpf_reg_state *reg;
enum bpf_reg_type t;
@@ -217,21 +228,21 @@ static void print_verifier_state(struct bpf_verifier_state *state)
t = reg->type;
if (t == NOT_INIT)
continue;
- verbose(" R%d=%s", i, reg_type_str[t]);
+ verbose(env, " R%d=%s", i, reg_type_str[t]);
if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
tnum_is_const(reg->var_off)) {
/* reg->off should be 0 for SCALAR_VALUE */
- verbose("%lld", reg->var_off.value + reg->off);
+ verbose(env, "%lld", reg->var_off.value + reg->off);
} else {
- verbose("(id=%d", reg->id);
+ verbose(env, "(id=%d", reg->id);
if (t != SCALAR_VALUE)
- verbose(",off=%d", reg->off);
- if (t == PTR_TO_PACKET)
- verbose(",r=%d", reg->range);
+ verbose(env, ",off=%d", reg->off);
+ if (type_is_pkt_pointer(t))
+ verbose(env, ",r=%d", reg->range);
else if (t == CONST_PTR_TO_MAP ||
t == PTR_TO_MAP_VALUE ||
t == PTR_TO_MAP_VALUE_OR_NULL)
- verbose(",ks=%d,vs=%d",
+ verbose(env, ",ks=%d,vs=%d",
reg->map_ptr->key_size,
reg->map_ptr->value_size);
if (tnum_is_const(reg->var_off)) {
@@ -239,243 +250,174 @@ static void print_verifier_state(struct bpf_verifier_state *state)
* could be a pointer whose offset is too big
* for reg->off
*/
- verbose(",imm=%llx", reg->var_off.value);
+ verbose(env, ",imm=%llx", reg->var_off.value);
} else {
if (reg->smin_value != reg->umin_value &&
reg->smin_value != S64_MIN)
- verbose(",smin_value=%lld",
+ verbose(env, ",smin_value=%lld",
(long long)reg->smin_value);
if (reg->smax_value != reg->umax_value &&
reg->smax_value != S64_MAX)
- verbose(",smax_value=%lld",
+ verbose(env, ",smax_value=%lld",
(long long)reg->smax_value);
if (reg->umin_value != 0)
- verbose(",umin_value=%llu",
+ verbose(env, ",umin_value=%llu",
(unsigned long long)reg->umin_value);
if (reg->umax_value != U64_MAX)
- verbose(",umax_value=%llu",
+ verbose(env, ",umax_value=%llu",
(unsigned long long)reg->umax_value);
if (!tnum_is_unknown(reg->var_off)) {
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(",var_off=%s", tn_buf);
+ verbose(env, ",var_off=%s", tn_buf);
}
}
- verbose(")");
+ verbose(env, ")");
}
}
- for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
- if (state->stack_slot_type[i] == STACK_SPILL)
- verbose(" fp%d=%s", -MAX_BPF_STACK + i,
- reg_type_str[state->spilled_regs[i / BPF_REG_SIZE].type]);
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].slot_type[0] == STACK_SPILL)
+ verbose(env, " fp%d=%s",
+ -MAX_BPF_STACK + i * BPF_REG_SIZE,
+ reg_type_str[state->stack[i].spilled_ptr.type]);
}
- verbose("\n");
+ verbose(env, "\n");
}
-static const char *const bpf_class_string[] = {
- [BPF_LD] = "ld",
- [BPF_LDX] = "ldx",
- [BPF_ST] = "st",
- [BPF_STX] = "stx",
- [BPF_ALU] = "alu",
- [BPF_JMP] = "jmp",
- [BPF_RET] = "BUG",
- [BPF_ALU64] = "alu64",
-};
-
-static const char *const bpf_alu_string[16] = {
- [BPF_ADD >> 4] = "+=",
- [BPF_SUB >> 4] = "-=",
- [BPF_MUL >> 4] = "*=",
- [BPF_DIV >> 4] = "/=",
- [BPF_OR >> 4] = "|=",
- [BPF_AND >> 4] = "&=",
- [BPF_LSH >> 4] = "<<=",
- [BPF_RSH >> 4] = ">>=",
- [BPF_NEG >> 4] = "neg",
- [BPF_MOD >> 4] = "%=",
- [BPF_XOR >> 4] = "^=",
- [BPF_MOV >> 4] = "=",
- [BPF_ARSH >> 4] = "s>>=",
- [BPF_END >> 4] = "endian",
-};
-
-static const char *const bpf_ldst_string[] = {
- [BPF_W >> 3] = "u32",
- [BPF_H >> 3] = "u16",
- [BPF_B >> 3] = "u8",
- [BPF_DW >> 3] = "u64",
-};
-
-static const char *const bpf_jmp_string[16] = {
- [BPF_JA >> 4] = "jmp",
- [BPF_JEQ >> 4] = "==",
- [BPF_JGT >> 4] = ">",
- [BPF_JLT >> 4] = "<",
- [BPF_JGE >> 4] = ">=",
- [BPF_JLE >> 4] = "<=",
- [BPF_JSET >> 4] = "&",
- [BPF_JNE >> 4] = "!=",
- [BPF_JSGT >> 4] = "s>",
- [BPF_JSLT >> 4] = "s<",
- [BPF_JSGE >> 4] = "s>=",
- [BPF_JSLE >> 4] = "s<=",
- [BPF_CALL >> 4] = "call",
- [BPF_EXIT >> 4] = "exit",
-};
+static int copy_stack_state(struct bpf_verifier_state *dst,
+ const struct bpf_verifier_state *src)
+{
+ if (!src->stack)
+ return 0;
+ if (WARN_ON_ONCE(dst->allocated_stack < src->allocated_stack)) {
+ /* internal bug, make state invalid to reject the program */
+ memset(dst, 0, sizeof(*dst));
+ return -EFAULT;
+ }
+ memcpy(dst->stack, src->stack,
+ sizeof(*src->stack) * (src->allocated_stack / BPF_REG_SIZE));
+ return 0;
+}
-static void print_bpf_insn(const struct bpf_verifier_env *env,
- const struct bpf_insn *insn)
+/* do_check() starts with zero-sized stack in struct bpf_verifier_state to
+ * make it consume minimal amount of memory. check_stack_write() access from
+ * the program calls into realloc_verifier_state() to grow the stack size.
+ * Note there is a non-zero 'parent' pointer inside bpf_verifier_state
+ * which this function copies over. It points to previous bpf_verifier_state
+ * which is never reallocated
+ */
+static int realloc_verifier_state(struct bpf_verifier_state *state, int size,
+ bool copy_old)
{
- u8 class = BPF_CLASS(insn->code);
-
- if (class == BPF_ALU || class == BPF_ALU64) {
- if (BPF_SRC(insn->code) == BPF_X)
- verbose("(%02x) %sr%d %s %sr%d\n",
- insn->code, class == BPF_ALU ? "(u32) " : "",
- insn->dst_reg,
- bpf_alu_string[BPF_OP(insn->code) >> 4],
- class == BPF_ALU ? "(u32) " : "",
- insn->src_reg);
- else
- verbose("(%02x) %sr%d %s %s%d\n",
- insn->code, class == BPF_ALU ? "(u32) " : "",
- insn->dst_reg,
- bpf_alu_string[BPF_OP(insn->code) >> 4],
- class == BPF_ALU ? "(u32) " : "",
- insn->imm);
- } else if (class == BPF_STX) {
- if (BPF_MODE(insn->code) == BPF_MEM)
- verbose("(%02x) *(%s *)(r%d %+d) = r%d\n",
- insn->code,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->dst_reg,
- insn->off, insn->src_reg);
- else if (BPF_MODE(insn->code) == BPF_XADD)
- verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n",
- insn->code,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->dst_reg, insn->off,
- insn->src_reg);
- else
- verbose("BUG_%02x\n", insn->code);
- } else if (class == BPF_ST) {
- if (BPF_MODE(insn->code) != BPF_MEM) {
- verbose("BUG_st_%02x\n", insn->code);
- return;
- }
- verbose("(%02x) *(%s *)(r%d %+d) = %d\n",
- insn->code,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->dst_reg,
- insn->off, insn->imm);
- } else if (class == BPF_LDX) {
- if (BPF_MODE(insn->code) != BPF_MEM) {
- verbose("BUG_ldx_%02x\n", insn->code);
- return;
+ u32 old_size = state->allocated_stack;
+ struct bpf_stack_state *new_stack;
+ int slot = size / BPF_REG_SIZE;
+
+ if (size <= old_size || !size) {
+ if (copy_old)
+ return 0;
+ state->allocated_stack = slot * BPF_REG_SIZE;
+ if (!size && old_size) {
+ kfree(state->stack);
+ state->stack = NULL;
}
- verbose("(%02x) r%d = *(%s *)(r%d %+d)\n",
- insn->code, insn->dst_reg,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->src_reg, insn->off);
- } else if (class == BPF_LD) {
- if (BPF_MODE(insn->code) == BPF_ABS) {
- verbose("(%02x) r0 = *(%s *)skb[%d]\n",
- insn->code,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->imm);
- } else if (BPF_MODE(insn->code) == BPF_IND) {
- verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n",
- insn->code,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->src_reg, insn->imm);
- } else if (BPF_MODE(insn->code) == BPF_IMM &&
- BPF_SIZE(insn->code) == BPF_DW) {
- /* At this point, we already made sure that the second
- * part of the ldimm64 insn is accessible.
- */
- u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
- bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD;
+ return 0;
+ }
+ new_stack = kmalloc_array(slot, sizeof(struct bpf_stack_state),
+ GFP_KERNEL);
+ if (!new_stack)
+ return -ENOMEM;
+ if (copy_old) {
+ if (state->stack)
+ memcpy(new_stack, state->stack,
+ sizeof(*new_stack) * (old_size / BPF_REG_SIZE));
+ memset(new_stack + old_size / BPF_REG_SIZE, 0,
+ sizeof(*new_stack) * (size - old_size) / BPF_REG_SIZE);
+ }
+ state->allocated_stack = slot * BPF_REG_SIZE;
+ kfree(state->stack);
+ state->stack = new_stack;
+ return 0;
+}
- if (map_ptr && !env->allow_ptr_leaks)
- imm = 0;
+static void free_verifier_state(struct bpf_verifier_state *state,
+ bool free_self)
+{
+ kfree(state->stack);
+ if (free_self)
+ kfree(state);
+}
- verbose("(%02x) r%d = 0x%llx\n", insn->code,
- insn->dst_reg, (unsigned long long)imm);
- } else {
- verbose("BUG_ld_%02x\n", insn->code);
- return;
- }
- } else if (class == BPF_JMP) {
- u8 opcode = BPF_OP(insn->code);
+/* copy verifier state from src to dst growing dst stack space
+ * when necessary to accommodate larger src stack
+ */
+static int copy_verifier_state(struct bpf_verifier_state *dst,
+ const struct bpf_verifier_state *src)
+{
+ int err;
- if (opcode == BPF_CALL) {
- verbose("(%02x) call %s#%d\n", insn->code,
- func_id_name(insn->imm), insn->imm);
- } else if (insn->code == (BPF_JMP | BPF_JA)) {
- verbose("(%02x) goto pc%+d\n",
- insn->code, insn->off);
- } else if (insn->code == (BPF_JMP | BPF_EXIT)) {
- verbose("(%02x) exit\n", insn->code);
- } else if (BPF_SRC(insn->code) == BPF_X) {
- verbose("(%02x) if r%d %s r%d goto pc%+d\n",
- insn->code, insn->dst_reg,
- bpf_jmp_string[BPF_OP(insn->code) >> 4],
- insn->src_reg, insn->off);
- } else {
- verbose("(%02x) if r%d %s 0x%x goto pc%+d\n",
- insn->code, insn->dst_reg,
- bpf_jmp_string[BPF_OP(insn->code) >> 4],
- insn->imm, insn->off);
- }
- } else {
- verbose("(%02x) %s\n", insn->code, bpf_class_string[class]);
- }
+ err = realloc_verifier_state(dst, src->allocated_stack, false);
+ if (err)
+ return err;
+ memcpy(dst, src, offsetof(struct bpf_verifier_state, allocated_stack));
+ return copy_stack_state(dst, src);
}
-static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx)
+static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
+ int *insn_idx)
{
- struct bpf_verifier_stack_elem *elem;
- int insn_idx;
+ struct bpf_verifier_state *cur = env->cur_state;
+ struct bpf_verifier_stack_elem *elem, *head = env->head;
+ int err;
if (env->head == NULL)
- return -1;
+ return -ENOENT;
- memcpy(&env->cur_state, &env->head->st, sizeof(env->cur_state));
- insn_idx = env->head->insn_idx;
+ if (cur) {
+ err = copy_verifier_state(cur, &head->st);
+ if (err)
+ return err;
+ }
+ if (insn_idx)
+ *insn_idx = head->insn_idx;
if (prev_insn_idx)
- *prev_insn_idx = env->head->prev_insn_idx;
- elem = env->head->next;
- kfree(env->head);
+ *prev_insn_idx = head->prev_insn_idx;
+ elem = head->next;
+ free_verifier_state(&head->st, false);
+ kfree(head);
env->head = elem;
env->stack_size--;
- return insn_idx;
+ return 0;
}
static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
int insn_idx, int prev_insn_idx)
{
+ struct bpf_verifier_state *cur = env->cur_state;
struct bpf_verifier_stack_elem *elem;
+ int err;
- elem = kmalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);
+ elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);
if (!elem)
goto err;
- memcpy(&elem->st, &env->cur_state, sizeof(env->cur_state));
elem->insn_idx = insn_idx;
elem->prev_insn_idx = prev_insn_idx;
elem->next = env->head;
env->head = elem;
env->stack_size++;
+ err = copy_verifier_state(&elem->st, cur);
+ if (err)
+ goto err;
if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) {
- verbose("BPF program is too complex\n");
+ verbose(env, "BPF program is too complex\n");
goto err;
}
return &elem->st;
err:
/* pop all elements and return */
- while (pop_stack(env, NULL) >= 0);
+ while (!pop_stack(env, NULL, NULL));
return NULL;
}
@@ -507,10 +449,11 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg)
__mark_reg_known(reg, 0);
}
-static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno)
+static void mark_reg_known_zero(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs, u32 regno)
{
if (WARN_ON(regno >= MAX_BPF_REG)) {
- verbose("mark_reg_known_zero(regs, %u)\n", regno);
+ verbose(env, "mark_reg_known_zero(regs, %u)\n", regno);
/* Something bad happened, let's kill all regs */
for (regno = 0; regno < MAX_BPF_REG; regno++)
__mark_reg_not_init(regs + regno);
@@ -519,6 +462,31 @@ static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno)
__mark_reg_known_zero(regs + regno);
}
+static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg)
+{
+ return type_is_pkt_pointer(reg->type);
+}
+
+static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg)
+{
+ return reg_is_pkt_pointer(reg) ||
+ reg->type == PTR_TO_PACKET_END;
+}
+
+/* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */
+static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg,
+ enum bpf_reg_type which)
+{
+ /* The register can already have a range from prior markings.
+ * This is fine as long as it hasn't been advanced from its
+ * origin.
+ */
+ return reg->type == which &&
+ reg->id == 0 &&
+ reg->off == 0 &&
+ tnum_equals_const(reg->var_off, 0);
+}
+
/* Attempts to improve min/max values based on var_off information */
static void __update_reg_bounds(struct bpf_reg_state *reg)
{
@@ -595,10 +563,11 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg)
__mark_reg_unbounded(reg);
}
-static void mark_reg_unknown(struct bpf_reg_state *regs, u32 regno)
+static void mark_reg_unknown(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs, u32 regno)
{
if (WARN_ON(regno >= MAX_BPF_REG)) {
- verbose("mark_reg_unknown(regs, %u)\n", regno);
+ verbose(env, "mark_reg_unknown(regs, %u)\n", regno);
/* Something bad happened, let's kill all regs */
for (regno = 0; regno < MAX_BPF_REG; regno++)
__mark_reg_not_init(regs + regno);
@@ -613,10 +582,11 @@ static void __mark_reg_not_init(struct bpf_reg_state *reg)
reg->type = NOT_INIT;
}
-static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno)
+static void mark_reg_not_init(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs, u32 regno)
{
if (WARN_ON(regno >= MAX_BPF_REG)) {
- verbose("mark_reg_not_init(regs, %u)\n", regno);
+ verbose(env, "mark_reg_not_init(regs, %u)\n", regno);
/* Something bad happened, let's kill all regs */
for (regno = 0; regno < MAX_BPF_REG; regno++)
__mark_reg_not_init(regs + regno);
@@ -625,22 +595,23 @@ static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno)
__mark_reg_not_init(regs + regno);
}
-static void init_reg_state(struct bpf_reg_state *regs)
+static void init_reg_state(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs)
{
int i;
for (i = 0; i < MAX_BPF_REG; i++) {
- mark_reg_not_init(regs, i);
+ mark_reg_not_init(env, regs, i);
regs[i].live = REG_LIVE_NONE;
}
/* frame pointer */
regs[BPF_REG_FP].type = PTR_TO_STACK;
- mark_reg_known_zero(regs, BPF_REG_FP);
+ mark_reg_known_zero(env, regs, BPF_REG_FP);
/* 1st arg to a function */
regs[BPF_REG_1].type = PTR_TO_CTX;
- mark_reg_known_zero(regs, BPF_REG_1);
+ mark_reg_known_zero(env, regs, BPF_REG_1);
}
enum reg_arg_type {
@@ -653,6 +624,10 @@ static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno)
{
struct bpf_verifier_state *parent = state->parent;
+ if (regno == BPF_REG_FP)
+ /* We don't need to worry about FP liveness because it's read-only */
+ return;
+
while (parent) {
/* if read wasn't screened by an earlier write ... */
if (state->regs[regno].live & REG_LIVE_WRITTEN)
@@ -667,29 +642,29 @@ static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno)
static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
enum reg_arg_type t)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = env->cur_state->regs;
if (regno >= MAX_BPF_REG) {
- verbose("R%d is invalid\n", regno);
+ verbose(env, "R%d is invalid\n", regno);
return -EINVAL;
}
if (t == SRC_OP) {
/* check whether register used as source operand can be read */
if (regs[regno].type == NOT_INIT) {
- verbose("R%d !read_ok\n", regno);
+ verbose(env, "R%d !read_ok\n", regno);
return -EACCES;
}
- mark_reg_read(&env->cur_state, regno);
+ mark_reg_read(env->cur_state, regno);
} else {
/* check whether register used as dest operand can be written to */
if (regno == BPF_REG_FP) {
- verbose("frame pointer is read only\n");
+ verbose(env, "frame pointer is read only\n");
return -EACCES;
}
regs[regno].live |= REG_LIVE_WRITTEN;
if (t == DST_OP)
- mark_reg_unknown(regs, regno);
+ mark_reg_unknown(env, regs, regno);
}
return 0;
}
@@ -702,6 +677,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case PTR_TO_STACK:
case PTR_TO_CTX:
case PTR_TO_PACKET:
+ case PTR_TO_PACKET_META:
case PTR_TO_PACKET_END:
case CONST_PTR_TO_MAP:
return true;
@@ -713,35 +689,48 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
/* check_stack_read/write functions track spill/fill of registers,
* stack boundary and alignment are checked in check_mem_access()
*/
-static int check_stack_write(struct bpf_verifier_state *state, int off,
+static int check_stack_write(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *state, int off,
int size, int value_regno)
{
- int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE;
+ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
+
+ err = realloc_verifier_state(state, round_up(slot + 1, BPF_REG_SIZE),
+ true);
+ if (err)
+ return err;
/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
* so it's aligned access and [off, off + size) are within stack limits
*/
+ if (!env->allow_ptr_leaks &&
+ state->stack[spi].slot_type[0] == STACK_SPILL &&
+ size != BPF_REG_SIZE) {
+ verbose(env, "attempt to corrupt spilled pointer on stack\n");
+ return -EACCES;
+ }
if (value_regno >= 0 &&
is_spillable_regtype(state->regs[value_regno].type)) {
/* register containing pointer is being spilled into stack */
if (size != BPF_REG_SIZE) {
- verbose("invalid size of register spill\n");
+ verbose(env, "invalid size of register spill\n");
return -EACCES;
}
/* save register state */
- state->spilled_regs[spi] = state->regs[value_regno];
- state->spilled_regs[spi].live |= REG_LIVE_WRITTEN;
+ state->stack[spi].spilled_ptr = state->regs[value_regno];
+ state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
for (i = 0; i < BPF_REG_SIZE; i++)
- state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL;
+ state->stack[spi].slot_type[i] = STACK_SPILL;
} else {
/* regular write of data into stack */
- state->spilled_regs[spi] = (struct bpf_reg_state) {};
+ state->stack[spi].spilled_ptr = (struct bpf_reg_state) {};
for (i = 0; i < size; i++)
- state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC;
+ state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] =
+ STACK_MISC;
}
return 0;
}
@@ -752,66 +741,72 @@ static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slo
while (parent) {
/* if read wasn't screened by an earlier write ... */
- if (state->spilled_regs[slot].live & REG_LIVE_WRITTEN)
+ if (state->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN)
break;
/* ... then we depend on parent's value */
- parent->spilled_regs[slot].live |= REG_LIVE_READ;
+ parent->stack[slot].spilled_ptr.live |= REG_LIVE_READ;
state = parent;
parent = state->parent;
}
}
-static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
+static int check_stack_read(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *state, int off, int size,
int value_regno)
{
- u8 *slot_type;
- int i, spi;
+ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE;
+ u8 *stype;
- slot_type = &state->stack_slot_type[MAX_BPF_STACK + off];
+ if (state->allocated_stack <= slot) {
+ verbose(env, "invalid read from stack off %d+0 size %d\n",
+ off, size);
+ return -EACCES;
+ }
+ stype = state->stack[spi].slot_type;
- if (slot_type[0] == STACK_SPILL) {
+ if (stype[0] == STACK_SPILL) {
if (size != BPF_REG_SIZE) {
- verbose("invalid size of register spill\n");
+ verbose(env, "invalid size of register spill\n");
return -EACCES;
}
for (i = 1; i < BPF_REG_SIZE; i++) {
- if (slot_type[i] != STACK_SPILL) {
- verbose("corrupted spill memory\n");
+ if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) {
+ verbose(env, "corrupted spill memory\n");
return -EACCES;
}
}
- spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE;
-
if (value_regno >= 0) {
/* restore register state from stack */
- state->regs[value_regno] = state->spilled_regs[spi];
+ state->regs[value_regno] = state->stack[spi].spilled_ptr;
mark_stack_slot_read(state, spi);
}
return 0;
} else {
for (i = 0; i < size; i++) {
- if (slot_type[i] != STACK_MISC) {
- verbose("invalid read from stack off %d+%d size %d\n",
+ if (stype[(slot - i) % BPF_REG_SIZE] != STACK_MISC) {
+ verbose(env, "invalid read from stack off %d+%d size %d\n",
off, i, size);
return -EACCES;
}
}
if (value_regno >= 0)
/* have read misc data from the stack */
- mark_reg_unknown(state->regs, value_regno);
+ mark_reg_unknown(env, state->regs, value_regno);
return 0;
}
}
/* check read/write into map element returned by bpf_map_lookup_elem() */
static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
- int size)
+ int size, bool zero_size_allowed)
{
- struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
+ struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_map *map = regs[regno].map_ptr;
- if (off < 0 || size <= 0 || off + size > map->value_size) {
- verbose("invalid access to map value, value_size=%d off=%d size=%d\n",
+ if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) ||
+ off + size > map->value_size) {
+ verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n",
map->value_size, off, size);
return -EACCES;
}
@@ -820,9 +815,9 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
/* check read/write into a map element with possible variable offset */
static int check_map_access(struct bpf_verifier_env *env, u32 regno,
- int off, int size)
+ int off, int size, bool zero_size_allowed)
{
- struct bpf_verifier_state *state = &env->cur_state;
+ struct bpf_verifier_state *state = env->cur_state;
struct bpf_reg_state *reg = &state->regs[regno];
int err;
@@ -830,8 +825,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
* need to try adding each of min_value and max_value to off
* to make sure our theoretical access will be safe.
*/
- if (log_level)
- print_verifier_state(state);
+ if (env->log.level)
+ print_verifier_state(env, state);
/* The minimum value is only important with signed
* comparisons where we can't assume the floor of a
* value is 0. If we are using signed variables for our
@@ -839,13 +834,15 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
* will have a set floor within our range.
*/
if (reg->smin_value < 0) {
- verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+ verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
regno);
return -EACCES;
}
- err = __check_map_access(env, regno, reg->smin_value + off, size);
+ err = __check_map_access(env, regno, reg->smin_value + off, size,
+ zero_size_allowed);
if (err) {
- verbose("R%d min value is outside of the array range\n", regno);
+ verbose(env, "R%d min value is outside of the array range\n",
+ regno);
return err;
}
@@ -854,13 +851,15 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
* If reg->umax_value + off could overflow, treat that as unbounded too.
*/
if (reg->umax_value >= BPF_MAX_VAR_OFF) {
- verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n",
+ verbose(env, "R%d unbounded memory access, make sure to bounds check any array access into a map\n",
regno);
return -EACCES;
}
- err = __check_map_access(env, regno, reg->umax_value + off, size);
+ err = __check_map_access(env, regno, reg->umax_value + off, size,
+ zero_size_allowed);
if (err)
- verbose("R%d max value is outside of the array range\n", regno);
+ verbose(env, "R%d max value is outside of the array range\n",
+ regno);
return err;
}
@@ -893,13 +892,14 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
}
static int __check_packet_access(struct bpf_verifier_env *env, u32 regno,
- int off, int size)
+ int off, int size, bool zero_size_allowed)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = cur_regs(env);
struct bpf_reg_state *reg = &regs[regno];
- if (off < 0 || size <= 0 || (u64)off + size > reg->range) {
- verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
+ if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) ||
+ (u64)off + size > reg->range) {
+ verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
off, size, regno, reg->id, reg->off, reg->range);
return -EACCES;
}
@@ -907,9 +907,9 @@ static int __check_packet_access(struct bpf_verifier_env *env, u32 regno,
}
static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
- int size)
+ int size, bool zero_size_allowed)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = cur_regs(env);
struct bpf_reg_state *reg = &regs[regno];
int err;
@@ -922,13 +922,13 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
* detail to prove they're safe.
*/
if (reg->smin_value < 0) {
- verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+ verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
regno);
return -EACCES;
}
- err = __check_packet_access(env, regno, off, size);
+ err = __check_packet_access(env, regno, off, size, zero_size_allowed);
if (err) {
- verbose("R%d offset is outside of the packet\n", regno);
+ verbose(env, "R%d offset is outside of the packet\n", regno);
return err;
}
return err;
@@ -942,12 +942,8 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
.reg_type = *reg_type,
};
- /* for analyzer ctx accesses are already validated and converted */
- if (env->analyzer_ops)
- return 0;
-
- if (env->prog->aux->ops->is_valid_access &&
- env->prog->aux->ops->is_valid_access(off, size, t, &info)) {
+ if (env->ops->is_valid_access &&
+ env->ops->is_valid_access(off, size, t, &info)) {
/* A non zero info.ctx_field_size indicates that this field is a
* candidate for later verifier transformation to load the whole
* field and then apply a mask when accessed with a narrower
@@ -955,16 +951,16 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
* will only allow for whole field access and rejects any other
* type of narrower access.
*/
- env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
*reg_type = info.reg_type;
+ env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
/* remember the offset of last byte accessed in ctx */
if (env->prog->aux->max_ctx_offset < off + size)
env->prog->aux->max_ctx_offset = off + size;
return 0;
}
- verbose("invalid bpf_context access off=%d size=%d\n", off, size);
+ verbose(env, "invalid bpf_context access off=%d size=%d\n", off, size);
return -EACCES;
}
@@ -979,10 +975,11 @@ static bool __is_pointer_value(bool allow_ptr_leaks,
static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
{
- return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]);
+ return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno);
}
-static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
+static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
int off, int size, bool strict)
{
struct tnum reg_off;
@@ -1007,7 +1004,8 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose("misaligned packet access off %d+%s+%d+%d size %d\n",
+ verbose(env,
+ "misaligned packet access off %d+%s+%d+%d size %d\n",
ip_align, tn_buf, reg->off, off, size);
return -EACCES;
}
@@ -1015,7 +1013,8 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
return 0;
}
-static int check_generic_ptr_alignment(const struct bpf_reg_state *reg,
+static int check_generic_ptr_alignment(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
const char *pointer_desc,
int off, int size, bool strict)
{
@@ -1030,7 +1029,7 @@ static int check_generic_ptr_alignment(const struct bpf_reg_state *reg,
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose("misaligned %saccess off %s+%d+%d size %d\n",
+ verbose(env, "misaligned %saccess off %s+%d+%d size %d\n",
pointer_desc, tn_buf, reg->off, off, size);
return -EACCES;
}
@@ -1047,8 +1046,11 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
switch (reg->type) {
case PTR_TO_PACKET:
- /* special case, because of NET_IP_ALIGN */
- return check_pkt_ptr_alignment(reg, off, size, strict);
+ case PTR_TO_PACKET_META:
+ /* Special case, because of NET_IP_ALIGN. Given metadata sits
+ * right in front, treat it the very same way.
+ */
+ return check_pkt_ptr_alignment(env, reg, off, size, strict);
case PTR_TO_MAP_VALUE:
pointer_desc = "value ";
break;
@@ -1061,7 +1063,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
default:
break;
}
- return check_generic_ptr_alignment(reg, pointer_desc, off, size, strict);
+ return check_generic_ptr_alignment(env, reg, pointer_desc, off, size,
+ strict);
}
/* check whether memory at (regno + off) is accessible for t = (read | write)
@@ -1074,8 +1077,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
int bpf_size, enum bpf_access_type t,
int value_regno)
{
- struct bpf_verifier_state *state = &env->cur_state;
- struct bpf_reg_state *reg = &state->regs[regno];
+ struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_reg_state *reg = regs + regno;
int size, err = 0;
size = bpf_size_to_bytes(bpf_size);
@@ -1093,48 +1097,55 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (reg->type == PTR_TO_MAP_VALUE) {
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
- verbose("R%d leaks addr into map\n", value_regno);
+ verbose(env, "R%d leaks addr into map\n", value_regno);
return -EACCES;
}
- err = check_map_access(env, regno, off, size);
+ err = check_map_access(env, regno, off, size, false);
if (!err && t == BPF_READ && value_regno >= 0)
- mark_reg_unknown(state->regs, value_regno);
+ mark_reg_unknown(env, regs, value_regno);
} else if (reg->type == PTR_TO_CTX) {
enum bpf_reg_type reg_type = SCALAR_VALUE;
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
- verbose("R%d leaks addr into ctx\n", value_regno);
+ verbose(env, "R%d leaks addr into ctx\n", value_regno);
return -EACCES;
}
/* ctx accesses must be at a fixed offset, so that we can
* determine what type of data were returned.
*/
- if (!tnum_is_const(reg->var_off)) {
+ if (reg->off) {
+ verbose(env,
+ "dereference of modified ctx ptr R%d off=%d+%d, ctx+const is allowed, ctx+const+const is not\n",
+ regno, reg->off, off - reg->off);
+ return -EACCES;
+ }
+ if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose("variable ctx access var_off=%s off=%d size=%d",
+ verbose(env,
+ "variable ctx access var_off=%s off=%d size=%d",
tn_buf, off, size);
return -EACCES;
}
- off += reg->var_off.value;
err = check_ctx_access(env, insn_idx, off, size, t, &reg_type);
if (!err && t == BPF_READ && value_regno >= 0) {
/* ctx access returns either a scalar, or a
- * PTR_TO_PACKET[_END]. In the latter case, we know
- * the offset is zero.
+ * PTR_TO_PACKET[_META,_END]. In the latter
+ * case, we know the offset is zero.
*/
if (reg_type == SCALAR_VALUE)
- mark_reg_unknown(state->regs, value_regno);
+ mark_reg_unknown(env, regs, value_regno);
else
- mark_reg_known_zero(state->regs, value_regno);
- state->regs[value_regno].id = 0;
- state->regs[value_regno].off = 0;
- state->regs[value_regno].range = 0;
- state->regs[value_regno].type = reg_type;
+ mark_reg_known_zero(env, regs,
+ value_regno);
+ regs[value_regno].id = 0;
+ regs[value_regno].off = 0;
+ regs[value_regno].range = 0;
+ regs[value_regno].type = reg_type;
}
} else if (reg->type == PTR_TO_STACK) {
@@ -1146,55 +1157,52 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose("variable stack access var_off=%s off=%d size=%d",
+ verbose(env, "variable stack access var_off=%s off=%d size=%d",
tn_buf, off, size);
return -EACCES;
}
off += reg->var_off.value;
if (off >= 0 || off < -MAX_BPF_STACK) {
- verbose("invalid stack off=%d size=%d\n", off, size);
+ verbose(env, "invalid stack off=%d size=%d\n", off,
+ size);
return -EACCES;
}
if (env->prog->aux->stack_depth < -off)
env->prog->aux->stack_depth = -off;
- if (t == BPF_WRITE) {
- if (!env->allow_ptr_leaks &&
- state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL &&
- size != BPF_REG_SIZE) {
- verbose("attempt to corrupt spilled pointer on stack\n");
- return -EACCES;
- }
- err = check_stack_write(state, off, size, value_regno);
- } else {
- err = check_stack_read(state, off, size, value_regno);
- }
- } else if (reg->type == PTR_TO_PACKET) {
+ if (t == BPF_WRITE)
+ err = check_stack_write(env, state, off, size,
+ value_regno);
+ else
+ err = check_stack_read(env, state, off, size,
+ value_regno);
+ } else if (reg_is_pkt_pointer(reg)) {
if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
- verbose("cannot write into packet\n");
+ verbose(env, "cannot write into packet\n");
return -EACCES;
}
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
- verbose("R%d leaks addr into packet\n", value_regno);
+ verbose(env, "R%d leaks addr into packet\n",
+ value_regno);
return -EACCES;
}
- err = check_packet_access(env, regno, off, size);
+ err = check_packet_access(env, regno, off, size, false);
if (!err && t == BPF_READ && value_regno >= 0)
- mark_reg_unknown(state->regs, value_regno);
+ mark_reg_unknown(env, regs, value_regno);
} else {
- verbose("R%d invalid mem access '%s'\n",
- regno, reg_type_str[reg->type]);
+ verbose(env, "R%d invalid mem access '%s'\n", regno,
+ reg_type_str[reg->type]);
return -EACCES;
}
if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ &&
- state->regs[value_regno].type == SCALAR_VALUE) {
+ regs[value_regno].type == SCALAR_VALUE) {
/* b/h/w load zero-extends, mark upper bits as known 0 */
- state->regs[value_regno].var_off = tnum_cast(
- state->regs[value_regno].var_off, size);
- __update_reg_bounds(&state->regs[value_regno]);
+ regs[value_regno].var_off =
+ tnum_cast(regs[value_regno].var_off, size);
+ __update_reg_bounds(&regs[value_regno]);
}
return err;
}
@@ -1205,7 +1213,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) ||
insn->imm != 0) {
- verbose("BPF_XADD uses reserved fields\n");
+ verbose(env, "BPF_XADD uses reserved fields\n");
return -EINVAL;
}
@@ -1220,7 +1228,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
return err;
if (is_pointer_value(env, insn->src_reg)) {
- verbose("R%d leaks addr into mem\n", insn->src_reg);
+ verbose(env, "R%d leaks addr into mem\n", insn->src_reg);
return -EACCES;
}
@@ -1251,9 +1259,9 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
int access_size, bool zero_size_allowed,
struct bpf_call_arg_meta *meta)
{
- struct bpf_verifier_state *state = &env->cur_state;
+ struct bpf_verifier_state *state = env->cur_state;
struct bpf_reg_state *regs = state->regs;
- int off, i;
+ int off, i, slot, spi;
if (regs[regno].type != PTR_TO_STACK) {
/* Allow zero-byte read from NULL, regardless of pointer type */
@@ -1261,7 +1269,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
register_is_null(regs[regno]))
return 0;
- verbose("R%d type=%s expected=%s\n", regno,
+ verbose(env, "R%d type=%s expected=%s\n", regno,
reg_type_str[regs[regno].type],
reg_type_str[PTR_TO_STACK]);
return -EACCES;
@@ -1272,13 +1280,13 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off);
- verbose("invalid variable stack read R%d var_off=%s\n",
+ verbose(env, "invalid variable stack read R%d var_off=%s\n",
regno, tn_buf);
}
off = regs[regno].off + regs[regno].var_off.value;
if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
- access_size <= 0) {
- verbose("invalid stack type R%d off=%d access_size=%d\n",
+ access_size < 0 || (access_size == 0 && !zero_size_allowed)) {
+ verbose(env, "invalid stack type R%d off=%d access_size=%d\n",
regno, off, access_size);
return -EACCES;
}
@@ -1293,8 +1301,12 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
}
for (i = 0; i < access_size; i++) {
- if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) {
- verbose("invalid indirect read from stack off %d+%d size %d\n",
+ slot = -(off + i) - 1;
+ spi = slot / BPF_REG_SIZE;
+ if (state->allocated_stack <= slot ||
+ state->stack[spi].slot_type[slot % BPF_REG_SIZE] !=
+ STACK_MISC) {
+ verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
off, i, access_size);
return -EACCES;
}
@@ -1306,13 +1318,16 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
int access_size, bool zero_size_allowed,
struct bpf_call_arg_meta *meta)
{
- struct bpf_reg_state *regs = env->cur_state.regs, *reg = &regs[regno];
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
switch (reg->type) {
case PTR_TO_PACKET:
- return check_packet_access(env, regno, reg->off, access_size);
+ case PTR_TO_PACKET_META:
+ return check_packet_access(env, regno, reg->off, access_size,
+ zero_size_allowed);
case PTR_TO_MAP_VALUE:
- return check_map_access(env, regno, reg->off, access_size);
+ return check_map_access(env, regno, reg->off, access_size,
+ zero_size_allowed);
default: /* scalar_value|ptr_to_stack or invalid ptr */
return check_stack_boundary(env, regno, access_size,
zero_size_allowed, meta);
@@ -1323,7 +1338,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
enum bpf_arg_type arg_type,
struct bpf_call_arg_meta *meta)
{
- struct bpf_reg_state *regs = env->cur_state.regs, *reg = &regs[regno];
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
enum bpf_reg_type expected_type, type = reg->type;
int err = 0;
@@ -1336,22 +1351,24 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
if (arg_type == ARG_ANYTHING) {
if (is_pointer_value(env, regno)) {
- verbose("R%d leaks addr into helper function\n", regno);
+ verbose(env, "R%d leaks addr into helper function\n",
+ regno);
return -EACCES;
}
return 0;
}
- if (type == PTR_TO_PACKET &&
+ if (type_is_pkt_pointer(type) &&
!may_access_direct_pkt_data(env, meta, BPF_READ)) {
- verbose("helper access to the packet is not allowed\n");
+ verbose(env, "helper access to the packet is not allowed\n");
return -EACCES;
}
if (arg_type == ARG_PTR_TO_MAP_KEY ||
arg_type == ARG_PTR_TO_MAP_VALUE) {
expected_type = PTR_TO_STACK;
- if (type != PTR_TO_PACKET && type != expected_type)
+ if (!type_is_pkt_pointer(type) &&
+ type != expected_type)
goto err_type;
} else if (arg_type == ARG_CONST_SIZE ||
arg_type == ARG_CONST_SIZE_OR_ZERO) {
@@ -1367,20 +1384,23 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
if (type != expected_type)
goto err_type;
} else if (arg_type == ARG_PTR_TO_MEM ||
+ arg_type == ARG_PTR_TO_MEM_OR_NULL ||
arg_type == ARG_PTR_TO_UNINIT_MEM) {
expected_type = PTR_TO_STACK;
/* One exception here. In case function allows for NULL to be
* passed in as argument, it's a SCALAR_VALUE type. Final test
* happens during stack boundary checking.
*/
- if (register_is_null(*reg))
+ if (register_is_null(*reg) &&
+ arg_type == ARG_PTR_TO_MEM_OR_NULL)
/* final test in check_stack_boundary() */;
- else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE &&
+ else if (!type_is_pkt_pointer(type) &&
+ type != PTR_TO_MAP_VALUE &&
type != expected_type)
goto err_type;
meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
} else {
- verbose("unsupported arg_type %d\n", arg_type);
+ verbose(env, "unsupported arg_type %d\n", arg_type);
return -EFAULT;
}
@@ -1398,12 +1418,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
* we have to check map_key here. Otherwise it means
* that kernel subsystem misconfigured verifier
*/
- verbose("invalid map_ptr to access map->key\n");
+ verbose(env, "invalid map_ptr to access map->key\n");
return -EACCES;
}
- if (type == PTR_TO_PACKET)
+ if (type_is_pkt_pointer(type))
err = check_packet_access(env, regno, reg->off,
- meta->map_ptr->key_size);
+ meta->map_ptr->key_size,
+ false);
else
err = check_stack_boundary(env, regno,
meta->map_ptr->key_size,
@@ -1414,12 +1435,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
*/
if (!meta->map_ptr) {
/* kernel subsystem misconfigured verifier */
- verbose("invalid map_ptr to access map->value\n");
+ verbose(env, "invalid map_ptr to access map->value\n");
return -EACCES;
}
- if (type == PTR_TO_PACKET)
+ if (type_is_pkt_pointer(type))
err = check_packet_access(env, regno, reg->off,
- meta->map_ptr->value_size);
+ meta->map_ptr->value_size,
+ false);
else
err = check_stack_boundary(env, regno,
meta->map_ptr->value_size,
@@ -1434,7 +1456,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
*/
if (regno == 0) {
/* kernel subsystem misconfigured verifier */
- verbose("ARG_CONST_SIZE cannot be first argument\n");
+ verbose(env,
+ "ARG_CONST_SIZE cannot be first argument\n");
return -EACCES;
}
@@ -1451,7 +1474,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
meta = NULL;
if (reg->smin_value < 0) {
- verbose("R%d min value is negative, either use unsigned or 'var &= const'\n",
+ verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n",
regno);
return -EACCES;
}
@@ -1465,7 +1488,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
}
if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
- verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
+ verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
regno);
return -EACCES;
}
@@ -1476,12 +1499,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
return err;
err_type:
- verbose("R%d type=%s expected=%s\n", regno,
+ verbose(env, "R%d type=%s expected=%s\n", regno,
reg_type_str[type], reg_type_str[expected_type]);
return -EACCES;
}
-static int check_map_func_compatibility(struct bpf_map *map, int func_id)
+static int check_map_func_compatibility(struct bpf_verifier_env *env,
+ struct bpf_map *map, int func_id)
{
if (!map)
return 0;
@@ -1494,7 +1518,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
break;
case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
if (func_id != BPF_FUNC_perf_event_read &&
- func_id != BPF_FUNC_perf_event_output)
+ func_id != BPF_FUNC_perf_event_output &&
+ func_id != BPF_FUNC_perf_event_read_value)
goto error;
break;
case BPF_MAP_TYPE_STACK_TRACE:
@@ -1514,6 +1539,11 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
if (func_id != BPF_FUNC_redirect_map)
goto error;
break;
+ /* Restrict bpf side of cpumap, open when use-cases appear */
+ case BPF_MAP_TYPE_CPUMAP:
+ if (func_id != BPF_FUNC_redirect_map)
+ goto error;
+ break;
case BPF_MAP_TYPE_ARRAY_OF_MAPS:
case BPF_MAP_TYPE_HASH_OF_MAPS:
if (func_id != BPF_FUNC_map_lookup_elem)
@@ -1537,6 +1567,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
break;
case BPF_FUNC_perf_event_read:
case BPF_FUNC_perf_event_output:
+ case BPF_FUNC_perf_event_read_value:
if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
goto error;
break;
@@ -1550,7 +1581,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
goto error;
break;
case BPF_FUNC_redirect_map:
- if (map->map_type != BPF_MAP_TYPE_DEVMAP)
+ if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
+ map->map_type != BPF_MAP_TYPE_CPUMAP)
goto error;
break;
case BPF_FUNC_sk_redirect_map:
@@ -1567,7 +1599,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
return 0;
error:
- verbose("cannot pass map_type %d into func %s#%d\n",
+ verbose(env, "cannot pass map_type %d into func %s#%d\n",
map->map_type, func_id_name(func_id), func_id);
return -EINVAL;
}
@@ -1590,57 +1622,55 @@ static int check_raw_mode(const struct bpf_func_proto *fn)
return count > 1 ? -EINVAL : 0;
}
-/* Packet data might have moved, any old PTR_TO_PACKET[_END] are now invalid,
- * so turn them into unknown SCALAR_VALUE.
+/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
+ * are now invalid, so turn them into unknown SCALAR_VALUE.
*/
static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
{
- struct bpf_verifier_state *state = &env->cur_state;
+ struct bpf_verifier_state *state = env->cur_state;
struct bpf_reg_state *regs = state->regs, *reg;
int i;
for (i = 0; i < MAX_BPF_REG; i++)
- if (regs[i].type == PTR_TO_PACKET ||
- regs[i].type == PTR_TO_PACKET_END)
- mark_reg_unknown(regs, i);
+ if (reg_is_pkt_pointer_any(&regs[i]))
+ mark_reg_unknown(env, regs, i);
- for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
- if (state->stack_slot_type[i] != STACK_SPILL)
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].slot_type[0] != STACK_SPILL)
continue;
- reg = &state->spilled_regs[i / BPF_REG_SIZE];
- if (reg->type != PTR_TO_PACKET &&
- reg->type != PTR_TO_PACKET_END)
- continue;
- __mark_reg_unknown(reg);
+ reg = &state->stack[i].spilled_ptr;
+ if (reg_is_pkt_pointer_any(reg))
+ __mark_reg_unknown(reg);
}
}
static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
{
- struct bpf_verifier_state *state = &env->cur_state;
const struct bpf_func_proto *fn = NULL;
- struct bpf_reg_state *regs = state->regs;
+ struct bpf_reg_state *regs;
struct bpf_call_arg_meta meta;
bool changes_data;
int i, err;
/* find function prototype */
if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
- verbose("invalid func %s#%d\n", func_id_name(func_id), func_id);
+ verbose(env, "invalid func %s#%d\n", func_id_name(func_id),
+ func_id);
return -EINVAL;
}
- if (env->prog->aux->ops->get_func_proto)
- fn = env->prog->aux->ops->get_func_proto(func_id);
+ if (env->ops->get_func_proto)
+ fn = env->ops->get_func_proto(func_id);
if (!fn) {
- verbose("unknown func %s#%d\n", func_id_name(func_id), func_id);
+ verbose(env, "unknown func %s#%d\n", func_id_name(func_id),
+ func_id);
return -EINVAL;
}
/* eBPF programs must be GPL compatible to use GPL-ed functions */
if (!env->prog->gpl_compatible && fn->gpl_only) {
- verbose("cannot call GPL only function from proprietary program\n");
+ verbose(env, "cannot call GPL only function from proprietary program\n");
return -EINVAL;
}
@@ -1654,7 +1684,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
*/
err = check_raw_mode(fn);
if (err) {
- verbose("kernel subsystem misconfigured func %s#%d\n",
+ verbose(env, "kernel subsystem misconfigured func %s#%d\n",
func_id_name(func_id), func_id);
return err;
}
@@ -1685,16 +1715,17 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
return err;
}
+ regs = cur_regs(env);
/* reset caller saved regs */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
- mark_reg_not_init(regs, caller_saved[i]);
+ mark_reg_not_init(env, regs, caller_saved[i]);
check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
}
/* update return register (already marked as written above) */
if (fn->ret_type == RET_INTEGER) {
/* sets type to SCALAR_VALUE */
- mark_reg_unknown(regs, BPF_REG_0);
+ mark_reg_unknown(env, regs, BPF_REG_0);
} else if (fn->ret_type == RET_VOID) {
regs[BPF_REG_0].type = NOT_INIT;
} else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
@@ -1702,14 +1733,15 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
/* There is no offset yet applied, variable or fixed */
- mark_reg_known_zero(regs, BPF_REG_0);
+ mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].off = 0;
/* remember map_ptr, so that check_map_access()
* can check 'value_size' boundary of memory access
* to map element returned from bpf_map_lookup_elem()
*/
if (meta.map_ptr == NULL) {
- verbose("kernel subsystem misconfigured verifier\n");
+ verbose(env,
+ "kernel subsystem misconfigured verifier\n");
return -EINVAL;
}
regs[BPF_REG_0].map_ptr = meta.map_ptr;
@@ -1720,12 +1752,12 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
else if (insn_aux->map_ptr != meta.map_ptr)
insn_aux->map_ptr = BPF_MAP_PTR_POISON;
} else {
- verbose("unknown return type %d of func %s#%d\n",
+ verbose(env, "unknown return type %d of func %s#%d\n",
fn->ret_type, func_id_name(func_id), func_id);
return -EINVAL;
}
- err = check_map_func_compatibility(meta.map_ptr, func_id);
+ err = check_map_func_compatibility(env, meta.map_ptr, func_id);
if (err)
return err;
@@ -1772,7 +1804,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
const struct bpf_reg_state *ptr_reg,
const struct bpf_reg_state *off_reg)
{
- struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg;
+ struct bpf_reg_state *regs = cur_regs(env), *dst_reg;
bool known = tnum_is_const(off_reg->var_off);
s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value,
smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value;
@@ -1784,39 +1816,42 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
dst_reg = &regs[dst];
if (WARN_ON_ONCE(known && (smin_val != smax_val))) {
- print_verifier_state(&env->cur_state);
- verbose("verifier internal error: known but bad sbounds\n");
+ print_verifier_state(env, env->cur_state);
+ verbose(env,
+ "verifier internal error: known but bad sbounds\n");
return -EINVAL;
}
if (WARN_ON_ONCE(known && (umin_val != umax_val))) {
- print_verifier_state(&env->cur_state);
- verbose("verifier internal error: known but bad ubounds\n");
+ print_verifier_state(env, env->cur_state);
+ verbose(env,
+ "verifier internal error: known but bad ubounds\n");
return -EINVAL;
}
if (BPF_CLASS(insn->code) != BPF_ALU64) {
/* 32-bit ALU ops on pointers produce (meaningless) scalars */
if (!env->allow_ptr_leaks)
- verbose("R%d 32-bit pointer arithmetic prohibited\n",
+ verbose(env,
+ "R%d 32-bit pointer arithmetic prohibited\n",
dst);
return -EACCES;
}
if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
if (!env->allow_ptr_leaks)
- verbose("R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
+ verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
dst);
return -EACCES;
}
if (ptr_reg->type == CONST_PTR_TO_MAP) {
if (!env->allow_ptr_leaks)
- verbose("R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
+ verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
dst);
return -EACCES;
}
if (ptr_reg->type == PTR_TO_PACKET_END) {
if (!env->allow_ptr_leaks)
- verbose("R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
+ verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
dst);
return -EACCES;
}
@@ -1871,7 +1906,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
}
dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off);
dst_reg->off = ptr_reg->off;
- if (ptr_reg->type == PTR_TO_PACKET) {
+ if (reg_is_pkt_pointer(ptr_reg)) {
dst_reg->id = ++env->id_gen;
/* something was added to pkt_ptr, set range to zero */
dst_reg->range = 0;
@@ -1881,7 +1916,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
if (dst_reg == off_reg) {
/* scalar -= pointer. Creates an unknown scalar */
if (!env->allow_ptr_leaks)
- verbose("R%d tried to subtract pointer from scalar\n",
+ verbose(env, "R%d tried to subtract pointer from scalar\n",
dst);
return -EACCES;
}
@@ -1891,7 +1926,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
*/
if (ptr_reg->type == PTR_TO_STACK) {
if (!env->allow_ptr_leaks)
- verbose("R%d subtraction from stack pointer prohibited\n",
+ verbose(env, "R%d subtraction from stack pointer prohibited\n",
dst);
return -EACCES;
}
@@ -1931,7 +1966,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
}
dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off);
dst_reg->off = ptr_reg->off;
- if (ptr_reg->type == PTR_TO_PACKET) {
+ if (reg_is_pkt_pointer(ptr_reg)) {
dst_reg->id = ++env->id_gen;
/* something was added to pkt_ptr, set range to zero */
if (smin_val < 0)
@@ -1946,13 +1981,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
* ptr &= ~3 which would reduce min_value by 3.)
*/
if (!env->allow_ptr_leaks)
- verbose("R%d bitwise operator %s on pointer prohibited\n",
+ verbose(env, "R%d bitwise operator %s on pointer prohibited\n",
dst, bpf_alu_string[opcode >> 4]);
return -EACCES;
default:
/* other operators (e.g. MUL,LSH) produce non-pointer results */
if (!env->allow_ptr_leaks)
- verbose("R%d pointer arithmetic with %s operator prohibited\n",
+ verbose(env, "R%d pointer arithmetic with %s operator prohibited\n",
dst, bpf_alu_string[opcode >> 4]);
return -EACCES;
}
@@ -1968,7 +2003,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
struct bpf_reg_state *dst_reg,
struct bpf_reg_state src_reg)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = cur_regs(env);
u8 opcode = BPF_OP(insn->code);
bool src_known, dst_known;
s64 smin_val, smax_val;
@@ -2118,7 +2153,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
/* Shifts greater than 63 are undefined. This includes
* shifts by a negative number.
*/
- mark_reg_unknown(regs, insn->dst_reg);
+ mark_reg_unknown(env, regs, insn->dst_reg);
break;
}
/* We lose all sign bit information (except what we can pick
@@ -2146,7 +2181,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
/* Shifts greater than 63 are undefined. This includes
* shifts by a negative number.
*/
- mark_reg_unknown(regs, insn->dst_reg);
+ mark_reg_unknown(env, regs, insn->dst_reg);
break;
}
/* BPF_RSH is an unsigned shift, so make the appropriate casts */
@@ -2174,7 +2209,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
__update_reg_bounds(dst_reg);
break;
default:
- mark_reg_unknown(regs, insn->dst_reg);
+ mark_reg_unknown(env, regs, insn->dst_reg);
break;
}
@@ -2189,7 +2224,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
struct bpf_insn *insn)
{
- struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg, *src_reg;
+ struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg;
struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
u8 opcode = BPF_OP(insn->code);
int rc;
@@ -2206,12 +2241,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
* an arbitrary scalar.
*/
if (!env->allow_ptr_leaks) {
- verbose("R%d pointer %s pointer prohibited\n",
+ verbose(env, "R%d pointer %s pointer prohibited\n",
insn->dst_reg,
bpf_alu_string[opcode >> 4]);
return -EACCES;
}
- mark_reg_unknown(regs, insn->dst_reg);
+ mark_reg_unknown(env, regs, insn->dst_reg);
return 0;
} else {
/* scalar += pointer
@@ -2263,13 +2298,13 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
/* Got here implies adding two SCALAR_VALUEs */
if (WARN_ON_ONCE(ptr_reg)) {
- print_verifier_state(&env->cur_state);
- verbose("verifier internal error: unexpected ptr_reg\n");
+ print_verifier_state(env, env->cur_state);
+ verbose(env, "verifier internal error: unexpected ptr_reg\n");
return -EINVAL;
}
if (WARN_ON(!src_reg)) {
- print_verifier_state(&env->cur_state);
- verbose("verifier internal error: no src_reg\n");
+ print_verifier_state(env, env->cur_state);
+ verbose(env, "verifier internal error: no src_reg\n");
return -EINVAL;
}
return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg);
@@ -2278,7 +2313,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
/* check validity of 32-bit and 64-bit arithmetic operations */
static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = cur_regs(env);
u8 opcode = BPF_OP(insn->code);
int err;
@@ -2287,14 +2322,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (BPF_SRC(insn->code) != 0 ||
insn->src_reg != BPF_REG_0 ||
insn->off != 0 || insn->imm != 0) {
- verbose("BPF_NEG uses reserved fields\n");
+ verbose(env, "BPF_NEG uses reserved fields\n");
return -EINVAL;
}
} else {
if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
(insn->imm != 16 && insn->imm != 32 && insn->imm != 64) ||
BPF_CLASS(insn->code) == BPF_ALU64) {
- verbose("BPF_END uses reserved fields\n");
+ verbose(env, "BPF_END uses reserved fields\n");
return -EINVAL;
}
}
@@ -2305,7 +2340,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
if (is_pointer_value(env, insn->dst_reg)) {
- verbose("R%d pointer arithmetic prohibited\n",
+ verbose(env, "R%d pointer arithmetic prohibited\n",
insn->dst_reg);
return -EACCES;
}
@@ -2319,7 +2354,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (BPF_SRC(insn->code) == BPF_X) {
if (insn->imm != 0 || insn->off != 0) {
- verbose("BPF_MOV uses reserved fields\n");
+ verbose(env, "BPF_MOV uses reserved fields\n");
return -EINVAL;
}
@@ -2329,7 +2364,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
} else {
if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
- verbose("BPF_MOV uses reserved fields\n");
+ verbose(env, "BPF_MOV uses reserved fields\n");
return -EINVAL;
}
}
@@ -2345,14 +2380,16 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
* copy register state to dest reg
*/
regs[insn->dst_reg] = regs[insn->src_reg];
+ regs[insn->dst_reg].live |= REG_LIVE_WRITTEN;
} else {
/* R1 = (u32) R2 */
if (is_pointer_value(env, insn->src_reg)) {
- verbose("R%d partial copy of pointer\n",
+ verbose(env,
+ "R%d partial copy of pointer\n",
insn->src_reg);
return -EACCES;
}
- mark_reg_unknown(regs, insn->dst_reg);
+ mark_reg_unknown(env, regs, insn->dst_reg);
/* high 32 bits are known zero. */
regs[insn->dst_reg].var_off = tnum_cast(
regs[insn->dst_reg].var_off, 4);
@@ -2367,14 +2404,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
} else if (opcode > BPF_END) {
- verbose("invalid BPF_ALU opcode %x\n", opcode);
+ verbose(env, "invalid BPF_ALU opcode %x\n", opcode);
return -EINVAL;
} else { /* all other ALU ops: and, sub, xor, add, ... */
if (BPF_SRC(insn->code) == BPF_X) {
if (insn->imm != 0 || insn->off != 0) {
- verbose("BPF_ALU uses reserved fields\n");
+ verbose(env, "BPF_ALU uses reserved fields\n");
return -EINVAL;
}
/* check src1 operand */
@@ -2383,7 +2420,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
} else {
if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
- verbose("BPF_ALU uses reserved fields\n");
+ verbose(env, "BPF_ALU uses reserved fields\n");
return -EINVAL;
}
}
@@ -2395,7 +2432,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if ((opcode == BPF_MOD || opcode == BPF_DIV) &&
BPF_SRC(insn->code) == BPF_K && insn->imm == 0) {
- verbose("div by zero\n");
+ verbose(env, "div by zero\n");
return -EINVAL;
}
@@ -2404,7 +2441,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
if (insn->imm < 0 || insn->imm >= size) {
- verbose("invalid shift %d\n", insn->imm);
+ verbose(env, "invalid shift %d\n", insn->imm);
return -EINVAL;
}
}
@@ -2421,12 +2458,16 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
static void find_good_pkt_pointers(struct bpf_verifier_state *state,
- struct bpf_reg_state *dst_reg)
+ struct bpf_reg_state *dst_reg,
+ enum bpf_reg_type type,
+ bool range_right_open)
{
struct bpf_reg_state *regs = state->regs, *reg;
+ u16 new_range;
int i;
- if (dst_reg->off < 0)
+ if (dst_reg->off < 0 ||
+ (dst_reg->off == 0 && range_right_open))
/* This doesn't give us any range */
return;
@@ -2437,9 +2478,13 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
*/
return;
- /* LLVM can generate four kind of checks:
+ new_range = dst_reg->off;
+ if (range_right_open)
+ new_range--;
+
+ /* Examples for register markings:
*
- * Type 1/2:
+ * pkt_data in dst register:
*
* r2 = r3;
* r2 += 8;
@@ -2456,7 +2501,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
* r2=pkt(id=n,off=8,r=0)
* r3=pkt(id=n,off=0,r=0)
*
- * Type 3/4:
+ * pkt_data in src register:
*
* r2 = r3;
* r2 += 8;
@@ -2474,7 +2519,9 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
* r3=pkt(id=n,off=0,r=0)
*
* Find register r3 and mark its range as r3=pkt(id=n,off=0,r=8)
- * so that range of bytes [r3, r3 + 8) is safe to access.
+ * or r3=pkt(id=n,off=0,r=8-1), so that range of bytes [r3, r3 + 8)
+ * and [r3, r3 + 8-1) respectively is safe to access depending on
+ * the check.
*/
/* If our ids match, then we must have the same max_value. And we
@@ -2483,16 +2530,16 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
* dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
*/
for (i = 0; i < MAX_BPF_REG; i++)
- if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id)
+ if (regs[i].type == type && regs[i].id == dst_reg->id)
/* keep the maximum range already checked */
- regs[i].range = max_t(u16, regs[i].range, dst_reg->off);
+ regs[i].range = max(regs[i].range, new_range);
- for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
- if (state->stack_slot_type[i] != STACK_SPILL)
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].slot_type[0] != STACK_SPILL)
continue;
- reg = &state->spilled_regs[i / BPF_REG_SIZE];
- if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id)
- reg->range = max_t(u16, reg->range, dst_reg->off);
+ reg = &state->stack[i].spilled_ptr;
+ if (reg->type == type && reg->id == dst_reg->id)
+ reg->range = max(reg->range, new_range);
}
}
@@ -2740,29 +2787,122 @@ static void mark_map_regs(struct bpf_verifier_state *state, u32 regno,
for (i = 0; i < MAX_BPF_REG; i++)
mark_map_reg(regs, i, id, is_null);
- for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
- if (state->stack_slot_type[i] != STACK_SPILL)
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].slot_type[0] != STACK_SPILL)
continue;
- mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, id, is_null);
+ mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null);
+ }
+}
+
+static bool try_match_pkt_pointers(const struct bpf_insn *insn,
+ struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg,
+ struct bpf_verifier_state *this_branch,
+ struct bpf_verifier_state *other_branch)
+{
+ if (BPF_SRC(insn->code) != BPF_X)
+ return false;
+
+ switch (BPF_OP(insn->code)) {
+ case BPF_JGT:
+ if ((dst_reg->type == PTR_TO_PACKET &&
+ src_reg->type == PTR_TO_PACKET_END) ||
+ (dst_reg->type == PTR_TO_PACKET_META &&
+ reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
+ /* pkt_data' > pkt_end, pkt_meta' > pkt_data */
+ find_good_pkt_pointers(this_branch, dst_reg,
+ dst_reg->type, false);
+ } else if ((dst_reg->type == PTR_TO_PACKET_END &&
+ src_reg->type == PTR_TO_PACKET) ||
+ (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+ src_reg->type == PTR_TO_PACKET_META)) {
+ /* pkt_end > pkt_data', pkt_data > pkt_meta' */
+ find_good_pkt_pointers(other_branch, src_reg,
+ src_reg->type, true);
+ } else {
+ return false;
+ }
+ break;
+ case BPF_JLT:
+ if ((dst_reg->type == PTR_TO_PACKET &&
+ src_reg->type == PTR_TO_PACKET_END) ||
+ (dst_reg->type == PTR_TO_PACKET_META &&
+ reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
+ /* pkt_data' < pkt_end, pkt_meta' < pkt_data */
+ find_good_pkt_pointers(other_branch, dst_reg,
+ dst_reg->type, true);
+ } else if ((dst_reg->type == PTR_TO_PACKET_END &&
+ src_reg->type == PTR_TO_PACKET) ||
+ (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+ src_reg->type == PTR_TO_PACKET_META)) {
+ /* pkt_end < pkt_data', pkt_data > pkt_meta' */
+ find_good_pkt_pointers(this_branch, src_reg,
+ src_reg->type, false);
+ } else {
+ return false;
+ }
+ break;
+ case BPF_JGE:
+ if ((dst_reg->type == PTR_TO_PACKET &&
+ src_reg->type == PTR_TO_PACKET_END) ||
+ (dst_reg->type == PTR_TO_PACKET_META &&
+ reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
+ /* pkt_data' >= pkt_end, pkt_meta' >= pkt_data */
+ find_good_pkt_pointers(this_branch, dst_reg,
+ dst_reg->type, true);
+ } else if ((dst_reg->type == PTR_TO_PACKET_END &&
+ src_reg->type == PTR_TO_PACKET) ||
+ (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+ src_reg->type == PTR_TO_PACKET_META)) {
+ /* pkt_end >= pkt_data', pkt_data >= pkt_meta' */
+ find_good_pkt_pointers(other_branch, src_reg,
+ src_reg->type, false);
+ } else {
+ return false;
+ }
+ break;
+ case BPF_JLE:
+ if ((dst_reg->type == PTR_TO_PACKET &&
+ src_reg->type == PTR_TO_PACKET_END) ||
+ (dst_reg->type == PTR_TO_PACKET_META &&
+ reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
+ /* pkt_data' <= pkt_end, pkt_meta' <= pkt_data */
+ find_good_pkt_pointers(other_branch, dst_reg,
+ dst_reg->type, false);
+ } else if ((dst_reg->type == PTR_TO_PACKET_END &&
+ src_reg->type == PTR_TO_PACKET) ||
+ (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+ src_reg->type == PTR_TO_PACKET_META)) {
+ /* pkt_end <= pkt_data', pkt_data <= pkt_meta' */
+ find_good_pkt_pointers(this_branch, src_reg,
+ src_reg->type, true);
+ } else {
+ return false;
+ }
+ break;
+ default:
+ return false;
}
+
+ return true;
}
static int check_cond_jmp_op(struct bpf_verifier_env *env,
struct bpf_insn *insn, int *insn_idx)
{
- struct bpf_verifier_state *other_branch, *this_branch = &env->cur_state;
+ struct bpf_verifier_state *other_branch, *this_branch = env->cur_state;
struct bpf_reg_state *regs = this_branch->regs, *dst_reg;
u8 opcode = BPF_OP(insn->code);
int err;
if (opcode > BPF_JSLE) {
- verbose("invalid BPF_JMP opcode %x\n", opcode);
+ verbose(env, "invalid BPF_JMP opcode %x\n", opcode);
return -EINVAL;
}
if (BPF_SRC(insn->code) == BPF_X) {
if (insn->imm != 0) {
- verbose("BPF_JMP uses reserved fields\n");
+ verbose(env, "BPF_JMP uses reserved fields\n");
return -EINVAL;
}
@@ -2772,13 +2912,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
return err;
if (is_pointer_value(env, insn->src_reg)) {
- verbose("R%d pointer comparison prohibited\n",
+ verbose(env, "R%d pointer comparison prohibited\n",
insn->src_reg);
return -EACCES;
}
} else {
if (insn->src_reg != BPF_REG_0) {
- verbose("BPF_JMP uses reserved fields\n");
+ verbose(env, "BPF_JMP uses reserved fields\n");
return -EINVAL;
}
}
@@ -2853,28 +2993,15 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
*/
mark_map_regs(this_branch, insn->dst_reg, opcode == BPF_JNE);
mark_map_regs(other_branch, insn->dst_reg, opcode == BPF_JEQ);
- } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
- dst_reg->type == PTR_TO_PACKET &&
- regs[insn->src_reg].type == PTR_TO_PACKET_END) {
- find_good_pkt_pointers(this_branch, dst_reg);
- } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
- dst_reg->type == PTR_TO_PACKET &&
- regs[insn->src_reg].type == PTR_TO_PACKET_END) {
- find_good_pkt_pointers(other_branch, dst_reg);
- } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
- dst_reg->type == PTR_TO_PACKET_END &&
- regs[insn->src_reg].type == PTR_TO_PACKET) {
- find_good_pkt_pointers(other_branch, &regs[insn->src_reg]);
- } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
- dst_reg->type == PTR_TO_PACKET_END &&
- regs[insn->src_reg].type == PTR_TO_PACKET) {
- find_good_pkt_pointers(this_branch, &regs[insn->src_reg]);
- } else if (is_pointer_value(env, insn->dst_reg)) {
- verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
+ } else if (!try_match_pkt_pointers(insn, dst_reg, &regs[insn->src_reg],
+ this_branch, other_branch) &&
+ is_pointer_value(env, insn->dst_reg)) {
+ verbose(env, "R%d pointer comparison prohibited\n",
+ insn->dst_reg);
return -EACCES;
}
- if (log_level)
- print_verifier_state(this_branch);
+ if (env->log.level)
+ print_verifier_state(env, this_branch);
return 0;
}
@@ -2889,15 +3016,15 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
/* verify BPF_LD_IMM64 instruction */
static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = cur_regs(env);
int err;
if (BPF_SIZE(insn->code) != BPF_DW) {
- verbose("invalid BPF_LD_IMM insn\n");
+ verbose(env, "invalid BPF_LD_IMM insn\n");
return -EINVAL;
}
if (insn->off != 0) {
- verbose("BPF_LD_IMM64 uses reserved fields\n");
+ verbose(env, "BPF_LD_IMM64 uses reserved fields\n");
return -EINVAL;
}
@@ -2950,19 +3077,19 @@ static bool may_access_skb(enum bpf_prog_type type)
*/
static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = cur_regs(env);
u8 mode = BPF_MODE(insn->code);
int i, err;
if (!may_access_skb(env->prog->type)) {
- verbose("BPF_LD_[ABS|IND] instructions not allowed for this program type\n");
+ verbose(env, "BPF_LD_[ABS|IND] instructions not allowed for this program type\n");
return -EINVAL;
}
if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
BPF_SIZE(insn->code) == BPF_DW ||
(mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
- verbose("BPF_LD_[ABS|IND] uses reserved fields\n");
+ verbose(env, "BPF_LD_[ABS|IND] uses reserved fields\n");
return -EINVAL;
}
@@ -2972,7 +3099,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
if (regs[BPF_REG_6].type != PTR_TO_CTX) {
- verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
+ verbose(env,
+ "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
return -EINVAL;
}
@@ -2985,7 +3113,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
/* reset caller saved regs to unreadable */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
- mark_reg_not_init(regs, caller_saved[i]);
+ mark_reg_not_init(env, regs, caller_saved[i]);
check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
}
@@ -2993,7 +3121,45 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
* the value fetched from the packet.
* Already marked as written above.
*/
- mark_reg_unknown(regs, BPF_REG_0);
+ mark_reg_unknown(env, regs, BPF_REG_0);
+ return 0;
+}
+
+static int check_return_code(struct bpf_verifier_env *env)
+{
+ struct bpf_reg_state *reg;
+ struct tnum range = tnum_range(0, 1);
+
+ switch (env->prog->type) {
+ case BPF_PROG_TYPE_CGROUP_SKB:
+ case BPF_PROG_TYPE_CGROUP_SOCK:
+ case BPF_PROG_TYPE_SOCK_OPS:
+ case BPF_PROG_TYPE_CGROUP_DEVICE:
+ break;
+ default:
+ return 0;
+ }
+
+ reg = cur_regs(env) + BPF_REG_0;
+ if (reg->type != SCALAR_VALUE) {
+ verbose(env, "At program exit the register R0 is not a known value (%s)\n",
+ reg_type_str[reg->type]);
+ return -EINVAL;
+ }
+
+ if (!tnum_in(range, reg->var_off)) {
+ verbose(env, "At program exit the register R0 ");
+ if (!tnum_is_unknown(reg->var_off)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env, "has value %s", tn_buf);
+ } else {
+ verbose(env, "has unknown scalar value");
+ }
+ verbose(env, " should have been 0 or 1\n");
+ return -EINVAL;
+ }
return 0;
}
@@ -3057,7 +3223,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
return 0;
if (w < 0 || w >= env->prog->len) {
- verbose("jump out of range from insn %d to %d\n", t, w);
+ verbose(env, "jump out of range from insn %d to %d\n", t, w);
return -EINVAL;
}
@@ -3074,13 +3240,13 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
insn_stack[cur_stack++] = w;
return 1;
} else if ((insn_state[w] & 0xF0) == DISCOVERED) {
- verbose("back-edge from insn %d to %d\n", t, w);
+ verbose(env, "back-edge from insn %d to %d\n", t, w);
return -EINVAL;
} else if (insn_state[w] == EXPLORED) {
/* forward- or cross-edge */
insn_state[t] = DISCOVERED | e;
} else {
- verbose("insn state internal bug\n");
+ verbose(env, "insn state internal bug\n");
return -EFAULT;
}
return 0;
@@ -3174,7 +3340,7 @@ peek_stack:
mark_explored:
insn_state[t] = EXPLORED;
if (cur_stack-- <= 0) {
- verbose("pop stack internal bug\n");
+ verbose(env, "pop stack internal bug\n");
ret = -EFAULT;
goto err_free;
}
@@ -3183,7 +3349,7 @@ mark_explored:
check_state:
for (i = 0; i < insn_cnt; i++) {
if (insn_state[i] != EXPLORED) {
- verbose("unreachable insn %d\n", i);
+ verbose(env, "unreachable insn %d\n", i);
ret = -EINVAL;
goto err_free;
}
@@ -3298,8 +3464,9 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
return false;
/* Check our ids match any regs they're supposed to */
return check_ids(rold->id, rcur->id, idmap);
+ case PTR_TO_PACKET_META:
case PTR_TO_PACKET:
- if (rcur->type != PTR_TO_PACKET)
+ if (rcur->type != rold->type)
return false;
/* We must have at least as much range as the old ptr
* did, so that any accesses which were safe before are
@@ -3337,6 +3504,57 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
return false;
}
+static bool stacksafe(struct bpf_verifier_state *old,
+ struct bpf_verifier_state *cur,
+ struct idpair *idmap)
+{
+ int i, spi;
+
+ /* if explored stack has more populated slots than current stack
+ * such stacks are not equivalent
+ */
+ if (old->allocated_stack > cur->allocated_stack)
+ return false;
+
+ /* walk slots of the explored stack and ignore any additional
+ * slots in the current stack, since explored(safe) state
+ * didn't use them
+ */
+ for (i = 0; i < old->allocated_stack; i++) {
+ spi = i / BPF_REG_SIZE;
+
+ if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID)
+ continue;
+ if (old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
+ cur->stack[spi].slot_type[i % BPF_REG_SIZE])
+ /* Ex: old explored (safe) state has STACK_SPILL in
+ * this stack slot, but current has has STACK_MISC ->
+ * this verifier states are not equivalent,
+ * return false to continue verification of this path
+ */
+ return false;
+ if (i % BPF_REG_SIZE)
+ continue;
+ if (old->stack[spi].slot_type[0] != STACK_SPILL)
+ continue;
+ if (!regsafe(&old->stack[spi].spilled_ptr,
+ &cur->stack[spi].spilled_ptr,
+ idmap))
+ /* when explored and current stack slot are both storing
+ * spilled registers, check that stored pointers types
+ * are the same as well.
+ * Ex: explored safe path could have stored
+ * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8}
+ * but current path has stored:
+ * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16}
+ * such verifier states are not equivalent.
+ * return false to continue verification of this path
+ */
+ return false;
+ }
+ return true;
+}
+
/* compare two verifier states
*
* all states stored in state_list are known to be valid, since
@@ -3381,37 +3599,8 @@ static bool states_equal(struct bpf_verifier_env *env,
goto out_free;
}
- for (i = 0; i < MAX_BPF_STACK; i++) {
- if (old->stack_slot_type[i] == STACK_INVALID)
- continue;
- if (old->stack_slot_type[i] != cur->stack_slot_type[i])
- /* Ex: old explored (safe) state has STACK_SPILL in
- * this stack slot, but current has has STACK_MISC ->
- * this verifier states are not equivalent,
- * return false to continue verification of this path
- */
- goto out_free;
- if (i % BPF_REG_SIZE)
- continue;
- if (old->stack_slot_type[i] != STACK_SPILL)
- continue;
- if (!regsafe(&old->spilled_regs[i / BPF_REG_SIZE],
- &cur->spilled_regs[i / BPF_REG_SIZE],
- idmap))
- /* when explored and current stack slot are both storing
- * spilled registers, check that stored pointers types
- * are the same as well.
- * Ex: explored safe path could have stored
- * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8}
- * but current path has stored:
- * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16}
- * such verifier states are not equivalent.
- * return false to continue verification of this path
- */
- goto out_free;
- else
- continue;
- }
+ if (!stacksafe(old, cur, idmap))
+ goto out_free;
ret = true;
out_free:
kfree(idmap);
@@ -3447,17 +3636,19 @@ static bool do_propagate_liveness(const struct bpf_verifier_state *state,
}
}
/* ... and stack slots */
- for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++) {
- if (parent->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL)
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE &&
+ i < parent->allocated_stack / BPF_REG_SIZE; i++) {
+ if (parent->stack[i].slot_type[0] != STACK_SPILL)
continue;
- if (state->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL)
+ if (state->stack[i].slot_type[0] != STACK_SPILL)
continue;
- if (parent->spilled_regs[i].live & REG_LIVE_READ)
+ if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ)
continue;
- if (writes && (state->spilled_regs[i].live & REG_LIVE_WRITTEN))
+ if (writes &&
+ (state->stack[i].spilled_ptr.live & REG_LIVE_WRITTEN))
continue;
- if (state->spilled_regs[i].live & REG_LIVE_READ) {
- parent->spilled_regs[i].live |= REG_LIVE_READ;
+ if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) {
+ parent->stack[i].spilled_ptr.live |= REG_LIVE_READ;
touched = true;
}
}
@@ -3487,7 +3678,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
{
struct bpf_verifier_state_list *new_sl;
struct bpf_verifier_state_list *sl;
- int i;
+ struct bpf_verifier_state *cur = env->cur_state;
+ int i, err;
sl = env->explored_states[insn_idx];
if (!sl)
@@ -3497,7 +3689,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
return 0;
while (sl != STATE_LIST_MARK) {
- if (states_equal(env, &sl->state, &env->cur_state)) {
+ if (states_equal(env, &sl->state, cur)) {
/* reached equivalent register/stack state,
* prune the search.
* Registers read by the continuation are read by us.
@@ -3508,7 +3700,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* they'll be immediately forgotten as we're pruning
* this state and will pop a new one.
*/
- propagate_liveness(&sl->state, &env->cur_state);
+ propagate_liveness(&sl->state, cur);
return 1;
}
sl = sl->next;
@@ -3520,16 +3712,21 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* it will be rejected. Since there are no loops, we won't be
* seeing this 'insn_idx' instruction again on the way to bpf_exit
*/
- new_sl = kmalloc(sizeof(struct bpf_verifier_state_list), GFP_USER);
+ new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL);
if (!new_sl)
return -ENOMEM;
/* add new state to the head of linked list */
- memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state));
+ err = copy_verifier_state(&new_sl->state, cur);
+ if (err) {
+ free_verifier_state(&new_sl->state, false);
+ kfree(new_sl);
+ return err;
+ }
new_sl->next = env->explored_states[insn_idx];
env->explored_states[insn_idx] = new_sl;
/* connect new state to parentage chain */
- env->cur_state.parent = &new_sl->state;
+ cur->parent = &new_sl->state;
/* clear write marks in current state: the writes we did are not writes
* our child did, so they don't screen off its reads from us.
* (There are no read marks in current state, because reads always mark
@@ -3537,33 +3734,37 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* explored_states can get read marks.)
*/
for (i = 0; i < BPF_REG_FP; i++)
- env->cur_state.regs[i].live = REG_LIVE_NONE;
- for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++)
- if (env->cur_state.stack_slot_type[i * BPF_REG_SIZE] == STACK_SPILL)
- env->cur_state.spilled_regs[i].live = REG_LIVE_NONE;
+ cur->regs[i].live = REG_LIVE_NONE;
+ for (i = 0; i < cur->allocated_stack / BPF_REG_SIZE; i++)
+ if (cur->stack[i].slot_type[0] == STACK_SPILL)
+ cur->stack[i].spilled_ptr.live = REG_LIVE_NONE;
return 0;
}
static int ext_analyzer_insn_hook(struct bpf_verifier_env *env,
int insn_idx, int prev_insn_idx)
{
- if (!env->analyzer_ops || !env->analyzer_ops->insn_hook)
- return 0;
+ if (env->dev_ops && env->dev_ops->insn_hook)
+ return env->dev_ops->insn_hook(env, insn_idx, prev_insn_idx);
- return env->analyzer_ops->insn_hook(env, insn_idx, prev_insn_idx);
+ return 0;
}
static int do_check(struct bpf_verifier_env *env)
{
- struct bpf_verifier_state *state = &env->cur_state;
+ struct bpf_verifier_state *state;
struct bpf_insn *insns = env->prog->insnsi;
- struct bpf_reg_state *regs = state->regs;
+ struct bpf_reg_state *regs;
int insn_cnt = env->prog->len;
int insn_idx, prev_insn_idx = 0;
int insn_processed = 0;
bool do_print_state = false;
- init_reg_state(regs);
+ state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL);
+ if (!state)
+ return -ENOMEM;
+ env->cur_state = state;
+ init_reg_state(env, state->regs);
state->parent = NULL;
insn_idx = 0;
for (;;) {
@@ -3572,7 +3773,7 @@ static int do_check(struct bpf_verifier_env *env)
int err;
if (insn_idx >= insn_cnt) {
- verbose("invalid insn idx %d insn_cnt %d\n",
+ verbose(env, "invalid insn idx %d insn_cnt %d\n",
insn_idx, insn_cnt);
return -EFAULT;
}
@@ -3581,7 +3782,8 @@ static int do_check(struct bpf_verifier_env *env)
class = BPF_CLASS(insn->code);
if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) {
- verbose("BPF program is too large. Processed %d insn\n",
+ verbose(env,
+ "BPF program is too large. Processed %d insn\n",
insn_processed);
return -E2BIG;
}
@@ -3591,12 +3793,12 @@ static int do_check(struct bpf_verifier_env *env)
return err;
if (err == 1) {
/* found equivalent state, can prune the search */
- if (log_level) {
+ if (env->log.level) {
if (do_print_state)
- verbose("\nfrom %d to %d: safe\n",
+ verbose(env, "\nfrom %d to %d: safe\n",
prev_insn_idx, insn_idx);
else
- verbose("%d: safe\n", insn_idx);
+ verbose(env, "%d: safe\n", insn_idx);
}
goto process_bpf_exit;
}
@@ -3604,25 +3806,28 @@ static int do_check(struct bpf_verifier_env *env)
if (need_resched())
cond_resched();
- if (log_level > 1 || (log_level && do_print_state)) {
- if (log_level > 1)
- verbose("%d:", insn_idx);
+ if (env->log.level > 1 || (env->log.level && do_print_state)) {
+ if (env->log.level > 1)
+ verbose(env, "%d:", insn_idx);
else
- verbose("\nfrom %d to %d:",
+ verbose(env, "\nfrom %d to %d:",
prev_insn_idx, insn_idx);
- print_verifier_state(&env->cur_state);
+ print_verifier_state(env, state);
do_print_state = false;
}
- if (log_level) {
- verbose("%d: ", insn_idx);
- print_bpf_insn(env, insn);
+ if (env->log.level) {
+ verbose(env, "%d: ", insn_idx);
+ print_bpf_insn(verbose, env, insn,
+ env->allow_ptr_leaks);
}
err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx);
if (err)
return err;
+ regs = cur_regs(env);
+ env->insn_aux_data[insn_idx].seen = true;
if (class == BPF_ALU || class == BPF_ALU64) {
err = check_alu_op(env, insn);
if (err)
@@ -3672,7 +3877,7 @@ static int do_check(struct bpf_verifier_env *env)
* src_reg == stack|map in some other branch.
* Reject it.
*/
- verbose("same insn cannot be used with different pointers\n");
+ verbose(env, "same insn cannot be used with different pointers\n");
return -EINVAL;
}
@@ -3712,14 +3917,14 @@ static int do_check(struct bpf_verifier_env *env)
} else if (dst_reg_type != *prev_dst_type &&
(dst_reg_type == PTR_TO_CTX ||
*prev_dst_type == PTR_TO_CTX)) {
- verbose("same insn cannot be used with different pointers\n");
+ verbose(env, "same insn cannot be used with different pointers\n");
return -EINVAL;
}
} else if (class == BPF_ST) {
if (BPF_MODE(insn->code) != BPF_MEM ||
insn->src_reg != BPF_REG_0) {
- verbose("BPF_ST uses reserved fields\n");
+ verbose(env, "BPF_ST uses reserved fields\n");
return -EINVAL;
}
/* check src operand */
@@ -3742,7 +3947,7 @@ static int do_check(struct bpf_verifier_env *env)
insn->off != 0 ||
insn->src_reg != BPF_REG_0 ||
insn->dst_reg != BPF_REG_0) {
- verbose("BPF_CALL uses reserved fields\n");
+ verbose(env, "BPF_CALL uses reserved fields\n");
return -EINVAL;
}
@@ -3755,7 +3960,7 @@ static int do_check(struct bpf_verifier_env *env)
insn->imm != 0 ||
insn->src_reg != BPF_REG_0 ||
insn->dst_reg != BPF_REG_0) {
- verbose("BPF_JA uses reserved fields\n");
+ verbose(env, "BPF_JA uses reserved fields\n");
return -EINVAL;
}
@@ -3767,7 +3972,7 @@ static int do_check(struct bpf_verifier_env *env)
insn->imm != 0 ||
insn->src_reg != BPF_REG_0 ||
insn->dst_reg != BPF_REG_0) {
- verbose("BPF_EXIT uses reserved fields\n");
+ verbose(env, "BPF_EXIT uses reserved fields\n");
return -EINVAL;
}
@@ -3782,13 +3987,18 @@ static int do_check(struct bpf_verifier_env *env)
return err;
if (is_pointer_value(env, BPF_REG_0)) {
- verbose("R0 leaks addr as return value\n");
+ verbose(env, "R0 leaks addr as return value\n");
return -EACCES;
}
+ err = check_return_code(env);
+ if (err)
+ return err;
process_bpf_exit:
- insn_idx = pop_stack(env, &prev_insn_idx);
- if (insn_idx < 0) {
+ err = pop_stack(env, &prev_insn_idx, &insn_idx);
+ if (err < 0) {
+ if (err != -ENOENT)
+ return err;
break;
} else {
do_print_state = true;
@@ -3813,20 +4023,21 @@ process_bpf_exit:
return err;
insn_idx++;
+ env->insn_aux_data[insn_idx].seen = true;
} else {
- verbose("invalid BPF_LD mode\n");
+ verbose(env, "invalid BPF_LD mode\n");
return -EINVAL;
}
} else {
- verbose("unknown insn class %d\n", class);
+ verbose(env, "unknown insn class %d\n", class);
return -EINVAL;
}
insn_idx++;
}
- verbose("processed %d insns, stack depth %d\n",
- insn_processed, env->prog->aux->stack_depth);
+ verbose(env, "processed %d insns, stack depth %d\n", insn_processed,
+ env->prog->aux->stack_depth);
return 0;
}
@@ -3838,7 +4049,8 @@ static int check_map_prealloc(struct bpf_map *map)
!(map->map_flags & BPF_F_NO_PREALLOC);
}
-static int check_map_prog_compatibility(struct bpf_map *map,
+static int check_map_prog_compatibility(struct bpf_verifier_env *env,
+ struct bpf_map *map,
struct bpf_prog *prog)
{
@@ -3849,12 +4061,12 @@ static int check_map_prog_compatibility(struct bpf_map *map,
*/
if (prog->type == BPF_PROG_TYPE_PERF_EVENT) {
if (!check_map_prealloc(map)) {
- verbose("perf_event programs can only use preallocated hash map\n");
+ verbose(env, "perf_event programs can only use preallocated hash map\n");
return -EINVAL;
}
if (map->inner_map_meta &&
!check_map_prealloc(map->inner_map_meta)) {
- verbose("perf_event programs can only use preallocated inner hash map\n");
+ verbose(env, "perf_event programs can only use preallocated inner hash map\n");
return -EINVAL;
}
}
@@ -3877,14 +4089,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
for (i = 0; i < insn_cnt; i++, insn++) {
if (BPF_CLASS(insn->code) == BPF_LDX &&
(BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) {
- verbose("BPF_LDX uses reserved fields\n");
+ verbose(env, "BPF_LDX uses reserved fields\n");
return -EINVAL;
}
if (BPF_CLASS(insn->code) == BPF_STX &&
((BPF_MODE(insn->code) != BPF_MEM &&
BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) {
- verbose("BPF_STX uses reserved fields\n");
+ verbose(env, "BPF_STX uses reserved fields\n");
return -EINVAL;
}
@@ -3895,7 +4107,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
if (i == insn_cnt - 1 || insn[1].code != 0 ||
insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
insn[1].off != 0) {
- verbose("invalid bpf_ld_imm64 insn\n");
+ verbose(env, "invalid bpf_ld_imm64 insn\n");
return -EINVAL;
}
@@ -3904,19 +4116,20 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
goto next_insn;
if (insn->src_reg != BPF_PSEUDO_MAP_FD) {
- verbose("unrecognized bpf_ld_imm64 insn\n");
+ verbose(env,
+ "unrecognized bpf_ld_imm64 insn\n");
return -EINVAL;
}
f = fdget(insn->imm);
map = __bpf_map_get(f);
if (IS_ERR(map)) {
- verbose("fd %d is not pointing to valid bpf_map\n",
+ verbose(env, "fd %d is not pointing to valid bpf_map\n",
insn->imm);
return PTR_ERR(map);
}
- err = check_map_prog_compatibility(map, env->prog);
+ err = check_map_prog_compatibility(env, map, env->prog);
if (err) {
fdput(f);
return err;
@@ -3993,6 +4206,7 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len,
u32 off, u32 cnt)
{
struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data;
+ int i;
if (cnt == 1)
return 0;
@@ -4002,6 +4216,8 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len,
memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off);
memcpy(new_data + off + cnt - 1, old_data + off,
sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
+ for (i = off; i < off + cnt - 1; i++)
+ new_data[i].seen = true;
env->insn_aux_data = new_data;
vfree(old_data);
return 0;
@@ -4020,12 +4236,31 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
return new_prog;
}
+/* The verifier does more data flow analysis than llvm and will not explore
+ * branches that are dead at run time. Malicious programs can have dead code
+ * too. Therefore replace all dead at-run-time code with nops.
+ */
+static void sanitize_dead_code(struct bpf_verifier_env *env)
+{
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+ struct bpf_insn nop = BPF_MOV64_REG(BPF_REG_0, BPF_REG_0);
+ struct bpf_insn *insn = env->prog->insnsi;
+ const int insn_cnt = env->prog->len;
+ int i;
+
+ for (i = 0; i < insn_cnt; i++) {
+ if (aux_data[i].seen)
+ continue;
+ memcpy(insn + i, &nop, sizeof(nop));
+ }
+}
+
/* convert load instructions that access fields of 'struct __sk_buff'
* into sequence of instructions that access fields of 'struct sk_buff'
*/
static int convert_ctx_accesses(struct bpf_verifier_env *env)
{
- const struct bpf_verifier_ops *ops = env->prog->aux->ops;
+ const struct bpf_verifier_ops *ops = env->ops;
int i, cnt, size, ctx_field_size, delta = 0;
const int insn_cnt = env->prog->len;
struct bpf_insn insn_buf[16], *insn;
@@ -4038,7 +4273,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
env->prog);
if (cnt >= ARRAY_SIZE(insn_buf)) {
- verbose("bpf verifier is misconfigured\n");
+ verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
} else if (cnt) {
new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
@@ -4086,7 +4321,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
u8 size_code;
if (type == BPF_WRITE) {
- verbose("bpf verifier narrow ctx access misconfigured\n");
+ verbose(env, "bpf verifier narrow ctx access misconfigured\n");
return -EINVAL;
}
@@ -4105,7 +4340,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
&target_size);
if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) ||
(ctx_field_size && !target_size)) {
- verbose("bpf verifier is misconfigured\n");
+ verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
}
@@ -4187,7 +4422,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf);
if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
- verbose("bpf verifier is misconfigured\n");
+ verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
}
@@ -4205,7 +4440,12 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
}
if (insn->imm == BPF_FUNC_redirect_map) {
- u64 addr = (unsigned long)prog;
+ /* Note, we cannot use prog directly as imm as subsequent
+ * rewrites would still change the prog pointer. The only
+ * stable address we can use is aux, which also works with
+ * prog clones during blinding.
+ */
+ u64 addr = (unsigned long)prog->aux;
struct bpf_insn r4_ld[] = {
BPF_LD_IMM64(BPF_REG_4, addr),
*insn,
@@ -4221,12 +4461,13 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
insn = new_prog->insnsi + i + delta;
}
patch_call_imm:
- fn = prog->aux->ops->get_func_proto(insn->imm);
+ fn = env->ops->get_func_proto(insn->imm);
/* all functions that have prototype and verifier allowed
* programs to call them, must be real in-kernel functions
*/
if (!fn->func) {
- verbose("kernel subsystem misconfigured func %s#%d\n",
+ verbose(env,
+ "kernel subsystem misconfigured func %s#%d\n",
func_id_name(insn->imm), insn->imm);
return -EFAULT;
}
@@ -4250,6 +4491,7 @@ static void free_states(struct bpf_verifier_env *env)
if (sl)
while (sl != STATE_LIST_MARK) {
sln = sl->next;
+ free_verifier_state(&sl->state, false);
kfree(sl);
sl = sln;
}
@@ -4260,16 +4502,21 @@ static void free_states(struct bpf_verifier_env *env)
int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
{
- char __user *log_ubuf = NULL;
struct bpf_verifier_env *env;
+ struct bpf_verifer_log *log;
int ret = -EINVAL;
+ /* no program is valid */
+ if (ARRAY_SIZE(bpf_verifier_ops) == 0)
+ return -EINVAL;
+
/* 'struct bpf_verifier_env' can be global, but since it's not small,
* allocate/free it every time bpf_check() is called
*/
env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
if (!env)
return -ENOMEM;
+ log = &env->log;
env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) *
(*prog)->len);
@@ -4277,6 +4524,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
if (!env->insn_aux_data)
goto err_free_env;
env->prog = *prog;
+ env->ops = bpf_verifier_ops[env->prog->type];
/* grab the mutex to protect few globals used by verifier */
mutex_lock(&bpf_verifier_lock);
@@ -4285,29 +4533,27 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
/* user requested verbose verifier output
* and supplied buffer to store the verification trace
*/
- log_level = attr->log_level;
- log_ubuf = (char __user *) (unsigned long) attr->log_buf;
- log_size = attr->log_size;
- log_len = 0;
+ log->level = attr->log_level;
+ log->ubuf = (char __user *) (unsigned long) attr->log_buf;
+ log->len_total = attr->log_size;
ret = -EINVAL;
- /* log_* values have to be sane */
- if (log_size < 128 || log_size > UINT_MAX >> 8 ||
- log_level == 0 || log_ubuf == NULL)
- goto err_unlock;
-
- ret = -ENOMEM;
- log_buf = vmalloc(log_size);
- if (!log_buf)
+ /* log attributes have to be sane */
+ if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 ||
+ !log->level || !log->ubuf)
goto err_unlock;
- } else {
- log_level = 0;
}
env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
env->strict_alignment = true;
+ if (env->prog->aux->offload) {
+ ret = bpf_prog_offload_verifier_prep(env);
+ if (ret)
+ goto err_unlock;
+ }
+
ret = replace_map_fd_with_map_ptr(env);
if (ret < 0)
goto skip_full_check;
@@ -4326,29 +4572,30 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
ret = do_check(env);
+ if (env->cur_state) {
+ free_verifier_state(env->cur_state, true);
+ env->cur_state = NULL;
+ }
skip_full_check:
- while (pop_stack(env, NULL) >= 0);
+ while (!pop_stack(env, NULL, NULL));
free_states(env);
if (ret == 0)
+ sanitize_dead_code(env);
+
+ if (ret == 0)
/* program is valid, convert *(u32*)(ctx + off) accesses */
ret = convert_ctx_accesses(env);
if (ret == 0)
ret = fixup_bpf_calls(env);
- if (log_level && log_len >= log_size - 1) {
- BUG_ON(log_len >= log_size);
- /* verifier log exceeded user supplied buffer */
+ if (log->level && bpf_verifier_log_full(log))
ret = -ENOSPC;
- /* fall through to return what was recorded */
- }
-
- /* copy verifier log back to user space including trailing zero */
- if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) {
+ if (log->level && !log->ubuf) {
ret = -EFAULT;
- goto free_log_buf;
+ goto err_release_maps;
}
if (ret == 0 && env->used_map_cnt) {
@@ -4359,7 +4606,7 @@ skip_full_check:
if (!env->prog->aux->used_maps) {
ret = -ENOMEM;
- goto free_log_buf;
+ goto err_release_maps;
}
memcpy(env->prog->aux->used_maps, env->used_maps,
@@ -4372,9 +4619,7 @@ skip_full_check:
convert_pseudo_ld_imm64(env);
}
-free_log_buf:
- if (log_level)
- vfree(log_buf);
+err_release_maps:
if (!env->prog->aux->used_maps)
/* if we didn't copy map pointers into bpf_prog_info, release
* them now. Otherwise free_bpf_prog_info() will release them.
@@ -4388,58 +4633,3 @@ err_free_env:
kfree(env);
return ret;
}
-
-int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
- void *priv)
-{
- struct bpf_verifier_env *env;
- int ret;
-
- env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
- if (!env)
- return -ENOMEM;
-
- env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) *
- prog->len);
- ret = -ENOMEM;
- if (!env->insn_aux_data)
- goto err_free_env;
- env->prog = prog;
- env->analyzer_ops = ops;
- env->analyzer_priv = priv;
-
- /* grab the mutex to protect few globals used by verifier */
- mutex_lock(&bpf_verifier_lock);
-
- log_level = 0;
-
- env->strict_alignment = false;
- if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
- env->strict_alignment = true;
-
- env->explored_states = kcalloc(env->prog->len,
- sizeof(struct bpf_verifier_state_list *),
- GFP_KERNEL);
- ret = -ENOMEM;
- if (!env->explored_states)
- goto skip_full_check;
-
- ret = check_cfg(env);
- if (ret < 0)
- goto skip_full_check;
-
- env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
-
- ret = do_check(env);
-
-skip_full_check:
- while (pop_stack(env, NULL) >= 0);
- free_states(env);
-
- mutex_unlock(&bpf_verifier_lock);
- vfree(env->insn_aux_data);
-err_free_env:
- kfree(env);
- return ret;
-}
-EXPORT_SYMBOL_GPL(bpf_analyzer);
diff --git a/kernel/capability.c b/kernel/capability.c
index f97fe77ceb88..1e1c0236f55b 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/kernel/capability.c
*
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index ce693ccb8c58..2be89a003185 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -1,4 +1,5 @@
-obj-y := cgroup.o namespace.o cgroup-v1.o
+# SPDX-License-Identifier: GPL-2.0
+obj-y := cgroup.o stat.o namespace.o cgroup-v1.o
obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
obj-$(CONFIG_CGROUP_PIDS) += pids.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 5151ff256c29..b928b27050c6 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __CGROUP_INTERNAL_H
#define __CGROUP_INTERNAL_H
@@ -200,6 +201,15 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
int cgroup_task_count(const struct cgroup *cgrp);
/*
+ * stat.c
+ */
+void cgroup_stat_flush(struct cgroup *cgrp);
+int cgroup_stat_init(struct cgroup *cgrp);
+void cgroup_stat_exit(struct cgroup *cgrp);
+void cgroup_stat_show_cputime(struct seq_file *seq);
+void cgroup_stat_boot(void);
+
+/*
* namespace.c
*/
extern const struct proc_ns_operations cgroupns_operations;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index d6551cd45238..0b1ffe147f24 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -142,12 +142,14 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
};
#undef SUBSYS
+static DEFINE_PER_CPU(struct cgroup_cpu_stat, cgrp_dfl_root_cpu_stat);
+
/*
* The default hierarchy, reserved for the subsystems that are otherwise
* unattached - it never has more than a single cgroup, and all tasks are
* part of that cgroup.
*/
-struct cgroup_root cgrp_dfl_root;
+struct cgroup_root cgrp_dfl_root = { .cgrp.cpu_stat = &cgrp_dfl_root_cpu_stat };
EXPORT_SYMBOL_GPL(cgrp_dfl_root);
/*
@@ -462,6 +464,28 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
}
/**
+ * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest
+ *
+ * Find and get @cgrp's css assocaited with @ss. If the css doesn't exist
+ * or is offline, %NULL is returned.
+ */
+static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ struct cgroup_subsys_state *css;
+
+ rcu_read_lock();
+ css = cgroup_css(cgrp, ss);
+ if (!css || !css_tryget_online(css))
+ css = NULL;
+ rcu_read_unlock();
+
+ return css;
+}
+
+/**
* cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest (%NULL returns @cgrp->self)
@@ -647,6 +671,14 @@ struct css_set init_css_set = {
.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
.mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
+
+ /*
+ * The following field is re-initialized when this cset gets linked
+ * in cgroup_init(). However, let's initialize the field
+ * statically too so that the default cgroup can be accessed safely
+ * early during boot.
+ */
+ .dfl_cgrp = &cgrp_dfl_root.cgrp,
};
static int css_set_count = 1; /* 1 for init_css_set */
@@ -1896,6 +1928,9 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
if (ret)
goto destroy_root;
+ ret = cgroup_bpf_inherit(root_cgrp);
+ WARN_ON_ONCE(ret);
+
trace_cgroup_setup_root(root);
/*
@@ -2311,6 +2346,14 @@ out_release_tset:
list_del_init(&cset->mg_node);
}
spin_unlock_irq(&css_set_lock);
+
+ /*
+ * Re-initialize the cgroup_taskset structure in case it is reused
+ * again in another cgroup_migrate_add_task()/cgroup_migrate_execute()
+ * iteration.
+ */
+ tset->nr_tasks = 0;
+ tset->csets = &tset->src_csets;
return ret;
}
@@ -3304,6 +3347,37 @@ static int cgroup_stat_show(struct seq_file *seq, void *v)
return 0;
}
+static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
+ struct cgroup *cgrp, int ssid)
+{
+ struct cgroup_subsys *ss = cgroup_subsys[ssid];
+ struct cgroup_subsys_state *css;
+ int ret;
+
+ if (!ss->css_extra_stat_show)
+ return 0;
+
+ css = cgroup_tryget_css(cgrp, ss);
+ if (!css)
+ return 0;
+
+ ret = ss->css_extra_stat_show(seq, css);
+ css_put(css);
+ return ret;
+}
+
+static int cpu_stat_show(struct seq_file *seq, void *v)
+{
+ struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
+ int ret = 0;
+
+ cgroup_stat_show_cputime(seq);
+#ifdef CONFIG_CGROUP_SCHED
+ ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
+#endif
+ return ret;
+}
+
static int cgroup_file_open(struct kernfs_open_file *of)
{
struct cftype *cft = of->kn->priv;
@@ -4411,6 +4485,11 @@ static struct cftype cgroup_base_files[] = {
.name = "cgroup.stat",
.seq_show = cgroup_stat_show,
},
+ {
+ .name = "cpu.stat",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cpu_stat_show,
+ },
{ } /* terminate */
};
@@ -4471,6 +4550,8 @@ static void css_free_work_fn(struct work_struct *work)
*/
cgroup_put(cgroup_parent(cgrp));
kernfs_put(cgrp->kn);
+ if (cgroup_on_dfl(cgrp))
+ cgroup_stat_exit(cgrp);
kfree(cgrp);
} else {
/*
@@ -4515,6 +4596,9 @@ static void css_release_work_fn(struct work_struct *work)
/* cgroup release path */
trace_cgroup_release(cgrp);
+ if (cgroup_on_dfl(cgrp))
+ cgroup_stat_flush(cgrp);
+
for (tcgrp = cgroup_parent(cgrp); tcgrp;
tcgrp = cgroup_parent(tcgrp))
tcgrp->nr_dying_descendants--;
@@ -4698,6 +4782,12 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
if (ret)
goto out_free_cgrp;
+ if (cgroup_on_dfl(parent)) {
+ ret = cgroup_stat_init(cgrp);
+ if (ret)
+ goto out_cancel_ref;
+ }
+
/*
* Temporarily set the pointer to NULL, so idr_find() won't return
* a half-baked cgroup.
@@ -4705,7 +4795,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
if (cgrp->id < 0) {
ret = -ENOMEM;
- goto out_cancel_ref;
+ goto out_stat_exit;
}
init_cgroup_housekeeping(cgrp);
@@ -4713,6 +4803,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
cgrp->self.parent = &parent->self;
cgrp->root = root;
cgrp->level = level;
+ ret = cgroup_bpf_inherit(cgrp);
+ if (ret)
+ goto out_idr_free;
for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
@@ -4747,13 +4840,15 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
if (!cgroup_on_dfl(cgrp))
cgrp->subtree_control = cgroup_control(cgrp);
- if (parent)
- cgroup_bpf_inherit(cgrp, parent);
-
cgroup_propagate_control(cgrp);
return cgrp;
+out_idr_free:
+ cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
+out_stat_exit:
+ if (cgroup_on_dfl(parent))
+ cgroup_stat_exit(cgrp);
out_cancel_ref:
percpu_ref_exit(&cgrp->self.refcnt);
out_free_cgrp:
@@ -5148,6 +5243,8 @@ int __init cgroup_init(void)
BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
+ cgroup_stat_boot();
+
/*
* The latency of the synchronize_sched() is too high for cgroups,
* avoid it at the cost of forcing all readers into the slow path.
@@ -5736,15 +5833,103 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
#endif /* CONFIG_SOCK_CGROUP_DATA */
#ifdef CONFIG_CGROUP_BPF
-int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
- enum bpf_attach_type type, bool overridable)
+int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+ enum bpf_attach_type type, u32 flags)
+{
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+ ret = __cgroup_bpf_attach(cgrp, prog, type, flags);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+ enum bpf_attach_type type, u32 flags)
+{
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+ ret = __cgroup_bpf_detach(cgrp, prog, type, flags);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
{
- struct cgroup *parent = cgroup_parent(cgrp);
int ret;
mutex_lock(&cgroup_mutex);
- ret = __cgroup_bpf_update(cgrp, parent, prog, type, overridable);
+ ret = __cgroup_bpf_query(cgrp, attr, uattr);
mutex_unlock(&cgroup_mutex);
return ret;
}
#endif /* CONFIG_CGROUP_BPF */
+
+#ifdef CONFIG_SYSFS
+static ssize_t show_delegatable_files(struct cftype *files, char *buf,
+ ssize_t size, const char *prefix)
+{
+ struct cftype *cft;
+ ssize_t ret = 0;
+
+ for (cft = files; cft && cft->name[0] != '\0'; cft++) {
+ if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
+ continue;
+
+ if (prefix)
+ ret += snprintf(buf + ret, size - ret, "%s.", prefix);
+
+ ret += snprintf(buf + ret, size - ret, "%s\n", cft->name);
+
+ if (unlikely(ret >= size)) {
+ WARN_ON(1);
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct cgroup_subsys *ss;
+ int ssid;
+ ssize_t ret = 0;
+
+ ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret,
+ NULL);
+
+ for_each_subsys(ss, ssid)
+ ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
+ PAGE_SIZE - ret,
+ cgroup_subsys_name[ssid]);
+
+ return ret;
+}
+static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);
+
+static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "nsdelegate\n");
+}
+static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
+
+static struct attribute *cgroup_sysfs_attrs[] = {
+ &cgroup_delegate_attr.attr,
+ &cgroup_features_attr.attr,
+ NULL,
+};
+
+static const struct attribute_group cgroup_sysfs_attr_group = {
+ .attrs = cgroup_sysfs_attrs,
+ .name = "cgroup",
+};
+
+static int __init cgroup_sysfs_init(void)
+{
+ return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group);
+}
+subsys_initcall(cgroup_sysfs_init);
+#endif /* CONFIG_SYSFS */
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 4657e2924ecb..f7efa7b4d825 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -57,7 +57,7 @@
#include <linux/backing-dev.h>
#include <linux/sort.h>
#include <linux/oom.h>
-
+#include <linux/sched/isolation.h>
#include <linux/uaccess.h>
#include <linux/atomic.h>
#include <linux/mutex.h>
@@ -656,7 +656,6 @@ static int generate_sched_domains(cpumask_var_t **domains,
int csn; /* how many cpuset ptrs in csa so far */
int i, j, k; /* indices for partition finding loops */
cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
- cpumask_var_t non_isolated_cpus; /* load balanced CPUs */
struct sched_domain_attr *dattr; /* attributes for custom domains */
int ndoms = 0; /* number of sched domains in result */
int nslot; /* next empty doms[] struct cpumask slot */
@@ -666,10 +665,6 @@ static int generate_sched_domains(cpumask_var_t **domains,
dattr = NULL;
csa = NULL;
- if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL))
- goto done;
- cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
-
/* Special case for the 99% of systems with one, full, sched domain */
if (is_sched_load_balance(&top_cpuset)) {
ndoms = 1;
@@ -683,7 +678,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
update_domain_attr_tree(dattr, &top_cpuset);
}
cpumask_and(doms[0], top_cpuset.effective_cpus,
- non_isolated_cpus);
+ housekeeping_cpumask(HK_FLAG_DOMAIN));
goto done;
}
@@ -707,7 +702,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
*/
if (!cpumask_empty(cp->cpus_allowed) &&
!(is_sched_load_balance(cp) &&
- cpumask_intersects(cp->cpus_allowed, non_isolated_cpus)))
+ cpumask_intersects(cp->cpus_allowed,
+ housekeeping_cpumask(HK_FLAG_DOMAIN))))
continue;
if (is_sched_load_balance(cp))
@@ -789,7 +785,7 @@ restart:
if (apn == b->pn) {
cpumask_or(dp, dp, b->effective_cpus);
- cpumask_and(dp, dp, non_isolated_cpus);
+ cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
if (dattr)
update_domain_attr_tree(dattr + nslot, b);
@@ -802,7 +798,6 @@ restart:
BUG_ON(nslot != ndoms);
done:
- free_cpumask_var(non_isolated_cpus);
kfree(csa);
/*
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index f661b4cc5efd..5f780d8f6a9d 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Debug controller
*
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
index 66129eb4371d..b05f1dd58a62 100644
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include "cgroup-internal.h"
#include <linux/sched/task.h>
diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c
new file mode 100644
index 000000000000..133b465691d6
--- /dev/null
+++ b/kernel/cgroup/stat.c
@@ -0,0 +1,334 @@
+#include "cgroup-internal.h"
+
+#include <linux/sched/cputime.h>
+
+static DEFINE_MUTEX(cgroup_stat_mutex);
+static DEFINE_PER_CPU(raw_spinlock_t, cgroup_cpu_stat_lock);
+
+static struct cgroup_cpu_stat *cgroup_cpu_stat(struct cgroup *cgrp, int cpu)
+{
+ return per_cpu_ptr(cgrp->cpu_stat, cpu);
+}
+
+/**
+ * cgroup_cpu_stat_updated - keep track of updated cpu_stat
+ * @cgrp: target cgroup
+ * @cpu: cpu on which cpu_stat was updated
+ *
+ * @cgrp's cpu_stat on @cpu was updated. Put it on the parent's matching
+ * cpu_stat->updated_children list. See the comment on top of
+ * cgroup_cpu_stat definition for details.
+ */
+static void cgroup_cpu_stat_updated(struct cgroup *cgrp, int cpu)
+{
+ raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
+ struct cgroup *parent;
+ unsigned long flags;
+
+ /*
+ * Speculative already-on-list test. This may race leading to
+ * temporary inaccuracies, which is fine.
+ *
+ * Because @parent's updated_children is terminated with @parent
+ * instead of NULL, we can tell whether @cgrp is on the list by
+ * testing the next pointer for NULL.
+ */
+ if (cgroup_cpu_stat(cgrp, cpu)->updated_next)
+ return;
+
+ raw_spin_lock_irqsave(cpu_lock, flags);
+
+ /* put @cgrp and all ancestors on the corresponding updated lists */
+ for (parent = cgroup_parent(cgrp); parent;
+ cgrp = parent, parent = cgroup_parent(cgrp)) {
+ struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+ struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
+
+ /*
+ * Both additions and removals are bottom-up. If a cgroup
+ * is already in the tree, all ancestors are.
+ */
+ if (cstat->updated_next)
+ break;
+
+ cstat->updated_next = pcstat->updated_children;
+ pcstat->updated_children = cgrp;
+ }
+
+ raw_spin_unlock_irqrestore(cpu_lock, flags);
+}
+
+/**
+ * cgroup_cpu_stat_pop_updated - iterate and dismantle cpu_stat updated tree
+ * @pos: current position
+ * @root: root of the tree to traversal
+ * @cpu: target cpu
+ *
+ * Walks the udpated cpu_stat tree on @cpu from @root. %NULL @pos starts
+ * the traversal and %NULL return indicates the end. During traversal,
+ * each returned cgroup is unlinked from the tree. Must be called with the
+ * matching cgroup_cpu_stat_lock held.
+ *
+ * The only ordering guarantee is that, for a parent and a child pair
+ * covered by a given traversal, if a child is visited, its parent is
+ * guaranteed to be visited afterwards.
+ */
+static struct cgroup *cgroup_cpu_stat_pop_updated(struct cgroup *pos,
+ struct cgroup *root, int cpu)
+{
+ struct cgroup_cpu_stat *cstat;
+ struct cgroup *parent;
+
+ if (pos == root)
+ return NULL;
+
+ /*
+ * We're gonna walk down to the first leaf and visit/remove it. We
+ * can pick whatever unvisited node as the starting point.
+ */
+ if (!pos)
+ pos = root;
+ else
+ pos = cgroup_parent(pos);
+
+ /* walk down to the first leaf */
+ while (true) {
+ cstat = cgroup_cpu_stat(pos, cpu);
+ if (cstat->updated_children == pos)
+ break;
+ pos = cstat->updated_children;
+ }
+
+ /*
+ * Unlink @pos from the tree. As the updated_children list is
+ * singly linked, we have to walk it to find the removal point.
+ * However, due to the way we traverse, @pos will be the first
+ * child in most cases. The only exception is @root.
+ */
+ parent = cgroup_parent(pos);
+ if (parent && cstat->updated_next) {
+ struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
+ struct cgroup_cpu_stat *ncstat;
+ struct cgroup **nextp;
+
+ nextp = &pcstat->updated_children;
+ while (true) {
+ ncstat = cgroup_cpu_stat(*nextp, cpu);
+ if (*nextp == pos)
+ break;
+
+ WARN_ON_ONCE(*nextp == parent);
+ nextp = &ncstat->updated_next;
+ }
+
+ *nextp = cstat->updated_next;
+ cstat->updated_next = NULL;
+ }
+
+ return pos;
+}
+
+static void cgroup_stat_accumulate(struct cgroup_stat *dst_stat,
+ struct cgroup_stat *src_stat)
+{
+ dst_stat->cputime.utime += src_stat->cputime.utime;
+ dst_stat->cputime.stime += src_stat->cputime.stime;
+ dst_stat->cputime.sum_exec_runtime += src_stat->cputime.sum_exec_runtime;
+}
+
+static void cgroup_cpu_stat_flush_one(struct cgroup *cgrp, int cpu)
+{
+ struct cgroup *parent = cgroup_parent(cgrp);
+ struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+ struct task_cputime *last_cputime = &cstat->last_cputime;
+ struct task_cputime cputime;
+ struct cgroup_stat delta;
+ unsigned seq;
+
+ lockdep_assert_held(&cgroup_stat_mutex);
+
+ /* fetch the current per-cpu values */
+ do {
+ seq = __u64_stats_fetch_begin(&cstat->sync);
+ cputime = cstat->cputime;
+ } while (__u64_stats_fetch_retry(&cstat->sync, seq));
+
+ /* accumulate the deltas to propgate */
+ delta.cputime.utime = cputime.utime - last_cputime->utime;
+ delta.cputime.stime = cputime.stime - last_cputime->stime;
+ delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime -
+ last_cputime->sum_exec_runtime;
+ *last_cputime = cputime;
+
+ /* transfer the pending stat into delta */
+ cgroup_stat_accumulate(&delta, &cgrp->pending_stat);
+ memset(&cgrp->pending_stat, 0, sizeof(cgrp->pending_stat));
+
+ /* propagate delta into the global stat and the parent's pending */
+ cgroup_stat_accumulate(&cgrp->stat, &delta);
+ if (parent)
+ cgroup_stat_accumulate(&parent->pending_stat, &delta);
+}
+
+/* see cgroup_stat_flush() */
+static void cgroup_stat_flush_locked(struct cgroup *cgrp)
+{
+ int cpu;
+
+ lockdep_assert_held(&cgroup_stat_mutex);
+
+ for_each_possible_cpu(cpu) {
+ raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
+ struct cgroup *pos = NULL;
+
+ raw_spin_lock_irq(cpu_lock);
+ while ((pos = cgroup_cpu_stat_pop_updated(pos, cgrp, cpu)))
+ cgroup_cpu_stat_flush_one(pos, cpu);
+ raw_spin_unlock_irq(cpu_lock);
+ }
+}
+
+/**
+ * cgroup_stat_flush - flush stats in @cgrp's subtree
+ * @cgrp: target cgroup
+ *
+ * Collect all per-cpu stats in @cgrp's subtree into the global counters
+ * and propagate them upwards. After this function returns, all cgroups in
+ * the subtree have up-to-date ->stat.
+ *
+ * This also gets all cgroups in the subtree including @cgrp off the
+ * ->updated_children lists.
+ */
+void cgroup_stat_flush(struct cgroup *cgrp)
+{
+ mutex_lock(&cgroup_stat_mutex);
+ cgroup_stat_flush_locked(cgrp);
+ mutex_unlock(&cgroup_stat_mutex);
+}
+
+static struct cgroup_cpu_stat *cgroup_cpu_stat_account_begin(struct cgroup *cgrp)
+{
+ struct cgroup_cpu_stat *cstat;
+
+ cstat = get_cpu_ptr(cgrp->cpu_stat);
+ u64_stats_update_begin(&cstat->sync);
+ return cstat;
+}
+
+static void cgroup_cpu_stat_account_end(struct cgroup *cgrp,
+ struct cgroup_cpu_stat *cstat)
+{
+ u64_stats_update_end(&cstat->sync);
+ cgroup_cpu_stat_updated(cgrp, smp_processor_id());
+ put_cpu_ptr(cstat);
+}
+
+void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
+{
+ struct cgroup_cpu_stat *cstat;
+
+ cstat = cgroup_cpu_stat_account_begin(cgrp);
+ cstat->cputime.sum_exec_runtime += delta_exec;
+ cgroup_cpu_stat_account_end(cgrp, cstat);
+}
+
+void __cgroup_account_cputime_field(struct cgroup *cgrp,
+ enum cpu_usage_stat index, u64 delta_exec)
+{
+ struct cgroup_cpu_stat *cstat;
+
+ cstat = cgroup_cpu_stat_account_begin(cgrp);
+
+ switch (index) {
+ case CPUTIME_USER:
+ case CPUTIME_NICE:
+ cstat->cputime.utime += delta_exec;
+ break;
+ case CPUTIME_SYSTEM:
+ case CPUTIME_IRQ:
+ case CPUTIME_SOFTIRQ:
+ cstat->cputime.stime += delta_exec;
+ break;
+ default:
+ break;
+ }
+
+ cgroup_cpu_stat_account_end(cgrp, cstat);
+}
+
+void cgroup_stat_show_cputime(struct seq_file *seq)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ u64 usage, utime, stime;
+
+ if (!cgroup_parent(cgrp))
+ return;
+
+ mutex_lock(&cgroup_stat_mutex);
+
+ cgroup_stat_flush_locked(cgrp);
+
+ usage = cgrp->stat.cputime.sum_exec_runtime;
+ cputime_adjust(&cgrp->stat.cputime, &cgrp->stat.prev_cputime,
+ &utime, &stime);
+
+ mutex_unlock(&cgroup_stat_mutex);
+
+ do_div(usage, NSEC_PER_USEC);
+ do_div(utime, NSEC_PER_USEC);
+ do_div(stime, NSEC_PER_USEC);
+
+ seq_printf(seq, "usage_usec %llu\n"
+ "user_usec %llu\n"
+ "system_usec %llu\n",
+ usage, utime, stime);
+}
+
+int cgroup_stat_init(struct cgroup *cgrp)
+{
+ int cpu;
+
+ /* the root cgrp has cpu_stat preallocated */
+ if (!cgrp->cpu_stat) {
+ cgrp->cpu_stat = alloc_percpu(struct cgroup_cpu_stat);
+ if (!cgrp->cpu_stat)
+ return -ENOMEM;
+ }
+
+ /* ->updated_children list is self terminated */
+ for_each_possible_cpu(cpu)
+ cgroup_cpu_stat(cgrp, cpu)->updated_children = cgrp;
+
+ prev_cputime_init(&cgrp->stat.prev_cputime);
+
+ return 0;
+}
+
+void cgroup_stat_exit(struct cgroup *cgrp)
+{
+ int cpu;
+
+ cgroup_stat_flush(cgrp);
+
+ /* sanity check */
+ for_each_possible_cpu(cpu) {
+ struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+
+ if (WARN_ON_ONCE(cstat->updated_children != cgrp) ||
+ WARN_ON_ONCE(cstat->updated_next))
+ return;
+ }
+
+ free_percpu(cgrp->cpu_stat);
+ cgrp->cpu_stat = NULL;
+}
+
+void __init cgroup_stat_boot(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ raw_spin_lock_init(per_cpu_ptr(&cgroup_cpu_stat_lock, cpu));
+
+ BUG_ON(cgroup_stat_init(&cgrp_dfl_root.cgrp));
+}
diff --git a/kernel/compat.c b/kernel/compat.c
index 772e038d04d9..d1cee656a7ed 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -367,24 +367,6 @@ COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len,
return ret;
}
-int get_compat_itimerspec(struct itimerspec *dst,
- const struct compat_itimerspec __user *src)
-{
- if (__compat_get_timespec(&dst->it_interval, &src->it_interval) ||
- __compat_get_timespec(&dst->it_value, &src->it_value))
- return -EFAULT;
- return 0;
-}
-
-int put_compat_itimerspec(struct compat_itimerspec __user *dst,
- const struct itimerspec *src)
-{
- if (__compat_put_timespec(&src->it_interval, &dst->it_interval) ||
- __compat_put_timespec(&src->it_value, &dst->it_value))
- return -EFAULT;
- return 0;
-}
-
int get_compat_itimerspec64(struct itimerspec64 *its,
const struct compat_itimerspec __user *uits)
{
@@ -485,27 +467,44 @@ Efault:
return -EFAULT;
}
-void
-sigset_from_compat(sigset_t *set, const compat_sigset_t *compat)
+int
+get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat)
{
+#ifdef __BIG_ENDIAN
+ compat_sigset_t v;
+ if (copy_from_user(&v, compat, sizeof(compat_sigset_t)))
+ return -EFAULT;
switch (_NSIG_WORDS) {
- case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 );
- case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 );
- case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 );
- case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 );
+ case 4: set->sig[3] = v.sig[6] | (((long)v.sig[7]) << 32 );
+ case 3: set->sig[2] = v.sig[4] | (((long)v.sig[5]) << 32 );
+ case 2: set->sig[1] = v.sig[2] | (((long)v.sig[3]) << 32 );
+ case 1: set->sig[0] = v.sig[0] | (((long)v.sig[1]) << 32 );
}
+#else
+ if (copy_from_user(set, compat, sizeof(compat_sigset_t)))
+ return -EFAULT;
+#endif
+ return 0;
}
-EXPORT_SYMBOL_GPL(sigset_from_compat);
+EXPORT_SYMBOL_GPL(get_compat_sigset);
-void
-sigset_to_compat(compat_sigset_t *compat, const sigset_t *set)
+int
+put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set,
+ unsigned int size)
{
+ /* size <= sizeof(compat_sigset_t) <= sizeof(sigset_t) */
+#ifdef __BIG_ENDIAN
+ compat_sigset_t v;
switch (_NSIG_WORDS) {
- case 4: compat->sig[7] = (set->sig[3] >> 32); compat->sig[6] = set->sig[3];
- case 3: compat->sig[5] = (set->sig[2] >> 32); compat->sig[4] = set->sig[2];
- case 2: compat->sig[3] = (set->sig[1] >> 32); compat->sig[2] = set->sig[1];
- case 1: compat->sig[1] = (set->sig[0] >> 32); compat->sig[0] = set->sig[0];
+ case 4: v.sig[7] = (set->sig[3] >> 32); v.sig[6] = set->sig[3];
+ case 3: v.sig[5] = (set->sig[2] >> 32); v.sig[4] = set->sig[2];
+ case 2: v.sig[3] = (set->sig[1] >> 32); v.sig[2] = set->sig[1];
+ case 1: v.sig[1] = (set->sig[0] >> 32); v.sig[0] = set->sig[0];
}
+ return copy_to_user(compat, &v, size) ? -EFAULT : 0;
+#else
+ return copy_to_user(compat, set, size) ? -EFAULT : 0;
+#endif
}
#ifdef CONFIG_NUMA
@@ -563,22 +562,6 @@ COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
}
#endif
-COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
- compat_pid_t, pid,
- struct compat_timespec __user *, interval)
-{
- struct timespec t;
- int ret;
- mm_segment_t old_fs = get_fs();
-
- set_fs(KERNEL_DS);
- ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t);
- set_fs(old_fs);
- if (compat_put_timespec(&t, interval))
- return -EFAULT;
- return ret;
-}
-
/*
* Allocate user-space memory for the duration of a single system call,
* in order to marshall parameters inside a compat thunk.
diff --git a/kernel/cpu.c b/kernel/cpu.c
index acf5308fad51..04892a82f6ac 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -24,6 +24,7 @@
#include <linux/lockdep.h>
#include <linux/tick.h>
#include <linux/irq.h>
+#include <linux/nmi.h>
#include <linux/smpboot.h>
#include <linux/relay.h>
#include <linux/slab.h>
@@ -46,11 +47,13 @@
* @bringup: Single callback bringup or teardown selector
* @cb_state: The state for a single callback (install/uninstall)
* @result: Result of the operation
- * @done: Signal completion to the issuer of the task
+ * @done_up: Signal completion to the issuer of the task for cpu-up
+ * @done_down: Signal completion to the issuer of the task for cpu-down
*/
struct cpuhp_cpu_state {
enum cpuhp_state state;
enum cpuhp_state target;
+ enum cpuhp_state fail;
#ifdef CONFIG_SMP
struct task_struct *thread;
bool should_run;
@@ -58,18 +61,39 @@ struct cpuhp_cpu_state {
bool single;
bool bringup;
struct hlist_node *node;
+ struct hlist_node *last;
enum cpuhp_state cb_state;
int result;
- struct completion done;
+ struct completion done_up;
+ struct completion done_down;
#endif
};
-static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state);
+static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = {
+ .fail = CPUHP_INVALID,
+};
#if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP)
-static struct lock_class_key cpuhp_state_key;
-static struct lockdep_map cpuhp_state_lock_map =
- STATIC_LOCKDEP_MAP_INIT("cpuhp_state", &cpuhp_state_key);
+static struct lockdep_map cpuhp_state_up_map =
+ STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map);
+static struct lockdep_map cpuhp_state_down_map =
+ STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map);
+
+
+static void inline cpuhp_lock_acquire(bool bringup)
+{
+ lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
+}
+
+static void inline cpuhp_lock_release(bool bringup)
+{
+ lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
+}
+#else
+
+static void inline cpuhp_lock_acquire(bool bringup) { }
+static void inline cpuhp_lock_release(bool bringup) { }
+
#endif
/**
@@ -123,13 +147,16 @@ static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
/**
* cpuhp_invoke_callback _ Invoke the callbacks for a given state
* @cpu: The cpu for which the callback should be invoked
- * @step: The step in the state machine
+ * @state: The state to do callbacks for
* @bringup: True if the bringup callback should be invoked
+ * @node: For multi-instance, do a single entry callback for install/remove
+ * @lastp: For multi-instance rollback, remember how far we got
*
* Called from cpu hotplug and from the state register machinery.
*/
static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
- bool bringup, struct hlist_node *node)
+ bool bringup, struct hlist_node *node,
+ struct hlist_node **lastp)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
struct cpuhp_step *step = cpuhp_get_step(state);
@@ -137,7 +164,17 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
int (*cb)(unsigned int cpu);
int ret, cnt;
+ if (st->fail == state) {
+ st->fail = CPUHP_INVALID;
+
+ if (!(bringup ? step->startup.single : step->teardown.single))
+ return 0;
+
+ return -EAGAIN;
+ }
+
if (!step->multi_instance) {
+ WARN_ON_ONCE(lastp && *lastp);
cb = bringup ? step->startup.single : step->teardown.single;
if (!cb)
return 0;
@@ -152,6 +189,7 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
/* Single invocation for instance add/remove */
if (node) {
+ WARN_ON_ONCE(lastp && *lastp);
trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
ret = cbm(cpu, node);
trace_cpuhp_exit(cpu, st->state, state, ret);
@@ -161,13 +199,23 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
/* State transition. Invoke on all instances */
cnt = 0;
hlist_for_each(node, &step->list) {
+ if (lastp && node == *lastp)
+ break;
+
trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
ret = cbm(cpu, node);
trace_cpuhp_exit(cpu, st->state, state, ret);
- if (ret)
- goto err;
+ if (ret) {
+ if (!lastp)
+ goto err;
+
+ *lastp = node;
+ return ret;
+ }
cnt++;
}
+ if (lastp)
+ *lastp = NULL;
return 0;
err:
/* Rollback the instances if one failed */
@@ -178,12 +226,39 @@ err:
hlist_for_each(node, &step->list) {
if (!cnt--)
break;
- cbm(cpu, node);
+
+ trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
+ ret = cbm(cpu, node);
+ trace_cpuhp_exit(cpu, st->state, state, ret);
+ /*
+ * Rollback must not fail,
+ */
+ WARN_ON_ONCE(ret);
}
return ret;
}
#ifdef CONFIG_SMP
+static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
+{
+ struct completion *done = bringup ? &st->done_up : &st->done_down;
+ wait_for_completion(done);
+}
+
+static inline void complete_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
+{
+ struct completion *done = bringup ? &st->done_up : &st->done_down;
+ complete(done);
+}
+
+/*
+ * The former STARTING/DYING states, ran with IRQs disabled and must not fail.
+ */
+static bool cpuhp_is_atomic_state(enum cpuhp_state state)
+{
+ return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE;
+}
+
/* Serializes the updates to cpu_online_mask, cpu_present_mask */
static DEFINE_MUTEX(cpu_add_remove_lock);
bool cpuhp_tasks_frozen;
@@ -271,14 +346,79 @@ void cpu_hotplug_enable(void)
EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
#endif /* CONFIG_HOTPLUG_CPU */
-static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st);
+static inline enum cpuhp_state
+cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
+{
+ enum cpuhp_state prev_state = st->state;
+
+ st->rollback = false;
+ st->last = NULL;
+
+ st->target = target;
+ st->single = false;
+ st->bringup = st->state < target;
+
+ return prev_state;
+}
+
+static inline void
+cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
+{
+ st->rollback = true;
+
+ /*
+ * If we have st->last we need to undo partial multi_instance of this
+ * state first. Otherwise start undo at the previous state.
+ */
+ if (!st->last) {
+ if (st->bringup)
+ st->state--;
+ else
+ st->state++;
+ }
+
+ st->target = prev_state;
+ st->bringup = !st->bringup;
+}
+
+/* Regular hotplug invocation of the AP hotplug thread */
+static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st)
+{
+ if (!st->single && st->state == st->target)
+ return;
+
+ st->result = 0;
+ /*
+ * Make sure the above stores are visible before should_run becomes
+ * true. Paired with the mb() above in cpuhp_thread_fun()
+ */
+ smp_mb();
+ st->should_run = true;
+ wake_up_process(st->thread);
+ wait_for_ap_thread(st, st->bringup);
+}
+
+static int cpuhp_kick_ap(struct cpuhp_cpu_state *st, enum cpuhp_state target)
+{
+ enum cpuhp_state prev_state;
+ int ret;
+
+ prev_state = cpuhp_set_state(st, target);
+ __cpuhp_kick_ap(st);
+ if ((ret = st->result)) {
+ cpuhp_reset_state(st, prev_state);
+ __cpuhp_kick_ap(st);
+ }
+
+ return ret;
+}
static int bringup_wait_for_ap(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
/* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */
- wait_for_completion(&st->done);
+ wait_for_ap_thread(st, true);
if (WARN_ON_ONCE((!cpu_online(cpu))))
return -ECANCELED;
@@ -286,12 +426,10 @@ static int bringup_wait_for_ap(unsigned int cpu)
stop_machine_unpark(cpu);
kthread_unpark(st->thread);
- /* Should we go further up ? */
- if (st->target > CPUHP_AP_ONLINE_IDLE) {
- __cpuhp_kick_ap_work(st);
- wait_for_completion(&st->done);
- }
- return st->result;
+ if (st->target <= CPUHP_AP_ONLINE_IDLE)
+ return 0;
+
+ return cpuhp_kick_ap(st, st->target);
}
static int bringup_cpu(unsigned int cpu)
@@ -317,32 +455,6 @@ static int bringup_cpu(unsigned int cpu)
/*
* Hotplug state machine related functions
*/
-static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st)
-{
- for (st->state++; st->state < st->target; st->state++) {
- struct cpuhp_step *step = cpuhp_get_step(st->state);
-
- if (!step->skip_onerr)
- cpuhp_invoke_callback(cpu, st->state, true, NULL);
- }
-}
-
-static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
- enum cpuhp_state target)
-{
- enum cpuhp_state prev_state = st->state;
- int ret = 0;
-
- for (; st->state > target; st->state--) {
- ret = cpuhp_invoke_callback(cpu, st->state, false, NULL);
- if (ret) {
- st->target = prev_state;
- undo_cpu_down(cpu, st);
- break;
- }
- }
- return ret;
-}
static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
{
@@ -350,7 +462,7 @@ static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
struct cpuhp_step *step = cpuhp_get_step(st->state);
if (!step->skip_onerr)
- cpuhp_invoke_callback(cpu, st->state, false, NULL);
+ cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
}
}
@@ -362,7 +474,7 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
while (st->state < target) {
st->state++;
- ret = cpuhp_invoke_callback(cpu, st->state, true, NULL);
+ ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
if (ret) {
st->target = prev_state;
undo_cpu_up(cpu, st);
@@ -379,7 +491,8 @@ static void cpuhp_create(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
- init_completion(&st->done);
+ init_completion(&st->done_up);
+ init_completion(&st->done_down);
}
static int cpuhp_should_run(unsigned int cpu)
@@ -389,69 +502,90 @@ static int cpuhp_should_run(unsigned int cpu)
return st->should_run;
}
-/* Execute the teardown callbacks. Used to be CPU_DOWN_PREPARE */
-static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st)
-{
- enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU);
-
- return cpuhp_down_callbacks(cpu, st, target);
-}
-
-/* Execute the online startup callbacks. Used to be CPU_ONLINE */
-static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st)
-{
- return cpuhp_up_callbacks(cpu, st, st->target);
-}
-
/*
* Execute teardown/startup callbacks on the plugged cpu. Also used to invoke
* callbacks when a state gets [un]installed at runtime.
+ *
+ * Each invocation of this function by the smpboot thread does a single AP
+ * state callback.
+ *
+ * It has 3 modes of operation:
+ * - single: runs st->cb_state
+ * - up: runs ++st->state, while st->state < st->target
+ * - down: runs st->state--, while st->state > st->target
+ *
+ * When complete or on error, should_run is cleared and the completion is fired.
*/
static void cpuhp_thread_fun(unsigned int cpu)
{
struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
- int ret = 0;
+ bool bringup = st->bringup;
+ enum cpuhp_state state;
/*
- * Paired with the mb() in cpuhp_kick_ap_work and
- * cpuhp_invoke_ap_callback, so the work set is consistent visible.
+ * ACQUIRE for the cpuhp_should_run() load of ->should_run. Ensures
+ * that if we see ->should_run we also see the rest of the state.
*/
smp_mb();
- if (!st->should_run)
+
+ if (WARN_ON_ONCE(!st->should_run))
return;
- st->should_run = false;
+ cpuhp_lock_acquire(bringup);
- lock_map_acquire(&cpuhp_state_lock_map);
- /* Single callback invocation for [un]install ? */
if (st->single) {
- if (st->cb_state < CPUHP_AP_ONLINE) {
- local_irq_disable();
- ret = cpuhp_invoke_callback(cpu, st->cb_state,
- st->bringup, st->node);
- local_irq_enable();
+ state = st->cb_state;
+ st->should_run = false;
+ } else {
+ if (bringup) {
+ st->state++;
+ state = st->state;
+ st->should_run = (st->state < st->target);
+ WARN_ON_ONCE(st->state > st->target);
} else {
- ret = cpuhp_invoke_callback(cpu, st->cb_state,
- st->bringup, st->node);
+ state = st->state;
+ st->state--;
+ st->should_run = (st->state > st->target);
+ WARN_ON_ONCE(st->state < st->target);
}
- } else if (st->rollback) {
- BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE);
+ }
+
+ WARN_ON_ONCE(!cpuhp_is_ap_state(state));
- undo_cpu_down(cpu, st);
- st->rollback = false;
+ if (st->rollback) {
+ struct cpuhp_step *step = cpuhp_get_step(state);
+ if (step->skip_onerr)
+ goto next;
+ }
+
+ if (cpuhp_is_atomic_state(state)) {
+ local_irq_disable();
+ st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
+ local_irq_enable();
+
+ /*
+ * STARTING/DYING must not fail!
+ */
+ WARN_ON_ONCE(st->result);
} else {
- /* Cannot happen .... */
- BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE);
-
- /* Regular hotplug work */
- if (st->state < st->target)
- ret = cpuhp_ap_online(cpu, st);
- else if (st->state > st->target)
- ret = cpuhp_ap_offline(cpu, st);
+ st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
+ }
+
+ if (st->result) {
+ /*
+ * If we fail on a rollback, we're up a creek without no
+ * paddle, no way forward, no way back. We loose, thanks for
+ * playing.
+ */
+ WARN_ON_ONCE(st->rollback);
+ st->should_run = false;
}
- lock_map_release(&cpuhp_state_lock_map);
- st->result = ret;
- complete(&st->done);
+
+next:
+ cpuhp_lock_release(bringup);
+
+ if (!st->should_run)
+ complete_ap_thread(st, bringup);
}
/* Invoke a single callback on a remote cpu */
@@ -460,62 +594,69 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup,
struct hlist_node *node)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+ int ret;
if (!cpu_online(cpu))
return 0;
- lock_map_acquire(&cpuhp_state_lock_map);
- lock_map_release(&cpuhp_state_lock_map);
+ cpuhp_lock_acquire(false);
+ cpuhp_lock_release(false);
+
+ cpuhp_lock_acquire(true);
+ cpuhp_lock_release(true);
/*
* If we are up and running, use the hotplug thread. For early calls
* we invoke the thread function directly.
*/
if (!st->thread)
- return cpuhp_invoke_callback(cpu, state, bringup, node);
+ return cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
+ st->rollback = false;
+ st->last = NULL;
+
+ st->node = node;
+ st->bringup = bringup;
st->cb_state = state;
st->single = true;
- st->bringup = bringup;
- st->node = node;
+
+ __cpuhp_kick_ap(st);
/*
- * Make sure the above stores are visible before should_run becomes
- * true. Paired with the mb() above in cpuhp_thread_fun()
+ * If we failed and did a partial, do a rollback.
*/
- smp_mb();
- st->should_run = true;
- wake_up_process(st->thread);
- wait_for_completion(&st->done);
- return st->result;
-}
+ if ((ret = st->result) && st->last) {
+ st->rollback = true;
+ st->bringup = !bringup;
+
+ __cpuhp_kick_ap(st);
+ }
-/* Regular hotplug invocation of the AP hotplug thread */
-static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st)
-{
- st->result = 0;
- st->single = false;
/*
- * Make sure the above stores are visible before should_run becomes
- * true. Paired with the mb() above in cpuhp_thread_fun()
+ * Clean up the leftovers so the next hotplug operation wont use stale
+ * data.
*/
- smp_mb();
- st->should_run = true;
- wake_up_process(st->thread);
+ st->node = st->last = NULL;
+ return ret;
}
static int cpuhp_kick_ap_work(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
- enum cpuhp_state state = st->state;
+ enum cpuhp_state prev_state = st->state;
+ int ret;
+
+ cpuhp_lock_acquire(false);
+ cpuhp_lock_release(false);
+
+ cpuhp_lock_acquire(true);
+ cpuhp_lock_release(true);
- trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work);
- lock_map_acquire(&cpuhp_state_lock_map);
- lock_map_release(&cpuhp_state_lock_map);
- __cpuhp_kick_ap_work(st);
- wait_for_completion(&st->done);
- trace_cpuhp_exit(cpu, st->state, state, st->result);
- return st->result;
+ trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work);
+ ret = cpuhp_kick_ap(st, st->target);
+ trace_cpuhp_exit(cpu, st->state, prev_state, ret);
+
+ return ret;
}
static struct smp_hotplug_thread cpuhp_threads = {
@@ -581,6 +722,7 @@ static int take_cpu_down(void *_param)
struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE);
int err, cpu = smp_processor_id();
+ int ret;
/* Ensure this CPU doesn't handle any more interrupts. */
err = __cpu_disable();
@@ -594,8 +736,13 @@ static int take_cpu_down(void *_param)
WARN_ON(st->state != CPUHP_TEARDOWN_CPU);
st->state--;
/* Invoke the former CPU_DYING callbacks */
- for (; st->state > target; st->state--)
- cpuhp_invoke_callback(cpu, st->state, false, NULL);
+ for (; st->state > target; st->state--) {
+ ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
+ /*
+ * DYING must not fail!
+ */
+ WARN_ON_ONCE(ret);
+ }
/* Give up timekeeping duties */
tick_handover_do_timer();
@@ -639,7 +786,7 @@ static int takedown_cpu(unsigned int cpu)
*
* Wait for the stop thread to go away.
*/
- wait_for_completion(&st->done);
+ wait_for_ap_thread(st, false);
BUG_ON(st->state != CPUHP_AP_IDLE_DEAD);
/* Interrupts are moved away from the dying cpu, reenable alloc/free */
@@ -658,7 +805,7 @@ static void cpuhp_complete_idle_dead(void *arg)
{
struct cpuhp_cpu_state *st = arg;
- complete(&st->done);
+ complete_ap_thread(st, false);
}
void cpuhp_report_idle_dead(void)
@@ -676,11 +823,32 @@ void cpuhp_report_idle_dead(void)
cpuhp_complete_idle_dead, st, 0);
}
-#else
-#define takedown_cpu NULL
-#endif
+static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st)
+{
+ for (st->state++; st->state < st->target; st->state++) {
+ struct cpuhp_step *step = cpuhp_get_step(st->state);
-#ifdef CONFIG_HOTPLUG_CPU
+ if (!step->skip_onerr)
+ cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
+ }
+}
+
+static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
+ enum cpuhp_state target)
+{
+ enum cpuhp_state prev_state = st->state;
+ int ret = 0;
+
+ for (; st->state > target; st->state--) {
+ ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
+ if (ret) {
+ st->target = prev_state;
+ undo_cpu_down(cpu, st);
+ break;
+ }
+ }
+ return ret;
+}
/* Requires cpu_add_remove_lock to be held */
static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
@@ -699,13 +867,13 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
cpuhp_tasks_frozen = tasks_frozen;
- prev_state = st->state;
- st->target = target;
+ prev_state = cpuhp_set_state(st, target);
/*
* If the current CPU state is in the range of the AP hotplug thread,
* then we need to kick the thread.
*/
if (st->state > CPUHP_TEARDOWN_CPU) {
+ st->target = max((int)target, CPUHP_TEARDOWN_CPU);
ret = cpuhp_kick_ap_work(cpu);
/*
* The AP side has done the error rollback already. Just
@@ -720,6 +888,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
*/
if (st->state > CPUHP_TEARDOWN_CPU)
goto out;
+
+ st->target = target;
}
/*
* The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need
@@ -727,13 +897,17 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
*/
ret = cpuhp_down_callbacks(cpu, st, target);
if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) {
- st->target = prev_state;
- st->rollback = true;
- cpuhp_kick_ap_work(cpu);
+ cpuhp_reset_state(st, prev_state);
+ __cpuhp_kick_ap(st);
}
out:
cpus_write_unlock();
+ /*
+ * Do post unplug cleanup. This is still protected against
+ * concurrent CPU hotplug via cpu_add_remove_lock.
+ */
+ lockup_detector_cleanup();
return ret;
}
@@ -754,11 +928,15 @@ out:
cpu_maps_update_done();
return err;
}
+
int cpu_down(unsigned int cpu)
{
return do_cpu_down(cpu, CPUHP_OFFLINE);
}
EXPORT_SYMBOL(cpu_down);
+
+#else
+#define takedown_cpu NULL
#endif /*CONFIG_HOTPLUG_CPU*/
/**
@@ -772,11 +950,16 @@ void notify_cpu_starting(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);
+ int ret;
rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */
while (st->state < target) {
st->state++;
- cpuhp_invoke_callback(cpu, st->state, true, NULL);
+ ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
+ /*
+ * STARTING must not fail!
+ */
+ WARN_ON_ONCE(ret);
}
}
@@ -794,7 +977,7 @@ void cpuhp_online_idle(enum cpuhp_state state)
return;
st->state = CPUHP_AP_ONLINE_IDLE;
- complete(&st->done);
+ complete_ap_thread(st, true);
}
/* Requires cpu_add_remove_lock to be held */
@@ -829,7 +1012,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
cpuhp_tasks_frozen = tasks_frozen;
- st->target = target;
+ cpuhp_set_state(st, target);
/*
* If the current CPU state is in the range of the AP hotplug thread,
* then we need to kick the thread once more.
@@ -1296,6 +1479,10 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup,
struct cpuhp_step *sp = cpuhp_get_step(state);
int ret;
+ /*
+ * If there's nothing to do, we done.
+ * Relies on the union for multi_instance.
+ */
if ((bringup && !sp->startup.single) ||
(!bringup && !sp->teardown.single))
return 0;
@@ -1307,9 +1494,9 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup,
if (cpuhp_is_ap_state(state))
ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node);
else
- ret = cpuhp_invoke_callback(cpu, state, bringup, node);
+ ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
#else
- ret = cpuhp_invoke_callback(cpu, state, bringup, node);
+ ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
#endif
BUG_ON(ret && !bringup);
return ret;
@@ -1641,9 +1828,55 @@ static ssize_t show_cpuhp_target(struct device *dev,
}
static DEVICE_ATTR(target, 0644, show_cpuhp_target, write_cpuhp_target);
+
+static ssize_t write_cpuhp_fail(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
+ struct cpuhp_step *sp;
+ int fail, ret;
+
+ ret = kstrtoint(buf, 10, &fail);
+ if (ret)
+ return ret;
+
+ /*
+ * Cannot fail STARTING/DYING callbacks.
+ */
+ if (cpuhp_is_atomic_state(fail))
+ return -EINVAL;
+
+ /*
+ * Cannot fail anything that doesn't have callbacks.
+ */
+ mutex_lock(&cpuhp_state_mutex);
+ sp = cpuhp_get_step(fail);
+ if (!sp->startup.single && !sp->teardown.single)
+ ret = -EINVAL;
+ mutex_unlock(&cpuhp_state_mutex);
+ if (ret)
+ return ret;
+
+ st->fail = fail;
+
+ return count;
+}
+
+static ssize_t show_cpuhp_fail(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
+
+ return sprintf(buf, "%d\n", st->fail);
+}
+
+static DEVICE_ATTR(fail, 0644, show_cpuhp_fail, write_cpuhp_fail);
+
static struct attribute *cpuhp_cpu_attrs[] = {
&dev_attr_state.attr,
&dev_attr_target.attr,
+ &dev_attr_fail.attr,
NULL
};
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 6db80fc0810b..b3663896278e 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -108,7 +108,8 @@ static int __init parse_crashkernel_mem(char *cmdline,
return -EINVAL;
}
}
- }
+ } else
+ pr_info("crashkernel size resulted in zero bytes\n");
return 0;
}
diff --git a/kernel/dma.c b/kernel/dma.c
index 6c6262f86c17..3506fc34a712 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/kernel/dma.c: A DMA channel allocator. Inspired by linux/kernel/irq.c.
*
diff --git a/kernel/elfcore.c b/kernel/elfcore.c
index e556751d15d9..fc482c8e0bd8 100644
--- a/kernel/elfcore.c
+++ b/kernel/elfcore.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/elf.h>
#include <linux/fs.h>
#include <linux/mm.h>
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 2925188f50ea..3c022e33c109 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE)
endif
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3e691b75b2db..16beab4767e1 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -209,7 +209,7 @@ static int event_function(void *info)
struct perf_event_context *task_ctx = cpuctx->task_ctx;
int ret = 0;
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
perf_ctx_lock(cpuctx, task_ctx);
/*
@@ -306,7 +306,7 @@ static void event_function_local(struct perf_event *event, event_f func, void *d
struct task_struct *task = READ_ONCE(ctx->task);
struct perf_event_context *task_ctx = NULL;
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
if (task) {
if (task == TASK_TOMBSTONE)
@@ -582,6 +582,88 @@ static inline u64 perf_event_clock(struct perf_event *event)
return event->clock();
}
+/*
+ * State based event timekeeping...
+ *
+ * The basic idea is to use event->state to determine which (if any) time
+ * fields to increment with the current delta. This means we only need to
+ * update timestamps when we change state or when they are explicitly requested
+ * (read).
+ *
+ * Event groups make things a little more complicated, but not terribly so. The
+ * rules for a group are that if the group leader is OFF the entire group is
+ * OFF, irrespecive of what the group member states are. This results in
+ * __perf_effective_state().
+ *
+ * A futher ramification is that when a group leader flips between OFF and
+ * !OFF, we need to update all group member times.
+ *
+ *
+ * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we
+ * need to make sure the relevant context time is updated before we try and
+ * update our timestamps.
+ */
+
+static __always_inline enum perf_event_state
+__perf_effective_state(struct perf_event *event)
+{
+ struct perf_event *leader = event->group_leader;
+
+ if (leader->state <= PERF_EVENT_STATE_OFF)
+ return leader->state;
+
+ return event->state;
+}
+
+static __always_inline void
+__perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
+{
+ enum perf_event_state state = __perf_effective_state(event);
+ u64 delta = now - event->tstamp;
+
+ *enabled = event->total_time_enabled;
+ if (state >= PERF_EVENT_STATE_INACTIVE)
+ *enabled += delta;
+
+ *running = event->total_time_running;
+ if (state >= PERF_EVENT_STATE_ACTIVE)
+ *running += delta;
+}
+
+static void perf_event_update_time(struct perf_event *event)
+{
+ u64 now = perf_event_time(event);
+
+ __perf_update_times(event, now, &event->total_time_enabled,
+ &event->total_time_running);
+ event->tstamp = now;
+}
+
+static void perf_event_update_sibling_time(struct perf_event *leader)
+{
+ struct perf_event *sibling;
+
+ list_for_each_entry(sibling, &leader->sibling_list, group_entry)
+ perf_event_update_time(sibling);
+}
+
+static void
+perf_event_set_state(struct perf_event *event, enum perf_event_state state)
+{
+ if (event->state == state)
+ return;
+
+ perf_event_update_time(event);
+ /*
+ * If a group leader gets enabled/disabled all its siblings
+ * are affected too.
+ */
+ if ((event->state < 0) ^ (state < 0))
+ perf_event_update_sibling_time(event);
+
+ WRITE_ONCE(event->state, state);
+}
+
#ifdef CONFIG_CGROUP_PERF
static inline bool
@@ -662,7 +744,7 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
/*
* Do not update time when cgroup is not active
*/
- if (cgrp == event->cgrp)
+ if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
__update_cgrp_time(event->cgrp);
}
@@ -841,40 +923,6 @@ perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
event->shadow_ctx_time = now - t->timestamp;
}
-static inline void
-perf_cgroup_defer_enabled(struct perf_event *event)
-{
- /*
- * when the current task's perf cgroup does not match
- * the event's, we need to remember to call the
- * perf_mark_enable() function the first time a task with
- * a matching perf cgroup is scheduled in.
- */
- if (is_cgroup_event(event) && !perf_cgroup_match(event))
- event->cgrp_defer_enabled = 1;
-}
-
-static inline void
-perf_cgroup_mark_enabled(struct perf_event *event,
- struct perf_event_context *ctx)
-{
- struct perf_event *sub;
- u64 tstamp = perf_event_time(event);
-
- if (!event->cgrp_defer_enabled)
- return;
-
- event->cgrp_defer_enabled = 0;
-
- event->tstamp_enabled = tstamp - event->total_time_enabled;
- list_for_each_entry(sub, &event->sibling_list, group_entry) {
- if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
- sub->tstamp_enabled = tstamp - sub->total_time_enabled;
- sub->cgrp_defer_enabled = 0;
- }
- }
-}
-
/*
* Update cpuctx->cgrp so that it is set when first cgroup event is added and
* cleared when last cgroup event is removed.
@@ -901,9 +949,11 @@ list_update_cgroup_event(struct perf_event *event,
cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
/* cpuctx->cgrp is NULL unless a cgroup event is active in this CPU .*/
if (add) {
+ struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
+
list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
- if (perf_cgroup_from_task(current, ctx) == event->cgrp)
- cpuctx->cgrp = event->cgrp;
+ if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
+ cpuctx->cgrp = cgrp;
} else {
list_del(cpuctx_entry);
cpuctx->cgrp = NULL;
@@ -973,17 +1023,6 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event)
}
static inline void
-perf_cgroup_defer_enabled(struct perf_event *event)
-{
-}
-
-static inline void
-perf_cgroup_mark_enabled(struct perf_event *event,
- struct perf_event_context *ctx)
-{
-}
-
-static inline void
list_update_cgroup_event(struct perf_event *event,
struct perf_event_context *ctx, bool add)
{
@@ -1004,7 +1043,7 @@ static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
struct perf_cpu_context *cpuctx;
int rotations = 0;
- WARN_ON(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
rotations = perf_rotate_context(cpuctx);
@@ -1091,7 +1130,7 @@ static void perf_event_ctx_activate(struct perf_event_context *ctx)
{
struct list_head *head = this_cpu_ptr(&active_ctx_list);
- WARN_ON(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
WARN_ON(!list_empty(&ctx->active_ctx_list));
@@ -1100,7 +1139,7 @@ static void perf_event_ctx_activate(struct perf_event_context *ctx)
static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
{
- WARN_ON(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
WARN_ON(list_empty(&ctx->active_ctx_list));
@@ -1200,7 +1239,7 @@ perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
again:
rcu_read_lock();
- ctx = ACCESS_ONCE(event->ctx);
+ ctx = READ_ONCE(event->ctx);
if (!atomic_inc_not_zero(&ctx->refcount)) {
rcu_read_unlock();
goto again;
@@ -1396,60 +1435,6 @@ static u64 perf_event_time(struct perf_event *event)
return ctx ? ctx->time : 0;
}
-/*
- * Update the total_time_enabled and total_time_running fields for a event.
- */
-static void update_event_times(struct perf_event *event)
-{
- struct perf_event_context *ctx = event->ctx;
- u64 run_end;
-
- lockdep_assert_held(&ctx->lock);
-
- if (event->state < PERF_EVENT_STATE_INACTIVE ||
- event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
- return;
-
- /*
- * in cgroup mode, time_enabled represents
- * the time the event was enabled AND active
- * tasks were in the monitored cgroup. This is
- * independent of the activity of the context as
- * there may be a mix of cgroup and non-cgroup events.
- *
- * That is why we treat cgroup events differently
- * here.
- */
- if (is_cgroup_event(event))
- run_end = perf_cgroup_event_time(event);
- else if (ctx->is_active)
- run_end = ctx->time;
- else
- run_end = event->tstamp_stopped;
-
- event->total_time_enabled = run_end - event->tstamp_enabled;
-
- if (event->state == PERF_EVENT_STATE_INACTIVE)
- run_end = event->tstamp_stopped;
- else
- run_end = perf_event_time(event);
-
- event->total_time_running = run_end - event->tstamp_running;
-
-}
-
-/*
- * Update total_time_enabled and total_time_running for all events in a group.
- */
-static void update_group_times(struct perf_event *leader)
-{
- struct perf_event *event;
-
- update_event_times(leader);
- list_for_each_entry(event, &leader->sibling_list, group_entry)
- update_event_times(event);
-}
-
static enum event_type_t get_event_type(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
@@ -1492,6 +1477,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
event->attach_state |= PERF_ATTACH_CONTEXT;
+ event->tstamp = perf_event_time(event);
+
/*
* If we're a stand alone event or group leader, we go to the context
* list, group events are kept attached to the group so that
@@ -1699,8 +1686,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
if (event->group_leader == event)
list_del_init(&event->group_entry);
- update_group_times(event);
-
/*
* If event was in error state, then keep it
* that way, otherwise bogus counts will be
@@ -1709,7 +1694,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
* of the event
*/
if (event->state > PERF_EVENT_STATE_OFF)
- event->state = PERF_EVENT_STATE_OFF;
+ perf_event_set_state(event, PERF_EVENT_STATE_OFF);
ctx->generation++;
}
@@ -1808,38 +1793,24 @@ event_sched_out(struct perf_event *event,
struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
- u64 tstamp = perf_event_time(event);
- u64 delta;
+ enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
WARN_ON_ONCE(event->ctx != ctx);
lockdep_assert_held(&ctx->lock);
- /*
- * An event which could not be activated because of
- * filter mismatch still needs to have its timings
- * maintained, otherwise bogus information is return
- * via read() for time_enabled, time_running:
- */
- if (event->state == PERF_EVENT_STATE_INACTIVE &&
- !event_filter_match(event)) {
- delta = tstamp - event->tstamp_stopped;
- event->tstamp_running += delta;
- event->tstamp_stopped = tstamp;
- }
-
if (event->state != PERF_EVENT_STATE_ACTIVE)
return;
perf_pmu_disable(event->pmu);
- event->tstamp_stopped = tstamp;
event->pmu->del(event, 0);
event->oncpu = -1;
- event->state = PERF_EVENT_STATE_INACTIVE;
+
if (event->pending_disable) {
event->pending_disable = 0;
- event->state = PERF_EVENT_STATE_OFF;
+ state = PERF_EVENT_STATE_OFF;
}
+ perf_event_set_state(event, state);
if (!is_software_event(event))
cpuctx->active_oncpu--;
@@ -1859,7 +1830,9 @@ group_sched_out(struct perf_event *group_event,
struct perf_event_context *ctx)
{
struct perf_event *event;
- int state = group_event->state;
+
+ if (group_event->state != PERF_EVENT_STATE_ACTIVE)
+ return;
perf_pmu_disable(ctx->pmu);
@@ -1873,7 +1846,7 @@ group_sched_out(struct perf_event *group_event,
perf_pmu_enable(ctx->pmu);
- if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
+ if (group_event->attr.exclusive)
cpuctx->exclusive = 0;
}
@@ -1893,6 +1866,11 @@ __perf_remove_from_context(struct perf_event *event,
{
unsigned long flags = (unsigned long)info;
+ if (ctx->is_active & EVENT_TIME) {
+ update_context_time(ctx);
+ update_cgrp_time_from_cpuctx(cpuctx);
+ }
+
event_sched_out(event, cpuctx, ctx);
if (flags & DETACH_GROUP)
perf_group_detach(event);
@@ -1955,14 +1933,17 @@ static void __perf_event_disable(struct perf_event *event,
if (event->state < PERF_EVENT_STATE_INACTIVE)
return;
- update_context_time(ctx);
- update_cgrp_time_from_event(event);
- update_group_times(event);
+ if (ctx->is_active & EVENT_TIME) {
+ update_context_time(ctx);
+ update_cgrp_time_from_event(event);
+ }
+
if (event == event->group_leader)
group_sched_out(event, cpuctx, ctx);
else
event_sched_out(event, cpuctx, ctx);
- event->state = PERF_EVENT_STATE_OFF;
+
+ perf_event_set_state(event, PERF_EVENT_STATE_OFF);
}
/*
@@ -2019,8 +2000,7 @@ void perf_event_disable_inatomic(struct perf_event *event)
}
static void perf_set_shadow_time(struct perf_event *event,
- struct perf_event_context *ctx,
- u64 tstamp)
+ struct perf_event_context *ctx)
{
/*
* use the correct time source for the time snapshot
@@ -2048,9 +2028,9 @@ static void perf_set_shadow_time(struct perf_event *event,
* is cleaner and simpler to understand.
*/
if (is_cgroup_event(event))
- perf_cgroup_set_shadow_time(event, tstamp);
+ perf_cgroup_set_shadow_time(event, event->tstamp);
else
- event->shadow_ctx_time = tstamp - ctx->timestamp;
+ event->shadow_ctx_time = event->tstamp - ctx->timestamp;
}
#define MAX_INTERRUPTS (~0ULL)
@@ -2063,7 +2043,6 @@ event_sched_in(struct perf_event *event,
struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
- u64 tstamp = perf_event_time(event);
int ret = 0;
lockdep_assert_held(&ctx->lock);
@@ -2073,11 +2052,12 @@ event_sched_in(struct perf_event *event,
WRITE_ONCE(event->oncpu, smp_processor_id());
/*
- * Order event::oncpu write to happen before the ACTIVE state
- * is visible.
+ * Order event::oncpu write to happen before the ACTIVE state is
+ * visible. This allows perf_event_{stop,read}() to observe the correct
+ * ->oncpu if it sees ACTIVE.
*/
smp_wmb();
- WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE);
+ perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);
/*
* Unthrottle events, since we scheduled we might have missed several
@@ -2089,26 +2069,19 @@ event_sched_in(struct perf_event *event,
event->hw.interrupts = 0;
}
- /*
- * The new state must be visible before we turn it on in the hardware:
- */
- smp_wmb();
-
perf_pmu_disable(event->pmu);
- perf_set_shadow_time(event, ctx, tstamp);
+ perf_set_shadow_time(event, ctx);
perf_log_itrace_start(event);
if (event->pmu->add(event, PERF_EF_START)) {
- event->state = PERF_EVENT_STATE_INACTIVE;
+ perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
event->oncpu = -1;
ret = -EAGAIN;
goto out;
}
- event->tstamp_running += tstamp - event->tstamp_stopped;
-
if (!is_software_event(event))
cpuctx->active_oncpu++;
if (!ctx->nr_active++)
@@ -2132,8 +2105,6 @@ group_sched_in(struct perf_event *group_event,
{
struct perf_event *event, *partial_group = NULL;
struct pmu *pmu = ctx->pmu;
- u64 now = ctx->time;
- bool simulate = false;
if (group_event->state == PERF_EVENT_STATE_OFF)
return 0;
@@ -2163,27 +2134,13 @@ group_error:
/*
* Groups can be scheduled in as one unit only, so undo any
* partial group before returning:
- * The events up to the failed event are scheduled out normally,
- * tstamp_stopped will be updated.
- *
- * The failed events and the remaining siblings need to have
- * their timings updated as if they had gone thru event_sched_in()
- * and event_sched_out(). This is required to get consistent timings
- * across the group. This also takes care of the case where the group
- * could never be scheduled by ensuring tstamp_stopped is set to mark
- * the time the event was actually stopped, such that time delta
- * calculation in update_event_times() is correct.
+ * The events up to the failed event are scheduled out normally.
*/
list_for_each_entry(event, &group_event->sibling_list, group_entry) {
if (event == partial_group)
- simulate = true;
+ break;
- if (simulate) {
- event->tstamp_running += now - event->tstamp_stopped;
- event->tstamp_stopped = now;
- } else {
- event_sched_out(event, cpuctx, ctx);
- }
+ event_sched_out(event, cpuctx, ctx);
}
event_sched_out(group_event, cpuctx, ctx);
@@ -2225,46 +2182,11 @@ static int group_can_go_on(struct perf_event *event,
return can_add_hw;
}
-/*
- * Complement to update_event_times(). This computes the tstamp_* values to
- * continue 'enabled' state from @now, and effectively discards the time
- * between the prior tstamp_stopped and now (as we were in the OFF state, or
- * just switched (context) time base).
- *
- * This further assumes '@event->state == INACTIVE' (we just came from OFF) and
- * cannot have been scheduled in yet. And going into INACTIVE state means
- * '@event->tstamp_stopped = @now'.
- *
- * Thus given the rules of update_event_times():
- *
- * total_time_enabled = tstamp_stopped - tstamp_enabled
- * total_time_running = tstamp_stopped - tstamp_running
- *
- * We can insert 'tstamp_stopped == now' and reverse them to compute new
- * tstamp_* values.
- */
-static void __perf_event_enable_time(struct perf_event *event, u64 now)
-{
- WARN_ON_ONCE(event->state != PERF_EVENT_STATE_INACTIVE);
-
- event->tstamp_stopped = now;
- event->tstamp_enabled = now - event->total_time_enabled;
- event->tstamp_running = now - event->total_time_running;
-}
-
static void add_event_to_ctx(struct perf_event *event,
struct perf_event_context *ctx)
{
- u64 tstamp = perf_event_time(event);
-
list_add_event(event, ctx);
perf_group_attach(event);
- /*
- * We can be called with event->state == STATE_OFF when we create with
- * .disabled = 1. In that case the IOC_ENABLE will call this function.
- */
- if (event->state == PERF_EVENT_STATE_INACTIVE)
- __perf_event_enable_time(event, tstamp);
}
static void ctx_sched_out(struct perf_event_context *ctx,
@@ -2496,28 +2418,6 @@ again:
}
/*
- * Put a event into inactive state and update time fields.
- * Enabling the leader of a group effectively enables all
- * the group members that aren't explicitly disabled, so we
- * have to update their ->tstamp_enabled also.
- * Note: this works for group members as well as group leaders
- * since the non-leader members' sibling_lists will be empty.
- */
-static void __perf_event_mark_enabled(struct perf_event *event)
-{
- struct perf_event *sub;
- u64 tstamp = perf_event_time(event);
-
- event->state = PERF_EVENT_STATE_INACTIVE;
- __perf_event_enable_time(event, tstamp);
- list_for_each_entry(sub, &event->sibling_list, group_entry) {
- /* XXX should not be > INACTIVE if event isn't */
- if (sub->state >= PERF_EVENT_STATE_INACTIVE)
- __perf_event_enable_time(sub, tstamp);
- }
-}
-
-/*
* Cross CPU call to enable a performance event
*/
static void __perf_event_enable(struct perf_event *event,
@@ -2535,14 +2435,12 @@ static void __perf_event_enable(struct perf_event *event,
if (ctx->is_active)
ctx_sched_out(ctx, cpuctx, EVENT_TIME);
- __perf_event_mark_enabled(event);
+ perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
if (!ctx->is_active)
return;
if (!event_filter_match(event)) {
- if (is_cgroup_event(event))
- perf_cgroup_defer_enabled(event);
ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
return;
}
@@ -2862,18 +2760,10 @@ static void __perf_event_sync_stat(struct perf_event *event,
* we know the event must be on the current CPU, therefore we
* don't need to use it.
*/
- switch (event->state) {
- case PERF_EVENT_STATE_ACTIVE:
+ if (event->state == PERF_EVENT_STATE_ACTIVE)
event->pmu->read(event);
- /* fall-through */
- case PERF_EVENT_STATE_INACTIVE:
- update_event_times(event);
- break;
-
- default:
- break;
- }
+ perf_event_update_time(event);
/*
* In order to keep per-task stats reliable we need to flip the event
@@ -3110,10 +3000,6 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
if (!event_filter_match(event))
continue;
- /* may need to reset tstamp_enabled */
- if (is_cgroup_event(event))
- perf_cgroup_mark_enabled(event, ctx);
-
if (group_can_go_on(event, cpuctx, 1))
group_sched_in(event, cpuctx, ctx);
@@ -3121,10 +3007,8 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
* If this pinned group hasn't been scheduled,
* put it in error state.
*/
- if (event->state == PERF_EVENT_STATE_INACTIVE) {
- update_group_times(event);
- event->state = PERF_EVENT_STATE_ERROR;
- }
+ if (event->state == PERF_EVENT_STATE_INACTIVE)
+ perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
}
}
@@ -3146,10 +3030,6 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
if (!event_filter_match(event))
continue;
- /* may need to reset tstamp_enabled */
- if (is_cgroup_event(event))
- perf_cgroup_mark_enabled(event, ctx);
-
if (group_can_go_on(event, cpuctx, can_add_hw)) {
if (group_sched_in(event, cpuctx, ctx))
can_add_hw = 0;
@@ -3521,7 +3401,7 @@ void perf_event_task_tick(void)
struct perf_event_context *ctx, *tmp;
int throttled;
- WARN_ON(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
__this_cpu_inc(perf_throttled_seq);
throttled = __this_cpu_xchg(perf_throttled_count, 0);
@@ -3541,7 +3421,7 @@ static int event_enable_on_exec(struct perf_event *event,
if (event->state >= PERF_EVENT_STATE_INACTIVE)
return 0;
- __perf_event_mark_enabled(event);
+ perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
return 1;
}
@@ -3635,12 +3515,15 @@ static void __perf_event_read(void *info)
return;
raw_spin_lock(&ctx->lock);
- if (ctx->is_active) {
+ if (ctx->is_active & EVENT_TIME) {
update_context_time(ctx);
update_cgrp_time_from_event(event);
}
- update_event_times(event);
+ perf_event_update_time(event);
+ if (data->group)
+ perf_event_update_sibling_time(event);
+
if (event->state != PERF_EVENT_STATE_ACTIVE)
goto unlock;
@@ -3655,7 +3538,6 @@ static void __perf_event_read(void *info)
pmu->read(event);
list_for_each_entry(sub, &event->sibling_list, group_entry) {
- update_event_times(sub);
if (sub->state == PERF_EVENT_STATE_ACTIVE) {
/*
* Use sibling's PMU rather than @event's since
@@ -3684,7 +3566,8 @@ static inline u64 perf_event_count(struct perf_event *event)
* will not be local and we cannot read them atomically
* - must not have a pmu::count method
*/
-int perf_event_read_local(struct perf_event *event, u64 *value)
+int perf_event_read_local(struct perf_event *event, u64 *value,
+ u64 *enabled, u64 *running)
{
unsigned long flags;
int ret = 0;
@@ -3727,6 +3610,16 @@ int perf_event_read_local(struct perf_event *event, u64 *value)
event->pmu->read(event);
*value = local64_read(&event->count);
+ if (enabled || running) {
+ u64 now = event->shadow_ctx_time + perf_clock();
+ u64 __enabled, __running;
+
+ __perf_update_times(event, now, &__enabled, &__running);
+ if (enabled)
+ *enabled = __enabled;
+ if (running)
+ *running = __running;
+ }
out:
local_irq_restore(flags);
@@ -3735,23 +3628,35 @@ out:
static int perf_event_read(struct perf_event *event, bool group)
{
+ enum perf_event_state state = READ_ONCE(event->state);
int event_cpu, ret = 0;
/*
* If event is enabled and currently active on a CPU, update the
* value in the event structure:
*/
- if (event->state == PERF_EVENT_STATE_ACTIVE) {
- struct perf_read_data data = {
- .event = event,
- .group = group,
- .ret = 0,
- };
+again:
+ if (state == PERF_EVENT_STATE_ACTIVE) {
+ struct perf_read_data data;
+
+ /*
+ * Orders the ->state and ->oncpu loads such that if we see
+ * ACTIVE we must also see the right ->oncpu.
+ *
+ * Matches the smp_wmb() from event_sched_in().
+ */
+ smp_rmb();
event_cpu = READ_ONCE(event->oncpu);
if ((unsigned)event_cpu >= nr_cpu_ids)
return 0;
+ data = (struct perf_read_data){
+ .event = event,
+ .group = group,
+ .ret = 0,
+ };
+
preempt_disable();
event_cpu = __perf_event_read_cpu(event, event_cpu);
@@ -3768,24 +3673,30 @@ static int perf_event_read(struct perf_event *event, bool group)
(void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
preempt_enable();
ret = data.ret;
- } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
+
+ } else if (state == PERF_EVENT_STATE_INACTIVE) {
struct perf_event_context *ctx = event->ctx;
unsigned long flags;
raw_spin_lock_irqsave(&ctx->lock, flags);
+ state = event->state;
+ if (state != PERF_EVENT_STATE_INACTIVE) {
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
+ goto again;
+ }
+
/*
- * may read while context is not active
- * (e.g., thread is blocked), in that case
- * we cannot update context time
+ * May read while context is not active (e.g., thread is
+ * blocked), in that case we cannot update context time
*/
- if (ctx->is_active) {
+ if (ctx->is_active & EVENT_TIME) {
update_context_time(ctx);
update_cgrp_time_from_event(event);
}
+
+ perf_event_update_time(event);
if (group)
- update_group_times(event);
- else
- update_event_times(event);
+ perf_event_update_sibling_time(event);
raw_spin_unlock_irqrestore(&ctx->lock, flags);
}
@@ -4231,7 +4142,7 @@ static void perf_remove_from_owner(struct perf_event *event)
* indeed free this event, otherwise we need to serialize on
* owner->perf_event_mutex.
*/
- owner = lockless_dereference(event->owner);
+ owner = READ_ONCE(event->owner);
if (owner) {
/*
* Since delayed_put_task_struct() also drops the last
@@ -4328,7 +4239,7 @@ again:
* Cannot change, child events are not migrated, see the
* comment with perf_event_ctx_lock_nested().
*/
- ctx = lockless_dereference(child->ctx);
+ ctx = READ_ONCE(child->ctx);
/*
* Since child_mutex nests inside ctx::mutex, we must jump
* through hoops. We start by grabbing a reference on the ctx.
@@ -4388,7 +4299,7 @@ static int perf_release(struct inode *inode, struct file *file)
return 0;
}
-u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
+static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
{
struct perf_event *child;
u64 total = 0;
@@ -4416,6 +4327,18 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
return total;
}
+
+u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
+{
+ struct perf_event_context *ctx;
+ u64 count;
+
+ ctx = perf_event_ctx_lock(event);
+ count = __perf_event_read_value(event, enabled, running);
+ perf_event_ctx_unlock(event, ctx);
+
+ return count;
+}
EXPORT_SYMBOL_GPL(perf_event_read_value);
static int __perf_read_group_add(struct perf_event *leader,
@@ -4431,6 +4354,8 @@ static int __perf_read_group_add(struct perf_event *leader,
if (ret)
return ret;
+ raw_spin_lock_irqsave(&ctx->lock, flags);
+
/*
* Since we co-schedule groups, {enabled,running} times of siblings
* will be identical to those of the leader, so we only publish one
@@ -4453,8 +4378,6 @@ static int __perf_read_group_add(struct perf_event *leader,
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
- raw_spin_lock_irqsave(&ctx->lock, flags);
-
list_for_each_entry(sub, &leader->sibling_list, group_entry) {
values[n++] += perf_event_count(sub);
if (read_format & PERF_FORMAT_ID)
@@ -4518,7 +4441,7 @@ static int perf_read_one(struct perf_event *event,
u64 values[4];
int n = 0;
- values[n++] = perf_event_read_value(event, &enabled, &running);
+ values[n++] = __perf_event_read_value(event, &enabled, &running);
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
values[n++] = enabled;
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
@@ -4897,8 +4820,7 @@ static void calc_timer_values(struct perf_event *event,
*now = perf_clock();
ctx_time = event->shadow_ctx_time + *now;
- *enabled = ctx_time - event->tstamp_enabled;
- *running = ctx_time - event->tstamp_running;
+ __perf_update_times(event, ctx_time, enabled, running);
}
static void perf_event_init_userpage(struct perf_event *event)
@@ -5302,8 +5224,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
if (!rb)
goto aux_unlock;
- aux_offset = ACCESS_ONCE(rb->user_page->aux_offset);
- aux_size = ACCESS_ONCE(rb->user_page->aux_size);
+ aux_offset = READ_ONCE(rb->user_page->aux_offset);
+ aux_size = READ_ONCE(rb->user_page->aux_size);
if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
goto aux_unlock;
@@ -6754,6 +6676,7 @@ static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
ns_inode = ns_path.dentry->d_inode;
ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
ns_link_info->ino = ns_inode->i_ino;
+ path_put(&ns_path);
}
}
@@ -7944,25 +7867,24 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
struct pt_regs *regs, struct hlist_head *head,
struct task_struct *task)
{
- struct bpf_prog *prog = call->prog;
-
- if (prog) {
+ if (bpf_prog_array_valid(call)) {
*(struct pt_regs **)raw_data = regs;
- if (!trace_call_bpf(prog, raw_data) || hlist_empty(head)) {
+ if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) {
perf_swevent_put_recursion_context(rctx);
return;
}
}
perf_tp_event(call->event.type, count, raw_data, size, regs, head,
- rctx, task, NULL);
+ rctx, task);
}
EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
struct pt_regs *regs, struct hlist_head *head, int rctx,
- struct task_struct *task, struct perf_event *event)
+ struct task_struct *task)
{
struct perf_sample_data data;
+ struct perf_event *event;
struct perf_raw_record raw = {
.frag = {
@@ -7976,15 +7898,9 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
perf_trace_buf_update(record, event_type);
- /* Use the given event instead of the hlist */
- if (event) {
+ hlist_for_each_entry_rcu(event, head, hlist_entry) {
if (perf_tp_event_match(event, &data, regs))
perf_swevent_event(event, count, &data, regs);
- } else {
- hlist_for_each_entry_rcu(event, head, hlist_entry) {
- if (perf_tp_event_match(event, &data, regs))
- perf_swevent_event(event, count, &data, regs);
- }
}
/*
@@ -8072,6 +7988,7 @@ static void bpf_overflow_handler(struct perf_event *event,
struct bpf_perf_event_data_kern ctx = {
.data = data,
.regs = regs,
+ .event = event,
};
int ret = 0;
@@ -8136,13 +8053,11 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
{
bool is_kprobe, is_tracepoint, is_syscall_tp;
struct bpf_prog *prog;
+ int ret;
if (event->attr.type != PERF_TYPE_TRACEPOINT)
return perf_event_set_bpf_handler(event, prog_fd);
- if (event->tp_event->prog)
- return -EEXIST;
-
is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
is_syscall_tp = is_syscall_trace_event(event->tp_event);
@@ -8170,25 +8085,20 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
return -EACCES;
}
}
- event->tp_event->prog = prog;
- return 0;
+ ret = perf_event_attach_bpf_prog(event, prog);
+ if (ret)
+ bpf_prog_put(prog);
+ return ret;
}
static void perf_event_free_bpf_prog(struct perf_event *event)
{
- struct bpf_prog *prog;
-
- perf_event_free_bpf_handler(event);
-
- if (!event->tp_event)
+ if (event->attr.type != PERF_TYPE_TRACEPOINT) {
+ perf_event_free_bpf_handler(event);
return;
-
- prog = event->tp_event->prog;
- if (prog) {
- event->tp_event->prog = NULL;
- bpf_prog_put(prog);
}
+ perf_event_detach_bpf_prog(event);
}
#else
@@ -8954,6 +8864,14 @@ static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
static void free_pmu_context(struct pmu *pmu)
{
+ /*
+ * Static contexts such as perf_sw_context have a global lifetime
+ * and may be shared between different PMUs. Avoid freeing them
+ * when a single PMU is going away.
+ */
+ if (pmu->task_ctx_nr > perf_invalid_context)
+ return;
+
mutex_lock(&pmus_lock);
free_percpu(pmu->pmu_cpu_context);
mutex_unlock(&pmus_lock);
@@ -9393,6 +9311,11 @@ static void account_event(struct perf_event *event)
inc = true;
if (inc) {
+ /*
+ * We need the mutex here because static_branch_enable()
+ * must complete *before* the perf_sched_count increment
+ * becomes visible.
+ */
if (atomic_inc_not_zero(&perf_sched_count))
goto enabled;
@@ -10518,7 +10441,7 @@ perf_event_exit_event(struct perf_event *child_event,
if (parent_event)
perf_group_detach(child_event);
list_del_event(child_event, child_ctx);
- child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */
+ perf_event_set_state(child_event, PERF_EVENT_STATE_EXIT); /* is_event_hup() */
raw_spin_unlock_irq(&child_ctx->lock);
/*
@@ -10756,7 +10679,7 @@ inherit_event(struct perf_event *parent_event,
struct perf_event *group_leader,
struct perf_event_context *child_ctx)
{
- enum perf_event_active_state parent_state = parent_event->state;
+ enum perf_event_state parent_state = parent_event->state;
struct perf_event *child_event;
unsigned long flags;
@@ -11092,6 +11015,7 @@ static void __perf_event_exit_context(void *__info)
struct perf_event *event;
raw_spin_lock(&ctx->lock);
+ ctx_sched_out(ctx, cpuctx, EVENT_TIME);
list_for_each_entry(event, &ctx->event_list, event_entry)
__perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
raw_spin_unlock(&ctx->lock);
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 843e97047335..09b1537ae06c 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _KERNEL_EVENTS_INTERNAL_H
#define _KERNEL_EVENTS_INTERNAL_H
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index af71a84e12ee..141aa2ca8728 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -381,7 +381,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
* (B) <-> (C) ordering is still observed by the pmu driver.
*/
if (!rb->aux_overwrite) {
- aux_tail = ACCESS_ONCE(rb->user_page->aux_tail);
+ aux_tail = READ_ONCE(rb->user_page->aux_tail);
handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
if (aux_head - aux_tail < perf_aux_size(rb))
handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb));
@@ -411,6 +411,20 @@ err:
return NULL;
}
+EXPORT_SYMBOL_GPL(perf_aux_output_begin);
+
+static bool __always_inline rb_need_aux_wakeup(struct ring_buffer *rb)
+{
+ if (rb->aux_overwrite)
+ return false;
+
+ if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
+ rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
+ return true;
+ }
+
+ return false;
+}
/*
* Commit the data written by hardware into the ring buffer by adjusting
@@ -451,10 +465,8 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
}
rb->user_page->aux_head = rb->aux_head;
- if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
+ if (rb_need_aux_wakeup(rb))
wakeup = true;
- rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
- }
if (wakeup) {
if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)
@@ -469,6 +481,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
rb_free_aux(rb);
ring_buffer_put(rb);
}
+EXPORT_SYMBOL_GPL(perf_aux_output_end);
/*
* Skip over a given number of bytes in the AUX buffer, due to, for example,
@@ -484,9 +497,8 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
rb->aux_head += size;
rb->user_page->aux_head = rb->aux_head;
- if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
+ if (rb_need_aux_wakeup(rb)) {
perf_output_wakeup(handle);
- rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
}
@@ -495,6 +507,7 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
return 0;
}
+EXPORT_SYMBOL_GPL(perf_aux_output_skip);
void *perf_get_aux(struct perf_output_handle *handle)
{
@@ -504,6 +517,7 @@ void *perf_get_aux(struct perf_output_handle *handle)
return handle->rb->aux_priv;
}
+EXPORT_SYMBOL_GPL(perf_get_aux);
#define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 6873bb3e6b7e..0975b0268545 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Handling of different ABIs (personalities).
*
diff --git a/kernel/exit.c b/kernel/exit.c
index 3481ababd06a..6b4298a41167 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1339,7 +1339,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
* Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
* can't confuse the checks below.
*/
- int exit_state = ACCESS_ONCE(p->exit_state);
+ int exit_state = READ_ONCE(p->exit_state);
int ret;
if (unlikely(exit_state == EXIT_DEAD))
@@ -1600,18 +1600,19 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
struct waitid_info info = {.status = 0};
long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
int signo = 0;
+
if (err > 0) {
signo = SIGCHLD;
err = 0;
- }
-
- if (!err) {
if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
return -EFAULT;
}
if (!infop)
return err;
+ if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop)))
+ return -EFAULT;
+
user_access_begin();
unsafe_put_user(signo, &infop->si_signo, Efault);
unsafe_put_user(0, &infop->si_errno, Efault);
@@ -1723,21 +1724,23 @@ COMPAT_SYSCALL_DEFINE5(waitid,
if (err > 0) {
signo = SIGCHLD;
err = 0;
- }
-
- if (!err && uru) {
- /* kernel_waitid() overwrites everything in ru */
- if (COMPAT_USE_64BIT_TIME)
- err = copy_to_user(uru, &ru, sizeof(ru));
- else
- err = put_compat_rusage(&ru, uru);
- if (err)
- return -EFAULT;
+ if (uru) {
+ /* kernel_waitid() overwrites everything in ru */
+ if (COMPAT_USE_64BIT_TIME)
+ err = copy_to_user(uru, &ru, sizeof(ru));
+ else
+ err = put_compat_rusage(&ru, uru);
+ if (err)
+ return -EFAULT;
+ }
}
if (!infop)
return err;
+ if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop)))
+ return -EFAULT;
+
user_access_begin();
unsafe_put_user(signo, &infop->si_signo, Efault);
unsafe_put_user(0, &infop->si_errno, Efault);
diff --git a/kernel/extable.c b/kernel/extable.c
index 38c2412401a1..a17fdb63dc3e 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -31,6 +31,8 @@
* mutex protecting text section modification (dynamic code patching).
* some users need to sleep (allocating memory...) while they hold this lock.
*
+ * Note: Also protects SMP-alternatives modification on x86.
+ *
* NOT exported to modules - patching kernel text is a really delicate matter.
*/
DEFINE_MUTEX(text_mutex);
@@ -102,15 +104,7 @@ int core_kernel_data(unsigned long addr)
int __kernel_text_address(unsigned long addr)
{
- if (core_kernel_text(addr))
- return 1;
- if (is_module_text_address(addr))
- return 1;
- if (is_ftrace_trampoline(addr))
- return 1;
- if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr))
- return 1;
- if (is_bpf_text_address(addr))
+ if (kernel_text_address(addr))
return 1;
/*
* There might be init symbols in saved stacktraces.
@@ -127,17 +121,42 @@ int __kernel_text_address(unsigned long addr)
int kernel_text_address(unsigned long addr)
{
+ bool no_rcu;
+ int ret = 1;
+
if (core_kernel_text(addr))
return 1;
+
+ /*
+ * If a stack dump happens while RCU is not watching, then
+ * RCU needs to be notified that it requires to start
+ * watching again. This can happen either by tracing that
+ * triggers a stack trace, or a WARN() that happens during
+ * coming back from idle, or cpu on or offlining.
+ *
+ * is_module_text_address() as well as the kprobe slots
+ * and is_bpf_text_address() require RCU to be watching.
+ */
+ no_rcu = !rcu_is_watching();
+
+ /* Treat this like an NMI as it can happen anywhere */
+ if (no_rcu)
+ rcu_nmi_enter();
+
if (is_module_text_address(addr))
- return 1;
+ goto out;
if (is_ftrace_trampoline(addr))
- return 1;
+ goto out;
if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr))
- return 1;
+ goto out;
if (is_bpf_text_address(addr))
- return 1;
- return 0;
+ goto out;
+ ret = 0;
+out:
+ if (no_rcu)
+ rcu_nmi_exit();
+
+ return ret;
}
/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 10646182440f..432eadf6b58c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -215,6 +215,10 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
if (!s)
continue;
+#ifdef CONFIG_DEBUG_KMEMLEAK
+ /* Clear stale pointers from reused stack. */
+ memset(s->addr, 0, THREAD_SIZE);
+#endif
tsk->stack_vm_area = s;
return s->addr;
}
@@ -465,7 +469,7 @@ void __init fork_init(void)
/* create a slab on which task_structs can be allocated */
task_struct_cachep = kmem_cache_create("task_struct",
arch_task_struct_size, align,
- SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL);
+ SLAB_PANIC|SLAB_ACCOUNT, NULL);
#endif
/* do the arch specific task caches init */
@@ -813,8 +817,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
init_rwsem(&mm->mmap_sem);
INIT_LIST_HEAD(&mm->mmlist);
mm->core_state = NULL;
- atomic_long_set(&mm->nr_ptes, 0);
- mm_nr_pmds_init(mm);
+ mm_pgtables_bytes_init(mm);
mm->map_count = 0;
mm->locked_vm = 0;
mm->pinned_vm = 0;
@@ -868,12 +871,9 @@ static void check_mm(struct mm_struct *mm)
"mm:%p idx:%d val:%ld\n", mm, i, x);
}
- if (atomic_long_read(&mm->nr_ptes))
- pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n",
- atomic_long_read(&mm->nr_ptes));
- if (mm_nr_pmds(mm))
- pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n",
- mm_nr_pmds(mm));
+ if (mm_pgtables_bytes(mm))
+ pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
+ mm_pgtables_bytes(mm));
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
@@ -946,6 +946,24 @@ void mmput(struct mm_struct *mm)
}
EXPORT_SYMBOL_GPL(mmput);
+#ifdef CONFIG_MMU
+static void mmput_async_fn(struct work_struct *work)
+{
+ struct mm_struct *mm = container_of(work, struct mm_struct,
+ async_put_work);
+
+ __mmput(mm);
+}
+
+void mmput_async(struct mm_struct *mm)
+{
+ if (atomic_dec_and_test(&mm->mm_users)) {
+ INIT_WORK(&mm->async_put_work, mmput_async_fn);
+ schedule_work(&mm->async_put_work);
+ }
+}
+#endif
+
/**
* set_mm_exe_file - change a reference to the mm's executable file
*
@@ -1853,7 +1871,7 @@ static __latent_entropy struct task_struct *copy_process(
retval = -ERESTARTNOINTR;
goto bad_fork_cancel_cgroup;
}
- if (unlikely(!(ns_of_pid(pid)->nr_hashed & PIDNS_HASH_ADDING))) {
+ if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
retval = -ENOMEM;
goto bad_fork_cancel_cgroup;
}
@@ -2187,18 +2205,18 @@ void __init proc_caches_init(void)
sighand_cachep = kmem_cache_create("sighand_cache",
sizeof(struct sighand_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
- SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor);
+ SLAB_ACCOUNT, sighand_ctor);
signal_cachep = kmem_cache_create("signal_cache",
sizeof(struct signal_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
NULL);
files_cachep = kmem_cache_create("files_cache",
sizeof(struct files_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
NULL);
fs_cachep = kmem_cache_create("fs_cache",
sizeof(struct fs_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
NULL);
/*
* FIXME! The "sizeof(struct mm_struct)" currently includes the
@@ -2209,7 +2227,7 @@ void __init proc_caches_init(void)
*/
mm_cachep = kmem_cache_create("mm_struct",
sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
NULL);
vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
mmap_init();
diff --git a/kernel/futex.c b/kernel/futex.c
index 3d38eaf05492..76ed5921117a 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -821,8 +821,6 @@ static void get_pi_state(struct futex_pi_state *pi_state)
/*
* Drops a reference to the pi_state object and frees or caches it
* when the last reference is gone.
- *
- * Must be called with the hb lock held.
*/
static void put_pi_state(struct futex_pi_state *pi_state)
{
@@ -837,16 +835,22 @@ static void put_pi_state(struct futex_pi_state *pi_state)
* and has cleaned up the pi_state already
*/
if (pi_state->owner) {
- raw_spin_lock_irq(&pi_state->owner->pi_lock);
- list_del_init(&pi_state->list);
- raw_spin_unlock_irq(&pi_state->owner->pi_lock);
+ struct task_struct *owner;
- rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ owner = pi_state->owner;
+ if (owner) {
+ raw_spin_lock(&owner->pi_lock);
+ list_del_init(&pi_state->list);
+ raw_spin_unlock(&owner->pi_lock);
+ }
+ rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
}
- if (current->pi_state_cache)
+ if (current->pi_state_cache) {
kfree(pi_state);
- else {
+ } else {
/*
* pi_state->list is already empty.
* clear pi_state->owner.
@@ -899,22 +903,41 @@ void exit_pi_state_list(struct task_struct *curr)
*/
raw_spin_lock_irq(&curr->pi_lock);
while (!list_empty(head)) {
-
next = head->next;
pi_state = list_entry(next, struct futex_pi_state, list);
key = pi_state->key;
hb = hash_futex(&key);
+
+ /*
+ * We can race against put_pi_state() removing itself from the
+ * list (a waiter going away). put_pi_state() will first
+ * decrement the reference count and then modify the list, so
+ * its possible to see the list entry but fail this reference
+ * acquire.
+ *
+ * In that case; drop the locks to let put_pi_state() make
+ * progress and retry the loop.
+ */
+ if (!atomic_inc_not_zero(&pi_state->refcount)) {
+ raw_spin_unlock_irq(&curr->pi_lock);
+ cpu_relax();
+ raw_spin_lock_irq(&curr->pi_lock);
+ continue;
+ }
raw_spin_unlock_irq(&curr->pi_lock);
spin_lock(&hb->lock);
-
- raw_spin_lock_irq(&curr->pi_lock);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ raw_spin_lock(&curr->pi_lock);
/*
* We dropped the pi-lock, so re-check whether this
* task still owns the PI-state:
*/
if (head->next != next) {
+ /* retain curr->pi_lock for the loop invariant */
+ raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
spin_unlock(&hb->lock);
+ put_pi_state(pi_state);
continue;
}
@@ -922,9 +945,9 @@ void exit_pi_state_list(struct task_struct *curr)
WARN_ON(list_empty(&pi_state->list));
list_del_init(&pi_state->list);
pi_state->owner = NULL;
- raw_spin_unlock_irq(&curr->pi_lock);
- get_pi_state(pi_state);
+ raw_spin_unlock(&curr->pi_lock);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(&hb->lock);
rt_mutex_futex_unlock(&pi_state->pi_mutex);
@@ -1208,6 +1231,10 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
WARN_ON(!list_empty(&pi_state->list));
list_add(&pi_state->list, &p->pi_state_list);
+ /*
+ * Assignment without holding pi_state->pi_mutex.wait_lock is safe
+ * because there is no concurrency as the object is not published yet.
+ */
pi_state->owner = p;
raw_spin_unlock_irq(&p->pi_lock);
@@ -1560,8 +1587,16 @@ static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
int oldval, ret;
if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
- if (oparg < 0 || oparg > 31)
- return -EINVAL;
+ if (oparg < 0 || oparg > 31) {
+ char comm[sizeof(current->comm)];
+ /*
+ * kill this print and return -EINVAL when userspace
+ * is sane again
+ */
+ pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n",
+ get_task_comm(comm, current), oparg);
+ oparg &= 31;
+ }
oparg = 1 << oparg;
}
@@ -2878,6 +2913,7 @@ retry:
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(&hb->lock);
+ /* drops pi_state->pi_mutex.wait_lock */
ret = wake_futex_pi(uaddr, uval, pi_state);
put_pi_state(pi_state);
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 3f409968e466..83f830acbb5f 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/kernel/futex_compat.c
*
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index 752d6486b67e..c6c50e5c680e 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
obj-y := base.o fs.o
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index c51a49c9be70..9c7c8d5c18f2 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* This code maintains a list of active profiling data structures.
*
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index edf67c493a8e..6e40ff6be083 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* This code exports profiling data as debugfs files to userspace.
*
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
index 27bc88a35013..1e32e66c9563 100644
--- a/kernel/gcov/gcc_3_4.c
+++ b/kernel/gcov/gcc_3_4.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* This code provides functions to handle gcc's profiling data format
* introduced with gcc 3.4. Future versions of gcc may change the gcov
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index 46a18e72bce6..ca5e5c0ef853 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* This code provides functions to handle gcc's profiling data format
* introduced with gcc 4.7.
diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h
index 92c8e22a29ed..de118ad4a024 100644
--- a/kernel/gcov/gcov.h
+++ b/kernel/gcov/gcov.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Profiling infrastructure declarations.
*
diff --git a/kernel/groups.c b/kernel/groups.c
index 434f6665f187..e357bc800111 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Supplementary group IDs
*/
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index a117adf7084b..89e355866450 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -97,6 +97,12 @@ config HANDLE_DOMAIN_IRQ
config IRQ_TIMINGS
bool
+config GENERIC_IRQ_MATRIX_ALLOCATOR
+ bool
+
+config GENERIC_IRQ_RESERVATION_MODE
+ bool
+
config IRQ_DOMAIN_DEBUG
bool "Expose hardware/virtual IRQ mapping via debugfs"
depends on IRQ_DOMAIN && DEBUG_FS
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 1970cafe8f2a..ff6e352e3a6c 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o
obj-$(CONFIG_IRQ_TIMINGS) += timings.o
@@ -13,3 +14,4 @@ obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o
obj-$(CONFIG_SMP) += affinity.o
obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o
+obj-$(CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR) += matrix.o
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index d69bd77252a7..e12d35108225 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2016 Thomas Gleixner.
* Copyright (C) 2016-2017 Christoph Hellwig.
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index d30a0dd5cc02..4e8089b319ae 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/kernel/irq/autoprobe.c
*
@@ -53,7 +54,7 @@ unsigned long probe_irq_on(void)
if (desc->irq_data.chip->irq_set_type)
desc->irq_data.chip->irq_set_type(&desc->irq_data,
IRQ_TYPE_PROBE);
- irq_startup(desc, IRQ_NORESEND, IRQ_START_FORCE);
+ irq_activate_and_startup(desc, IRQ_NORESEND);
}
raw_spin_unlock_irq(&desc->lock);
}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index f51b7b6d2451..043bfc35b353 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -202,25 +202,29 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
irqd_clr_managed_shutdown(d);
- if (cpumask_any_and(aff, cpu_online_mask) > nr_cpu_ids) {
+ if (cpumask_any_and(aff, cpu_online_mask) >= nr_cpu_ids) {
/*
* Catch code which fiddles with enable_irq() on a managed
* and potentially shutdown IRQ. Chained interrupt
* installment or irq auto probing should not happen on
- * managed irqs either. Emit a warning, break the affinity
- * and start it up as a normal interrupt.
+ * managed irqs either.
*/
if (WARN_ON_ONCE(force))
- return IRQ_STARTUP_NORMAL;
+ return IRQ_STARTUP_ABORT;
/*
* The interrupt was requested, but there is no online CPU
* in it's affinity mask. Put it into managed shutdown
* state and let the cpu hotplug mechanism start it up once
* a CPU in the mask becomes available.
*/
- irqd_set_managed_shutdown(d);
return IRQ_STARTUP_ABORT;
}
+ /*
+ * Managed interrupts have reserved resources, so this should not
+ * happen.
+ */
+ if (WARN_ON(irq_domain_activate_irq(d, false)))
+ return IRQ_STARTUP_ABORT;
return IRQ_STARTUP_MANAGED;
}
#else
@@ -236,7 +240,9 @@ static int __irq_startup(struct irq_desc *desc)
struct irq_data *d = irq_desc_get_irq_data(desc);
int ret = 0;
- irq_domain_activate_irq(d);
+ /* Warn if this interrupt is not activated but try nevertheless */
+ WARN_ON_ONCE(!irqd_is_activated(d));
+
if (d->chip->irq_startup) {
ret = d->chip->irq_startup(d);
irq_state_clr_disabled(desc);
@@ -265,10 +271,11 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force)
irq_setup_affinity(desc);
break;
case IRQ_STARTUP_MANAGED:
+ irq_do_set_affinity(d, aff, false);
ret = __irq_startup(desc);
- irq_set_affinity_locked(d, aff, false);
break;
case IRQ_STARTUP_ABORT:
+ irqd_set_managed_shutdown(d);
return 0;
}
}
@@ -278,6 +285,22 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force)
return ret;
}
+int irq_activate(struct irq_desc *desc)
+{
+ struct irq_data *d = irq_desc_get_irq_data(desc);
+
+ if (!irqd_affinity_is_managed(d))
+ return irq_domain_activate_irq(d, false);
+ return 0;
+}
+
+void irq_activate_and_startup(struct irq_desc *desc, bool resend)
+{
+ if (WARN_ON(irq_activate(desc)))
+ return;
+ irq_startup(desc, resend, IRQ_START_FORCE);
+}
+
static void __irq_disable(struct irq_desc *desc, bool mask);
void irq_shutdown(struct irq_desc *desc)
@@ -953,7 +976,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
irq_settings_set_norequest(desc);
irq_settings_set_nothread(desc);
desc->action = &chained_action;
- irq_startup(desc, IRQ_RESEND, IRQ_START_FORCE);
+ irq_activate_and_startup(desc, IRQ_RESEND);
}
}
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 638eb9c83d9f..9eb09aef0313 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -18,8 +18,34 @@
static inline bool irq_needs_fixup(struct irq_data *d)
{
const struct cpumask *m = irq_data_get_effective_affinity_mask(d);
+ unsigned int cpu = smp_processor_id();
- return cpumask_test_cpu(smp_processor_id(), m);
+#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+ /*
+ * The cpumask_empty() check is a workaround for interrupt chips,
+ * which do not implement effective affinity, but the architecture has
+ * enabled the config switch. Use the general affinity mask instead.
+ */
+ if (cpumask_empty(m))
+ m = irq_data_get_affinity_mask(d);
+
+ /*
+ * Sanity check. If the mask is not empty when excluding the outgoing
+ * CPU then it must contain at least one online CPU. The outgoing CPU
+ * has been removed from the online mask already.
+ */
+ if (cpumask_any_but(m, cpu) < nr_cpu_ids &&
+ cpumask_any_and(m, cpu_online_mask) >= nr_cpu_ids) {
+ /*
+ * If this happens then there was a missed IRQ fixup at some
+ * point. Warn about it and enforce fixup.
+ */
+ pr_warn("Eff. affinity %*pbl of IRQ %u contains only offline CPUs after offlining CPU %u\n",
+ cpumask_pr_args(m), d->irq, cpu);
+ return true;
+ }
+#endif
+ return cpumask_test_cpu(cpu, m);
}
static bool migrate_one_irq(struct irq_desc *desc)
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
index e75e29e4434a..17f05ef8f575 100644
--- a/kernel/irq/debug.h
+++ b/kernel/irq/debug.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Debugging printout:
*/
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index c3fdb36dec30..7f608ac39653 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -81,6 +81,8 @@ irq_debug_show_data(struct seq_file *m, struct irq_data *data, int ind)
data->domain ? data->domain->name : "");
seq_printf(m, "%*shwirq: 0x%lx\n", ind + 1, "", data->hwirq);
irq_debug_show_chip(m, data, ind + 1);
+ if (data->domain && data->domain->ops && data->domain->ops->debug_show)
+ data->domain->ops->debug_show(m, NULL, data, ind + 1);
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
if (!data->parent_data)
return;
@@ -149,6 +151,7 @@ static int irq_debug_show(struct seq_file *m, void *p)
raw_spin_lock_irq(&desc->lock);
data = irq_desc_get_irq_data(desc);
seq_printf(m, "handler: %pf\n", desc->handle_irq);
+ seq_printf(m, "device: %s\n", desc->dev_name);
seq_printf(m, "status: 0x%08x\n", desc->status_use_accessors);
irq_debug_show_bits(m, 0, desc->status_use_accessors, irqdesc_states,
ARRAY_SIZE(irqdesc_states));
@@ -226,6 +229,15 @@ static const struct file_operations dfs_irq_ops = {
.release = single_release,
};
+void irq_debugfs_copy_devname(int irq, struct device *dev)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ const char *name = dev_name(dev);
+
+ if (name)
+ desc->dev_name = kstrdup(name, GFP_KERNEL);
+}
+
void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc)
{
char name [10];
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index f7086b78ad6e..c26c5bb6b491 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -135,17 +135,26 @@ void irq_gc_ack_clr_bit(struct irq_data *d)
}
/**
- * irq_gc_mask_disable_reg_and_ack - Mask and ack pending interrupt
+ * irq_gc_mask_disable_and_ack_set - Mask and ack pending interrupt
* @d: irq_data
+ *
+ * This generic implementation of the irq_mask_ack method is for chips
+ * with separate enable/disable registers instead of a single mask
+ * register and where a pending interrupt is acknowledged by setting a
+ * bit.
+ *
+ * Note: This is the only permutation currently used. Similar generic
+ * functions should be added here if other permutations are required.
*/
-void irq_gc_mask_disable_reg_and_ack(struct irq_data *d)
+void irq_gc_mask_disable_and_ack_set(struct irq_data *d)
{
struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = d->mask;
irq_gc_lock(gc);
- irq_reg_writel(gc, mask, ct->regs.mask);
+ irq_reg_writel(gc, mask, ct->regs.disable);
+ *ct->mask_cache &= ~mask;
irq_reg_writel(gc, mask, ct->regs.ack);
irq_gc_unlock(gc);
}
@@ -322,7 +331,6 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
/* Calc pointer to the next generic chip */
tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
}
- d->name = name;
return 0;
}
EXPORT_SYMBOL_GPL(__irq_alloc_domain_generic_chips);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index a4aa39009f0d..07d08ca701ec 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* IRQ subsystem internal functions and variables:
*
@@ -74,6 +75,8 @@ extern void __enable_irq(struct irq_desc *desc);
#define IRQ_START_FORCE true
#define IRQ_START_COND false
+extern int irq_activate(struct irq_desc *desc);
+extern void irq_activate_and_startup(struct irq_desc *desc, bool resend);
extern int irq_startup(struct irq_desc *desc, bool resend, bool force);
extern void irq_shutdown(struct irq_desc *desc);
@@ -436,6 +439,18 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear)
}
#endif /* !CONFIG_GENERIC_PENDING_IRQ */
+#if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY)
+static inline int irq_domain_activate_irq(struct irq_data *data, bool early)
+{
+ irqd_set_activated(data);
+ return 0;
+}
+static inline void irq_domain_deactivate_irq(struct irq_data *data)
+{
+ irqd_clr_activated(data);
+}
+#endif
+
#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
#include <linux/debugfs.h>
@@ -443,7 +458,9 @@ void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc);
static inline void irq_remove_debugfs_entry(struct irq_desc *desc)
{
debugfs_remove(desc->debugfs_file);
+ kfree(desc->dev_name);
}
+void irq_debugfs_copy_devname(int irq, struct device *dev);
# ifdef CONFIG_IRQ_DOMAIN
void irq_domain_debugfs_init(struct dentry *root);
# else
@@ -458,4 +475,7 @@ static inline void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *d)
static inline void irq_remove_debugfs_entry(struct irq_desc *d)
{
}
+static inline void irq_debugfs_copy_devname(int irq, struct device *dev)
+{
+}
#endif /* CONFIG_GENERIC_IRQ_DEBUGFS */
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 82afb7ed369f..49b54e9979cc 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -27,7 +27,7 @@ static struct lock_class_key irq_desc_lock_class;
#if defined(CONFIG_SMP)
static int __init irq_affinity_setup(char *str)
{
- zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
+ alloc_bootmem_cpumask_var(&irq_default_affinity);
cpulist_parse(str, irq_default_affinity);
/*
* Set at least the boot cpu. We don't want to end up with
@@ -40,10 +40,8 @@ __setup("irqaffinity=", irq_affinity_setup);
static void __init init_irq_default_affinity(void)
{
-#ifdef CONFIG_CPUMASK_OFFSTACK
- if (!irq_default_affinity)
+ if (!cpumask_available(irq_default_affinity))
zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
-#endif
if (cpumask_empty(irq_default_affinity))
cpumask_setall(irq_default_affinity);
}
@@ -448,7 +446,7 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node,
}
}
- flags = affinity ? IRQD_AFFINITY_MANAGED : 0;
+ flags = affinity ? IRQD_AFFINITY_MANAGED | IRQD_MANAGED_SHUTDOWN : 0;
mask = NULL;
for (i = 0; i < cnt; i++) {
@@ -462,6 +460,7 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node,
goto err;
irq_insert_desc(start + i, desc);
irq_sysfs_add(start + i, desc);
+ irq_add_debugfs_entry(start + i, desc);
}
bitmap_set(allocated_irqs, start, cnt);
return start;
@@ -863,6 +862,7 @@ int irq_get_percpu_devid_partition(unsigned int irq, struct cpumask *affinity)
return 0;
}
+EXPORT_SYMBOL_GPL(irq_get_percpu_devid_partition);
void kstat_incr_irq_this_cpu(unsigned int irq)
{
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index e84b7056bb08..4f4f60015e8a 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -21,7 +21,6 @@
static LIST_HEAD(irq_domain_list);
static DEFINE_MUTEX(irq_domain_mutex);
-static DEFINE_MUTEX(revmap_trees_mutex);
static struct irq_domain *irq_default_domain;
static void irq_domain_check_hierarchy(struct irq_domain *domain);
@@ -211,6 +210,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
/* Fill structure */
INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL);
+ mutex_init(&domain->revmap_tree_mutex);
domain->ops = ops;
domain->host_data = host_data;
domain->hwirq_max = hwirq_max;
@@ -462,9 +462,9 @@ static void irq_domain_clear_mapping(struct irq_domain *domain,
if (hwirq < domain->revmap_size) {
domain->linear_revmap[hwirq] = 0;
} else {
- mutex_lock(&revmap_trees_mutex);
+ mutex_lock(&domain->revmap_tree_mutex);
radix_tree_delete(&domain->revmap_tree, hwirq);
- mutex_unlock(&revmap_trees_mutex);
+ mutex_unlock(&domain->revmap_tree_mutex);
}
}
@@ -475,9 +475,9 @@ static void irq_domain_set_mapping(struct irq_domain *domain,
if (hwirq < domain->revmap_size) {
domain->linear_revmap[hwirq] = irq_data->irq;
} else {
- mutex_lock(&revmap_trees_mutex);
+ mutex_lock(&domain->revmap_tree_mutex);
radix_tree_insert(&domain->revmap_tree, hwirq, irq_data);
- mutex_unlock(&revmap_trees_mutex);
+ mutex_unlock(&domain->revmap_tree_mutex);
}
}
@@ -921,8 +921,7 @@ static void virq_debug_show_one(struct seq_file *m, struct irq_desc *desc)
chip = irq_data_get_irq_chip(data);
seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none");
- seq_printf(m, data ? "0x%p " : " %p ",
- irq_data_get_irq_chip_data(data));
+ seq_printf(m, "0x%p ", irq_data_get_irq_chip_data(data));
seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' ');
direct = (irq == hwirq) && (irq < domain->revmap_direct_max_irq);
@@ -945,7 +944,7 @@ static int virq_debug_show(struct seq_file *m, void *private)
struct irq_desc *desc;
struct irq_domain *domain;
struct radix_tree_iter iter;
- void **slot;
+ void __rcu **slot;
int i;
seq_printf(m, " %-16s %-6s %-10s %-10s %s\n",
@@ -1453,17 +1452,17 @@ out_free_desc:
/* The irq_data was moved, fix the revmap to refer to the new location */
static void irq_domain_fix_revmap(struct irq_data *d)
{
- void **slot;
+ void __rcu **slot;
if (d->hwirq < d->domain->revmap_size)
return; /* Not using radix tree. */
/* Fix up the revmap. */
- mutex_lock(&revmap_trees_mutex);
+ mutex_lock(&d->domain->revmap_tree_mutex);
slot = radix_tree_lookup_slot(&d->domain->revmap_tree, d->hwirq);
if (slot)
radix_tree_replace_slot(&d->domain->revmap_tree, slot, d);
- mutex_unlock(&revmap_trees_mutex);
+ mutex_unlock(&d->domain->revmap_tree_mutex);
}
/**
@@ -1682,28 +1681,36 @@ void irq_domain_free_irqs_parent(struct irq_domain *domain,
}
EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent);
-static void __irq_domain_activate_irq(struct irq_data *irq_data)
+static void __irq_domain_deactivate_irq(struct irq_data *irq_data)
{
if (irq_data && irq_data->domain) {
struct irq_domain *domain = irq_data->domain;
+ if (domain->ops->deactivate)
+ domain->ops->deactivate(domain, irq_data);
if (irq_data->parent_data)
- __irq_domain_activate_irq(irq_data->parent_data);
- if (domain->ops->activate)
- domain->ops->activate(domain, irq_data);
+ __irq_domain_deactivate_irq(irq_data->parent_data);
}
}
-static void __irq_domain_deactivate_irq(struct irq_data *irq_data)
+static int __irq_domain_activate_irq(struct irq_data *irqd, bool early)
{
- if (irq_data && irq_data->domain) {
- struct irq_domain *domain = irq_data->domain;
+ int ret = 0;
- if (domain->ops->deactivate)
- domain->ops->deactivate(domain, irq_data);
- if (irq_data->parent_data)
- __irq_domain_deactivate_irq(irq_data->parent_data);
+ if (irqd && irqd->domain) {
+ struct irq_domain *domain = irqd->domain;
+
+ if (irqd->parent_data)
+ ret = __irq_domain_activate_irq(irqd->parent_data,
+ early);
+ if (!ret && domain->ops->activate) {
+ ret = domain->ops->activate(domain, irqd, early);
+ /* Rollback in case of error */
+ if (ret && irqd->parent_data)
+ __irq_domain_deactivate_irq(irqd->parent_data);
+ }
}
+ return ret;
}
/**
@@ -1714,12 +1721,15 @@ static void __irq_domain_deactivate_irq(struct irq_data *irq_data)
* This is the second step to call domain_ops->activate to program interrupt
* controllers, so the interrupt could actually get delivered.
*/
-void irq_domain_activate_irq(struct irq_data *irq_data)
+int irq_domain_activate_irq(struct irq_data *irq_data, bool early)
{
- if (!irqd_is_activated(irq_data)) {
- __irq_domain_activate_irq(irq_data);
+ int ret = 0;
+
+ if (!irqd_is_activated(irq_data))
+ ret = __irq_domain_activate_irq(irq_data, early);
+ if (!ret)
irqd_set_activated(irq_data);
- }
+ return ret;
}
/**
@@ -1810,6 +1820,8 @@ irq_domain_debug_show_one(struct seq_file *m, struct irq_domain *d, int ind)
d->revmap_size + d->revmap_direct_max_irq);
seq_printf(m, "%*smapped: %u\n", ind + 1, "", d->mapcount);
seq_printf(m, "%*sflags: 0x%08x\n", ind +1 , "", d->flags);
+ if (d->ops && d->ops->debug_show)
+ d->ops->debug_show(m, d, NULL, ind + 1);
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
if (!d->parent)
return;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 573dc52b0806..0f922729bab9 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -168,6 +168,19 @@ void irq_set_thread_affinity(struct irq_desc *desc)
set_bit(IRQTF_AFFINITY, &action->thread_flags);
}
+static void irq_validate_effective_affinity(struct irq_data *data)
+{
+#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+ const struct cpumask *m = irq_data_get_effective_affinity_mask(data);
+ struct irq_chip *chip = irq_data_get_irq_chip(data);
+
+ if (!cpumask_empty(m))
+ return;
+ pr_warn_once("irq_chip %s did not update eff. affinity mask of irq %u\n",
+ chip->name, data->irq);
+#endif
+}
+
int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
bool force)
{
@@ -175,12 +188,16 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
struct irq_chip *chip = irq_data_get_irq_chip(data);
int ret;
+ if (!chip || !chip->irq_set_affinity)
+ return -EINVAL;
+
ret = chip->irq_set_affinity(data, mask, force);
switch (ret) {
case IRQ_SET_MASK_OK:
case IRQ_SET_MASK_OK_DONE:
cpumask_copy(desc->irq_common_data.affinity, mask);
case IRQ_SET_MASK_OK_NOCOPY:
+ irq_validate_effective_affinity(data);
irq_set_thread_affinity(desc);
ret = 0;
}
@@ -381,7 +398,8 @@ int irq_select_affinity_usr(unsigned int irq)
/**
* irq_set_vcpu_affinity - Set vcpu affinity for the interrupt
* @irq: interrupt number to set affinity
- * @vcpu_info: vCPU specific data
+ * @vcpu_info: vCPU specific data or pointer to a percpu array of vCPU
+ * specific data for percpu_devid interrupts
*
* This function uses the vCPU specific data to set the vCPU
* affinity for an irq. The vCPU specific data is passed from
@@ -519,7 +537,7 @@ void __enable_irq(struct irq_desc *desc)
* time. If it was already started up, then irq_startup()
* will invoke irq_enable() under the hood.
*/
- irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
+ irq_startup(desc, IRQ_RESEND, IRQ_START_FORCE);
break;
}
default:
@@ -1228,7 +1246,18 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
* set the trigger type must match. Also all must
* agree on ONESHOT.
*/
- unsigned int oldtype = irqd_get_trigger_type(&desc->irq_data);
+ unsigned int oldtype;
+
+ /*
+ * If nobody did set the configuration before, inherit
+ * the one provided by the requester.
+ */
+ if (irqd_trigger_type_was_set(&desc->irq_data)) {
+ oldtype = irqd_get_trigger_type(&desc->irq_data);
+ } else {
+ oldtype = new->flags & IRQF_TRIGGER_MASK;
+ irqd_set_trigger_type(&desc->irq_data, oldtype);
+ }
if (!((old->flags & new->flags) & IRQF_SHARED) ||
(oldtype != (new->flags & IRQF_TRIGGER_MASK)) ||
@@ -1288,7 +1317,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
* thread_mask assigned. See the loop above which or's
* all existing action->thread_mask bits.
*/
- new->thread_mask = 1 << ffz(thread_mask);
+ new->thread_mask = 1UL << ffz(thread_mask);
} else if (new->handler == irq_default_primary_handler &&
!(desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)) {
@@ -1325,6 +1354,21 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
goto out_unlock;
}
+ /*
+ * Activate the interrupt. That activation must happen
+ * independently of IRQ_NOAUTOEN. request_irq() can fail
+ * and the callers are supposed to handle
+ * that. enable_irq() of an interrupt requested with
+ * IRQ_NOAUTOEN is not supposed to fail. The activation
+ * keeps it in shutdown mode, it merily associates
+ * resources if necessary and if that's not possible it
+ * fails. Interrupts which are in managed shutdown mode
+ * will simply ignore that activation request.
+ */
+ ret = irq_activate(desc);
+ if (ret)
+ goto out_unlock;
+
desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \
IRQS_ONESHOT | IRQS_WAITING);
irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
@@ -1400,7 +1444,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
wake_up_process(new->secondary->thread);
register_irq_proc(irq, desc);
- irq_add_debugfs_entry(irq, desc);
new->dir = NULL;
register_handler_proc(irq, new);
return 0;
@@ -1643,6 +1686,10 @@ const void *free_irq(unsigned int irq, void *dev_id)
#endif
action = __free_irq(irq, dev_id);
+
+ if (!action)
+ return NULL;
+
devname = action->name;
kfree(action);
return devname;
diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
new file mode 100644
index 000000000000..7df2480005f8
--- /dev/null
+++ b/kernel/irq/matrix.c
@@ -0,0 +1,443 @@
+/*
+ * Copyright (C) 2017 Thomas Gleixner <tglx@linutronix.de>
+ *
+ * SPDX-License-Identifier: GPL-2.0
+ */
+#include <linux/spinlock.h>
+#include <linux/seq_file.h>
+#include <linux/bitmap.h>
+#include <linux/percpu.h>
+#include <linux/cpu.h>
+#include <linux/irq.h>
+
+#define IRQ_MATRIX_SIZE (BITS_TO_LONGS(IRQ_MATRIX_BITS) * sizeof(unsigned long))
+
+struct cpumap {
+ unsigned int available;
+ unsigned int allocated;
+ unsigned int managed;
+ bool online;
+ unsigned long alloc_map[IRQ_MATRIX_SIZE];
+ unsigned long managed_map[IRQ_MATRIX_SIZE];
+};
+
+struct irq_matrix {
+ unsigned int matrix_bits;
+ unsigned int alloc_start;
+ unsigned int alloc_end;
+ unsigned int alloc_size;
+ unsigned int global_available;
+ unsigned int global_reserved;
+ unsigned int systembits_inalloc;
+ unsigned int total_allocated;
+ unsigned int online_maps;
+ struct cpumap __percpu *maps;
+ unsigned long scratch_map[IRQ_MATRIX_SIZE];
+ unsigned long system_map[IRQ_MATRIX_SIZE];
+};
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/irq_matrix.h>
+
+/**
+ * irq_alloc_matrix - Allocate a irq_matrix structure and initialize it
+ * @matrix_bits: Number of matrix bits must be <= IRQ_MATRIX_BITS
+ * @alloc_start: From which bit the allocation search starts
+ * @alloc_end: At which bit the allocation search ends, i.e first
+ * invalid bit
+ */
+__init struct irq_matrix *irq_alloc_matrix(unsigned int matrix_bits,
+ unsigned int alloc_start,
+ unsigned int alloc_end)
+{
+ struct irq_matrix *m;
+
+ if (matrix_bits > IRQ_MATRIX_BITS)
+ return NULL;
+
+ m = kzalloc(sizeof(*m), GFP_KERNEL);
+ if (!m)
+ return NULL;
+
+ m->matrix_bits = matrix_bits;
+ m->alloc_start = alloc_start;
+ m->alloc_end = alloc_end;
+ m->alloc_size = alloc_end - alloc_start;
+ m->maps = alloc_percpu(*m->maps);
+ if (!m->maps) {
+ kfree(m);
+ return NULL;
+ }
+ return m;
+}
+
+/**
+ * irq_matrix_online - Bring the local CPU matrix online
+ * @m: Matrix pointer
+ */
+void irq_matrix_online(struct irq_matrix *m)
+{
+ struct cpumap *cm = this_cpu_ptr(m->maps);
+
+ BUG_ON(cm->online);
+
+ bitmap_zero(cm->alloc_map, m->matrix_bits);
+ cm->available = m->alloc_size - (cm->managed + m->systembits_inalloc);
+ cm->allocated = 0;
+ m->global_available += cm->available;
+ cm->online = true;
+ m->online_maps++;
+ trace_irq_matrix_online(m);
+}
+
+/**
+ * irq_matrix_offline - Bring the local CPU matrix offline
+ * @m: Matrix pointer
+ */
+void irq_matrix_offline(struct irq_matrix *m)
+{
+ struct cpumap *cm = this_cpu_ptr(m->maps);
+
+ /* Update the global available size */
+ m->global_available -= cm->available;
+ cm->online = false;
+ m->online_maps--;
+ trace_irq_matrix_offline(m);
+}
+
+static unsigned int matrix_alloc_area(struct irq_matrix *m, struct cpumap *cm,
+ unsigned int num, bool managed)
+{
+ unsigned int area, start = m->alloc_start;
+ unsigned int end = m->alloc_end;
+
+ bitmap_or(m->scratch_map, cm->managed_map, m->system_map, end);
+ bitmap_or(m->scratch_map, m->scratch_map, cm->alloc_map, end);
+ area = bitmap_find_next_zero_area(m->scratch_map, end, start, num, 0);
+ if (area >= end)
+ return area;
+ if (managed)
+ bitmap_set(cm->managed_map, area, num);
+ else
+ bitmap_set(cm->alloc_map, area, num);
+ return area;
+}
+
+/**
+ * irq_matrix_assign_system - Assign system wide entry in the matrix
+ * @m: Matrix pointer
+ * @bit: Which bit to reserve
+ * @replace: Replace an already allocated vector with a system
+ * vector at the same bit position.
+ *
+ * The BUG_ON()s below are on purpose. If this goes wrong in the
+ * early boot process, then the chance to survive is about zero.
+ * If this happens when the system is life, it's not much better.
+ */
+void irq_matrix_assign_system(struct irq_matrix *m, unsigned int bit,
+ bool replace)
+{
+ struct cpumap *cm = this_cpu_ptr(m->maps);
+
+ BUG_ON(bit > m->matrix_bits);
+ BUG_ON(m->online_maps > 1 || (m->online_maps && !replace));
+
+ set_bit(bit, m->system_map);
+ if (replace) {
+ BUG_ON(!test_and_clear_bit(bit, cm->alloc_map));
+ cm->allocated--;
+ m->total_allocated--;
+ }
+ if (bit >= m->alloc_start && bit < m->alloc_end)
+ m->systembits_inalloc++;
+
+ trace_irq_matrix_assign_system(bit, m);
+}
+
+/**
+ * irq_matrix_reserve_managed - Reserve a managed interrupt in a CPU map
+ * @m: Matrix pointer
+ * @msk: On which CPUs the bits should be reserved.
+ *
+ * Can be called for offline CPUs. Note, this will only reserve one bit
+ * on all CPUs in @msk, but it's not guaranteed that the bits are at the
+ * same offset on all CPUs
+ */
+int irq_matrix_reserve_managed(struct irq_matrix *m, const struct cpumask *msk)
+{
+ unsigned int cpu, failed_cpu;
+
+ for_each_cpu(cpu, msk) {
+ struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+ unsigned int bit;
+
+ bit = matrix_alloc_area(m, cm, 1, true);
+ if (bit >= m->alloc_end)
+ goto cleanup;
+ cm->managed++;
+ if (cm->online) {
+ cm->available--;
+ m->global_available--;
+ }
+ trace_irq_matrix_reserve_managed(bit, cpu, m, cm);
+ }
+ return 0;
+cleanup:
+ failed_cpu = cpu;
+ for_each_cpu(cpu, msk) {
+ if (cpu == failed_cpu)
+ break;
+ irq_matrix_remove_managed(m, cpumask_of(cpu));
+ }
+ return -ENOSPC;
+}
+
+/**
+ * irq_matrix_remove_managed - Remove managed interrupts in a CPU map
+ * @m: Matrix pointer
+ * @msk: On which CPUs the bits should be removed
+ *
+ * Can be called for offline CPUs
+ *
+ * This removes not allocated managed interrupts from the map. It does
+ * not matter which one because the managed interrupts free their
+ * allocation when they shut down. If not, the accounting is screwed,
+ * but all what can be done at this point is warn about it.
+ */
+void irq_matrix_remove_managed(struct irq_matrix *m, const struct cpumask *msk)
+{
+ unsigned int cpu;
+
+ for_each_cpu(cpu, msk) {
+ struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+ unsigned int bit, end = m->alloc_end;
+
+ if (WARN_ON_ONCE(!cm->managed))
+ continue;
+
+ /* Get managed bit which are not allocated */
+ bitmap_andnot(m->scratch_map, cm->managed_map, cm->alloc_map, end);
+
+ bit = find_first_bit(m->scratch_map, end);
+ if (WARN_ON_ONCE(bit >= end))
+ continue;
+
+ clear_bit(bit, cm->managed_map);
+
+ cm->managed--;
+ if (cm->online) {
+ cm->available++;
+ m->global_available++;
+ }
+ trace_irq_matrix_remove_managed(bit, cpu, m, cm);
+ }
+}
+
+/**
+ * irq_matrix_alloc_managed - Allocate a managed interrupt in a CPU map
+ * @m: Matrix pointer
+ * @cpu: On which CPU the interrupt should be allocated
+ */
+int irq_matrix_alloc_managed(struct irq_matrix *m, unsigned int cpu)
+{
+ struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+ unsigned int bit, end = m->alloc_end;
+
+ /* Get managed bit which are not allocated */
+ bitmap_andnot(m->scratch_map, cm->managed_map, cm->alloc_map, end);
+ bit = find_first_bit(m->scratch_map, end);
+ if (bit >= end)
+ return -ENOSPC;
+ set_bit(bit, cm->alloc_map);
+ cm->allocated++;
+ m->total_allocated++;
+ trace_irq_matrix_alloc_managed(bit, cpu, m, cm);
+ return bit;
+}
+
+/**
+ * irq_matrix_assign - Assign a preallocated interrupt in the local CPU map
+ * @m: Matrix pointer
+ * @bit: Which bit to mark
+ *
+ * This should only be used to mark preallocated vectors
+ */
+void irq_matrix_assign(struct irq_matrix *m, unsigned int bit)
+{
+ struct cpumap *cm = this_cpu_ptr(m->maps);
+
+ if (WARN_ON_ONCE(bit < m->alloc_start || bit >= m->alloc_end))
+ return;
+ if (WARN_ON_ONCE(test_and_set_bit(bit, cm->alloc_map)))
+ return;
+ cm->allocated++;
+ m->total_allocated++;
+ cm->available--;
+ m->global_available--;
+ trace_irq_matrix_assign(bit, smp_processor_id(), m, cm);
+}
+
+/**
+ * irq_matrix_reserve - Reserve interrupts
+ * @m: Matrix pointer
+ *
+ * This is merily a book keeping call. It increments the number of globally
+ * reserved interrupt bits w/o actually allocating them. This allows to
+ * setup interrupt descriptors w/o assigning low level resources to it.
+ * The actual allocation happens when the interrupt gets activated.
+ */
+void irq_matrix_reserve(struct irq_matrix *m)
+{
+ if (m->global_reserved <= m->global_available &&
+ m->global_reserved + 1 > m->global_available)
+ pr_warn("Interrupt reservation exceeds available resources\n");
+
+ m->global_reserved++;
+ trace_irq_matrix_reserve(m);
+}
+
+/**
+ * irq_matrix_remove_reserved - Remove interrupt reservation
+ * @m: Matrix pointer
+ *
+ * This is merily a book keeping call. It decrements the number of globally
+ * reserved interrupt bits. This is used to undo irq_matrix_reserve() when the
+ * interrupt was never in use and a real vector allocated, which undid the
+ * reservation.
+ */
+void irq_matrix_remove_reserved(struct irq_matrix *m)
+{
+ m->global_reserved--;
+ trace_irq_matrix_remove_reserved(m);
+}
+
+/**
+ * irq_matrix_alloc - Allocate a regular interrupt in a CPU map
+ * @m: Matrix pointer
+ * @msk: Which CPUs to search in
+ * @reserved: Allocate previously reserved interrupts
+ * @mapped_cpu: Pointer to store the CPU for which the irq was allocated
+ */
+int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk,
+ bool reserved, unsigned int *mapped_cpu)
+{
+ unsigned int cpu;
+
+ for_each_cpu(cpu, msk) {
+ struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+ unsigned int bit;
+
+ if (!cm->online)
+ continue;
+
+ bit = matrix_alloc_area(m, cm, 1, false);
+ if (bit < m->alloc_end) {
+ cm->allocated++;
+ cm->available--;
+ m->total_allocated++;
+ m->global_available--;
+ if (reserved)
+ m->global_reserved--;
+ *mapped_cpu = cpu;
+ trace_irq_matrix_alloc(bit, cpu, m, cm);
+ return bit;
+ }
+ }
+ return -ENOSPC;
+}
+
+/**
+ * irq_matrix_free - Free allocated interrupt in the matrix
+ * @m: Matrix pointer
+ * @cpu: Which CPU map needs be updated
+ * @bit: The bit to remove
+ * @managed: If true, the interrupt is managed and not accounted
+ * as available.
+ */
+void irq_matrix_free(struct irq_matrix *m, unsigned int cpu,
+ unsigned int bit, bool managed)
+{
+ struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+
+ if (WARN_ON_ONCE(bit < m->alloc_start || bit >= m->alloc_end))
+ return;
+
+ if (cm->online) {
+ clear_bit(bit, cm->alloc_map);
+ cm->allocated--;
+ m->total_allocated--;
+ if (!managed) {
+ cm->available++;
+ m->global_available++;
+ }
+ }
+ trace_irq_matrix_free(bit, cpu, m, cm);
+}
+
+/**
+ * irq_matrix_available - Get the number of globally available irqs
+ * @m: Pointer to the matrix to query
+ * @cpudown: If true, the local CPU is about to go down, adjust
+ * the number of available irqs accordingly
+ */
+unsigned int irq_matrix_available(struct irq_matrix *m, bool cpudown)
+{
+ struct cpumap *cm = this_cpu_ptr(m->maps);
+
+ return (m->global_available - cpudown) ? cm->available : 0;
+}
+
+/**
+ * irq_matrix_reserved - Get the number of globally reserved irqs
+ * @m: Pointer to the matrix to query
+ */
+unsigned int irq_matrix_reserved(struct irq_matrix *m)
+{
+ return m->global_reserved;
+}
+
+/**
+ * irq_matrix_allocated - Get the number of allocated irqs on the local cpu
+ * @m: Pointer to the matrix to search
+ *
+ * This returns number of allocated irqs
+ */
+unsigned int irq_matrix_allocated(struct irq_matrix *m)
+{
+ struct cpumap *cm = this_cpu_ptr(m->maps);
+
+ return cm->allocated;
+}
+
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+/**
+ * irq_matrix_debug_show - Show detailed allocation information
+ * @sf: Pointer to the seq_file to print to
+ * @m: Pointer to the matrix allocator
+ * @ind: Indentation for the print format
+ *
+ * Note, this is a lockless snapshot.
+ */
+void irq_matrix_debug_show(struct seq_file *sf, struct irq_matrix *m, int ind)
+{
+ unsigned int nsys = bitmap_weight(m->system_map, m->matrix_bits);
+ int cpu;
+
+ seq_printf(sf, "Online bitmaps: %6u\n", m->online_maps);
+ seq_printf(sf, "Global available: %6u\n", m->global_available);
+ seq_printf(sf, "Global reserved: %6u\n", m->global_reserved);
+ seq_printf(sf, "Total allocated: %6u\n", m->total_allocated);
+ seq_printf(sf, "System: %u: %*pbl\n", nsys, m->matrix_bits,
+ m->system_map);
+ seq_printf(sf, "%*s| CPU | avl | man | act | vectors\n", ind, " ");
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+
+ seq_printf(sf, "%*s %4d %4u %4u %4u %*pbl\n", ind, " ",
+ cpu, cm->available, cm->managed, cm->allocated,
+ m->matrix_bits, cm->alloc_map);
+ }
+ cpus_read_unlock();
+}
+#endif
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 6ca054a3f91d..86ae0eb80b53 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/irq.h>
#include <linux/interrupt.h>
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 3fa4bd59f569..edb987b2c58d 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -16,6 +16,8 @@
#include <linux/msi.h>
#include <linux/slab.h>
+#include "internals.h"
+
/**
* alloc_msi_entry - Allocate an initialize msi_entry
* @dev: Pointer to the device for which this is allocated
@@ -100,13 +102,14 @@ int msi_domain_set_affinity(struct irq_data *irq_data,
return ret;
}
-static void msi_domain_activate(struct irq_domain *domain,
- struct irq_data *irq_data)
+static int msi_domain_activate(struct irq_domain *domain,
+ struct irq_data *irq_data, bool early)
{
struct msi_msg msg;
BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg));
irq_chip_write_msi_msg(irq_data, &msg);
+ return 0;
}
static void msi_domain_deactivate(struct irq_domain *domain,
@@ -373,8 +376,10 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
return ret;
}
- for (i = 0; i < desc->nvec_used; i++)
+ for (i = 0; i < desc->nvec_used; i++) {
irq_set_msi_desc_off(virq, i, desc);
+ irq_debugfs_copy_devname(virq + i, dev);
+ }
}
if (ops->msi_finish)
@@ -396,11 +401,28 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
struct irq_data *irq_data;
irq_data = irq_domain_get_irq_data(domain, desc->irq);
- irq_domain_activate_irq(irq_data);
+ ret = irq_domain_activate_irq(irq_data, true);
+ if (ret)
+ goto cleanup;
+ if (info->flags & MSI_FLAG_MUST_REACTIVATE)
+ irqd_clr_activated(irq_data);
}
}
-
return 0;
+
+cleanup:
+ for_each_msi_entry(desc, dev) {
+ struct irq_data *irqd;
+
+ if (desc->irq == virq)
+ break;
+
+ irqd = irq_domain_get_irq_data(domain, desc->irq);
+ if (irqd_is_activated(irqd))
+ irq_domain_deactivate_irq(irqd);
+ }
+ msi_domain_free_irqs(domain, dev);
+ return ret;
}
/**
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 6376b4a598d3..e8f374971e37 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/kernel/irq/proc.c
*
@@ -154,8 +155,9 @@ static ssize_t write_irq_affinity(int type, struct file *file,
*/
err = irq_select_affinity_usr(irq) ? -EINVAL : count;
} else {
- irq_set_affinity(irq, new_value);
- err = count;
+ err = irq_set_affinity(irq, new_value);
+ if (!err)
+ err = count;
}
free_cpumask:
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index b86886beee4f..1d08f45135c2 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/kernel/irq/resend.c
*
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index 320579d89091..e43795cd2ccf 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Internal header to deal with irq_desc->status which will be renamed
* to irq_desc->settings.
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 061ba7eed4ed..ef2a47e0eab6 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/kernel/irq/spurious.c
*
@@ -19,8 +20,8 @@
static int irqfixup __read_mostly;
#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
-static void poll_spurious_irqs(unsigned long dummy);
-static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0);
+static void poll_spurious_irqs(struct timer_list *unused);
+static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs);
static int irq_poll_cpu;
static atomic_t irq_poll_active;
@@ -142,7 +143,7 @@ out:
return ok;
}
-static void poll_spurious_irqs(unsigned long dummy)
+static void poll_spurious_irqs(struct timer_list *unused)
{
struct irq_desc *desc;
int i;
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
index c8c1d073fbf1..e0923fa4927a 100644
--- a/kernel/irq/timings.c
+++ b/kernel/irq/timings.c
@@ -264,7 +264,7 @@ u64 irq_timings_next_event(u64 now)
* order to prevent the timings circular buffer to be updated
* while we are reading it.
*/
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
/*
* Number of elements in the circular buffer: If it happens it
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index bcf107ce0854..40e9d739c169 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -56,7 +56,6 @@ void __weak arch_irq_work_raise(void)
*/
}
-#ifdef CONFIG_SMP
/*
* Enqueue the irq_work @work on @cpu unless it's already pending
* somewhere.
@@ -68,6 +67,8 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
/* All work should have been flushed before going offline */
WARN_ON_ONCE(cpu_is_offline(cpu));
+#ifdef CONFIG_SMP
+
/* Arch remote IPI send/receive backend aren't NMI safe */
WARN_ON_ONCE(in_nmi());
@@ -78,10 +79,12 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
arch_send_call_function_single_ipi(cpu);
+#else /* #ifdef CONFIG_SMP */
+ irq_work_queue(work);
+#endif /* #else #ifdef CONFIG_SMP */
+
return true;
}
-EXPORT_SYMBOL_GPL(irq_work_queue_on);
-#endif
/* Enqueue the irq work @work on the current CPU */
bool irq_work_queue(struct irq_work *work)
@@ -128,9 +131,9 @@ bool irq_work_needs_cpu(void)
static void irq_work_run_list(struct llist_head *list)
{
- unsigned long flags;
- struct irq_work *work;
+ struct irq_work *work, *tmp;
struct llist_node *llnode;
+ unsigned long flags;
BUG_ON(!irqs_disabled());
@@ -138,11 +141,7 @@ static void irq_work_run_list(struct llist_head *list)
return;
llnode = llist_del_all(list);
- while (llnode != NULL) {
- work = llist_entry(llnode, struct irq_work, llnode);
-
- llnode = llist_next(llnode);
-
+ llist_for_each_entry_safe(work, tmp, llnode, llnode) {
/*
* Clear the PENDING bit, after this point the @work
* can be re-used.
@@ -188,7 +187,7 @@ void irq_work_tick(void)
*/
void irq_work_sync(struct irq_work *work)
{
- WARN_ON_ONCE(irqs_disabled());
+ lockdep_assert_irqs_enabled();
while (work->flags & IRQ_WORK_BUSY)
cpu_relax();
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 0bf2e8f5244a..8594d24e4adc 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -83,7 +83,7 @@ static void static_key_slow_inc_cpuslocked(struct static_key *key)
{
int v, v1;
- STATIC_KEY_CHECK_USE();
+ STATIC_KEY_CHECK_USE(key);
/*
* Careful if we get concurrent static_key_slow_inc() calls;
@@ -128,7 +128,7 @@ EXPORT_SYMBOL_GPL(static_key_slow_inc);
void static_key_enable_cpuslocked(struct static_key *key)
{
- STATIC_KEY_CHECK_USE();
+ STATIC_KEY_CHECK_USE(key);
if (atomic_read(&key->enabled) > 0) {
WARN_ON_ONCE(atomic_read(&key->enabled) != 1);
@@ -158,7 +158,7 @@ EXPORT_SYMBOL_GPL(static_key_enable);
void static_key_disable_cpuslocked(struct static_key *key)
{
- STATIC_KEY_CHECK_USE();
+ STATIC_KEY_CHECK_USE(key);
if (atomic_read(&key->enabled) != 1) {
WARN_ON_ONCE(atomic_read(&key->enabled) != 0);
@@ -224,21 +224,21 @@ static void jump_label_update_timeout(struct work_struct *work)
void static_key_slow_dec(struct static_key *key)
{
- STATIC_KEY_CHECK_USE();
+ STATIC_KEY_CHECK_USE(key);
__static_key_slow_dec(key, 0, NULL);
}
EXPORT_SYMBOL_GPL(static_key_slow_dec);
void static_key_slow_dec_deferred(struct static_key_deferred *key)
{
- STATIC_KEY_CHECK_USE();
+ STATIC_KEY_CHECK_USE(key);
__static_key_slow_dec(&key->key, key->timeout, &key->work);
}
EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred);
void static_key_deferred_flush(struct static_key_deferred *key)
{
- STATIC_KEY_CHECK_USE();
+ STATIC_KEY_CHECK_USE(key);
flush_delayed_work(&key->work);
}
EXPORT_SYMBOL_GPL(static_key_deferred_flush);
@@ -246,7 +246,7 @@ EXPORT_SYMBOL_GPL(static_key_deferred_flush);
void jump_label_rate_limit(struct static_key_deferred *key,
unsigned long rl)
{
- STATIC_KEY_CHECK_USE();
+ STATIC_KEY_CHECK_USE(key);
key->timeout = rl;
INIT_DELAYED_WORK(&key->work, jump_label_update_timeout);
}
@@ -769,7 +769,7 @@ static __init int jump_label_test(void)
return 0;
}
-late_initcall(jump_label_test);
+early_initcall(jump_label_test);
#endif /* STATIC_KEYS_SELFTEST */
#endif /* HAVE_JUMP_LABEL */
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 127e7cfafa55..531ffa984bc2 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -24,6 +24,7 @@
#include <linux/ctype.h>
#include <linux/slab.h>
#include <linux/filter.h>
+#include <linux/ftrace.h>
#include <linux/compiler.h>
#include <asm/sections.h>
@@ -337,6 +338,10 @@ const char *kallsyms_lookup(unsigned long addr,
if (!ret)
ret = bpf_address_lookup(addr, symbolsize,
offset, modname, namebuf);
+
+ if (!ret)
+ ret = ftrace_mod_address_lookup(addr, symbolsize,
+ offset, modname, namebuf);
return ret;
}
@@ -474,12 +479,14 @@ EXPORT_SYMBOL(__print_symbol);
struct kallsym_iter {
loff_t pos;
loff_t pos_mod_end;
+ loff_t pos_ftrace_mod_end;
unsigned long value;
unsigned int nameoff; /* If iterating in core kernel symbols. */
char type;
char name[KSYM_NAME_LEN];
char module_name[MODULE_NAME_LEN];
int exported;
+ int show_value;
};
static int get_ksymbol_mod(struct kallsym_iter *iter)
@@ -496,11 +503,25 @@ static int get_ksymbol_mod(struct kallsym_iter *iter)
return 1;
}
+static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter)
+{
+ int ret = ftrace_mod_get_kallsym(iter->pos - iter->pos_mod_end,
+ &iter->value, &iter->type,
+ iter->name, iter->module_name,
+ &iter->exported);
+ if (ret < 0) {
+ iter->pos_ftrace_mod_end = iter->pos;
+ return 0;
+ }
+
+ return 1;
+}
+
static int get_ksymbol_bpf(struct kallsym_iter *iter)
{
iter->module_name[0] = '\0';
iter->exported = 0;
- return bpf_get_kallsym(iter->pos - iter->pos_mod_end,
+ return bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end,
&iter->value, &iter->type,
iter->name) < 0 ? 0 : 1;
}
@@ -525,20 +546,31 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos)
iter->name[0] = '\0';
iter->nameoff = get_symbol_offset(new_pos);
iter->pos = new_pos;
- if (new_pos == 0)
+ if (new_pos == 0) {
iter->pos_mod_end = 0;
+ iter->pos_ftrace_mod_end = 0;
+ }
}
static int update_iter_mod(struct kallsym_iter *iter, loff_t pos)
{
iter->pos = pos;
- if (iter->pos_mod_end > 0 &&
- iter->pos_mod_end < iter->pos)
+ if (iter->pos_ftrace_mod_end > 0 &&
+ iter->pos_ftrace_mod_end < iter->pos)
return get_ksymbol_bpf(iter);
- if (!get_ksymbol_mod(iter))
- return get_ksymbol_bpf(iter);
+ if (iter->pos_mod_end > 0 &&
+ iter->pos_mod_end < iter->pos) {
+ if (!get_ksymbol_ftrace_mod(iter))
+ return get_ksymbol_bpf(iter);
+ return 1;
+ }
+
+ if (!get_ksymbol_mod(iter)) {
+ if (!get_ksymbol_ftrace_mod(iter))
+ return get_ksymbol_bpf(iter);
+ }
return 1;
}
@@ -582,12 +614,15 @@ static void s_stop(struct seq_file *m, void *p)
static int s_show(struct seq_file *m, void *p)
{
+ unsigned long value;
struct kallsym_iter *iter = m->private;
/* Some debugging symbols have no name. Ignore them. */
if (!iter->name[0])
return 0;
+ value = iter->show_value ? iter->value : 0;
+
if (iter->module_name[0]) {
char type;
@@ -597,10 +632,10 @@ static int s_show(struct seq_file *m, void *p)
*/
type = iter->exported ? toupper(iter->type) :
tolower(iter->type);
- seq_printf(m, "%pK %c %s\t[%s]\n", (void *)iter->value,
+ seq_printf(m, KALLSYM_FMT " %c %s\t[%s]\n", value,
type, iter->name, iter->module_name);
} else
- seq_printf(m, "%pK %c %s\n", (void *)iter->value,
+ seq_printf(m, KALLSYM_FMT " %c %s\n", value,
iter->type, iter->name);
return 0;
}
@@ -612,6 +647,40 @@ static const struct seq_operations kallsyms_op = {
.show = s_show
};
+static inline int kallsyms_for_perf(void)
+{
+#ifdef CONFIG_PERF_EVENTS
+ extern int sysctl_perf_event_paranoid;
+ if (sysctl_perf_event_paranoid <= 1)
+ return 1;
+#endif
+ return 0;
+}
+
+/*
+ * We show kallsyms information even to normal users if we've enabled
+ * kernel profiling and are explicitly not paranoid (so kptr_restrict
+ * is clear, and sysctl_perf_event_paranoid isn't set).
+ *
+ * Otherwise, require CAP_SYSLOG (assuming kptr_restrict isn't set to
+ * block even that).
+ */
+int kallsyms_show_value(void)
+{
+ switch (kptr_restrict) {
+ case 0:
+ if (kallsyms_for_perf())
+ return 1;
+ /* fallthrough */
+ case 1:
+ if (has_capability_noaudit(current, CAP_SYSLOG))
+ return 1;
+ /* fallthrough */
+ default:
+ return 0;
+ }
+}
+
static int kallsyms_open(struct inode *inode, struct file *file)
{
/*
@@ -625,6 +694,7 @@ static int kallsyms_open(struct inode *inode, struct file *file)
return -ENOMEM;
reset_iter(iter, 0);
+ iter->show_value = kallsyms_show_value();
return 0;
}
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index ea34ed8bb952..a0e3d7a0e8b8 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/syscalls.h>
#include <linux/fdtable.h>
@@ -131,7 +132,7 @@ static int kcmp_epoll_target(struct task_struct *task1,
if (filp_epoll) {
filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff);
fput(filp_epoll);
- } else
+ }
if (IS_ERR(filp_tgt))
return PTR_ERR(filp_tgt);
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 3f693a0f6f3e..15f33faf4013 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) "kcov: " fmt
#define DISABLE_BRANCH_PROFILING
@@ -21,13 +22,21 @@
#include <linux/kcov.h>
#include <asm/setup.h>
+/* Number of 64-bit words written per one comparison: */
+#define KCOV_WORDS_PER_CMP 4
+
/*
* kcov descriptor (one per opened debugfs file).
* State transitions of the descriptor:
* - initial state after open()
* - then there must be a single ioctl(KCOV_INIT_TRACE) call
* - then, mmap() call (several calls are allowed but not useful)
- * - then, repeated enable/disable for a task (only one task a time allowed)
+ * - then, ioctl(KCOV_ENABLE, arg), where arg is
+ * KCOV_TRACE_PC - to trace only the PCs
+ * or
+ * KCOV_TRACE_CMP - to trace only the comparison operands
+ * - then, ioctl(KCOV_DISABLE) to disable the task.
+ * Enabling/disabling ioctls can be repeated (only one task a time allowed).
*/
struct kcov {
/*
@@ -47,51 +56,176 @@ struct kcov {
struct task_struct *t;
};
-/*
- * Entry point from instrumented code.
- * This is called once per basic-block/edge.
- */
-void notrace __sanitizer_cov_trace_pc(void)
+static bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t)
{
- struct task_struct *t;
enum kcov_mode mode;
- t = current;
/*
* We are interested in code coverage as a function of a syscall inputs,
* so we ignore code executed in interrupts.
*/
- if (!t || !in_task())
- return;
+ if (!in_task())
+ return false;
mode = READ_ONCE(t->kcov_mode);
- if (mode == KCOV_MODE_TRACE) {
- unsigned long *area;
- unsigned long pos;
- unsigned long ip = _RET_IP_;
+ /*
+ * There is some code that runs in interrupts but for which
+ * in_interrupt() returns false (e.g. preempt_schedule_irq()).
+ * READ_ONCE()/barrier() effectively provides load-acquire wrt
+ * interrupts, there are paired barrier()/WRITE_ONCE() in
+ * kcov_ioctl_locked().
+ */
+ barrier();
+ return mode == needed_mode;
+}
+static unsigned long canonicalize_ip(unsigned long ip)
+{
#ifdef CONFIG_RANDOMIZE_BASE
- ip -= kaslr_offset();
+ ip -= kaslr_offset();
#endif
+ return ip;
+}
- /*
- * There is some code that runs in interrupts but for which
- * in_interrupt() returns false (e.g. preempt_schedule_irq()).
- * READ_ONCE()/barrier() effectively provides load-acquire wrt
- * interrupts, there are paired barrier()/WRITE_ONCE() in
- * kcov_ioctl_locked().
- */
- barrier();
- area = t->kcov_area;
- /* The first word is number of subsequent PCs. */
- pos = READ_ONCE(area[0]) + 1;
- if (likely(pos < t->kcov_size)) {
- area[pos] = ip;
- WRITE_ONCE(area[0], pos);
- }
+/*
+ * Entry point from instrumented code.
+ * This is called once per basic-block/edge.
+ */
+void notrace __sanitizer_cov_trace_pc(void)
+{
+ struct task_struct *t;
+ unsigned long *area;
+ unsigned long ip = canonicalize_ip(_RET_IP_);
+ unsigned long pos;
+
+ t = current;
+ if (!check_kcov_mode(KCOV_MODE_TRACE_PC, t))
+ return;
+
+ area = t->kcov_area;
+ /* The first 64-bit word is the number of subsequent PCs. */
+ pos = READ_ONCE(area[0]) + 1;
+ if (likely(pos < t->kcov_size)) {
+ area[pos] = ip;
+ WRITE_ONCE(area[0], pos);
}
}
EXPORT_SYMBOL(__sanitizer_cov_trace_pc);
+#ifdef CONFIG_KCOV_ENABLE_COMPARISONS
+static void write_comp_data(u64 type, u64 arg1, u64 arg2, u64 ip)
+{
+ struct task_struct *t;
+ u64 *area;
+ u64 count, start_index, end_pos, max_pos;
+
+ t = current;
+ if (!check_kcov_mode(KCOV_MODE_TRACE_CMP, t))
+ return;
+
+ ip = canonicalize_ip(ip);
+
+ /*
+ * We write all comparison arguments and types as u64.
+ * The buffer was allocated for t->kcov_size unsigned longs.
+ */
+ area = (u64 *)t->kcov_area;
+ max_pos = t->kcov_size * sizeof(unsigned long);
+
+ count = READ_ONCE(area[0]);
+
+ /* Every record is KCOV_WORDS_PER_CMP 64-bit words. */
+ start_index = 1 + count * KCOV_WORDS_PER_CMP;
+ end_pos = (start_index + KCOV_WORDS_PER_CMP) * sizeof(u64);
+ if (likely(end_pos <= max_pos)) {
+ area[start_index] = type;
+ area[start_index + 1] = arg1;
+ area[start_index + 2] = arg2;
+ area[start_index + 3] = ip;
+ WRITE_ONCE(area[0], count + 1);
+ }
+}
+
+void notrace __sanitizer_cov_trace_cmp1(u8 arg1, u8 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(0), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp1);
+
+void notrace __sanitizer_cov_trace_cmp2(u16 arg1, u16 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(1), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp2);
+
+void notrace __sanitizer_cov_trace_cmp4(u16 arg1, u16 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(2), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp4);
+
+void notrace __sanitizer_cov_trace_cmp8(u64 arg1, u64 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(3), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp8);
+
+void notrace __sanitizer_cov_trace_const_cmp1(u8 arg1, u8 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(0) | KCOV_CMP_CONST, arg1, arg2,
+ _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp1);
+
+void notrace __sanitizer_cov_trace_const_cmp2(u16 arg1, u16 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(1) | KCOV_CMP_CONST, arg1, arg2,
+ _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp2);
+
+void notrace __sanitizer_cov_trace_const_cmp4(u16 arg1, u16 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(2) | KCOV_CMP_CONST, arg1, arg2,
+ _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp4);
+
+void notrace __sanitizer_cov_trace_const_cmp8(u64 arg1, u64 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(3) | KCOV_CMP_CONST, arg1, arg2,
+ _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp8);
+
+void notrace __sanitizer_cov_trace_switch(u64 val, u64 *cases)
+{
+ u64 i;
+ u64 count = cases[0];
+ u64 size = cases[1];
+ u64 type = KCOV_CMP_CONST;
+
+ switch (size) {
+ case 8:
+ type |= KCOV_CMP_SIZE(0);
+ break;
+ case 16:
+ type |= KCOV_CMP_SIZE(1);
+ break;
+ case 32:
+ type |= KCOV_CMP_SIZE(2);
+ break;
+ case 64:
+ type |= KCOV_CMP_SIZE(3);
+ break;
+ default:
+ return;
+ }
+ for (i = 0; i < count; i++)
+ write_comp_data(type, cases[i + 2], val, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_switch);
+#endif /* ifdef CONFIG_KCOV_ENABLE_COMPARISONS */
+
static void kcov_get(struct kcov *kcov)
{
atomic_inc(&kcov->refcount);
@@ -128,6 +262,7 @@ void kcov_task_exit(struct task_struct *t)
/* Just to not leave dangling references behind. */
kcov_task_init(t);
kcov->t = NULL;
+ kcov->mode = KCOV_MODE_INIT;
spin_unlock(&kcov->lock);
kcov_put(kcov);
}
@@ -146,7 +281,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
spin_lock(&kcov->lock);
size = kcov->size * sizeof(unsigned long);
- if (kcov->mode == KCOV_MODE_DISABLED || vma->vm_pgoff != 0 ||
+ if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 ||
vma->vm_end - vma->vm_start != size) {
res = -EINVAL;
goto exit;
@@ -175,6 +310,7 @@ static int kcov_open(struct inode *inode, struct file *filep)
kcov = kzalloc(sizeof(*kcov), GFP_KERNEL);
if (!kcov)
return -ENOMEM;
+ kcov->mode = KCOV_MODE_DISABLED;
atomic_set(&kcov->refcount, 1);
spin_lock_init(&kcov->lock);
filep->private_data = kcov;
@@ -210,7 +346,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
if (size < 2 || size > INT_MAX / sizeof(unsigned long))
return -EINVAL;
kcov->size = size;
- kcov->mode = KCOV_MODE_TRACE;
+ kcov->mode = KCOV_MODE_INIT;
return 0;
case KCOV_ENABLE:
/*
@@ -220,17 +356,25 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
* at task exit or voluntary by KCOV_DISABLE. After that it can
* be enabled for another task.
*/
- unused = arg;
- if (unused != 0 || kcov->mode == KCOV_MODE_DISABLED ||
- kcov->area == NULL)
+ if (kcov->mode != KCOV_MODE_INIT || !kcov->area)
return -EINVAL;
if (kcov->t != NULL)
return -EBUSY;
+ if (arg == KCOV_TRACE_PC)
+ kcov->mode = KCOV_MODE_TRACE_PC;
+ else if (arg == KCOV_TRACE_CMP)
+#ifdef CONFIG_KCOV_ENABLE_COMPARISONS
+ kcov->mode = KCOV_MODE_TRACE_CMP;
+#else
+ return -ENOTSUPP;
+#endif
+ else
+ return -EINVAL;
t = current;
/* Cache in task struct for performance. */
t->kcov_size = kcov->size;
t->kcov_area = kcov->area;
- /* See comment in __sanitizer_cov_trace_pc(). */
+ /* See comment in check_kcov_mode(). */
barrier();
WRITE_ONCE(t->kcov_mode, kcov->mode);
t->kcov = kcov;
@@ -248,6 +392,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
return -EINVAL;
kcov_task_init(t);
kcov->t = NULL;
+ kcov->mode = KCOV_MODE_INIT;
kcov_put(kcov);
return 0;
default:
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 9f48f4412297..e5bcd94c1efb 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -406,9 +406,10 @@ static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
return 1;
}
-static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
+static int locate_mem_hole_callback(struct resource *res, void *arg)
{
struct kexec_buf *kbuf = (struct kexec_buf *)arg;
+ u64 start = res->start, end = res->end;
unsigned long sz = end - start + 1;
/* Returning 0 will take to next memory range */
@@ -437,7 +438,7 @@ static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
* func returning non-zero, then zero will be returned.
*/
int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf,
- int (*func)(u64, u64, void *))
+ int (*func)(struct resource *, void *))
{
if (kbuf->image->type == KEXEC_TYPE_CRASH)
return walk_iomem_res_desc(crashk_res.desc,
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
index 50dfcb039a41..48aaf2ac0d0d 100644
--- a/kernel/kexec_internal.h
+++ b/kernel/kexec_internal.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef LINUX_KEXEC_INTERNAL_H
#define LINUX_KEXEC_INTERNAL_H
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index a1606a4224e1..da2ccf142358 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -117,7 +117,7 @@ enum kprobe_slot_state {
SLOT_USED = 2,
};
-static void *alloc_insn_page(void)
+void __weak *alloc_insn_page(void)
{
return module_alloc(PAGE_SIZE);
}
@@ -573,13 +573,15 @@ static void kprobe_optimizer(struct work_struct *work)
do_unoptimize_kprobes();
/*
- * Step 2: Wait for quiesence period to ensure all running interrupts
- * are done. Because optprobe may modify multiple instructions
- * there is a chance that Nth instruction is interrupted. In that
- * case, running interrupt can return to 2nd-Nth byte of jump
- * instruction. This wait is for avoiding it.
+ * Step 2: Wait for quiesence period to ensure all potentially
+ * preempted tasks to have normally scheduled. Because optprobe
+ * may modify multiple instructions, there is a chance that Nth
+ * instruction is preempted. In that case, such tasks can return
+ * to 2nd-Nth byte of jump instruction. This wait is for avoiding it.
+ * Note that on non-preemptive kernel, this is transparently converted
+ * to synchronoze_sched() to wait for all interrupts to have completed.
*/
- synchronize_sched();
+ synchronize_rcu_tasks();
/* Step 3: Optimize kprobes after quiesence period */
do_optimize_kprobes();
@@ -1769,6 +1771,7 @@ unsigned long __weak arch_deref_entry_point(void *entry)
return (unsigned long)entry;
}
+#if 0
int register_jprobes(struct jprobe **jps, int num)
{
int ret = 0, i;
@@ -1837,6 +1840,7 @@ void unregister_jprobes(struct jprobe **jps, int num)
}
}
EXPORT_SYMBOL_GPL(unregister_jprobes);
+#endif
#ifdef CONFIG_KRETPROBES
/*
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 1c19edf82427..cd50e99202b0 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -20,7 +20,6 @@
#include <linux/freezer.h>
#include <linux/ptrace.h>
#include <linux/uaccess.h>
-#include <linux/cgroup.h>
#include <trace/events/sched.h>
static DEFINE_SPINLOCK(kthread_create_lock);
@@ -47,6 +46,9 @@ struct kthread {
void *data;
struct completion parked;
struct completion exited;
+#ifdef CONFIG_BLK_CGROUP
+ struct cgroup_subsys_state *blkcg_css;
+#endif
};
enum KTHREAD_BITS {
@@ -74,11 +76,17 @@ static inline struct kthread *to_kthread(struct task_struct *k)
void free_kthread_struct(struct task_struct *k)
{
+ struct kthread *kthread;
+
/*
* Can be NULL if this kthread was created by kernel_thread()
* or if kmalloc() in kthread() failed.
*/
- kfree(to_kthread(k));
+ kthread = to_kthread(k);
+#ifdef CONFIG_BLK_CGROUP
+ WARN_ON_ONCE(kthread && kthread->blkcg_css);
+#endif
+ kfree(kthread);
}
/**
@@ -196,7 +204,7 @@ static int kthread(void *_create)
struct kthread *self;
int ret;
- self = kmalloc(sizeof(*self), GFP_KERNEL);
+ self = kzalloc(sizeof(*self), GFP_KERNEL);
set_kthread_struct(self);
/* If user was SIGKILLed, I release the structure. */
@@ -212,7 +220,6 @@ static int kthread(void *_create)
do_exit(-ENOMEM);
}
- self->flags = 0;
self->data = data;
init_completion(&self->exited);
init_completion(&self->parked);
@@ -798,15 +805,14 @@ EXPORT_SYMBOL_GPL(kthread_queue_work);
/**
* kthread_delayed_work_timer_fn - callback that queues the associated kthread
* delayed work when the timer expires.
- * @__data: pointer to the data associated with the timer
+ * @t: pointer to the expired timer
*
* The format of the function is defined by struct timer_list.
* It should have been called from irqsafe timer with irq already off.
*/
-void kthread_delayed_work_timer_fn(unsigned long __data)
+void kthread_delayed_work_timer_fn(struct timer_list *t)
{
- struct kthread_delayed_work *dwork =
- (struct kthread_delayed_work *)__data;
+ struct kthread_delayed_work *dwork = from_timer(dwork, t, timer);
struct kthread_work *work = &dwork->work;
struct kthread_worker *worker = work->worker;
@@ -837,8 +843,7 @@ void __kthread_queue_delayed_work(struct kthread_worker *worker,
struct timer_list *timer = &dwork->timer;
struct kthread_work *work = &dwork->work;
- WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn ||
- timer->data != (unsigned long)dwork);
+ WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn);
/*
* If @delay is 0, queue @dwork->work immediately. This is for
@@ -1154,3 +1159,54 @@ void kthread_destroy_worker(struct kthread_worker *worker)
kfree(worker);
}
EXPORT_SYMBOL(kthread_destroy_worker);
+
+#ifdef CONFIG_BLK_CGROUP
+/**
+ * kthread_associate_blkcg - associate blkcg to current kthread
+ * @css: the cgroup info
+ *
+ * Current thread must be a kthread. The thread is running jobs on behalf of
+ * other threads. In some cases, we expect the jobs attach cgroup info of
+ * original threads instead of that of current thread. This function stores
+ * original thread's cgroup info in current kthread context for later
+ * retrieval.
+ */
+void kthread_associate_blkcg(struct cgroup_subsys_state *css)
+{
+ struct kthread *kthread;
+
+ if (!(current->flags & PF_KTHREAD))
+ return;
+ kthread = to_kthread(current);
+ if (!kthread)
+ return;
+
+ if (kthread->blkcg_css) {
+ css_put(kthread->blkcg_css);
+ kthread->blkcg_css = NULL;
+ }
+ if (css) {
+ css_get(css);
+ kthread->blkcg_css = css;
+ }
+}
+EXPORT_SYMBOL(kthread_associate_blkcg);
+
+/**
+ * kthread_blkcg - get associated blkcg css of current kthread
+ *
+ * Current thread must be a kthread.
+ */
+struct cgroup_subsys_state *kthread_blkcg(void)
+{
+ struct kthread *kthread;
+
+ if (current->flags & PF_KTHREAD) {
+ kthread = to_kthread(current);
+ if (kthread)
+ return kthread->blkcg_css;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(kthread_blkcg);
+#endif
diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile
index 2b8bdb1925da..b36ceda6488e 100644
--- a/kernel/livepatch/Makefile
+++ b/kernel/livepatch/Makefile
@@ -1,3 +1,3 @@
obj-$(CONFIG_LIVEPATCH) += livepatch.o
-livepatch-objs := core.o patch.o transition.o
+livepatch-objs := core.o patch.o shadow.o transition.o
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index b9628e43c78f..de9e45dca70f 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -54,11 +54,6 @@ static bool klp_is_module(struct klp_object *obj)
return obj->name;
}
-static bool klp_is_object_loaded(struct klp_object *obj)
-{
- return !obj->name || obj->mod;
-}
-
/* sets obj->mod if object is not vmlinux and module is found */
static void klp_find_object_module(struct klp_object *obj)
{
@@ -285,6 +280,11 @@ static int klp_write_object_relocations(struct module *pmod,
static int __klp_disable_patch(struct klp_patch *patch)
{
+ struct klp_object *obj;
+
+ if (WARN_ON(!patch->enabled))
+ return -EINVAL;
+
if (klp_transition_patch)
return -EBUSY;
@@ -295,6 +295,10 @@ static int __klp_disable_patch(struct klp_patch *patch)
klp_init_transition(patch, KLP_UNPATCHED);
+ klp_for_each_object(patch, obj)
+ if (obj->patched)
+ klp_pre_unpatch_callback(obj);
+
/*
* Enforce the order of the func->transition writes in
* klp_init_transition() and the TIF_PATCH_PENDING writes in
@@ -388,13 +392,18 @@ static int __klp_enable_patch(struct klp_patch *patch)
if (!klp_is_object_loaded(obj))
continue;
- ret = klp_patch_object(obj);
+ ret = klp_pre_patch_callback(obj);
if (ret) {
- pr_warn("failed to enable patch '%s'\n",
- patch->mod->name);
+ pr_warn("pre-patch callback failed for object '%s'\n",
+ klp_is_module(obj) ? obj->name : "vmlinux");
+ goto err;
+ }
- klp_cancel_transition();
- return ret;
+ ret = klp_patch_object(obj);
+ if (ret) {
+ pr_warn("failed to patch object '%s'\n",
+ klp_is_module(obj) ? obj->name : "vmlinux");
+ goto err;
}
}
@@ -403,6 +412,11 @@ static int __klp_enable_patch(struct klp_patch *patch)
patch->enabled = true;
return 0;
+err:
+ pr_warn("failed to enable patch '%s'\n", patch->mod->name);
+
+ klp_cancel_transition();
+ return ret;
}
/**
@@ -830,6 +844,47 @@ int klp_register_patch(struct klp_patch *patch)
}
EXPORT_SYMBOL_GPL(klp_register_patch);
+/*
+ * Remove parts of patches that touch a given kernel module. The list of
+ * patches processed might be limited. When limit is NULL, all patches
+ * will be handled.
+ */
+static void klp_cleanup_module_patches_limited(struct module *mod,
+ struct klp_patch *limit)
+{
+ struct klp_patch *patch;
+ struct klp_object *obj;
+
+ list_for_each_entry(patch, &klp_patches, list) {
+ if (patch == limit)
+ break;
+
+ klp_for_each_object(patch, obj) {
+ if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
+ continue;
+
+ /*
+ * Only unpatch the module if the patch is enabled or
+ * is in transition.
+ */
+ if (patch->enabled || patch == klp_transition_patch) {
+
+ if (patch != klp_transition_patch)
+ klp_pre_unpatch_callback(obj);
+
+ pr_notice("reverting patch '%s' on unloading module '%s'\n",
+ patch->mod->name, obj->mod->name);
+ klp_unpatch_object(obj);
+
+ klp_post_unpatch_callback(obj);
+ }
+
+ klp_free_object_loaded(obj);
+ break;
+ }
+ }
+}
+
int klp_module_coming(struct module *mod)
{
int ret;
@@ -871,13 +926,25 @@ int klp_module_coming(struct module *mod)
pr_notice("applying patch '%s' to loading module '%s'\n",
patch->mod->name, obj->mod->name);
+ ret = klp_pre_patch_callback(obj);
+ if (ret) {
+ pr_warn("pre-patch callback failed for object '%s'\n",
+ obj->name);
+ goto err;
+ }
+
ret = klp_patch_object(obj);
if (ret) {
pr_warn("failed to apply patch '%s' to module '%s' (%d)\n",
patch->mod->name, obj->mod->name, ret);
+
+ klp_post_unpatch_callback(obj);
goto err;
}
+ if (patch != klp_transition_patch)
+ klp_post_patch_callback(obj);
+
break;
}
}
@@ -894,7 +961,7 @@ err:
pr_warn("patch '%s' failed for module '%s', refusing to load module '%s'\n",
patch->mod->name, obj->mod->name, obj->mod->name);
mod->klp_alive = false;
- klp_free_object_loaded(obj);
+ klp_cleanup_module_patches_limited(mod, patch);
mutex_unlock(&klp_mutex);
return ret;
@@ -902,9 +969,6 @@ err:
void klp_module_going(struct module *mod)
{
- struct klp_patch *patch;
- struct klp_object *obj;
-
if (WARN_ON(mod->state != MODULE_STATE_GOING &&
mod->state != MODULE_STATE_COMING))
return;
@@ -917,25 +981,7 @@ void klp_module_going(struct module *mod)
*/
mod->klp_alive = false;
- list_for_each_entry(patch, &klp_patches, list) {
- klp_for_each_object(patch, obj) {
- if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
- continue;
-
- /*
- * Only unpatch the module if the patch is enabled or
- * is in transition.
- */
- if (patch->enabled || patch == klp_transition_patch) {
- pr_notice("reverting patch '%s' on unloading module '%s'\n",
- patch->mod->name, obj->mod->name);
- klp_unpatch_object(obj);
- }
-
- klp_free_object_loaded(obj);
- break;
- }
- }
+ klp_cleanup_module_patches_limited(mod, NULL);
mutex_unlock(&klp_mutex);
}
diff --git a/kernel/livepatch/core.h b/kernel/livepatch/core.h
index c74f24c47837..48a83d4364cf 100644
--- a/kernel/livepatch/core.h
+++ b/kernel/livepatch/core.h
@@ -1,6 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LIVEPATCH_CORE_H
#define _LIVEPATCH_CORE_H
+#include <linux/livepatch.h>
+
extern struct mutex klp_mutex;
+static inline bool klp_is_object_loaded(struct klp_object *obj)
+{
+ return !obj->name || obj->mod;
+}
+
+static inline int klp_pre_patch_callback(struct klp_object *obj)
+{
+ int ret = 0;
+
+ if (obj->callbacks.pre_patch)
+ ret = (*obj->callbacks.pre_patch)(obj);
+
+ obj->callbacks.post_unpatch_enabled = !ret;
+
+ return ret;
+}
+
+static inline void klp_post_patch_callback(struct klp_object *obj)
+{
+ if (obj->callbacks.post_patch)
+ (*obj->callbacks.post_patch)(obj);
+}
+
+static inline void klp_pre_unpatch_callback(struct klp_object *obj)
+{
+ if (obj->callbacks.pre_unpatch)
+ (*obj->callbacks.pre_unpatch)(obj);
+}
+
+static inline void klp_post_unpatch_callback(struct klp_object *obj)
+{
+ if (obj->callbacks.post_unpatch_enabled &&
+ obj->callbacks.post_unpatch)
+ (*obj->callbacks.post_unpatch)(obj);
+
+ obj->callbacks.post_unpatch_enabled = false;
+}
+
#endif /* _LIVEPATCH_CORE_H */
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
index 52c4e907c14b..82d584225dc6 100644
--- a/kernel/livepatch/patch.c
+++ b/kernel/livepatch/patch.c
@@ -28,6 +28,7 @@
#include <linux/slab.h>
#include <linux/bug.h>
#include <linux/printk.h>
+#include "core.h"
#include "patch.h"
#include "transition.h"
diff --git a/kernel/livepatch/patch.h b/kernel/livepatch/patch.h
index 0db227170c36..e72d8250d04b 100644
--- a/kernel/livepatch/patch.h
+++ b/kernel/livepatch/patch.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LIVEPATCH_PATCH_H
#define _LIVEPATCH_PATCH_H
diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c
new file mode 100644
index 000000000000..fdac27588d60
--- /dev/null
+++ b/kernel/livepatch/shadow.c
@@ -0,0 +1,277 @@
+/*
+ * shadow.c - Shadow Variables
+ *
+ * Copyright (C) 2014 Josh Poimboeuf <jpoimboe@redhat.com>
+ * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
+ * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * DOC: Shadow variable API concurrency notes:
+ *
+ * The shadow variable API provides a simple relationship between an
+ * <obj, id> pair and a pointer value. It is the responsibility of the
+ * caller to provide any mutual exclusion required of the shadow data.
+ *
+ * Once a shadow variable is attached to its parent object via the
+ * klp_shadow_*alloc() API calls, it is considered live: any subsequent
+ * call to klp_shadow_get() may then return the shadow variable's data
+ * pointer. Callers of klp_shadow_*alloc() should prepare shadow data
+ * accordingly.
+ *
+ * The klp_shadow_*alloc() API calls may allocate memory for new shadow
+ * variable structures. Their implementation does not call kmalloc
+ * inside any spinlocks, but API callers should pass GFP flags according
+ * to their specific needs.
+ *
+ * The klp_shadow_hash is an RCU-enabled hashtable and is safe against
+ * concurrent klp_shadow_free() and klp_shadow_get() operations.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/hashtable.h>
+#include <linux/slab.h>
+#include <linux/livepatch.h>
+
+static DEFINE_HASHTABLE(klp_shadow_hash, 12);
+
+/*
+ * klp_shadow_lock provides exclusive access to the klp_shadow_hash and
+ * the shadow variables it references.
+ */
+static DEFINE_SPINLOCK(klp_shadow_lock);
+
+/**
+ * struct klp_shadow - shadow variable structure
+ * @node: klp_shadow_hash hash table node
+ * @rcu_head: RCU is used to safely free this structure
+ * @obj: pointer to parent object
+ * @id: data identifier
+ * @data: data area
+ */
+struct klp_shadow {
+ struct hlist_node node;
+ struct rcu_head rcu_head;
+ void *obj;
+ unsigned long id;
+ char data[];
+};
+
+/**
+ * klp_shadow_match() - verify a shadow variable matches given <obj, id>
+ * @shadow: shadow variable to match
+ * @obj: pointer to parent object
+ * @id: data identifier
+ *
+ * Return: true if the shadow variable matches.
+ */
+static inline bool klp_shadow_match(struct klp_shadow *shadow, void *obj,
+ unsigned long id)
+{
+ return shadow->obj == obj && shadow->id == id;
+}
+
+/**
+ * klp_shadow_get() - retrieve a shadow variable data pointer
+ * @obj: pointer to parent object
+ * @id: data identifier
+ *
+ * Return: the shadow variable data element, NULL on failure.
+ */
+void *klp_shadow_get(void *obj, unsigned long id)
+{
+ struct klp_shadow *shadow;
+
+ rcu_read_lock();
+
+ hash_for_each_possible_rcu(klp_shadow_hash, shadow, node,
+ (unsigned long)obj) {
+
+ if (klp_shadow_match(shadow, obj, id)) {
+ rcu_read_unlock();
+ return shadow->data;
+ }
+ }
+
+ rcu_read_unlock();
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(klp_shadow_get);
+
+static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data,
+ size_t size, gfp_t gfp_flags, bool warn_on_exist)
+{
+ struct klp_shadow *new_shadow;
+ void *shadow_data;
+ unsigned long flags;
+
+ /* Check if the shadow variable already exists */
+ shadow_data = klp_shadow_get(obj, id);
+ if (shadow_data)
+ goto exists;
+
+ /* Allocate a new shadow variable for use inside the lock below */
+ new_shadow = kzalloc(size + sizeof(*new_shadow), gfp_flags);
+ if (!new_shadow)
+ return NULL;
+
+ new_shadow->obj = obj;
+ new_shadow->id = id;
+
+ /* Initialize the shadow variable if data provided */
+ if (data)
+ memcpy(new_shadow->data, data, size);
+
+ /* Look for <obj, id> again under the lock */
+ spin_lock_irqsave(&klp_shadow_lock, flags);
+ shadow_data = klp_shadow_get(obj, id);
+ if (unlikely(shadow_data)) {
+ /*
+ * Shadow variable was found, throw away speculative
+ * allocation.
+ */
+ spin_unlock_irqrestore(&klp_shadow_lock, flags);
+ kfree(new_shadow);
+ goto exists;
+ }
+
+ /* No <obj, id> found, so attach the newly allocated one */
+ hash_add_rcu(klp_shadow_hash, &new_shadow->node,
+ (unsigned long)new_shadow->obj);
+ spin_unlock_irqrestore(&klp_shadow_lock, flags);
+
+ return new_shadow->data;
+
+exists:
+ if (warn_on_exist) {
+ WARN(1, "Duplicate shadow variable <%p, %lx>\n", obj, id);
+ return NULL;
+ }
+
+ return shadow_data;
+}
+
+/**
+ * klp_shadow_alloc() - allocate and add a new shadow variable
+ * @obj: pointer to parent object
+ * @id: data identifier
+ * @data: pointer to data to attach to parent
+ * @size: size of attached data
+ * @gfp_flags: GFP mask for allocation
+ *
+ * Allocates @size bytes for new shadow variable data using @gfp_flags
+ * and copies @size bytes from @data into the new shadow variable's own
+ * data space. If @data is NULL, @size bytes are still allocated, but
+ * no copy is performed. The new shadow variable is then added to the
+ * global hashtable.
+ *
+ * If an existing <obj, id> shadow variable can be found, this routine
+ * will issue a WARN, exit early and return NULL.
+ *
+ * Return: the shadow variable data element, NULL on duplicate or
+ * failure.
+ */
+void *klp_shadow_alloc(void *obj, unsigned long id, void *data,
+ size_t size, gfp_t gfp_flags)
+{
+ return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, true);
+}
+EXPORT_SYMBOL_GPL(klp_shadow_alloc);
+
+/**
+ * klp_shadow_get_or_alloc() - get existing or allocate a new shadow variable
+ * @obj: pointer to parent object
+ * @id: data identifier
+ * @data: pointer to data to attach to parent
+ * @size: size of attached data
+ * @gfp_flags: GFP mask for allocation
+ *
+ * Returns a pointer to existing shadow data if an <obj, id> shadow
+ * variable is already present. Otherwise, it creates a new shadow
+ * variable like klp_shadow_alloc().
+ *
+ * This function guarantees that only one shadow variable exists with
+ * the given @id for the given @obj. It also guarantees that the shadow
+ * variable will be initialized by the given @data only when it did not
+ * exist before.
+ *
+ * Return: the shadow variable data element, NULL on failure.
+ */
+void *klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data,
+ size_t size, gfp_t gfp_flags)
+{
+ return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, false);
+}
+EXPORT_SYMBOL_GPL(klp_shadow_get_or_alloc);
+
+/**
+ * klp_shadow_free() - detach and free a <obj, id> shadow variable
+ * @obj: pointer to parent object
+ * @id: data identifier
+ *
+ * This function releases the memory for this <obj, id> shadow variable
+ * instance, callers should stop referencing it accordingly.
+ */
+void klp_shadow_free(void *obj, unsigned long id)
+{
+ struct klp_shadow *shadow;
+ unsigned long flags;
+
+ spin_lock_irqsave(&klp_shadow_lock, flags);
+
+ /* Delete <obj, id> from hash */
+ hash_for_each_possible(klp_shadow_hash, shadow, node,
+ (unsigned long)obj) {
+
+ if (klp_shadow_match(shadow, obj, id)) {
+ hash_del_rcu(&shadow->node);
+ kfree_rcu(shadow, rcu_head);
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&klp_shadow_lock, flags);
+}
+EXPORT_SYMBOL_GPL(klp_shadow_free);
+
+/**
+ * klp_shadow_free_all() - detach and free all <*, id> shadow variables
+ * @id: data identifier
+ *
+ * This function releases the memory for all <*, id> shadow variable
+ * instances, callers should stop referencing them accordingly.
+ */
+void klp_shadow_free_all(unsigned long id)
+{
+ struct klp_shadow *shadow;
+ unsigned long flags;
+ int i;
+
+ spin_lock_irqsave(&klp_shadow_lock, flags);
+
+ /* Delete all <*, id> from hash */
+ hash_for_each(klp_shadow_hash, i, shadow, node) {
+ if (klp_shadow_match(shadow, shadow->obj, id)) {
+ hash_del_rcu(&shadow->node);
+ kfree_rcu(shadow, rcu_head);
+ }
+ }
+
+ spin_unlock_irqrestore(&klp_shadow_lock, flags);
+}
+EXPORT_SYMBOL_GPL(klp_shadow_free_all);
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index b004a1fb6032..56add6327736 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -82,6 +82,10 @@ static void klp_complete_transition(void)
unsigned int cpu;
bool immediate_func = false;
+ pr_debug("'%s': completing %s transition\n",
+ klp_transition_patch->mod->name,
+ klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
+
if (klp_target_state == KLP_UNPATCHED) {
/*
* All tasks have transitioned to KLP_UNPATCHED so we can now
@@ -109,9 +113,6 @@ static void klp_complete_transition(void)
}
}
- if (klp_target_state == KLP_UNPATCHED && !immediate_func)
- module_put(klp_transition_patch->mod);
-
/* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */
if (klp_target_state == KLP_PATCHED)
klp_synchronize_transition();
@@ -130,6 +131,27 @@ static void klp_complete_transition(void)
}
done:
+ klp_for_each_object(klp_transition_patch, obj) {
+ if (!klp_is_object_loaded(obj))
+ continue;
+ if (klp_target_state == KLP_PATCHED)
+ klp_post_patch_callback(obj);
+ else if (klp_target_state == KLP_UNPATCHED)
+ klp_post_unpatch_callback(obj);
+ }
+
+ pr_notice("'%s': %s complete\n", klp_transition_patch->mod->name,
+ klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
+
+ /*
+ * See complementary comment in __klp_enable_patch() for why we
+ * keep the module reference for immediate patches.
+ */
+ if (!klp_transition_patch->immediate && !immediate_func &&
+ klp_target_state == KLP_UNPATCHED) {
+ module_put(klp_transition_patch->mod);
+ }
+
klp_target_state = KLP_UNDEFINED;
klp_transition_patch = NULL;
}
@@ -145,6 +167,9 @@ void klp_cancel_transition(void)
if (WARN_ON_ONCE(klp_target_state != KLP_PATCHED))
return;
+ pr_debug("'%s': canceling patching transition, going to unpatch\n",
+ klp_transition_patch->mod->name);
+
klp_target_state = KLP_UNPATCHED;
klp_complete_transition();
}
@@ -408,9 +433,6 @@ void klp_try_complete_transition(void)
}
success:
- pr_notice("'%s': %s complete\n", klp_transition_patch->mod->name,
- klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
-
/* we're done, now cleanup the data structures */
klp_complete_transition();
}
@@ -426,7 +448,8 @@ void klp_start_transition(void)
WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED);
- pr_notice("'%s': %s...\n", klp_transition_patch->mod->name,
+ pr_notice("'%s': starting %s transition\n",
+ klp_transition_patch->mod->name,
klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
/*
@@ -482,6 +505,9 @@ void klp_init_transition(struct klp_patch *patch, int state)
*/
klp_target_state = state;
+ pr_debug("'%s': initializing %s transition\n", patch->mod->name,
+ klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
+
/*
* If the patch can be applied or reverted immediately, skip the
* per-task transitions.
@@ -547,6 +573,11 @@ void klp_reverse_transition(void)
unsigned int cpu;
struct task_struct *g, *task;
+ pr_debug("'%s': reversing transition from %s\n",
+ klp_transition_patch->mod->name,
+ klp_target_state == KLP_PATCHED ? "patching to unpatching" :
+ "unpatching to patching");
+
klp_transition_patch->enabled = !klp_transition_patch->enabled;
klp_target_state = !klp_target_state;
diff --git a/kernel/livepatch/transition.h b/kernel/livepatch/transition.h
index ce09b326546c..0f6e27c481f9 100644
--- a/kernel/livepatch/transition.h
+++ b/kernel/livepatch/transition.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LIVEPATCH_TRANSITION_H
#define _LIVEPATCH_TRANSITION_H
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 760158d9d98d..392c7f23af76 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
# Any varying coverage in these files is non-deterministic
# and is generally not a function of system call inputs.
KCOV_INSTRUMENT := n
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 44c8d0d17170..9776da8db180 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -47,7 +47,6 @@
#include <linux/stringify.h>
#include <linux/bitops.h>
#include <linux/gfp.h>
-#include <linux/kmemcheck.h>
#include <linux/random.h>
#include <linux/jhash.h>
@@ -76,6 +75,19 @@ module_param(lock_stat, int, 0644);
#define lock_stat 0
#endif
+#ifdef CONFIG_BOOTPARAM_LOCKDEP_CROSSRELEASE_FULLSTACK
+static int crossrelease_fullstack = 1;
+#else
+static int crossrelease_fullstack;
+#endif
+static int __init allow_crossrelease_fullstack(char *str)
+{
+ crossrelease_fullstack = 1;
+ return 0;
+}
+
+early_param("crossrelease_fullstack", allow_crossrelease_fullstack);
+
/*
* lockdep_lock: protects the lockdep graph, the hashes and the
* class/list/hash allocators.
@@ -1873,10 +1885,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
struct held_lock *next, int distance, struct stack_trace *trace,
int (*save)(struct stack_trace *trace))
{
+ struct lock_list *uninitialized_var(target_entry);
struct lock_list *entry;
- int ret;
struct lock_list this;
- struct lock_list *uninitialized_var(target_entry);
+ int ret;
/*
* Prove that the new <prev> -> <next> dependency would not
@@ -1890,8 +1902,17 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
this.class = hlock_class(next);
this.parent = NULL;
ret = check_noncircular(&this, hlock_class(prev), &target_entry);
- if (unlikely(!ret))
+ if (unlikely(!ret)) {
+ if (!trace->entries) {
+ /*
+ * If @save fails here, the printing might trigger
+ * a WARN but because of the !nr_entries it should
+ * not do bad things.
+ */
+ save(trace);
+ }
return print_circular_bug(&this, target_entry, next, prev, trace);
+ }
else if (unlikely(ret < 0))
return print_bfs_bug(ret);
@@ -1938,7 +1959,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
return print_bfs_bug(ret);
- if (save && !save(trace))
+ if (!trace->entries && !save(trace))
return 0;
/*
@@ -1958,20 +1979,6 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
if (!ret)
return 0;
- /*
- * Debugging printouts:
- */
- if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) {
- graph_unlock();
- printk("\n new dependency: ");
- print_lock_name(hlock_class(prev));
- printk(KERN_CONT " => ");
- print_lock_name(hlock_class(next));
- printk(KERN_CONT "\n");
- dump_stack();
- if (!graph_lock())
- return 0;
- }
return 2;
}
@@ -1986,8 +1993,12 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
{
int depth = curr->lockdep_depth;
struct held_lock *hlock;
- struct stack_trace trace;
- int (*save)(struct stack_trace *trace) = save_trace;
+ struct stack_trace trace = {
+ .nr_entries = 0,
+ .max_entries = 0,
+ .entries = NULL,
+ .skip = 0,
+ };
/*
* Debugging checks.
@@ -2018,18 +2029,11 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
*/
if (hlock->read != 2 && hlock->check) {
int ret = check_prev_add(curr, hlock, next,
- distance, &trace, save);
+ distance, &trace, save_trace);
if (!ret)
return 0;
/*
- * Stop saving stack_trace if save_trace() was
- * called at least once:
- */
- if (save && ret == 2)
- save = NULL;
-
- /*
* Stop after the first non-trylock entry,
* as non-trylock entries have added their
* own direct dependencies already, so this
@@ -3233,8 +3237,6 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name,
{
int i;
- kmemcheck_mark_initialized(lock, sizeof(*lock));
-
for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
lock->class_cache[i] = NULL;
@@ -4871,8 +4873,14 @@ static void add_xhlock(struct held_lock *hlock)
xhlock->trace.nr_entries = 0;
xhlock->trace.max_entries = MAX_XHLOCK_TRACE_ENTRIES;
xhlock->trace.entries = xhlock->trace_entries;
- xhlock->trace.skip = 3;
- save_stack_trace(&xhlock->trace);
+
+ if (crossrelease_fullstack) {
+ xhlock->trace.skip = 3;
+ save_stack_trace(&xhlock->trace);
+ } else {
+ xhlock->trace.nr_entries = 1;
+ xhlock->trace.entries[0] = hlock->acquire_ip;
+ }
}
static inline int same_context_xhlock(struct hist_lock *xhlock)
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index 1da4669d57a7..d459d624ba2a 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* kernel/lockdep_internals.h
*
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index 68d9e267ccd4..ad69bbc9bd28 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* kernel/lockdep_proc.c
*
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 6a385aabcce7..f046b7ce9dd6 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* MCS lock defines
*
diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h
index 4174417d5309..1edd3f45a4ec 100644
--- a/kernel/locking/mutex-debug.h
+++ b/kernel/locking/mutex-debug.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Mutexes: blocking mutual exclusion locks
*
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index 6ebc1902f779..1c2287d3fa71 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Mutexes: blocking mutual exclusion locks
*
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index a74ee6abd039..6ef600aa0f47 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/osq_lock.h>
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index 2655f26ec882..c7471c3fb798 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -23,49 +23,11 @@
#include <linux/spinlock.h>
#include <asm/qrwlock.h>
-/*
- * This internal data structure is used for optimizing access to some of
- * the subfields within the atomic_t cnts.
- */
-struct __qrwlock {
- union {
- atomic_t cnts;
- struct {
-#ifdef __LITTLE_ENDIAN
- u8 wmode; /* Writer mode */
- u8 rcnts[3]; /* Reader counts */
-#else
- u8 rcnts[3]; /* Reader counts */
- u8 wmode; /* Writer mode */
-#endif
- };
- };
- arch_spinlock_t lock;
-};
-
-/**
- * rspin_until_writer_unlock - inc reader count & spin until writer is gone
- * @lock : Pointer to queue rwlock structure
- * @writer: Current queue rwlock writer status byte
- *
- * In interrupt context or at the head of the queue, the reader will just
- * increment the reader count & wait until the writer releases the lock.
- */
-static __always_inline void
-rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
-{
- while ((cnts & _QW_WMASK) == _QW_LOCKED) {
- cpu_relax();
- cnts = atomic_read_acquire(&lock->cnts);
- }
-}
-
/**
* queued_read_lock_slowpath - acquire read lock of a queue rwlock
* @lock: Pointer to queue rwlock structure
- * @cnts: Current qrwlock lock value
*/
-void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
+void queued_read_lock_slowpath(struct qrwlock *lock)
{
/*
* Readers come here when they cannot get the lock without waiting
@@ -73,13 +35,11 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
if (unlikely(in_interrupt())) {
/*
* Readers in interrupt context will get the lock immediately
- * if the writer is just waiting (not holding the lock yet).
- * The rspin_until_writer_unlock() function returns immediately
- * in this case. Otherwise, they will spin (with ACQUIRE
- * semantics) until the lock is available without waiting in
- * the queue.
+ * if the writer is just waiting (not holding the lock yet),
+ * so spin with ACQUIRE semantics until the lock is available
+ * without waiting in the queue.
*/
- rspin_until_writer_unlock(lock, cnts);
+ atomic_cond_read_acquire(&lock->cnts, !(VAL & _QW_LOCKED));
return;
}
atomic_sub(_QR_BIAS, &lock->cnts);
@@ -88,14 +48,14 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
* Put the reader into the wait queue
*/
arch_spin_lock(&lock->wait_lock);
+ atomic_add(_QR_BIAS, &lock->cnts);
/*
* The ACQUIRE semantics of the following spinning code ensure
* that accesses can't leak upwards out of our subsequent critical
* section in the case that the lock is currently held for write.
*/
- cnts = atomic_fetch_add_acquire(_QR_BIAS, &lock->cnts);
- rspin_until_writer_unlock(lock, cnts);
+ atomic_cond_read_acquire(&lock->cnts, !(VAL & _QW_LOCKED));
/*
* Signal the next one in queue to become queue head
@@ -110,8 +70,6 @@ EXPORT_SYMBOL(queued_read_lock_slowpath);
*/
void queued_write_lock_slowpath(struct qrwlock *lock)
{
- u32 cnts;
-
/* Put the writer into the wait queue */
arch_spin_lock(&lock->wait_lock);
@@ -120,30 +78,14 @@ void queued_write_lock_slowpath(struct qrwlock *lock)
(atomic_cmpxchg_acquire(&lock->cnts, 0, _QW_LOCKED) == 0))
goto unlock;
- /*
- * Set the waiting flag to notify readers that a writer is pending,
- * or wait for a previous writer to go away.
- */
- for (;;) {
- struct __qrwlock *l = (struct __qrwlock *)lock;
-
- if (!READ_ONCE(l->wmode) &&
- (cmpxchg_relaxed(&l->wmode, 0, _QW_WAITING) == 0))
- break;
+ /* Set the waiting flag to notify readers that a writer is pending */
+ atomic_add(_QW_WAITING, &lock->cnts);
- cpu_relax();
- }
-
- /* When no more readers, set the locked flag */
- for (;;) {
- cnts = atomic_read(&lock->cnts);
- if ((cnts == _QW_WAITING) &&
- (atomic_cmpxchg_acquire(&lock->cnts, _QW_WAITING,
- _QW_LOCKED) == _QW_WAITING))
- break;
-
- cpu_relax();
- }
+ /* When no more readers or writers, set the locked flag */
+ do {
+ atomic_cond_read_acquire(&lock->cnts, VAL == _QW_WAITING);
+ } while (atomic_cmpxchg_relaxed(&lock->cnts, _QW_WAITING,
+ _QW_LOCKED) != _QW_WAITING);
unlock:
arch_spin_unlock(&lock->wait_lock);
}
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 43555681c40b..6ee477765e6c 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _GEN_PV_LOCK_SLOWPATH
#error "do not include this file"
#endif
@@ -60,21 +61,50 @@ struct pv_node {
#include "qspinlock_stat.h"
/*
+ * Hybrid PV queued/unfair lock
+ *
* By replacing the regular queued_spin_trylock() with the function below,
* it will be called once when a lock waiter enter the PV slowpath before
- * being queued. By allowing one lock stealing attempt here when the pending
- * bit is off, it helps to reduce the performance impact of lock waiter
- * preemption without the drawback of lock starvation.
+ * being queued.
+ *
+ * The pending bit is set by the queue head vCPU of the MCS wait queue in
+ * pv_wait_head_or_lock() to signal that it is ready to spin on the lock.
+ * When that bit becomes visible to the incoming waiters, no lock stealing
+ * is allowed. The function will return immediately to make the waiters
+ * enter the MCS wait queue. So lock starvation shouldn't happen as long
+ * as the queued mode vCPUs are actively running to set the pending bit
+ * and hence disabling lock stealing.
+ *
+ * When the pending bit isn't set, the lock waiters will stay in the unfair
+ * mode spinning on the lock unless the MCS wait queue is empty. In this
+ * case, the lock waiters will enter the queued mode slowpath trying to
+ * become the queue head and set the pending bit.
+ *
+ * This hybrid PV queued/unfair lock combines the best attributes of a
+ * queued lock (no lock starvation) and an unfair lock (good performance
+ * on not heavily contended locks).
*/
-#define queued_spin_trylock(l) pv_queued_spin_steal_lock(l)
-static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
+#define queued_spin_trylock(l) pv_hybrid_queued_unfair_trylock(l)
+static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock)
{
struct __qspinlock *l = (void *)lock;
- if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
- (cmpxchg_acquire(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
- qstat_inc(qstat_pv_lock_stealing, true);
- return true;
+ /*
+ * Stay in unfair lock mode as long as queued mode waiters are
+ * present in the MCS wait queue but the pending bit isn't set.
+ */
+ for (;;) {
+ int val = atomic_read(&lock->val);
+
+ if (!(val & _Q_LOCKED_PENDING_MASK) &&
+ (cmpxchg_acquire(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
+ qstat_inc(qstat_pv_lock_stealing, true);
+ return true;
+ }
+ if (!(val & _Q_TAIL_MASK) || (val & _Q_PENDING_MASK))
+ break;
+
+ cpu_relax();
}
return false;
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index f4a74e78d467..fd4fe1f5b458 100644
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* RT-Mutexes: blocking mutual exclusion locks with PI support
*
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
index 5078c6ddf4a5..fc549713bba3 100644
--- a/kernel/locking/rtmutex-debug.h
+++ b/kernel/locking/rtmutex-debug.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* RT-Mutexes: blocking mutual exclusion locks with PI support
*
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
index 5c253caffe91..732f96abf462 100644
--- a/kernel/locking/rtmutex.h
+++ b/kernel/locking/rtmutex.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* RT-Mutexes: blocking mutual exclusion locks with PI support
*
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 7453be0485a5..124e98ca0b17 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* RT Mutexes: blocking mutual exclusion locks with PI support
*
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 0848634c5512..a7ffb2a96ede 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/* rwsem-spinlock.c: R/W semaphores: contention handling functions for
* generic spinlock implementation
*
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 02f660666ab8..e795908f3607 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/* rwsem.c: R/W semaphores: contention handling functions
*
* Written by David Howells (dhowells@redhat.com).
@@ -613,6 +614,33 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
DEFINE_WAKE_Q(wake_q);
/*
+ * __rwsem_down_write_failed_common(sem)
+ * rwsem_optimistic_spin(sem)
+ * osq_unlock(sem->osq)
+ * ...
+ * atomic_long_add_return(&sem->count)
+ *
+ * - VS -
+ *
+ * __up_write()
+ * if (atomic_long_sub_return_release(&sem->count) < 0)
+ * rwsem_wake(sem)
+ * osq_is_locked(&sem->osq)
+ *
+ * And __up_write() must observe !osq_is_locked() when it observes the
+ * atomic_long_add_return() in order to not miss a wakeup.
+ *
+ * This boils down to:
+ *
+ * [S.rel] X = 1 [RmW] r0 = (Y += 0)
+ * MB RMB
+ * [RmW] Y += 1 [L] r1 = X
+ *
+ * exists (r0=1 /\ r1=0)
+ */
+ smp_rmb();
+
+ /*
* If a spinner is present, it is not necessary to do the wakeup.
* Try to do wakeup only if the trylock succeeds to minimize
* spinlock contention which may introduce too much delay in the
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 4d48b1c4870d..f549c552dbf1 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/* kernel/rwsem.c: R/W semaphores, public implementation
*
* Written by David Howells (dhowells@redhat.com).
@@ -28,6 +29,22 @@ void __sched down_read(struct rw_semaphore *sem)
EXPORT_SYMBOL(down_read);
+int __sched down_read_killable(struct rw_semaphore *sem)
+{
+ might_sleep();
+ rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
+
+ if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) {
+ rwsem_release(&sem->dep_map, 1, _RET_IP_);
+ return -EINTR;
+ }
+
+ rwsem_set_reader_owned(sem);
+ return 0;
+}
+
+EXPORT_SYMBOL(down_read_killable);
+
/*
* trylock for reading -- returns 1 if successful, 0 if contention
*/
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index a699f4048ba1..a883b8f1fdc6 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* The owner field of the rw_semaphore structure will be set to
* RWSEM_READ_OWNED when a reader grabs the lock. A writer will clear
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 4b082b5cac9e..1fd1a7543cdd 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (2004) Linus Torvalds
*
@@ -29,11 +30,10 @@
#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
/*
* The __lock_function inlines are taken from
- * include/linux/spinlock_api_smp.h
+ * spinlock : include/linux/spinlock_api_smp.h
+ * rwlock : include/linux/rwlock_api_smp.h
*/
#else
-#define raw_read_can_lock(l) read_can_lock(l)
-#define raw_write_can_lock(l) write_can_lock(l)
/*
* Some architectures can relax in favour of the CPU owning the lock.
@@ -68,7 +68,7 @@ void __lockfunc __raw_##op##_lock(locktype##_t *lock) \
\
if (!(lock)->break_lock) \
(lock)->break_lock = 1; \
- while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\
+ while ((lock)->break_lock) \
arch_##op##_relax(&lock->raw_lock); \
} \
(lock)->break_lock = 0; \
@@ -88,7 +88,7 @@ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \
\
if (!(lock)->break_lock) \
(lock)->break_lock = 1; \
- while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\
+ while ((lock)->break_lock) \
arch_##op##_relax(&lock->raw_lock); \
} \
(lock)->break_lock = 0; \
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 6bcbfbf1a8fd..403ab9cdb949 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -350,7 +350,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
pgprot_t pgprot = PAGE_KERNEL;
struct dev_pagemap *pgmap;
struct page_map *page_map;
- int error, nid, is_ram;
+ int error, nid, is_ram, i = 0;
align_start = res->start & ~(SECTION_SIZE - 1);
align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
@@ -448,6 +448,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
list_del(&page->lru);
page->pgmap = pgmap;
percpu_ref_get(ref);
+ if (!(++i % 1024))
+ cond_resched();
}
devres_add(dev, page_map);
return __va(res->start);
diff --git a/kernel/module.c b/kernel/module.c
index de66ec825992..f0411a271765 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -278,6 +278,16 @@ static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE);
module_param(sig_enforce, bool_enable_only, 0644);
#endif /* !CONFIG_MODULE_SIG_FORCE */
+/*
+ * Export sig_enforce kernel cmdline parameter to allow other subsystems rely
+ * on that instead of directly to CONFIG_MODULE_SIG_FORCE config.
+ */
+bool is_module_sig_enforced(void)
+{
+ return sig_enforce;
+}
+EXPORT_SYMBOL(is_module_sig_enforced);
+
/* Block module loading/unloading? */
int modules_disabled = 0;
core_param(nomodule, modules_disabled, bint, 0);
@@ -837,10 +847,8 @@ static int add_module_usage(struct module *a, struct module *b)
pr_debug("Allocating new usage for %s.\n", a->name);
use = kmalloc(sizeof(*use), GFP_ATOMIC);
- if (!use) {
- pr_warn("%s: out of memory loading\n", a->name);
+ if (!use)
return -ENOMEM;
- }
use->source = a;
use->target = b;
@@ -1516,7 +1524,7 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info)
sattr->mattr.show = module_sect_show;
sattr->mattr.store = NULL;
sattr->mattr.attr.name = sattr->name;
- sattr->mattr.attr.mode = S_IRUGO;
+ sattr->mattr.attr.mode = S_IRUSR;
*(gattr++) = &(sattr++)->mattr.attr;
}
*gattr = NULL;
@@ -3473,6 +3481,8 @@ static noinline int do_init_module(struct module *mod)
if (!mod->async_probe_requested && (current->flags & PF_USED_ASYNC))
async_synchronize_full();
+ ftrace_free_mem(mod, mod->init_layout.base, mod->init_layout.base +
+ mod->init_layout.size);
mutex_lock(&module_mutex);
/* Drop initial reference. */
module_put(mod);
@@ -4147,6 +4157,7 @@ static int m_show(struct seq_file *m, void *p)
{
struct module *mod = list_entry(p, struct module, list);
char buf[MODULE_FLAGS_BUF_SIZE];
+ unsigned long value;
/* We always ignore unformed modules. */
if (mod->state == MODULE_STATE_UNFORMED)
@@ -4162,7 +4173,8 @@ static int m_show(struct seq_file *m, void *p)
mod->state == MODULE_STATE_COMING ? "Loading" :
"Live");
/* Used by oprofile and other similar tools. */
- seq_printf(m, " 0x%pK", mod->core_layout.base);
+ value = m->private ? 0 : (unsigned long)mod->core_layout.base;
+ seq_printf(m, " 0x" KALLSYM_FMT, value);
/* Taints info */
if (mod->taints)
@@ -4184,9 +4196,23 @@ static const struct seq_operations modules_op = {
.show = m_show
};
+/*
+ * This also sets the "private" pointer to non-NULL if the
+ * kernel pointers should be hidden (so you can just test
+ * "m->private" to see if you should keep the values private).
+ *
+ * We use the same logic as for /proc/kallsyms.
+ */
static int modules_open(struct inode *inode, struct file *file)
{
- return seq_open(file, &modules_op);
+ int err = seq_open(file, &modules_op);
+
+ if (!err) {
+ struct seq_file *m = file->private_data;
+ m->private = kallsyms_show_value() ? NULL : (void *)8ul;
+ }
+
+ return 0;
}
static const struct file_operations proc_modules_operations = {
diff --git a/kernel/padata.c b/kernel/padata.c
index 868f947166d7..57c0074d50cc 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -131,6 +131,7 @@ int padata_do_parallel(struct padata_instance *pinst,
padata->cb_cpu = cb_cpu;
target_cpu = padata_cpu_hash(pd);
+ padata->cpu = target_cpu;
queue = per_cpu_ptr(pd->pqueue, target_cpu);
spin_lock(&queue->parallel.lock);
@@ -275,11 +276,51 @@ static void padata_reorder(struct parallel_data *pd)
return;
}
-static void padata_reorder_timer(unsigned long arg)
+static void invoke_padata_reorder(struct work_struct *work)
{
- struct parallel_data *pd = (struct parallel_data *)arg;
+ struct padata_parallel_queue *pqueue;
+ struct parallel_data *pd;
+ local_bh_disable();
+ pqueue = container_of(work, struct padata_parallel_queue, reorder_work);
+ pd = pqueue->pd;
padata_reorder(pd);
+ local_bh_enable();
+}
+
+static void padata_reorder_timer(struct timer_list *t)
+{
+ struct parallel_data *pd = from_timer(pd, t, timer);
+ unsigned int weight;
+ int target_cpu, cpu;
+
+ cpu = get_cpu();
+
+ /* We don't lock pd here to not interfere with parallel processing
+ * padata_reorder() calls on other CPUs. We just need any CPU out of
+ * the cpumask.pcpu set. It would be nice if it's the right one but
+ * it doesn't matter if we're off to the next one by using an outdated
+ * pd->processed value.
+ */
+ weight = cpumask_weight(pd->cpumask.pcpu);
+ target_cpu = padata_index_to_cpu(pd, pd->processed % weight);
+
+ /* ensure to call the reorder callback on the correct CPU */
+ if (cpu != target_cpu) {
+ struct padata_parallel_queue *pqueue;
+ struct padata_instance *pinst;
+
+ /* The timer function is serialized wrt itself -- no locking
+ * needed.
+ */
+ pinst = pd->pinst;
+ pqueue = per_cpu_ptr(pd->pqueue, target_cpu);
+ queue_work_on(target_cpu, pinst->wq, &pqueue->reorder_work);
+ } else {
+ padata_reorder(pd);
+ }
+
+ put_cpu();
}
static void padata_serial_worker(struct work_struct *serial_work)
@@ -323,10 +364,21 @@ void padata_do_serial(struct padata_priv *padata)
int cpu;
struct padata_parallel_queue *pqueue;
struct parallel_data *pd;
+ int reorder_via_wq = 0;
pd = padata->pd;
cpu = get_cpu();
+
+ /* We need to run on the same CPU padata_do_parallel(.., padata, ..)
+ * was called on -- or, at least, enqueue the padata object into the
+ * correct per-cpu queue.
+ */
+ if (cpu != padata->cpu) {
+ reorder_via_wq = 1;
+ cpu = padata->cpu;
+ }
+
pqueue = per_cpu_ptr(pd->pqueue, cpu);
spin_lock(&pqueue->reorder.lock);
@@ -336,7 +388,13 @@ void padata_do_serial(struct padata_priv *padata)
put_cpu();
- padata_reorder(pd);
+ /* If we're running on the wrong CPU, call padata_reorder() via a
+ * kernel worker.
+ */
+ if (reorder_via_wq)
+ queue_work_on(cpu, pd->pinst->wq, &pqueue->reorder_work);
+ else
+ padata_reorder(pd);
}
EXPORT_SYMBOL(padata_do_serial);
@@ -384,8 +442,14 @@ static void padata_init_pqueues(struct parallel_data *pd)
struct padata_parallel_queue *pqueue;
cpu_index = 0;
- for_each_cpu(cpu, pd->cpumask.pcpu) {
+ for_each_possible_cpu(cpu) {
pqueue = per_cpu_ptr(pd->pqueue, cpu);
+
+ if (!cpumask_test_cpu(cpu, pd->cpumask.pcpu)) {
+ pqueue->cpu_index = -1;
+ continue;
+ }
+
pqueue->pd = pd;
pqueue->cpu_index = cpu_index;
cpu_index++;
@@ -393,6 +457,7 @@ static void padata_init_pqueues(struct parallel_data *pd)
__padata_list_init(&pqueue->reorder);
__padata_list_init(&pqueue->parallel);
INIT_WORK(&pqueue->work, padata_parallel_worker);
+ INIT_WORK(&pqueue->reorder_work, invoke_padata_reorder);
atomic_set(&pqueue->num_obj, 0);
}
}
@@ -420,7 +485,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
padata_init_pqueues(pd);
padata_init_squeues(pd);
- setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
+ timer_setup(&pd->timer, padata_reorder_timer, 0);
atomic_set(&pd->seq_nr, -1);
atomic_set(&pd->reorder_objects, 0);
atomic_set(&pd->refcnt, 0);
diff --git a/kernel/panic.c b/kernel/panic.c
index bdd18afa19a4..2cfef408fec9 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -27,6 +27,8 @@
#include <linux/console.h>
#include <linux/bug.h>
#include <linux/ratelimit.h>
+#include <linux/debugfs.h>
+#include <asm/sections.h>
#define PANIC_TIMER_STEP 100
#define PANIC_BLINK_SPD 18
@@ -322,6 +324,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
{ 'E', ' ', true }, /* TAINT_UNSIGNED_MODULE */
{ 'L', ' ', false }, /* TAINT_SOFTLOCKUP */
{ 'K', ' ', true }, /* TAINT_LIVEPATCH */
+ { 'X', ' ', true }, /* TAINT_AUX */
};
/**
@@ -343,6 +346,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
* 'E' - Unsigned module has been loaded.
* 'L' - A soft lockup has previously occurred.
* 'K' - Kernel has been live patched.
+ * 'X' - Auxiliary taint, for distros' use.
*
* The string is overwritten by the next call to print_tainted().
*/
@@ -518,7 +522,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
{
disable_trace_on_warning();
- pr_warn("------------[ cut here ]------------\n");
+ if (args)
+ pr_warn(CUT_HERE);
if (file)
pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n",
@@ -582,9 +587,49 @@ EXPORT_SYMBOL(warn_slowpath_fmt_taint);
void warn_slowpath_null(const char *file, int line)
{
+ pr_warn(CUT_HERE);
__warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, NULL);
}
EXPORT_SYMBOL(warn_slowpath_null);
+#else
+void __warn_printk(const char *fmt, ...)
+{
+ va_list args;
+
+ pr_warn(CUT_HERE);
+
+ va_start(args, fmt);
+ vprintk(fmt, args);
+ va_end(args);
+}
+EXPORT_SYMBOL(__warn_printk);
+#endif
+
+#ifdef CONFIG_BUG
+
+/* Support resetting WARN*_ONCE state */
+
+static int clear_warn_once_set(void *data, u64 val)
+{
+ generic_bug_clear_once();
+ memset(__start_once, 0, __end_once - __start_once);
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(clear_warn_once_fops,
+ NULL,
+ clear_warn_once_set,
+ "%lld\n");
+
+static __init int register_warn_debugfs(void)
+{
+ /* Don't care about failure */
+ debugfs_create_file("clear_warn_once", 0200, NULL,
+ NULL, &clear_warn_once_fops);
+ return 0;
+}
+
+device_initcall(register_warn_debugfs);
#endif
#ifdef CONFIG_CC_STACKPROTECTOR
diff --git a/kernel/params.c b/kernel/params.c
index 60b2d8101355..cc9108c2a1fd 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -224,7 +224,7 @@ char *parse_args(const char *doing,
} \
int param_get_##name(char *buffer, const struct kernel_param *kp) \
{ \
- return scnprintf(buffer, PAGE_SIZE, format, \
+ return scnprintf(buffer, PAGE_SIZE, format "\n", \
*((type *)kp->arg)); \
} \
const struct kernel_param_ops param_ops_##name = { \
@@ -236,14 +236,14 @@ char *parse_args(const char *doing,
EXPORT_SYMBOL(param_ops_##name)
-STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8);
-STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16);
-STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16);
-STANDARD_PARAM_DEF(int, int, "%i", kstrtoint);
-STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint);
-STANDARD_PARAM_DEF(long, long, "%li", kstrtol);
-STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul);
-STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull);
+STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8);
+STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16);
+STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16);
+STANDARD_PARAM_DEF(int, int, "%i", kstrtoint);
+STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint);
+STANDARD_PARAM_DEF(long, long, "%li", kstrtol);
+STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul);
+STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull);
int param_set_charp(const char *val, const struct kernel_param *kp)
{
@@ -270,7 +270,7 @@ EXPORT_SYMBOL(param_set_charp);
int param_get_charp(char *buffer, const struct kernel_param *kp)
{
- return scnprintf(buffer, PAGE_SIZE, "%s", *((char **)kp->arg));
+ return scnprintf(buffer, PAGE_SIZE, "%s\n", *((char **)kp->arg));
}
EXPORT_SYMBOL(param_get_charp);
@@ -301,7 +301,7 @@ EXPORT_SYMBOL(param_set_bool);
int param_get_bool(char *buffer, const struct kernel_param *kp)
{
/* Y and N chosen as being relatively non-coder friendly */
- return sprintf(buffer, "%c", *(bool *)kp->arg ? 'Y' : 'N');
+ return sprintf(buffer, "%c\n", *(bool *)kp->arg ? 'Y' : 'N');
}
EXPORT_SYMBOL(param_get_bool);
@@ -360,7 +360,7 @@ EXPORT_SYMBOL(param_set_invbool);
int param_get_invbool(char *buffer, const struct kernel_param *kp)
{
- return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y');
+ return sprintf(buffer, "%c\n", (*(bool *)kp->arg) ? 'N' : 'Y');
}
EXPORT_SYMBOL(param_get_invbool);
@@ -460,8 +460,9 @@ static int param_array_get(char *buffer, const struct kernel_param *kp)
struct kernel_param p = *kp;
for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) {
+ /* Replace \n with comma */
if (i)
- buffer[off++] = ',';
+ buffer[off - 1] = ',';
p.arg = arr->elem + arr->elemsize * i;
check_kparam_locked(p.mod);
ret = arr->ops->get(buffer + off, &p);
@@ -507,7 +508,7 @@ EXPORT_SYMBOL(param_set_copystring);
int param_get_string(char *buffer, const struct kernel_param *kp)
{
const struct kparam_string *kps = kp->str;
- return strlcpy(buffer, kps->string, kps->maxlen);
+ return scnprintf(buffer, PAGE_SIZE, "%s\n", kps->string);
}
EXPORT_SYMBOL(param_get_string);
@@ -549,10 +550,6 @@ static ssize_t param_attr_show(struct module_attribute *mattr,
kernel_param_lock(mk->mod);
count = attribute->param->ops->get(buf, attribute->param);
kernel_param_unlock(mk->mod);
- if (count > 0) {
- strcat(buf, "\n");
- ++count;
- }
return count;
}
@@ -600,7 +597,7 @@ EXPORT_SYMBOL(kernel_param_unlock);
/*
* add_sysfs_param - add a parameter to sysfs
* @mk: struct module_kobject
- * @kparam: the actual parameter definition to add to sysfs
+ * @kp: the actual parameter definition to add to sysfs
* @name: name of parameter
*
* Create a kobject if for a (per-module) parameter if mp NULL, and
diff --git a/kernel/pid.c b/kernel/pid.c
index 020dedbdf066..b13b624e2c49 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -39,11 +39,8 @@
#include <linux/proc_ns.h>
#include <linux/proc_fs.h>
#include <linux/sched/task.h>
+#include <linux/idr.h>
-#define pid_hashfn(nr, ns) \
- hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
-static struct hlist_head *pid_hash;
-static unsigned int pidhash_shift = 4;
struct pid init_struct_pid = INIT_STRUCT_PID;
int pid_max = PID_MAX_DEFAULT;
@@ -53,15 +50,6 @@ int pid_max = PID_MAX_DEFAULT;
int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;
-static inline int mk_pid(struct pid_namespace *pid_ns,
- struct pidmap *map, int off)
-{
- return (map - pid_ns->pidmap)*BITS_PER_PAGE + off;
-}
-
-#define find_next_offset(map, off) \
- find_next_zero_bit((map)->page, BITS_PER_PAGE, off)
-
/*
* PID-map pages start out as NULL, they get allocated upon
* first use and are never deallocated. This way a low pid_max
@@ -70,11 +58,8 @@ static inline int mk_pid(struct pid_namespace *pid_ns,
*/
struct pid_namespace init_pid_ns = {
.kref = KREF_INIT(2),
- .pidmap = {
- [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
- },
- .last_pid = 0,
- .nr_hashed = PIDNS_HASH_ADDING,
+ .idr = IDR_INIT,
+ .pid_allocated = PIDNS_ADDING,
.level = 0,
.child_reaper = &init_task,
.user_ns = &init_user_ns,
@@ -101,138 +86,6 @@ EXPORT_SYMBOL_GPL(init_pid_ns);
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
-static void free_pidmap(struct upid *upid)
-{
- int nr = upid->nr;
- struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE;
- int offset = nr & BITS_PER_PAGE_MASK;
-
- clear_bit(offset, map->page);
- atomic_inc(&map->nr_free);
-}
-
-/*
- * If we started walking pids at 'base', is 'a' seen before 'b'?
- */
-static int pid_before(int base, int a, int b)
-{
- /*
- * This is the same as saying
- *
- * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT
- * and that mapping orders 'a' and 'b' with respect to 'base'.
- */
- return (unsigned)(a - base) < (unsigned)(b - base);
-}
-
-/*
- * We might be racing with someone else trying to set pid_ns->last_pid
- * at the pid allocation time (there's also a sysctl for this, but racing
- * with this one is OK, see comment in kernel/pid_namespace.c about it).
- * We want the winner to have the "later" value, because if the
- * "earlier" value prevails, then a pid may get reused immediately.
- *
- * Since pids rollover, it is not sufficient to just pick the bigger
- * value. We have to consider where we started counting from.
- *
- * 'base' is the value of pid_ns->last_pid that we observed when
- * we started looking for a pid.
- *
- * 'pid' is the pid that we eventually found.
- */
-static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid)
-{
- int prev;
- int last_write = base;
- do {
- prev = last_write;
- last_write = cmpxchg(&pid_ns->last_pid, prev, pid);
- } while ((prev != last_write) && (pid_before(base, last_write, pid)));
-}
-
-static int alloc_pidmap(struct pid_namespace *pid_ns)
-{
- int i, offset, max_scan, pid, last = pid_ns->last_pid;
- struct pidmap *map;
-
- pid = last + 1;
- if (pid >= pid_max)
- pid = RESERVED_PIDS;
- offset = pid & BITS_PER_PAGE_MASK;
- map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
- /*
- * If last_pid points into the middle of the map->page we
- * want to scan this bitmap block twice, the second time
- * we start with offset == 0 (or RESERVED_PIDS).
- */
- max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset;
- for (i = 0; i <= max_scan; ++i) {
- if (unlikely(!map->page)) {
- void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
- /*
- * Free the page if someone raced with us
- * installing it:
- */
- spin_lock_irq(&pidmap_lock);
- if (!map->page) {
- map->page = page;
- page = NULL;
- }
- spin_unlock_irq(&pidmap_lock);
- kfree(page);
- if (unlikely(!map->page))
- return -ENOMEM;
- }
- if (likely(atomic_read(&map->nr_free))) {
- for ( ; ; ) {
- if (!test_and_set_bit(offset, map->page)) {
- atomic_dec(&map->nr_free);
- set_last_pid(pid_ns, last, pid);
- return pid;
- }
- offset = find_next_offset(map, offset);
- if (offset >= BITS_PER_PAGE)
- break;
- pid = mk_pid(pid_ns, map, offset);
- if (pid >= pid_max)
- break;
- }
- }
- if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
- ++map;
- offset = 0;
- } else {
- map = &pid_ns->pidmap[0];
- offset = RESERVED_PIDS;
- if (unlikely(last == offset))
- break;
- }
- pid = mk_pid(pid_ns, map, offset);
- }
- return -EAGAIN;
-}
-
-int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
-{
- int offset;
- struct pidmap *map, *end;
-
- if (last >= PID_MAX_LIMIT)
- return -1;
-
- offset = (last + 1) & BITS_PER_PAGE_MASK;
- map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
- end = &pid_ns->pidmap[PIDMAP_ENTRIES];
- for (; map < end; map++, offset = 0) {
- if (unlikely(!map->page))
- continue;
- offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
- if (offset < BITS_PER_PAGE)
- return mk_pid(pid_ns, map, offset);
- }
- return -1;
-}
-
void put_pid(struct pid *pid)
{
struct pid_namespace *ns;
@@ -265,8 +118,7 @@ void free_pid(struct pid *pid)
for (i = 0; i <= pid->level; i++) {
struct upid *upid = pid->numbers + i;
struct pid_namespace *ns = upid->ns;
- hlist_del_rcu(&upid->pid_chain);
- switch(--ns->nr_hashed) {
+ switch (--ns->pid_allocated) {
case 2:
case 1:
/* When all that is left in the pid namespace
@@ -275,21 +127,20 @@ void free_pid(struct pid *pid)
*/
wake_up_process(ns->child_reaper);
break;
- case PIDNS_HASH_ADDING:
+ case PIDNS_ADDING:
/* Handle a fork failure of the first process */
WARN_ON(ns->child_reaper);
- ns->nr_hashed = 0;
+ ns->pid_allocated = 0;
/* fall through */
case 0:
schedule_work(&ns->proc_work);
break;
}
+
+ idr_remove(&ns->idr, upid->nr);
}
spin_unlock_irqrestore(&pidmap_lock, flags);
- for (i = 0; i <= pid->level; i++)
- free_pidmap(pid->numbers + i);
-
call_rcu(&pid->rcu, delayed_put_pid);
}
@@ -308,8 +159,29 @@ struct pid *alloc_pid(struct pid_namespace *ns)
tmp = ns;
pid->level = ns->level;
+
for (i = ns->level; i >= 0; i--) {
- nr = alloc_pidmap(tmp);
+ int pid_min = 1;
+
+ idr_preload(GFP_KERNEL);
+ spin_lock_irq(&pidmap_lock);
+
+ /*
+ * init really needs pid 1, but after reaching the maximum
+ * wrap back to RESERVED_PIDS
+ */
+ if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
+ pid_min = RESERVED_PIDS;
+
+ /*
+ * Store a null pointer so find_pid_ns does not find
+ * a partially initialized PID (see below).
+ */
+ nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
+ pid_max, GFP_ATOMIC);
+ spin_unlock_irq(&pidmap_lock);
+ idr_preload_end();
+
if (nr < 0) {
retval = nr;
goto out_free;
@@ -334,12 +206,12 @@ struct pid *alloc_pid(struct pid_namespace *ns)
upid = pid->numbers + ns->level;
spin_lock_irq(&pidmap_lock);
- if (!(ns->nr_hashed & PIDNS_HASH_ADDING))
+ if (!(ns->pid_allocated & PIDNS_ADDING))
goto out_unlock;
for ( ; upid >= pid->numbers; --upid) {
- hlist_add_head_rcu(&upid->pid_chain,
- &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
- upid->ns->nr_hashed++;
+ /* Make the PID visible to find_pid_ns. */
+ idr_replace(&upid->ns->idr, pid, upid->nr);
+ upid->ns->pid_allocated++;
}
spin_unlock_irq(&pidmap_lock);
@@ -350,8 +222,11 @@ out_unlock:
put_pid_ns(ns);
out_free:
+ spin_lock_irq(&pidmap_lock);
while (++i <= ns->level)
- free_pidmap(pid->numbers + i);
+ idr_remove(&ns->idr, (pid->numbers + i)->nr);
+
+ spin_unlock_irq(&pidmap_lock);
kmem_cache_free(ns->pid_cachep, pid);
return ERR_PTR(retval);
@@ -360,21 +235,13 @@ out_free:
void disable_pid_allocation(struct pid_namespace *ns)
{
spin_lock_irq(&pidmap_lock);
- ns->nr_hashed &= ~PIDNS_HASH_ADDING;
+ ns->pid_allocated &= ~PIDNS_ADDING;
spin_unlock_irq(&pidmap_lock);
}
struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
{
- struct upid *pnr;
-
- hlist_for_each_entry_rcu(pnr,
- &pid_hash[pid_hashfn(nr, ns)], pid_chain)
- if (pnr->nr == nr && pnr->ns == ns)
- return container_of(pnr, struct pid,
- numbers[ns->level]);
-
- return NULL;
+ return idr_find(&ns->idr, nr);
}
EXPORT_SYMBOL_GPL(find_pid_ns);
@@ -530,6 +397,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
if (type != PIDTYPE_PID) {
if (type == __PIDTYPE_TGID)
type = PIDTYPE_PID;
+
task = task->group_leader;
}
nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns);
@@ -553,35 +421,13 @@ EXPORT_SYMBOL_GPL(task_active_pid_ns);
*/
struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
{
- struct pid *pid;
-
- do {
- pid = find_pid_ns(nr, ns);
- if (pid)
- break;
- nr = next_pidmap(ns, nr);
- } while (nr > 0);
-
- return pid;
-}
-
-/*
- * The pid hash table is scaled according to the amount of memory in the
- * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
- * more.
- */
-void __init pidhash_init(void)
-{
- pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
- HASH_EARLY | HASH_SMALL | HASH_ZERO,
- &pidhash_shift, NULL,
- 0, 4096);
+ return idr_get_next(&ns->idr, &nr);
}
-void __init pidmap_init(void)
+void __init pid_idr_init(void)
{
/* Verify no one has done anything silly: */
- BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);
+ BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
/* bump default and minimum pid_max based on number of cpus */
pid_max = min(pid_max_max, max_t(int, pid_max,
@@ -590,10 +436,7 @@ void __init pidmap_init(void)
PIDS_PER_CPU_MIN * num_possible_cpus());
pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
- init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
- /* Reserve PID 0. We never call free_pidmap(0) */
- set_bit(0, init_pid_ns.pidmap[0].page);
- atomic_dec(&init_pid_ns.pidmap[0].nr_free);
+ idr_init(&init_pid_ns.idr);
init_pid_ns.pid_cachep = KMEM_CACHE(pid,
SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 4918314893bc..0b53eef7d34b 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -21,6 +21,7 @@
#include <linux/export.h>
#include <linux/sched/task.h>
#include <linux/sched/signal.h>
+#include <linux/idr.h>
struct pid_cache {
int nr_ids;
@@ -98,7 +99,6 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
struct pid_namespace *ns;
unsigned int level = parent_pid_ns->level + 1;
struct ucounts *ucounts;
- int i;
int err;
err = -EINVAL;
@@ -117,17 +117,15 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
if (ns == NULL)
goto out_dec;
- ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
- if (!ns->pidmap[0].page)
- goto out_free;
+ idr_init(&ns->idr);
ns->pid_cachep = create_pid_cachep(level + 1);
if (ns->pid_cachep == NULL)
- goto out_free_map;
+ goto out_free_idr;
err = ns_alloc_inum(&ns->ns);
if (err)
- goto out_free_map;
+ goto out_free_idr;
ns->ns.ops = &pidns_operations;
kref_init(&ns->kref);
@@ -135,20 +133,13 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
ns->parent = get_pid_ns(parent_pid_ns);
ns->user_ns = get_user_ns(user_ns);
ns->ucounts = ucounts;
- ns->nr_hashed = PIDNS_HASH_ADDING;
+ ns->pid_allocated = PIDNS_ADDING;
INIT_WORK(&ns->proc_work, proc_cleanup_work);
- set_bit(0, ns->pidmap[0].page);
- atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
-
- for (i = 1; i < PIDMAP_ENTRIES; i++)
- atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
-
return ns;
-out_free_map:
- kfree(ns->pidmap[0].page);
-out_free:
+out_free_idr:
+ idr_destroy(&ns->idr);
kmem_cache_free(pid_ns_cachep, ns);
out_dec:
dec_pid_namespaces(ucounts);
@@ -168,11 +159,9 @@ static void delayed_free_pidns(struct rcu_head *p)
static void destroy_pid_namespace(struct pid_namespace *ns)
{
- int i;
-
ns_free_inum(&ns->ns);
- for (i = 0; i < PIDMAP_ENTRIES; i++)
- kfree(ns->pidmap[i].page);
+
+ idr_destroy(&ns->idr);
call_rcu(&ns->rcu, delayed_free_pidns);
}
@@ -213,6 +202,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
int rc;
struct task_struct *task, *me = current;
int init_pids = thread_group_leader(me) ? 1 : 2;
+ struct pid *pid;
/* Don't allow any more processes into the pid namespace */
disable_pid_allocation(pid_ns);
@@ -239,20 +229,16 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
* maintain a tasklist for each pid namespace.
*
*/
+ rcu_read_lock();
read_lock(&tasklist_lock);
- nr = next_pidmap(pid_ns, 1);
- while (nr > 0) {
- rcu_read_lock();
-
- task = pid_task(find_vpid(nr), PIDTYPE_PID);
+ nr = 2;
+ idr_for_each_entry_continue(&pid_ns->idr, pid, nr) {
+ task = pid_task(pid, PIDTYPE_PID);
if (task && !__fatal_signal_pending(task))
send_sig_info(SIGKILL, SEND_SIG_FORCED, task);
-
- rcu_read_unlock();
-
- nr = next_pidmap(pid_ns, nr);
}
read_unlock(&tasklist_lock);
+ rcu_read_unlock();
/*
* Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD.
@@ -268,7 +254,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
* sys_wait4() above can't reap the EXIT_DEAD children but we do not
* really care, we could reparent them to the global init. We could
* exit and reap ->child_reaper even if it is not the last thread in
- * this pid_ns, free_pid(nr_hashed == 0) calls proc_cleanup_work(),
+ * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(),
* pid_ns can not go away until proc_kill_sb() drops the reference.
*
* But this ns can also have other tasks injected by setns()+fork().
@@ -282,7 +268,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
*/
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
- if (pid_ns->nr_hashed == init_pids)
+ if (pid_ns->pid_allocated == init_pids)
break;
schedule();
}
@@ -301,6 +287,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
{
struct pid_namespace *pid_ns = task_active_pid_ns(current);
struct ctl_table tmp = *table;
+ int ret, next;
if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
@@ -311,8 +298,14 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
* it should synchronize its usage with external means.
*/
- tmp.data = &pid_ns->last_pid;
- return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ next = idr_get_cursor(&pid_ns->idr) - 1;
+
+ tmp.data = &next;
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ if (!ret && write)
+ idr_set_cursor(&pid_ns->idr, next + 1);
+
+ return ret;
}
extern int pid_max;
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index e8517b63eb37..e880ca22c5a5 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -259,20 +259,6 @@ config APM_EMULATION
anything, try disabling/enabling this option (or disabling/enabling
APM in your BIOS).
-config PM_OPP
- bool
- select SRCU
- ---help---
- SOCs have a standard set of tuples consisting of frequency and
- voltage pairs that the device will support per voltage domain. This
- is called Operating Performance Point or OPP. The actual definitions
- of OPP varies over silicon within the same family of devices.
-
- OPP layer organizes the data internally using device pointers
- representing individual voltage domains and provides SOC
- implementations a ready to use framework to manage OPPs.
- For more information, read <file:Documentation/power/opp.txt>
-
config PM_CLK
def_bool y
depends on PM && HAVE_CLK
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index eb4f717705ba..a3f79f0eef36 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c
index 9012ecf7b814..41e83a779e19 100644
--- a/kernel/power/autosleep.c
+++ b/kernel/power/autosleep.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* kernel/power/autosleep.c
*
diff --git a/kernel/power/console.c b/kernel/power/console.c
index 0e781798b0b3..fcdf0e14a47d 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Functions for saving/restoring console.
*
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 1d2d761e3c25..f29cd178df90 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/suspend.h>
#include <linux/suspend_ioctls.h>
#include <linux/utsname.h>
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 50f25cb370c6..7381d49a44db 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* drivers/power/process.c - Functions for starting/stopping processes on
* suspend transitions.
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 97b0df71303e..9d7503910ce2 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -701,8 +701,8 @@ static int __init pm_qos_power_init(void)
for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) {
ret = register_pm_qos_misc(pm_qos_array[i], d);
if (ret < 0) {
- printk(KERN_ERR "pm_qos_param: %s setup failed\n",
- pm_qos_array[i]->name);
+ pr_err("%s: %s setup failed\n",
+ __func__, pm_qos_array[i]->name);
return ret;
}
}
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 0972a8e09d08..bce0464524d8 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -10,6 +10,8 @@
*
*/
+#define pr_fmt(fmt) "PM: " fmt
+
#include <linux/version.h>
#include <linux/module.h>
#include <linux/mm.h>
@@ -967,7 +969,7 @@ void __init __register_nosave_region(unsigned long start_pfn,
region->end_pfn = end_pfn;
list_add_tail(&region->list, &nosave_regions);
Report:
- printk(KERN_INFO "PM: Registered nosave memory: [mem %#010llx-%#010llx]\n",
+ pr_info("Registered nosave memory: [mem %#010llx-%#010llx]\n",
(unsigned long long) start_pfn << PAGE_SHIFT,
((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
}
@@ -1039,7 +1041,7 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
list_for_each_entry(region, &nosave_regions, list) {
unsigned long pfn;
- pr_debug("PM: Marking nosave pages: [mem %#010llx-%#010llx]\n",
+ pr_debug("Marking nosave pages: [mem %#010llx-%#010llx]\n",
(unsigned long long) region->start_pfn << PAGE_SHIFT,
((unsigned long long) region->end_pfn << PAGE_SHIFT)
- 1);
@@ -1095,7 +1097,7 @@ int create_basic_memory_bitmaps(void)
free_pages_map = bm2;
mark_nosave_pages(forbidden_pages_map);
- pr_debug("PM: Basic memory bitmaps created\n");
+ pr_debug("Basic memory bitmaps created\n");
return 0;
@@ -1131,7 +1133,7 @@ void free_basic_memory_bitmaps(void)
memory_bm_free(bm2, PG_UNSAFE_CLEAR);
kfree(bm2);
- pr_debug("PM: Basic memory bitmaps freed\n");
+ pr_debug("Basic memory bitmaps freed\n");
}
void clear_free_pages(void)
@@ -1152,7 +1154,7 @@ void clear_free_pages(void)
pfn = memory_bm_next_pfn(bm);
}
memory_bm_position_reset(bm);
- pr_info("PM: free pages cleared after restore\n");
+ pr_info("free pages cleared after restore\n");
#endif /* PAGE_POISONING_ZERO */
}
@@ -1690,7 +1692,7 @@ int hibernate_preallocate_memory(void)
ktime_t start, stop;
int error;
- printk(KERN_INFO "PM: Preallocating image memory... ");
+ pr_info("Preallocating image memory... ");
start = ktime_get();
error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY);
@@ -1821,13 +1823,13 @@ int hibernate_preallocate_memory(void)
out:
stop = ktime_get();
- printk(KERN_CONT "done (allocated %lu pages)\n", pages);
+ pr_cont("done (allocated %lu pages)\n", pages);
swsusp_show_speed(start, stop, pages, "Allocated");
return 0;
err_out:
- printk(KERN_CONT "\n");
+ pr_cont("\n");
swsusp_free();
return -ENOMEM;
}
@@ -1867,8 +1869,8 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
free += zone_page_state(zone, NR_FREE_PAGES);
nr_pages += count_pages_for_highmem(nr_highmem);
- pr_debug("PM: Normal pages needed: %u + %u, available pages: %u\n",
- nr_pages, PAGES_FOR_IO, free);
+ pr_debug("Normal pages needed: %u + %u, available pages: %u\n",
+ nr_pages, PAGES_FOR_IO, free);
return free > nr_pages + PAGES_FOR_IO;
}
@@ -1882,7 +1884,7 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
*/
static inline int get_highmem_buffer(int safe_needed)
{
- buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed);
+ buffer = get_image_page(GFP_ATOMIC, safe_needed);
return buffer ? 0 : -ENOMEM;
}
@@ -1943,7 +1945,7 @@ static int swsusp_alloc(struct memory_bitmap *copy_bm,
while (nr_pages-- > 0) {
struct page *page;
- page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
+ page = alloc_image_page(GFP_ATOMIC);
if (!page)
goto err_out;
memory_bm_set_bit(copy_bm, page_to_pfn(page));
@@ -1961,20 +1963,20 @@ asmlinkage __visible int swsusp_save(void)
{
unsigned int nr_pages, nr_highmem;
- printk(KERN_INFO "PM: Creating hibernation image:\n");
+ pr_info("Creating hibernation image:\n");
drain_local_pages(NULL);
nr_pages = count_data_pages();
nr_highmem = count_highmem_pages();
- printk(KERN_INFO "PM: Need to copy %u pages\n", nr_pages + nr_highmem);
+ pr_info("Need to copy %u pages\n", nr_pages + nr_highmem);
if (!enough_free_mem(nr_pages, nr_highmem)) {
- printk(KERN_ERR "PM: Not enough free memory\n");
+ pr_err("Not enough free memory\n");
return -ENOMEM;
}
if (swsusp_alloc(&copy_bm, nr_pages, nr_highmem)) {
- printk(KERN_ERR "PM: Memory allocation failed\n");
+ pr_err("Memory allocation failed\n");
return -ENOMEM;
}
@@ -1995,8 +1997,7 @@ asmlinkage __visible int swsusp_save(void)
nr_copy_pages = nr_pages;
nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE);
- printk(KERN_INFO "PM: Hibernation image created (%d pages copied)\n",
- nr_pages);
+ pr_info("Hibernation image created (%d pages copied)\n", nr_pages);
return 0;
}
@@ -2170,7 +2171,7 @@ static int check_header(struct swsusp_info *info)
if (!reason && info->num_physpages != get_num_physpages())
reason = "memory size";
if (reason) {
- printk(KERN_ERR "PM: Image mismatch: %s\n", reason);
+ pr_err("Image mismatch: %s\n", reason);
return -EPERM;
}
return 0;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 3e2b4f519009..0685c4499431 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -120,22 +120,26 @@ static void s2idle_loop(void)
* frozen processes + suspended devices + idle processors.
* Thus s2idle_enter() should be called right after
* all devices have been suspended.
+ *
+ * Wakeups during the noirq suspend of devices may be spurious,
+ * so prevent them from terminating the loop right away.
*/
error = dpm_noirq_suspend_devices(PMSG_SUSPEND);
if (!error)
s2idle_enter();
+ else if (error == -EBUSY && pm_wakeup_pending())
+ error = 0;
- dpm_noirq_resume_devices(PMSG_RESUME);
- if (error && (error != -EBUSY || !pm_wakeup_pending())) {
- dpm_noirq_end();
- break;
- }
-
- if (s2idle_ops && s2idle_ops->wake)
+ if (!error && s2idle_ops && s2idle_ops->wake)
s2idle_ops->wake();
+ dpm_noirq_resume_devices(PMSG_RESUME);
+
dpm_noirq_end();
+ if (error)
+ break;
+
if (s2idle_ops && s2idle_ops->sync)
s2idle_ops->sync();
@@ -433,7 +437,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
error = suspend_ops->enter(state);
trace_suspend_resume(TPS("machine_suspend"),
state, false);
- events_check_enabled = false;
} else if (*wakeup) {
error = -EBUSY;
}
@@ -578,6 +581,7 @@ static int enter_state(suspend_state_t state)
pm_restore_gfp_mask();
Finish:
+ events_check_enabled = false;
pm_pr_dbg("Finishing wakeup.\n");
suspend_finish();
Unlock:
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index d7cdc426ee38..293ead59eccc 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -12,6 +12,8 @@
*
*/
+#define pr_fmt(fmt) "PM: " fmt
+
#include <linux/module.h>
#include <linux/file.h>
#include <linux/delay.h>
@@ -241,9 +243,9 @@ static void hib_end_io(struct bio *bio)
struct page *page = bio->bi_io_vec[0].bv_page;
if (bio->bi_status) {
- printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
- MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
- (unsigned long long)bio->bi_iter.bi_sector);
+ pr_alert("Read-error on swap-device (%u:%u:%Lu)\n",
+ MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
+ (unsigned long long)bio->bi_iter.bi_sector);
}
if (bio_data_dir(bio) == WRITE)
@@ -273,8 +275,8 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,
bio_set_op_attrs(bio, op, op_flags);
if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
- printk(KERN_ERR "PM: Adding page to bio failed at %llu\n",
- (unsigned long long)bio->bi_iter.bi_sector);
+ pr_err("Adding page to bio failed at %llu\n",
+ (unsigned long long)bio->bi_iter.bi_sector);
bio_put(bio);
return -EFAULT;
}
@@ -319,7 +321,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC,
swsusp_resume_block, swsusp_header, NULL);
} else {
- printk(KERN_ERR "PM: Swap header not found!\n");
+ pr_err("Swap header not found!\n");
error = -ENODEV;
}
return error;
@@ -413,8 +415,7 @@ static int get_swap_writer(struct swap_map_handle *handle)
ret = swsusp_swap_check();
if (ret) {
if (ret != -ENOSPC)
- printk(KERN_ERR "PM: Cannot find swap device, try "
- "swapon -a.\n");
+ pr_err("Cannot find swap device, try swapon -a\n");
return ret;
}
handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
@@ -491,9 +492,9 @@ static int swap_writer_finish(struct swap_map_handle *handle,
{
if (!error) {
flush_swap_writer(handle);
- printk(KERN_INFO "PM: S");
+ pr_info("S");
error = mark_swapfiles(handle, flags);
- printk("|\n");
+ pr_cont("|\n");
}
if (error)
@@ -542,7 +543,7 @@ static int save_image(struct swap_map_handle *handle,
hib_init_batch(&hb);
- printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n",
+ pr_info("Saving image data pages (%u pages)...\n",
nr_to_write);
m = nr_to_write / 10;
if (!m)
@@ -557,8 +558,8 @@ static int save_image(struct swap_map_handle *handle,
if (ret)
break;
if (!(nr_pages % m))
- printk(KERN_INFO "PM: Image saving progress: %3d%%\n",
- nr_pages / m * 10);
+ pr_info("Image saving progress: %3d%%\n",
+ nr_pages / m * 10);
nr_pages++;
}
err2 = hib_wait_io(&hb);
@@ -566,7 +567,7 @@ static int save_image(struct swap_map_handle *handle,
if (!ret)
ret = err2;
if (!ret)
- printk(KERN_INFO "PM: Image saving done.\n");
+ pr_info("Image saving done\n");
swsusp_show_speed(start, stop, nr_to_write, "Wrote");
return ret;
}
@@ -692,14 +693,14 @@ static int save_image_lzo(struct swap_map_handle *handle,
page = (void *)__get_free_page(__GFP_RECLAIM | __GFP_HIGH);
if (!page) {
- printk(KERN_ERR "PM: Failed to allocate LZO page\n");
+ pr_err("Failed to allocate LZO page\n");
ret = -ENOMEM;
goto out_clean;
}
data = vmalloc(sizeof(*data) * nr_threads);
if (!data) {
- printk(KERN_ERR "PM: Failed to allocate LZO data\n");
+ pr_err("Failed to allocate LZO data\n");
ret = -ENOMEM;
goto out_clean;
}
@@ -708,7 +709,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
crc = kmalloc(sizeof(*crc), GFP_KERNEL);
if (!crc) {
- printk(KERN_ERR "PM: Failed to allocate crc\n");
+ pr_err("Failed to allocate crc\n");
ret = -ENOMEM;
goto out_clean;
}
@@ -726,8 +727,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
"image_compress/%u", thr);
if (IS_ERR(data[thr].thr)) {
data[thr].thr = NULL;
- printk(KERN_ERR
- "PM: Cannot start compression threads\n");
+ pr_err("Cannot start compression threads\n");
ret = -ENOMEM;
goto out_clean;
}
@@ -749,7 +749,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32");
if (IS_ERR(crc->thr)) {
crc->thr = NULL;
- printk(KERN_ERR "PM: Cannot start CRC32 thread\n");
+ pr_err("Cannot start CRC32 thread\n");
ret = -ENOMEM;
goto out_clean;
}
@@ -760,10 +760,9 @@ static int save_image_lzo(struct swap_map_handle *handle,
*/
handle->reqd_free_pages = reqd_free_pages();
- printk(KERN_INFO
- "PM: Using %u thread(s) for compression.\n"
- "PM: Compressing and saving image data (%u pages)...\n",
- nr_threads, nr_to_write);
+ pr_info("Using %u thread(s) for compression\n", nr_threads);
+ pr_info("Compressing and saving image data (%u pages)...\n",
+ nr_to_write);
m = nr_to_write / 10;
if (!m)
m = 1;
@@ -783,10 +782,8 @@ static int save_image_lzo(struct swap_map_handle *handle,
data_of(*snapshot), PAGE_SIZE);
if (!(nr_pages % m))
- printk(KERN_INFO
- "PM: Image saving progress: "
- "%3d%%\n",
- nr_pages / m * 10);
+ pr_info("Image saving progress: %3d%%\n",
+ nr_pages / m * 10);
nr_pages++;
}
if (!off)
@@ -813,15 +810,14 @@ static int save_image_lzo(struct swap_map_handle *handle,
ret = data[thr].ret;
if (ret < 0) {
- printk(KERN_ERR "PM: LZO compression failed\n");
+ pr_err("LZO compression failed\n");
goto out_finish;
}
if (unlikely(!data[thr].cmp_len ||
data[thr].cmp_len >
lzo1x_worst_compress(data[thr].unc_len))) {
- printk(KERN_ERR
- "PM: Invalid LZO compressed length\n");
+ pr_err("Invalid LZO compressed length\n");
ret = -1;
goto out_finish;
}
@@ -857,7 +853,7 @@ out_finish:
if (!ret)
ret = err2;
if (!ret)
- printk(KERN_INFO "PM: Image saving done.\n");
+ pr_info("Image saving done\n");
swsusp_show_speed(start, stop, nr_to_write, "Wrote");
out_clean:
if (crc) {
@@ -888,7 +884,7 @@ static int enough_swap(unsigned int nr_pages, unsigned int flags)
unsigned int free_swap = count_swap_pages(root_swap, 1);
unsigned int required;
- pr_debug("PM: Free swap pages: %u\n", free_swap);
+ pr_debug("Free swap pages: %u\n", free_swap);
required = PAGES_FOR_IO + nr_pages;
return free_swap > required;
@@ -915,12 +911,12 @@ int swsusp_write(unsigned int flags)
pages = snapshot_get_image_size();
error = get_swap_writer(&handle);
if (error) {
- printk(KERN_ERR "PM: Cannot get swap writer\n");
+ pr_err("Cannot get swap writer\n");
return error;
}
if (flags & SF_NOCOMPRESS_MODE) {
if (!enough_swap(pages, flags)) {
- printk(KERN_ERR "PM: Not enough free swap\n");
+ pr_err("Not enough free swap\n");
error = -ENOSPC;
goto out_finish;
}
@@ -1068,8 +1064,7 @@ static int load_image(struct swap_map_handle *handle,
hib_init_batch(&hb);
clean_pages_on_read = true;
- printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n",
- nr_to_read);
+ pr_info("Loading image data pages (%u pages)...\n", nr_to_read);
m = nr_to_read / 10;
if (!m)
m = 1;
@@ -1087,8 +1082,8 @@ static int load_image(struct swap_map_handle *handle,
if (ret)
break;
if (!(nr_pages % m))
- printk(KERN_INFO "PM: Image loading progress: %3d%%\n",
- nr_pages / m * 10);
+ pr_info("Image loading progress: %3d%%\n",
+ nr_pages / m * 10);
nr_pages++;
}
err2 = hib_wait_io(&hb);
@@ -1096,7 +1091,7 @@ static int load_image(struct swap_map_handle *handle,
if (!ret)
ret = err2;
if (!ret) {
- printk(KERN_INFO "PM: Image loading done.\n");
+ pr_info("Image loading done\n");
snapshot_write_finalize(snapshot);
if (!snapshot_image_loaded(snapshot))
ret = -ENODATA;
@@ -1190,14 +1185,14 @@ static int load_image_lzo(struct swap_map_handle *handle,
page = vmalloc(sizeof(*page) * LZO_MAX_RD_PAGES);
if (!page) {
- printk(KERN_ERR "PM: Failed to allocate LZO page\n");
+ pr_err("Failed to allocate LZO page\n");
ret = -ENOMEM;
goto out_clean;
}
data = vmalloc(sizeof(*data) * nr_threads);
if (!data) {
- printk(KERN_ERR "PM: Failed to allocate LZO data\n");
+ pr_err("Failed to allocate LZO data\n");
ret = -ENOMEM;
goto out_clean;
}
@@ -1206,7 +1201,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
crc = kmalloc(sizeof(*crc), GFP_KERNEL);
if (!crc) {
- printk(KERN_ERR "PM: Failed to allocate crc\n");
+ pr_err("Failed to allocate crc\n");
ret = -ENOMEM;
goto out_clean;
}
@@ -1226,8 +1221,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
"image_decompress/%u", thr);
if (IS_ERR(data[thr].thr)) {
data[thr].thr = NULL;
- printk(KERN_ERR
- "PM: Cannot start decompression threads\n");
+ pr_err("Cannot start decompression threads\n");
ret = -ENOMEM;
goto out_clean;
}
@@ -1249,7 +1243,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32");
if (IS_ERR(crc->thr)) {
crc->thr = NULL;
- printk(KERN_ERR "PM: Cannot start CRC32 thread\n");
+ pr_err("Cannot start CRC32 thread\n");
ret = -ENOMEM;
goto out_clean;
}
@@ -1274,8 +1268,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
if (!page[i]) {
if (i < LZO_CMP_PAGES) {
ring_size = i;
- printk(KERN_ERR
- "PM: Failed to allocate LZO pages\n");
+ pr_err("Failed to allocate LZO pages\n");
ret = -ENOMEM;
goto out_clean;
} else {
@@ -1285,10 +1278,9 @@ static int load_image_lzo(struct swap_map_handle *handle,
}
want = ring_size = i;
- printk(KERN_INFO
- "PM: Using %u thread(s) for decompression.\n"
- "PM: Loading and decompressing image data (%u pages)...\n",
- nr_threads, nr_to_read);
+ pr_info("Using %u thread(s) for decompression\n", nr_threads);
+ pr_info("Loading and decompressing image data (%u pages)...\n",
+ nr_to_read);
m = nr_to_read / 10;
if (!m)
m = 1;
@@ -1348,8 +1340,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
if (unlikely(!data[thr].cmp_len ||
data[thr].cmp_len >
lzo1x_worst_compress(LZO_UNC_SIZE))) {
- printk(KERN_ERR
- "PM: Invalid LZO compressed length\n");
+ pr_err("Invalid LZO compressed length\n");
ret = -1;
goto out_finish;
}
@@ -1400,16 +1391,14 @@ static int load_image_lzo(struct swap_map_handle *handle,
ret = data[thr].ret;
if (ret < 0) {
- printk(KERN_ERR
- "PM: LZO decompression failed\n");
+ pr_err("LZO decompression failed\n");
goto out_finish;
}
if (unlikely(!data[thr].unc_len ||
data[thr].unc_len > LZO_UNC_SIZE ||
data[thr].unc_len & (PAGE_SIZE - 1))) {
- printk(KERN_ERR
- "PM: Invalid LZO uncompressed length\n");
+ pr_err("Invalid LZO uncompressed length\n");
ret = -1;
goto out_finish;
}
@@ -1420,10 +1409,8 @@ static int load_image_lzo(struct swap_map_handle *handle,
data[thr].unc + off, PAGE_SIZE);
if (!(nr_pages % m))
- printk(KERN_INFO
- "PM: Image loading progress: "
- "%3d%%\n",
- nr_pages / m * 10);
+ pr_info("Image loading progress: %3d%%\n",
+ nr_pages / m * 10);
nr_pages++;
ret = snapshot_write_next(snapshot);
@@ -1448,15 +1435,14 @@ out_finish:
}
stop = ktime_get();
if (!ret) {
- printk(KERN_INFO "PM: Image loading done.\n");
+ pr_info("Image loading done\n");
snapshot_write_finalize(snapshot);
if (!snapshot_image_loaded(snapshot))
ret = -ENODATA;
if (!ret) {
if (swsusp_header->flags & SF_CRC32_MODE) {
if(handle->crc32 != swsusp_header->crc32) {
- printk(KERN_ERR
- "PM: Invalid image CRC32!\n");
+ pr_err("Invalid image CRC32!\n");
ret = -ENODATA;
}
}
@@ -1513,9 +1499,9 @@ int swsusp_read(unsigned int *flags_p)
swap_reader_finish(&handle);
end:
if (!error)
- pr_debug("PM: Image successfully loaded\n");
+ pr_debug("Image successfully loaded\n");
else
- pr_debug("PM: Error %d resuming\n", error);
+ pr_debug("Error %d resuming\n", error);
return error;
}
@@ -1552,13 +1538,13 @@ put:
if (error)
blkdev_put(hib_resume_bdev, FMODE_READ);
else
- pr_debug("PM: Image signature found, resuming\n");
+ pr_debug("Image signature found, resuming\n");
} else {
error = PTR_ERR(hib_resume_bdev);
}
if (error)
- pr_debug("PM: Image not found (code %d)\n", error);
+ pr_debug("Image not found (code %d)\n", error);
return error;
}
@@ -1570,7 +1556,7 @@ put:
void swsusp_close(fmode_t mode)
{
if (IS_ERR(hib_resume_bdev)) {
- pr_debug("PM: Image device not initialised\n");
+ pr_debug("Image device not initialised\n");
return;
}
@@ -1594,7 +1580,7 @@ int swsusp_unmark(void)
swsusp_resume_block,
swsusp_header, NULL);
} else {
- printk(KERN_ERR "PM: Cannot find swsusp signature!\n");
+ pr_err("Cannot find swsusp signature!\n");
error = -ENODEV;
}
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
index 1896386e16bb..dfba59be190b 100644
--- a/kernel/power/wakelock.c
+++ b/kernel/power/wakelock.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* kernel/power/wakelock.c
*
diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c
index 61d41ca41844..1d21ebacfdb8 100644
--- a/kernel/printk/braille.c
+++ b/kernel/printk/braille.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kernel.h>
diff --git a/kernel/printk/braille.h b/kernel/printk/braille.h
index 749a6756843a..123154f86304 100644
--- a/kernel/printk/braille.h
+++ b/kernel/printk/braille.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _PRINTK_BRAILLE_H
#define _PRINTK_BRAILLE_H
diff --git a/kernel/printk/console_cmdline.h b/kernel/printk/console_cmdline.h
index 2ca4a8b5fe57..11f19c466af5 100644
--- a/kernel/printk/console_cmdline.h
+++ b/kernel/printk/console_cmdline.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _CONSOLE_CMDLINE_H
#define _CONSOLE_CMDLINE_H
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 512f7c2baedd..5d81206a572d 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2190,7 +2190,7 @@ again:
}
if (console_seq < log_first_seq) {
- len = sprintf(text, "** %u printk messages dropped ** ",
+ len = sprintf(text, "** %u printk messages dropped **\n",
(unsigned)(log_first_seq - console_seq));
/* messages are gone, move to first one */
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index 3cdaeaef9ce1..3e3c2004bb23 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -39,7 +39,7 @@
* There are situations when we want to make sure that all buffers
* were handled or when IRQs are blocked.
*/
-static int printk_safe_irq_ready;
+static int printk_safe_irq_ready __read_mostly;
#define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) - \
sizeof(atomic_t) - \
@@ -63,11 +63,8 @@ static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq);
/* Get flushed in a more safe context. */
static void queue_flush_work(struct printk_safe_seq_buf *s)
{
- if (printk_safe_irq_ready) {
- /* Make sure that IRQ work is really initialized. */
- smp_rmb();
+ if (printk_safe_irq_ready)
irq_work_queue(&s->work);
- }
}
/*
@@ -75,7 +72,7 @@ static void queue_flush_work(struct printk_safe_seq_buf *s)
* have dedicated buffers, because otherwise printk-safe preempted by
* NMI-printk would have overwritten the NMI messages.
*
- * The messages are fushed from irq work (or from panic()), possibly,
+ * The messages are flushed from irq work (or from panic()), possibly,
* from other CPU, concurrently with printk_safe_log_store(). Should this
* happen, printk_safe_log_store() will notice the buffer->len mismatch
* and repeat the write.
@@ -398,8 +395,12 @@ void __init printk_safe_init(void)
#endif
}
- /* Make sure that IRQ works are initialized before enabling. */
- smp_wmb();
+ /*
+ * In the highly unlikely event that a NMI were to trigger at
+ * this moment. Make sure IRQ work is set up before this
+ * variable is set.
+ */
+ barrier();
printk_safe_irq_ready = 1;
/* Flush pending messages that did not have scheduled IRQ works. */
diff --git a/kernel/range.c b/kernel/range.c
index 82cfc285b046..d84de6766472 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Range add and subtract
*/
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 13c0fc852767..020e8b6a644b 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
# Any varying coverage in these files is non-deterministic
# and is generally not a function of system call inputs.
KCOV_INSTRUMENT := n
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index e4b43fef89f5..59c471de342a 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -203,6 +203,21 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
extern int rcu_cpu_stall_suppress;
int rcu_jiffies_till_stall_check(void);
+#define rcu_ftrace_dump_stall_suppress() \
+do { \
+ if (!rcu_cpu_stall_suppress) \
+ rcu_cpu_stall_suppress = 3; \
+} while (0)
+
+#define rcu_ftrace_dump_stall_unsuppress() \
+do { \
+ if (rcu_cpu_stall_suppress == 3) \
+ rcu_cpu_stall_suppress = 0; \
+} while (0)
+
+#else /* #endif #ifdef CONFIG_RCU_STALL_COMMON */
+#define rcu_ftrace_dump_stall_suppress()
+#define rcu_ftrace_dump_stall_unsuppress()
#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
/*
@@ -220,8 +235,12 @@ do { \
static atomic_t ___rfd_beenhere = ATOMIC_INIT(0); \
\
if (!atomic_read(&___rfd_beenhere) && \
- !atomic_xchg(&___rfd_beenhere, 1)) \
+ !atomic_xchg(&___rfd_beenhere, 1)) { \
+ tracing_off(); \
+ rcu_ftrace_dump_stall_suppress(); \
ftrace_dump(oops_dump_mode); \
+ rcu_ftrace_dump_stall_unsuppress(); \
+ } \
} while (0)
void rcu_early_boot_tests(void);
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 7649fcd2c4c7..88cba7c2956c 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -23,6 +23,7 @@
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/interrupt.h>
+#include <linux/rcupdate.h>
#include "rcu_segcblist.h"
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 45f2ffbc1e78..74f6b0146b98 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -51,6 +51,7 @@
#include <asm/byteorder.h>
#include <linux/torture.h>
#include <linux/vmalloc.h>
+#include <linux/sched/debug.h>
#include "rcu.h"
@@ -89,6 +90,7 @@ torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable.");
torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable.");
torture_param(int, stall_cpu_holdoff, 10,
"Time to wait before starting stall (s).");
+torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling.");
torture_param(int, stat_interval, 60,
"Number of seconds between stats printk()s");
torture_param(int, stutter, 5, "Number of seconds to run/halt test");
@@ -1076,7 +1078,7 @@ static void rcu_torture_timer_cb(struct rcu_head *rhp)
* counter in the element should never be greater than 1, otherwise, the
* RCU implementation is broken.
*/
-static void rcu_torture_timer(unsigned long unused)
+static void rcu_torture_timer(struct timer_list *unused)
{
int idx;
unsigned long started;
@@ -1163,7 +1165,7 @@ rcu_torture_reader(void *arg)
VERBOSE_TOROUT_STRING("rcu_torture_reader task started");
set_user_nice(current, MAX_NICE);
if (irqreader && cur_ops->irq_capable)
- setup_timer_on_stack(&t, rcu_torture_timer, 0);
+ timer_setup_on_stack(&t, rcu_torture_timer, 0);
do {
if (irqreader && cur_ops->irq_capable) {
@@ -1239,6 +1241,7 @@ rcu_torture_stats_print(void)
long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
static unsigned long rtcv_snap = ULONG_MAX;
+ static bool splatted;
struct task_struct *wtp;
for_each_possible_cpu(cpu) {
@@ -1324,6 +1327,10 @@ rcu_torture_stats_print(void)
gpnum, completed, flags,
wtp == NULL ? ~0UL : wtp->state,
wtp == NULL ? -1 : (int)task_cpu(wtp));
+ if (!splatted && wtp) {
+ sched_show_task(wtp);
+ splatted = true;
+ }
show_rcu_gp_kthreads();
rcu_ftrace_dump(DUMP_ALL);
}
@@ -1357,7 +1364,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
"fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
"test_boost=%d/%d test_boost_interval=%d "
"test_boost_duration=%d shutdown_secs=%d "
- "stall_cpu=%d stall_cpu_holdoff=%d "
+ "stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d "
"n_barrier_cbs=%d "
"onoff_interval=%d onoff_holdoff=%d\n",
torture_type, tag, nrealreaders, nfakewriters,
@@ -1365,7 +1372,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
test_boost, cur_ops->can_boost,
test_boost_interval, test_boost_duration, shutdown_secs,
- stall_cpu, stall_cpu_holdoff,
+ stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff,
n_barrier_cbs,
onoff_interval, onoff_holdoff);
}
@@ -1430,12 +1437,19 @@ static int rcu_torture_stall(void *args)
if (!kthread_should_stop()) {
stop_at = get_seconds() + stall_cpu;
/* RCU CPU stall is expected behavior in following code. */
- pr_alert("rcu_torture_stall start.\n");
rcu_read_lock();
- preempt_disable();
+ if (stall_cpu_irqsoff)
+ local_irq_disable();
+ else
+ preempt_disable();
+ pr_alert("rcu_torture_stall start on CPU %d.\n",
+ smp_processor_id());
while (ULONG_CMP_LT(get_seconds(), stop_at))
continue; /* Induce RCU CPU stall warning. */
- preempt_enable();
+ if (stall_cpu_irqsoff)
+ local_irq_enable();
+ else
+ preempt_enable();
rcu_read_unlock();
pr_alert("rcu_torture_stall end.\n");
}
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 729a8706751d..6d5880089ff6 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -854,7 +854,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
/**
* call_srcu() - Queue a callback for invocation after an SRCU grace period
* @sp: srcu_struct in queue the callback
- * @head: structure to be used for queueing the SRCU callback.
+ * @rhp: structure to be used for queueing the SRCU callback.
* @func: function to be invoked after the SRCU grace period
*
* The callback function will be invoked some time after a full SRCU
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index 50d1861f7759..3f943efcf61c 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -85,6 +85,9 @@ void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type)
}
/**
+ * rcu_sync_enter_start - Force readers onto slow path for multiple updates
+ * @rsp: Pointer to rcu_sync structure to use for synchronization
+ *
* Must be called after rcu_sync_init() and before first use.
*
* Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}()
@@ -142,7 +145,7 @@ void rcu_sync_enter(struct rcu_sync *rsp)
/**
* rcu_sync_func() - Callback function managing reader access to fastpath
- * @rsp: Pointer to rcu_sync structure to use for synchronization
+ * @rhp: Pointer to rcu_head in rcu_sync structure to use for synchronization
*
* This function is passed to one of the call_rcu() functions by
* rcu_sync_exit(), so that it is invoked after a grace period following the
@@ -158,9 +161,9 @@ void rcu_sync_enter(struct rcu_sync *rsp)
* rcu_sync_exit(). Otherwise, set all state back to idle so that readers
* can again use their fastpaths.
*/
-static void rcu_sync_func(struct rcu_head *rcu)
+static void rcu_sync_func(struct rcu_head *rhp)
{
- struct rcu_sync *rsp = container_of(rcu, struct rcu_sync, cb_head);
+ struct rcu_sync *rsp = container_of(rhp, struct rcu_sync, cb_head);
unsigned long flags;
BUG_ON(rsp->gp_state != GP_PASSED);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 1250e4bd4b85..f9c0ca2ccf0c 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -534,8 +534,8 @@ module_param(rcu_kick_kthreads, bool, 0644);
* How long the grace period must be before we start recruiting
* quiescent-state help from rcu_note_context_switch().
*/
-static ulong jiffies_till_sched_qs = HZ / 20;
-module_param(jiffies_till_sched_qs, ulong, 0644);
+static ulong jiffies_till_sched_qs = HZ / 10;
+module_param(jiffies_till_sched_qs, ulong, 0444);
static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp);
@@ -734,7 +734,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp)
int idx = (READ_ONCE(rnp->completed) + 1) & 0x1;
int *fp = &rnp->need_future_gp[idx];
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_future_needs_gp() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
return READ_ONCE(*fp);
}
@@ -746,7 +746,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp)
static bool
cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
{
- RCU_LOCKDEP_WARN(!irqs_disabled(), "cpu_needs_another_gp() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
if (rcu_gp_in_progress(rsp))
return false; /* No, a grace period is already in progress. */
if (rcu_future_needs_gp(rsp))
@@ -773,7 +773,7 @@ static void rcu_eqs_enter_common(bool user)
struct rcu_data *rdp;
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_enter_common() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0);
if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
!user && !is_idle_task(current)) {
@@ -837,10 +837,13 @@ static void rcu_eqs_enter(bool user)
* We crowbar the ->dynticks_nesting field to zero to allow for
* the possibility of usermode upcalls having messed up our count
* of interrupt nesting level during the prior busy period.
+ *
+ * If you add or remove a call to rcu_idle_enter(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_idle_enter(void)
{
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_idle_enter() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
rcu_eqs_enter(false);
}
@@ -852,10 +855,13 @@ void rcu_idle_enter(void)
* is permitted between this call and rcu_user_exit(). This way the
* CPU doesn't need to maintain the tick for RCU maintenance purposes
* when the CPU runs in userspace.
+ *
+ * If you add or remove a call to rcu_user_enter(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_user_enter(void)
{
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_user_enter() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
rcu_eqs_enter(true);
}
#endif /* CONFIG_NO_HZ_FULL */
@@ -875,13 +881,21 @@ void rcu_user_enter(void)
* Use things like work queues to work around this limitation.
*
* You have been warned.
+ *
+ * If you add or remove a call to rcu_irq_exit(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_irq_exit(void)
{
struct rcu_dynticks *rdtp;
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
rdtp = this_cpu_ptr(&rcu_dynticks);
+
+ /* Page faults can happen in NMI handlers, so check... */
+ if (rdtp->dynticks_nmi_nesting)
+ return;
+
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
rdtp->dynticks_nesting < 1);
if (rdtp->dynticks_nesting <= 1) {
@@ -894,6 +908,9 @@ void rcu_irq_exit(void)
/*
* Wrapper for rcu_irq_exit() where interrupts are enabled.
+ *
+ * If you add or remove a call to rcu_irq_exit_irqson(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_irq_exit_irqson(void)
{
@@ -942,7 +959,7 @@ static void rcu_eqs_exit(bool user)
struct rcu_dynticks *rdtp;
long long oldval;
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_exit() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
rdtp = this_cpu_ptr(&rcu_dynticks);
oldval = rdtp->dynticks_nesting;
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
@@ -966,6 +983,9 @@ static void rcu_eqs_exit(bool user)
* allow for the possibility of usermode upcalls messing up our count
* of interrupt nesting level during the busy period that is just
* now starting.
+ *
+ * If you add or remove a call to rcu_idle_exit(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_idle_exit(void)
{
@@ -982,6 +1002,9 @@ void rcu_idle_exit(void)
*
* Exit RCU idle mode while entering the kernel because it can
* run a RCU read side critical section anytime.
+ *
+ * If you add or remove a call to rcu_user_exit(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_user_exit(void)
{
@@ -1007,14 +1030,22 @@ void rcu_user_exit(void)
* Use things like work queues to work around this limitation.
*
* You have been warned.
+ *
+ * If you add or remove a call to rcu_irq_enter(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_irq_enter(void)
{
struct rcu_dynticks *rdtp;
long long oldval;
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
rdtp = this_cpu_ptr(&rcu_dynticks);
+
+ /* Page faults can happen in NMI handlers, so check... */
+ if (rdtp->dynticks_nmi_nesting)
+ return;
+
oldval = rdtp->dynticks_nesting;
rdtp->dynticks_nesting++;
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
@@ -1027,6 +1058,9 @@ void rcu_irq_enter(void)
/*
* Wrapper for rcu_irq_enter() where interrupts are enabled.
+ *
+ * If you add or remove a call to rcu_irq_enter_irqson(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_irq_enter_irqson(void)
{
@@ -1045,6 +1079,9 @@ void rcu_irq_enter_irqson(void)
* that the CPU is active. This implementation permits nested NMIs, as
* long as the nesting level does not overflow an int. (You will probably
* run out of stack space first.)
+ *
+ * If you add or remove a call to rcu_nmi_enter(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_nmi_enter(void)
{
@@ -1077,6 +1114,9 @@ void rcu_nmi_enter(void)
* RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting
* to let the RCU grace-period handling know that the CPU is back to
* being RCU-idle.
+ *
+ * If you add or remove a call to rcu_nmi_exit(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_nmi_exit(void)
{
@@ -1197,6 +1237,22 @@ static int rcu_is_cpu_rrupt_from_idle(void)
}
/*
+ * We are reporting a quiescent state on behalf of some other CPU, so
+ * it is our responsibility to check for and handle potential overflow
+ * of the rcu_node ->gpnum counter with respect to the rcu_data counters.
+ * After all, the CPU might be in deep idle state, and thus executing no
+ * code whatsoever.
+ */
+static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp)
+{
+ lockdep_assert_held(&rnp->lock);
+ if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rnp->gpnum))
+ WRITE_ONCE(rdp->gpwrap, true);
+ if (ULONG_CMP_LT(rdp->rcu_iw_gpnum + ULONG_MAX / 4, rnp->gpnum))
+ rdp->rcu_iw_gpnum = rnp->gpnum + ULONG_MAX / 4;
+}
+
+/*
* Snapshot the specified CPU's dynticks counter so that we can later
* credit them with an implicit quiescent state. Return 1 if this CPU
* is in dynticks idle mode, which is an extended quiescent state.
@@ -1206,15 +1262,34 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks);
if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) {
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
- if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4,
- rdp->mynode->gpnum))
- WRITE_ONCE(rdp->gpwrap, true);
+ rcu_gpnum_ovf(rdp->mynode, rdp);
return 1;
}
return 0;
}
/*
+ * Handler for the irq_work request posted when a grace period has
+ * gone on for too long, but not yet long enough for an RCU CPU
+ * stall warning. Set state appropriately, but just complain if
+ * there is unexpected state on entry.
+ */
+static void rcu_iw_handler(struct irq_work *iwp)
+{
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+
+ rdp = container_of(iwp, struct rcu_data, rcu_iw);
+ rnp = rdp->mynode;
+ raw_spin_lock_rcu_node(rnp);
+ if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) {
+ rdp->rcu_iw_gpnum = rnp->gpnum;
+ rdp->rcu_iw_pending = false;
+ }
+ raw_spin_unlock_rcu_node(rnp);
+}
+
+/*
* Return true if the specified CPU has passed through a quiescent
* state by virtue of being in or having passed through an dynticks
* idle state since the last call to dyntick_save_progress_counter()
@@ -1225,8 +1300,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
unsigned long jtsq;
bool *rnhqp;
bool *ruqp;
- unsigned long rjtsc;
- struct rcu_node *rnp;
+ struct rcu_node *rnp = rdp->mynode;
/*
* If the CPU passed through or entered a dynticks idle phase with
@@ -1239,34 +1313,25 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
if (rcu_dynticks_in_eqs_since(rdp->dynticks, rdp->dynticks_snap)) {
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
rdp->dynticks_fqs++;
+ rcu_gpnum_ovf(rnp, rdp);
return 1;
}
- /* Compute and saturate jiffies_till_sched_qs. */
- jtsq = jiffies_till_sched_qs;
- rjtsc = rcu_jiffies_till_stall_check();
- if (jtsq > rjtsc / 2) {
- WRITE_ONCE(jiffies_till_sched_qs, rjtsc);
- jtsq = rjtsc / 2;
- } else if (jtsq < 1) {
- WRITE_ONCE(jiffies_till_sched_qs, 1);
- jtsq = 1;
- }
-
/*
* Has this CPU encountered a cond_resched_rcu_qs() since the
* beginning of the grace period? For this to be the case,
* the CPU has to have noticed the current grace period. This
* might not be the case for nohz_full CPUs looping in the kernel.
*/
- rnp = rdp->mynode;
+ jtsq = jiffies_till_sched_qs;
ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu);
if (time_after(jiffies, rdp->rsp->gp_start + jtsq) &&
READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_dynticks.rcu_qs_ctr, rdp->cpu) &&
READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) {
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc"));
+ rcu_gpnum_ovf(rnp, rdp);
return 1;
- } else {
+ } else if (time_after(jiffies, rdp->rsp->gp_start + jtsq)) {
/* Load rcu_qs_ctr before store to rcu_urgent_qs. */
smp_store_release(ruqp, true);
}
@@ -1275,6 +1340,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp))) {
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl"));
rdp->offline_fqs++;
+ rcu_gpnum_ovf(rnp, rdp);
return 1;
}
@@ -1294,10 +1360,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
* updates are only once every few jiffies, the probability of
* lossage (and thus of slight grace-period extension) is
* quite low.
- *
- * Note that if the jiffies_till_sched_qs boot/sysfs parameter
- * is set too high, we override with half of the RCU CPU stall
- * warning delay.
*/
rnhqp = &per_cpu(rcu_dynticks.rcu_need_heavy_qs, rdp->cpu);
if (!READ_ONCE(*rnhqp) &&
@@ -1306,15 +1368,26 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
WRITE_ONCE(*rnhqp, true);
/* Store rcu_need_heavy_qs before rcu_urgent_qs. */
smp_store_release(ruqp, true);
- rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
+ rdp->rsp->jiffies_resched += jtsq; /* Re-enable beating. */
}
/*
- * If more than halfway to RCU CPU stall-warning time, do
- * a resched_cpu() to try to loosen things up a bit.
+ * If more than halfway to RCU CPU stall-warning time, do a
+ * resched_cpu() to try to loosen things up a bit. Also check to
+ * see if the CPU is getting hammered with interrupts, but only
+ * once per grace period, just to keep the IPIs down to a dull roar.
*/
- if (jiffies - rdp->rsp->gp_start > rcu_jiffies_till_stall_check() / 2)
+ if (jiffies - rdp->rsp->gp_start > rcu_jiffies_till_stall_check() / 2) {
resched_cpu(rdp->cpu);
+ if (IS_ENABLED(CONFIG_IRQ_WORK) &&
+ !rdp->rcu_iw_pending && rdp->rcu_iw_gpnum != rnp->gpnum &&
+ (rnp->ffmask & rdp->grpmask)) {
+ init_irq_work(&rdp->rcu_iw, rcu_iw_handler);
+ rdp->rcu_iw_pending = true;
+ rdp->rcu_iw_gpnum = rnp->gpnum;
+ irq_work_queue_on(&rdp->rcu_iw, rdp->cpu);
+ }
+ }
return 0;
}
@@ -1503,6 +1576,7 @@ static void print_cpu_stall(struct rcu_state *rsp)
{
int cpu;
unsigned long flags;
+ struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
struct rcu_node *rnp = rcu_get_root(rsp);
long totqlen = 0;
@@ -1518,7 +1592,9 @@ static void print_cpu_stall(struct rcu_state *rsp)
*/
pr_err("INFO: %s self-detected stall on CPU", rsp->name);
print_cpu_stall_info_begin();
+ raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags);
print_cpu_stall_info(rsp, smp_processor_id());
+ raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags);
print_cpu_stall_info_end();
for_each_possible_cpu(cpu)
totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda,
@@ -1912,6 +1988,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
rdp->core_needs_qs = need_gp;
zero_cpu_stall_ticks(rdp);
WRITE_ONCE(rdp->gpwrap, false);
+ rcu_gpnum_ovf(rnp, rdp);
}
return ret;
}
@@ -3087,9 +3164,10 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func,
* read-side critical sections have completed. call_rcu_sched() assumes
* that the read-side critical sections end on enabling of preemption
* or on voluntary preemption.
- * RCU read-side critical sections are delimited by :
- * - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR
- * - anything that disables preemption.
+ * RCU read-side critical sections are delimited by:
+ *
+ * - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR
+ * - anything that disables preemption.
*
* These may be nested.
*
@@ -3114,11 +3192,12 @@ EXPORT_SYMBOL_GPL(call_rcu_sched);
* handler. This means that read-side critical sections in process
* context must not be interrupted by softirqs. This interface is to be
* used when most of the read-side critical sections are in softirq context.
- * RCU read-side critical sections are delimited by :
- * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context.
- * OR
- * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context.
- * These may be nested.
+ * RCU read-side critical sections are delimited by:
+ *
+ * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context, OR
+ * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context.
+ *
+ * These may be nested.
*
* See the description of call_rcu() for more detailed information on
* memory ordering guarantees.
@@ -3690,6 +3769,8 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
rdp->cpu_no_qs.b.norm = true;
rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu);
rdp->core_needs_qs = false;
+ rdp->rcu_iw_pending = false;
+ rdp->rcu_iw_gpnum = rnp->gpnum - 1;
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
@@ -3727,10 +3808,24 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
*/
int rcutree_online_cpu(unsigned int cpu)
{
- sync_sched_exp_online_cleanup(cpu);
- rcutree_affinity_setting(cpu, -1);
+ unsigned long flags;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp) {
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+ rnp = rdp->mynode;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ rnp->ffmask |= rdp->grpmask;
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
if (IS_ENABLED(CONFIG_TREE_SRCU))
srcu_online_cpu(cpu);
+ if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
+ return 0; /* Too early in boot for scheduler work. */
+ sync_sched_exp_online_cleanup(cpu);
+ rcutree_affinity_setting(cpu, -1);
return 0;
}
@@ -3740,6 +3835,19 @@ int rcutree_online_cpu(unsigned int cpu)
*/
int rcutree_offline_cpu(unsigned int cpu)
{
+ unsigned long flags;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp) {
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+ rnp = rdp->mynode;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ rnp->ffmask &= ~rdp->grpmask;
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
+
rcutree_affinity_setting(cpu, cpu);
if (IS_ENABLED(CONFIG_TREE_SRCU))
srcu_offline_cpu(cpu);
@@ -4188,8 +4296,7 @@ void __init rcu_init(void)
for_each_online_cpu(cpu) {
rcutree_prepare_cpu(cpu);
rcu_cpu_starting(cpu);
- if (IS_ENABLED(CONFIG_TREE_SRCU))
- srcu_online_cpu(cpu);
+ rcutree_online_cpu(cpu);
}
}
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 8e1f285f0a70..46a5d1991450 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -103,6 +103,7 @@ struct rcu_node {
/* Online CPUs for next expedited GP. */
/* Any CPU that has ever been online will */
/* have its bit set. */
+ unsigned long ffmask; /* Fully functional CPUs. */
unsigned long grpmask; /* Mask to apply to parent qsmask. */
/* Only one bit will be set in this mask. */
int grplo; /* lowest-numbered CPU or group here. */
@@ -285,6 +286,10 @@ struct rcu_data {
/* 8) RCU CPU stall data. */
unsigned int softirq_snap; /* Snapshot of softirq activity. */
+ /* ->rcu_iw* fields protected by leaf rcu_node ->lock. */
+ struct irq_work rcu_iw; /* Check for non-irq activity. */
+ bool rcu_iw_pending; /* Is ->rcu_iw pending? */
+ unsigned long rcu_iw_gpnum; /* ->gpnum associated with ->rcu_iw. */
int cpu;
struct rcu_state *rsp;
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index e012b9be777e..db85ca3975f1 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -29,6 +29,7 @@
#include <linux/oom.h>
#include <linux/sched/debug.h>
#include <linux/smpboot.h>
+#include <linux/sched/isolation.h>
#include <uapi/linux/sched/types.h>
#include "../time/tick-internal.h"
@@ -54,6 +55,7 @@ DEFINE_PER_CPU(char, rcu_cpu_has_work);
* This probably needs to be excluded from -rt builds.
*/
#define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; })
+#define rt_mutex_futex_unlock(x) WARN_ON_ONCE(1)
#endif /* #else #ifdef CONFIG_RCU_BOOST */
@@ -325,7 +327,7 @@ static void rcu_preempt_note_context_switch(bool preempt)
struct rcu_data *rdp;
struct rcu_node *rnp;
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_preempt_note_context_switch() invoked with interrupts enabled!!!\n");
+ lockdep_assert_irqs_disabled();
WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0);
if (t->rcu_read_lock_nesting > 0 &&
!t->rcu_read_unlock_special.b.blocked) {
@@ -530,7 +532,7 @@ void rcu_read_unlock_special(struct task_struct *t)
/* Unboost if we were boosted. */
if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
- rt_mutex_unlock(&rnp->boost_mtx);
+ rt_mutex_futex_unlock(&rnp->boost_mtx);
/*
* If this was the last task on the expedited lists,
@@ -911,8 +913,6 @@ void exit_rcu(void)
#ifdef CONFIG_RCU_BOOST
-#include "../locking/rtmutex_common.h"
-
static void rcu_wake_cond(struct task_struct *t, int status)
{
/*
@@ -1421,7 +1421,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
unsigned long dj;
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_needs_cpu() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
/* Snapshot to detect later posting of non-lazy callback. */
rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
@@ -1470,7 +1470,7 @@ static void rcu_prepare_for_idle(void)
struct rcu_state *rsp;
int tne;
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_prepare_for_idle() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
if (rcu_is_nocb_cpu(smp_processor_id()))
return;
@@ -1507,7 +1507,7 @@ static void rcu_prepare_for_idle(void)
rdtp->last_accelerate = jiffies;
for_each_rcu_flavor(rsp) {
rdp = this_cpu_ptr(rsp->rda);
- if (rcu_segcblist_pend_cbs(&rdp->cblist))
+ if (!rcu_segcblist_pend_cbs(&rdp->cblist))
continue;
rnp = rdp->mynode;
raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
@@ -1525,7 +1525,7 @@ static void rcu_prepare_for_idle(void)
*/
static void rcu_cleanup_after_idle(void)
{
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_cleanup_after_idle() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
if (rcu_is_nocb_cpu(smp_processor_id()))
return;
if (rcu_try_advance_all_cbs())
@@ -1671,6 +1671,7 @@ static void print_cpu_stall_info_begin(void)
*/
static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
{
+ unsigned long delta;
char fast_no_hz[72];
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_dynticks *rdtp = rdp->dynticks;
@@ -1685,11 +1686,15 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
ticks_value = rsp->gpnum - rdp->gpnum;
}
print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
- pr_err("\t%d-%c%c%c: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n",
+ delta = rdp->mynode->gpnum - rdp->rcu_iw_gpnum;
+ pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n",
cpu,
"O."[!!cpu_online(cpu)],
"o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
"N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)],
+ !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' :
+ rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' :
+ "!."[!delta],
ticks_value, ticks_title,
rcu_dynticks_snap(rdtp) & 0xfff,
rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
@@ -2012,7 +2017,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
struct rcu_data *rdp,
unsigned long flags)
{
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_nocb_adopt_orphan_cbs() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
if (!rcu_is_nocb_cpu(smp_processor_id()))
return false; /* Not NOCBs CPU, caller must migrate CBs. */
__call_rcu_nocb_enqueue(my_rdp, rcu_segcblist_head(&rdp->cblist),
@@ -2261,9 +2266,11 @@ static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
}
/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
-static void do_nocb_deferred_wakeup_timer(unsigned long x)
+static void do_nocb_deferred_wakeup_timer(struct timer_list *t)
{
- do_nocb_deferred_wakeup_common((struct rcu_data *)x);
+ struct rcu_data *rdp = from_timer(rdp, t, nocb_timer);
+
+ do_nocb_deferred_wakeup_common(rdp);
}
/*
@@ -2327,8 +2334,7 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
init_swait_queue_head(&rdp->nocb_wq);
rdp->nocb_follower_tail = &rdp->nocb_follower_head;
raw_spin_lock_init(&rdp->nocb_lock);
- setup_timer(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer,
- (unsigned long)rdp);
+ timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
}
/*
@@ -2583,7 +2589,7 @@ static void rcu_bind_gp_kthread(void)
if (!tick_nohz_full_enabled())
return;
- housekeeping_affine(current);
+ housekeeping_affine(current, HK_FLAG_RCU);
}
/* Record the current task on dyntick-idle entry. */
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 5033b66d2753..fbd56d6e575b 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -51,6 +51,7 @@
#include <linux/kthread.h>
#include <linux/tick.h>
#include <linux/rcupdate_wait.h>
+#include <linux/sched/isolation.h>
#define CREATE_TRACE_POINTS
@@ -494,6 +495,7 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
#endif
int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
+EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress);
static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
module_param(rcu_cpu_stall_suppress, int, 0644);
@@ -575,7 +577,6 @@ DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu);
static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT;
module_param(rcu_task_stall_timeout, int, 0644);
-static void rcu_spawn_tasks_kthread(void);
static struct task_struct *rcu_tasks_kthread_ptr;
/**
@@ -600,7 +601,6 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func)
{
unsigned long flags;
bool needwake;
- bool havetask = READ_ONCE(rcu_tasks_kthread_ptr);
rhp->next = NULL;
rhp->func = func;
@@ -610,11 +610,8 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func)
rcu_tasks_cbs_tail = &rhp->next;
raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);
/* We can't create the thread unless interrupts are enabled. */
- if ((needwake && havetask) ||
- (!havetask && !irqs_disabled_flags(flags))) {
- rcu_spawn_tasks_kthread();
+ if (needwake && READ_ONCE(rcu_tasks_kthread_ptr))
wake_up(&rcu_tasks_cbs_wq);
- }
}
EXPORT_SYMBOL_GPL(call_rcu_tasks);
@@ -718,7 +715,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
LIST_HEAD(rcu_tasks_holdouts);
/* Run on housekeeping CPUs by default. Sysadm can move if desired. */
- housekeeping_affine(current);
+ housekeeping_affine(current, HK_FLAG_RCU);
/*
* Each pass through the following loop makes one check for
@@ -853,27 +850,18 @@ static int __noreturn rcu_tasks_kthread(void *arg)
}
}
-/* Spawn rcu_tasks_kthread() at first call to call_rcu_tasks(). */
-static void rcu_spawn_tasks_kthread(void)
+/* Spawn rcu_tasks_kthread() at core_initcall() time. */
+static int __init rcu_spawn_tasks_kthread(void)
{
- static DEFINE_MUTEX(rcu_tasks_kthread_mutex);
struct task_struct *t;
- if (READ_ONCE(rcu_tasks_kthread_ptr)) {
- smp_mb(); /* Ensure caller sees full kthread. */
- return;
- }
- mutex_lock(&rcu_tasks_kthread_mutex);
- if (rcu_tasks_kthread_ptr) {
- mutex_unlock(&rcu_tasks_kthread_mutex);
- return;
- }
t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread");
BUG_ON(IS_ERR(t));
smp_mb(); /* Ensure others see full kthread. */
WRITE_ONCE(rcu_tasks_kthread_ptr, t);
- mutex_unlock(&rcu_tasks_kthread_mutex);
+ return 0;
}
+core_initcall(rcu_spawn_tasks_kthread);
/* Do the srcu_read_lock() for the above synchronize_srcu(). */
void exit_tasks_rcu_start(void)
diff --git a/kernel/reboot.c b/kernel/reboot.c
index bd30a973fe94..e4ced883d8de 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -104,6 +104,33 @@ int unregister_reboot_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL(unregister_reboot_notifier);
+static void devm_unregister_reboot_notifier(struct device *dev, void *res)
+{
+ WARN_ON(unregister_reboot_notifier(*(struct notifier_block **)res));
+}
+
+int devm_register_reboot_notifier(struct device *dev, struct notifier_block *nb)
+{
+ struct notifier_block **rcnb;
+ int ret;
+
+ rcnb = devres_alloc(devm_unregister_reboot_notifier,
+ sizeof(*rcnb), GFP_KERNEL);
+ if (!rcnb)
+ return -ENOMEM;
+
+ ret = register_reboot_notifier(nb);
+ if (!ret) {
+ *rcnb = nb;
+ devres_add(dev, rcnb);
+ } else {
+ devres_free(rcnb);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(devm_register_reboot_notifier);
+
/*
* Notifier list for kernel code which wants to be called
* to restart the system.
diff --git a/kernel/resource.c b/kernel/resource.c
index 9b5f04404152..54ba6de3757c 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -397,9 +397,32 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc,
res->start = p->start;
if (res->end > p->end)
res->end = p->end;
+ res->flags = p->flags;
+ res->desc = p->desc;
return 0;
}
+static int __walk_iomem_res_desc(struct resource *res, unsigned long desc,
+ bool first_level_children_only,
+ void *arg,
+ int (*func)(struct resource *, void *))
+{
+ u64 orig_end = res->end;
+ int ret = -1;
+
+ while ((res->start < res->end) &&
+ !find_next_iomem_res(res, desc, first_level_children_only)) {
+ ret = (*func)(res, arg);
+ if (ret)
+ break;
+
+ res->start = res->end + 1;
+ res->end = orig_end;
+ }
+
+ return ret;
+}
+
/*
* Walks through iomem resources and calls func() with matching resource
* ranges. This walks through whole tree and not just first level children.
@@ -415,29 +438,15 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc,
* <linux/ioport.h> and set it in 'desc' of a target resource entry.
*/
int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
- u64 end, void *arg, int (*func)(u64, u64, void *))
+ u64 end, void *arg, int (*func)(struct resource *, void *))
{
struct resource res;
- u64 orig_end;
- int ret = -1;
res.start = start;
res.end = end;
res.flags = flags;
- orig_end = res.end;
-
- while ((res.start < res.end) &&
- (!find_next_iomem_res(&res, desc, false))) {
-
- ret = (*func)(res.start, res.end, arg);
- if (ret)
- break;
-
- res.start = res.end + 1;
- res.end = orig_end;
- }
- return ret;
+ return __walk_iomem_res_desc(&res, desc, false, arg, func);
}
/*
@@ -448,25 +457,33 @@ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
* ranges.
*/
int walk_system_ram_res(u64 start, u64 end, void *arg,
- int (*func)(u64, u64, void *))
+ int (*func)(struct resource *, void *))
{
struct resource res;
- u64 orig_end;
- int ret = -1;
res.start = start;
res.end = end;
res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
- orig_end = res.end;
- while ((res.start < res.end) &&
- (!find_next_iomem_res(&res, IORES_DESC_NONE, true))) {
- ret = (*func)(res.start, res.end, arg);
- if (ret)
- break;
- res.start = res.end + 1;
- res.end = orig_end;
- }
- return ret;
+
+ return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true,
+ arg, func);
+}
+
+/*
+ * This function calls the @func callback against all memory ranges, which
+ * are ranges marked as IORESOURCE_MEM and IORESOUCE_BUSY.
+ */
+int walk_mem_res(u64 start, u64 end, void *arg,
+ int (*func)(struct resource *, void *))
+{
+ struct resource res;
+
+ res.start = start;
+ res.end = end;
+ res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+
+ return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true,
+ arg, func);
}
#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
@@ -508,6 +525,7 @@ static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
{
return 1;
}
+
/*
* This generic page_is_ram() returns true if specified address is
* registered as System RAM in iomem_resource list.
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 78f54932ea1d..e2f9d4feff40 10