summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile8
-rw-r--r--kernel/acct.c6
-rw-r--r--kernel/audit.c100
-rw-r--r--kernel/audit.h2
-rw-r--r--kernel/auditfilter.c16
-rw-r--r--kernel/auditsc.c31
-rw-r--r--kernel/bpf/Makefile8
-rw-r--r--kernel/bpf/arraymap.c10
-rw-r--r--kernel/bpf/bpf_iter.c539
-rw-r--r--kernel/bpf/bpf_lsm.c2
-rw-r--r--kernel/bpf/bpf_struct_ops.c2
-rw-r--r--kernel/bpf/btf.c49
-rw-r--r--kernel/bpf/cgroup.c201
-rw-r--r--kernel/bpf/core.c16
-rw-r--r--kernel/bpf/cpumap.c25
-rw-r--r--kernel/bpf/devmap.c135
-rw-r--r--kernel/bpf/hashtab.c4
-rw-r--r--kernel/bpf/helpers.c125
-rw-r--r--kernel/bpf/inode.c5
-rw-r--r--kernel/bpf/lpm_trie.c2
-rw-r--r--kernel/bpf/map_in_map.c2
-rw-r--r--kernel/bpf/map_iter.c102
-rw-r--r--kernel/bpf/net_namespace.c373
-rw-r--r--kernel/bpf/queue_stack_maps.c4
-rw-r--r--kernel/bpf/reuseport_array.c2
-rw-r--r--kernel/bpf/ringbuf.c501
-rw-r--r--kernel/bpf/stackmap.c19
-rw-r--r--kernel/bpf/syscall.c664
-rw-r--r--kernel/bpf/task_iter.c353
-rw-r--r--kernel/bpf/verifier.c389
-rw-r--r--kernel/bpf/xskmap.c265
-rw-r--r--kernel/cgroup/cgroup.c37
-rw-r--r--kernel/cgroup/cpuset.c4
-rw-r--r--kernel/cgroup/namespace.c5
-rw-r--r--kernel/cgroup/rstat.c60
-rw-r--r--kernel/compat.c12
-rw-r--r--kernel/context_tracking.c14
-rw-r--r--kernel/cpu.c18
-rw-r--r--kernel/cpu_pm.c4
-rw-r--r--kernel/cred.c3
-rw-r--r--kernel/debug/debug_core.c81
-rw-r--r--kernel/debug/gdbstub.c6
-rw-r--r--kernel/debug/kdb/kdb_bt.c15
-rw-r--r--kernel/debug/kdb/kdb_io.c72
-rw-r--r--kernel/debug/kdb/kdb_main.c14
-rw-r--r--kernel/debug/kdb/kdb_support.c7
-rw-r--r--kernel/dma/Kconfig9
-rw-r--r--kernel/dma/Makefile1
-rw-r--r--kernel/dma/contiguous.c4
-rw-r--r--kernel/dma/debug.c2
-rw-r--r--kernel/dma/direct.c113
-rw-r--r--kernel/dma/pool.c263
-rw-r--r--kernel/dma/remap.c170
-rw-r--r--kernel/events/callchain.c2
-rw-r--r--kernel/events/core.c22
-rw-r--r--kernel/events/hw_breakpoint.c16
-rw-r--r--kernel/events/uprobes.c48
-rw-r--r--kernel/exit.c57
-rw-r--r--kernel/fork.c28
-rw-r--r--kernel/futex.c7
-rw-r--r--kernel/gcov/Kconfig28
-rw-r--r--kernel/gcov/Makefile3
-rw-r--r--kernel/gcov/gcc_3_4.c573
-rwxr-xr-xkernel/gen_kheaders.sh2
-rw-r--r--kernel/groups.c2
-rw-r--r--kernel/hung_task.c30
-rw-r--r--kernel/irq/Kconfig5
-rw-r--r--kernel/irq/irq_sim.c267
-rw-r--r--kernel/irq/irqdomain.c53
-rw-r--r--kernel/irq/manage.c4
-rw-r--r--kernel/irq_work.c53
-rw-r--r--kernel/kcov.c282
-rw-r--r--kernel/kcsan/Makefile14
-rw-r--r--kernel/kcsan/atomic.h20
-rw-r--r--kernel/kcsan/core.c850
-rw-r--r--kernel/kcsan/debugfs.c349
-rw-r--r--kernel/kcsan/encoding.h95
-rw-r--r--kernel/kcsan/kcsan.h142
-rw-r--r--kernel/kcsan/report.c634
-rw-r--r--kernel/kcsan/test.c131
-rw-r--r--kernel/kexec_file.c39
-rw-r--r--kernel/kprobes.c97
-rw-r--r--kernel/kthread.c80
-rw-r--r--kernel/latencytop.c4
-rw-r--r--kernel/livepatch/core.c178
-rw-r--r--kernel/locking/Makefile3
-rw-r--r--kernel/locking/lockdep.c94
-rw-r--r--kernel/locking/rtmutex-debug.c2
-rw-r--r--kernel/module.c103
-rw-r--r--kernel/notifier.c1
-rw-r--r--kernel/nsproxy.c305
-rw-r--r--kernel/padata.c281
-rw-r--r--kernel/panic.c49
-rw-r--r--kernel/pid.c22
-rw-r--r--kernel/pid_namespace.c7
-rw-r--r--kernel/power/Kconfig36
-rw-r--r--kernel/power/Makefile3
-rw-r--r--kernel/power/hibernate.c20
-rw-r--r--kernel/power/power.h4
-rw-r--r--kernel/power/poweroff.c2
-rw-r--r--kernel/power/snapshot.c1
-rw-r--r--kernel/power/swap.c2
-rw-r--r--kernel/power/user.c22
-rw-r--r--kernel/printk/printk.c26
-rw-r--r--kernel/printk/printk_safe.c7
-rw-r--r--kernel/rcu/tree.c32
-rw-r--r--kernel/rcu/tree_stall.h2
-rw-r--r--kernel/relay.c33
-rw-r--r--kernel/resource.c5
-rw-r--r--kernel/sched/Makefile6
-rw-r--r--kernel/sched/core.c306
-rw-r--r--kernel/sched/cpuacct.c7
-rw-r--r--kernel/sched/deadline.c1
-rw-r--r--kernel/sched/debug.c9
-rw-r--r--kernel/sched/fair.c268
-rw-r--r--kernel/sched/idle.c21
-rw-r--r--kernel/sched/pelt.c24
-rw-r--r--kernel/sched/rt.c22
-rw-r--r--kernel/sched/sched.h36
-rw-r--r--kernel/sched/smp.h9
-rw-r--r--kernel/sched/topology.c33
-rw-r--r--kernel/scs.c104
-rw-r--r--kernel/seccomp.c2
-rw-r--r--kernel/signal.c116
-rw-r--r--kernel/smp.c157
-rw-r--r--kernel/softirq.c44
-rw-r--r--kernel/sys.c72
-rw-r--r--kernel/sysctl.c3110
-rw-r--r--kernel/task_work.c16
-rw-r--r--kernel/time/clocksource.c2
-rw-r--r--kernel/time/namespace.c5
-rw-r--r--kernel/time/posix-cpu-timers.c111
-rw-r--r--kernel/time/timekeeping.c2
-rw-r--r--kernel/time/timer.c3
-rw-r--r--kernel/trace/Kconfig52
-rw-r--r--kernel/trace/Makefile4
-rw-r--r--kernel/trace/blktrace.c70
-rw-r--r--kernel/trace/bpf_trace.c399
-rw-r--r--kernel/trace/ftrace.c28
-rw-r--r--kernel/trace/ring_buffer.c2
-rw-r--r--kernel/trace/trace.c33
-rw-r--r--kernel/trace/trace.h4
-rw-r--r--kernel/trace/trace_boot.c10
-rw-r--r--kernel/trace/trace_entries.h14
-rw-r--r--kernel/trace/trace_events.c4
-rw-r--r--kernel/trace/trace_events_hist.c2073
-rw-r--r--kernel/trace/trace_events_synth.c1789
-rw-r--r--kernel/trace/trace_events_trigger.c21
-rw-r--r--kernel/trace/trace_export.c16
-rw-r--r--kernel/trace/trace_functions.c2
-rw-r--r--kernel/trace/trace_kprobe.c74
-rw-r--r--kernel/trace/trace_output.c4
-rw-r--r--kernel/trace/trace_preemptirq.c43
-rw-r--r--kernel/trace/trace_probe.c6
-rw-r--r--kernel/trace/trace_probe.h2
-rw-r--r--kernel/trace/trace_stack.c5
-rw-r--r--kernel/trace/trace_synth.h36
-rw-r--r--kernel/trace/trace_uprobe.c2
-rw-r--r--kernel/umh.c2
-rw-r--r--kernel/user.c2
-rw-r--r--kernel/user_namespace.c8
-rw-r--r--kernel/utsname.c5
-rw-r--r--kernel/utsname_sysctl.c2
-rw-r--r--kernel/watch_queue.c655
-rw-r--r--kernel/watchdog.c49
-rw-r--r--kernel/workqueue.c217
166 files changed, 13590 insertions, 6580 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 4cb4130ced32..f3218bc5ec69 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -23,6 +23,9 @@ endif
# Prevents flicker of uninteresting __do_softirq()/__local_bh_disable_ip()
# in coverage traces.
KCOV_INSTRUMENT_softirq.o := n
+# Avoid KCSAN instrumentation in softirq ("No shared variables, all the data
+# are CPU local" => assume no data races), to reduce overhead in interrupts.
+KCSAN_SANITIZE_softirq.o = n
# These are called from save_stack_trace() on slub debug path,
# and produce insane amounts of uninteresting coverage.
KCOV_INSTRUMENT_module.o := n
@@ -31,6 +34,7 @@ KCOV_INSTRUMENT_stacktrace.o := n
# Don't self-instrument.
KCOV_INSTRUMENT_kcov.o := n
KASAN_SANITIZE_kcov.o := n
+KCSAN_SANITIZE_kcov.o := n
CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
# cond_syscall is currently not LTO compatible
@@ -103,6 +107,8 @@ obj-$(CONFIG_TRACEPOINTS) += trace/
obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-$(CONFIG_CPU_PM) += cpu_pm.o
obj-$(CONFIG_BPF) += bpf/
+obj-$(CONFIG_KCSAN) += kcsan/
+obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o
obj-$(CONFIG_PERF_EVENTS) += events/
@@ -115,11 +121,13 @@ obj-$(CONFIG_TORTURE_TEST) += torture.o
obj-$(CONFIG_HAS_IOMEM) += iomem.o
obj-$(CONFIG_RSEQ) += rseq.o
+obj-$(CONFIG_WATCH_QUEUE) += watch_queue.o
obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o
obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o
KASAN_SANITIZE_stackleak.o := n
+KCSAN_SANITIZE_stackleak.o := n
KCOV_INSTRUMENT_stackleak.o := n
$(obj)/configs.o: $(obj)/config_data.gz
diff --git a/kernel/acct.c b/kernel/acct.c
index 11ff4a596d6b..b0c5b3a9f5af 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -40,7 +40,7 @@
* is one more bug... 10/11/98, AV.
*
* Oh, fsck... Oopsable SMP race in do_process_acct() - we must hold
- * ->mmap_sem to walk the vma list of current->mm. Nasty, since it leaks
+ * ->mmap_lock to walk the vma list of current->mm. Nasty, since it leaks
* a struct file opened for write. Fixed. 2/6/2000, AV.
*/
@@ -541,13 +541,13 @@ void acct_collect(long exitcode, int group_dead)
if (group_dead && current->mm) {
struct vm_area_struct *vma;
- down_read(&current->mm->mmap_sem);
+ mmap_read_lock(current->mm);
vma = current->mm->mmap;
while (vma) {
vsize += vma->vm_end - vma->vm_start;
vma = vma->vm_next;
}
- up_read(&current->mm->mmap_sem);
+ mmap_read_unlock(current->mm);
}
spin_lock_irq(&current->sighand->siglock);
diff --git a/kernel/audit.c b/kernel/audit.c
index 87f31bf1f0a0..8c201f414226 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -880,7 +880,7 @@ main_queue:
return 0;
}
-int audit_send_list(void *_dest)
+int audit_send_list_thread(void *_dest)
{
struct audit_netlink_list *dest = _dest;
struct sk_buff *skb;
@@ -924,19 +924,30 @@ out_kfree_skb:
return NULL;
}
+static void audit_free_reply(struct audit_reply *reply)
+{
+ if (!reply)
+ return;
+
+ if (reply->skb)
+ kfree_skb(reply->skb);
+ if (reply->net)
+ put_net(reply->net);
+ kfree(reply);
+}
+
static int audit_send_reply_thread(void *arg)
{
struct audit_reply *reply = (struct audit_reply *)arg;
- struct sock *sk = audit_get_sk(reply->net);
audit_ctl_lock();
audit_ctl_unlock();
/* Ignore failure. It'll only happen if the sender goes away,
because our timeout is set to infinite. */
- netlink_unicast(sk, reply->skb, reply->portid, 0);
- put_net(reply->net);
- kfree(reply);
+ netlink_unicast(audit_get_sk(reply->net), reply->skb, reply->portid, 0);
+ reply->skb = NULL;
+ audit_free_reply(reply);
return 0;
}
@@ -950,35 +961,32 @@ static int audit_send_reply_thread(void *arg)
* @payload: payload data
* @size: payload size
*
- * Allocates an skb, builds the netlink message, and sends it to the port id.
- * No failure notifications.
+ * Allocates a skb, builds the netlink message, and sends it to the port id.
*/
static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done,
int multi, const void *payload, int size)
{
- struct net *net = sock_net(NETLINK_CB(request_skb).sk);
- struct sk_buff *skb;
struct task_struct *tsk;
- struct audit_reply *reply = kmalloc(sizeof(struct audit_reply),
- GFP_KERNEL);
+ struct audit_reply *reply;
+ reply = kzalloc(sizeof(*reply), GFP_KERNEL);
if (!reply)
return;
- skb = audit_make_reply(seq, type, done, multi, payload, size);
- if (!skb)
- goto out;
-
- reply->net = get_net(net);
+ reply->skb = audit_make_reply(seq, type, done, multi, payload, size);
+ if (!reply->skb)
+ goto err;
+ reply->net = get_net(sock_net(NETLINK_CB(request_skb).sk));
reply->portid = NETLINK_CB(request_skb).portid;
- reply->skb = skb;
tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply");
- if (!IS_ERR(tsk))
- return;
- kfree_skb(skb);
-out:
- kfree(reply);
+ if (IS_ERR(tsk))
+ goto err;
+
+ return;
+
+err:
+ audit_free_reply(reply);
}
/*
@@ -1525,20 +1533,60 @@ static void audit_receive(struct sk_buff *skb)
audit_ctl_unlock();
}
+/* Log information about who is connecting to the audit multicast socket */
+static void audit_log_multicast(int group, const char *op, int err)
+{
+ const struct cred *cred;
+ struct tty_struct *tty;
+ char comm[sizeof(current->comm)];
+ struct audit_buffer *ab;
+
+ if (!audit_enabled)
+ return;
+
+ ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_EVENT_LISTENER);
+ if (!ab)
+ return;
+
+ cred = current_cred();
+ tty = audit_get_tty();
+ audit_log_format(ab, "pid=%u uid=%u auid=%u tty=%s ses=%u",
+ task_pid_nr(current),
+ from_kuid(&init_user_ns, cred->uid),
+ from_kuid(&init_user_ns, audit_get_loginuid(current)),
+ tty ? tty_name(tty) : "(none)",
+ audit_get_sessionid(current));
+ audit_put_tty(tty);
+ audit_log_task_context(ab); /* subj= */
+ audit_log_format(ab, " comm=");
+ audit_log_untrustedstring(ab, get_task_comm(comm, current));
+ audit_log_d_path_exe(ab, current->mm); /* exe= */
+ audit_log_format(ab, " nl-mcgrp=%d op=%s res=%d", group, op, !err);
+ audit_log_end(ab);
+}
+
/* Run custom bind function on netlink socket group connect or bind requests. */
-static int audit_bind(struct net *net, int group)
+static int audit_multicast_bind(struct net *net, int group)
{
+ int err = 0;
+
if (!capable(CAP_AUDIT_READ))
- return -EPERM;
+ err = -EPERM;
+ audit_log_multicast(group, "connect", err);
+ return err;
+}
- return 0;
+static void audit_multicast_unbind(struct net *net, int group)
+{
+ audit_log_multicast(group, "disconnect", 0);
}
static int __net_init audit_net_init(struct net *net)
{
struct netlink_kernel_cfg cfg = {
.input = audit_receive,
- .bind = audit_bind,
+ .bind = audit_multicast_bind,
+ .unbind = audit_multicast_unbind,
.flags = NL_CFG_F_NONROOT_RECV,
.groups = AUDIT_NLGRP_MAX,
};
diff --git a/kernel/audit.h b/kernel/audit.h
index 2eed4d231624..f0233dc40b17 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -229,7 +229,7 @@ struct audit_netlink_list {
struct sk_buff_head q;
};
-int audit_send_list(void *_dest);
+int audit_send_list_thread(void *_dest);
extern int selinux_audit_rule_update(void);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 026e34da4ace..a10e2997aa6c 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1161,11 +1161,8 @@ int audit_rule_change(int type, int seq, void *data, size_t datasz)
*/
int audit_list_rules_send(struct sk_buff *request_skb, int seq)
{
- u32 portid = NETLINK_CB(request_skb).portid;
- struct net *net = sock_net(NETLINK_CB(request_skb).sk);
struct task_struct *tsk;
struct audit_netlink_list *dest;
- int err = 0;
/* We can't just spew out the rules here because we might fill
* the available socket buffer space and deadlock waiting for
@@ -1173,25 +1170,26 @@ int audit_list_rules_send(struct sk_buff *request_skb, int seq)
* happen if we're actually running in the context of auditctl
* trying to _send_ the stuff */
- dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
+ dest = kmalloc(sizeof(*dest), GFP_KERNEL);
if (!dest)
return -ENOMEM;
- dest->net = get_net(net);
- dest->portid = portid;
+ dest->net = get_net(sock_net(NETLINK_CB(request_skb).sk));
+ dest->portid = NETLINK_CB(request_skb).portid;
skb_queue_head_init(&dest->q);
mutex_lock(&audit_filter_mutex);
audit_list_rules(seq, &dest->q);
mutex_unlock(&audit_filter_mutex);
- tsk = kthread_run(audit_send_list, dest, "audit_send_list");
+ tsk = kthread_run(audit_send_list_thread, dest, "audit_send_list");
if (IS_ERR(tsk)) {
skb_queue_purge(&dest->q);
+ put_net(dest->net);
kfree(dest);
- err = PTR_ERR(tsk);
+ return PTR_ERR(tsk);
}
- return err;
+ return 0;
}
int audit_comparator(u32 left, u32 op, u32 right)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 814406a35db1..468a23390457 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -130,6 +130,17 @@ struct audit_tree_refs {
struct audit_chunk *c[31];
};
+struct audit_nfcfgop_tab {
+ enum audit_nfcfgop op;
+ const char *s;
+};
+
+static const struct audit_nfcfgop_tab audit_nfcfgs[] = {
+ { AUDIT_XT_OP_REGISTER, "register" },
+ { AUDIT_XT_OP_REPLACE, "replace" },
+ { AUDIT_XT_OP_UNREGISTER, "unregister" },
+};
+
static int audit_match_perm(struct audit_context *ctx, int mask)
{
unsigned n;
@@ -2542,6 +2553,26 @@ void __audit_ntp_log(const struct audit_ntp_data *ad)
audit_log_ntp_val(ad, "adjust", AUDIT_NTP_ADJUST);
}
+void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries,
+ enum audit_nfcfgop op)
+{
+ struct audit_buffer *ab;
+ char comm[sizeof(current->comm)];
+
+ ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_NETFILTER_CFG);
+ if (!ab)
+ return;
+ audit_log_format(ab, "table=%s family=%u entries=%u op=%s",
+ name, af, nentries, audit_nfcfgs[op].s);
+
+ audit_log_format(ab, " pid=%u", task_pid_nr(current));
+ audit_log_task_context(ab); /* subj= */
+ audit_log_format(ab, " comm=");
+ audit_log_untrustedstring(ab, get_task_comm(comm, current));
+ audit_log_end(ab);
+}
+EXPORT_SYMBOL_GPL(__audit_log_nfcfg);
+
static void audit_log_task(struct audit_buffer *ab)
{
kuid_t auid, uid;
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index f2d7be596966..1131a921e1a6 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -2,9 +2,9 @@
obj-y := core.o
CFLAGS_core.o += $(call cc-disable-warning, override-init)
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
-obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
+obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o
obj-$(CONFIG_BPF_JIT) += trampoline.o
obj-$(CONFIG_BPF_SYSCALL) += btf.o
@@ -12,10 +12,8 @@ obj-$(CONFIG_BPF_JIT) += dispatcher.o
ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_BPF_SYSCALL) += devmap.o
obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
-ifeq ($(CONFIG_XDP_SOCKETS),y)
-obj-$(CONFIG_BPF_SYSCALL) += xskmap.o
-endif
obj-$(CONFIG_BPF_SYSCALL) += offload.o
+obj-$(CONFIG_BPF_SYSCALL) += net_namespace.o
endif
ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 1d6120fd5ba6..11584618e861 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -77,7 +77,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
int ret, numa_node = bpf_map_attr_numa_node(attr);
u32 elem_size, index_mask, max_entries;
- bool unpriv = !capable(CAP_SYS_ADMIN);
+ bool bypass_spec_v1 = bpf_bypass_spec_v1();
u64 cost, array_size, mask64;
struct bpf_map_memory mem;
struct bpf_array *array;
@@ -95,7 +95,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
mask64 -= 1;
index_mask = mask64;
- if (unpriv) {
+ if (!bypass_spec_v1) {
/* round up array size to nearest power of 2,
* since cpu will speculate within index_mask limits
*/
@@ -149,7 +149,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
return ERR_PTR(-ENOMEM);
}
array->index_mask = index_mask;
- array->map.unpriv_array = unpriv;
+ array->map.bypass_spec_v1 = bypass_spec_v1;
/* copy mandatory map attributes */
bpf_map_init_from_attr(&array->map, attr);
@@ -219,7 +219,7 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
- if (map->unpriv_array) {
+ if (!map->bypass_spec_v1) {
*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4);
*insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
} else {
@@ -1058,7 +1058,7 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map,
*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
- if (map->unpriv_array) {
+ if (!map->bypass_spec_v1) {
*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6);
*insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
} else {
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
new file mode 100644
index 000000000000..dd612b80b9fe
--- /dev/null
+++ b/kernel/bpf/bpf_iter.c
@@ -0,0 +1,539 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2020 Facebook */
+
+#include <linux/fs.h>
+#include <linux/anon_inodes.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
+
+struct bpf_iter_target_info {
+ struct list_head list;
+ const struct bpf_iter_reg *reg_info;
+ u32 btf_id; /* cached value */
+};
+
+struct bpf_iter_link {
+ struct bpf_link link;
+ struct bpf_iter_target_info *tinfo;
+};
+
+struct bpf_iter_priv_data {
+ struct bpf_iter_target_info *tinfo;
+ struct bpf_prog *prog;
+ u64 session_id;
+ u64 seq_num;
+ bool done_stop;
+ u8 target_private[] __aligned(8);
+};
+
+static struct list_head targets = LIST_HEAD_INIT(targets);
+static DEFINE_MUTEX(targets_mutex);
+
+/* protect bpf_iter_link changes */
+static DEFINE_MUTEX(link_mutex);
+
+/* incremented on every opened seq_file */
+static atomic64_t session_id;
+
+static int prepare_seq_file(struct file *file, struct bpf_iter_link *link);
+
+static void bpf_iter_inc_seq_num(struct seq_file *seq)
+{
+ struct bpf_iter_priv_data *iter_priv;
+
+ iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
+ target_private);
+ iter_priv->seq_num++;
+}
+
+static void bpf_iter_dec_seq_num(struct seq_file *seq)
+{
+ struct bpf_iter_priv_data *iter_priv;
+
+ iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
+ target_private);
+ iter_priv->seq_num--;
+}
+
+static void bpf_iter_done_stop(struct seq_file *seq)
+{
+ struct bpf_iter_priv_data *iter_priv;
+
+ iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
+ target_private);
+ iter_priv->done_stop = true;
+}
+
+/* bpf_seq_read, a customized and simpler version for bpf iterator.
+ * no_llseek is assumed for this file.
+ * The following are differences from seq_read():
+ * . fixed buffer size (PAGE_SIZE)
+ * . assuming no_llseek
+ * . stop() may call bpf program, handling potential overflow there
+ */
+static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
+ loff_t *ppos)
+{
+ struct seq_file *seq = file->private_data;
+ size_t n, offs, copied = 0;
+ int err = 0;
+ void *p;
+
+ mutex_lock(&seq->lock);
+
+ if (!seq->buf) {
+ seq->size = PAGE_SIZE;
+ seq->buf = kmalloc(seq->size, GFP_KERNEL);
+ if (!seq->buf) {
+ err = -ENOMEM;
+ goto done;
+ }
+ }
+
+ if (seq->count) {
+ n = min(seq->count, size);
+ err = copy_to_user(buf, seq->buf + seq->from, n);
+ if (err) {
+ err = -EFAULT;
+ goto done;
+ }
+ seq->count -= n;
+ seq->from += n;
+ copied = n;
+ goto done;
+ }
+
+ seq->from = 0;
+ p = seq->op->start(seq, &seq->index);
+ if (!p)
+ goto stop;
+ if (IS_ERR(p)) {
+ err = PTR_ERR(p);
+ seq->op->stop(seq, p);
+ seq->count = 0;
+ goto done;
+ }
+
+ err = seq->op->show(seq, p);
+ if (err > 0) {
+ /* object is skipped, decrease seq_num, so next
+ * valid object can reuse the same seq_num.
+ */
+ bpf_iter_dec_seq_num(seq);
+ seq->count = 0;
+ } else if (err < 0 || seq_has_overflowed(seq)) {
+ if (!err)
+ err = -E2BIG;
+ seq->op->stop(seq, p);
+ seq->count = 0;
+ goto done;
+ }
+
+ while (1) {
+ loff_t pos = seq->index;
+
+ offs = seq->count;
+ p = seq->op->next(seq, p, &seq->index);
+ if (pos == seq->index) {
+ pr_info_ratelimited("buggy seq_file .next function %ps "
+ "did not updated position index\n",
+ seq->op->next);
+ seq->index++;
+ }
+
+ if (IS_ERR_OR_NULL(p))
+ break;
+
+ /* got a valid next object, increase seq_num */
+ bpf_iter_inc_seq_num(seq);
+
+ if (seq->count >= size)
+ break;
+
+ err = seq->op->show(seq, p);
+ if (err > 0) {
+ bpf_iter_dec_seq_num(seq);
+ seq->count = offs;
+ } else if (err < 0 || seq_has_overflowed(seq)) {
+ seq->count = offs;
+ if (offs == 0) {
+ if (!err)
+ err = -E2BIG;
+ seq->op->stop(seq, p);
+ goto done;
+ }
+ break;
+ }
+ }
+stop:
+ offs = seq->count;
+ /* bpf program called if !p */
+ seq->op->stop(seq, p);
+ if (!p) {
+ if (!seq_has_overflowed(seq)) {
+ bpf_iter_done_stop(seq);
+ } else {
+ seq->count = offs;
+ if (offs == 0) {
+ err = -E2BIG;
+ goto done;
+ }
+ }
+ }
+
+ n = min(seq->count, size);
+ err = copy_to_user(buf, seq->buf, n);
+ if (err) {
+ err = -EFAULT;
+ goto done;
+ }
+ copied = n;
+ seq->count -= n;
+ seq->from = n;
+done:
+ if (!copied)
+ copied = err;
+ else
+ *ppos += copied;
+ mutex_unlock(&seq->lock);
+ return copied;
+}
+
+static int iter_open(struct inode *inode, struct file *file)
+{
+ struct bpf_iter_link *link = inode->i_private;
+
+ return prepare_seq_file(file, link);
+}
+
+static int iter_release(struct inode *inode, struct file *file)
+{
+ struct bpf_iter_priv_data *iter_priv;
+ struct seq_file *seq;
+
+ seq = file->private_data;
+ if (!seq)
+ return 0;
+
+ iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
+ target_private);
+
+ if (iter_priv->tinfo->reg_info->fini_seq_private)
+ iter_priv->tinfo->reg_info->fini_seq_private(seq->private);
+
+ bpf_prog_put(iter_priv->prog);
+ seq->private = iter_priv;
+
+ return seq_release_private(inode, file);
+}
+
+const struct file_operations bpf_iter_fops = {
+ .open = iter_open,
+ .llseek = no_llseek,
+ .read = bpf_seq_read,
+ .release = iter_release,
+};
+
+/* The argument reg_info will be cached in bpf_iter_target_info.
+ * The common practice is to declare target reg_info as
+ * a const static variable and passed as an argument to
+ * bpf_iter_reg_target().
+ */
+int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info)
+{
+ struct bpf_iter_target_info *tinfo;
+
+ tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
+ if (!tinfo)
+ return -ENOMEM;
+
+ tinfo->reg_info = reg_info;
+ INIT_LIST_HEAD(&tinfo->list);
+
+ mutex_lock(&targets_mutex);
+ list_add(&tinfo->list, &targets);
+ mutex_unlock(&targets_mutex);
+
+ return 0;
+}
+
+void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info)
+{
+ struct bpf_iter_target_info *tinfo;
+ bool found = false;
+
+ mutex_lock(&targets_mutex);
+ list_for_each_entry(tinfo, &targets, list) {
+ if (reg_info == tinfo->reg_info) {
+ list_del(&tinfo->list);
+ kfree(tinfo);
+ found = true;
+ break;
+ }
+ }
+ mutex_unlock(&targets_mutex);
+
+ WARN_ON(found == false);
+}
+
+static void cache_btf_id(struct bpf_iter_target_info *tinfo,
+ struct bpf_prog *prog)
+{
+ tinfo->btf_id = prog->aux->attach_btf_id;
+}
+
+bool bpf_iter_prog_supported(struct bpf_prog *prog)
+{
+ const char *attach_fname = prog->aux->attach_func_name;
+ u32 prog_btf_id = prog->aux->attach_btf_id;
+ const char *prefix = BPF_ITER_FUNC_PREFIX;
+ struct bpf_iter_target_info *tinfo;
+ int prefix_len = strlen(prefix);
+ bool supported = false;
+
+ if (strncmp(attach_fname, prefix, prefix_len))
+ return false;
+
+ mutex_lock(&targets_mutex);
+ list_for_each_entry(tinfo, &targets, list) {
+ if (tinfo->btf_id && tinfo->btf_id == prog_btf_id) {
+ supported = true;
+ break;
+ }
+ if (!strcmp(attach_fname + prefix_len, tinfo->reg_info->target)) {
+ cache_btf_id(tinfo, prog);
+ supported = true;
+ break;
+ }
+ }
+ mutex_unlock(&targets_mutex);
+
+ if (supported) {
+ prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size;
+ prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info;
+ }
+
+ return supported;
+}
+
+static void bpf_iter_link_release(struct bpf_link *link)
+{
+}
+
+static void bpf_iter_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_iter_link *iter_link =
+ container_of(link, struct bpf_iter_link, link);
+
+ kfree(iter_link);
+}
+
+static int bpf_iter_link_replace(struct bpf_link *link,
+ struct bpf_prog *new_prog,
+ struct bpf_prog *old_prog)
+{
+ int ret = 0;
+
+ mutex_lock(&link_mutex);
+ if (old_prog && link->prog != old_prog) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+
+ if (link->prog->type != new_prog->type ||
+ link->prog->expected_attach_type != new_prog->expected_attach_type ||
+ link->prog->aux->attach_btf_id != new_prog->aux->attach_btf_id) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ old_prog = xchg(&link->prog, new_prog);
+ bpf_prog_put(old_prog);
+
+out_unlock:
+ mutex_unlock(&link_mutex);
+ return ret;
+}
+
+static const struct bpf_link_ops bpf_iter_link_lops = {
+ .release = bpf_iter_link_release,
+ .dealloc = bpf_iter_link_dealloc,
+ .update_prog = bpf_iter_link_replace,
+};
+
+bool bpf_link_is_iter(struct bpf_link *link)
+{
+ return link->ops == &bpf_iter_link_lops;
+}
+
+int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct bpf_link_primer link_primer;
+ struct bpf_iter_target_info *tinfo;
+ struct bpf_iter_link *link;
+ bool existed = false;
+ u32 prog_btf_id;
+ int err;
+
+ if (attr->link_create.target_fd || attr->link_create.flags)
+ return -EINVAL;
+
+ prog_btf_id = prog->aux->attach_btf_id;
+ mutex_lock(&targets_mutex);
+ list_for_each_entry(tinfo, &targets, list) {
+ if (tinfo->btf_id == prog_btf_id) {
+ existed = true;
+ break;
+ }
+ }
+ mutex_unlock(&targets_mutex);
+ if (!existed)
+ return -ENOENT;
+
+ link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN);
+ if (!link)
+ return -ENOMEM;
+
+ bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog);
+ link->tinfo = tinfo;
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err) {
+ kfree(link);
+ return err;
+ }
+
+ return bpf_link_settle(&link_primer);
+}
+
+static void init_seq_meta(struct bpf_iter_priv_data *priv_data,
+ struct bpf_iter_target_info *tinfo,
+ struct bpf_prog *prog)
+{
+ priv_data->tinfo = tinfo;
+ priv_data->prog = prog;
+ priv_data->session_id = atomic64_inc_return(&session_id);
+ priv_data->seq_num = 0;
+ priv_data->done_stop = false;
+}
+
+static int prepare_seq_file(struct file *file, struct bpf_iter_link *link)
+{
+ struct bpf_iter_priv_data *priv_data;
+ struct bpf_iter_target_info *tinfo;
+ struct bpf_prog *prog;
+ u32 total_priv_dsize;
+ struct seq_file *seq;
+ int err = 0;
+
+ mutex_lock(&link_mutex);
+ prog = link->link.prog;
+ bpf_prog_inc(prog);
+ mutex_unlock(&link_mutex);
+
+ tinfo = link->tinfo;
+ total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) +
+ tinfo->reg_info->seq_priv_size;
+ priv_data = __seq_open_private(file, tinfo->reg_info->seq_ops,
+ total_priv_dsize);
+ if (!priv_data) {
+ err = -ENOMEM;
+ goto release_prog;
+ }
+
+ if (tinfo->reg_info->init_seq_private) {
+ err = tinfo->reg_info->init_seq_private(priv_data->target_private);
+ if (err)
+ goto release_seq_file;
+ }
+
+ init_seq_meta(priv_data, tinfo, prog);
+ seq = file->private_data;
+ seq->private = priv_data->target_private;
+
+ return 0;
+
+release_seq_file:
+ seq_release_private(file->f_inode, file);
+ file->private_data = NULL;
+release_prog:
+ bpf_prog_put(prog);
+ return err;
+}
+
+int bpf_iter_new_fd(struct bpf_link *link)
+{
+ struct file *file;
+ unsigned int flags;
+ int err, fd;
+
+ if (link->ops != &bpf_iter_link_lops)
+ return -EINVAL;
+
+ flags = O_RDONLY | O_CLOEXEC;
+ fd = get_unused_fd_flags(flags);
+ if (fd < 0)
+ return fd;
+
+ file = anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags);
+ if (IS_ERR(file)) {
+ err = PTR_ERR(file);
+ goto free_fd;
+ }
+
+ err = prepare_seq_file(file,
+ container_of(link, struct bpf_iter_link, link));
+ if (err)
+ goto free_file;
+
+ fd_install(fd, file);
+ return fd;
+
+free_file:
+ fput(file);
+free_fd:
+ put_unused_fd(fd);
+ return err;
+}
+
+struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop)
+{
+ struct bpf_iter_priv_data *iter_priv;
+ struct seq_file *seq;
+ void *seq_priv;
+
+ seq = meta->seq;
+ if (seq->file->f_op != &bpf_iter_fops)
+ return NULL;
+
+ seq_priv = seq->private;
+ iter_priv = container_of(seq_priv, struct bpf_iter_priv_data,
+ target_private);
+
+ if (in_stop && iter_priv->done_stop)
+ return NULL;
+
+ meta->session_id = iter_priv->session_id;
+ meta->seq_num = iter_priv->seq_num;
+
+ return iter_priv->prog;
+}
+
+int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
+{
+ int ret;
+
+ rcu_read_lock();
+ migrate_disable();
+ ret = BPF_PROG_RUN(prog, ctx);
+ migrate_enable();
+ rcu_read_unlock();
+
+ /* bpf program can only return 0 or 1:
+ * 0 : okay
+ * 1 : retry the same object
+ * The bpf_iter_run_prog() return value
+ * will be seq_ops->show() return value.
+ */
+ return ret == 0 ? 0 : -EAGAIN;
+}
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 19636703b24e..fb278144e9fd 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -49,6 +49,6 @@ const struct bpf_prog_ops lsm_prog_ops = {
};
const struct bpf_verifier_ops lsm_verifier_ops = {
- .get_func_proto = bpf_tracing_func_proto,
+ .get_func_proto = tracing_prog_func_proto,
.is_valid_access = btf_ctx_access,
};
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 26cb51f2db72..c6b0decaa46a 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -557,7 +557,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
struct bpf_map *map;
int err;
- if (!capable(CAP_SYS_ADMIN))
+ if (!bpf_capable())
return ERR_PTR(-EPERM);
st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id);
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index d65c6912bdaf..58c9af1d4808 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3482,6 +3482,7 @@ extern char __weak __stop_BTF[];
extern struct btf *btf_vmlinux;
#define BPF_MAP_TYPE(_id, _ops)
+#define BPF_LINK_TYPE(_id, _name)
static union {
struct bpf_ctx_convert {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
@@ -3508,6 +3509,7 @@ static u8 bpf_ctx_convert_map[] = {
0, /* avoid empty array */
};
#undef BPF_MAP_TYPE
+#undef BPF_LINK_TYPE
static const struct btf_member *
btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf,
@@ -3692,7 +3694,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
struct bpf_verifier_log *log = info->log;
const struct btf_param *args;
u32 nr_args, arg;
- int ret;
+ int i, ret;
if (off % 8) {
bpf_log(log, "func '%s' offset %d is not multiple of 8\n",
@@ -3789,6 +3791,14 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
/* this is a pointer to another type */
info->reg_type = PTR_TO_BTF_ID;
+ for (i = 0; i < prog->aux->ctx_arg_info_size; i++) {
+ const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i];
+
+ if (ctx_arg_info->offset == off) {
+ info->reg_type = ctx_arg_info->reg_type;
+ break;
+ }
+ }
if (tgt_prog) {
ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg);
@@ -3828,6 +3838,7 @@ int btf_struct_access(struct bpf_verifier_log *log,
const struct btf_type *mtype, *elem_type = NULL;
const struct btf_member *member;
const char *tname, *mname;
+ u32 vlen;
again:
tname = __btf_name_by_offset(btf_vmlinux, t->name_off);
@@ -3836,7 +3847,43 @@ again:
return -EINVAL;
}
+ vlen = btf_type_vlen(t);
if (off + size > t->size) {
+ /* If the last element is a variable size array, we may
+ * need to relax the rule.
+ */
+ struct btf_array *array_elem;
+
+ if (vlen == 0)
+ goto error;
+
+ member = btf_type_member(t) + vlen - 1;
+ mtype = btf_type_skip_modifiers(btf_vmlinux, member->type,
+ NULL);
+ if (!btf_type_is_array(mtype))
+ goto error;
+
+ array_elem = (struct btf_array *)(mtype + 1);
+ if (array_elem->nelems != 0)
+ goto error;
+
+ moff = btf_member_bit_offset(t, member) / 8;
+ if (off < moff)
+ goto error;
+
+ /* Only allow structure for now, can be relaxed for
+ * other types later.
+ */
+ elem_type = btf_type_skip_modifiers(btf_vmlinux,
+ array_elem->type, NULL);
+ if (!btf_type_is_struct(elem_type))
+ goto error;
+
+ off = (off - moff) % elem_type->size;
+ return btf_struct_access(log, elem_type, off, size, atype,
+ next_btf_id);
+
+error:
bpf_log(log, "access beyond struct %s at off %u size %u\n",
tname, off, size);
return -EACCES;
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index cb305e71e7de..ac53102e244a 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -378,7 +378,7 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
}
list_for_each_entry(pl, progs, node) {
- if (prog && pl->prog == prog)
+ if (prog && pl->prog == prog && prog != replace_prog)
/* disallow attaching the same prog twice */
return ERR_PTR(-EINVAL);
if (link && pl->link == link)
@@ -557,8 +557,9 @@ found:
*
* Must be called with cgroup_mutex held.
*/
-int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link,
- struct bpf_prog *new_prog)
+static int __cgroup_bpf_replace(struct cgroup *cgrp,
+ struct bpf_cgroup_link *link,
+ struct bpf_prog *new_prog)
{
struct list_head *progs = &cgrp->bpf.progs[link->type];
struct bpf_prog *old_prog;
@@ -583,6 +584,30 @@ int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link,
return 0;
}
+static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog,
+ struct bpf_prog *old_prog)
+{
+ struct bpf_cgroup_link *cg_link;
+ int ret;
+
+ cg_link = container_of(link, struct bpf_cgroup_link, link);
+
+ mutex_lock(&cgroup_mutex);
+ /* link might have been auto-released by dying cgroup, so fail */
+ if (!cg_link->cgroup) {
+ ret = -ENOLINK;
+ goto out_unlock;
+ }
+ if (old_prog && link->prog != old_prog) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+ ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog);
+out_unlock:
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+
static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
struct bpf_prog *prog,
struct bpf_cgroup_link *link,
@@ -808,17 +833,56 @@ static void bpf_cgroup_link_dealloc(struct bpf_link *link)
kfree(cg_link);
}
-const struct bpf_link_ops bpf_cgroup_link_lops = {
+static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_cgroup_link *cg_link =
+ container_of(link, struct bpf_cgroup_link, link);
+ u64 cg_id = 0;
+
+ mutex_lock(&cgroup_mutex);
+ if (cg_link->cgroup)
+ cg_id = cgroup_id(cg_link->cgroup);
+ mutex_unlock(&cgroup_mutex);
+
+ seq_printf(seq,
+ "cgroup_id:\t%llu\n"
+ "attach_type:\t%d\n",
+ cg_id,
+ cg_link->type);
+}
+
+static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_cgroup_link *cg_link =
+ container_of(link, struct bpf_cgroup_link, link);
+ u64 cg_id = 0;
+
+ mutex_lock(&cgroup_mutex);
+ if (cg_link->cgroup)
+ cg_id = cgroup_id(cg_link->cgroup);
+ mutex_unlock(&cgroup_mutex);
+
+ info->cgroup.cgroup_id = cg_id;
+ info->cgroup.attach_type = cg_link->type;
+ return 0;
+}
+
+static const struct bpf_link_ops bpf_cgroup_link_lops = {
.release = bpf_cgroup_link_release,
.dealloc = bpf_cgroup_link_dealloc,
+ .update_prog = cgroup_bpf_replace,
+ .show_fdinfo = bpf_cgroup_link_show_fdinfo,
+ .fill_link_info = bpf_cgroup_link_fill_link_info,
};
int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
+ struct bpf_link_primer link_primer;
struct bpf_cgroup_link *link;
- struct file *link_file;
struct cgroup *cgrp;
- int err, link_fd;
+ int err;
if (attr->link_create.flags)
return -EINVAL;
@@ -832,26 +896,25 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
err = -ENOMEM;
goto out_put_cgroup;
}
- bpf_link_init(&link->link, &bpf_cgroup_link_lops, prog);
+ bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops,
+ prog);
link->cgroup = cgrp;
link->type = attr->link_create.attach_type;
- link_file = bpf_link_new_file(&link->link, &link_fd);
- if (IS_ERR(link_file)) {
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err) {
kfree(link);
- err = PTR_ERR(link_file);
goto out_put_cgroup;
}
err = cgroup_bpf_attach(cgrp, NULL, NULL, link, link->type,
BPF_F_ALLOW_MULTI);
if (err) {
- bpf_link_cleanup(&link->link, link_file, link_fd);
+ bpf_link_cleanup(&link_primer);
goto out_put_cgroup;
}
- fd_install(link_fd, link_file);
- return link_fd;
+ return bpf_link_settle(&link_primer);
out_put_cgroup:
cgroup_put(cgrp);
@@ -1054,36 +1117,21 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
return !allow;
}
-EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission);
static const struct bpf_func_proto *
cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
- case BPF_FUNC_map_lookup_elem:
- return &bpf_map_lookup_elem_proto;
- case BPF_FUNC_map_update_elem:
- return &bpf_map_update_elem_proto;
- case BPF_FUNC_map_delete_elem:
- return &bpf_map_delete_elem_proto;
- case BPF_FUNC_map_push_elem:
- return &bpf_map_push_elem_proto;
- case BPF_FUNC_map_pop_elem:
- return &bpf_map_pop_elem_proto;
- case BPF_FUNC_map_peek_elem:
- return &bpf_map_peek_elem_proto;
case BPF_FUNC_get_current_uid_gid:
return &bpf_get_current_uid_gid_proto;
case BPF_FUNC_get_local_storage:
return &bpf_get_local_storage_proto;
case BPF_FUNC_get_current_cgroup_id:
return &bpf_get_current_cgroup_id_proto;
- case BPF_FUNC_trace_printk:
- if (capable(CAP_SYS_ADMIN))
- return bpf_get_trace_printk_proto();
- /* fall through */
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
default:
- return NULL;
+ return bpf_base_func_proto(func_id);
}
}
@@ -1137,16 +1185,13 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = {
* @head: sysctl table header
* @table: sysctl table
* @write: sysctl is being read (= 0) or written (= 1)
- * @buf: pointer to buffer passed by user space
+ * @buf: pointer to buffer (in and out)
* @pcount: value-result argument: value is size of buffer pointed to by @buf,
* result is size of @new_buf if program set new value, initial value
* otherwise
* @ppos: value-result argument: value is position at which read from or write
* to sysctl is happening, result is new position if program overrode it,
* initial value otherwise
- * @new_buf: pointer to pointer to new buffer that will be allocated if program
- * overrides new value provided by user space on sysctl write
- * NOTE: it's caller responsibility to free *new_buf if it was set
* @type: type of program to be executed
*
* Program is run when sysctl is being accessed, either read or written, and
@@ -1157,8 +1202,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = {
*/
int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
struct ctl_table *table, int write,
- void __user *buf, size_t *pcount,
- loff_t *ppos, void **new_buf,
+ void **buf, size_t *pcount, loff_t *ppos,
enum bpf_attach_type type)
{
struct bpf_sysctl_kern ctx = {
@@ -1173,36 +1217,28 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
.new_updated = 0,
};
struct cgroup *cgrp;
+ loff_t pos = 0;
int ret;
ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL);
- if (ctx.cur_val) {
- mm_segment_t old_fs;
- loff_t pos = 0;
-
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- if (table->proc_handler(table, 0, (void __user *)ctx.cur_val,
- &ctx.cur_len, &pos)) {
- /* Let BPF program decide how to proceed. */
- ctx.cur_len = 0;
- }
- set_fs(old_fs);
- } else {
+ if (!ctx.cur_val ||
+ table->proc_handler(table, 0, ctx.cur_val, &ctx.cur_len, &pos)) {
/* Let BPF program decide how to proceed. */
ctx.cur_len = 0;
}
- if (write && buf && *pcount) {
+ if (write && *buf && *pcount) {
/* BPF program should be able to override new value with a
* buffer bigger than provided by user.
*/
ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL);
ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount);
- if (!ctx.new_val ||
- copy_from_user(ctx.new_val, buf, ctx.new_len))
+ if (ctx.new_val) {
+ memcpy(ctx.new_val, *buf, ctx.new_len);
+ } else {
/* Let BPF program decide how to proceed. */
ctx.new_len = 0;
+ }
}
rcu_read_lock();
@@ -1213,7 +1249,8 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
kfree(ctx.cur_val);
if (ret == 1 && ctx.new_updated) {
- *new_buf = ctx.new_val;
+ kfree(*buf);
+ *buf = ctx.new_val;
*pcount = ctx.new_len;
} else {
kfree(ctx.new_val);
@@ -1221,7 +1258,6 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
return ret == 1 ? 0 : -EPERM;
}
-EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl);
#ifdef CONFIG_NET
static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
@@ -1240,16 +1276,23 @@ static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
{
- if (unlikely(max_optlen > PAGE_SIZE) || max_optlen < 0)
+ if (unlikely(max_optlen < 0))
return -EINVAL;
+ if (unlikely(max_optlen > PAGE_SIZE)) {
+ /* We don't expose optvals that are greater than PAGE_SIZE
+ * to the BPF program.
+ */
+ max_optlen = PAGE_SIZE;
+ }
+
ctx->optval = kzalloc(max_optlen, GFP_USER);
if (!ctx->optval)
return -ENOMEM;
ctx->optval_end = ctx->optval + max_optlen;
- return 0;
+ return max_optlen;
}
static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
@@ -1283,13 +1326,13 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
*/
max_optlen = max_t(int, 16, *optlen);
- ret = sockopt_alloc_buf(&ctx, max_optlen);
- if (ret)
- return ret;
+ max_optlen = sockopt_alloc_buf(&ctx, max_optlen);
+ if (max_optlen < 0)
+ return max_optlen;
ctx.optlen = *optlen;
- if (copy_from_user(ctx.optval, optval, *optlen) != 0) {
+ if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) {
ret = -EFAULT;
goto out;
}
@@ -1317,8 +1360,14 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
/* export any potential modifications */
*level = ctx.level;
*optname = ctx.optname;
- *optlen = ctx.optlen;
- *kernel_optval = ctx.optval;
+
+ /* optlen == 0 from BPF indicates that we should
+ * use original userspace data.
+ */
+ if (ctx.optlen != 0) {
+ *optlen = ctx.optlen;
+ *kernel_optval = ctx.optval;
+ }
}
out:
@@ -1326,7 +1375,6 @@ out:
sockopt_free_buf(&ctx);
return ret;
}
-EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt);
int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
int optname, char __user *optval,
@@ -1350,12 +1398,12 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT))
return retval;
- ret = sockopt_alloc_buf(&ctx, max_optlen);
- if (ret)
- return ret;
-
ctx.optlen = max_optlen;
+ max_optlen = sockopt_alloc_buf(&ctx, max_optlen);
+ if (max_optlen < 0)
+ return max_optlen;
+
if (!retval) {
/* If kernel getsockopt finished successfully,
* copy whatever was returned to the user back
@@ -1369,10 +1417,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
goto out;
}
- if (ctx.optlen > max_optlen)
- ctx.optlen = max_optlen;
-
- if (copy_from_user(ctx.optval, optval, ctx.optlen) != 0) {
+ if (copy_from_user(ctx.optval, optval,
+ min(ctx.optlen, max_optlen)) != 0) {
ret = -EFAULT;
goto out;
}
@@ -1401,10 +1447,12 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
goto out;
}
- if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
- put_user(ctx.optlen, optlen)) {
- ret = -EFAULT;
- goto out;
+ if (ctx.optlen != 0) {
+ if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
+ put_user(ctx.optlen, optlen)) {
+ ret = -EFAULT;
+ goto out;
+ }
}
ret = ctx.retval;
@@ -1413,7 +1461,6 @@ out:
sockopt_free_buf(&ctx);
return ret;
}
-EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt);
#endif
static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 14aa1f74dd10..9df4cc9a2907 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -82,7 +82,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
struct bpf_prog *fp;
size = round_up(size, PAGE_SIZE);
- fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
+ fp = __vmalloc(size, gfp_flags);
if (fp == NULL)
return NULL;
@@ -232,7 +232,7 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
if (ret)
return NULL;
- fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
+ fp = __vmalloc(size, gfp_flags);
if (fp == NULL) {
__bpf_prog_uncharge(fp_old->aux->user, delta);
} else {
@@ -646,7 +646,7 @@ static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
void bpf_prog_kallsyms_add(struct bpf_prog *fp)
{
if (!bpf_prog_kallsyms_candidate(fp) ||
- !capable(CAP_SYS_ADMIN))
+ !bpf_capable())
return;
bpf_prog_ksym_set_addr(fp);
@@ -1089,7 +1089,7 @@ static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other,
gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
struct bpf_prog *fp;
- fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL);
+ fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags);
if (fp != NULL) {
/* aux->prog still points to the fp_other one, so
* when promoting the clone to the real program,
@@ -1543,7 +1543,7 @@ select_insn:
/* ARG1 at this point is guaranteed to point to CTX from
* the verifier side due to the fact that the tail call is
- * handeled like a helper, that is, bpf_tail_call_proto,
+ * handled like a helper, that is, bpf_tail_call_proto,
* where arg1_type is ARG_PTR_TO_CTX.
*/
insn = prog->insnsi;
@@ -2136,6 +2136,11 @@ BPF_CALL_0(bpf_user_rnd_u32)
return res;
}
+BPF_CALL_0(bpf_get_raw_cpu_id)
+{
+ return raw_smp_processor_id();
+}
+
/* Weak definitions of helper functions in case we don't have bpf syscall. */
const struct bpf_func_proto bpf_map_lookup_elem_proto __weak;
const struct bpf_func_proto bpf_map_update_elem_proto __weak;
@@ -2151,6 +2156,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
const struct bpf_func_proto bpf_get_numa_node_id_proto __weak;
const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
+const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak;
const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 3fe0b006d2d2..27595fc6da56 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -85,7 +85,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
u64 cost;
int ret;
- if (!capable(CAP_SYS_ADMIN))
+ if (!bpf_capable())
return ERR_PTR(-EPERM);
/* check sanity of attributes */
@@ -162,25 +162,10 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
/* Part of headroom was reserved to xdpf */
hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom;
- /* build_skb need to place skb_shared_info after SKB end, and
- * also want to know the memory "truesize". Thus, need to
- * know the memory frame size backing xdp_buff.
- *
- * XDP was designed to have PAGE_SIZE frames, but this
- * assumption is not longer true with ixgbe and i40e. It
- * would be preferred to set frame_size to 2048 or 4096
- * depending on the driver.
- * frame_size = 2048;
- * frame_len = frame_size - sizeof(*xdp_frame);
- *
- * Instead, with info avail, skb_shared_info in placed after
- * packet len. This, unfortunately fakes the truesize.
- * Another disadvantage of this approach, the skb_shared_info
- * is not at a fixed memory location, with mixed length
- * packets, which is bad for cache-line hotness.
+ /* Memory size backing xdp_frame data already have reserved
+ * room for build_skb to place skb_shared_info in tailroom.
*/
- frame_size = SKB_DATA_ALIGN(xdpf->len + hard_start_headroom) +
- SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ frame_size = xdpf->frame_sz;
pkt_data_start = xdpf->data - hard_start_headroom;
skb = build_skb_around(skb, pkt_data_start, frame_size);
@@ -636,7 +621,7 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
{
struct xdp_frame *xdpf;
- xdpf = convert_to_xdp_frame(xdp);
+ xdpf = xdp_convert_buff_to_frame(xdp);
if (unlikely(!xdpf))
return -EOVERFLOW;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 58bdca5d978a..5fdbc776a760 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -52,7 +52,6 @@
#define DEV_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
-#define DEV_MAP_BULK_SIZE 16
struct xdp_dev_bulk_queue {
struct xdp_frame *q[DEV_MAP_BULK_SIZE];
struct list_head flush_node;
@@ -65,8 +64,10 @@ struct bpf_dtab_netdev {
struct net_device *dev; /* must be first member, due to tracepoint */
struct hlist_node index_hlist;
struct bpf_dtab *dtab;
+ struct bpf_prog *xdp_prog;
struct rcu_head rcu;
unsigned int idx;
+ struct bpf_devmap_val val;
};
struct bpf_dtab {
@@ -85,12 +86,13 @@ static DEFINE_PER_CPU(struct list_head, dev_flush_list);
static DEFINE_SPINLOCK(dev_map_lock);
static LIST_HEAD(dev_map_list);
-static struct hlist_head *dev_map_create_hash(unsigned int entries)
+static struct hlist_head *dev_map_create_hash(unsigned int entries,
+ int numa_node)
{
int i;
struct hlist_head *hash;
- hash = kmalloc_array(entries, sizeof(*hash), GFP_KERNEL);
+ hash = bpf_map_area_alloc(entries * sizeof(*hash), numa_node);
if (hash != NULL)
for (i = 0; i < entries; i++)
INIT_HLIST_HEAD(&hash[i]);
@@ -106,12 +108,18 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
{
+ u32 valsize = attr->value_size;
u64 cost = 0;
int err;
- /* check sanity of attributes */
+ /* check sanity of attributes. 2 value sizes supported:
+ * 4 bytes: ifindex
+ * 8 bytes: ifindex + prog fd
+ */
if (attr->max_entries == 0 || attr->key_size != 4 ||
- attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
+ (valsize != offsetofend(struct bpf_devmap_val, ifindex) &&
+ valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd)) ||
+ attr->map_flags & ~DEV_CREATE_FLAG_MASK)
return -EINVAL;
/* Lookup returns a pointer straight to dev->ifindex, so make sure the
@@ -138,7 +146,8 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
return -EINVAL;
if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
- dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets);
+ dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets,
+ dtab->map.numa_node);
if (!dtab->dev_index_head)
goto free_charge;
@@ -218,12 +227,14 @@ static void dev_map_free(struct bpf_map *map)
hlist_for_each_entry_safe(dev, next, head, index_hlist) {
hlist_del_rcu(&dev->index_hlist);
+ if (dev->xdp_prog)
+ bpf_prog_put(dev->xdp_prog);
dev_put(dev->dev);
kfree(dev);
}
}
- kfree(dtab->dev_index_head);
+ bpf_map_area_free(dtab->dev_index_head);
} else {
for (i = 0; i < dtab->map.max_entries; i++) {
struct bpf_dtab_netdev *dev;
@@ -232,6 +243,8 @@ static void dev_map_free(struct bpf_map *map)
if (!dev)
continue;
+ if (dev->xdp_prog)
+ bpf_prog_put(dev->xdp_prog);
dev_put(dev->dev);
kfree(dev);
}
@@ -318,6 +331,16 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
return -ENOENT;
}
+bool dev_map_can_have_prog(struct bpf_map *map)
+{
+ if ((map->map_type == BPF_MAP_TYPE_DEVMAP ||
+ map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) &&
+ map->value_size != offsetofend(struct bpf_devmap_val, ifindex))
+ return true;
+
+ return false;
+}
+
static int bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
{
struct net_device *dev = bq->dev;
@@ -435,13 +458,41 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
if (unlikely(err))
return err;
- xdpf = convert_to_xdp_frame(xdp);
+ xdpf = xdp_convert_buff_to_frame(xdp);
if (unlikely(!xdpf))
return -EOVERFLOW;
return bq_enqueue(dev, xdpf, dev_rx);
}
+static struct xdp_buff *dev_map_run_prog(struct net_device *dev,
+ struct xdp_buff *xdp,
+ struct bpf_prog *xdp_prog)
+{
+ struct xdp_txq_info txq = { .dev = dev };
+ u32 act;
+
+ xdp_set_data_meta_invalid(xdp);
+ xdp->txq = &txq;
+
+ act = bpf_prog_run_xdp(xdp_prog, xdp);
+ switch (act) {
+ case XDP_PASS:
+ return xdp;
+ case XDP_DROP:
+ break;
+ default:
+ bpf_warn_invalid_xdp_action(act);
+ fallthrough;
+ case XDP_ABORTED:
+ trace_xdp_exception(dev, xdp_prog, act);
+ break;
+ }
+
+ xdp_return_buff(xdp);
+ return NULL;
+}
+
int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
struct net_device *dev_rx)
{
@@ -453,6 +504,11 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
{
struct net_device *dev = dst->dev;
+ if (dst->xdp_prog) {
+ xdp = dev_map_run_prog(dev, xdp, dst->xdp_prog);
+ if (!xdp)
+ return 0;
+ }
return __xdp_enqueue(dev, xdp, dev_rx);
}
@@ -473,18 +529,15 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
- struct net_device *dev = obj ? obj->dev : NULL;
- return dev ? &dev->ifindex : NULL;
+ return obj ? &obj->val : NULL;
}
static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map,
*(u32 *)key);
- struct net_device *dev = obj ? obj->dev : NULL;
-
- return dev ? &dev->ifindex : NULL;
+ return obj ? &obj->val : NULL;
}
static void __dev_map_entry_free(struct rcu_head *rcu)
@@ -492,6 +545,8 @@ static void __dev_map_entry_free(struct rcu_head *rcu)
struct bpf_dtab_netdev *dev;
dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
+ if (dev->xdp_prog)
+ bpf_prog_put(dev->xdp_prog);
dev_put(dev->dev);
kfree(dev);
}
@@ -542,9 +597,10 @@ static int dev_map_hash_delete_elem(struct bpf_map *map, void *key)
static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
struct bpf_dtab *dtab,
- u32 ifindex,
+ struct bpf_devmap_val *val,
unsigned int idx)
{
+ struct bpf_prog *prog = NULL;
struct bpf_dtab_netdev *dev;
dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
@@ -552,16 +608,38 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
if (!dev)
return ERR_PTR(-ENOMEM);
- dev->dev = dev_get_by_index(net, ifindex);
- if (!dev->dev) {
- kfree(dev);
- return ERR_PTR(-EINVAL);
+ dev->dev = dev_get_by_index(net, val->ifindex);
+ if (!dev->dev)
+ goto err_out;
+
+ if (val->bpf_prog.fd > 0) {
+ prog = bpf_prog_get_type_dev(val->bpf_prog.fd,
+ BPF_PROG_TYPE_XDP, false);
+ if (IS_ERR(prog))
+ goto err_put_dev;
+ if (prog->expected_attach_type != BPF_XDP_DEVMAP)
+ goto err_put_prog;
}
dev->idx = idx;
dev->dtab = dtab;
+ if (prog) {
+ dev->xdp_prog = prog;
+ dev->val.bpf_prog.id = prog->aux->id;
+ } else {
+ dev->xdp_prog = NULL;
+ dev->val.bpf_prog.id = 0;
+ }
+ dev->val.ifindex = val->ifindex;
return dev;
+err_put_prog:
+ bpf_prog_put(prog);
+err_put_dev:
+ dev_put(dev->dev);
+err_out:
+ kfree(dev);
+ return ERR_PTR(-EINVAL);
}
static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
@@ -569,7 +647,7 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *dev, *old_dev;
- u32 ifindex = *(u32 *)value;
+ struct bpf_devmap_val val = {};
u32 i = *(u32 *)key;
if (unlikely(map_flags > BPF_EXIST))
@@ -579,10 +657,16 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
if (unlikely(map_flags == BPF_NOEXIST))
return -EEXIST;
- if (!ifindex) {
+ /* already verified value_size <= sizeof val */
+ memcpy(&val, value, map->value_size);
+
+ if (!val.ifindex) {
dev = NULL;
+ /* can not specify fd if ifindex is 0 */
+ if (val.bpf_prog.fd > 0)
+ return -EINVAL;
} else {
- dev = __dev_map_alloc_node(net, dtab, ifindex, i);
+ dev = __dev_map_alloc_node(net, dtab, &val, i);
if (IS_ERR(dev))
return PTR_ERR(dev);
}
@@ -610,12 +694,15 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *dev, *old_dev;
- u32 ifindex = *(u32 *)value;
+ struct bpf_devmap_val val = {};
u32 idx = *(u32 *)key;
unsigned long flags;
int err = -EEXIST;
- if (unlikely(map_flags > BPF_EXIST || !ifindex))
+ /* already verified value_size <= sizeof val */
+ memcpy(&val, value, map->value_size);
+
+ if (unlikely(map_flags > BPF_EXIST || !val.ifindex))
return -EINVAL;
spin_lock_irqsave(&dtab->index_lock, flags);
@@ -624,7 +711,7 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
if (old_dev && (map_flags & BPF_NOEXIST))
goto out_err;
- dev = __dev_map_alloc_node(net, dtab, ifindex, idx);
+ dev = __dev_map_alloc_node(net, dtab, &val, idx);
if (IS_ERR(dev)) {
err = PTR_ERR(dev);
goto out_err;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index d541c8486c95..b4b288a3c3c9 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -359,9 +359,9 @@ static int htab_map_alloc_check(union bpf_attr *attr)
BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
offsetof(struct htab_elem, hash_node.pprev));
- if (lru && !capable(CAP_SYS_ADMIN))
+ if (lru && !bpf_capable())
/* LRU implementation is much complicated than other
- * maps. Hence, limit to CAP_SYS_ADMIN for now.
+ * maps. Hence, limit to CAP_BPF.
*/
return -EPERM;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index bafc53ddd350..be43ab3e619f 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -151,7 +151,19 @@ BPF_CALL_0(bpf_ktime_get_ns)
const struct bpf_func_proto bpf_ktime_get_ns_proto = {
.func = bpf_ktime_get_ns,
- .gpl_only = true,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+BPF_CALL_0(bpf_ktime_get_boot_ns)
+{
+ /* NMI safe access to clock boottime */
+ return ktime_get_boot_fast_ns();
+}
+
+const struct bpf_func_proto bpf_ktime_get_boot_ns_proto = {
+ .func = bpf_ktime_get_boot_ns,
+ .gpl_only = false,
.ret_type = RET_INTEGER,
};
@@ -562,3 +574,114 @@ const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto = {
.arg3_type = ARG_PTR_TO_UNINIT_MEM,
.arg4_type = ARG_CONST_SIZE,
};
+
+static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = {
+ .func = bpf_get_raw_cpu_id,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map,
+ u64, flags, void *, data, u64, size)
+{
+ if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
+ return -EINVAL;
+
+ return bpf_event_output(map, flags, data, size, NULL, 0, NULL);
+}
+
+const struct bpf_func_proto bpf_event_output_data_proto = {
+ .func = bpf_event_output_data,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
+};
+
+const struct bpf_func_proto bpf_get_current_task_proto __weak;
+const struct bpf_func_proto bpf_probe_read_user_proto __weak;
+const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
+const struct bpf_func_proto bpf_probe_read_kernel_proto __weak;
+const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak;
+
+const struct bpf_func_proto *
+bpf_base_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_map_lookup_elem:
+ return &bpf_map_lookup_elem_proto;
+ case BPF_FUNC_map_update_elem:
+ return &bpf_map_update_elem_proto;
+ case BPF_FUNC_map_delete_elem:
+ return &bpf_map_delete_elem_proto;
+ case BPF_FUNC_map_push_elem:
+ return &bpf_map_push_elem_proto;
+ case BPF_FUNC_map_pop_elem:
+ return &bpf_map_pop_elem_proto;
+ case BPF_FUNC_map_peek_elem:
+ return &bpf_map_peek_elem_proto;
+ case BPF_FUNC_get_prandom_u32:
+ return &bpf_get_prandom_u32_proto;
+ case BPF_FUNC_get_smp_processor_id:
+ return &bpf_get_raw_smp_processor_id_proto;
+ case BPF_FUNC_get_numa_node_id:
+ return &bpf_get_numa_node_id_proto;
+ case BPF_FUNC_tail_call:
+ return &bpf_tail_call_proto;
+ case BPF_FUNC_ktime_get_ns:
+ return &bpf_ktime_get_ns_proto;
+ case BPF_FUNC_ktime_get_boot_ns:
+ return &bpf_ktime_get_boot_ns_proto;
+ case BPF_FUNC_ringbuf_output:
+ return &bpf_ringbuf_output_proto;
+ case BPF_FUNC_ringbuf_reserve:
+ return &bpf_ringbuf_reserve_proto;
+ case BPF_FUNC_ringbuf_submit:
+ return &bpf_ringbuf_submit_proto;
+ case BPF_FUNC_ringbuf_discard:
+ return &bpf_ringbuf_discard_proto;
+ case BPF_FUNC_ringbuf_query:
+ return &bpf_ringbuf_query_proto;
+ default:
+ break;
+ }
+
+ if (!bpf_capable())
+ return NULL;
+
+ switch (func_id) {
+ case BPF_FUNC_spin_lock:
+ return &bpf_spin_lock_proto;
+ case BPF_FUNC_spin_unlock:
+ return &bpf_spin_unlock_proto;
+ case BPF_FUNC_trace_printk:
+ if (!perfmon_capable())
+ return NULL;
+ return bpf_get_trace_printk_proto();
+ case BPF_FUNC_jiffies64:
+ return &bpf_jiffies64_proto;
+ default:
+ break;
+ }
+
+ if (!perfmon_capable())
+ return NULL;
+
+ switch (func_id) {
+ case BPF_FUNC_get_current_task:
+ return &bpf_get_current_task_proto;
+ case BPF_FUNC_probe_read_user:
+ return &bpf_probe_read_user_proto;
+ case BPF_FUNC_probe_read_kernel:
+ return &bpf_probe_read_kernel_proto;
+ case BPF_FUNC_probe_read_user_str:
+ return &bpf_probe_read_user_str_proto;
+ case BPF_FUNC_probe_read_kernel_str:
+ return &bpf_probe_read_kernel_str_proto;
+ default:
+ return NULL;
+ }
+}
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 95087d9f4ed3..fb878ba3f22f 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -358,8 +358,11 @@ static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg)
static int bpf_mklink(struct dentry *dentry, umode_t mode, void *arg)
{
+ struct bpf_link *link = arg;
+
return bpf_mkobj_ops(dentry, mode, arg, &bpf_link_iops,
- &bpffs_obj_fops);
+ bpf_link_is_iter(link) ?
+ &bpf_iter_fops : &bpffs_obj_fops);
}
static struct dentry *
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 65c236cf341e..c8cc4e4cf98d 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -543,7 +543,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
u64 cost = sizeof(*trie), cost_per_node;
int ret;
- if (!capable(CAP_SYS_ADMIN))
+ if (!bpf_capable())
return ERR_PTR(-EPERM);
/* check sanity of attributes */
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index b3c48d1533cb..17738c93bec8 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -60,7 +60,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
/* Misc members not needed in bpf_map_meta_equal() check. */
inner_map_meta->ops = inner_map->ops;
if (inner_map->ops == &array_map_ops) {
- inner_map_meta->unpriv_array = inner_map->unpriv_array;
+ inner_map_meta->bypass_spec_v1 = inner_map->bypass_spec_v1;
container_of(inner_map_meta, struct bpf_array, map)->index_mask =
container_of(inner_map, struct bpf_array, map)->index_mask;
}
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
new file mode 100644
index 000000000000..c69071e334bf
--- /dev/null
+++ b/kernel/bpf/map_iter.c
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2020 Facebook */
+#include <linux/bpf.h>
+#include <linux/fs.h>
+#include <linux/filter.h>
+#include <linux/kernel.h>
+
+struct bpf_iter_seq_map_info {
+ u32 mid;
+};
+
+static void *bpf_map_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct bpf_iter_seq_map_info *info = seq->private;
+ struct bpf_map *map;
+
+ map = bpf_map_get_curr_or_next(&info->mid);
+ if (!map)
+ return NULL;
+
+ ++*pos;
+ return map;
+}
+
+static void *bpf_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct bpf_iter_seq_map_info *info = seq->private;
+ struct bpf_map *map;
+
+ ++*pos;
+ ++info->mid;
+ bpf_map_put((struct bpf_map *)v);
+ map = bpf_map_get_curr_or_next(&info->mid);
+ if (!map)
+ return NULL;
+
+ return map;
+}
+
+struct bpf_iter__bpf_map {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct bpf_map *, map);
+};
+
+DEFINE_BPF_ITER_FUNC(bpf_map, struct bpf_iter_meta *meta, struct bpf_map *map)
+
+static int __bpf_map_seq_show(struct seq_file *seq, void *v, bool in_stop)
+{
+ struct bpf_iter__bpf_map ctx;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+ int ret = 0;
+
+ ctx.meta = &meta;
+ ctx.map = v;
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, in_stop);
+ if (prog)
+ ret = bpf_iter_run_prog(prog, &ctx);
+
+ return ret;
+}
+
+static int bpf_map_seq_show(struct seq_file *seq, void *v)
+{
+ return __bpf_map_seq_show(seq, v, false);
+}
+
+static void bpf_map_seq_stop(struct seq_file *seq, void *v)
+{
+ if (!v)
+ (void)__bpf_map_seq_show(seq, v, true);
+ else
+ bpf_map_put((struct bpf_map *)v);
+}
+
+static const struct seq_operations bpf_map_seq_ops = {
+ .start = bpf_map_seq_start,
+ .next = bpf_map_seq_next,
+ .stop = bpf_map_seq_stop,
+ .show = bpf_map_seq_show,
+};
+
+static const struct bpf_iter_reg bpf_map_reg_info = {
+ .target = "bpf_map",
+ .seq_ops = &bpf_map_seq_ops,
+ .init_seq_private = NULL,
+ .fini_seq_private = NULL,
+ .seq_priv_size = sizeof(struct bpf_iter_seq_map_info),
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__bpf_map, map),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+};
+
+static int __init bpf_map_iter_init(void)
+{
+ return bpf_iter_reg_target(&bpf_map_reg_info);
+}
+
+late_initcall(bpf_map_iter_init);
diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c
new file mode 100644
index 000000000000..78cf061f8179
--- /dev/null
+++ b/kernel/bpf/net_namespace.c
@@ -0,0 +1,373 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <net/net_namespace.h>
+
+/*
+ * Functions to manage BPF programs attached to netns
+ */
+
+struct bpf_netns_link {
+ struct bpf_link link;
+ enum bpf_attach_type type;
+ enum netns_bpf_attach_type netns_type;
+
+ /* We don't hold a ref to net in order to auto-detach the link
+ * when netns is going away. Instead we rely on pernet
+ * pre_exit callback to clear this pointer. Must be accessed
+ * with netns_bpf_mutex held.
+ */
+ struct net *net;
+};
+
+/* Protects updates to netns_bpf */
+DEFINE_MUTEX(netns_bpf_mutex);
+
+/* Must be called with netns_bpf_mutex held. */
+static void __net_exit bpf_netns_link_auto_detach(struct bpf_link *link)
+{
+ struct bpf_netns_link *net_link =
+ container_of(link, struct bpf_netns_link, link);
+
+ net_link->net = NULL;
+}
+
+static void bpf_netns_link_release(struct bpf_link *link)
+{
+ struct bpf_netns_link *net_link =
+ container_of(link, struct bpf_netns_link, link);
+ enum netns_bpf_attach_type type = net_link->netns_type;
+ struct net *net;
+
+ /* Link auto-detached by dying netns. */
+ if (!net_link->net)
+ return;
+
+ mutex_lock(&netns_bpf_mutex);
+
+ /* Recheck after potential sleep. We can race with cleanup_net
+ * here, but if we see a non-NULL struct net pointer pre_exit
+ * has not happened yet and will block on netns_bpf_mutex.
+ */
+ net = net_link->net;
+ if (!net)
+ goto out_unlock;
+
+ net->bpf.links[type] = NULL;
+ RCU_INIT_POINTER(net->bpf.progs[type], NULL);
+
+out_unlock:
+ mutex_unlock(&netns_bpf_mutex);
+}
+
+static void bpf_netns_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_netns_link *net_link =
+ container_of(link, struct bpf_netns_link, link);
+
+ kfree(net_link);
+}
+
+static int bpf_netns_link_update_prog(struct bpf_link *link,
+ struct bpf_prog *new_prog,
+ struct bpf_prog *old_prog)
+{
+ struct bpf_netns_link *net_link =
+ container_of(link, struct bpf_netns_link, link);
+ enum netns_bpf_attach_type type = net_link->netns_type;
+ struct net *net;
+ int ret = 0;
+
+ if (old_prog && old_prog != link->prog)
+ return -EPERM;
+ if (new_prog->type != link->prog->type)
+ return -EINVAL;
+
+ mutex_lock(&netns_bpf_mutex);
+
+ net = net_link->net;
+ if (!net || !check_net(net)) {
+ /* Link auto-detached or netns dying */
+ ret = -ENOLINK;
+ goto out_unlock;
+ }
+
+ old_prog = xchg(&link->prog, new_prog);
+ rcu_assign_pointer(net->bpf.progs[type], new_prog);
+ bpf_prog_put(old_prog);
+
+out_unlock:
+ mutex_unlock(&netns_bpf_mutex);
+ return ret;
+}
+
+static int bpf_netns_link_fill_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ const struct bpf_netns_link *net_link =
+ container_of(link, struct bpf_netns_link, link);
+ unsigned int inum = 0;
+ struct net *net;
+
+ mutex_lock(&netns_bpf_mutex);
+ net = net_link->net;
+ if (net && check_net(net))
+ inum = net->ns.inum;
+ mutex_unlock(&netns_bpf_mutex);
+
+ info->netns.netns_ino = inum;
+ info->netns.attach_type = net_link->type;
+ return 0;
+}
+
+static void bpf_netns_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_link_info info = {};
+
+ bpf_netns_link_fill_info(link, &info);
+ seq_printf(seq,
+ "netns_ino:\t%u\n"
+ "attach_type:\t%u\n",
+ info.netns.netns_ino,
+ info.netns.attach_type);
+}
+
+static const struct bpf_link_ops bpf_netns_link_ops = {
+ .release = bpf_netns_link_release,
+ .dealloc = bpf_netns_link_dealloc,
+ .update_prog = bpf_netns_link_update_prog,
+ .fill_link_info = bpf_netns_link_fill_info,
+ .show_fdinfo = bpf_netns_link_show_fdinfo,
+};
+
+int netns_bpf_prog_query(const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
+ u32 prog_id, prog_cnt = 0, flags = 0;
+ enum netns_bpf_attach_type type;
+ struct bpf_prog *attached;
+ struct net *net;
+
+ if (attr->query.query_flags)
+ return -EINVAL;
+
+ type = to_netns_bpf_attach_type(attr->query.attach_type);
+ if (type < 0)
+ return -EINVAL;
+
+ net = get_net_ns_by_fd(attr->query.target_fd);
+ if (IS_ERR(net))
+ return PTR_ERR(net);
+
+ rcu_read_lock();
+ attached = rcu_dereference(net->bpf.progs[type]);
+ if (attached) {
+ prog_cnt = 1;
+ prog_id = attached->aux->id;
+ }
+ rcu_read_unlock();
+
+ put_net(net);
+
+ if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
+ return -EFAULT;
+ if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt)))
+ return -EFAULT;
+
+ if (!attr->query.prog_cnt || !prog_ids || !prog_cnt)
+ return 0;
+
+ if (copy_to_user(prog_ids, &prog_id, sizeof(u32)))
+ return -EFAULT;
+
+ return 0;
+}
+
+int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ enum netns_bpf_attach_type type;
+ struct net *net;
+ int ret;
+
+ type = to_netns_bpf_attach_type(attr->attach_type);
+ if (type < 0)
+ return -EINVAL;
+
+ net = current->nsproxy->net_ns;
+ mutex_lock(&netns_bpf_mutex);
+
+ /* Attaching prog directly is not compatible with links */
+ if (net->bpf.links[type]) {
+ ret = -EEXIST;
+ goto out_unlock;
+ }
+
+ switch (type) {
+ case NETNS_BPF_FLOW_DISSECTOR:
+ ret = flow_dissector_bpf_prog_attach(net, prog);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+out_unlock:
+ mutex_unlock(&netns_bpf_mutex);
+
+ return ret;
+}
+
+/* Must be called with netns_bpf_mutex held. */
+static int __netns_bpf_prog_detach(struct net *net,
+ enum netns_bpf_attach_type type)
+{
+ struct bpf_prog *attached;
+
+ /* Progs attached via links cannot be detached */
+ if (net->bpf.links[type])
+ return -EINVAL;
+
+ attached = rcu_dereference_protected(net->bpf.progs[type],
+ lockdep_is_held(&netns_bpf_mutex));
+ if (!attached)
+ return -ENOENT;
+ RCU_INIT_POINTER(net->bpf.progs[type], NULL);
+ bpf_prog_put(attached);
+ return 0;
+}
+
+int netns_bpf_prog_detach(const union bpf_attr *attr)
+{
+ enum netns_bpf_attach_type type;
+ int ret;
+
+ type = to_netns_bpf_attach_type(attr->attach_type);
+ if (type < 0)
+ return -EINVAL;
+
+ mutex_lock(&netns_bpf_mutex);
+ ret = __netns_bpf_prog_detach(current->nsproxy->net_ns, type);
+ mutex_unlock(&netns_bpf_mutex);
+
+ return ret;
+}
+
+static int netns_bpf_link_attach(struct net *net, struct bpf_link *link,
+ enum netns_bpf_attach_type type)
+{
+ struct bpf_prog *prog;
+ int err;
+
+ mutex_lock(&netns_bpf_mutex);
+
+ /* Allow attaching only one prog or link for now */
+ if (net->bpf.links[type]) {
+ err = -E2BIG;
+ goto out_unlock;
+ }
+ /* Links are not compatible with attaching prog directly */
+ prog = rcu_dereference_protected(net->bpf.progs[type],
+ lockdep_is_held(&netns_bpf_mutex));
+ if (prog) {
+ err = -EEXIST;
+ goto out_unlock;
+ }
+
+ switch (type) {
+ case NETNS_BPF_FLOW_DISSECTOR:
+ err = flow_dissector_bpf_prog_attach(net, link->prog);
+ break;
+ default:
+ err = -EINVAL;
+ break;
+ }
+ if (err)
+ goto out_unlock;
+
+ net->bpf.links[type] = link;
+
+out_unlock:
+ mutex_unlock(&netns_bpf_mutex);
+ return err;
+}
+
+int netns_bpf_link_create(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ enum netns_bpf_attach_type netns_type;
+ struct bpf_link_primer link_primer;
+ struct bpf_netns_link *net_link;
+ enum bpf_attach_type type;
+ struct net *net;
+ int err;
+
+ if (attr->link_create.flags)
+ return -EINVAL;
+
+ type = attr->link_create.attach_type;
+ netns_type = to_netns_bpf_attach_type(type);
+ if (netns_type < 0)
+ return -EINVAL;
+
+ net = get_net_ns_by_fd(attr->link_create.target_fd);
+ if (IS_ERR(net))
+ return PTR_ERR(net);
+
+ net_link = kzalloc(sizeof(*net_link), GFP_USER);
+ if (!net_link) {
+ err = -ENOMEM;
+ goto out_put_net;
+ }
+ bpf_link_init(&net_link->link, BPF_LINK_TYPE_NETNS,
+ &bpf_netns_link_ops, prog);
+ net_link->net = net;
+ net_link->type = type;
+ net_link->netns_type = netns_type;
+
+ err = bpf_link_prime(&net_link->link, &link_primer);
+ if (err) {
+ kfree(net_link);
+ goto out_put_net;
+ }
+
+ err = netns_bpf_link_attach(net, &net_link->link, netns_type);
+ if (err) {
+ bpf_link_cleanup(&link_primer);
+ goto out_put_net;
+ }
+
+ put_net(net);
+ return bpf_link_settle(&link_primer);
+
+out_put_net:
+ put_net(net);
+ return err;
+}
+
+static void __net_exit netns_bpf_pernet_pre_exit(struct net *net)
+{
+ enum netns_bpf_attach_type type;
+ struct bpf_link *link;
+
+ mutex_lock(&netns_bpf_mutex);
+ for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) {
+ link = net->bpf.links[type];
+ if (link)
+ bpf_netns_link_auto_detach(link);
+ else
+ __netns_bpf_prog_detach(net, type);
+ }
+ mutex_unlock(&netns_bpf_mutex);
+}
+
+static struct pernet_operations netns_bpf_pernet_ops __net_initdata = {
+ .pre_exit = netns_bpf_pernet_pre_exit,
+};
+
+static int __init netns_bpf_init(void)
+{
+ return register_pernet_subsys(&netns_bpf_pernet_ops);
+}
+
+subsys_initcall(netns_bpf_init);
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index f697647ceb54..05c8e043b9d2 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -19,7 +19,7 @@ struct bpf_queue_stack {
u32 head, tail;
u32 size; /* max_entries + 1 */
- char elements[0] __aligned(8);
+ char elements[] __aligned(8);
};
static struct bpf_queue_stack *bpf_queue_stack(struct bpf_map *map)
@@ -45,7 +45,7 @@ static bool queue_stack_map_is_full(struct bpf_queue_stack *qs)
/* Called from syscall */
static int queue_stack_map_alloc_check(union bpf_attr *attr)
{
- if (!capable(CAP_SYS_ADMIN))
+ if (!bpf_capable())
return -EPERM;
/* check sanity of attributes */
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 01badd3eda7a..21cde24386db 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -154,7 +154,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
struct bpf_map_memory mem;
u64 array_size;
- if (!capable(CAP_SYS_ADMIN))
+ if (!bpf_capable())
return ERR_PTR(-EPERM);
array_size = sizeof(*array);
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
new file mode 100644
index 000000000000..180414bb0d3e
--- /dev/null
+++ b/kernel/bpf/ringbuf.c
@@ -0,0 +1,501 @@
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/err.h>
+#include <linux/irq_work.h>
+#include <linux/slab.h>
+#include <linux/filter.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/wait.h>
+#include <linux/poll.h>
+#include <uapi/linux/btf.h>
+
+#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE)
+
+/* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */
+#define RINGBUF_PGOFF \
+ (offsetof(struct bpf_ringbuf, consumer_pos) >> PAGE_SHIFT)
+/* consumer page and producer page */
+#define RINGBUF_POS_PAGES 2
+
+#define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4)
+
+/* Maximum size of ring buffer area is limited by 32-bit page offset within
+ * record header, counted in pages. Reserve 8 bits for extensibility, and take
+ * into account few extra pages for consumer/producer pages and
+ * non-mmap()'able parts. This gives 64GB limit, which seems plenty for single
+ * ring buffer.
+ */
+#define RINGBUF_MAX_DATA_SZ \
+ (((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE)
+
+struct bpf_ringbuf {
+ wait_queue_head_t waitq;
+ struct irq_work work;
+ u64 mask;
+ struct page **pages;
+ int nr_pages;
+ spinlock_t spinlock ____cacheline_aligned_in_smp;
+ /* Consumer and producer counters are put into separate pages to allow
+ * mapping consumer page as r/w, but restrict producer page to r/o.
+ * This protects producer position from being modified by user-space
+ * application and ruining in-kernel position tracking.
+ */
+ unsigned long consumer_pos __aligned(PAGE_SIZE);
+ unsigned long producer_pos __aligned(PAGE_SIZE);
+ char data[] __aligned(PAGE_SIZE);
+};
+
+struct bpf_ringbuf_map {
+ struct bpf_map map;
+ struct bpf_map_memory memory;
+ struct bpf_ringbuf *rb;
+};
+
+/* 8-byte ring buffer record header structure */
+struct bpf_ringbuf_hdr {
+ u32 len;
+ u32 pg_off;
+};
+
+static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node)
+{
+ const gfp_t flags = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN |
+ __GFP_ZERO;
+ int nr_meta_pages = RINGBUF_PGOFF + RINGBUF_POS_PAGES;
+ int nr_data_pages = data_sz >> PAGE_SHIFT;
+ int nr_pages = nr_meta_pages + nr_data_pages;
+ struct page **pages, *page;
+ struct bpf_ringbuf *rb;
+ size_t array_size;
+ int i;
+
+ /* Each data page is mapped twice to allow "virtual"
+ * continuous read of samples wrapping around the end of ring
+ * buffer area:
+ * ------------------------------------------------------
+ * | meta pages | real data pages | same data pages |
+ * ------------------------------------------------------
+ * | | 1 2 3 4 5 6 7 8 9 | 1 2 3 4 5 6 7 8 9 |
+ * ------------------------------------------------------
+ * | | TA DA | TA DA |
+ * ------------------------------------------------------
+ * ^^^^^^^
+ * |
+ * Here, no need to worry about special handling of wrapped-around
+ * data due to double-mapped data pages. This works both in kernel and
+ * when mmap()'ed in user-space, simplifying both kernel and
+ * user-space implementations significantly.
+ */
+ array_size = (nr_meta_pages + 2 * nr_data_pages) * sizeof(*pages);
+ if (array_size > PAGE_SIZE)
+ pages = vmalloc_node(array_size, numa_node);
+ else
+ pages = kmalloc_node(array_size, flags, numa_node);
+ if (!pages)
+ return NULL;
+
+ for (i = 0; i < nr_pages; i++) {
+ page = alloc_pages_node(numa_node, flags, 0);
+ if (!page) {
+ nr_pages = i;
+ goto err_free_pages;
+ }
+ pages[i] = page;
+ if (i >= nr_meta_pages)
+ pages[nr_data_pages + i] = page;
+ }
+
+ rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages,
+ VM_ALLOC | VM_USERMAP, PAGE_KERNEL);
+ if (rb) {
+ rb->pages = pages;
+ rb->nr_pages = nr_pages;
+ return rb;
+ }
+
+err_free_pages:
+ for (i = 0; i < nr_pages; i++)
+ __free_page(pages[i]);
+ kvfree(pages);
+ return NULL;
+}
+
+static void bpf_ringbuf_notify(struct irq_work *work)
+{
+ struct bpf_ringbuf *rb = container_of(work, struct bpf_ringbuf, work);
+
+ wake_up_all(&rb->waitq);
+}
+
+static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
+{
+ struct bpf_ringbuf *rb;
+
+ if (!data_sz || !PAGE_ALIGNED(data_sz))
+ return ERR_PTR(-EINVAL);
+
+#ifdef CONFIG_64BIT
+ /* on 32-bit arch, it's impossible to overflow record's hdr->pgoff */
+ if (data_sz > RINGBUF_MAX_DATA_SZ)
+ return ERR_PTR(-E2BIG);
+#endif
+
+ rb = bpf_ringbuf_area_alloc(data_sz, numa_node);
+ if (!rb)
+ return ERR_PTR(-ENOMEM);
+
+ spin_lock_init(&rb->spinlock);
+ init_waitqueue_head(&rb->waitq);
+ init_irq_work(&rb->work, bpf_ringbuf_notify);
+
+ rb->mask = data_sz - 1;
+ rb->consumer_pos = 0;
+ rb->producer_pos = 0;
+
+ return rb;
+}
+
+static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_ringbuf_map *rb_map;
+ u64 cost;
+ int err;
+
+ if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK)
+ return ERR_PTR(-EINVAL);
+
+ if (attr->key_size || attr->value_size ||
+ attr->max_entries == 0 || !PAGE_ALIGNED(attr->max_entries))
+ return ERR_PTR(-EINVAL);
+
+ rb_map = kzalloc(sizeof(*rb_map), GFP_USER);
+ if (!rb_map)
+ return ERR_PTR(-ENOMEM);
+
+ bpf_map_init_from_attr(&rb_map->map, attr);
+
+ cost = sizeof(struct bpf_ringbuf_map) +
+ sizeof(struct bpf_ringbuf) +
+ attr->max_entries;
+ err = bpf_map_charge_init(&rb_map->map.memory, cost);
+ if (err)
+ goto err_free_map;
+
+ rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node);
+ if (IS_ERR(rb_map->rb)) {
+ err = PTR_ERR(rb_map->rb);
+ goto err_uncharge;
+ }
+
+ return &rb_map->map;
+
+err_uncharge:
+ bpf_map_charge_finish(&rb_map->map.memory);
+err_free_map:
+ kfree(rb_map);
+ return ERR_PTR(err);
+}
+
+static void bpf_ringbuf_free(struct bpf_ringbuf *rb)
+{
+ /* copy pages pointer and nr_pages to local variable, as we are going
+ * to unmap rb itself with vunmap() below
+ */
+ struct page **pages = rb->pages;
+ int i, nr_pages = rb->nr_pages;
+
+ vunmap(rb);
+ for (i = 0; i < nr_pages; i++)
+ __free_page(pages[i]);
+ kvfree(pages);
+}
+
+static void ringbuf_map_free(struct bpf_map *map)
+{
+ struct bpf_ringbuf_map *rb_map;
+
+ /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+ * so the programs (can be more than one that used this map) were
+ * disconnected from events. Wait for outstanding critical sections in
+ * these programs to complete
+ */
+ synchronize_rcu();
+
+ rb_map = container_of(map, struct bpf_ringbuf_map, map);
+ bpf_ringbuf_free(rb_map->rb);
+ kfree(rb_map);
+}
+
+static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ return ERR_PTR(-ENOTSUPP);
+}
+
+static int ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 flags)
+{
+ return -ENOTSUPP;
+}
+
+static int ringbuf_map_delete_elem(struct bpf_map *map, void *key)
+{
+ return -ENOTSUPP;
+}
+
+static int ringbuf_map_get_next_key(struct bpf_map *map, void *key,
+ void *next_key)
+{
+ return -ENOTSUPP;
+}
+
+static size_t bpf_ringbuf_mmap_page_cnt(const struct bpf_ringbuf *rb)
+{
+ size_t data_pages = (rb->mask + 1) >> PAGE_SHIFT;
+
+ /* consumer page + producer page + 2 x data pages */
+ return RINGBUF_POS_PAGES + 2 * data_pages;
+}
+
+static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
+{
+ struct bpf_ringbuf_map *rb_map;
+ size_t mmap_sz;
+
+ rb_map = container_of(map, struct bpf_ringbuf_map, map);
+ mmap_sz = bpf_ringbuf_mmap_page_cnt(rb_map->rb) << PAGE_SHIFT;
+
+ if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) > mmap_sz)
+ return -EINVAL;
+
+ return remap_vmalloc_range(vma, rb_map->rb,
+ vma->vm_pgoff + RINGBUF_PGOFF);
+}
+
+static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb)
+{
+ unsigned long cons_pos, prod_pos;
+
+ cons_pos = smp_load_acquire(&rb->consumer_pos);
+ prod_pos = smp_load_acquire(&rb->producer_pos);
+ return prod_pos - cons_pos;
+}
+
+static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp,
+ struct poll_table_struct *pts)
+{
+ struct bpf_ringbuf_map *rb_map;
+
+ rb_map = container_of(map, struct bpf_ringbuf_map, map);
+ poll_wait(filp, &rb_map->rb->waitq, pts);
+
+ if (ringbuf_avail_data_sz(rb_map->rb))
+ return EPOLLIN | EPOLLRDNORM;
+ return 0;
+}
+
+const struct bpf_map_ops ringbuf_map_ops = {
+ .map_alloc = ringbuf_map_alloc,
+ .map_free = ringbuf_map_free,
+ .map_mmap = ringbuf_map_mmap,
+ .map_poll = ringbuf_map_poll,
+ .map_lookup_elem = ringbuf_map_lookup_elem,
+ .map_update_elem = ringbuf_map_update_elem,
+ .map_delete_elem = ringbuf_map_delete_elem,
+ .map_get_next_key = ringbuf_map_get_next_key,
+};
+
+/* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself,
+ * calculate offset from record metadata to ring buffer in pages, rounded
+ * down. This page offset is stored as part of record metadata and allows to
+ * restore struct bpf_ringbuf * from record pointer. This page offset is
+ * stored at offset 4 of record metadata header.
+ */
+static size_t bpf_ringbuf_rec_pg_off(struct bpf_ringbuf *rb,
+ struct bpf_ringbuf_hdr *hdr)
+{
+ return ((void *)hdr - (void *)rb) >> PAGE_SHIFT;
+}
+
+/* Given pointer to ring buffer record header, restore pointer to struct
+ * bpf_ringbuf itself by using page offset stored at offset 4
+ */
+static struct bpf_ringbuf *
+bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr)
+{
+ unsigned long addr = (unsigned long)(void *)hdr;
+ unsigned long off = (unsigned long)hdr->pg_off << PAGE_SHIFT;
+
+ return (void*)((addr & PAGE_MASK) - off);
+}
+
+static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
+{
+ unsigned long cons_pos, prod_pos, new_prod_pos, flags;
+ u32 len, pg_off;
+ struct bpf_ringbuf_hdr *hdr;
+
+ if (unlikely(size > RINGBUF_MAX_RECORD_SZ))
+ return NULL;
+
+ len = round_up(size + BPF_RINGBUF_HDR_SZ, 8);
+ cons_pos = smp_load_acquire(&rb->consumer_pos);
+
+ if (in_nmi()) {
+ if (!spin_trylock_irqsave(&rb->spinlock, flags))
+ return NULL;
+ } else {
+ spin_lock_irqsave(&rb->spinlock, flags);
+ }
+
+ prod_pos = rb->producer_pos;
+ new_prod_pos = prod_pos + len;
+
+ /* check for out of ringbuf space by ensuring producer position
+ * doesn't advance more than (ringbuf_size - 1) ahead
+ */
+ if (new_prod_pos - cons_pos > rb->mask) {
+ spin_unlock_irqrestore(&rb->spinlock, flags);
+ return NULL;
+ }
+
+ hdr = (void *)rb->data + (prod_pos & rb->mask);
+ pg_off = bpf_ringbuf_rec_pg_off(rb, hdr);
+ hdr->len = size | BPF_RINGBUF_BUSY_BIT;
+ hdr->pg_off = pg_off;
+
+ /* pairs with consumer's smp_load_acquire() */
+ smp_store_release(&rb->producer_pos, new_prod_pos);
+
+ spin_unlock_irqrestore(&rb->spinlock, flags);
+
+ return (void *)hdr + BPF_RINGBUF_HDR_SZ;
+}
+
+BPF_CALL_3(bpf_ringbuf_reserve, struct bpf_map *, map, u64, size, u64, flags)
+{
+ struct bpf_ringbuf_map *rb_map;
+
+ if (unlikely(flags))
+ return 0;
+
+ rb_map = container_of(map, struct bpf_ringbuf_map, map);
+ return (unsigned long)__bpf_ringbuf_reserve(rb_map->rb, size);
+}
+
+const struct bpf_func_proto bpf_ringbuf_reserve_proto = {
+ .func = bpf_ringbuf_reserve,
+ .ret_type = RET_PTR_TO_ALLOC_MEM_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_CONST_ALLOC_SIZE_OR_ZERO,
+ .arg3_type = ARG_ANYTHING,
+};
+
+static void bpf_ringbuf_commit(void *sample, u64 flags, bool discard)
+{
+ unsigned long rec_pos, cons_pos;
+ struct bpf_ringbuf_hdr *hdr;
+ struct bpf_ringbuf *rb;
+ u32 new_len;
+
+ hdr = sample - BPF_RINGBUF_HDR_SZ;
+ rb = bpf_ringbuf_restore_from_rec(hdr);
+ new_len = hdr->len ^ BPF_RINGBUF_BUSY_BIT;
+ if (discard)
+ new_len |= BPF_RINGBUF_DISCARD_BIT;
+
+ /* update record header with correct final size prefix */
+ xchg(&hdr->len, new_len);
+
+ /* if consumer caught up and is waiting for our record, notify about
+ * new data availability
+ */
+ rec_pos = (void *)hdr - (void *)rb->data;
+ cons_pos = smp_load_acquire(&rb->consumer_pos) & rb->mask;
+
+ if (flags & BPF_RB_FORCE_WAKEUP)
+ irq_work_queue(&rb->work);
+ else if (cons_pos == rec_pos && !(flags & BPF_RB_NO_WAKEUP))
+ irq_work_queue(&rb->work);
+}
+
+BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags)
+{
+ bpf_ringbuf_commit(sample, flags, false /* discard */);
+ return 0;
+}
+
+const struct bpf_func_proto bpf_ringbuf_submit_proto = {
+ .func = bpf_ringbuf_submit,
+ .ret_type = RET_VOID,
+ .arg1_type = ARG_PTR_TO_ALLOC_MEM,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags)
+{
+ bpf_ringbuf_commit(sample, flags, true /* discard */);
+ return 0;
+}
+
+const struct bpf_func_proto bpf_ringbuf_discard_proto = {
+ .func = bpf_ringbuf_discard,
+ .ret_type = RET_VOID,
+ .arg1_type = ARG_PTR_TO_ALLOC_MEM,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_ringbuf_output, struct bpf_map *, map, void *, data, u64, size,
+ u64, flags)
+{
+ struct bpf_ringbuf_map *rb_map;
+ void *rec;
+
+ if (unlikely(flags & ~(BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP)))
+ return -EINVAL;
+
+ rb_map = container_of(map, struct bpf_ringbuf_map, map);
+ rec = __bpf_ringbuf_reserve(rb_map->rb, size);
+ if (!rec)
+ return -EAGAIN;
+
+ memcpy(rec, data, size);
+ bpf_ringbuf_commit(rec, flags, false /* discard */);
+ return 0;
+}
+
+const struct bpf_func_proto bpf_ringbuf_output_proto = {
+ .func = bpf_ringbuf_output,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags)
+{
+ struct bpf_ringbuf *rb;
+
+ rb = container_of(map, struct bpf_ringbuf_map, map)->rb;
+
+ switch (flags) {
+ case BPF_RB_AVAIL_DATA:
+ return ringbuf_avail_data_sz(rb);
+ case BPF_RB_RING_SIZE:
+ return rb->mask + 1;
+ case BPF_RB_CONS_POS:
+ return smp_load_acquire(&rb->consumer_pos);
+ case BPF_RB_PROD_POS:
+ return smp_load_acquire(&rb->producer_pos);
+ default:
+ return 0;
+ }
+}
+
+const struct bpf_func_proto bpf_ringbuf_query_proto = {
+ .func = bpf_ringbuf_query,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+};
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index db76339fe358..599488f25e40 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -33,7 +33,7 @@ struct bpf_stack_map {
/* irq_work to run up_read() for build_id lookup in nmi context */
struct stack_map_irq_work {
struct irq_work irq_work;
- struct rw_semaphore *sem;
+ struct mm_struct *mm;
};
static void do_up_read(struct irq_work *entry)
@@ -44,8 +44,7 @@ static void do_up_read(struct irq_work *entry)
return;
work = container_of(entry, struct stack_map_irq_work, irq_work);
- up_read_non_owner(work->sem);
- work->sem = NULL;
+ mmap_read_unlock_non_owner(work->mm);
}
static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work);
@@ -93,7 +92,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
u64 cost, n_buckets;
int err;
- if (!capable(CAP_SYS_ADMIN))
+ if (!bpf_capable())
return ERR_PTR(-EPERM);
if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
@@ -317,7 +316,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
* with build_id.
*/
if (!user || !current || !current->mm || irq_work_busy ||
- down_read_trylock(&current->mm->mmap_sem) == 0) {
+ !mmap_read_trylock_non_owner(current->mm)) {
/* cannot access current->mm, fall back to ips */
for (i = 0; i < trace_nr; i++) {
id_offs[i].status = BPF_STACK_BUILD_ID_IP;
@@ -342,16 +341,10 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
}
if (!work) {
- up_read(&current->mm->mmap_sem);
+ mmap_read_unlock_non_owner(current->mm);
} else {
- work->sem = &current->mm->mmap_sem;
+ work->mm = current->mm;
irq_work_queue(&work->irq_work);
- /*
- * The irq_work will release the mmap_sem with
- * up_read_non_owner(). The rwsem_release() is called
- * here to release the lock from lockdep's perspective.
- */
- rwsem_release(&current->mm->mmap_sem.dep_map, _RET_IP_);
}
}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4e6dee19a668..8da159936bab 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -25,7 +25,10 @@
#include <linux/nospec.h>
#include <linux/audit.h>
#include <uapi/linux/btf.h>
+#include <linux/pgtable.h>
#include <linux/bpf_lsm.h>
+#include <linux/poll.h>
+#include <linux/bpf-netns.h>
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
@@ -42,6 +45,8 @@ static DEFINE_IDR(prog_idr);
static DEFINE_SPINLOCK(prog_idr_lock);
static DEFINE_IDR(map_idr);
static DEFINE_SPINLOCK(map_idr_lock);
+static DEFINE_IDR(link_idr);
+static DEFINE_SPINLOCK(link_idr_lock);
int sysctl_unprivileged_bpf_disabled __read_mostly;
@@ -49,9 +54,11 @@ static const struct bpf_map_ops * const bpf_map_types[] = {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
#define BPF_MAP_TYPE(_id, _ops) \
[_id] = &_ops,
+#define BPF_LINK_TYPE(_id, _name)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
+#undef BPF_LINK_TYPE
};
/*
@@ -67,32 +74,19 @@ int bpf_check_uarg_tail_zero(void __user *uaddr,
size_t expected_size,
size_t actual_size)
{
- unsigned char __user *addr;
- unsigned char __user *end;
- unsigned char val;
- int err;
+ unsigned char __user *addr = uaddr + expected_size;
+ int res;
if (unlikely(actual_size > PAGE_SIZE)) /* silly large */
return -E2BIG;
- if (unlikely(!access_ok(uaddr, actual_size)))
- return -EFAULT;
-
if (actual_size <= expected_size)
return 0;
- addr = uaddr + expected_size;
- end = uaddr + actual_size;
-
- for (; addr < end; addr++) {
- err = get_user(val, addr);
- if (err)
- return err;
- if (val)
- return -E2BIG;
- }
-
- return 0;
+ res = check_zeroed_user(addr, actual_size - expected_size);
+ if (res < 0)
+ return res;
+ return res ? 0 : -E2BIG;
}
const struct bpf_map_ops bpf_map_offload_ops = {
@@ -281,27 +275,29 @@ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
* __GFP_RETRY_MAYFAIL to avoid such situations.
*/
- const gfp_t flags = __GFP_NOWARN | __GFP_ZERO;
+ const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO;
+ unsigned int flags = 0;
+ unsigned long align = 1;
void *area;
if (size >= SIZE_MAX)
return NULL;
/* kmalloc()'ed memory can't be mmap()'ed */
- if (!mmapable && size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
- area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags,
+ if (mmapable) {
+ BUG_ON(!PAGE_ALIGNED(size));
+ align = SHMLBA;
+ flags = VM_USERMAP;
+ } else if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
+ area = kmalloc_node(size, gfp | GFP_USER | __GFP_NORETRY,
numa_node);
if (area != NULL)
return area;
}
- if (mmapable) {
- BUG_ON(!PAGE_ALIGNED(size));
- return vmalloc_user_node_flags(size, numa_node, GFP_KERNEL |
- __GFP_RETRY_MAYFAIL | flags);
- }
- return __vmalloc_node_flags_caller(size, numa_node,
- GFP_KERNEL | __GFP_RETRY_MAYFAIL |
- flags, __builtin_return_address(0));
+
+ return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
+ gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL, PAGE_KERNEL,
+ flags, numa_node, __builtin_return_address(0));
}
void *bpf_map_area_alloc(u64 size, int numa_node)
@@ -658,6 +654,16 @@ out:
return err;
}
+static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts)
+{
+ struct bpf_map *map = filp->private_data;
+
+ if (map->ops->map_poll)
+ return map->ops->map_poll(map, filp, pts);
+
+ return EPOLLERR;
+}
+
const struct file_operations bpf_map_fops = {
#ifdef CONFIG_PROC_FS
.show_fdinfo = bpf_map_show_fdinfo,
@@ -666,6 +672,7 @@ const struct file_operations bpf_map_fops = {
.read = bpf_dummy_read,
.write = bpf_dummy_write,
.mmap = bpf_map_mmap,
+ .poll = bpf_map_poll,
};
int bpf_map_new_fd(struct bpf_map *map, int flags)
@@ -1383,7 +1390,7 @@ int generic_map_lookup_batch(struct bpf_map *map,
buf = kmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN);
if (!buf) {
- kvfree(buf_prevkey);
+ kfree(buf_prevkey);
return -ENOMEM;
}
@@ -1468,7 +1475,8 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
- if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) ||
+ !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
err = -EPERM;
goto err_put;
}
@@ -1543,7 +1551,7 @@ static int map_freeze(const union bpf_attr *attr)
err = -EBUSY;
goto err_put;
}
- if (!capable(CAP_SYS_ADMIN)) {
+ if (!bpf_capable()) {
err = -EPERM;
goto err_put;
}
@@ -1559,9 +1567,11 @@ static const struct bpf_prog_ops * const bpf_prog_types[] = {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
[_id] = & _name ## _prog_ops,
#define BPF_MAP_TYPE(_id, _ops)
+#define BPF_LINK_TYPE(_id, _name)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
+#undef BPF_LINK_TYPE
};
static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
@@ -1983,6 +1993,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
@@ -2016,6 +2030,55 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
}
}
+static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)
+{
+ switch (prog_type) {
+ case BPF_PROG_TYPE_SCHED_CLS:
+ case BPF_PROG_TYPE_SCHED_ACT:
+ case BPF_PROG_TYPE_XDP:
+ case BPF_PROG_TYPE_LWT_IN:
+ case BPF_PROG_TYPE_LWT_OUT:
+ case BPF_PROG_TYPE_LWT_XMIT:
+ case BPF_PROG_TYPE_LWT_SEG6LOCAL:
+ case BPF_PROG_TYPE_SK_SKB:
+ case BPF_PROG_TYPE_SK_MSG:
+ case BPF_PROG_TYPE_LIRC_MODE2:
+ case BPF_PROG_TYPE_FLOW_DISSECTOR:
+ case BPF_PROG_TYPE_CGROUP_DEVICE:
+ case BPF_PROG_TYPE_CGROUP_SOCK:
+ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+ case BPF_PROG_TYPE_CGROUP_SYSCTL:
+ case BPF_PROG_TYPE_SOCK_OPS:
+ case BPF_PROG_TYPE_EXT: /* extends any prog */
+ return true;
+ case BPF_PROG_TYPE_CGROUP_SKB:
+ /* always unpriv */
+ case BPF_PROG_TYPE_SK_REUSEPORT:
+ /* equivalent to SOCKET_FILTER. need CAP_BPF only */
+ default:
+ return false;
+ }
+}
+
+static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
+{
+ switch (prog_type) {
+ case BPF_PROG_TYPE_KPROBE:
+ case BPF_PROG_TYPE_TRACEPOINT:
+ case BPF_PROG_TYPE_PERF_EVENT:
+ case BPF_PROG_TYPE_RAW_TRACEPOINT:
+ case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
+ case BPF_PROG_TYPE_TRACING:
+ case BPF_PROG_TYPE_LSM:
+ case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */
+ case BPF_PROG_TYPE_EXT: /* extends any prog */
+ return true;
+ default:
+ return false;
+ }
+}
+
/* last field in 'union bpf_attr' used by this command */
#define BPF_PROG_LOAD_LAST_FIELD attach_prog_fd
@@ -2038,7 +2101,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
(attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
- !capable(CAP_SYS_ADMIN))
+ !bpf_capable())
return -EPERM;
/* copy eBPF program license from user space */
@@ -2051,11 +2114,16 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
is_gpl = license_is_gpl_compatible(license);
if (attr->insn_cnt == 0 ||
- attr->insn_cnt > (capable(CAP_SYS_ADMIN) ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
+ attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
return -E2BIG;
if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
type != BPF_PROG_TYPE_CGROUP_SKB &&
- !capable(CAP_SYS_ADMIN))
+ !bpf_capable())
+ return -EPERM;
+
+ if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN))
+ return -EPERM;
+ if (is_perfmon_prog_type(type) && !perfmon_capable())
return -EPERM;
bpf_prog_load_fixup_attach_type(attr);
@@ -2194,25 +2262,39 @@ static int bpf_obj_get(const union bpf_attr *attr)
attr->file_flags);
}
-void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops,
- struct bpf_prog *prog)
+void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
+ const struct bpf_link_ops *ops, struct bpf_prog *prog)
{
atomic64_set(&link->refcnt, 1);
+ link->type = type;
+ link->id = 0;
link->ops = ops;
link->prog = prog;
}
+static void bpf_link_free_id(int id)
+{
+ if (!id)
+ return;
+
+ spin_lock_bh(&link_idr_lock);
+ idr_remove(&link_idr, id);
+ spin_unlock_bh(&link_idr_lock);
+}
+
/* Clean up bpf_link and corresponding anon_inode file and FD. After
* anon_inode is created, bpf_link can't be just kfree()'d due to deferred
- * anon_inode's release() call. This helper manages marking bpf_link as
- * defunct, releases anon_inode file and puts reserved FD.
+ * anon_inode's release() call. This helper marksbpf_link as
+ * defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt
+ * is not decremented, it's the responsibility of a calling code that failed
+ * to complete bpf_link initialization.
*/
-void bpf_link_cleanup(struct bpf_link *link, struct file *link_file,
- int link_fd)
+void bpf_link_cleanup(struct bpf_link_primer *primer)
{
- link->prog = NULL;
- fput(link_file);
- put_unused_fd(link_fd);
+ primer->link->prog = NULL;
+ bpf_link_free_id(primer->id);
+ fput(primer->file);
+ put_unused_fd(primer->fd);
}
void bpf_link_inc(struct bpf_link *link)
@@ -2223,6 +2305,7 @@ void bpf_link_inc(struct bpf_link *link)
/* bpf_link_free is guaranteed to be called from process context */
static void bpf_link_free(struct bpf_link *link)
{
+ bpf_link_free_id(link->id);
if (link->prog) {
/* detach BPF program, clean up used resources */
link->ops->release(link);
@@ -2264,35 +2347,35 @@ static int bpf_link_release(struct inode *inode, struct file *filp)
}
#ifdef CONFIG_PROC_FS
-static const struct bpf_link_ops bpf_raw_tp_lops;
-static const struct bpf_link_ops bpf_tracing_link_lops;
+#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
+#define BPF_MAP_TYPE(_id, _ops)
+#define BPF_LINK_TYPE(_id, _name) [_id] = #_name,
+static const char *bpf_link_type_strs[] = {
+ [BPF_LINK_TYPE_UNSPEC] = "<invalid>",
+#include <linux/bpf_types.h>
+};
+#undef BPF_PROG_TYPE
+#undef BPF_MAP_TYPE
+#undef BPF_LINK_TYPE
static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
{
const struct bpf_link *link = filp->private_data;
const struct bpf_prog *prog = link->prog;
char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
- const char *link_type;
-
- if (link->ops == &bpf_raw_tp_lops)
- link_type = "raw_tracepoint";
- else if (link->ops == &bpf_tracing_link_lops)
- link_type = "tracing";
-#ifdef CONFIG_CGROUP_BPF
- else if (link->ops == &bpf_cgroup_link_lops)
- link_type = "cgroup";
-#endif
- else
- link_type = "unknown";
bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
seq_printf(m,
"link_type:\t%s\n"
+ "link_id:\t%u\n"
"prog_tag:\t%s\n"
"prog_id:\t%u\n",
- link_type,
+ bpf_link_type_strs[link->type],
+ link->id,
prog_tag,
prog->aux->id);
+ if (link->ops->show_fdinfo)
+ link->ops->show_fdinfo(link, m);
}
#endif
@@ -2305,36 +2388,77 @@ static const struct file_operations bpf_link_fops = {
.write = bpf_dummy_write,
};
-int bpf_link_new_fd(struct bpf_link *link)
+static int bpf_link_alloc_id(struct bpf_link *link)
{
- return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC);
-}
+ int id;
-/* Similar to bpf_link_new_fd, create anon_inode for given bpf_link, but
- * instead of immediately installing fd in fdtable, just reserve it and
- * return. Caller then need to either install it with fd_install(fd, file) or
- * release with put_unused_fd(fd).
- * This is useful for cases when bpf_link attachment/detachment are
- * complicated and expensive operations and should be delayed until all the fd
- * reservation and anon_inode creation succeeds.
+ idr_preload(GFP_KERNEL);
+ spin_lock_bh(&link_idr_lock);
+ id = idr_alloc_cyclic(&link_idr, link, 1, INT_MAX, GFP_ATOMIC);
+ spin_unlock_bh(&link_idr_lock);
+ idr_preload_end();
+
+ return id;
+}
+
+/* Prepare bpf_link to be exposed to user-space by allocating anon_inode file,
+ * reserving unused FD and allocating ID from link_idr. This is to be paired
+ * with bpf_link_settle() to install FD and ID and expose bpf_link to
+ * user-space, if bpf_link is successfully attached. If not, bpf_link and
+ * pre-allocated resources are to be freed with bpf_cleanup() call. All the
+ * transient state is passed around in struct bpf_link_primer.
+ * This is preferred way to create and initialize bpf_link, especially when
+ * there are complicated and expensive operations inbetween creating bpf_link
+ * itself and attaching it to BPF hook. By using bpf_link_prime() and
+ * bpf_link_settle() kernel code using bpf_link doesn't have to perform
+ * expensive (and potentially failing) roll back operations in a rare case
+ * that file, FD, or ID can't be allocated.
*/
-struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd)
+int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer)
{
struct file *file;
- int fd;
+ int fd, id;
fd = get_unused_fd_flags(O_CLOEXEC);
if (fd < 0)
- return ERR_PTR(fd);
+ return fd;
+
+
+ id = bpf_link_alloc_id(link);
+ if (id < 0) {
+ put_unused_fd(fd);
+ return id;
+ }
file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC);
if (IS_ERR(file)) {
+ bpf_link_free_id(id);
put_unused_fd(fd);
- return file;
+ return PTR_ERR(file);
}
- *reserved_fd = fd;
- return file;
+ primer->link = link;
+ primer->file = file;
+ primer->fd = fd;
+ primer->id = id;
+ return 0;
+}
+
+int bpf_link_settle(struct bpf_link_primer *primer)
+{
+ /* make bpf_link fetchable by ID */
+ spin_lock_bh(&link_idr_lock);
+ primer->link->id = primer->id;
+ spin_unlock_bh(&link_idr_lock);
+ /* make bpf_link fetchable by FD */
+ fd_install(primer->fd, primer->file);
+ /* pass through installed FD */
+ return primer->fd;
+}
+
+int bpf_link_new_fd(struct bpf_link *link)
+{
+ return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC);
}
struct bpf_link *bpf_link_get_from_fd(u32 ufd)
@@ -2358,6 +2482,7 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd)
struct bpf_tracing_link {
struct bpf_link link;
+ enum bpf_attach_type attach_type;
};
static void bpf_tracing_link_release(struct bpf_link *link)
@@ -2373,16 +2498,40 @@ static void bpf_tracing_link_dealloc(struct bpf_link *link)
kfree(tr_link);
}
+static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_tracing_link *tr_link =
+ container_of(link, struct bpf_tracing_link, link);
+
+ seq_printf(seq,
+ "attach_type:\t%d\n",
+ tr_link->attach_type);
+}
+
+static int bpf_tracing_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_tracing_link *tr_link =
+ container_of(link, struct bpf_tracing_link, link);
+
+ info->tracing.attach_type = tr_link->attach_type;
+
+ return 0;
+}
+
static const struct bpf_link_ops bpf_tracing_link_lops = {
.release = bpf_tracing_link_release,
.dealloc = bpf_tracing_link_dealloc,
+ .show_fdinfo = bpf_tracing_link_show_fdinfo,
+ .fill_link_info = bpf_tracing_link_fill_link_info,
};
static int bpf_tracing_prog_attach(struct bpf_prog *prog)
{
+ struct bpf_link_primer link_primer;
struct bpf_tracing_link *link;
- struct file *link_file;
- int link_fd, err;
+ int err;
switch (prog->type) {
case BPF_PROG_TYPE_TRACING:
@@ -2415,24 +2564,23 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog)
err = -ENOMEM;
goto out_put_prog;
}
- bpf_link_init(&link->link, &bpf_tracing_link_lops, prog);
+ bpf_link_init(&link->link, BPF_LINK_TYPE_TRACING,
+ &bpf_tracing_link_lops, prog);
+ link->attach_type = prog->expected_attach_type;
- link_file = bpf_link_new_file(&link->link, &link_fd);
- if (IS_ERR(link_file)) {
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err) {
kfree(link);
- err = PTR_ERR(link_file);
goto out_put_prog;
}
err = bpf_trampoline_link_prog(prog);
if (err) {
- bpf_link_cleanup(&link->link, link_file, link_fd);
+ bpf_link_cleanup(&link_primer);
goto out_put_prog;
}
- fd_install(link_fd, link_file);
- return link_fd;
-
+ return bpf_link_settle(&link_primer);
out_put_prog:
bpf_prog_put(prog);
return err;
@@ -2460,22 +2608,69 @@ static void bpf_raw_tp_link_dealloc(struct bpf_link *link)
kfree(raw_tp);
}
-static const struct bpf_link_ops bpf_raw_tp_lops = {
+static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_raw_tp_link *raw_tp_link =
+ container_of(link, struct bpf_raw_tp_link, link);
+
+ seq_printf(seq,
+ "tp_name:\t%s\n",
+ raw_tp_link->btp->tp->name);
+}
+
+static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_raw_tp_link *raw_tp_link =
+ container_of(link, struct bpf_raw_tp_link, link);
+ char __user *ubuf = u64_to_user_ptr(info->raw_tracepoint.tp_name);
+ const char *tp_name = raw_tp_link->btp->tp->name;
+ u32 ulen = info->raw_tracepoint.tp_name_len;
+ size_t tp_len = strlen(tp_name);
+
+ if (ulen && !ubuf)
+ return -EINVAL;
+
+ info->raw_tracepoint.tp_name_len = tp_len + 1;
+
+ if (!ubuf)
+ return 0;
+
+ if (ulen >= tp_len + 1) {
+ if (copy_to_user(ubuf, tp_name, tp_len + 1))
+ return -EFAULT;
+ } else {
+ char zero = '\0';
+
+ if (copy_to_user(ubuf, tp_name, ulen - 1))
+ return -EFAULT;
+ if (put_user(zero, ubuf + ulen - 1))
+ return -EFAULT;
+ return -ENOSPC;
+ }
+
+ return 0;
+}
+
+static const struct bpf_link_ops bpf_raw_tp_link_lops = {
.release = bpf_raw_tp_link_release,
.dealloc = bpf_raw_tp_link_dealloc,
+ .show_fdinfo = bpf_raw_tp_link_show_fdinfo,
+ .fill_link_info = bpf_raw_tp_link_fill_link_info,
};
#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
{
+ struct bpf_link_primer link_primer;
struct bpf_raw_tp_link *link;
struct bpf_raw_event_map *btp;
- struct file *link_file;
struct bpf_prog *prog;
const char *tp_name;
char buf[128];
- int link_fd, err;
+ int err;
if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN))
return -EINVAL;
@@ -2528,24 +2723,23 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
err = -ENOMEM;
goto out_put_btp;
}
- bpf_link_init(&link->link, &bpf_raw_tp_lops, prog);
+ bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT,
+ &bpf_raw_tp_link_lops, prog);
link->btp = btp;
- link_file = bpf_link_new_file(&link->link, &link_fd);
- if (IS_ERR(link_file)) {
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err) {
kfree(link);
- err = PTR_ERR(link_file);
goto out_put_btp;
}
err = bpf_probe_register(link->btp, prog);
if (err) {
- bpf_link_cleanup(&link->link, link_file, link_fd);
+ bpf_link_cleanup(&link_primer);
goto out_put_btp;
}
- fd_install(link_fd, link_file);
- return link_fd;
+ return bpf_link_settle(&link_primer);
out_put_btp:
bpf_put_raw_tracepoint(btp);
@@ -2563,6 +2757,11 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
case BPF_PROG_TYPE_CGROUP_SKB:
+ if (!capable(CAP_NET_ADMIN))
+ /* cg-skb progs can be loaded by unpriv user.
+ * check permissions at attach time.
+ */
+ return -EPERM;
return prog->enforce_expected_attach_type &&
prog->expected_attach_type != attach_type ?
-EINVAL : 0;
@@ -2587,6 +2786,10 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
@@ -2610,6 +2813,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
case BPF_CGROUP_GETSOCKOPT:
case BPF_CGROUP_SETSOCKOPT:
return BPF_PROG_TYPE_CGROUP_SOCKOPT;
+ case BPF_TRACE_ITER:
+ return BPF_PROG_TYPE_TRACING;
default:
return BPF_PROG_TYPE_UNSPEC;
}
@@ -2626,9 +2831,6 @@ static int bpf_prog_attach(const union bpf_attr *attr)
struct bpf_prog *prog;
int ret;
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
-
if (CHECK_ATTR(BPF_PROG_ATTACH))
return -EINVAL;
@@ -2657,7 +2859,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
ret = lirc_prog_attach(attr, prog);
break;
case BPF_PROG_TYPE_FLOW_DISSECTOR:
- ret = skb_flow_dissector_bpf_prog_attach(attr, prog);
+ ret = netns_bpf_prog_attach(attr, prog);
break;
case BPF_PROG_TYPE_CGROUP_DEVICE:
case BPF_PROG_TYPE_CGROUP_SKB:
@@ -2683,9 +2885,6 @@ static int bpf_prog_detach(const union bpf_attr *attr)
{
enum bpf_prog_type ptype;
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
-
if (CHECK_ATTR(BPF_PROG_DETACH))
return -EINVAL;
@@ -2698,7 +2897,9 @@ static int bpf_prog_detach(const union bpf_attr *attr)
case BPF_PROG_TYPE_LIRC_MODE2:
return lirc_prog_detach(attr);
case BPF_PROG_TYPE_FLOW_DISSECTOR:
- return skb_flow_dissector_bpf_prog_detach(attr);
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ return netns_bpf_prog_detach(attr);
case BPF_PROG_TYPE_CGROUP_DEVICE:
case BPF_PROG_TYPE_CGROUP_SKB:
case BPF_PROG_TYPE_CGROUP_SOCK:
@@ -2734,6 +2935,10 @@ static int bpf_prog_query(const union bpf_attr *attr,
case BPF_CGROUP_INET6_POST_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
@@ -2747,7 +2952,7 @@ static int bpf_prog_query(const union bpf_attr *attr,
case BPF_LIRC_MODE2:
return lirc_prog_query(attr, uattr);
case BPF_FLOW_DISSECTOR:
- return skb_flow_dissector_prog_query(attr, uattr);
+ return netns_bpf_prog_query(attr, uattr);
default:
return -EINVAL;
}
@@ -2761,8 +2966,6 @@ static int bpf_prog_test_run(const union bpf_attr *attr,
struct bpf_prog *prog;
int ret = -ENOTSUPP;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
if (CHECK_ATTR(BPF_PROG_TEST_RUN))
return -EINVAL;
@@ -2813,6 +3016,25 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr,
return err;
}
+struct bpf_map *bpf_map_get_curr_or_next(u32 *id)
+{
+ struct bpf_map *map;
+
+ spin_lock_bh(&map_idr_lock);
+again:
+ map = idr_get_next(&map_idr, id);
+ if (map) {
+ map = __bpf_map_inc_not_zero(map, false);
+ if (IS_ERR(map)) {
+ (*id)++;
+ goto again;
+ }
+ }
+ spin_unlock_bh(&map_idr_lock);
+
+ return map;
+}
+
#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
struct bpf_prog *bpf_prog_by_id(u32 id)
@@ -2923,6 +3145,7 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog)
struct bpf_insn *insns;
u32 off, type;
u64 imm;
+ u8 code;
int i;
insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog),
@@ -2931,21 +3154,27 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog)
return insns;
for (i = 0; i < prog->len; i++) {
- if (insns[i].code == (BPF_JMP | BPF_TAIL_CALL)) {
+ code = insns[i].code;
+
+ if (code == (BPF_JMP | BPF_TAIL_CALL)) {
insns[i].code = BPF_JMP | BPF_CALL;
insns[i].imm = BPF_FUNC_tail_call;
/* fall-through */
}
- if (insns[i].code == (BPF_JMP | BPF_CALL) ||
- insns[i].code == (BPF_JMP | BPF_CALL_ARGS)) {
- if (insns[i].code == (BPF_JMP | BPF_CALL_ARGS))
+ if (code == (BPF_JMP | BPF_CALL) ||
+ code == (BPF_JMP | BPF_CALL_ARGS)) {
+ if (code == (BPF_JMP | BPF_CALL_ARGS))
insns[i].code = BPF_JMP | BPF_CALL;
if (!bpf_dump_raw_ok())
insns[i].imm = 0;
continue;
}
+ if (BPF_CLASS(code) == BPF_LDX && BPF_MODE(code) == BPF_PROBE_MEM) {
+ insns[i].code = BPF_LDX | BPF_SIZE(code) | BPF_MEM;
+ continue;
+ }
- if (insns[i].code != (BPF_LD | BPF_IMM | BPF_DW))
+ if (code != (BPF_LD | BPF_IMM | BPF_DW))
continue;
imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm;
@@ -3044,7 +3273,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
info.run_time_ns = stats.nsecs;
info.run_cnt = stats.cnt;
- if (!capable(CAP_SYS_ADMIN)) {
+ if (!bpf_capable()) {
info.jited_prog_len = 0;
info.xlated_prog_len = 0;
info.nr_jited_ksyms = 0;
@@ -3326,6 +3555,42 @@ static int bpf_btf_get_info_by_fd(struct btf *btf,
return btf_get_info_by_fd(btf, attr, uattr);
}
+static int bpf_link_get_info_by_fd(struct bpf_link *link,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct bpf_link_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+ struct bpf_link_info info;
+ u32 info_len = attr->info.info_len;
+ int err;
+
+ err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+ if (err)
+ return err;
+ info_len = min_t(u32, sizeof(info), info_len);
+
+ memset(&info, 0, sizeof(info));
+ if (copy_from_user(&info, uinfo, info_len))
+ return -EFAULT;
+
+ info.type = link->type;
+ info.id = link->id;
+ info.prog_id = link->prog->aux->id;
+
+ if (link->ops->fill_link_info) {
+ err = link->ops->fill_link_info(link, &info);
+ if (err)
+ return err;
+ }
+
+ if (copy_to_user(uinfo, &info, info_len) ||
+ put_user(info_len, &uattr->info.info_len))
+ return -EFAULT;
+
+ return 0;
+}
+
+
#define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info
static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
@@ -3350,6 +3615,9 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
uattr);
else if (f.file->f_op == &btf_fops)
err = bpf_btf_get_info_by_fd(f.file->private_data, attr, uattr);
+ else if (f.file->f_op == &bpf_link_fops)
+ err = bpf_link_get_info_by_fd(f.file->private_data,
+ attr, uattr);
else
err = -EINVAL;
@@ -3364,7 +3632,7 @@ static int bpf_btf_load(const union bpf_attr *attr)
if (CHECK_ATTR(BPF_BTF_LOAD))
return -EINVAL;
- if (!capable(CAP_SYS_ADMIN))
+ if (!bpf_capable())
return -EPERM;
return btf_new_fd(attr);
@@ -3477,7 +3745,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
if (file->f_op == &bpf_link_fops) {
struct bpf_link *link = file->private_data;
- if (link->ops == &bpf_raw_tp_lops) {
+ if (link->ops == &bpf_raw_tp_link_lops) {
struct bpf_raw_tp_link *raw_tp =
container_of(link, struct bpf_raw_tp_link, link);
struct bpf_raw_event_map *btp = raw_tp->btp;
@@ -3571,6 +3839,15 @@ err_put:
return err;
}
+static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ if (attr->link_create.attach_type == BPF_TRACE_ITER &&
+ prog->expected_attach_type == BPF_TRACE_ITER)
+ return bpf_iter_link_attach(attr, prog);
+
+ return -EINVAL;
+}
+
#define BPF_LINK_CREATE_LAST_FIELD link_create.flags
static int link_create(union bpf_attr *attr)
{
@@ -3578,9 +3855,6 @@ static int link_create(union bpf_attr *attr)
struct bpf_prog *prog;
int ret;
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
-
if (CHECK_ATTR(BPF_LINK_CREATE))
return -EINVAL;
@@ -3607,6 +3881,12 @@ static int link_create(union bpf_attr *attr)
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
ret = cgroup_bpf_link_attach(attr, prog);
break;
+ case BPF_PROG_TYPE_TRACING:
+ ret = tracing_bpf_link_attach(attr, prog);
+ break;
+ case BPF_PROG_TYPE_FLOW_DISSECTOR:
+ ret = netns_bpf_link_create(attr, prog);
+ break;
default:
ret = -EINVAL;
}
@@ -3626,9 +3906,6 @@ static int link_update(union bpf_attr *attr)
u32 flags;
int ret;
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
-
if (CHECK_ATTR(BPF_LINK_UPDATE))
return -EINVAL;
@@ -3658,13 +3935,10 @@ static int link_update(union bpf_attr *attr)
goto out_put_progs;
}
-#ifdef CONFIG_CGROUP_BPF
- if (link->ops == &bpf_cgroup_link_lops) {
- ret = cgroup_bpf_replace(link, old_prog, new_prog);
- goto out_put_progs;
- }
-#endif
- ret = -EINVAL;
+ if (link->ops->update_prog)
+ ret = link->ops->update_prog(link, new_prog, old_prog);
+ else
+ ret = -EINVAL;
out_put_progs:
if (old_prog)
@@ -3676,12 +3950,131 @@ out_put_link:
return ret;
}
+static int bpf_link_inc_not_zero(struct bpf_link *link)
+{
+ return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? 0 : -ENOENT;
+}
+
+#define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id
+
+static int bpf_link_get_fd_by_id(const union bpf_attr *attr)
+{
+ struct bpf_link *link;
+ u32 id = attr->link_id;
+ int fd, err;
+
+ if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ spin_lock_bh(&link_idr_lock);
+ link = idr_find(&link_idr, id);
+ /* before link is "settled", ID is 0, pretend it doesn't exist yet */
+ if (link) {
+ if (link->id)
+ err = bpf_link_inc_not_zero(link);
+ else
+ err = -EAGAIN;
+ } else {
+ err = -ENOENT;
+ }
+ spin_unlock_bh(&link_idr_lock);
+
+ if (err)
+ return err;
+
+ fd = bpf_link_new_fd(link);
+ if (fd < 0)
+ bpf_link_put(link);
+
+ return fd;
+}
+
+DEFINE_MUTEX(bpf_stats_enabled_mutex);
+
+static int bpf_stats_release(struct inode *inode, struct file *file)
+{
+ mutex_lock(&bpf_stats_enabled_mutex);
+ static_key_slow_dec(&bpf_stats_enabled_key.key);
+ mutex_unlock(&bpf_stats_enabled_mutex);
+ return 0;
+}
+
+static const struct file_operations bpf_stats_fops = {
+ .release = bpf_stats_release,
+};
+
+static int bpf_enable_runtime_stats(void)
+{
+ int fd;
+
+ mutex_lock(&bpf_stats_enabled_mutex);
+
+ /* Set a very high limit to avoid overflow */
+ if (static_key_count(&bpf_stats_enabled_key.key) > INT_MAX / 2) {
+ mutex_unlock(&bpf_stats_enabled_mutex);
+ return -EBUSY;
+ }
+
+ fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, O_CLOEXEC);
+ if (fd >= 0)
+ static_key_slow_inc(&bpf_stats_enabled_key.key);
+
+ mutex_unlock(&bpf_stats_enabled_mutex);
+ return fd;
+}
+
+#define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type
+
+static int bpf_enable_stats(union bpf_attr *attr)
+{
+
+ if (CHECK_ATTR(BPF_ENABLE_STATS))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ switch (attr->enable_stats.type) {
+ case BPF_STATS_RUN_TIME:
+ return bpf_enable_runtime_stats();
+ default:
+ break;
+ }
+ return -EINVAL;
+}
+
+#define BPF_ITER_CREATE_LAST_FIELD iter_create.flags
+
+static int bpf_iter_create(union bpf_attr *attr)
+{
+ struct bpf_link *link;
+ int err;
+
+ if (CHECK_ATTR(BPF_ITER_CREATE))
+ return -EINVAL;
+
+ if (attr->iter_create.flags)
+ return -EINVAL;
+
+ link = bpf_link_get_from_fd(attr->iter_create.link_fd);
+ if (IS_ERR(link))
+ return PTR_ERR(link);
+
+ err = bpf_iter_new_fd(link);
+ bpf_link_put(link);
+
+ return err;
+}
+
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
union bpf_attr attr;
int err;
- if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
+ if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
return -EPERM;
err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
@@ -3793,6 +4186,19 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_LINK_UPDATE:
err = link_update(&attr);
break;
+ case BPF_LINK_GET_FD_BY_ID:
+ err = bpf_link_get_fd_by_id(&attr);
+ break;
+ case BPF_LINK_GET_NEXT_ID:
+ err = bpf_obj_get_next_id(&attr, uattr,
+ &link_idr, &link_idr_lock);
+ break;
+ case BPF_ENABLE_STATS:
+ err = bpf_enable_stats(&attr);
+ break;
+ case BPF_ITER_CREATE:
+ err = bpf_iter_create(&attr);
+ break;
default:
err = -EINVAL;
break;
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
new file mode 100644
index 000000000000..4dbf2b6035f8
--- /dev/null
+++ b/kernel/bpf/task_iter.c
@@ -0,0 +1,353 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2020 Facebook */
+
+#include <linux/init.h>
+#include <linux/namei.h>
+#include <linux/pid_namespace.h>
+#include <linux/fs.h>
+#include <linux/fdtable.h>
+#include <linux/filter.h>
+
+struct bpf_iter_seq_task_common {
+ struct pid_namespace *ns;
+};
+
+struct bpf_iter_seq_task_info {
+ /* The first field must be struct bpf_iter_seq_task_common.
+ * this is assumed by {init, fini}_seq_pidns() callback functions.
+ */
+ struct bpf_iter_seq_task_common common;
+ u32 tid;
+};
+
+static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
+ u32 *tid)
+{
+ struct task_struct *task = NULL;
+ struct pid *pid;
+
+ rcu_read_lock();
+retry:
+ pid = idr_get_next(&ns->idr, tid);
+ if (pid) {
+ task = get_pid_task(pid, PIDTYPE_PID);
+ if (!task) {
+ ++*tid;
+ goto retry;
+ }
+ }
+ rcu_read_unlock();
+
+ return task;
+}
+
+static void *task_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct bpf_iter_seq_task_info *info = seq->private;
+ struct task_struct *task;
+
+ task = task_seq_get_next(info->common.ns, &info->tid);
+ if (!task)
+ return NULL;
+
+ ++*pos;
+ return task;
+}
+
+static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct bpf_iter_seq_task_info *info = seq->private;
+ struct task_struct *task;
+
+ ++*pos;
+ ++info->tid;
+ put_task_struct((struct task_struct *)v);
+ task = task_seq_get_next(info->common.ns, &info->tid);
+ if (!task)
+ return NULL;
+
+ return task;
+}
+
+struct bpf_iter__task {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct task_struct *, task);
+};
+
+DEFINE_BPF_ITER_FUNC(task, struct bpf_iter_meta *meta, struct task_struct *task)
+
+static int __task_seq_show(struct seq_file *seq, struct task_struct *task,
+ bool in_stop)
+{
+ struct bpf_iter_meta meta;
+ struct bpf_iter__task ctx;
+ struct bpf_prog *prog;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, in_stop);
+ if (!prog)
+ return 0;
+
+ meta.seq = seq;
+ ctx.meta = &meta;
+ ctx.task = task;
+ return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int task_seq_show(struct seq_file *seq, void *v)
+{
+ return __task_seq_show(seq, v, false);
+}
+
+static void task_seq_stop(struct seq_file *seq, void *v)
+{
+ if (!v)
+ (void)__task_seq_show(seq, v, true);
+ else
+ put_task_struct((struct task_struct *)v);
+}
+
+static const struct seq_operations task_seq_ops = {
+ .start = task_seq_start,
+ .next = task_seq_next,
+ .stop = task_seq_stop,
+ .show = task_seq_show,
+};
+
+struct bpf_iter_seq_task_file_info {
+ /* The first field must be struct bpf_iter_seq_task_common.
+ * this is assumed by {init, fini}_seq_pidns() callback functions.
+ */
+ struct bpf_iter_seq_task_common common;
+ struct task_struct *task;
+ struct files_struct *files;
+ u32 tid;
+ u32 fd;
+};
+
+static struct file *
+task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info,
+ struct task_struct **task, struct files_struct **fstruct)
+{
+ struct pid_namespace *ns = info->common.ns;
+ u32 curr_tid = info->tid, max_fds;
+ struct files_struct *curr_files;
+ struct task_struct *curr_task;
+ int curr_fd = info->fd;
+
+ /* If this function returns a non-NULL file object,
+ * it held a reference to the task/files_struct/file.
+ * Otherwise, it does not hold any reference.
+ */
+again:
+ if (*task) {
+ curr_task = *task;
+ curr_files = *fstruct;
+ curr_fd = info->fd;
+ } else {
+ curr_task = task_seq_get_next(ns, &curr_tid);
+ if (!curr_task)
+ return NULL;
+
+ curr_files = get_files_struct(curr_task);
+ if (!curr_files) {
+ put_task_struct(curr_task);
+ curr_tid = ++(info->tid);
+ info->fd = 0;
+ goto again;
+ }
+
+ /* set *fstruct, *task and info->tid */
+ *fstruct = curr_files;
+ *task = curr_task;
+ if (curr_tid == info->tid) {
+ curr_fd = info->fd;
+ } else {
+ info->tid = curr_tid;
+ curr_fd = 0;
+ }
+ }
+
+ rcu_read_lock();
+ max_fds = files_fdtable(curr_files)->max_fds;
+ for (; curr_fd < max_fds; curr_fd++) {
+ struct file *f;
+
+ f = fcheck_files(curr_files, curr_fd);
+ if (!f)
+ continue;
+
+ /* set info->fd */
+ info->fd = curr_fd;
+ get_file(f);
+ rcu_read_unlock();
+ return f;
+ }
+
+ /* the current task is done, go to the next task */
+ rcu_read_unlock();
+ put_files_struct(curr_files);
+ put_task_struct(curr_task);
+ *task = NULL;
+ *fstruct = NULL;
+ info->fd = 0;
+ curr_tid = ++(info->tid);
+ goto again;
+}
+
+static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct bpf_iter_seq_task_file_info *info = seq->private;
+ struct files_struct *files = NULL;
+ struct task_struct *task = NULL;
+ struct file *file;
+
+ file = task_file_seq_get_next(info, &task, &files);
+ if (!file) {
+ info->files = NULL;
+ info->task = NULL;
+ return NULL;
+ }
+
+ ++*pos;
+ info->task = task;
+ info->files = files;
+
+ return file;
+}
+
+static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct bpf_iter_seq_task_file_info *info = seq->private;
+ struct files_struct *files = info->files;
+ struct task_struct *task = info->task;
+ struct file *file;
+
+ ++*pos;
+ ++info->fd;
+ fput((struct file *)v);
+ file = task_file_seq_get_next(info, &task, &files);
+ if (!file) {
+ info->files = NULL;
+ info->task = NULL;
+ return NULL;
+ }
+
+ info->task = task;
+ info->files = files;
+
+ return file;
+}
+
+struct bpf_iter__task_file {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct task_struct *, task);
+ u32 fd __aligned(8);
+ __bpf_md_ptr(struct file *, file);
+};
+
+DEFINE_BPF_ITER_FUNC(task_file, struct bpf_iter_meta *meta,
+ struct task_struct *task, u32 fd,
+ struct file *file)
+
+static int __task_file_seq_show(struct seq_file *seq, struct file *file,
+ bool in_stop)
+{
+ struct bpf_iter_seq_task_file_info *info = seq->private;
+ struct bpf_iter__task_file ctx;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, in_stop);
+ if (!prog)
+ return 0;
+
+ ctx.meta = &meta;
+ ctx.task = info->task;
+ ctx.fd = info->fd;
+ ctx.file = file;
+ return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int task_file_seq_show(struct seq_file *seq, void *v)
+{
+ return __task_file_seq_show(seq, v, false);
+}
+
+static void task_file_seq_stop(struct seq_file *seq, void *v)
+{
+ struct bpf_iter_seq_task_file_info *info = seq->private;
+
+ if (!v) {
+ (void)__task_file_seq_show(seq, v, true);
+ } else {
+ fput((struct file *)v);
+ put_files_struct(info->files);
+ put_task_struct(info->task);
+ info->files = NULL;
+ info->task = NULL;
+ }
+}
+
+static int init_seq_pidns(void *priv_data)
+{
+ struct bpf_iter_seq_task_common *common = priv_data;
+
+ common->ns = get_pid_ns(task_active_pid_ns(current));
+ return 0;
+}
+
+static void fini_seq_pidns(void *priv_data)
+{
+ struct bpf_iter_seq_task_common *common = priv_data;
+
+ put_pid_ns(common->ns);
+}
+
+static const struct seq_operations task_file_seq_ops = {
+ .start = task_file_seq_start,
+ .next = task_file_seq_next,
+ .stop = task_file_seq_stop,
+ .show = task_file_seq_show,
+};
+
+static const struct bpf_iter_reg task_reg_info = {
+ .target = "task",
+ .seq_ops = &task_seq_ops,
+ .init_seq_private = init_seq_pidns,
+ .fini_seq_private = fini_seq_pidns,
+ .seq_priv_size = sizeof(struct bpf_iter_seq_task_info),
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__task, task),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+};
+
+static const struct bpf_iter_reg task_file_reg_info = {
+ .target = "task_file",
+ .seq_ops = &task_file_seq_ops,
+ .init_seq_private = init_seq_pidns,
+ .fini_seq_private = fini_seq_pidns,
+ .seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info),
+ .ctx_arg_info_size = 2,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__task_file, task),
+ PTR_TO_BTF_ID_OR_NULL },
+ { offsetof(struct bpf_iter__task_file, file),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+};
+
+static int __init task_iter_init(void)
+{
+ int ret;
+
+ ret = bpf_iter_reg_target(&task_reg_info);
+ if (ret)
+ return ret;
+
+ return bpf_iter_reg_target(&task_file_reg_info);
+}
+late_initcall(task_iter_init);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index efe14cf24bc6..34cde841ab68 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -28,9 +28,11 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
[_id] = & _name ## _verifier_ops,
#define BPF_MAP_TYPE(_id, _ops)
+#define BPF_LINK_TYPE(_id, _name)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
+#undef BPF_LINK_TYPE
};
/* bpf_check() is a static code analyzer that walks eBPF program
@@ -168,6 +170,8 @@ struct bpf_verifier_stack_elem {
int insn_idx;
int prev_insn_idx;
struct bpf_verifier_stack_elem *next;
+ /* length of verifier log at the time this state was pushed on stack */
+ u32 log_pos;
};
#define BPF_COMPLEXITY_LIMIT_JMP_SEQ 8192
@@ -229,6 +233,7 @@ struct bpf_call_arg_meta {
bool pkt_access;
int regno;
int access_size;
+ int mem_size;
u64 msize_max_value;
int ref_obj_id;
int func_id;
@@ -283,6 +288,18 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
log->ubuf = NULL;
}
+static void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos)
+{
+ char zero = 0;
+
+ if (!bpf_verifier_log_needed(log))
+ return;
+
+ log->len_used = new_pos;
+ if (put_user(zero, log->ubuf + new_pos))
+ log->ubuf = NULL;
+}
+
/* log_level controls verbosity level of eBPF verifier.
* bpf_verifier_log_write() is used to dump the verification trace to the log,
* so the user can figure out what's wrong with the program
@@ -377,12 +394,23 @@ static bool type_is_sk_pointer(enum bpf_reg_type type)
type == PTR_TO_XDP_SOCK;
}
+static bool reg_type_not_null(enum bpf_reg_type type)
+{
+ return type == PTR_TO_SOCKET ||
+ type == PTR_TO_TCP_SOCK ||
+ type == PTR_TO_MAP_VALUE ||
+ type == PTR_TO_SOCK_COMMON ||
+ type == PTR_TO_BTF_ID;
+}
+
static bool reg_type_may_be_null(enum bpf_reg_type type)
{
return type == PTR_TO_MAP_VALUE_OR_NULL ||
type == PTR_TO_SOCKET_OR_NULL ||
type == PTR_TO_SOCK_COMMON_OR_NULL ||
- type == PTR_TO_TCP_SOCK_OR_NULL;
+ type == PTR_TO_TCP_SOCK_OR_NULL ||
+ type == PTR_TO_BTF_ID_OR_NULL ||
+ type == PTR_TO_MEM_OR_NULL;
}
static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
@@ -396,7 +424,9 @@ static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type)
return type == PTR_TO_SOCKET ||
type == PTR_TO_SOCKET_OR_NULL ||
type == PTR_TO_TCP_SOCK ||
- type == PTR_TO_TCP_SOCK_OR_NULL;
+ type == PTR_TO_TCP_SOCK_OR_NULL ||
+ type == PTR_TO_MEM ||
+ type == PTR_TO_MEM_OR_NULL;
}
static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
@@ -410,14 +440,37 @@ static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
*/
static bool is_release_function(enum bpf_func_id func_id)
{
- return func_id == BPF_FUNC_sk_release;
+ return func_id == BPF_FUNC_sk_release ||
+ func_id == BPF_FUNC_ringbuf_submit ||
+ func_id == BPF_FUNC_ringbuf_discard;
}
-static bool is_acquire_function(enum bpf_func_id func_id)
+static bool may_be_acquire_function(enum bpf_func_id func_id)
{
return func_id == BPF_FUNC_sk_lookup_tcp ||
func_id == BPF_FUNC_sk_lookup_udp ||
- func_id == BPF_FUNC_skc_lookup_tcp;
+ func_id == BPF_FUNC_skc_lookup_tcp ||
+ func_id == BPF_FUNC_map_lookup_elem ||
+ func_id == BPF_FUNC_ringbuf_reserve;
+}
+
+static bool is_acquire_function(enum bpf_func_id func_id,
+ const struct bpf_map *map)
+{
+ enum bpf_map_type map_type = map ? map->map_type : BPF_MAP_TYPE_UNSPEC;
+
+ if (func_id == BPF_FUNC_sk_lookup_tcp ||
+ func_id == BPF_FUNC_sk_lookup_udp ||
+ func_id == BPF_FUNC_skc_lookup_tcp ||
+ func_id == BPF_FUNC_ringbuf_reserve)
+ return true;
+
+ if (func_id == BPF_FUNC_map_lookup_elem &&
+ (map_type == BPF_MAP_TYPE_SOCKMAP ||
+ map_type == BPF_MAP_TYPE_SOCKHASH))
+ return true;
+
+ return false;
}
static bool is_ptr_cast_function(enum bpf_func_id func_id)
@@ -448,6 +501,9 @@ static const char * const reg_type_str[] = {
[PTR_TO_TP_BUFFER] = "tp_buffer",
[PTR_TO_XDP_SOCK] = "xdp_sock",
[PTR_TO_BTF_ID] = "ptr_",
+ [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_",
+ [PTR_TO_MEM] = "mem",
+ [PTR_TO_MEM_OR_NULL] = "mem_or_null",
};
static char slot_type_char[] = {
@@ -508,7 +564,7 @@ static void print_verifier_state(struct bpf_verifier_env *env,
/* reg->off should be 0 for SCALAR_VALUE */
verbose(env, "%lld", reg->var_off.value + reg->off);
} else {
- if (t == PTR_TO_BTF_ID)
+ if (t == PTR_TO_BTF_ID || t == PTR_TO_BTF_ID_OR_NULL)
verbose(env, "%s", kernel_type_name(reg->btf_id));
verbose(env, "(id=%d", reg->id);
if (reg_type_may_be_refcounted_or_null(t))
@@ -846,7 +902,7 @@ static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifi
}
static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
- int *insn_idx)
+ int *insn_idx, bool pop_log)
{
struct bpf_verifier_state *cur = env->cur_state;
struct bpf_verifier_stack_elem *elem, *head = env->head;
@@ -860,6 +916,8 @@ static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
if (err)
return err;
}
+ if (pop_log)
+ bpf_vlog_reset(&env->log, head->log_pos);
if (insn_idx)
*insn_idx = head->insn_idx;
if (prev_insn_idx)
@@ -887,6 +945,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
elem->insn_idx = insn_idx;
elem->prev_insn_idx = prev_insn_idx;
elem->next = env->head;
+ elem->log_pos = env->log.len_used;
env->head = elem;
env->stack_size++;
err = copy_verifier_state(&elem->st, cur);
@@ -915,7 +974,7 @@ err:
free_verifier_state(env->cur_state, true);
env->cur_state = NULL;
/* pop all elements and return */
- while (!pop_stack(env, NULL, NULL));
+ while (!pop_stack(env, NULL, NULL, false));
return NULL;
}
@@ -1255,7 +1314,7 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env,
reg->type = SCALAR_VALUE;
reg->var_off = tnum_unknown;
reg->frameno = 0;
- reg->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks;
+ reg->precise = env->subprog_cnt > 1 || !env->bpf_capable;
__mark_reg_unbounded(reg);
}
@@ -1387,8 +1446,9 @@ static int check_subprogs(struct bpf_verifier_env *env)
continue;
if (insn[i].src_reg != BPF_PSEUDO_CALL)
continue;
- if (!env->allow_ptr_leaks) {
- verbose(env, "function calls to other bpf functions are allowed for root only\n");
+ if (!env->bpf_capable) {
+ verbose(env,
+ "function calls to other bpf functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n");
return -EPERM;
}
ret = add_subprog(env, i + insn[i].imm + 1);
@@ -1922,8 +1982,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
bool new_marks = false;
int i, err;
- if (!env->allow_ptr_leaks)
- /* backtracking is root only for now */
+ if (!env->bpf_capable)
return 0;
func = st->frame[st->curframe];
@@ -2101,6 +2160,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case PTR_TO_TCP_SOCK_OR_NULL:
case PTR_TO_XDP_SOCK:
case PTR_TO_BTF_ID:
+ case PTR_TO_BTF_ID_OR_NULL:
return true;
default:
return false;
@@ -2170,7 +2230,7 @@ static int check_stack_write(struct bpf_verifier_env *env,
reg = &cur->regs[value_regno];
if (reg && size == BPF_REG_SIZE && register_is_const(reg) &&
- !register_is_null(reg) && env->allow_ptr_leaks) {
+ !register_is_null(reg) && env->bpf_capable) {
if (dst_reg != BPF_REG_FP) {
/* The backtracking logic can only recognize explicit
* stack slot address like [fp - 8]. Other spill of
@@ -2196,7 +2256,7 @@ static int check_stack_write(struct bpf_verifier_env *env,
return -EINVAL;
}
- if (!env->allow_ptr_leaks) {
+ if (!env->bypass_spec_v4) {
bool sanitize = false;
if (state->stack[spi].slot_type[0] == STACK_SPILL &&
@@ -2418,32 +2478,49 @@ static int check_map_access_type(struct bpf_verifier_env *env, u32 regno,
return 0;
}
-/* check read/write into map element returned by bpf_map_lookup_elem() */
-static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
- int size, bool zero_size_allowed)
+/* check read/write into memory region (e.g., map value, ringbuf sample, etc) */
+static int __check_mem_access(struct bpf_verifier_env *env, int regno,
+ int off, int size, u32 mem_size,
+ bool zero_size_allowed)
{
- struct bpf_reg_state *regs = cur_regs(env);
- struct bpf_map *map = regs[regno].map_ptr;
+ bool size_ok = size > 0 || (size == 0 && zero_size_allowed);
+ struct bpf_reg_state *reg;
+
+ if (off >= 0 && size_ok && (u64)off + size <= mem_size)
+ return 0;
- if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) ||
- off + size > map->value_size) {
+ reg = &cur_regs(env)[regno];
+ switch (reg->type) {
+ case PTR_TO_MAP_VALUE:
verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n",
- map->value_size, off, size);
- return -EACCES;
+ mem_size, off, size);
+ break;
+ case PTR_TO_PACKET:
+ case PTR_TO_PACKET_META:
+ case PTR_TO_PACKET_END:
+ verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
+ off, size, regno, reg->id, off, mem_size);
+ break;
+ case PTR_TO_MEM:
+ default:
+ verbose(env, "invalid access to memory, mem_size=%u off=%d size=%d\n",
+ mem_size, off, size);
}
- return 0;
+
+ return -EACCES;
}
-/* check read/write into a map element with possible variable offset */
-static int check_map_access(struct bpf_verifier_env *env, u32 regno,
- int off, int size, bool zero_size_allowed)
+/* check read/write into a memory region with possible variable offset */
+static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno,
+ int off, int size, u32 mem_size,
+ bool zero_size_allowed)
{
struct bpf_verifier_state *vstate = env->cur_state;
struct bpf_func_state *state = vstate->frame[vstate->curframe];
struct bpf_reg_state *reg = &state->regs[regno];
int err;
- /* We may have adjusted the register to this map value, so we
+ /* We may have adjusted the register pointing to memory region, so we
* need to try adding each of min_value and max_value to off
* to make sure our theoretical access will be safe.
*/
@@ -2464,10 +2541,10 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
regno);
return -EACCES;
}
- err = __check_map_access(env, regno, reg->smin_value + off, size,
- zero_size_allowed);
+ err = __check_mem_access(env, regno, reg->smin_value + off, size,
+ mem_size, zero_size_allowed);
if (err) {
- verbose(env, "R%d min value is outside of the array range\n",
+ verbose(env, "R%d min value is outside of the allowed memory range\n",
regno);
return err;
}
@@ -2477,18 +2554,38 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
* If reg->umax_value + off could overflow, treat that as unbounded too.
*/
if (reg->umax_value >= BPF_MAX_VAR_OFF) {
- verbose(env, "R%d unbounded memory access, make sure to bounds check any array access into a map\n",
+ verbose(env, "R%d unbounded memory access, make sure to bounds check any such access\n",
regno);
return -EACCES;
}
- err = __check_map_access(env, regno, reg->umax_value + off, size,
- zero_size_allowed);
- if (err)
- verbose(env, "R%d max value is outside of the array range\n",
+ err = __check_mem_access(env, regno, reg->umax_value + off, size,
+ mem_size, zero_size_allowed);
+ if (err) {
+ verbose(env, "R%d max value is outside of the allowed memory range\n",
regno);
+ return err;
+ }
+
+ return 0;
+}
+
+/* check read/write into a map element with possible variable offset */
+static int check_map_access(struct bpf_verifier_env *env, u32 regno,
+ int off, int size, bool zero_size_allowed)
+{
+ struct bpf_verifier_state *vstate = env->cur_state;
+ struct bpf_func_state *state = vstate->frame[vstate->curframe];
+ struct bpf_reg_state *reg = &state->regs[regno];
+ struct bpf_map *map = reg->map_ptr;
+ int err;
+
+ err = check_mem_region_access(env, regno, off, size, map->value_size,
+ zero_size_allowed);
+ if (err)
+ return err;
- if (map_value_has_spin_lock(reg->map_ptr)) {
- u32 lock = reg->map_ptr->spin_lock_off;
+ if (map_value_has_spin_lock(map)) {
+ u32 lock = map->spin_lock_off;
/* if any part of struct bpf_spin_lock can be touched by
* load/store reject this program.
@@ -2546,21 +2643,6 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
}
}
-static int __check_packet_access(struct bpf_verifier_env *env, u32 regno,
- int off, int size, bool zero_size_allowed)
-{
- struct bpf_reg_state *regs = cur_regs(env);
- struct bpf_reg_state *reg = &regs[regno];
-
- if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) ||
- (u64)off + size > reg->range) {
- verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
- off, size, regno, reg->id, reg->off, reg->range);
- return -EACCES;
- }
- return 0;
-}
-
static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
int size, bool zero_size_allowed)
{
@@ -2581,16 +2663,17 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
regno);
return -EACCES;
}
- err = __check_packet_access(env, regno, off, size, zero_size_allowed);
+ err = __check_mem_access(env, regno, off, size, reg->range,
+ zero_size_allowed);
if (err) {
verbose(env, "R%d offset is outside of the packet\n", regno);
return err;
}
- /* __check_packet_access has made sure "off + size - 1" is within u16.
+ /* __check_mem_access has made sure "off + size - 1" is within u16.
* reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff,
* otherwise find_good_pkt_pointers would have refused to set range info
- * that __check_packet_access would have rejected this pkt access.
+ * that __check_mem_access would have rejected this pkt access.
* Therefore, "off + reg->umax_value + size - 1" won't overflow u32.
*/
env->prog->aux->max_pkt_offset =
@@ -2621,7 +2704,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
*/
*reg_type = info.reg_type;
- if (*reg_type == PTR_TO_BTF_ID)
+ if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL)
*btf_id = info.btf_id;
else
env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
@@ -3170,6 +3253,16 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
mark_reg_unknown(env, regs, value_regno);
}
}
+ } else if (reg->type == PTR_TO_MEM) {
+ if (t == BPF_WRITE && value_regno >= 0 &&
+ is_pointer_value(env, value_regno)) {
+ verbose(env, "R%d leaks addr into mem\n", value_regno);
+ return -EACCES;
+ }
+ err = check_mem_region_access(env, regno, off, size,
+ reg->mem_size, false);
+ if (!err && t == BPF_READ && value_regno >= 0)
+ mark_reg_unknown(env, regs, value_regno);
} else if (reg->type == PTR_TO_CTX) {
enum bpf_reg_type reg_type = SCALAR_VALUE;
u32 btf_id = 0;
@@ -3205,7 +3298,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
* a sub-register.
*/
regs[value_regno].subreg_def = DEF_NOT_SUBREG;
- if (reg_type == PTR_TO_BTF_ID)
+ if (reg_type == PTR_TO_BTF_ID ||
+ reg_type == PTR_TO_BTF_ID_OR_NULL)
regs[value_regno].btf_id = btf_id;
}
regs[value_regno].type = reg_type;
@@ -3390,7 +3484,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
* Spectre masking for stack ALU.
* See also retrieve_ptr_limit().
*/
- if (!env->allow_ptr_leaks) {
+ if (!env->bypass_spec_v1) {
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
@@ -3452,6 +3546,11 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
*stype = STACK_MISC;
goto mark;
}
+
+ if (state->stack[spi].slot_type[0] == STACK_SPILL &&
+ state->stack[spi].spilled_ptr.type == PTR_TO_BTF_ID)
+ goto mark;
+
if (state->stack[spi].slot_type[0] == STACK_SPILL &&
state->stack[spi].spilled_ptr.type == SCALAR_VALUE) {
__mark_reg_unknown(env, &state->stack[spi].spilled_ptr);
@@ -3501,6 +3600,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
return -EACCES;
return check_map_access(env, regno, reg->off, access_size,
zero_size_allowed);
+ case PTR_TO_MEM:
+ return check_mem_region_access(env, regno, reg->off,
+ access_size, reg->mem_size,
+ zero_size_allowed);
default: /* scalar_value|ptr_to_stack or invalid ptr */
return check_stack_boundary(env, regno, access_size,
zero_size_allowed, meta);
@@ -3605,6 +3708,17 @@ static bool arg_type_is_mem_size(enum bpf_arg_type type)
type == ARG_CONST_SIZE_OR_ZERO;
}
+static bool arg_type_is_alloc_mem_ptr(enum bpf_arg_type type)
+{
+ return type == ARG_PTR_TO_ALLOC_MEM ||
+ type == ARG_PTR_TO_ALLOC_MEM_OR_NULL;
+}
+
+static bool arg_type_is_alloc_size(enum bpf_arg_type type)
+{
+ return type == ARG_CONST_ALLOC_SIZE_OR_ZERO;
+}
+
static bool arg_type_is_int_ptr(enum bpf_arg_type type)
{
return type == ARG_PTR_TO_INT ||
@@ -3664,7 +3778,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
type != expected_type)
goto err_type;
} else if (arg_type == ARG_CONST_SIZE ||
- arg_type == ARG_CONST_SIZE_OR_ZERO) {
+ arg_type == ARG_CONST_SIZE_OR_ZERO ||
+ arg_type == ARG_CONST_ALLOC_SIZE_OR_ZERO) {
expected_type = SCALAR_VALUE;
if (type != expected_type)
goto err_type;
@@ -3735,13 +3850,29 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
* happens during stack boundary checking.
*/
if (register_is_null(reg) &&
- arg_type == ARG_PTR_TO_MEM_OR_NULL)
+ (arg_type == ARG_PTR_TO_MEM_OR_NULL ||
+ arg_type == ARG_PTR_TO_ALLOC_MEM_OR_NULL))
/* final test in check_stack_boundary() */;
else if (!type_is_pkt_pointer(type) &&
type != PTR_TO_MAP_VALUE &&
+ type != PTR_TO_MEM &&
type != expected_type)
goto err_type;
meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
+ } else if (arg_type_is_alloc_mem_ptr(arg_type)) {
+ expected_type = PTR_TO_MEM;
+ if (register_is_null(reg) &&
+ arg_type == ARG_PTR_TO_ALLOC_MEM_OR_NULL)
+ /* final test in check_stack_boundary() */;
+ else if (type != expected_type)
+ goto err_type;
+ if (meta->ref_obj_id) {
+ verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
+ regno, reg->ref_obj_id,
+ meta->ref_obj_id);
+ return -EFAULT;
+ }
+ meta->ref_obj_id = reg->ref_obj_id;
} else if (arg_type_is_int_ptr(arg_type)) {
expected_type = PTR_TO_STACK;
if (!type_is_pkt_pointer(type) &&
@@ -3837,6 +3968,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
zero_size_allowed, meta);
if (!err)
err = mark_chain_precision(env, regno);
+ } else if (arg_type_is_alloc_size(arg_type)) {
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "R%d unbounded size, use 'var &= const' or 'if (var < const)'\n",
+ regno);
+ return -EACCES;
+ }
+ meta->mem_size = reg->var_off.value;
} else if (arg_type_is_int_ptr(arg_type)) {
int size = int_ptr_type_to_size(arg_type);
@@ -3873,6 +4011,14 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
func_id != BPF_FUNC_xdp_output)
goto error;
break;
+ case BPF_MAP_TYPE_RINGBUF:
+ if (func_id != BPF_FUNC_ringbuf_output &&
+ func_id != BPF_FUNC_ringbuf_reserve &&
+ func_id != BPF_FUNC_ringbuf_submit &&
+ func_id != BPF_FUNC_ringbuf_discard &&
+ func_id != BPF_FUNC_ringbuf_query)
+ goto error;
+ break;
case BPF_MAP_TYPE_STACK_TRACE:
if (func_id != BPF_FUNC_get_stackid)
goto error;
@@ -3915,7 +4061,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
func_id != BPF_FUNC_sock_map_update &&
func_id != BPF_FUNC_map_delete_elem &&
func_id != BPF_FUNC_msg_redirect_map &&
- func_id != BPF_FUNC_sk_select_reuseport)
+ func_id != BPF_FUNC_sk_select_reuseport &&
+ func_id != BPF_FUNC_map_lookup_elem)
goto error;
break;
case BPF_MAP_TYPE_SOCKHASH:
@@ -3923,7 +4070,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
func_id != BPF_FUNC_sock_hash_update &&
func_id != BPF_FUNC_map_delete_elem &&
func_id != BPF_FUNC_msg_redirect_hash &&
- func_id != BPF_FUNC_sk_select_reuseport)
+ func_id != BPF_FUNC_sk_select_reuseport &&
+ func_id != BPF_FUNC_map_lookup_elem)
goto error;
break;
case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
@@ -4093,7 +4241,7 @@ static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id)
/* A reference acquiring function cannot acquire
* another refcounted ptr.
*/
- if (is_acquire_function(func_id) && count)
+ if (may_be_acquire_function(func_id) && count)
return false;
/* We only support one arg being unreferenced at the moment,
@@ -4388,10 +4536,10 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
if (!BPF_MAP_PTR(aux->map_ptr_state))
bpf_map_ptr_store(aux, meta->map_ptr,
- meta->map_ptr->unpriv_array);
+ !meta->map_ptr->bypass_spec_v1);
else if (BPF_MAP_PTR(aux->map_ptr_state) != meta->map_ptr)
bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON,
- meta->map_ptr->unpriv_array);
+ !meta->map_ptr->bypass_spec_v1);
return 0;
}
@@ -4597,6 +4745,11 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL;
regs[BPF_REG_0].id = ++env->id_gen;
+ } else if (fn->ret_type == RET_PTR_TO_ALLOC_MEM_OR_NULL) {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL;
+ regs[BPF_REG_0].id = ++env->id_gen;
+ regs[BPF_REG_0].mem_size = meta.mem_size;
} else {
verbose(env, "unknown return type %d of func %s#%d\n",
fn->ret_type, func_id_name(func_id), func_id);
@@ -4606,7 +4759,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
if (is_ptr_cast_function(func_id)) {
/* For release_reference() */
regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
- } else if (is_acquire_function(func_id)) {
+ } else if (is_acquire_function(func_id, meta.map_ptr)) {
int id = acquire_reference_state(env, insn_idx);
if (id < 0)
@@ -4760,7 +4913,7 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env,
const struct bpf_insn *insn)
{
- return env->allow_ptr_leaks || BPF_SRC(insn->code) == BPF_K;
+ return env->bypass_spec_v1 || BPF_SRC(insn->code) == BPF_K;
}
static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux,
@@ -5070,7 +5223,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
/* For unprivileged we require that resulting offset must be in bounds
* in order to be able to sanitize access later on.
*/
- if (!env->allow_ptr_leaks) {
+ if (!env->bypass_spec_v1) {
if (dst_reg->type == PTR_TO_MAP_VALUE &&
check_map_access(env, dst, dst_reg->off, 1, false)) {
verbose(env, "R%d pointer arithmetic of map value goes out of range, "
@@ -5611,7 +5764,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
{
struct bpf_reg_state *regs = cur_regs(env);
u8 opcode = BPF_OP(insn->code);
- bool src_known, dst_known;
+ bool src_known;
s64 smin_val, smax_val;
u64 umin_val, umax_val;
s32 s32_min_val, s32_max_val;
@@ -5633,7 +5786,6 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
if (alu32) {
src_known = tnum_subreg_is_const(src_reg.var_off);
- dst_known = tnum_subreg_is_const(dst_reg->var_off);
if ((src_known &&
(s32_min_val != s32_max_val || u32_min_val != u32_max_val)) ||
s32_min_val > s32_max_val || u32_min_val > u32_max_val) {
@@ -5645,7 +5797,6 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
}
} else {
src_known = tnum_is_const(src_reg.var_off);
- dst_known = tnum_is_const(dst_reg->var_off);
if ((src_known &&
(smin_val != smax_val || umin_val != umax_val)) ||
smin_val > smax_val || umin_val > umax_val) {
@@ -6261,8 +6412,25 @@ static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode,
bool is_jmp32)
{
- if (__is_pointer_value(false, reg))
- return -1;
+ if (__is_pointer_value(false, reg)) {
+ if (!reg_type_not_null(reg->type))
+ return -1;
+
+ /* If pointer is valid tests against zero will fail so we can
+ * use this to direct branch taken.
+ */
+ if (val != 0)
+ return -1;
+
+ switch (opcode) {
+ case BPF_JEQ:
+ return 0;
+ case BPF_JNE:
+ return 1;
+ default:
+ return -1;
+ }
+ }
if (is_jmp32)
return is_branch32_taken(reg, val, opcode);
@@ -6517,12 +6685,16 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
if (is_null) {
reg->type = SCALAR_VALUE;
} else if (reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
- if (reg->map_ptr->inner_map_meta) {
+ const struct bpf_map *map = reg->map_ptr;
+
+ if (map->inner_map_meta) {
reg->type = CONST_PTR_TO_MAP;
- reg->map_ptr = reg->map_ptr->inner_map_meta;
- } else if (reg->map_ptr->map_type ==
- BPF_MAP_TYPE_XSKMAP) {
+ reg->map_ptr = map->inner_map_meta;
+ } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
reg->type = PTR_TO_XDP_SOCK;
+ } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP ||
+ map->map_type == BPF_MAP_TYPE_SOCKHASH) {
+ reg->type = PTR_TO_SOCKET;
} else {
reg->type = PTR_TO_MAP_VALUE;
}
@@ -6532,6 +6704,10 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
reg->type = PTR_TO_SOCK_COMMON;
} else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) {
reg->type = PTR_TO_TCP_SOCK;
+ } else if (reg->type == PTR_TO_BTF_ID_OR_NULL) {
+ reg->type = PTR_TO_BTF_ID;
+ } else if (reg->type == PTR_TO_MEM_OR_NULL) {
+ reg->type = PTR_TO_MEM;
}
if (is_null) {
/* We don't need id and ref_obj_id from this point
@@ -6755,7 +6931,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
}
if (pred >= 0) {
- err = mark_chain_precision(env, insn->dst_reg);
+ /* If we get here with a dst_reg pointer type it is because
+ * above is_branch_taken() special cased the 0 comparison.
+ */
+ if (!__is_pointer_value(false, dst_reg))
+ err = mark_chain_precision(env, insn->dst_reg);
if (BPF_SRC(insn->code) == BPF_X && !err)
err = mark_chain_precision(env, insn->src_reg);
if (err)
@@ -7041,7 +7221,11 @@ static int check_return_code(struct bpf_verifier_env *env)
switch (env->prog->type) {
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG ||
- env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG)
+ env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG ||
+ env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME ||
+ env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME ||
+ env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME ||
+ env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME)
range = tnum_range(1, 1);
break;
case BPF_PROG_TYPE_CGROUP_SKB:
@@ -7070,6 +7254,8 @@ static int check_return_code(struct bpf_verifier_env *env)
case BPF_TRACE_RAW_TP:
case BPF_MODIFY_RETURN:
return 0;
+ case BPF_TRACE_ITER:
+ break;
default:
return -ENOTSUPP;
}
@@ -7206,7 +7392,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
insn_stack[env->cfg.cur_stack++] = w;
return 1;
} else if ((insn_state[w] & 0xF0) == DISCOVERED) {
- if (loop_ok && env->allow_ptr_leaks)
+ if (loop_ok && env->bpf_capable)
return 0;
verbose_linfo(env, t, "%d: ", t);
verbose_linfo(env, w, "%d: ", w);
@@ -7366,7 +7552,7 @@ static int check_btf_func(struct bpf_verifier_env *env,
const struct btf *btf;
void __user *urecord;
u32 prev_offset = 0;
- int ret = 0;
+ int ret = -ENOMEM;
nfuncs = attr->func_info_cnt;
if (!nfuncs)
@@ -8315,7 +8501,7 @@ next:
if (env->max_states_per_insn < states_cnt)
env->max_states_per_insn = states_cnt;
- if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES)
+ if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES)
return push_jmp_history(env, cur);
if (!add_new_state)
@@ -8402,6 +8588,7 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type)
case PTR_TO_TCP_SOCK_OR_NULL:
case PTR_TO_XDP_SOCK:
case PTR_TO_BTF_ID:
+ case PTR_TO_BTF_ID_OR_NULL:
return false;
default:
return true;
@@ -8428,6 +8615,7 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev)
static int do_check(struct bpf_verifier_env *env)
{
+ bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
struct bpf_verifier_state *state = env->cur_state;
struct bpf_insn *insns = env->prog->insnsi;
struct bpf_reg_state *regs;
@@ -8704,7 +8892,7 @@ static int do_check(struct bpf_verifier_env *env)
process_bpf_exit:
update_branch_counts(env, env->cur_state);
err = pop_stack(env, &prev_insn_idx,
- &env->insn_idx);
+ &env->insn_idx, pop_log);
if (err < 0) {
if (err != -ENOENT)
return err;
@@ -9974,7 +10162,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
insn->code = BPF_JMP | BPF_TAIL_CALL;
aux = &env->insn_aux_data[i + delta];
- if (env->allow_ptr_leaks && !expect_blinding &&
+ if (env->bpf_capable && !expect_blinding &&
prog->jit_requested &&
!bpf_map_key_poisoned(aux) &&
!bpf_map_ptr_poisoned(aux) &&
@@ -10227,6 +10415,7 @@ static void sanitize_insn_aux_data(struct bpf_verifier_env *env)
static int do_check_common(struct bpf_verifier_env *env, int subprog)
{
+ bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
struct bpf_verifier_state *state;
struct bpf_reg_state *regs;
int ret, i;
@@ -10289,7 +10478,9 @@ out:
free_verifier_state(env->cur_state, true);
env->cur_state = NULL;
}
- while (!pop_stack(env, NULL, NULL));
+ while (!pop_stack(env, NULL, NULL, false));
+ if (!ret && pop_log)
+ bpf_vlog_reset(&env->log, 0);
free_states(env);
if (ret)
/* clean aux data in case subprog was rejected */
@@ -10445,6 +10636,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
struct bpf_prog *tgt_prog = prog->aux->linked_prog;
u32 btf_id = prog->aux->attach_btf_id;
const char prefix[] = "btf_trace_";
+ struct btf_func_model fmodel;
int ret = 0, subprog = -1, i;
struct bpf_trampoline *tr;
const struct btf_type *t;
@@ -10586,6 +10778,22 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
prog->aux->attach_func_proto = t;
prog->aux->attach_btf_trace = true;
return 0;
+ case BPF_TRACE_ITER:
+ if (!btf_type_is_func(t)) {
+ verbose(env, "attach_btf_id %u is not a function\n",
+ btf_id);
+ return -EINVAL;
+ }
+ t = btf_type_by_id(btf, t->type);
+ if (!btf_type_is_func_proto(t))
+ return -EINVAL;
+ prog->aux->attach_func_name = tname;
+ prog->aux->attach_func_proto = t;
+ if (!bpf_iter_prog_supported(prog))
+ return -EINVAL;
+ ret = btf_distill_func_proto(&env->log, btf, t,
+ tname, &fmodel);
+ return ret;
default:
if (!prog_extension)
return -EINVAL;
@@ -10696,7 +10904,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
env->insn_aux_data[i].orig_idx = i;
env->prog = *prog;
env->ops = bpf_verifier_ops[env->prog->type];
- is_priv = capable(CAP_SYS_ADMIN);
+ is_priv = bpf_capable();
if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {
mutex_lock(&bpf_verifier_lock);
@@ -10737,7 +10945,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
if (attr->prog_flags & BPF_F_ANY_ALIGNMENT)
env->strict_alignment = false;
- env->allow_ptr_leaks = is_priv;
+ env->allow_ptr_leaks = bpf_allow_ptr_leaks();
+ env->bypass_spec_v1 = bpf_bypass_spec_v1();
+ env->bypass_spec_v4 = bpf_bypass_spec_v4();
+ env->bpf_capable = bpf_capable();
if (is_priv)
env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
deleted file mode 100644
index 2cc5c8f4c800..000000000000
--- a/kernel/bpf/xskmap.c
+++ /dev/null
@@ -1,265 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* XSKMAP used for AF_XDP sockets
- * Copyright(c) 2018 Intel Corporation.
- */
-
-#include <linux/bpf.h>
-#include <linux/capability.h>
-#include <net/xdp_sock.h>
-#include <linux/slab.h>
-#include <linux/sched.h>
-
-int xsk_map_inc(struct xsk_map *map)
-{
- bpf_map_inc(&map->map);
- return 0;
-}
-
-void xsk_map_put(struct xsk_map *map)
-{
- bpf_map_put(&map->map);
-}
-
-static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map,
- struct xdp_sock **map_entry)
-{
- struct xsk_map_node *node;
- int err;
-
- node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN);
- if (!node)
- return ERR_PTR(-ENOMEM);
-
- err = xsk_map_inc(map);
- if (err) {
- kfree(node);
- return ERR_PTR(err);
- }
-
- node->map = map;
- node->map_entry = map_entry;
- return node;
-}
-
-static void xsk_map_node_free(struct xsk_map_node *node)
-{
- xsk_map_put(node->map);
- kfree(node);
-}
-
-static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node)
-{
- spin_lock_bh(&xs->map_list_lock);
- list_add_tail(&node->node, &xs->map_list);
- spin_unlock_bh(&xs->map_list_lock);
-}
-
-static void xsk_map_sock_delete(struct xdp_sock *xs,
- struct xdp_sock **map_entry)
-{
- struct xsk_map_node *n, *tmp;
-
- spin_lock_bh(&xs->map_list_lock);
- list_for_each_entry_safe(n, tmp, &xs->map_list, node) {
- if (map_entry == n->map_entry) {
- list_del(&n->node);
- xsk_map_node_free(n);
- }
- }
- spin_unlock_bh(&xs->map_list_lock);
-}
-
-static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
-{
- struct bpf_map_memory mem;
- int err, numa_node;
- struct xsk_map *m;
- u64 size;
-
- if (!capable(CAP_NET_ADMIN))
- return ERR_PTR(-EPERM);
-
- if (attr->max_entries == 0 || attr->key_size != 4 ||
- attr->value_size != 4 ||
- attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY))
- return ERR_PTR(-EINVAL);
-
- numa_node = bpf_map_attr_numa_node(attr);
- size = struct_size(m, xsk_map, attr->max_entries);
-
- err = bpf_map_charge_init(&mem, size);
- if (err < 0)
- return ERR_PTR(err);
-
- m = bpf_map_area_alloc(size, numa_node);
- if (!m) {
- bpf_map_charge_finish(&mem);
- return ERR_PTR(-ENOMEM);
- }
-
- bpf_map_init_from_attr(&m->map, attr);
- bpf_map_charge_move(&m->map.memory, &mem);
- spin_lock_init(&m->lock);
-
- return &m->map;
-}
-
-static void xsk_map_free(struct bpf_map *map)
-{
- struct xsk_map *m = container_of(map, struct xsk_map, map);
-
- bpf_clear_redirect_map(map);
- synchronize_net();
- bpf_map_area_free(m);
-}
-
-static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
-{
- struct xsk_map *m = container_of(map, struct xsk_map, map);
- u32 index = key ? *(u32 *)key : U32_MAX;
- u32 *next = next_key;
-
- if (index >= m->map.max_entries) {
- *next = 0;
- return 0;
- }
-
- if (index == m->map.max_entries - 1)
- return -ENOENT;
- *next = index + 1;
- return 0;
-}
-
-static u32 xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
-{
- const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2;
- struct bpf_insn *insn = insn_buf;
-
- *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
- *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
- *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(sizeof(struct xsk_sock *)));
- *insn++ = BPF_ALU64_IMM(BPF_ADD, mp, offsetof(struct xsk_map, xsk_map));
- *insn++ = BPF_ALU64_REG(BPF_ADD, ret, mp);
- *insn++ = BPF_LDX_MEM(BPF_SIZEOF(struct xsk_sock *), ret, ret, 0);
- *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
- *insn++ = BPF_MOV64_IMM(ret, 0);
- return insn - insn_buf;
-}
-
-static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
-{
- WARN_ON_ONCE(!rcu_read_lock_held());
- return __xsk_map_lookup_elem(map, *(u32 *)key);
-}
-
-static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key)
-{
- return ERR_PTR(-EOPNOTSUPP);
-}
-
-static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
-{
- struct xsk_map *m = container_of(map, struct xsk_map, map);
- struct xdp_sock *xs, *old_xs, **map_entry;
- u32 i = *(u32 *)key, fd = *(u32 *)value;
- struct xsk_map_node *node;
- struct socket *sock;
- int err;
-
- if (unlikely(map_flags > BPF_EXIST))
- return -EINVAL;
- if (unlikely(i >= m->map.max_entries))
- return -E2BIG;
-
- sock = sockfd_lookup(fd, &err);
- if (!sock)
- return err;
-
- if (sock->sk->sk_family != PF_XDP) {
- sockfd_put(sock);
- return -EOPNOTSUPP;
- }
-
- xs = (struct xdp_sock *)sock->sk;
-
- if (!xsk_is_setup_for_bpf_map(xs)) {
- sockfd_put(sock);
- return -EOPNOTSUPP;
- }
-
- map_entry = &m->xsk_map[i];
- node = xsk_map_node_alloc(m, map_entry);
- if (IS_ERR(node)) {
- sockfd_put(sock);
- return PTR_ERR(node);
- }
-
- spin_lock_bh(&m->lock);
- old_xs = READ_ONCE(*map_entry);
- if (old_xs == xs) {
- err = 0;
- goto out;
- } else if (old_xs && map_flags == BPF_NOEXIST) {
- err = -EEXIST;
- goto out;
- } else if (!old_xs && map_flags == BPF_EXIST) {
- err = -ENOENT;
- goto out;
- }
- xsk_map_sock_add(xs, node);
- WRITE_ONCE(*map_entry, xs);
- if (old_xs)
- xsk_map_sock_delete(old_xs, map_entry);
- spin_unlock_bh(&m->lock);
- sockfd_put(sock);
- return 0;
-
-out:
- spin_unlock_bh(&m->lock);
- sockfd_put(sock);
- xsk_map_node_free(node);
- return err;
-}
-
-static int xsk_map_delete_elem(struct bpf_map *map, void *key)
-{
- struct xsk_map *m = container_of(map, struct xsk_map, map);
- struct xdp_sock *old_xs, **map_entry;
- int k = *(u32 *)key;
-
- if (k >= map->max_entries)
- return -EINVAL;
-
- spin_lock_bh(&m->lock);
- map_entry = &m->xsk_map[k];
- old_xs = xchg(map_entry, NULL);
- if (old_xs)
- xsk_map_sock_delete(old_xs, map_entry);
- spin_unlock_bh(&m->lock);
-
- return 0;
-}
-
-void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
- struct xdp_sock **map_entry)
-{
- spin_lock_bh(&map->lock);
- if (READ_ONCE(*map_entry) == xs) {
- WRITE_ONCE(*map_entry, NULL);
- xsk_map_sock_delete(xs, map_entry);
- }
- spin_unlock_bh(&map->lock);
-}
-
-const struct bpf_map_ops xsk_map_ops = {
- .map_alloc = xsk_map_alloc,
- .map_free = xsk_map_free,
- .map_get_next_key = xsk_map_get_next_key,
- .map_lookup_elem = xsk_map_lookup_elem,
- .map_gen_lookup = xsk_map_gen_lookup,
- .map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only,
- .map_update_elem = xsk_map_update_elem,
- .map_delete_elem = xsk_map_delete_elem,
- .map_check_btf = map_check_no_btf,
-};
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 06b5ea9d899d..1ea181a58465 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -153,11 +153,7 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu);
-/*
- * The default hierarchy, reserved for the subsystems that are otherwise
- * unattached - it never has more than a single cgroup, and all tasks are
- * part of that cgroup.
- */
+/* the default hierarchy */
struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu };
EXPORT_SYMBOL_GPL(cgrp_dfl_root);
@@ -251,9 +247,6 @@ bool cgroup_ssid_enabled(int ssid)
* cases where a subsystem should behave differnetly depending on the
* interface version.
*
- * The set of behaviors which change on the default hierarchy are still
- * being determined and the mount option is prefixed with __DEVEL__.
- *
* List of changed behaviors:
*
* - Mount options "noprefix", "xattr", "clone_children", "release_agent"
@@ -4881,7 +4874,6 @@ static struct cftype cgroup_base_files[] = {
},
{
.name = "cpu.stat",
- .flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cpu_stat_show,
},
#ifdef CONFIG_PSI
@@ -6508,33 +6500,6 @@ int cgroup_bpf_attach(struct cgroup *cgrp,
return ret;
}
-int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *old_prog,
- struct bpf_prog *new_prog)
-{
- struct bpf_cgroup_link *cg_link;
- int ret;
-
- if (link->ops != &bpf_cgroup_link_lops)
- return -EINVAL;
-
- cg_link = container_of(link, struct bpf_cgroup_link, link);
-
- mutex_lock(&cgroup_mutex);
- /* link might have been auto-released by dying cgroup, so fail */
- if (!cg_link->cgroup) {
- ret = -EINVAL;
- goto out_unlock;
- }
- if (old_prog && link->prog != old_prog) {
- ret = -EPERM;
- goto out_unlock;
- }
- ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog);
-out_unlock:
- mutex_unlock(&cgroup_mutex);
- return ret;
-}
-
int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
enum bpf_attach_type type)
{
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 729d3a5c772e..642415b8c3c9 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1655,7 +1655,7 @@ static void update_tasks_nodemask(struct cpuset *cs)
guarantee_online_mems(cs, &newmems);
/*
- * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
+ * The mpol_rebind_mm() call takes mmap_lock, which we couldn't
* take while holding tasklist_lock. Forks can happen - the
* mpol_dup() cpuset_being_rebound check will catch such forks,
* and rebind their vma mempolicies too. Because we still hold
@@ -1760,7 +1760,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
*
* Call with cpuset_mutex held. May take callback_lock during call.
* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
- * lock each such tasks mm->mmap_sem, scan its vma's and rebind
+ * lock each such tasks mm->mmap_lock, scan its vma's and rebind
* their mempolicies to the cpusets new mems_allowed.
*/
static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
index b05f1dd58a62..812a61afd538 100644
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -95,11 +95,12 @@ static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
return container_of(ns, struct cgroup_namespace, ns);
}
-static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+static int cgroupns_install(struct nsset *nsset, struct ns_common *ns)
{
+ struct nsproxy *nsproxy = nsset->nsproxy;
struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);
- if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
+ if (!ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN) ||
!ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 41ca996568df..b6397a186ce9 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -389,18 +389,62 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
cgroup_base_stat_cputime_account_end(cgrp, rstatc);
}
+/*
+ * compute the cputime for the root cgroup by getting the per cpu data
+ * at a global level, then categorizing the fields in a manner consistent
+ * with how it is done by __cgroup_account_cputime_field for each bit of
+ * cpu time attributed to a cgroup.
+ */
+static void root_cgroup_cputime(struct task_cputime *cputime)
+{
+ int i;
+
+ cputime->stime = 0;
+ cputime->utime = 0;
+ cputime->sum_exec_runtime = 0;
+ for_each_possible_cpu(i) {
+ struct kernel_cpustat kcpustat;
+ u64 *cpustat = kcpustat.cpustat;
+ u64 user = 0;
+ u64 sys = 0;
+
+ kcpustat_cpu_fetch(&kcpustat, i);
+
+ user += cpustat[CPUTIME_USER];
+ user += cpustat[CPUTIME_NICE];
+ cputime->utime += user;
+
+ sys += cpustat[CPUTIME_SYSTEM];
+ sys += cpustat[CPUTIME_IRQ];
+ sys += cpustat[CPUTIME_SOFTIRQ];
+ cputime->stime += sys;
+
+ cputime->sum_exec_runtime += user;
+ cputime->sum_exec_runtime += sys;
+ cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];
+ cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST];
+ cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST_NICE];
+ }
+}
+
void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
u64 usage, utime, stime;
-
- if (!cgroup_parent(cgrp))
- return;
-
- cgroup_rstat_flush_hold(cgrp);
- usage = cgrp->bstat.cputime.sum_exec_runtime;
- cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &utime, &stime);
- cgroup_rstat_flush_release();
+ struct task_cputime cputime;
+
+ if (cgroup_parent(cgrp)) {
+ cgroup_rstat_flush_hold(cgrp);
+ usage = cgrp->bstat.cputime.sum_exec_runtime;
+ cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
+ &utime, &stime);
+ cgroup_rstat_flush_release();
+ } else {
+ root_cgroup_cputime(&cputime);
+ usage = cputime.sum_exec_runtime;
+ utime = cputime.utime;
+ stime = cputime.stime;
+ }
do_div(usage, NSEC_PER_USEC);
do_div(utime, NSEC_PER_USEC);
diff --git a/kernel/compat.c b/kernel/compat.c
index 843dd17e6078..b8d2800bb4b7 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -199,7 +199,7 @@ long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask,
bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG);
nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
- if (!user_access_begin(umask, bitmap_size / 8))
+ if (!user_read_access_begin(umask, bitmap_size / 8))
return -EFAULT;
while (nr_compat_longs > 1) {
@@ -211,11 +211,11 @@ long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask,
}
if (nr_compat_longs)
unsafe_get_user(*mask, umask++, Efault);
- user_access_end();
+ user_read_access_end();
return 0;
Efault:
- user_access_end();
+ user_read_access_end();
return -EFAULT;
}
@@ -228,7 +228,7 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG);
nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
- if (!user_access_begin(umask, bitmap_size / 8))
+ if (!user_write_access_begin(umask, bitmap_size / 8))
return -EFAULT;
while (nr_compat_longs > 1) {
@@ -239,10 +239,10 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
}
if (nr_compat_longs)
unsafe_put_user((compat_ulong_t)*mask, umask++, Efault);
- user_access_end();
+ user_write_access_end();
return 0;
Efault:
- user_access_end();
+ user_write_access_end();
return -EFAULT;
}
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index ce430885c26c..36a98c48aedc 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -31,7 +31,7 @@ EXPORT_SYMBOL_GPL(context_tracking_key);
DEFINE_PER_CPU(struct context_tracking, context_tracking);
EXPORT_SYMBOL_GPL(context_tracking);
-static bool context_tracking_recursion_enter(void)
+static noinstr bool context_tracking_recursion_enter(void)
{
int recursion;
@@ -45,7 +45,7 @@ static bool context_tracking_recursion_enter(void)
return false;
}
-static void context_tracking_recursion_exit(void)
+static __always_inline void context_tracking_recursion_exit(void)
{
__this_cpu_dec(context_tracking.recursion);
}
@@ -59,7 +59,7 @@ static void context_tracking_recursion_exit(void)
* instructions to execute won't use any RCU read side critical section
* because this function sets RCU in extended quiescent state.
*/
-void __context_tracking_enter(enum ctx_state state)
+void noinstr __context_tracking_enter(enum ctx_state state)
{
/* Kernel threads aren't supposed to go to userspace */
WARN_ON_ONCE(!current->mm);
@@ -77,8 +77,10 @@ void __context_tracking_enter(enum ctx_state state)
* on the tick.
*/
if (state == CONTEXT_USER) {
+ instrumentation_begin();
trace_user_enter(0);
vtime_user_enter(current);
+ instrumentation_end();
}
rcu_user_enter();
}
@@ -99,7 +101,6 @@ void __context_tracking_enter(enum ctx_state state)
}
context_tracking_recursion_exit();
}
-NOKPROBE_SYMBOL(__context_tracking_enter);
EXPORT_SYMBOL_GPL(__context_tracking_enter);
void context_tracking_enter(enum ctx_state state)
@@ -142,7 +143,7 @@ NOKPROBE_SYMBOL(context_tracking_user_enter);
* This call supports re-entrancy. This way it can be called from any exception
* handler without needing to know if we came from userspace or not.
*/
-void __context_tracking_exit(enum ctx_state state)
+void noinstr __context_tracking_exit(enum ctx_state state)
{
if (!context_tracking_recursion_enter())
return;
@@ -155,15 +156,16 @@ void __context_tracking_exit(enum ctx_state state)
*/
rcu_user_exit();
if (state == CONTEXT_USER) {
+ instrumentation_begin();
vtime_user_exit(current);
trace_user_exit(0);
+ instrumentation_end();
}
}
__this_cpu_write(context_tracking.state, CONTEXT_KERNEL);
}
context_tracking_recursion_exit();
}
-NOKPROBE_SYMBOL(__context_tracking_exit);
EXPORT_SYMBOL_GPL(__context_tracking_exit);
void context_tracking_exit(enum ctx_state state)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 9f892144db6b..6ff2578ecf17 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -3,6 +3,7 @@
*
* This code is licenced under the GPL.
*/
+#include <linux/sched/mm.h>
#include <linux/proc_fs.h>
#include <linux/smp.h>
#include <linux/init.h>
@@ -564,6 +565,21 @@ static int bringup_cpu(unsigned int cpu)
return bringup_wait_for_ap(cpu);
}
+static int finish_cpu(unsigned int cpu)
+{
+ struct task_struct *idle = idle_thread_get(cpu);
+ struct mm_struct *mm = idle->active_mm;
+
+ /*
+ * idle_task_exit() will have switched to &init_mm, now
+ * clean up any remaining active_mm state.
+ */
+ if (mm != &init_mm)
+ idle->active_mm = &init_mm;
+ mmdrop(mm);
+ return 0;
+}
+
/*
* Hotplug state machine related functions
*/
@@ -1549,7 +1565,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
[CPUHP_BRINGUP_CPU] = {
.name = "cpu:bringup",
.startup.single = bringup_cpu,
- .teardown.single = NULL,
+ .teardown.single = finish_cpu,
.cant_stop = true,
},
/* Final state before CPU kills itself */
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index cbca6879ab7d..44a259338e33 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -80,7 +80,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
*/
int cpu_pm_enter(void)
{
- int nr_calls;
+ int nr_calls = 0;
int ret = 0;
ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls);
@@ -131,7 +131,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_exit);
*/
int cpu_cluster_pm_enter(void)
{
- int nr_calls;
+ int nr_calls = 0;
int ret = 0;
ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls);
diff --git a/kernel/cred.c b/kernel/cred.c
index 71a792616917..421b1149c651 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -315,6 +315,9 @@ struct cred *prepare_exec_creds(void)
new->process_keyring = NULL;
#endif
+ new->suid = new->fsuid = new->euid;
+ new->sgid = new->fsgid = new->egid;
+
return new;
}
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 2b7c9b67931d..9e5934780f41 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -67,9 +67,7 @@ static int kgdb_break_asap;
struct debuggerinfo_struct kgdb_info[NR_CPUS];
-/**
- * kgdb_connected - Is a host GDB connected to us?
- */
+/* kgdb_connected - Is a host GDB connected to us? */
int kgdb_connected;
EXPORT_SYMBOL_GPL(kgdb_connected);
@@ -171,18 +169,18 @@ int __weak kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
{
int err;
- err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
+ err = copy_from_kernel_nofault(bpt->saved_instr, (char *)bpt->bpt_addr,
BREAK_INSTR_SIZE);
if (err)
return err;
- err = probe_kernel_write((char *)bpt->bpt_addr,
+ err = copy_to_kernel_nofault((char *)bpt->bpt_addr,
arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE);
return err;
}
int __weak kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
{
- return probe_kernel_write((char *)bpt->bpt_addr,
+ return copy_to_kernel_nofault((char *)bpt->bpt_addr,
(char *)bpt->saved_instr, BREAK_INSTR_SIZE);
}
@@ -417,6 +415,18 @@ int kgdb_isremovedbreak(unsigned long addr)
return 0;
}
+int kgdb_has_hit_break(unsigned long addr)
+{
+ int i;
+
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if (kgdb_break[i].state == BP_ACTIVE &&
+ kgdb_break[i].bpt_addr == addr)
+ return 1;
+ }
+ return 0;
+}
+
int dbg_remove_all_break(void)
{
int error;
@@ -532,6 +542,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
if (exception_level > 1) {
dump_stack();
+ kgdb_io_module_registered = false;
panic("Recursive entry to debugger");
}
@@ -576,6 +587,7 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs,
arch_kgdb_ops.disable_hw_break(regs);
acquirelock:
+ rcu_read_lock();
/*
* Interrupts will be restored by the 'trap return' code, except when
* single stepping.
@@ -635,6 +647,7 @@ return_normal:
atomic_dec(&slaves_in_kgdb);
dbg_touch_watchdogs();
local_irq_restore(flags);
+ rcu_read_unlock();
return 0;
}
cpu_relax();
@@ -653,6 +666,7 @@ return_normal:
raw_spin_unlock(&dbg_master_lock);
dbg_touch_watchdogs();
local_irq_restore(flags);
+ rcu_read_unlock();
goto acquirelock;
}
@@ -668,6 +682,8 @@ return_normal:
if (kgdb_skipexception(ks->ex_vector, ks->linux_regs))
goto kgdb_restore;
+ atomic_inc(&ignore_console_lock_warning);
+
/* Call the I/O driver's pre_exception routine */
if (dbg_io_ops->pre_exception)
dbg_io_ops->pre_exception();
@@ -740,6 +756,8 @@ cpu_master_loop:
if (dbg_io_ops->post_exception)
dbg_io_ops->post_exception();
+ atomic_dec(&ignore_console_lock_warning);
+
if (!kgdb_single_step) {
raw_spin_unlock(&dbg_slave_lock);
/* Wait till all the CPUs have quit from the debugger. */
@@ -772,6 +790,7 @@ kgdb_restore:
raw_spin_unlock(&dbg_master_lock);
dbg_touch_watchdogs();
local_irq_restore(flags);
+ rcu_read_unlock();
return kgdb_info[cpu].ret_state;
}
@@ -920,7 +939,7 @@ static void sysrq_handle_dbg(int key)
kgdb_breakpoint();
}
-static struct sysrq_key_op sysrq_dbg_op = {
+static const struct sysrq_key_op sysrq_dbg_op = {
.handler = sysrq_handle_dbg,
.help_msg = "debug(g)",
.action_msg = "DEBUG",
@@ -946,6 +965,14 @@ void kgdb_panic(const char *msg)
kgdb_breakpoint();
}
+static void kgdb_initial_breakpoint(void)
+{
+ kgdb_break_asap = 0;
+
+ pr_crit("Waiting for connection from remote gdb...\n");
+ kgdb_breakpoint();
+}
+
void __weak kgdb_arch_late(void)
{
}
@@ -956,6 +983,9 @@ void __init dbg_late_init(void)
if (kgdb_io_module_registered)
kgdb_arch_late();
kdb_init(KDB_INIT_FULL);
+
+ if (kgdb_io_module_registered && kgdb_break_asap)
+ kgdb_initial_breakpoint();
}
static int
@@ -1051,14 +1081,6 @@ void kgdb_schedule_breakpoint(void)
}
EXPORT_SYMBOL_GPL(kgdb_schedule_breakpoint);
-static void kgdb_initial_breakpoint(void)
-{
- kgdb_break_asap = 0;
-
- pr_crit("Waiting for connection from remote gdb...\n");
- kgdb_breakpoint();
-}
-
/**
* kgdb_register_io_module - register KGDB IO module
* @new_dbg_io_ops: the io ops vector
@@ -1067,15 +1089,22 @@ static void kgdb_initial_breakpoint(void)
*/
int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops)
{
+ struct kgdb_io *old_dbg_io_ops;
int err;
spin_lock(&kgdb_registration_lock);
- if (dbg_io_ops) {
- spin_unlock(&kgdb_registration_lock);
+ old_dbg_io_ops = dbg_io_ops;
+ if (old_dbg_io_ops) {
+ if (!old_dbg_io_ops->deinit) {
+ spin_unlock(&kgdb_registration_lock);
- pr_err("Another I/O driver is already registered with KGDB\n");
- return -EBUSY;
+ pr_err("KGDB I/O driver %s can't replace %s.\n",
+ new_dbg_io_ops->name, old_dbg_io_ops->name);
+ return -EBUSY;
+ }
+ pr_info("Replacing I/O driver %s with %s\n",
+ old_dbg_io_ops->name, new_dbg_io_ops->name);
}
if (new_dbg_io_ops->init) {
@@ -1090,12 +1119,18 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops)
spin_unlock(&kgdb_registration_lock);
+ if (old_dbg_io_ops) {
+ old_dbg_io_ops->deinit();
+ return 0;
+ }
+
pr_info("Registered I/O driver %s\n", new_dbg_io_ops->name);
/* Arm KGDB now. */
kgdb_register_callbacks();
- if (kgdb_break_asap)
+ if (kgdb_break_asap &&
+ (!dbg_is_early || IS_ENABLED(CONFIG_ARCH_HAS_EARLY_DEBUG)))
kgdb_initial_breakpoint();
return 0;
@@ -1125,6 +1160,9 @@ void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops)
spin_unlock(&kgdb_registration_lock);
+ if (old_dbg_io_ops->deinit)
+ old_dbg_io_ops->deinit();
+
pr_info("Unregistered I/O driver %s, debugger disabled\n",
old_dbg_io_ops->name);
}
@@ -1165,7 +1203,8 @@ static int __init opt_kgdb_wait(char *str)
kgdb_break_asap = 1;
kdb_init(KDB_INIT_EARLY);
- if (kgdb_io_module_registered)
+ if (kgdb_io_module_registered &&
+ IS_ENABLED(CONFIG_ARCH_HAS_EARLY_DEBUG))
kgdb_initial_breakpoint();
return 0;
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 4b280fc7dd67..61774aec46b4 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -247,7 +247,7 @@ char *kgdb_mem2hex(char *mem, char *buf, int count)
*/
tmp = buf + count;
- err = probe_kernel_read(tmp, mem, count);
+ err = copy_from_kernel_nofault(tmp, mem, count);
if (err)
return NULL;
while (count > 0) {
@@ -283,7 +283,7 @@ int kgdb_hex2mem(char *buf, char *mem, int count)
*tmp_raw |= hex_to_bin(*tmp_hex--) << 4;
}
- return probe_kernel_write(mem, tmp_raw, count);
+ return copy_to_kernel_nofault(mem, tmp_raw, count);
}
/*
@@ -335,7 +335,7 @@ static int kgdb_ebin2mem(char *buf, char *mem, int count)
size++;
}
- return probe_kernel_write(mem, c, size);
+ return copy_to_kernel_nofault(mem, c, size);
}
#if DBG_MAX_REG_NUM > 0
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 3de0cc780c16..18e03aba2cfc 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -21,17 +21,18 @@
static void kdb_show_stack(struct task_struct *p, void *addr)
{
- int old_lvl = console_loglevel;
-
- console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
kdb_trap_printk++;
- if (!addr && kdb_task_has_cpu(p))
+ if (!addr && kdb_task_has_cpu(p)) {
+ int old_lvl = console_loglevel;
+
+ console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
kdb_dump_stack_on_cpu(kdb_process_cpu(p));
- else
- show_stack(p, addr);
+ console_loglevel = old_lvl;
+ } else {
+ show_stack(p, addr, KERN_EMERG);
+ }
- console_loglevel = old_lvl;
kdb_trap_printk--;
}
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 924bc9298a42..683a799618ad 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -542,6 +542,44 @@ static int kdb_search_string(char *searched, char *searchfor)
return 0;
}
+static void kdb_msg_write(const char *msg, int msg_len)
+{
+ struct console *c;
+
+ if (msg_len == 0)
+ return;
+
+ if (dbg_io_ops) {
+ const char *cp = msg;
+ int len = msg_len;
+
+ while (len--) {
+ dbg_io_ops->write_char(*cp);
+ cp++;
+ }
+ }
+
+ for_each_console(c) {
+ if (!(c->flags & CON_ENABLED))
+ continue;
+ if (c == dbg_io_ops->cons)
+ continue;
+ /*
+ * Set oops_in_progress to encourage the console drivers to
+ * disregard their internal spin locks: in the current calling
+ * context the risk of deadlock is a bigger problem than risks
+ * due to re-entering the console driver. We operate directly on
+ * oops_in_progress rather than using bust_spinlocks() because
+ * the calls bust_spinlocks() makes on exit are not appropriate
+ * for this calling context.
+ */
+ ++oops_in_progress;
+ c->write(c, msg, msg_len);
+ --oops_in_progress;
+ touch_nmi_watchdog();
+ }
+}
+
int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
{
int diag;
@@ -553,7 +591,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
int this_cpu, old_cpu;
char *cp, *cp2, *cphold = NULL, replaced_byte = ' ';
char *moreprompt = "more> ";
- struct console *c;
unsigned long uninitialized_var(flags);
/* Serialize kdb_printf if multiple cpus try to write at once.
@@ -687,22 +724,11 @@ kdb_printit:
*/
retlen = strlen(kdb_buffer);
cp = (char *) printk_skip_headers(kdb_buffer);
- if (!dbg_kdb_mode && kgdb_connected) {
+ if (!dbg_kdb_mode && kgdb_connected)
gdbstub_msg_write(cp, retlen - (cp - kdb_buffer));
- } else {
- if (dbg_io_ops && !dbg_io_ops->is_console) {
- len = retlen - (cp - kdb_buffer);
- cp2 = cp;
- while (len--) {
- dbg_io_ops->write_char(*cp2);
- cp2++;
- }
- }
- for_each_console(c) {
- c->write(c, cp, retlen - (cp - kdb_buffer));
- touch_nmi_watchdog();
- }
- }
+ else
+ kdb_msg_write(cp, retlen - (cp - kdb_buffer));
+
if (logging) {
saved_loglevel = console_loglevel;
console_loglevel = CONSOLE_LOGLEVEL_SILENT;
@@ -751,19 +777,7 @@ kdb_printit:
moreprompt = "more> ";
kdb_input_flush();
-
- if (dbg_io_ops && !dbg_io_ops->is_console) {
- len = strlen(moreprompt);
- cp = moreprompt;
- while (len--) {
- dbg_io_ops->write_char(*cp);
- cp++;
- }
- }
- for_each_console(c) {
- c->write(c, moreprompt, strlen(moreprompt));
- touch_nmi_watchdog();
- }
+ kdb_msg_write(moreprompt, strlen(moreprompt));
if (logging)
printk("%s", moreprompt);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 515379cbf209..5c7949061671 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -62,7 +62,7 @@ int kdb_grep_trailing;
/*
* Kernel debugger state flags
*/
-int kdb_flags;
+unsigned int kdb_flags;
/*
* kdb_lock protects updates to kdb_initial_cpu. Used to
@@ -418,8 +418,7 @@ int kdb_set(int argc, const char **argv)
argv[2]);
return 0;
}
- kdb_flags = (kdb_flags &
- ~(KDB_DEBUG_FLAG_MASK << KDB_DEBUG_FLAG_SHIFT))
+ kdb_flags = (kdb_flags & ~KDB_DEBUG(MASK))
| (debugflags << KDB_DEBUG_FLAG_SHIFT);
return 0;
@@ -1108,7 +1107,8 @@ static int handle_ctrl_cmd(char *cmd)
switch (*cmd) {
case CTRL_P:
if (cmdptr != cmd_tail)
- cmdptr = (cmdptr-1) % KDB_CMD_HISTORY_COUNT;
+ cmdptr = (cmdptr + KDB_CMD_HISTORY_COUNT - 1) %
+ KDB_CMD_HISTORY_COUNT;
strscpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN);
return 1;
case CTRL_N:
@@ -2081,7 +2081,8 @@ static int kdb_env(int argc, const char **argv)
}
if (KDB_DEBUG(MASK))
- kdb_printf("KDBFLAGS=0x%x\n", kdb_flags);
+ kdb_printf("KDBDEBUG=0x%x\n",
+ (kdb_flags & KDB_DEBUG(MASK)) >> KDB_DEBUG_FLAG_SHIFT);
return 0;
}
@@ -2325,7 +2326,8 @@ void kdb_ps1(const struct task_struct *p)
int cpu;
unsigned long tmp;
- if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long)))
+ if (!p ||
+ copy_from_kernel_nofault(&tmp, (char *)p, sizeof(unsigned long)))
return;
cpu = kdb_process_cpu(p);
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index b8e6306e7e13..004c5b6c87f8 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -325,7 +325,7 @@ char *kdb_strdup(const char *str, gfp_t type)
*/
int kdb_getarea_size(void *res, unsigned long addr, size_t size)
{
- int ret = probe_kernel_read((char *)res, (char *)addr, size);
+ int ret = copy_from_kernel_nofault((char *)res, (char *)addr, size);
if (ret) {
if (!KDB_STATE(SUPPRESS)) {
kdb_printf("kdb_getarea: Bad address 0x%lx\n", addr);
@@ -350,7 +350,7 @@ int kdb_getarea_size(void *res, unsigned long addr, size_t size)
*/
int kdb_putarea_size(unsigned long addr, void *res, size_t size)
{
- int ret = probe_kernel_read((char *)addr, (char *)res, size);
+ int ret = copy_from_kernel_nofault((char *)addr, (char *)res, size);
if (ret) {
if (!KDB_STATE(SUPPRESS)) {
kdb_printf("kdb_putarea: Bad address 0x%lx\n", addr);
@@ -624,7 +624,8 @@ char kdb_task_state_char (const struct task_struct *p)
char state;
unsigned long tmp;
- if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long)))
+ if (!p ||
+ copy_from_kernel_nofault(&tmp, (char *)p, sizeof(unsigned long)))
return 'E';
cpu = kdb_process_cpu(p);
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 4c103a24e380..1da3f44f2565 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -71,17 +71,22 @@ config SWIOTLB
# in the pagetables
#
config DMA_NONCOHERENT_MMAP
+ default y if !MMU
+ bool
+
+config DMA_COHERENT_POOL
+ select GENERIC_ALLOCATOR
bool
config DMA_REMAP
+ bool
depends on MMU
- select GENERIC_ALLOCATOR
select DMA_NONCOHERENT_MMAP
- bool
config DMA_DIRECT_REMAP
bool
select DMA_REMAP
+ select DMA_COHERENT_POOL
config DMA_CMA
bool "DMA Contiguous Memory Allocator"
diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile
index d237cf3dc181..370f63344e9c 100644
--- a/kernel/dma/Makefile
+++ b/kernel/dma/Makefile
@@ -6,4 +6,5 @@ obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o
obj-$(CONFIG_DMA_VIRT_OPS) += virt.o
obj-$(CONFIG_DMA_API_DEBUG) += debug.o
obj-$(CONFIG_SWIOTLB) += swiotlb.o
+obj-$(CONFIG_DMA_COHERENT_POOL) += pool.o
obj-$(CONFIG_DMA_REMAP) += remap.o
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 8bc6f2d670f9..15bc5026c485 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -222,8 +222,8 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages,
* @gfp: Allocation flags.
*
* This function allocates contiguous memory buffer for specified device. It
- * first tries to use device specific contiguous memory area if available or
- * the default global one, then tries a fallback allocation of normal pages.
+ * tries to use device specific contiguous memory area if available, or the
+ * default global one.
*
* Note that it byapss one-page size of allocations from the global area as
* the addresses within one page are always contiguous, so there is no need
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 9e1777c81f55..36c962a86bf2 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -656,7 +656,7 @@ static struct dma_debug_entry *__dma_entry_alloc(void)
return entry;
}
-void __dma_entry_alloc_check_leak(void)
+static void __dma_entry_alloc_check_leak(void)
{
u32 tmp = nr_total_entries % nr_prealloc_entries;
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 8f4bbdaf965e..93f578a8e613 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -45,8 +45,8 @@ u64 dma_direct_get_required_mask(struct device *dev)
return (1ULL << (fls64(max_dma) - 1)) * 2 - 1;
}
-static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
- u64 *phys_limit)
+gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
+ u64 *phys_limit)
{
u64 dma_limit = min_not_zero(dma_mask, dev->bus_dma_limit);
@@ -76,29 +76,63 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit);
}
-struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
+/*
+ * Decrypting memory is allowed to block, so if this device requires
+ * unencrypted memory it must come from atomic pools.
+ */
+static inline bool dma_should_alloc_from_pool(struct device *dev, gfp_t gfp,
+ unsigned long attrs)
+{
+ if (!IS_ENABLED(CONFIG_DMA_COHERENT_POOL))
+ return false;
+ if (gfpflags_allow_blocking(gfp))
+ return false;
+ if (force_dma_unencrypted(dev))
+ return true;
+ if (!IS_ENABLED(CONFIG_DMA_DIRECT_REMAP))
+ return false;
+ if (dma_alloc_need_uncached(dev, attrs))
+ return true;
+ return false;
+}
+
+static inline bool dma_should_free_from_pool(struct device *dev,
+ unsigned long attrs)
+{
+ if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL))
+ return true;
+ if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
+ !force_dma_unencrypted(dev))
+ return false;
+ if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP))
+ return true;
+ return false;
+}
+
+static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
gfp_t gfp, unsigned long attrs)
{
- size_t alloc_size = PAGE_ALIGN(size);
int node = dev_to_node(dev);
struct page *page = NULL;
u64 phys_limit;
+ WARN_ON_ONCE(!PAGE_ALIGNED(size));
+
if (attrs & DMA_ATTR_NO_WARN)
gfp |= __GFP_NOWARN;
/* we always manually zero the memory once we are done: */
gfp &= ~__GFP_ZERO;
- gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
- &phys_limit);
- page = dma_alloc_contiguous(dev, alloc_size, gfp);
+ gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
+ &phys_limit);
+ page = dma_alloc_contiguous(dev, size, gfp);
if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
- dma_free_contiguous(dev, page, alloc_size);
+ dma_free_contiguous(dev, page, size);
page = NULL;
}
again:
if (!page)
- page = alloc_pages_node(node, gfp, get_order(alloc_size));
+ page = alloc_pages_node(node, gfp, get_order(size));
if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
dma_free_contiguous(dev, page, size);
page = NULL;
@@ -124,11 +158,12 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
{
struct page *page;
void *ret;
+ int err;
+
+ size = PAGE_ALIGN(size);
- if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
- dma_alloc_need_uncached(dev, attrs) &&
- !gfpflags_allow_blocking(gfp)) {
- ret = dma_alloc_from_pool(PAGE_ALIGN(size), &page, gfp);
+ if (dma_should_alloc_from_pool(dev, gfp, attrs)) {
+ ret = dma_alloc_from_pool(dev, size, &page, gfp);
if (!ret)
return NULL;
goto done;
@@ -152,14 +187,20 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
dma_alloc_need_uncached(dev, attrs)) ||
(IS_ENABLED(CONFIG_DMA_REMAP) && PageHighMem(page))) {
/* remove any dirty cache lines on the kernel alias */
- arch_dma_prep_coherent(page, PAGE_ALIGN(size));
+ arch_dma_prep_coherent(page, size);
/* create a coherent mapping */
- ret = dma_common_contiguous_remap(page, PAGE_ALIGN(size),
+ ret = dma_common_contiguous_remap(page, size,
dma_pgprot(dev, PAGE_KERNEL, attrs),
__builtin_return_address(0));
if (!ret)
goto out_free_pages;
+ if (force_dma_unencrypted(dev)) {
+ err = set_memory_decrypted((unsigned long)ret,
+ 1 << get_order(size));
+ if (err)
+ goto out_free_pages;
+ }
memset(ret, 0, size);
goto done;
}
@@ -176,8 +217,12 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
}
ret = page_address(page);
- if (force_dma_unencrypted(dev))
- set_memory_decrypted((unsigned long)ret, 1 << get_order(size));
+ if (force_dma_unencrypted(dev)) {
+ err = set_memory_decrypted((unsigned long)ret,
+ 1 << get_order(size));
+ if (err)
+ goto out_free_pages;
+ }
memset(ret, 0, size);
@@ -186,7 +231,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
arch_dma_prep_coherent(page, size);
ret = arch_dma_set_uncached(ret, size);
if (IS_ERR(ret))
- goto out_free_pages;
+ goto out_encrypt_pages;
}
done:
if (force_dma_unencrypted(dev))
@@ -194,6 +239,15 @@ done:
else
*dma_handle = phys_to_dma(dev, page_to_phys(page));
return ret;
+
+out_encrypt_pages:
+ if (force_dma_unencrypted(dev)) {
+ err = set_memory_encrypted((unsigned long)page_address(page),
+ 1 << get_order(size));
+ /* If memory cannot be re-encrypted, it must be leaked */
+ if (err)
+ return NULL;
+ }
out_free_pages:
dma_free_contiguous(dev, page, size);
return NULL;
@@ -204,6 +258,11 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
{
unsigned int page_order = get_order(size);
+ /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */
+ if (dma_should_free_from_pool(dev, attrs) &&
+ dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size)))
+ return;
+
if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
!force_dma_unencrypted(dev)) {
/* cpu_addr is a struct page cookie, not a kernel address */
@@ -211,10 +270,6 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
return;
}
- if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
- dma_free_from_pool(cpu_addr, PAGE_ALIGN(size)))
- return;
-
if (force_dma_unencrypted(dev))
set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order);
@@ -427,7 +482,6 @@ int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt,
return ret;
}
-#ifdef CONFIG_MMU
bool dma_direct_can_mmap(struct device *dev)
{
return dev_is_dma_coherent(dev) ||
@@ -453,19 +507,6 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma,
return remap_pfn_range(vma, vma->vm_start, pfn + vma->vm_pgoff,
user_count << PAGE_SHIFT, vma->vm_page_prot);
}
-#else /* CONFIG_MMU */
-bool dma_direct_can_mmap(struct device *dev)
-{
- return false;
-}
-
-int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma,
- void *cpu_addr, dma_addr_t dma_addr, size_t size,
- unsigned long attrs)
-{
- return -ENXIO;
-}
-#endif /* CONFIG_MMU */
int dma_direct_supported(struct device *dev, u64 mask)
{
diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
new file mode 100644
index 000000000000..8cfa01243ed2
--- /dev/null
+++ b/kernel/dma/pool.c
@@ -0,0 +1,263 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2012 ARM Ltd.
+ * Copyright (C) 2020 Google LLC
+ */
+#include <linux/debugfs.h>
+#include <linux/dma-direct.h>
+#include <linux/dma-noncoherent.h>
+#include <linux/dma-contiguous.h>
+#include <linux/init.h>
+#include <linux/genalloc.h>
+#include <linux/set_memory.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+
+static struct gen_pool *atomic_pool_dma __ro_after_init;
+static unsigned long pool_size_dma;
+static struct gen_pool *atomic_pool_dma32 __ro_after_init;
+static unsigned long pool_size_dma32;
+static struct gen_pool *atomic_pool_kernel __ro_after_init;
+static unsigned long pool_size_kernel;
+
+/* Size can be defined by the coherent_pool command line */
+static size_t atomic_pool_size;
+
+/* Dynamic background expansion when the atomic pool is near capacity */
+static struct work_struct atomic_pool_work;
+
+static int __init early_coherent_pool(char *p)
+{
+ atomic_pool_size = memparse(p, &p);
+ return 0;
+}
+early_param("coherent_pool", early_coherent_pool);
+
+static void __init dma_atomic_pool_debugfs_init(void)
+{
+ struct dentry *root;
+
+ root = debugfs_create_dir("dma_pools", NULL);
+ if (IS_ERR_OR_NULL(root))
+ return;
+
+ debugfs_create_ulong("pool_size_dma", 0400, root, &pool_size_dma);
+ debugfs_create_ulong("pool_size_dma32", 0400, root, &pool_size_dma32);
+ debugfs_create_ulong("pool_size_kernel", 0400, root, &pool_size_kernel);
+}
+
+static void dma_atomic_pool_size_add(gfp_t gfp, size_t size)
+{
+ if (gfp & __GFP_DMA)
+ pool_size_dma += size;
+ else if (gfp & __GFP_DMA32)
+ pool_size_dma32 += size;
+ else
+ pool_size_kernel += size;
+}
+
+static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
+ gfp_t gfp)
+{
+ unsigned int order;
+ struct page *page;
+ void *addr;
+ int ret = -ENOMEM;
+
+ /* Cannot allocate larger than MAX_ORDER-1 */
+ order = min(get_order(pool_size), MAX_ORDER-1);
+
+ do {
+ pool_size = 1 << (PAGE_SHIFT + order);
+
+ if (dev_get_cma_area(NULL))
+ page = dma_alloc_from_contiguous(NULL, 1 << order,
+ order, false);
+ else
+ page = alloc_pages(gfp, order);
+ } while (!page && order-- > 0);
+ if (!page)
+ goto out;
+
+ arch_dma_prep_coherent(page, pool_size);
+
+#ifdef CONFIG_DMA_DIRECT_REMAP
+ addr = dma_common_contiguous_remap(page, pool_size,
+ pgprot_dmacoherent(PAGE_KERNEL),
+ __builtin_return_address(0));
+ if (!addr)
+ goto free_page;
+#else
+ addr = page_to_virt(page);
+#endif
+ /*
+ * Memory in the atomic DMA pools must be unencrypted, the pools do not
+ * shrink so no re-encryption occurs in dma_direct_free_pages().
+ */
+ ret = set_memory_decrypted((unsigned long)page_to_virt(page),
+ 1 << order);
+ if (ret)
+ goto remove_mapping;
+ ret = gen_pool_add_virt(pool, (unsigned long)addr, page_to_phys(page),
+ pool_size, NUMA_NO_NODE);
+ if (ret)
+ goto encrypt_mapping;
+
+ dma_atomic_pool_size_add(gfp, pool_size);
+ return 0;
+
+encrypt_mapping:
+ ret = set_memory_encrypted((unsigned long)page_to_virt(page),
+ 1 << order);
+ if (WARN_ON_ONCE(ret)) {
+ /* Decrypt succeeded but encrypt failed, purposely leak */
+ goto out;
+ }
+remove_mapping:
+#ifdef CONFIG_DMA_DIRECT_REMAP
+ dma_common_free_remap(addr, pool_size);
+#endif
+free_page: __maybe_unused
+ if (!dma_release_from_contiguous(NULL, page, 1 << order))
+ __free_pages(page, order);
+out:
+ return ret;
+}
+
+static void atomic_pool_resize(struct gen_pool *pool, gfp_t gfp)
+{
+ if (pool && gen_pool_avail(pool) < atomic_pool_size)
+ atomic_pool_expand(pool, gen_pool_size(pool), gfp);
+}
+
+static void atomic_pool_work_fn(struct work_struct *work)
+{
+ if (IS_ENABLED(CONFIG_ZONE_DMA))
+ atomic_pool_resize(atomic_pool_dma,
+ GFP_KERNEL | GFP_DMA);
+ if (IS_ENABLED(CONFIG_ZONE_DMA32))
+ atomic_pool_resize(atomic_pool_dma32,
+ GFP_KERNEL | GFP_DMA32);
+ atomic_pool_resize(atomic_pool_kernel, GFP_KERNEL);
+}
+
+static __init struct gen_pool *__dma_atomic_pool_init(size_t pool_size,
+ gfp_t gfp)
+{
+ struct gen_pool *pool;
+ int ret;
+
+ pool = gen_pool_create(PAGE_SHIFT, NUMA_NO_NODE);
+ if (!pool)
+ return NULL;
+
+ gen_pool_set_algo(pool, gen_pool_first_fit_order_align, NULL);
+
+ ret = atomic_pool_expand(pool, pool_size, gfp);
+ if (ret) {
+ gen_pool_destroy(pool);
+ pr_err("DMA: failed to allocate %zu KiB %pGg pool for atomic allocation\n",
+ pool_size >> 10, &gfp);
+ return NULL;
+ }
+
+ pr_info("DMA: preallocated %zu KiB %pGg pool for atomic allocations\n",
+ gen_pool_size(pool) >> 10, &gfp);
+ return pool;
+}
+
+static int __init dma_atomic_pool_init(void)
+{
+ int ret = 0;
+
+ /*
+ * If coherent_pool was not used on the command line, default the pool
+ * sizes to 128KB per 1GB of memory, min 128KB, max MAX_ORDER-1.
+ */
+ if (!atomic_pool_size) {
+ unsigned long pages = totalram_pages() / (SZ_1G / SZ_128K);
+ pages = min_t(unsigned long, pages, MAX_ORDER_NR_PAGES);
+ atomic_pool_size = max_t(size_t, pages << PAGE_SHIFT, SZ_128K);
+ }
+ INIT_WORK(&atomic_pool_work, atomic_pool_work_fn);
+
+ atomic_pool_kernel = __dma_atomic_pool_init(atomic_pool_size,
+ GFP_KERNEL);
+ if (!atomic_pool_kernel)
+ ret = -ENOMEM;
+ if (IS_ENABLED(CONFIG_ZONE_DMA)) {
+ atomic_pool_dma = __dma_atomic_pool_init(atomic_pool_size,
+ GFP_KERNEL | GFP_DMA);
+ if (!atomic_pool_dma)
+ ret = -ENOMEM;
+ }
+ if (IS_ENABLED(CONFIG_ZONE_DMA32)) {
+ atomic_pool_dma32 = __dma_atomic_pool_init(atomic_pool_size,
+ GFP_KERNEL | GFP_DMA32);
+ if (!atomic_pool_dma32)
+ ret = -ENOMEM;
+ }
+
+ dma_atomic_pool_debugfs_init();
+ return ret;
+}
+postcore_initcall(dma_atomic_pool_init);
+
+static inline struct gen_pool *dev_to_pool(struct device *dev)
+{
+ u64 phys_mask;
+ gfp_t gfp;
+
+ gfp = dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
+ &phys_mask);
+ if (IS_ENABLED(CONFIG_ZONE_DMA) && gfp == GFP_DMA)
+ return atomic_pool_dma;
+ if (IS_ENABLED(CONFIG_ZONE_DMA32) && gfp == GFP_DMA32)
+ return atomic_pool_dma32;
+ return atomic_pool_kernel;
+}
+
+static bool dma_in_atomic_pool(struct device *dev, void *start, size_t size)
+{
+ struct gen_pool *pool = dev_to_pool(dev);
+
+ if (unlikely(!pool))
+ return false;
+ return gen_pool_has_addr(pool, (unsigned long)start, size);
+}
+
+void *dma_alloc_from_pool(struct device *dev, size_t size,
+ struct page **ret_page, gfp_t flags)
+{
+ struct gen_pool *pool = dev_to_pool(dev);
+ unsigned long val;
+ void *ptr = NULL;
+
+ if (!pool) {
+ WARN(1, "%pGg atomic pool not initialised!\n", &flags);
+ return NULL;
+ }
+
+ val = gen_pool_alloc(pool, size);
+ if (val) {
+ phys_addr_t phys = gen_pool_virt_to_phys(pool, val);
+
+ *ret_page = pfn_to_page(__phys_to_pfn(phys));
+ ptr = (void *)val;
+ memset(ptr, 0, size);
+ }
+ if (gen_pool_avail(pool) < atomic_pool_size)
+ schedule_work(&atomic_pool_work);
+
+ return ptr;
+}
+
+bool dma_free_from_pool(struct device *dev, void *start, size_t size)
+{
+ struct gen_pool *pool = dev_to_pool(dev);
+
+ if (!dma_in_atomic_pool(dev, start, size))
+ return false;
+ gen_pool_free(pool, (unsigned long)start, size);
+ return true;
+}
diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c
index d14cbc83986a..78b23f089cf1 100644
--- a/kernel/dma/remap.c
+++ b/kernel/dma/remap.c
@@ -1,13 +1,8 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Copyright (C) 2012 ARM Ltd.
* Copyright (c) 2014 The Linux Foundation
*/
-#include <linux/dma-direct.h>
-#include <linux/dma-noncoherent.h>
-#include <linux/dma-contiguous.h>
-#include <linux/init.h>
-#include <linux/genalloc.h>
+#include <linux/dma-mapping.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
@@ -20,23 +15,6 @@ struct page **dma_common_find_pages(void *cpu_addr)
return area->pages;
}
-static struct vm_struct *__dma_common_pages_remap(struct page **pages,
- size_t size, pgprot_t prot, const void *caller)
-{
- struct vm_struct *area;
-
- area = get_vm_area_caller(size, VM_DMA_COHERENT, caller);
- if (!area)
- return NULL;
-
- if (map_vm_area(area, prot, pages)) {
- vunmap(area->addr);
- return NULL;
- }
-
- return area;
-}
-
/*
* Remaps an array of PAGE_SIZE pages into another vm_area.
* Cannot be used in non-sleeping contexts
@@ -44,15 +22,13 @@ static struct vm_struct *__dma_common_pages_remap(struct page **pages,
void *dma_common_pages_remap(struct page **pages, size_t size,
pgprot_t prot, const void *caller)
{
- struct vm_struct *area;
-
- area = __dma_common_pages_remap(pages, size, prot, caller);
- if (!area)
- return NULL;
+ void *vaddr;
- area->pages = pages;
-
- return area->addr;
+ vaddr = vmap(pages, PAGE_ALIGN(size) >> PAGE_SHIFT,
+ VM_DMA_COHERENT, prot);
+ if (vaddr)
+ find_vm_area(vaddr)->pages = pages;
+ return vaddr;
}
/*
@@ -62,24 +38,20 @@ void *dma_common_pages_remap(struct page **pages, size_t size,
void *dma_common_contiguous_remap(struct page *page, size_t size,
pgprot_t prot, const void *caller)
{
- int i;
+ int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
struct page **pages;
- struct vm_struct *area;
+ void *vaddr;
+ int i;
- pages = kmalloc(sizeof(struct page *) << get_order(size), GFP_KERNEL);
+ pages = kmalloc_array(count, sizeof(struct page *), GFP_KERNEL);
if (!pages)
return NULL;
-
- for (i = 0; i < (size >> PAGE_SHIFT); i++)
+ for (i = 0; i < count; i++)
pages[i] = nth_page(page, i);
-
- area = __dma_common_pages_remap(pages, size, prot, caller);
-
+ vaddr = vmap(pages, count, VM_DMA_COHERENT, prot);
kfree(pages);
- if (!area)
- return NULL;
- return area->addr;
+ return vaddr;
}
/*
@@ -97,117 +69,3 @@ void dma_common_free_remap(void *cpu_addr, size_t size)
unmap_kernel_range((unsigned long)cpu_addr, PAGE_ALIGN(size));
vunmap(cpu_addr);
}
-
-#ifdef CONFIG_DMA_DIRECT_REMAP
-static struct gen_pool *atomic_pool __ro_after_init;
-
-#define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K
-static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE;
-
-static int __init early_coherent_pool(char *p)
-{
- atomic_pool_size = memparse(p, &p);
- return 0;
-}
-early_param("coherent_pool", early_coherent_pool);
-
-static gfp_t dma_atomic_pool_gfp(void)
-{
- if (IS_ENABLED(CONFIG_ZONE_DMA))
- return GFP_DMA;
- if (IS_ENABLED(CONFIG_ZONE_DMA32))
- return GFP_DMA32;
- return GFP_KERNEL;
-}
-
-static int __init dma_atomic_pool_init(void)
-{
- unsigned int pool_size_order = get_order(atomic_pool_size);
- unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT;
- struct page *page;
- void *addr;
- int ret;
-
- if (dev_get_cma_area(NULL))
- page = dma_alloc_from_contiguous(NULL, nr_pages,
- pool_size_order, false);
- else
- page = alloc_pages(dma_atomic_pool_gfp(), pool_size_order);
- if (!page)
- goto out;
-
- arch_dma_prep_coherent(page, atomic_pool_size);
-
- atomic_pool = gen_pool_create(PAGE_SHIFT, -1);
- if (!atomic_pool)
- goto free_page;
-
- addr = dma_common_contiguous_remap(page, atomic_pool_size,
- pgprot_dmacoherent(PAGE_KERNEL),
- __builtin_return_address(0));
- if (!addr)
- goto destroy_genpool;
-
- ret = gen_pool_add_virt(atomic_pool, (unsigned long)addr,
- page_to_phys(page), atomic_pool_size, -1);
- if (ret)
- goto remove_mapping;
- gen_pool_set_algo(atomic_pool, gen_pool_first_fit_order_align, NULL);
-
- pr_info("DMA: preallocated %zu KiB pool for atomic allocations\n",
- atomic_pool_size / 1024);
- return 0;
-
-remove_mapping:
- dma_common_free_remap(addr, atomic_pool_size);
-destroy_genpool:
- gen_pool_destroy(atomic_pool);
- atomic_pool = NULL;
-free_page:
- if (!dma_release_from_contiguous(NULL, page, nr_pages))
- __free_pages(page, pool_size_order);
-out:
- pr_err("DMA: failed to allocate %zu KiB pool for atomic coherent allocation\n",
- atomic_pool_size / 1024);
- return -ENOMEM;
-}
-postcore_initcall(dma_atomic_pool_init);
-
-bool dma_in_atomic_pool(void *start, size_t size)
-{
- if (unlikely(!atomic_pool))
- return false;
-
- return gen_pool_has_addr(atomic_pool, (unsigned long)start, size);
-}
-
-void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags)
-{
- unsigned long val;
- void *ptr = NULL;
-
- if (!atomic_pool) {
- WARN(1, "coherent pool not initialised!\n");
- return NULL;
- }
-
- val = gen_pool_alloc(atomic_pool, size);
- if (val) {
- phys_addr_t phys = gen_pool_virt_to_phys(atomic_pool, val);
-
- *ret_page = pfn_to_page(__phys_to_pfn(phys));
- ptr = (void *)val;
- memset(ptr, 0, size);
- }
-
- return ptr;
-}
-
-bool dma_free_from_pool(void *start, size_t size)
-{
- if (!dma_in_atomic_pool(start, size))
- return false;
- gen_pool_free(atomic_pool, (unsigned long)start, size);
- return true;
-}
-#endif /* CONFIG_DMA_DIRECT_REMAP */
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index b1991043b7d8..334d48b16c36 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -236,7 +236,7 @@ exit_put:
* sysctl_perf_event_max_contexts_per_stack.
*/
int perf_event_max_stack_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int *value = table->data;
int new_value = *value, ret;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e296c5c59c6f..856d98c36f56 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -442,8 +442,7 @@ static void update_perf_cpu_limits(void)
static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
int perf_proc_update_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
int perf_cpu = sysctl_perf_cpu_time_max_percent;
@@ -467,8 +466,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
@@ -1318,7 +1316,7 @@ static void put_ctx(struct perf_event_context *ctx)
* perf_event::child_mutex;
* perf_event_context::lock
* perf_event::mmap_mutex
- * mmap_sem
+ * mmap_lock
* perf_addr_filters_head::lock
*
* cpu_hotplug_lock
@@ -3082,7 +3080,7 @@ static int perf_event_stop(struct perf_event *event, int restart)
* pre-existing mappings, called once when new filters arrive via SET_FILTER
* ioctl;
* (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
- * registered mapping, called for every new mmap(), with mm::mmap_sem down
+ * registered mapping, called for every new mmap(), with mm::mmap_lock down
* for reading;
* (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
* of exec.
@@ -6936,12 +6934,12 @@ static u64 perf_virt_to_phys(u64 virt)
* Walking the pages tables for user address.
* Interrupts are disabled, so it prevents any tear down
* of the page tables.
- * Try IRQ-safe __get_user_pages_fast first.
+ * Try IRQ-safe get_user_page_fast_only first.
* If failed, leave phys_addr as 0.
*/
if (current->mm != NULL) {
pagefault_disable();
- if (__get_user_pages_fast(virt, 1, 0, &p) == 1)
+ if (get_user_page_fast_only(virt, 0, &p))
phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
pagefault_enable();
}
@@ -9744,7 +9742,7 @@ static void perf_addr_filters_splice(struct perf_event *event,
/*
* Scan through mm's vmas and see if one of them matches the
* @filter; if so, adjust filter's address range.
- * Called with mm::mmap_sem down for reading.
+ * Called with mm::mmap_lock down for reading.
*/
static void perf_addr_filter_apply(struct perf_addr_filter *filter,
struct mm_struct *mm,
@@ -9786,7 +9784,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
if (!mm)
goto restart;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
}
raw_spin_lock_irqsave(&ifh->lock, flags);
@@ -9812,7 +9810,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
raw_spin_unlock_irqrestore(&ifh->lock, flags);
if (ifh->nr_file_filters) {
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
mmput(mm);
}
@@ -12222,7 +12220,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
* When a child task exits, feed back event values to parent events.
*
* Can be called with exec_update_mutex held when called from
- * install_exec_creds().
+ * setup_new_exec().
*/
void perf_event_exit_task(struct task_struct *child)
{
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 3cc8416ec844..b48d7039a015 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -213,6 +213,15 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
list_del(&bp->hw.bp_list);
}
+__weak int arch_reserve_bp_slot(struct perf_event *bp)
+{
+ return 0;
+}
+
+__weak void arch_release_bp_slot(struct perf_event *bp)
+{
+}
+
/*
* Function to perform processor-specific cleanup during unregistration
*/
@@ -270,6 +279,7 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type)
struct bp_busy_slots slots = {0};
enum bp_type_idx type;
int weight;
+ int ret;
/* We couldn't initialize breakpoint constraints on boot */
if (!constraints_initialized)
@@ -294,6 +304,10 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type)
if (slots.pinned + (!!slots.flexible) > nr_slots[type])
return -ENOSPC;
+ ret = arch_reserve_bp_slot(bp);
+ if (ret)
+ return ret;
+
toggle_bp_slot(bp, true, type, weight);
return 0;
@@ -317,6 +331,8 @@ static void __release_bp_slot(struct perf_event *bp, u64 bp_type)
enum bp_type_idx type;
int weight;
+ arch_release_bp_slot(bp);
+
type = find_slot_idx(bp_type);
weight = hw_breakpoint_weight(bp);
toggle_bp_slot(bp, false, type, weight);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index ece7e13f6e4a..bb0862873dba 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -162,14 +162,12 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
};
int err;
struct mmu_notifier_range range;
- struct mem_cgroup *memcg;
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
addr + PAGE_SIZE);
if (new_page) {
- err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL,
- &memcg, false);
+ err = mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL);
if (err)
return err;
}
@@ -179,17 +177,13 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
mmu_notifier_invalidate_range_start(&range);
err = -EAGAIN;
- if (!page_vma_mapped_walk(&pvmw)) {
- if (new_page)
- mem_cgroup_cancel_charge(new_page, memcg, false);
+ if (!page_vma_mapped_walk(&pvmw))
goto unlock;
- }
VM_BUG_ON_PAGE(addr != pvmw.address, old_page);
if (new_page) {
get_page(new_page);
page_add_new_anon_rmap(new_page, vma, addr, false);
- mem_cgroup_commit_charge(new_page, memcg, false, false);
lru_cache_add_active_or_unevictable(new_page, vma);
} else
/* no new page, just dec_mm_counter for old_page */
@@ -463,7 +457,7 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
* @vaddr: the virtual address to store the opcode.
* @opcode: opcode to be written at @vaddr.
*
- * Called with mm->mmap_sem held for write.
+ * Called with mm->mmap_lock held for write.
* Return 0 (success) or a negative errno.
*/
int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
@@ -867,10 +861,6 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
if (ret)
goto out;
- /* uprobe_write_opcode() assumes we don't cross page boundary */
- BUG_ON((uprobe->offset & ~PAGE_MASK) +
- UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
-
smp_wmb(); /* pairs with the smp_rmb() in handle_swbp() */
set_bit(UPROBE_COPY_INSN, &uprobe->flags);
@@ -1064,7 +1054,7 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
if (err && is_register)
goto free;
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
vma = find_vma(mm, info->vaddr);
if (!vma || !valid_vma(vma, is_register) ||
file_inode(vma->vm_file) != uprobe->inode)
@@ -1086,7 +1076,7 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
}
unlock:
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
free:
mmput(mm);
info = free_map_info(info);
@@ -1166,6 +1156,15 @@ static int __uprobe_register(struct inode *inode, loff_t offset,
if (offset > i_size_read(inode))
return -EINVAL;
+ /*
+ * This ensures that copy_from_page(), copy_to_page() and
+ * __update_ref_ctr() can't cross page boundary.
+ */
+ if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE))
+ return -EINVAL;
+ if (!IS_ALIGNED(ref_ctr_offset, sizeof(short)))
+ return -EINVAL;
+
retry:
uprobe = alloc_uprobe(inode, offset, ref_ctr_offset);
if (!uprobe)
@@ -1241,7 +1240,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
struct vm_area_struct *vma;
int err = 0;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
unsigned long vaddr;
loff_t offset;
@@ -1258,7 +1257,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
vaddr = offset_to_vaddr(vma, uprobe->offset);
err |= remove_breakpoint(uprobe, mm, vaddr);
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return err;
}
@@ -1355,7 +1354,7 @@ static int delayed_ref_ctr_inc(struct vm_area_struct *vma)
}
/*
- * Called from mmap_region/vma_adjust with mm->mmap_sem acquired.
+ * Called from mmap_region/vma_adjust with mm->mmap_lock acquired.
*
* Currently we ignore all errors and always return 0, the callers
* can't handle the failure anyway.
@@ -1445,7 +1444,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
struct vm_area_struct *vma;
int ret;
- if (down_write_killable(&mm->mmap_sem))
+ if (mmap_write_lock_killable(mm))
return -EINTR;
if (mm->uprobes_state.xol_area) {
@@ -1475,7 +1474,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
/* pairs with get_xol_area() */
smp_store_release(&mm->uprobes_state.xol_area, area); /* ^^^ */
fail:
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
return ret;
}
@@ -1674,7 +1673,7 @@ void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
copy_to_page(page, vaddr, src, len);
/*
- * We probably need flush_icache_user_range() but it needs vma.
+ * We probably need flush_icache_user_page() but it needs vma.
* This should work on most of architectures by default. If
* architecture needs to do something different it can define
* its own version of the function.
@@ -2014,6 +2013,9 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
uprobe_opcode_t opcode;
int result;
+ if (WARN_ON_ONCE(!IS_ALIGNED(vaddr, UPROBE_SWBP_INSN_SIZE)))
+ return -EINVAL;
+
pagefault_disable();
result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr);
pagefault_enable();
@@ -2045,7 +2047,7 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
struct uprobe *uprobe = NULL;
struct vm_area_struct *vma;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
vma = find_vma(mm, bp_vaddr);
if (vma && vma->vm_start <= bp_vaddr) {
if (valid_vma(vma, false)) {
@@ -2063,7 +2065,7 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags))
mmf_recalc_uprobes(mm);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return uprobe;
}
diff --git a/kernel/exit.c b/kernel/exit.c
index ce2a75bc0ade..727150f28103 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -66,7 +66,6 @@
#include <linux/uaccess.h>
#include <asm/unistd.h>
-#include <asm/pgtable.h>
#include <asm/mmu_context.h>
static void __unhash_process(struct task_struct *p, bool group_dead)
@@ -228,8 +227,9 @@ repeat:
goto repeat;
}
-void rcuwait_wake_up(struct rcuwait *w)
+int rcuwait_wake_up(struct rcuwait *w)
{
+ int ret = 0;
struct task_struct *task;
rcu_read_lock();
@@ -237,7 +237,7 @@ void rcuwait_wake_up(struct rcuwait *w)
/*
* Order condition vs @task, such that everything prior to the load
* of @task is visible. This is the condition as to why the user called
- * rcuwait_trywake() in the first place. Pairs with set_current_state()
+ * rcuwait_wake() in the first place. Pairs with set_current_state()
* barrier (A) in rcuwait_wait_event().
*
* WAIT WAKE
@@ -249,8 +249,10 @@ void rcuwait_wake_up(struct rcuwait *w)
task = rcu_dereference(w->task);
if (task)
- wake_up_process(task);
+ ret = wake_up_process(task);
rcu_read_unlock();
+
+ return ret;
}
EXPORT_SYMBOL_GPL(rcuwait_wake_up);
@@ -438,17 +440,17 @@ static void exit_mm(void)
sync_mm_rss(mm);
/*
* Serialize with any possible pending coredump.
- * We must hold mmap_sem around checking core_state
+ * We must hold mmap_lock around checking core_state
* and clearing tsk->mm. The core-inducing thread
* will increment ->nr_threads for each thread in the
* group with ->mm != NULL.
*/
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
core_state = mm->core_state;
if (core_state) {
struct core_thread self;
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
self.task = current;
self.next = xchg(&core_state->dumper.next, &self);
@@ -466,14 +468,14 @@ static void exit_mm(void)
freezable_schedule();
}
__set_current_state(TASK_RUNNING);
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
}
mmgrab(mm);
BUG_ON(mm != current->active_mm);
/* more a memory barrier than a real lock */
task_lock(current);
current->mm = NULL;
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
enter_lazy_tlb(mm, current);
task_unlock(current);
mm_update_next_owner(mm);
@@ -708,8 +710,12 @@ void __noreturn do_exit(long code)
struct task_struct *tsk = current;
int group_dead;
- profile_task_exit(tsk);
- kcov_task_exit(tsk);
+ /*
+ * We can get here from a kernel oops, sometimes with preemption off.
+ * Start by checking for critical errors.
+ * Then fix up important state like USER_DS and preemption.
+ * Then do everything else.
+ */
WARN_ON(blk_needs_flush_plug(tsk));
@@ -727,6 +733,16 @@ void __noreturn do_exit(long code)
*/
set_fs(USER_DS);
+ if (unlikely(in_atomic())) {
+ pr_info("note: %s[%d] exited with preempt_count %d\n",
+ current->comm, task_pid_nr(current),
+ preempt_count());
+ preempt_count_set(PREEMPT_ENABLED);
+ }
+
+ profile_task_exit(tsk);
+ kcov_task_exit(tsk);
+
ptrace_event(PTRACE_EVENT_EXIT, code);
validate_creds_for_do_exit(tsk);
@@ -744,13 +760,6 @@ void __noreturn do_exit(long code)
exit_signals(tsk); /* sets PF_EXITING */
- if (unlikely(in_atomic())) {
- pr_info("note: %s[%d] exited with preempt_count %d\n",
- current->comm, task_pid_nr(current),
- preempt_count());
- preempt_count_set(PREEMPT_ENABLED);
- }
-
/* sync mm's RSS info before statistics gathering */
if (tsk->mm)
sync_mm_rss(tsk->mm);
@@ -1558,7 +1567,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
if (!infop)
return err;
- if (!user_access_begin(infop, sizeof(*infop)))
+ if (!user_write_access_begin(infop, sizeof(*infop)))
return -EFAULT;
unsafe_put_user(signo, &infop->si_signo, Efault);
@@ -1567,10 +1576,10 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
unsafe_put_user(info.pid, &infop->si_pid, Efault);
unsafe_put_user(info.uid, &infop->si_uid, Efault);
unsafe_put_user(info.status, &infop->si_status, Efault);
- user_access_end();
+ user_write_access_end();
return err;
Efault:
- user_access_end();
+ user_write_access_end();
return -EFAULT;
}
@@ -1685,7 +1694,7 @@ COMPAT_SYSCALL_DEFINE5(waitid,
if (!infop)
return err;
- if (!user_access_begin(infop, sizeof(*infop)))
+ if (!user_write_access_begin(infop, sizeof(*infop)))
return -EFAULT;
unsafe_put_user(signo, &infop->si_signo, Efault);
@@ -1694,10 +1703,10 @@ COMPAT_SYSCALL_DEFINE5(waitid,
unsafe_put_user(info.pid, &infop->si_pid, Efault);
unsafe_put_user(info.uid, &infop->si_uid, Efault);
unsafe_put_user(info.status, &infop->si_status, Efault);
- user_access_end();
+ user_write_access_end();
return err;
Efault:
- user_access_end();
+ user_write_access_end();
return -EFAULT;
}
#endif
diff --git a/kernel/fork.c b/kernel/fork.c
index c40478e749a7..efc5493203ae 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -94,8 +94,8 @@
#include <linux/thread_info.h>
#include <linux/stackleak.h>
#include <linux/kasan.h>
+#include <linux/scs.h>
-#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
@@ -456,6 +456,8 @@ void put_task_stack(struct task_struct *tsk)
void free_task(struct task_struct *tsk)
{
+ scs_release(tsk);
+
#ifndef CONFIG_THREAD_INFO_IN_TASK
/*
* The task is finally done with both the stack and thread_info,
@@ -490,7 +492,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
LIST_HEAD(uf);
uprobe_start_dup_mmap();
- if (down_write_killable(&oldmm->mmap_sem)) {
+ if (mmap_write_lock_killable(oldmm)) {
retval = -EINTR;
goto fail_uprobe_end;
}
@@ -499,7 +501,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
/*
* Not linked in yet - no deadlock potential:
*/
- down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
+ mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);
/* No ordering required: file already has been exposed. */
RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
@@ -615,9 +617,9 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
/* a new mm has just been created */
retval = arch_dup_mmap(oldmm, mm);
out:
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
flush_tlb_mm(oldmm);
- up_write(&oldmm->mmap_sem);
+ mmap_write_unlock(oldmm);
dup_userfaultfd_complete(&uf);
fail_uprobe_end:
uprobe_end_dup_mmap();
@@ -647,9 +649,9 @@ static inline void mm_free_pgd(struct mm_struct *mm)
#else
static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
- down_write(&oldmm->mmap_sem);
+ mmap_write_lock(oldmm);
RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
- up_write(&oldmm->mmap_sem);
+ mmap_write_unlock(oldmm);
return 0;
}
#define mm_alloc_pgd(mm) (0)
@@ -840,6 +842,8 @@ void __init fork_init(void)
NULL, free_vm_stack_cache);
#endif
+ scs_init();
+
lockdep_init_task(&init_task);
uprobes_init();
}
@@ -899,6 +903,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
if (err)
goto free_stack;
+ err = scs_prepare(tsk, node);
+ if (err)
+ goto free_stack;
+
#ifdef CONFIG_SECCOMP
/*
* We must handle setting up seccomp filters once we're under
@@ -1014,7 +1022,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm->vmacache_seqnum = 0;
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
- init_rwsem(&mm->mmap_sem);
+ mmap_init_lock(mm);
INIT_LIST_HEAD(&mm->mmlist);
mm->core_state = NULL;
mm_pgtables_bytes_init(mm);
@@ -1750,7 +1758,7 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
pid_t nr = -1;
if (likely(pid_has_task(pid, PIDTYPE_PID))) {
- ns = proc_pid_ns(file_inode(m->file));
+ ns = proc_pid_ns(file_inode(m->file)->i_sb);
nr = pid_nr_ns(pid, ns);
}
@@ -1969,7 +1977,7 @@ static __latent_entropy struct task_struct *copy_process(
* to stop root fork bombs.
*/
retval = -EAGAIN;
- if (nr_threads >= max_threads)
+ if (data_race(nr_threads >= max_threads))
goto bad_fork_cleanup_count;
delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
diff --git a/kernel/futex.c b/kernel/futex.c
index b59532862bc0..e646661f6282 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -486,10 +486,13 @@ static u64 get_inode_sequence_number(struct inode *inode)
* The key words are stored in @key on success.
*
* For shared mappings (when @fshared), the key is:
+ *
* ( inode->i_sequence, page->index, offset_within_page )
+ *
* [ also see get_inode_sequence_number() ]
*
* For private mappings (or when !@fshared), the key is:
+ *
* ( current->mm, address, 0 )
*
* This allows (cross process, where applicable) identification of the futex
@@ -695,10 +698,10 @@ static int fault_in_user_writeable(u32 __user *uaddr)
struct mm_struct *mm = current->mm;
int ret;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
FAULT_FLAG_WRITE, NULL);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return ret < 0 ? ret : 0;
}
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 3941a9c48f83..3110c77230c7 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -6,7 +6,7 @@ config GCOV_KERNEL
depends on DEBUG_FS
select CONSTRUCTORS if !UML
default n
- ---help---
+ help
This option enables gcov-based code profiling (e.g. for code coverage
measurements).
@@ -42,7 +42,7 @@ config GCOV_PROFILE_ALL
depends on GCOV_KERNEL
depends on ARCH_HAS_GCOV_PROFILE_ALL
default n
- ---help---
+ help
This options activates profiling for the entire kernel.
If unsure, say N.
@@ -51,28 +51,4 @@ config GCOV_PROFILE_ALL
larger and run slower. Also be sure to exclude files from profiling
which are not linked to the kernel image to prevent linker errors.
-choice
- prompt "Specify GCOV format"
- depends on GCOV_KERNEL
- depends on CC_IS_GCC
- ---help---
- The gcov format is usually determined by the GCC version, and the
- default is chosen according to your GCC version. However, there are
- exceptions where format changes are integrated in lower-version GCCs.
- In such a case, change this option to adjust the format used in the
- kernel accordingly.
-
-config GCOV_FORMAT_3_4
- bool "GCC 3.4 format"
- depends on GCC_VERSION < 40700
- ---help---
- Select this option to use the format defined by GCC 3.4.
-
-config GCOV_FORMAT_4_7
- bool "GCC 4.7 format"
- ---help---
- Select this option to use the format defined by GCC 4.7.
-
-endchoice
-
endmenu
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index d66a74b0f100..16f8ecc7d882 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -2,6 +2,5 @@
ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
obj-y := base.o fs.o
-obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_base.o gcc_3_4.o
-obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_base.o gcc_4_7.o
+obj-$(CONFIG_CC_IS_GCC) += gcc_base.o gcc_4_7.o
obj-$(CONFIG_CC_IS_CLANG) += clang.o
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
deleted file mode 100644
index acb83558e5df..000000000000
--- a/kernel/gcov/gcc_3_4.c
+++ /dev/null
@@ -1,573 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * This code provides functions to handle gcc's profiling data format
- * introduced with gcc 3.4. Future versions of gcc may change the gcov
- * format (as happened before), so all format-specific information needs
- * to be kept modular and easily exchangeable.
- *
- * This file is based on gcc-internal definitions. Functions and data
- * structures are defined to be compatible with gcc counterparts.
- * For a better understanding, refer to gcc source: gcc/gcov-io.h.
- *
- * Copyright IBM Corp. 2009
- * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
- *
- * Uses gcc-internal data definitions.
- */
-
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/seq_file.h>
-#include <linux/vmalloc.h>
-#include "gcov.h"
-
-#define GCOV_COUNTERS 5
-
-static struct gcov_info *gcov_info_head;
-
-/**
- * struct gcov_fn_info - profiling meta data per function
- * @ident: object file-unique function identifier
- * @checksum: function checksum
- * @n_ctrs: number of values per counter type belonging to this function
- *
- * This data is generated by gcc during compilation and doesn't change
- * at run-time.
- */
-struct gcov_fn_info {
- unsigned int ident;
- unsigned int checksum;
- unsigned int n_ctrs[];
-};
-
-/**
- * struct gcov_ctr_info - profiling data per counter type
- * @num: number of counter values for this type
- * @values: array of counter values for this type
- * @merge: merge function for counter values of this type (unused)
- *
- * This data is generated by gcc during compilation and doesn't change
- * at run-time with the exception of the values array.
- */
-struct gcov_ctr_info {
- unsigned int num;
- gcov_type *values;
- void (*merge)(gcov_type *, unsigned int);
-};
-
-/**
- * struct gcov_info - profiling data per object file
- * @version: gcov version magic indicating the gcc version used for compilation
- * @next: list head for a singly-linked list
- * @stamp: time stamp
- * @filename: name of the associated gcov data file
- * @n_functions: number of instrumented functions
- * @functions: function data
- * @ctr_mask: mask specifying which counter types are active
- * @counts: counter data per counter type
- *
- * This data is generated by gcc during compilation and doesn't change
- * at run-time with the exception of the next pointer.
- */
-struct gcov_info {
- unsigned int version;
- struct gcov_info *next;
- unsigned int stamp;
- const char *filename;
- unsigned int n_functions;
- const struct gcov_fn_info *functions;
- unsigned int ctr_mask;
- struct gcov_ctr_info counts[];
-};
-
-/**
- * gcov_info_filename - return info filename
- * @info: profiling data set
- */
-const char *gcov_info_filename(struct gcov_info *info)
-{
- return info->filename;
-}
-
-/**
- * gcov_info_version - return info version
- * @info: profiling data set
- */
-unsigned int gcov_info_version(struct gcov_info *info)
-{
- return info->version;
-}
-
-/**
- * gcov_info_next - return next profiling data set
- * @info: profiling data set
- *
- * Returns next gcov_info following @info or first gcov_info in the chain if
- * @info is %NULL.
- */
-struct gcov_info *gcov_info_next(struct gcov_info *info)
-{
- if (!info)
- return gcov_info_head;
-
- return info->next;
-}
-
-/**
- * gcov_info_link - link/add profiling data set to the list
- * @info: profiling data set
- */
-void gcov_info_link(struct gcov_info *info)
-{
- info->next = gcov_info_head;
- gcov_info_head = info;
-}
-
-/**
- * gcov_info_unlink - unlink/remove profiling data set from the list
- * @prev: previous profiling data set
- * @info: profiling data set
- */
-void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
-{
- if (prev)
- prev->next = info->next;
- else
- gcov_info_head = info->next;
-}
-
-/**
- * gcov_info_within_module - check if a profiling data set belongs to a module
- * @info: profiling data set
- * @mod: module
- *
- * Returns true if profiling data belongs module, false otherwise.
- */
-bool gcov_info_within_module(struct gcov_info *info, struct module *mod)
-{
- return within_module((unsigned long)info, mod);
-}
-
-/* Symbolic links to be created for each profiling data file. */
-const struct gcov_link gcov_link[] = {
- { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
- { 0, NULL},
-};
-
-/*
- * Determine whether a counter is active. Based on gcc magic. Doesn't change
- * at run-time.
- */
-static int counter_active(struct gcov_info *info, unsigned int type)
-{
- return (1 << type) & info->ctr_mask;
-}
-
-/* Determine number of active counters. Based on gcc magic. */
-static unsigned int num_counter_active(struct gcov_info *info)
-{
- unsigned int i;
- unsigned int result = 0;
-
- for (i = 0; i < GCOV_COUNTERS; i++) {
- if (counter_active(info, i))
- result++;
- }
- return result;
-}
-
-/**
- * gcov_info_reset - reset profiling data to zero
- * @info: profiling data set
- */
-void gcov_info_reset(struct gcov_info *info)
-{
- unsigned int active = num_counter_active(info);
- unsigned int i;
-
- for (i = 0; i < active; i++) {
- memset(info->counts[i].values, 0,
- info->counts[i].num * sizeof(gcov_type));
- }
-}
-
-/**
- * gcov_info_is_compatible - check if profiling data can be added
- * @info1: first profiling data set
- * @info2: second profiling data set
- *
- * Returns non-zero if profiling data can be added, zero otherwise.
- */
-int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
-{
- return (info1->stamp == info2->stamp);
-}
-
-/**
- * gcov_info_add - add up profiling data
- * @dest: profiling data set to which data is added
- * @source: profiling data set which is added
- *
- * Adds profiling counts of @source to @dest.
- */
-void gcov_info_add(struct gcov_info *dest, struct gcov_info *source)
-{
- unsigned int i;
- unsigned int j;
-
- for (i = 0; i < num_counter_active(dest); i++) {
- for (j = 0; j < dest->counts[i].num; j++) {
- dest->counts[i].values[j] +=
- source->counts[i].values[j];
- }
- }
-}
-
-/* Get size of function info entry. Based on gcc magic. */
-static size_t get_fn_size(struct gcov_info *info)
-{
- size_t size;
-
- size = sizeof(struct gcov_fn_info) + num_counter_active(info) *
- sizeof(unsigned int);
- if (__alignof__(struct gcov_fn_info) > sizeof(unsigned int))
- size = ALIGN(size, __alignof__(struct gcov_fn_info));
- return size;
-}
-
-/* Get address of function info entry. Based on gcc magic. */
-static struct gcov_fn_info *get_fn_info(struct gcov_info *info, unsigned int fn)
-{
- return (struct gcov_fn_info *)
- ((char *) info->functions + fn * get_fn_size(info));
-}
-
-/**
- * gcov_info_dup - duplicate profiling data set
- * @info: profiling data set to duplicate
- *
- * Return newly allocated duplicate on success, %NULL on error.
- */
-struct gcov_info *gcov_info_dup(struct gcov_info *info)
-{
- struct gcov_info *dup;
- unsigned int i;
- unsigned int active;
-
- /* Duplicate gcov_info. */
- active = num_counter_active(info);
- dup = kzalloc(struct_size(dup, counts, active), GFP_KERNEL);
- if (!dup)
- return NULL;
- dup->version = info->version;
- dup->stamp = info->stamp;
- dup->n_functions = info->n_functions;
- dup->ctr_mask = info->ctr_mask;
- /* Duplicate filename. */
- dup->filename = kstrdup(info->filename, GFP_KERNEL);
- if (!dup->filename)
- goto err_free;
- /* Duplicate table of functions. */
- dup->functions = kmemdup(info->functions, info->n_functions *
- get_fn_size(info), GFP_KERNEL);
- if (!dup->functions)
- goto err_free;
- /* Duplicate counter arrays. */
- for (i = 0; i < active ; i++) {
- struct gcov_ctr_info *ctr = &info->counts[i];
- size_t size = ctr->num * sizeof(gcov_type);
-
- dup->counts[i].num = ctr->num;
- dup->counts[i].merge = ctr->merge;
- dup->counts[i].values = vmalloc(size);
- if (!dup->counts[i].values)
- goto err_free;
- memcpy(dup->counts[i].values, ctr->values, size);
- }
- return dup;
-
-err_free:
- gcov_info_free(dup);
- return NULL;
-}
-
-/**
- * gcov_info_free - release memory for profiling data set duplicate
- * @info: profiling data set duplicate to free
- */
-void gcov_info_free(struct gcov_info *info)
-{
- unsigned int active = num_counter_active(info);
- unsigned int i;
-
- for (i = 0; i < active ; i++)
- vfree(info->counts[i].values);
- kfree(info->functions);
- kfree(info->filename);
- kfree(info);
-}
-
-/**
- * struct type_info - iterator helper array
- * @ctr_type: counter type
- * @offset: index of the first value of the current function for this type
- *
- * This array is needed to convert the in-memory data format into the in-file
- * data format:
- *
- * In-memory:
- * for each counter type
- * for each function
- * values
- *
- * In-file:
- * for each function
- * for each counter type
- * values
- *
- * See gcc source gcc/gcov-io.h for more information on data organization.
- */
-struct type_info {
- int ctr_type;
- unsigned int offset;
-};
-
-/**
- * struct gcov_iterator - specifies current file position in logical records
- * @info: associated profiling data
- * @record: record type
- * @function: function number
- * @type: counter type
- * @count: index into values array
- * @num_types: number of counter types
- * @type_info: helper array to get values-array offset for current function
- */
-struct gcov_iterator {
- struct gcov_info *info;
-
- int record;
- unsigned int function;
- unsigned int type;
- unsigned int count;
-
- int num_types;
- struct type_info type_info[];
-};
-
-static struct gcov_fn_info *get_func(struct gcov_iterator *iter)
-{
- return get_fn_info(iter->info, iter->function);
-}
-
-static struct type_info *get_type(struct gcov_iterator *iter)
-{
- return &iter->type_info[iter->type];
-}
-
-/**
- * gcov_iter_new - allocate and initialize profiling data iterator
- * @info: profiling data set to be iterated
- *
- * Return file iterator on success, %NULL otherwise.
- */
-struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
-{
- struct gcov_iterator *iter;
-
- iter = kzalloc(struct_size(iter, type_info, num_counter_active(info)),
- GFP_KERNEL);
- if (iter)
- iter->info = info;
-
- return iter;
-}
-
-/**
- * gcov_iter_free - release memory for iterator
- * @iter: file iterator to free
- */
-void gcov_iter_free(struct gcov_iterator *iter)
-{
- kfree(iter);
-}
-
-/**
- * gcov_iter_get_info - return profiling data set for given file iterator
- * @iter: file iterator
- */
-struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
-{
- return iter->info;
-}
-
-/**
- * gcov_iter_start - reset file iterator to starting position
- * @iter: file iterator
- */
-void gcov_iter_start(struct gcov_iterator *iter)
-{
- int i;
-
- iter->record = 0;
- iter->function = 0;
- iter->type = 0;
- iter->count = 0;
- iter->num_types = 0;
- for (i = 0; i < GCOV_COUNTERS; i++) {
- if (counter_active(iter->info, i)) {
- iter->type_info[iter->num_types].ctr_type = i;
- iter->type_info[iter->num_types++].offset = 0;
- }
- }
-}
-
-/* Mapping of logical record number to actual file content. */
-#define RECORD_FILE_MAGIC 0
-#define RECORD_GCOV_VERSION 1
-#define RECORD_TIME_STAMP 2
-#define RECORD_FUNCTION_TAG 3
-#define RECORD_FUNCTON_TAG_LEN 4
-#define RECORD_FUNCTION_IDENT 5
-#define RECORD_FUNCTION_CHECK 6
-#define RECORD_COUNT_TAG 7
-#define RECORD_COUNT_LEN 8
-#define RECORD_COUNT 9
-
-/**
- * gcov_iter_next - advance file iterator to next logical record
- * @iter: file iterator
- *
- * Return zero if new position is valid, non-zero if iterator has reached end.
- */
-int gcov_iter_next(struct gcov_iterator *iter)
-{
- switch (iter->record) {
- case RECORD_FILE_MAGIC:
- case RECORD_GCOV_VERSION:
- case RECORD_FUNCTION_TAG:
- case RECORD_FUNCTON_TAG_LEN:
- case RECORD_FUNCTION_IDENT:
- case RECORD_COUNT_TAG:
- /* Advance to next record */
- iter->record++;
- break;
- case RECORD_COUNT:
- /* Advance to next count */
- iter->count++;
- /* fall through */
- case RECORD_COUNT_LEN:
- if (iter->count < get_func(iter)->n_ctrs[iter->type]) {
- iter->record = 9;
- break;
- }
- /* Advance to next counter type */
- get_type(iter)->offset += iter->count;
- iter->count = 0;
- iter->type++;
- /* fall through */
- case RECORD_FUNCTION_CHECK:
- if (iter->type < iter->num_types) {
- iter->record = 7;
- break;
- }
- /* Advance to next function */
- iter->type = 0;
- iter->function++;
- /* fall through */
- case RECORD_TIME_STAMP:
- if (iter->function < iter->info->n_functions)
- iter->record = 3;
- else
- iter->record = -1;
- break;
- }
- /* Check for EOF. */
- if (iter->record == -1)
- return -EINVAL;
- else
- return 0;
-}
-
-/**
- * seq_write_gcov_u32 - write 32 bit number in gcov format to seq_file
- * @seq: seq_file handle
- * @v: value to be stored
- *
- * Number format defined by gcc: numbers are recorded in the 32 bit
- * unsigned binary form of the endianness of the machine generating the
- * file.
- */
-static int seq_write_gcov_u32(struct seq_file *seq, u32 v)
-{
- return seq_write(seq, &v, sizeof(v));
-}
-
-/**
- * seq_write_gcov_u64 - write 64 bit number in gcov format to seq_file
- * @seq: seq_file handle
- * @v: value to be stored
- *
- * Number format defined by gcc: numbers are recorded in the 32 bit
- * unsigned binary form of the endianness of the machine generating the
- * file. 64 bit numbers are stored as two 32 bit numbers, the low part
- * first.
- */
-static int seq_write_gcov_u64(struct seq_file *seq, u64 v)
-{
- u32 data[2];
-
- data[0] = (v & 0xffffffffUL);
- data[1] = (v >> 32);
- return seq_write(seq, data, sizeof(data));
-}
-
-/**
- * gcov_iter_write - write data for current pos to seq_file
- * @iter: file iterator
- * @seq: seq_file handle
- *
- * Return zero on success, non-zero otherwise.
- */
-int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
-{
- int rc = -EINVAL;
-
- switch (iter->record) {
- case RECORD_FILE_MAGIC:
- rc = seq_write_gcov_u32(seq, GCOV_DATA_MAGIC);
- break;
- case RECORD_GCOV_VERSION:
- rc = seq_write_gcov_u32(seq, iter->info->version);
- break;
- case RECORD_TIME_STAMP:
- rc = seq_write_gcov_u32(seq, iter->info->stamp);
- break;
- case RECORD_FUNCTION_TAG:
- rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION);
- break;
- case RECORD_FUNCTON_TAG_LEN:
- rc = seq_write_gcov_u32(seq, 2);
- break;
- case RECORD_FUNCTION_IDENT:
- rc = seq_write_gcov_u32(seq, get_func(iter)->ident);
- break;
- case RECORD_FUNCTION_CHECK:
- rc = seq_write_gcov_u32(seq, get_func(iter)->checksum);
- break;
- case RECORD_COUNT_TAG:
- rc = seq_write_gcov_u32(seq,
- GCOV_TAG_FOR_COUNTER(get_type(iter)->ctr_type));
- break;
- case RECORD_COUNT_LEN:
- rc = seq_write_gcov_u32(seq,
- get_func(iter)->n_ctrs[iter->type] * 2);
- break;
- case RECORD_COUNT:
- rc = seq_write_gcov_u64(seq,
- iter->info->counts[iter->type].
- values[iter->count + get_type(iter)->offset]);
- break;
- }
- return rc;
-}
diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh
index e13ca842eb7e..c1510f0ab3ea 100755
--- a/kernel/gen_kheaders.sh
+++ b/kernel/gen_kheaders.sh
@@ -88,7 +88,7 @@ find $cpio_dir -type f -print0 |
find $cpio_dir -printf "./%P\n" | LC_ALL=C sort | \
tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \
--owner=0 --group=0 --numeric-owner --no-recursion \
- -Jcf $tarfile -C $cpio_dir/ -T - > /dev/null
+ -I $XZ -cf $tarfile -C $cpio_dir/ -T - > /dev/null
echo $headers_md5 > kernel/kheaders.md5
echo "$this_file_md5" >> kernel/kheaders.md5
diff --git a/kernel/groups.c b/kernel/groups.c
index daae2f2dc6d4..6ee6691f6839 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -20,7 +20,7 @@ struct group_info *groups_alloc(int gidsetsize)
len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize;
gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY);
if (!gi)
- gi = __vmalloc(len, GFP_KERNEL_ACCOUNT, PAGE_KERNEL);
+ gi = __vmalloc(len, GFP_KERNEL_ACCOUNT);
if (!gi)
return NULL;
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 14a625c16cb3..ce76f490126c 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -53,9 +53,18 @@ int __read_mostly sysctl_hung_task_warnings = 10;
static int __read_mostly did_panic;
static bool hung_task_show_lock;
static bool hung_task_call_panic;
+static bool hung_task_show_all_bt;
static struct task_struct *watchdog_task;
+#ifdef CONFIG_SMP
+/*
+ * Should we dump all CPUs backtraces in a hung task event?
+ * Defaults to 0, can be changed via sysctl.
+ */
+unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace;
+#endif /* CONFIG_SMP */
+
/*
* Should we panic (and reboot, if panic_timeout= is set) when a
* hung task is detected:
@@ -63,16 +72,6 @@ static struct task_struct *watchdog_task;
unsigned int __read_mostly sysctl_hung_task_panic =
CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE;
-static int __init hung_task_panic_setup(char *str)
-{
- int rc = kstrtouint(str, 0, &sysctl_hung_task_panic);
-
- if (rc)
- return rc;
- return 1;
-}
-__setup("hung_task_panic=", hung_task_panic_setup);
-
static int
hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr)
{
@@ -137,6 +136,9 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
" disables this message.\n");
sched_show_task(t);
hung_task_show_lock = true;
+
+ if (sysctl_hung_task_all_cpu_backtrace)
+ hung_task_show_all_bt = true;
}
touch_nmi_watchdog();
@@ -201,10 +203,14 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
rcu_read_unlock();
if (hung_task_show_lock)
debug_show_all_locks();
- if (hung_task_call_panic) {
+
+ if (hung_task_show_all_bt) {
+ hung_task_show_all_bt = false;
trigger_all_cpu_backtrace();
- panic("hung_task: blocked tasks");
}
+
+ if (hung_task_call_panic)
+ panic("hung_task: blocked tasks");
}
static long hung_timeout_jiffies(unsigned long last_checked,
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 20d501af4f2e..20512252ecc9 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -72,6 +72,7 @@ config IRQ_DOMAIN
config IRQ_SIM
bool
select IRQ_WORK
+ select IRQ_DOMAIN
# Support for hierarchical irq domains
config IRQ_DOMAIN_HIERARCHY
@@ -117,7 +118,7 @@ config IRQ_FORCED_THREADING
config SPARSE_IRQ
bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ
- ---help---
+ help
Sparse irq numbering is useful for distro kernels that want
to define a high CONFIG_NR_CPUS value but still want to have
@@ -133,7 +134,7 @@ config GENERIC_IRQ_DEBUGFS
depends on DEBUG_FS
select GENERIC_IRQ_INJECTION
default n
- ---help---
+ help
Exposes internal state information through debugfs. Mostly for
developers and debugging of hard to diagnose interrupt problems.
diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c
index b992f88c5613..48006608baf0 100644
--- a/kernel/irq/irq_sim.c
+++ b/kernel/irq/irq_sim.c
@@ -1,14 +1,31 @@
// SPDX-License-Identifier: GPL-2.0+
/*
* Copyright (C) 2017-2018 Bartosz Golaszewski <brgl@bgdev.pl>
+ * Copyright (C) 2020 Bartosz Golaszewski <bgolaszewski@baylibre.com>
*/
-#include <linux/slab.h>
-#include <linux/irq_sim.h>
#include <linux/irq.h>
+#include <linux/irq_sim.h>
+#include <linux/irq_work.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+
+struct irq_sim_work_ctx {
+ struct irq_work work;
+ int irq_base;
+ unsigned int irq_count;
+ unsigned long *pending;
+ struct irq_domain *domain;
+};
+
+struct irq_sim_irq_ctx {
+ int irqnum;
+ bool enabled;
+ struct irq_sim_work_ctx *work_ctx;
+};
struct irq_sim_devres {
- struct irq_sim *sim;
+ struct irq_domain *domain;
};
static void irq_sim_irqmask(struct irq_data *data)
@@ -36,159 +53,205 @@ static int irq_sim_set_type(struct irq_data *data, unsigned int type)
return 0;
}
+static int irq_sim_get_irqchip_state(struct irq_data *data,
+ enum irqchip_irq_state which, bool *state)
+{
+ struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data);
+ irq_hw_number_t hwirq = irqd_to_hwirq(data);
+
+ switch (which) {
+ case IRQCHIP_STATE_PENDING:
+ if (irq_ctx->enabled)
+ *state = test_bit(hwirq, irq_ctx->work_ctx->pending);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int irq_sim_set_irqchip_state(struct irq_data *data,
+ enum irqchip_irq_state which, bool state)
+{
+ struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data);
+ irq_hw_number_t hwirq = irqd_to_hwirq(data);
+
+ switch (which) {
+ case IRQCHIP_STATE_PENDING:
+ if (irq_ctx->enabled) {
+ assign_bit(hwirq, irq_ctx->work_ctx->pending, state);
+ if (state)
+ irq_work_queue(&irq_ctx->work_ctx->work);
+ }
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static struct irq_chip irq_sim_irqchip = {
- .name = "irq_sim",
- .irq_mask = irq_sim_irqmask,
- .irq_unmask = irq_sim_irqunmask,
- .irq_set_type = irq_sim_set_type,
+ .name = "irq_sim",
+ .irq_mask = irq_sim_irqmask,
+ .irq_unmask = irq_sim_irqunmask,
+ .irq_set_type = irq_sim_set_type,
+ .irq_get_irqchip_state = irq_sim_get_irqchip_state,
+ .irq_set_irqchip_state = irq_sim_set_irqchip_state,
};
static void irq_sim_handle_irq(struct irq_work *work)
{
struct irq_sim_work_ctx *work_ctx;
unsigned int offset = 0;
- struct irq_sim *sim;
int irqnum;
work_ctx = container_of(work, struct irq_sim_work_ctx, work);
- sim = container_of(work_ctx, struct irq_sim, work_ctx);
- while (!bitmap_empty(work_ctx->pending, sim->irq_count)) {
+ while (!bitmap_empty(work_ctx->pending, work_ctx->irq_count)) {
offset = find_next_bit(work_ctx->pending,
- sim->irq_count, offset);
+ work_ctx->irq_count, offset);
clear_bit(offset, work_ctx->pending);
- irqnum = irq_sim_irqnum(sim, offset);
+ irqnum = irq_find_mapping(work_ctx->domain, offset);
handle_simple_irq(irq_to_desc(irqnum));
}
}
+static int irq_sim_domain_map(struct irq_domain *domain,
+ unsigned int virq, irq_hw_number_t hw)
+{
+ struct irq_sim_work_ctx *work_ctx = domain->host_data;
+ struct irq_sim_irq_ctx *irq_ctx;
+
+ irq_ctx = kzalloc(sizeof(*irq_ctx), GFP_KERNEL);
+ if (!irq_ctx)
+ return -ENOMEM;
+
+ irq_set_chip(virq, &irq_sim_irqchip);
+ irq_set_chip_data(virq, irq_ctx);
+ irq_set_handler(virq, handle_simple_irq);
+ irq_modify_status(virq, IRQ_NOREQUEST | IRQ_NOAUTOEN, IRQ_NOPROBE);
+ irq_ctx->work_ctx = work_ctx;
+
+ return 0;
+}
+
+static void irq_sim_domain_unmap(struct irq_domain *domain, unsigned int virq)
+{
+ struct irq_sim_irq_ctx *irq_ctx;
+ struct irq_data *irqd;
+
+ irqd = irq_domain_get_irq_data(domain, virq);
+ irq_ctx = irq_data_get_irq_chip_data(irqd);
+
+ irq_set_handler(virq, NULL);
+ irq_domain_reset_irq_data(irqd);
+ kfree(irq_ctx);
+}
+
+static const struct irq_domain_ops irq_sim_domain_ops = {
+ .map = irq_sim_domain_map,
+ .unmap = irq_sim_domain_unmap,
+};
+
/**
- * irq_sim_init - Initialize the interrupt simulator: allocate a range of
- * dummy interrupts.
+ * irq_domain_create_sim - Create a new interrupt simulator irq_domain and
+ * allocate a range of dummy interrupts.
*
- * @sim: The interrupt simulator object to initialize.
- * @num_irqs: Number of interrupts to allocate
+ * @fnode: struct fwnode_handle to be associated with this domain.
+ * @num_irqs: Number of interrupts to allocate.
*
- * On success: return the base of the allocated interrupt range.
- * On failure: a negative errno.
+ * On success: return a new irq_domain object.
+ * On failure: a negative errno wrapped with ERR_PTR().
*/
-int irq_sim_init(struct irq_sim *sim, unsigned int num_irqs)
+struct irq_domain *irq_domain_create_sim(struct fwnode_handle *fwnode,
+ unsigned int num_irqs)
{
- int i;
+ struct irq_sim_work_ctx *work_ctx;
- sim->irqs = kmalloc_array(num_irqs, sizeof(*sim->irqs), GFP_KERNEL);
- if (!sim->irqs)
- return -ENOMEM;
+ work_ctx = kmalloc(sizeof(*work_ctx), GFP_KERNEL);
+ if (!work_ctx)
+ goto err_out;
- sim->irq_base = irq_alloc_descs(-1, 0, num_irqs, 0);
- if (sim->irq_base < 0) {
- kfree(sim->irqs);
- return sim->irq_base;
- }
+ work_ctx->pending = bitmap_zalloc(num_irqs, GFP_KERNEL);
+ if (!work_ctx->pending)
+ goto err_free_work_ctx;
- sim->work_ctx.pending = bitmap_zalloc(num_irqs, GFP_KERNEL);
- if (!sim->work_ctx.pending) {
- kfree(sim->irqs);
- irq_free_descs(sim->irq_base, num_irqs);
- return -ENOMEM;
- }
+ work_ctx->domain = irq_domain_create_linear(fwnode, num_irqs,
+ &irq_sim_domain_ops,
+ work_ctx);
+ if (!work_ctx->domain)
+ goto err_free_bitmap;
- for (i = 0; i < num_irqs; i++) {
- sim->irqs[i].irqnum = sim->irq_base + i;
- sim->irqs[i].enabled = false;
- irq_set_chip(sim->irq_base + i, &irq_sim_irqchip);
- irq_set_chip_data(sim->irq_base + i, &sim->irqs[i]);
- irq_set_handler(sim->irq_base + i, &handle_simple_irq);
- irq_modify_status(sim->irq_base + i,
- IRQ_NOREQUEST | IRQ_NOAUTOEN, IRQ_NOPROBE);
- }
+ work_ctx->irq_count = num_irqs;
+ init_irq_work(&work_ctx->work, irq_sim_handle_irq);
- init_irq_work(&sim->work_ctx.work, irq_sim_handle_irq);
- sim->irq_count = num_irqs;
+ return work_ctx->domain;
- return sim->irq_base;
+err_free_bitmap:
+ bitmap_free(work_ctx->pending);
+err_free_work_ctx:
+ kfree(work_ctx);
+err_out:
+ return ERR_PTR(-ENOMEM);
}
-EXPORT_SYMBOL_GPL(irq_sim_init);
+EXPORT_SYMBOL_GPL(irq_domain_create_sim);
/**
- * irq_sim_fini - Deinitialize the interrupt simulator: free the interrupt
- * descriptors and allocated memory.
+ * irq_domain_remove_sim - Deinitialize the interrupt simulator domain: free
+ * the interrupt descriptors and allocated memory.
*
- * @sim: The interrupt simulator to tear down.
+ * @domain: The interrupt simulator domain to tear down.
*/
-void irq_sim_fini(struct irq_sim *sim)
+void irq_domain_remove_sim(struct irq_domain *domain)
{
- irq_work_sync(&sim->work_ctx.work);
- bitmap_free(sim->work_ctx.pending);
- irq_free_descs(sim->irq_base, sim->irq_count);
- kfree(sim->irqs);
+ struct irq_sim_work_ctx *work_ctx = domain->host_data;
+
+ irq_work_sync(&work_ctx->work);
+ bitmap_free(work_ctx->pending);
+ kfree(work_ctx);
+
+ irq_domain_remove(domain);
}
-EXPORT_SYMBOL_GPL(irq_sim_fini);
+EXPORT_SYMBOL_GPL(irq_domain_remove_sim);
-static void devm_irq_sim_release(struct device *dev, void *res)
+static void devm_irq_domain_release_sim(struct device *dev, void *res)
{
struct irq_sim_devres *this = res;
- irq_sim_fini(this->sim);
+ irq_domain_remove_sim(this->domain);
}
/**
- * irq_sim_init - Initialize the interrupt simulator for a managed device.
+ * devm_irq_domain_create_sim - Create a new interrupt simulator for
+ * a managed device.
*
* @dev: Device to initialize the simulator object for.
- * @sim: The interrupt simulator object to initialize.
+ * @fnode: struct fwnode_handle to be associated with this domain.
* @num_irqs: Number of interrupts to allocate
*
- * On success: return the base of the allocated interrupt range.
- * On failure: a negative errno.
+ * On success: return a new irq_domain object.
+ * On failure: a negative errno wrapped with ERR_PTR().
*/
-int devm_irq_sim_init(struct device *dev, struct irq_sim *sim,
- unsigned int num_irqs)
+struct irq_domain *devm_irq_domain_create_sim(struct device *dev,
+ struct fwnode_handle *fwnode,
+ unsigned int num_irqs)
{
struct irq_sim_devres *dr;
- int rv;
- dr = devres_alloc(devm_irq_sim_release, sizeof(*dr), GFP_KERNEL);
+ dr = devres_alloc(devm_irq_domain_release_sim,
+ sizeof(*dr), GFP_KERNEL);
if (!dr)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
- rv = irq_sim_init(sim, num_irqs);
- if (rv < 0) {
+ dr->domain = irq_domain_create_sim(fwnode, num_irqs);
+ if (IS_ERR(dr->domain)) {
devres_free(dr);
- return rv;
+ return dr->domain;
}
- dr->sim = sim;
devres_add(dev, dr);
-
- return rv;
-}
-EXPORT_SYMBOL_GPL(devm_irq_sim_init);
-
-/**
- * irq_sim_fire - Enqueue an interrupt.
- *
- * @sim: The interrupt simulator object.
- * @offset: Offset of the simulated interrupt which should be fired.
- */
-void irq_sim_fire(struct irq_sim *sim, unsigned int offset)
-{
- if (sim->irqs[offset].enabled) {
- set_bit(offset, sim->work_ctx.pending);
- irq_work_queue(&sim->work_ctx.work);
- }
-}
-EXPORT_SYMBOL_GPL(irq_sim_fire);
-
-/**
- * irq_sim_irqnum - Get the allocated number of a dummy interrupt.
- *
- * @sim: The interrupt simulator object.
- * @offset: Offset of the simulated interrupt for which to retrieve
- * the number.
- */
-int irq_sim_irqnum(struct irq_sim *sim, unsigned int offset)
-{
- return sim->irqs[offset].irqnum;
+ return dr->domain;
}
-EXPORT_SYMBOL_GPL(irq_sim_irqnum);
+EXPORT_SYMBOL_GPL(devm_irq_domain_create_sim);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 35b8d97c3a1d..a4c2c915511d 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -132,14 +132,13 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
const struct irq_domain_ops *ops,
void *host_data)
{
- struct device_node *of_node = to_of_node(fwnode);
struct irqchip_fwid *fwid;
struct irq_domain *domain;
static atomic_t unknown_domains;
domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size),
- GFP_KERNEL, of_node_to_nid(of_node));
+ GFP_KERNEL, of_node_to_nid(to_of_node(fwnode)));
if (!domain)
return NULL;
@@ -162,30 +161,16 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
domain->name = fwid->name;
break;
}
-#ifdef CONFIG_ACPI
- } else if (is_acpi_device_node(fwnode)) {
- struct acpi_buffer buf = {
- .length = ACPI_ALLOCATE_BUFFER,
- };
- acpi_handle handle;
-
- handle = acpi_device_handle(to_acpi_device_node(fwnode));
- if (acpi_get_name(handle, ACPI_FULL_PATHNAME, &buf) == AE_OK) {
- domain->name = buf.pointer;
- domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
- }
-
- domain->fwnode = fwnode;
-#endif
- } else if (of_node) {
+ } else if (is_of_node(fwnode) || is_acpi_device_node(fwnode) ||
+ is_software_node(fwnode)) {
char *name;
/*
- * DT paths contain '/', which debugfs is legitimately
+ * fwnode paths contain '/', which debugfs is legitimately
* unhappy about. Replace them with ':', which does
* the trick and is not as offensive as '\'...
*/
- name = kasprintf(GFP_KERNEL, "%pOF", of_node);
+ name = kasprintf(GFP_KERNEL, "%pfw", fwnode);
if (!name) {
kfree(domain);
return NULL;
@@ -210,7 +195,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
}
- of_node_get(of_node);
+ fwnode_handle_get(fwnode);
/* Fill structure */
INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL);
@@ -259,7 +244,7 @@ void irq_domain_remove(struct irq_domain *domain)
pr_debug("Removed domain %s\n", domain->name);
- of_node_put(irq_domain_get_of_node(domain));
+ fwnode_handle_put(domain->fwnode);
if (domain->flags & IRQ_DOMAIN_NAME_ALLOCATED)
kfree(domain->name);
kfree(domain);
@@ -1047,6 +1032,18 @@ int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq,
return virq;
}
+/**
+ * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data
+ * @irq_data: The pointer to irq_data
+ */
+void irq_domain_reset_irq_data(struct irq_data *irq_data)
+{
+ irq_data->hwirq = 0;
+ irq_data->chip = &no_irq_chip;
+ irq_data->chip_data = NULL;
+}
+EXPORT_SYMBOL_GPL(irq_domain_reset_irq_data);
+
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
/**
* irq_domain_create_hierarchy - Add a irqdomain into the hierarchy
@@ -1248,18 +1245,6 @@ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
EXPORT_SYMBOL(irq_domain_set_info);
/**
- * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data
- * @irq_data: The pointer to irq_data
- */
-void irq_domain_reset_irq_data(struct irq_data *irq_data)
-{
- irq_data->hwirq = 0;
- irq_data->chip = &no_irq_chip;
- irq_data->chip_data = NULL;
-}
-EXPORT_SYMBOL_GPL(irq_domain_reset_irq_data);
-
-/**
* irq_domain_free_irqs_common - Clear irq_data and free the parent
* @domain: Interrupt domain to match
* @virq: IRQ number to start with
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 453a8a0f4804..761911168438 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -2619,6 +2619,8 @@ int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which,
do {
chip = irq_data_get_irq_chip(data);
+ if (WARN_ON_ONCE(!chip))
+ return -ENODEV;
if (chip->irq_get_irqchip_state)
break;
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
@@ -2696,6 +2698,8 @@ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
do {
chip = irq_data_get_irq_chip(data);
+ if (WARN_ON_ONCE(!chip))
+ return -ENODEV;
if (chip->irq_set_irqchip_state)
break;
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 48b5d1b6af4d..eca83965b631 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -31,7 +31,7 @@ static bool irq_work_claim(struct irq_work *work)
{
int oflags;
- oflags = atomic_fetch_or(IRQ_WORK_CLAIMED, &work->flags);
+ oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->flags);
/*
* If the work is already pending, no need to raise the IPI.
* The pairing atomic_fetch_andnot() in irq_work_run() makes sure
@@ -102,8 +102,7 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
if (cpu != smp_processor_id()) {
/* Arch remote IPI send/receive backend aren't NMI safe */
WARN_ON_ONCE(in_nmi());
- if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
- arch_send_call_function_single_ipi(cpu);
+ __smp_call_single_queue(cpu, &work->llnode);
} else {
__irq_work_queue_local(work);
}
@@ -131,6 +130,31 @@ bool irq_work_needs_cpu(void)
return true;
}
+void irq_work_single(void *arg)
+{
+ struct irq_work *work = arg;
+ int flags;
+
+ /*
+ * Clear the PENDING bit, after this point the @work
+ * can be re-used.
+ * Make it immediately visible so that other CPUs trying
+ * to claim that work don't rely on us to handle their data
+ * while we are in the middle of the func.
+ */
+ flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags);
+
+ lockdep_irq_work_enter(work);
+ work->func(work);
+ lockdep_irq_work_exit(work);
+ /*
+ * Clear the BUSY bit and return to the free state if
+ * no-one else claimed it meanwhile.
+ */
+ flags &= ~IRQ_WORK_PENDING;
+ (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
+}
+
static void irq_work_run_list(struct llist_head *list)
{
struct irq_work *work, *tmp;
@@ -142,27 +166,8 @@ static void irq_work_run_list(struct llist_head *list)
return;
llnode = llist_del_all(list);
- llist_for_each_entry_safe(work, tmp, llnode, llnode) {
- int flags;
- /*
- * Clear the PENDING bit, after this point the @work
- * can be re-used.
- * Make it immediately visible so that other CPUs trying
- * to claim that work don't rely on us to handle their data
- * while we are in the middle of the func.
- */
- flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags);
-
- lockdep_irq_work_enter(work);
- work->func(work);
- lockdep_irq_work_exit(work);
- /*
- * Clear the BUSY bit and return to the free state if
- * no-one else claimed it meanwhile.
- */
- flags &= ~IRQ_WORK_PENDING;
- (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
- }
+ llist_for_each_entry_safe(work, tmp, llnode, llnode)
+ irq_work_single(work);
}
/*
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 8accc9722a81..6afae0bcbac4 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -86,6 +86,18 @@ static DEFINE_SPINLOCK(kcov_remote_lock);
static DEFINE_HASHTABLE(kcov_remote_map, 4);
static struct list_head kcov_remote_areas = LIST_HEAD_INIT(kcov_remote_areas);
+struct kcov_percpu_data {
+ void *irq_area;
+
+ unsigned int saved_mode;
+ unsigned int saved_size;
+ void *saved_area;
+ struct kcov *saved_kcov;
+ int saved_sequence;
+};
+
+DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data);
+
/* Must be called with kcov_remote_lock locked. */
static struct kcov_remote *kcov_remote_find(u64 handle)
{
@@ -98,6 +110,7 @@ static struct kcov_remote *kcov_remote_find(u64 handle)
return NULL;
}
+/* Must be called with kcov_remote_lock locked. */
static struct kcov_remote *kcov_remote_add(struct kcov *kcov, u64 handle)
{
struct kcov_remote *remote;
@@ -119,16 +132,13 @@ static struct kcov_remote_area *kcov_remote_area_get(unsigned int size)
struct kcov_remote_area *area;
struct list_head *pos;
- kcov_debug("size = %u\n", size);
list_for_each(pos, &kcov_remote_areas) {
area = list_entry(pos, struct kcov_remote_area, list);
if (area->size == size) {
list_del(&area->list);
- kcov_debug("rv = %px\n", area);
return area;
}
}
- kcov_debug("rv = NULL\n");
return NULL;
}
@@ -136,7 +146,6 @@ static struct kcov_remote_area *kcov_remote_area_get(unsigned int size)
static void kcov_remote_area_put(struct kcov_remote_area *area,
unsigned int size)
{
- kcov_debug("area = %px, size = %u\n", area, size);
INIT_LIST_HEAD(&area->list);
area->size = size;
list_add(&area->list, &kcov_remote_areas);
@@ -148,9 +157,10 @@ static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_stru
/*
* We are interested in code coverage as a function of a syscall inputs,
- * so we ignore code executed in interrupts.
+ * so we ignore code executed in interrupts, unless we are in a remote
+ * coverage collection section in a softirq.
*/
- if (!in_task())
+ if (!in_task() && !(in_serving_softirq() && t->kcov_softirq))
return false;
mode = READ_ONCE(t->kcov_mode);
/*
@@ -312,23 +322,26 @@ void notrace __sanitizer_cov_trace_switch(u64 val, u64 *cases)
EXPORT_SYMBOL(__sanitizer_cov_trace_switch);
#endif /* ifdef CONFIG_KCOV_ENABLE_COMPARISONS */
-static void kcov_start(struct task_struct *t, unsigned int size,
- void *area, enum kcov_mode mode, int sequence)
+static void kcov_start(struct task_struct *t, struct kcov *kcov,
+ unsigned int size, void *area, enum kcov_mode mode,
+ int sequence)
{
kcov_debug("t = %px, size = %u, area = %px\n", t, size, area);
+ t->kcov = kcov;
/* Cache in task struct for performance. */
t->kcov_size = size;
t->kcov_area = area;
+ t->kcov_sequence = sequence;
/* See comment in check_kcov_mode(). */
barrier();
WRITE_ONCE(t->kcov_mode, mode);
- t->kcov_sequence = sequence;
}
static void kcov_stop(struct task_struct *t)
{
WRITE_ONCE(t->kcov_mode, KCOV_MODE_DISABLED);
barrier();
+ t->kcov = NULL;
t->kcov_size = 0;
t->kcov_area = NULL;
}
@@ -336,7 +349,6 @@ static void kcov_stop(struct task_struct *t)
static void kcov_task_reset(struct task_struct *t)
{
kcov_stop(t);
- t->kcov = NULL;
t->kcov_sequence = 0;
t->kcov_handle = 0;
}
@@ -361,18 +373,18 @@ static void kcov_remote_reset(struct kcov *kcov)
int bkt;
struct kcov_remote *remote;
struct hlist_node *tmp;
+ unsigned long flags;
- spin_lock(&kcov_remote_lock);
+ spin_lock_irqsave(&kcov_remote_lock, flags);
hash_for_each_safe(kcov_remote_map, bkt, tmp, remote, hnode) {
if (remote->kcov != kcov)
continue;
- kcov_debug("removing handle %llx\n", remote->handle);
hash_del(&remote->hnode);
kfree(remote);
}
/* Do reset before unlock to prevent races with kcov_remote_start(). */
kcov_reset(kcov);
- spin_unlock(&kcov_remote_lock);
+ spin_unlock_irqrestore(&kcov_remote_lock, flags);
}
static void kcov_disable(struct task_struct *t, struct kcov *kcov)
@@ -401,12 +413,13 @@ static void kcov_put(struct kcov *kcov)
void kcov_task_exit(struct task_struct *t)
{
struct kcov *kcov;
+ unsigned long flags;
kcov = t->kcov;
if (kcov == NULL)
return;
- spin_lock(&kcov->lock);
+ spin_lock_irqsave(&kcov->lock, flags);
kcov_debug("t = %px, kcov->t = %px\n", t, kcov->t);
/*
* For KCOV_ENABLE devices we want to make sure that t->kcov->t == t,
@@ -414,7 +427,8 @@ void kcov_task_exit(struct task_struct *t)
* WARN_ON(!kcov->remote && kcov->t != t);
*
* For KCOV_REMOTE_ENABLE devices, the exiting task is either:
- * 2. A remote task between kcov_remote_start() and kcov_remote_stop().
+ *
+ * 1. A remote task between kcov_remote_start() and kcov_remote_stop().
* In this case we should print a warning right away, since a task
* shouldn't be exiting when it's in a kcov coverage collection
* section. Here t points to the task that is collecting remote
@@ -424,18 +438,18 @@ void kcov_task_exit(struct task_struct *t)
* WARN_ON(kcov->remote && kcov->t != t);
*
* 2. The task that created kcov exiting without calling KCOV_DISABLE,
- * and then again we can make sure that t->kcov->t == t:
+ * and then again we make sure that t->kcov->t == t:
* WARN_ON(kcov->remote && kcov->t != t);
*
* By combining all three checks into one we get:
*/
if (WARN_ON(kcov->t != t)) {
- spin_unlock(&kcov->lock);
+ spin_unlock_irqrestore(&kcov->lock, flags);
return;
}
/* Just to not leave dangling references behind. */
kcov_disable(t, kcov);
- spin_unlock(&kcov->lock);
+ spin_unlock_irqrestore(&kcov->lock, flags);
kcov_put(kcov);
}
@@ -446,12 +460,13 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
struct kcov *kcov = vma->vm_file->private_data;
unsigned long size, off;
struct page *page;
+ unsigned long flags;
area = vmalloc_user(vma->vm_end - vma->vm_start);
if (!area)
return -ENOMEM;
- spin_lock(&kcov->lock);
+ spin_lock_irqsave(&kcov->lock, flags);
size = kcov->size * sizeof(unsigned long);
if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 ||
vma->vm_end - vma->vm_start != size) {
@@ -461,7 +476,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
if (!kcov->area) {
kcov->area = area;
vma->vm_flags |= VM_DONTEXPAND;
- spin_unlock(&kcov->lock);
+ spin_unlock_irqrestore(&kcov->lock, flags);
for (off = 0; off < size; off += PAGE_SIZE) {
page = vmalloc_to_page(kcov->area + off);
if (vm_insert_page(vma, vma->vm_start + off, page))
@@ -470,7 +485,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
return 0;
}
exit:
- spin_unlock(&kcov->lock);
+ spin_unlock_irqrestore(&kcov->lock, flags);
vfree(area);
return res;
}
@@ -550,10 +565,10 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
int mode, i;
struct kcov_remote_arg *remote_arg;
struct kcov_remote *remote;
+ unsigned long flags;
switch (cmd) {
case KCOV_INIT_TRACE:
- kcov_debug("KCOV_INIT_TRACE\n");
/*
* Enable kcov in trace mode and setup buffer size.
* Must happen before anything else.
@@ -572,7 +587,6 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
kcov->mode = KCOV_MODE_INIT;
return 0;
case KCOV_ENABLE:
- kcov_debug("KCOV_ENABLE\n");
/*
* Enable coverage for the current task.
* At this point user must have been enabled trace mode,
@@ -590,15 +604,13 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
return mode;
kcov_fault_in_area(kcov);
kcov->mode = mode;
- kcov_start(t, kcov->size, kcov->area, kcov->mode,
+ kcov_start(t, kcov, kcov->size, kcov->area, kcov->mode,
kcov->sequence);
- t->kcov = kcov;
kcov->t = t;
/* Put either in kcov_task_exit() or in KCOV_DISABLE. */
kcov_get(kcov);
return 0;
case KCOV_DISABLE:
- kcov_debug("KCOV_DISABLE\n");
/* Disable coverage for the current task. */
unused = arg;
if (unused != 0 || current->kcov != kcov)
@@ -610,7 +622,6 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
kcov_put(kcov);
return 0;
case KCOV_REMOTE_ENABLE:
- kcov_debug("KCOV_REMOTE_ENABLE\n");
if (kcov->mode != KCOV_MODE_INIT || !kcov->area)
return -EINVAL;
t = current;
@@ -627,41 +638,42 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
kcov->t = t;
kcov->remote = true;
kcov->remote_size = remote_arg->area_size;
- spin_lock(&kcov_remote_lock);
+ spin_lock_irqsave(&kcov_remote_lock, flags);
for (i = 0; i < remote_arg->num_handles; i++) {
- kcov_debug("handle %llx\n", remote_arg->handles[i]);
if (!kcov_check_handle(remote_arg->handles[i],
false, true, false)) {
- spin_unlock(&kcov_remote_lock);
+ spin_unlock_irqrestore(&kcov_remote_lock,
+ flags);
kcov_disable(t, kcov);
return -EINVAL;
}
remote = kcov_remote_add(kcov, remote_arg->handles[i]);
if (IS_ERR(remote)) {
- spin_unlock(&kcov_remote_lock);
+ spin_unlock_irqrestore(&kcov_remote_lock,
+ flags);
kcov_disable(t, kcov);
return PTR_ERR(remote);
}
}
if (remote_arg->common_handle) {
- kcov_debug("common handle %llx\n",
- remote_arg->common_handle);
if (!kcov_check_handle(remote_arg->common_handle,
true, false, false)) {
- spin_unlock(&kcov_remote_lock);
+ spin_unlock_irqrestore(&kcov_remote_lock,
+ flags);
kcov_disable(t, kcov);
return -EINVAL;
}
remote = kcov_remote_add(kcov,
remote_arg->common_handle);
if (IS_ERR(remote)) {
- spin_unlock(&kcov_remote_lock);
+ spin_unlock_irqrestore(&kcov_remote_lock,
+ flags);
kcov_disable(t, kcov);
return PTR_ERR(remote);
}
t->kcov_handle = remote_arg->common_handle;
}
- spin_unlock(&kcov_remote_lock);
+ spin_unlock_irqrestore(&kcov_remote_lock, flags);
/* Put either in kcov_task_exit() or in KCOV_DISABLE. */
kcov_get(kcov);
return 0;
@@ -677,6 +689,7 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
struct kcov_remote_arg *remote_arg = NULL;
unsigned int remote_num_handles;
unsigned long remote_arg_size;
+ unsigned long flags;
if (cmd == KCOV_REMOTE_ENABLE) {
if (get_user(remote_num_handles, (unsigned __user *)(arg +
@@ -697,9 +710,9 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
}
kcov = filep->private_data;
- spin_lock(&kcov->lock);
+ spin_lock_irqsave(&kcov->lock, flags);
res = kcov_ioctl_locked(kcov, cmd, arg);
- spin_unlock(&kcov->lock);
+ spin_unlock_irqrestore(&kcov->lock, flags);
kfree(remote_arg);
@@ -716,8 +729,8 @@ static const struct file_operations kcov_fops = {
/*
* kcov_remote_start() and kcov_remote_stop() can be used to annotate a section
- * of code in a kernel background thread to allow kcov to be used to collect
- * coverage from that part of code.
+ * of code in a kernel background thread or in a softirq to allow kcov to be
+ * used to collect coverage from that part of code.
*
* The handle argument of kcov_remote_start() identifies a code section that is
* used for coverage collection. A userspace process passes this handle to
@@ -728,9 +741,9 @@ static const struct file_operations kcov_fops = {
* the type of the kernel thread whose code is being annotated.
*
* For global kernel threads that are spawned in a limited number of instances
- * (e.g. one USB hub_event() worker thread is spawned per USB HCD), each
- * instance must be assigned a unique 4-byte instance id. The instance id is
- * then combined with a 1-byte subsystem id to get a handle via
+ * (e.g. one USB hub_event() worker thread is spawned per USB HCD) and for
+ * softirqs, each instance must be assigned a unique 4-byte instance id. The
+ * instance id is then combined with a 1-byte subsystem id to get a handle via
* kcov_remote_handle(subsystem_id, instance_id).
*
* For local kernel threads that are spawned from system calls handler when a
@@ -749,70 +762,136 @@ static const struct file_operations kcov_fops = {
*
* See Documentation/dev-tools/kcov.rst for more details.
*
- * Internally, this function looks up the kcov device associated with the
+ * Internally, kcov_remote_start() looks up the kcov device associated with the
* provided handle, allocates an area for coverage collection, and saves the
* pointers to kcov and area into the current task_struct to allow coverage to
- * be collected via __sanitizer_cov_trace_pc()
+ * be collected via __sanitizer_cov_trace_pc().
* In turns kcov_remote_stop() clears those pointers from task_struct to stop
* collecting coverage and copies all collected coverage into the kcov area.
*/
+
+static inline bool kcov_mode_enabled(unsigned int mode)
+{
+ return (mode & ~KCOV_IN_CTXSW) != KCOV_MODE_DISABLED;
+}
+
+void kcov_remote_softirq_start(struct task_struct *t)
+{
+ struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data);
+ unsigned int mode;
+
+ mode = READ_ONCE(t->kcov_mode);
+ barrier();
+ if (kcov_mode_enabled(mode)) {
+ data->saved_mode = mode;
+ data->saved_size = t->kcov_size;
+ data->saved_area = t->kcov_area;
+ data->saved_sequence = t->kcov_sequence;
+ data->saved_kcov = t->kcov;
+ kcov_stop(t);
+ }
+}
+
+void kcov_remote_softirq_stop(struct task_struct *t)
+{
+ struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data);
+
+ if (data->saved_kcov) {
+ kcov_start(t, data->saved_kcov, data->saved_size,
+ data->saved_area, data->saved_mode,
+ data->saved_sequence);
+ data->saved_mode = 0;
+ data->saved_size = 0;
+ data->saved_area = NULL;
+ data->saved_sequence = 0;
+ data->saved_kcov = NULL;
+ }
+}
+
void kcov_remote_start(u64 handle)
{
+ struct task_struct *t = current;
struct kcov_remote *remote;
+ struct kcov *kcov;
+ unsigned int mode;
void *area;
- struct task_struct *t;
unsigned int size;
- enum kcov_mode mode;
int sequence;
+ unsigned long flags;
if (WARN_ON(!kcov_check_handle(handle, true, true, true)))
return;
- if (WARN_ON(!in_task()))
+ if (!in_task() && !in_serving_softirq())
return;
- t = current;
+
+ local_irq_save(flags);
+
/*
- * Check that kcov_remote_start is not called twice
- * nor called by user tasks (with enabled kcov).
+ * Check that kcov_remote_start() is not called twice in background
+ * threads nor called by user tasks (with enabled kcov).
*/
- if (WARN_ON(t->kcov))
+ mode = READ_ONCE(t->kcov_mode);
+ if (WARN_ON(in_task() && kcov_mode_enabled(mode))) {
+ local_irq_restore(flags);
return;
-
- kcov_debug("handle = %llx\n", handle);
+ }
+ /*
+ * Check that kcov_remote_start() is not called twice in softirqs.
+ * Note, that kcov_remote_start() can be called from a softirq that
+ * happened while collecting coverage from a background thread.
+ */
+ if (WARN_ON(in_serving_softirq() && t->kcov_softirq)) {
+ local_irq_restore(flags);
+ return;
+ }
spin_lock(&kcov_remote_lock);
remote = kcov_remote_find(handle);
if (!remote) {
- kcov_debug("no remote found");
- spin_unlock(&kcov_remote_lock);
+ spin_unlock_irqrestore(&kcov_remote_lock, flags);
return;
}
+ kcov_debug("handle = %llx, context: %s\n", handle,
+ in_task() ? "task" : "softirq");
+ kcov = remote->kcov;
/* Put in kcov_remote_stop(). */
- kcov_get(remote->kcov);
- t->kcov = remote->kcov;
+ kcov_get(kcov);
/*
* Read kcov fields before unlock to prevent races with
* KCOV_DISABLE / kcov_remote_reset().
*/
- size = remote->kcov->remote_size;
- mode = remote->kcov->mode;
- sequence = remote->kcov->sequence;
- area = kcov_remote_area_get(size);
- spin_unlock(&kcov_remote_lock);
+ mode = kcov->mode;
+ sequence = kcov->sequence;
+ if (in_task()) {
+ size = kcov->remote_size;
+ area = kcov_remote_area_get(size);
+ } else {
+ size = CONFIG_KCOV_IRQ_AREA_SIZE;
+ area = this_cpu_ptr(&kcov_percpu_data)->irq_area;
+ }
+ spin_unlock_irqrestore(&kcov_remote_lock, flags);
+ /* Can only happen when in_task(). */
if (!area) {
area = vmalloc(size * sizeof(unsigned long));
if (!area) {
- t->kcov = NULL;
- kcov_put(remote->kcov);
+ kcov_put(kcov);
return;
}
}
+
+ local_irq_save(flags);
+
/* Reset coverage size. */
*(u64 *)area = 0;
- kcov_debug("area = %px, size = %u", area, size);
+ if (in_serving_softirq()) {
+ kcov_remote_softirq_start(t);
+ t->kcov_softirq = 1;
+ }
+ kcov_start(t, kcov, size, area, mode, sequence);
- kcov_start(t, size, area, mode, sequence);
+ local_irq_restore(flags);
}
EXPORT_SYMBOL(kcov_remote_start);
@@ -876,34 +955,67 @@ static void kcov_move_area(enum kcov_mode mode, void *dst_area,
void kcov_remote_stop(void)
{
struct task_struct *t = current;
- struct kcov *kcov = t->kcov;
- void *area = t->kcov_area;
- unsigned int size = t->kcov_size;
- int sequence = t->kcov_sequence;
+ struct kcov *kcov;
+ unsigned int mode;
+ void *area;
+ unsigned int size;
+ int sequence;
+ unsigned long flags;
- if (!kcov) {
- kcov_debug("no kcov found\n");
+ if (!in_task() && !in_serving_softirq())
+ return;
+
+ local_irq_save(flags);
+
+ mode = READ_ONCE(t->kcov_mode);
+ barrier();
+ if (!kcov_mode_enabled(mode)) {
+ local_irq_restore(flags);
+ return;
+ }
+ /*
+ * When in softirq, check if the corresponding kcov_remote_start()
+ * actually found the remote handle and started collecting coverage.
+ */
+ if (in_serving_softirq() && !t->kcov_softirq) {
+ local_irq_restore(flags);
+ return;
+ }
+ /* Make sure that kcov_softirq is only set when in softirq. */
+ if (WARN_ON(!in_serving_softirq() && t->kcov_softirq)) {
+ local_irq_restore(flags);
return;
}
+ kcov = t->kcov;
+ area = t->kcov_area;
+ size = t->kcov_size;
+ sequence = t->kcov_sequence;
+
kcov_stop(t);
- t->kcov = NULL;
+ if (in_serving_softirq()) {
+ t->kcov_softirq = 0;
+ kcov_remote_softirq_stop(t);
+ }
spin_lock(&kcov->lock);
/*
* KCOV_DISABLE could have been called between kcov_remote_start()
- * and kcov_remote_stop(), hence the check.
+ * and kcov_remote_stop(), hence the sequence check.
*/
- kcov_debug("move if: %d == %d && %d\n",
- sequence, kcov->sequence, (int)kcov->remote);
if (sequence == kcov->sequence && kcov->remote)
kcov_move_area(kcov->mode, kcov->area, kcov->size, area);
spin_unlock(&kcov->lock);
- spin_lock(&kcov_remote_lock);
- kcov_remote_area_put(area, size);
- spin_unlock(&kcov_remote_lock);
+ if (in_task()) {
+ spin_lock(&kcov_remote_lock);
+ kcov_remote_area_put(area, size);
+ spin_unlock(&kcov_remote_lock);
+ }
+
+ local_irq_restore(flags);
+ /* Get in kcov_remote_start(). */
kcov_put(kcov);
}
EXPORT_SYMBOL(kcov_remote_stop);
@@ -917,6 +1029,16 @@ EXPORT_SYMBOL(kcov_common_handle);
static int __init kcov_init(void)
{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ void *area = vmalloc(CONFIG_KCOV_IRQ_AREA_SIZE *
+ sizeof(unsigned long));
+ if (!area)
+ return -ENOMEM;
+ per_cpu_ptr(&kcov_percpu_data, cpu)->irq_area = area;
+ }
+
/*
* The kcov debugfs file won't ever get removed and thus,
* there is no need to protect it against removal races. The
diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile
new file mode 100644
index 000000000000..d4999b38d1be
--- /dev/null
+++ b/kernel/kcsan/Makefile
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0
+KCSAN_SANITIZE := n
+KCOV_INSTRUMENT := n
+UBSAN_SANITIZE := n
+
+CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_debugfs.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE)
+
+CFLAGS_core.o := $(call cc-option,-fno-conserve-stack,) \
+ $(call cc-option,-fno-stack-protector,)
+
+obj-y := core.o debugfs.o report.o
+obj-$(CONFIG_KCSAN_SELFTEST) += test.o
diff --git a/kernel/kcsan/atomic.h b/kernel/kcsan/atomic.h
new file mode 100644
index 000000000000..be9e625227f3
--- /dev/null
+++ b/kernel/kcsan/atomic.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _KERNEL_KCSAN_ATOMIC_H
+#define _KERNEL_KCSAN_ATOMIC_H
+
+#include <linux/jiffies.h>
+#include <linux/sched.h>
+
+/*
+ * Special rules for certain memory where concurrent conflicting accesses are
+ * common, however, the current convention is to not mark them; returns true if
+ * access to @ptr should be considered atomic. Called from slow-path.
+ */
+static bool kcsan_is_atomic_special(const volatile void *ptr)
+{
+ /* volatile globals that have been observed in data races. */
+ return ptr == &jiffies || ptr == &current->state;
+}
+
+#endif /* _KERNEL_KCSAN_ATOMIC_H */
diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c
new file mode 100644
index 000000000000..15f67949d11e
--- /dev/null
+++ b/kernel/kcsan/core.c
@@ -0,0 +1,850 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/atomic.h>
+#include <linux/bug.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+
+#include "atomic.h"
+#include "encoding.h"
+#include "kcsan.h"
+
+static bool kcsan_early_enable = IS_ENABLED(CONFIG_KCSAN_EARLY_ENABLE);
+unsigned int kcsan_udelay_task = CONFIG_KCSAN_UDELAY_TASK;
+unsigned int kcsan_udelay_interrupt = CONFIG_KCSAN_UDELAY_INTERRUPT;
+static long kcsan_skip_watch = CONFIG_KCSAN_SKIP_WATCH;
+static bool kcsan_interrupt_watcher = IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER);
+
+#ifdef MODULE_PARAM_PREFIX
+#undef MODULE_PARAM_PREFIX
+#endif
+#define MODULE_PARAM_PREFIX "kcsan."
+module_param_named(early_enable, kcsan_early_enable, bool, 0);
+module_param_named(udelay_task, kcsan_udelay_task, uint, 0644);
+module_param_named(udelay_interrupt, kcsan_udelay_interrupt, uint, 0644);
+module_param_named(skip_watch, kcsan_skip_watch, long, 0644);
+module_param_named(interrupt_watcher, kcsan_interrupt_watcher, bool, 0444);
+
+bool kcsan_enabled;
+
+/* Per-CPU kcsan_ctx for interrupts */
+static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = {
+ .disable_count = 0,
+ .atomic_next = 0,
+ .atomic_nest_count = 0,
+ .in_flat_atomic = false,
+ .access_mask = 0,
+ .scoped_accesses = {LIST_POISON1, NULL},
+};
+
+/*
+ * Helper macros to index into adjacent slots, starting from address slot
+ * itself, followed by the right and left slots.
+ *
+ * The purpose is 2-fold:
+ *
+ * 1. if during insertion the address slot is already occupied, check if
+ * any adjacent slots are free;
+ * 2. accesses that straddle a slot boundary due to size that exceeds a
+ * slot's range may check adjacent slots if any watchpoint matches.
+ *
+ * Note that accesses with very large size may still miss a watchpoint; however,
+ * given this should be rare, this is a reasonable trade-off to make, since this
+ * will avoid:
+ *
+ * 1. excessive contention between watchpoint checks and setup;
+ * 2. larger number of simultaneous watchpoints without sacrificing
+ * performance.
+ *
+ * Example: SLOT_IDX values for KCSAN_CHECK_ADJACENT=1, where i is [0, 1, 2]:
+ *
+ * slot=0: [ 1, 2, 0]
+ * slot=9: [10, 11, 9]
+ * slot=63: [64, 65, 63]
+ */
+#define SLOT_IDX(slot, i) (slot + ((i + KCSAN_CHECK_ADJACENT) % NUM_SLOTS))
+
+/*
+ * SLOT_IDX_FAST is used in the fast-path. Not first checking the address's primary
+ * slot (middle) is fine if we assume that races occur rarely. The set of
+ * indices {SLOT_IDX(slot, i) | i in [0, NUM_SLOTS)} is equivalent to
+ * {SLOT_IDX_FAST(slot, i) | i in [0, NUM_SLOTS)}.
+ */
+#define SLOT_IDX_FAST(slot, i) (slot + i)
+
+/*
+ * Watchpoints, with each entry encoded as defined in encoding.h: in order to be
+ * able to safely update and access a watchpoint without introducing locking
+ * overhead, we encode each watchpoint as a single atomic long. The initial
+ * zero-initialized state matches INVALID_WATCHPOINT.
+ *
+ * Add NUM_SLOTS-1 entries to account for overflow; this helps avoid having to
+ * use more complicated SLOT_IDX_FAST calculation with modulo in the fast-path.
+ */
+static atomic_long_t watchpoints[CONFIG_KCSAN_NUM_WATCHPOINTS + NUM_SLOTS-1];
+
+/*
+ * Instructions to skip watching counter, used in should_watch(). We use a
+ * per-CPU counter to avoid excessive contention.
+ */
+static DEFINE_PER_CPU(long, kcsan_skip);
+
+static __always_inline atomic_long_t *find_watchpoint(unsigned long addr,
+ size_t size,
+ bool expect_write,
+ long *encoded_watchpoint)
+{
+ const int slot = watchpoint_slot(addr);
+ const unsigned long addr_masked = addr & WATCHPOINT_ADDR_MASK;
+ atomic_long_t *watchpoint;
+ unsigned long wp_addr_masked;
+ size_t wp_size;
+ bool is_write;
+ int i;
+
+ BUILD_BUG_ON(CONFIG_KCSAN_NUM_WATCHPOINTS < NUM_SLOTS);
+
+ for (i = 0; i < NUM_SLOTS; ++i) {
+ watchpoint = &watchpoints[SLOT_IDX_FAST(slot, i)];
+ *encoded_watchpoint = atomic_long_read(watchpoint);
+ if (!decode_watchpoint(*encoded_watchpoint, &wp_addr_masked,
+ &wp_size, &is_write))
+ continue;
+
+ if (expect_write && !is_write)
+ continue;
+
+ /* Check if the watchpoint matches the access. */
+ if (matching_access(wp_addr_masked, wp_size, addr_masked, size))
+ return watchpoint;
+ }
+
+ return NULL;
+}
+
+static inline atomic_long_t *
+insert_watchpoint(unsigned long addr, size_t size, bool is_write)
+{
+ const int slot = watchpoint_slot(addr);
+ const long encoded_watchpoint = encode_watchpoint(addr, size, is_write);
+ atomic_long_t *watchpoint;
+ int i;
+
+ /* Check slot index logic, ensuring we stay within array bounds. */
+ BUILD_BUG_ON(SLOT_IDX(0, 0) != KCSAN_CHECK_ADJACENT);
+ BUILD_BUG_ON(SLOT_IDX(0, KCSAN_CHECK_ADJACENT+1) != 0);
+ BUILD_BUG_ON(SLOT_IDX(CONFIG_KCSAN_NUM_WATCHPOINTS-1, KCSAN_CHECK_ADJACENT) != ARRAY_SIZE(watchpoints)-1);
+ BUILD_BUG_ON(SLOT_IDX(CONFIG_KCSAN_NUM_WATCHPOINTS-1, KCSAN_CHECK_ADJACENT+1) != ARRAY_SIZE(watchpoints) - NUM_SLOTS);
+
+ for (i = 0; i < NUM_SLOTS; ++i) {
+ long expect_val = INVALID_WATCHPOINT;
+
+ /* Try to acquire this slot. */
+ watchpoint = &watchpoints[SLOT_IDX(slot, i)];
+ if (atomic_long_try_cmpxchg_relaxed(watchpoint, &expect_val, encoded_watchpoint))
+ return watchpoint;
+ }
+
+ return NULL;
+}
+
+/*
+ * Return true if watchpoint was successfully consumed, false otherwise.
+ *
+ * This may return false if:
+ *
+ * 1. another thread already consumed the watchpoint;
+ * 2. the thread that set up the watchpoint already removed it;
+ * 3. the watchpoint was removed and then re-used.
+ */
+static __always_inline bool
+try_consume_watchpoint(atomic_long_t *watchpoint, long encoded_watchpoint)
+{
+ return atomic_long_try_cmpxchg_relaxed(watchpoint, &encoded_watchpoint, CONSUMED_WATCHPOINT);
+}
+
+/* Return true if watchpoint was not touched, false if already consumed. */
+static inline bool consume_watchpoint(atomic_long_t *watchpoint)
+{
+ return atomic_long_xchg_relaxed(watchpoint, CONSUMED_WATCHPOINT) != CONSUMED_WATCHPOINT;
+}
+
+/* Remove the watchpoint -- its slot may be reused after. */
+static inline void remove_watchpoint(atomic_long_t *watchpoint)
+{
+ atomic_long_set(watchpoint, INVALID_WATCHPOINT);
+}
+
+static __always_inline struct kcsan_ctx *get_ctx(void)
+{
+ /*
+ * In interrupts, use raw_cpu_ptr to avoid unnecessary checks, that would
+ * also result in calls that generate warnings in uaccess regions.
+ */
+ return in_task() ? &current->kcsan_ctx : raw_cpu_ptr(&kcsan_cpu_ctx);
+}
+
+/* Check scoped accesses; never inline because this is a slow-path! */
+static noinline void kcsan_check_scoped_accesses(void)
+{
+ struct kcsan_ctx *ctx = get_ctx();
+ struct list_head *prev_save = ctx->scoped_accesses.prev;
+ struct kcsan_scoped_access *scoped_access;
+
+ ctx->scoped_accesses.prev = NULL; /* Avoid recursion. */
+ list_for_each_entry(scoped_access, &ctx->scoped_accesses, list)
+ __kcsan_check_access(scoped_access->ptr, scoped_access->size, scoped_access->type);
+ ctx->scoped_accesses.prev = prev_save;
+}
+
+/* Rules for generic atomic accesses. Called from fast-path. */
+static __always_inline bool
+is_atomic(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *ctx)
+{
+ if (type & KCSAN_ACCESS_ATOMIC)
+ return true;
+
+ /*
+ * Unless explicitly declared atomic, never consider an assertion access
+ * as atomic. This allows using them also in atomic regions, such as
+ * seqlocks, without implicitly changing their semantics.
+ */
+ if (type & KCSAN_ACCESS_ASSERT)
+ return false;
+
+ if (IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC) &&
+ (type & KCSAN_ACCESS_WRITE) && size <= sizeof(long) &&
+ IS_ALIGNED((unsigned long)ptr, size))
+ return true; /* Assume aligned writes up to word size are atomic. */
+
+ if (ctx->atomic_next > 0) {
+ /*
+ * Because we do not have separate contexts for nested
+ * interrupts, in case atomic_next is set, we simply assume that
+ * the outer interrupt set atomic_next. In the worst case, we
+ * will conservatively consider operations as atomic. This is a
+ * reasonable trade-off to make, since this case should be
+ * extremely rare; however, even if extremely rare, it could
+ * lead to false positives otherwise.
+ */
+ if ((hardirq_count() >> HARDIRQ_SHIFT) < 2)
+ --ctx->atomic_next; /* in task, or outer interrupt */
+ return true;
+ }
+
+ return ctx->atomic_nest_count > 0 || ctx->in_flat_atomic;
+}
+
+static __always_inline bool
+should_watch(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *ctx)
+{
+ /*
+ * Never set up watchpoints when memory operations are atomic.
+ *
+ * Need to check this first, before kcsan_skip check below: (1) atomics
+ * should not count towards skipped instructions, and (2) to actually
+ * decrement kcsan_atomic_next for consecutive instruction stream.
+ */
+ if (is_atomic(ptr, size, type, ctx))
+ return false;
+
+ if (this_cpu_dec_return(kcsan_skip) >= 0)
+ return false;
+
+ /*
+ * NOTE: If we get here, kcsan_skip must always be reset in slow path
+ * via reset_kcsan_skip() to avoid underflow.
+ */
+
+ /* this operation should be watched */
+ return true;
+}
+
+static inline void reset_kcsan_skip(void)
+{
+ long skip_count = kcsan_skip_watch -
+ (IS_ENABLED(CONFIG_KCSAN_SKIP_WATCH_RANDOMIZE) ?
+ prandom_u32_max(kcsan_skip_watch) :
+ 0);
+ this_cpu_write(kcsan_skip, skip_count);
+}
+
+static __always_inline bool kcsan_is_enabled(void)
+{
+ return READ_ONCE(kcsan_enabled) && get_ctx()->disable_count == 0;
+}
+
+static inline unsigned int get_delay(void)
+{
+ unsigned int delay = in_task() ? kcsan_udelay_task : kcsan_udelay_interrupt;
+ return delay - (IS_ENABLED(CONFIG_KCSAN_DELAY_RANDOMIZE) ?
+ prandom_u32_max(delay) :
+ 0);
+}
+
+/*
+ * Pull everything together: check_access() below contains the performance
+ * critical operations; the fast-path (including check_access) functions should
+ * all be inlinable by the instrumentation functions.
+ *
+ * The slow-path (kcsan_found_watchpoint, kcsan_setup_watchpoint) are
+ * non-inlinable -- note that, we prefix these with "kcsan_" to ensure they can
+ * be filtered from the stacktrace, as well as give them unique names for the
+ * UACCESS whitelist of objtool. Each function uses user_access_save/restore(),
+ * since they do not access any user memory, but instrumentation is still
+ * emitted in UACCESS regions.
+ */
+
+static noinline void kcsan_found_watchpoint(const volatile void *ptr,
+ size_t size,
+ int type,
+ atomic_long_t *watchpoint,
+ long encoded_watchpoint)
+{
+ unsigned long flags;
+ bool consumed;
+
+ if (!kcsan_is_enabled())
+ return;
+
+ /*
+ * The access_mask check relies on value-change comparison. To avoid
+ * reporting a race where e.g. the writer set up the watchpoint, but the
+ * reader has access_mask!=0, we have to ignore the found watchpoint.
+ */
+ if (get_ctx()->access_mask != 0)
+ return;
+
+ /*
+ * Consume the watchpoint as soon as possible, to minimize the chances
+ * of !consumed. Consuming the watchpoint must always be guarded by
+ * kcsan_is_enabled() check, as otherwise we might erroneously
+ * triggering reports when disabled.
+ */
+ consumed = try_consume_watchpoint(watchpoint, encoded_watchpoint);
+
+ /* keep this after try_consume_watchpoint */
+ flags = user_access_save();
+
+ if (consumed) {
+ kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_MAYBE,
+ KCSAN_REPORT_CONSUMED_WATCHPOINT,
+ watchpoint - watchpoints);
+ } else {
+ /*
+ * The other thread may not print any diagnostics, as it has
+ * already removed the watchpoint, or another thread consumed
+ * the watchpoint before this thread.
+ */
+ kcsan_counter_inc(KCSAN_COUNTER_REPORT_RACES);
+ }
+
+ if ((type & KCSAN_ACCESS_ASSERT) != 0)
+ kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES);
+ else
+ kcsan_counter_inc(KCSAN_COUNTER_DATA_RACES);
+
+ user_access_restore(flags);
+}
+
+static noinline void
+kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
+{
+ const bool is_write = (type & KCSAN_ACCESS_WRITE) != 0;
+ const bool is_assert = (type & KCSAN_ACCESS_ASSERT) != 0;
+ atomic_long_t *watchpoint;
+ union {
+ u8 _1;
+ u16 _2;
+ u32 _4;
+ u64 _8;
+ } expect_value;
+ unsigned long access_mask;
+ enum kcsan_value_change value_change = KCSAN_VALUE_CHANGE_MAYBE;
+ unsigned long ua_flags = user_access_save();
+ unsigned long irq_flags = 0;
+
+ /*
+ * Always reset kcsan_skip counter in slow-path to avoid underflow; see
+ * should_watch().
+ */
+ reset_kcsan_skip();
+
+ if (!kcsan_is_enabled())
+ goto out;
+
+ /*
+ * Special atomic rules: unlikely to be true, so we check them here in
+ * the slow-path, and not in the fast-path in is_atomic(). Call after
+ * kcsan_is_enabled(), as we may access memory that is not yet
+ * initialized during early boot.
+ */
+ if (!is_assert && kcsan_is_atomic_special(ptr))
+ goto out;
+
+ if (!check_encodable((unsigned long)ptr, size)) {
+ kcsan_counter_inc(KCSAN_COUNTER_UNENCODABLE_ACCESSES);
+ goto out;
+ }
+
+ if (!kcsan_interrupt_watcher)
+ /* Use raw to avoid lockdep recursion via IRQ flags tracing. */
+ raw_local_irq_save(irq_flags);
+
+ watchpoint = insert_watchpoint((unsigned long)ptr, size, is_write);
+ if (watchpoint == NULL) {
+ /*
+ * Out of capacity: the size of 'watchpoints', and the frequency
+ * with which should_watch() returns true should be tweaked so
+ * that this case happens very rarely.
+ */
+ kcsan_counter_inc(KCSAN_COUNTER_NO_CAPACITY);
+ goto out_unlock;
+ }
+
+ kcsan_counter_inc(KCSAN_COUNTER_SETUP_WATCHPOINTS);
+ kcsan_counter_inc(KCSAN_COUNTER_USED_WATCHPOINTS);
+
+ /*
+ * Read the current value, to later check and infer a race if the data
+ * was modified via a non-instrumented access, e.g. from a device.
+ */
+ expect_value._8 = 0;
+ switch (size) {
+ case 1:
+ expect_value._1 = READ_ONCE(*(const u8 *)ptr);
+ break;
+ case 2:
+ expect_value._2 = READ_ONCE(*(const u16 *)ptr);
+ break;
+ case 4:
+ expect_value._4 = READ_ONCE(*(const u32 *)ptr);
+ break;
+ case 8:
+ expect_value._8 = READ_ONCE(*(const u64 *)ptr);
+ break;
+ default:
+ break; /* ignore; we do not diff the values */
+ }
+
+ if (IS_ENABLED(CONFIG_KCSAN_DEBUG)) {
+ kcsan_disable_current();
+ pr_err("KCSAN: watching %s, size: %zu, addr: %px [slot: %d, encoded: %lx]\n",
+ is_write ? "write" : "read", size, ptr,
+ watchpoint_slot((unsigned long)ptr),
+ encode_watchpoint((unsigned long)ptr, size, is_write));
+ kcsan_enable_current();
+ }
+
+ /*
+ * Delay this thread, to increase probability of observing a racy
+ * conflicting access.
+ */
+ udelay(get_delay());
+
+ /*
+ * Re-read value, and check if it is as expected; if not, we infer a
+ * racy access.
+ */
+ access_mask = get_ctx()->access_mask;
+ switch (size) {
+ case 1:
+ expect_value._1 ^= READ_ONCE(*(const u8 *)ptr);
+ if (access_mask)
+ expect_value._1 &= (u8)access_mask;
+ break;
+ case 2:
+ expect_value._2 ^= READ_ONCE(*(const u16 *)ptr);
+ if (access_mask)
+ expect_value._2 &= (u16)access_mask;
+ break;
+ case 4:
+ expect_value._4 ^= READ_ONCE(*(const u32 *)ptr);
+ if (access_mask)
+ expect_value._4 &= (u32)access_mask;
+ break;
+ case 8:
+ expect_value._8 ^= READ_ONCE(*(const u64 *)ptr);
+ if (access_mask)
+ expect_value._8 &= (u64)access_mask;
+ break;
+ default:
+ break; /* ignore; we do not diff the values */
+ }
+
+ /* Were we able to observe a value-change? */
+ if (expect_value._8 != 0)
+ value_change = KCSAN_VALUE_CHANGE_TRUE;
+
+ /* Check if this access raced with another. */
+ if (!consume_watchpoint(watchpoint)) {
+ /*
+ * Depending on the access type, map a value_change of MAYBE to
+ * TRUE (always report) or FALSE (never report).
+ */
+ if (value_change == KCSAN_VALUE_CHANGE_MAYBE) {
+ if (access_mask != 0) {
+ /*
+ * For access with access_mask, we require a
+ * value-change, as it is likely that races on
+ * ~access_mask bits are expected.
+ */
+ value_change = KCSAN_VALUE_CHANGE_FALSE;
+ } else if (size > 8 || is_assert) {
+ /* Always assume a value-change. */
+ value_change = KCSAN_VALUE_CHANGE_TRUE;
+ }
+ }
+
+ /*
+ * No need to increment 'data_races' counter, as the racing
+ * thread already did.
+ *
+ * Count 'assert_failures' for each failed ASSERT access,
+ * therefore both this thread and the racing thread may
+ * increment this counter.
+ */
+ if (is_assert && value_change == KCSAN_VALUE_CHANGE_TRUE)
+ kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES);
+
+ kcsan_report(ptr, size, type, value_change, KCSAN_REPORT_RACE_SIGNAL,
+ watchpoint - watchpoints);
+ } else if (value_change == KCSAN_VALUE_CHANGE_TRUE) {
+ /* Inferring a race, since the value should not have changed. */
+
+ kcsan_counter_inc(KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN);
+ if (is_assert)
+ kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES);
+
+ if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN) || is_assert)
+ kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_TRUE,
+ KCSAN_REPORT_RACE_UNKNOWN_ORIGIN,
+ watchpoint - watchpoints);
+ }
+
+ /*
+ * Remove watchpoint; must be after reporting, since the slot may be
+ * reused after this point.
+ */
+ remove_watchpoint(watchpoint);
+ kcsan_counter_dec(KCSAN_COUNTER_USED_WATCHPOINTS);
+out_unlock:
+ if (!kcsan_interrupt_watcher)
+ raw_local_irq_restore(irq_flags);
+out:
+ user_access_restore(ua_flags);
+}
+
+static __always_inline void check_access(const volatile void *ptr, size_t size,
+ int type)
+{
+ const bool is_write = (type & KCSAN_ACCESS_WRITE) != 0;
+ atomic_long_t *watchpoint;
+ long encoded_watchpoint;
+
+ /*
+ * Do nothing for 0 sized check; this comparison will be optimized out
+ * for constant sized instrumentation (__tsan_{read,write}N).
+ */
+ if (unlikely(size == 0))
+ return;
+
+ /*
+ * Avoid user_access_save in fast-path: find_watchpoint is safe without
+ * user_access_save, as the address that ptr points to is only used to
+ * check if a watchpoint exists; ptr is never dereferenced.
+ */
+ watchpoint = find_watchpoint((unsigned long)ptr, size, !is_write,
+ &encoded_watchpoint);
+ /*
+ * It is safe to check kcsan_is_enabled() after find_watchpoint in the
+ * slow-path, as long as no state changes that cause a race to be
+ * detected and reported have occurred until kcsan_is_enabled() is
+ * checked.
+ */
+
+ if (unlikely(watchpoint != NULL))
+ kcsan_found_watchpoint(ptr, size, type, watchpoint,
+ encoded_watchpoint);
+ else {
+ struct kcsan_ctx *ctx = get_ctx(); /* Call only once in fast-path. */
+
+ if (unlikely(should_watch(ptr, size, type, ctx)))
+ kcsan_setup_watchpoint(ptr, size, type);
+ else if (unlikely(ctx->scoped_accesses.prev))
+ kcsan_check_scoped_accesses();
+ }
+}
+
+/* === Public interface ===================================================== */
+
+void __init kcsan_init(void)
+{
+ BUG_ON(!in_task());
+
+ kcsan_debugfs_init();
+
+ /*
+ * We are in the init task, and no other tasks should be running;
+ * WRITE_ONCE without memory barrier is sufficient.
+ */
+ if (kcsan_early_enable)
+ WRITE_ONCE(kcsan_enabled, true);
+}
+
+/* === Exported interface =================================================== */
+
+void kcsan_disable_current(void)
+{
+ ++get_ctx()->disable_count;
+}
+EXPORT_SYMBOL(kcsan_disable_current);
+
+void kcsan_enable_current(void)
+{
+ if (get_ctx()->disable_count-- == 0) {
+ /*
+ * Warn if kcsan_enable_current() calls are unbalanced with
+ * kcsan_disable_current() calls, which causes disable_count to
+ * become negative and should not happen.
+ */
+ kcsan_disable_current(); /* restore to 0, KCSAN still enabled */
+ kcsan_disable_current(); /* disable to generate warning */
+ WARN(1, "Unbalanced %s()", __func__);
+ kcsan_enable_current();
+ }
+}
+EXPORT_SYMBOL(kcsan_enable_current);
+
+void kcsan_enable_current_nowarn(void)
+{
+ if (get_ctx()->disable_count-- == 0)
+ kcsan_disable_current();
+}
+EXPORT_SYMBOL(kcsan_enable_current_nowarn);
+
+void kcsan_nestable_atomic_begin(void)
+{
+ /*
+ * Do *not* check and warn if we are in a flat atomic region: nestable
+ * and flat atomic regions are independent from each other.
+ * See include/linux/kcsan.h: struct kcsan_ctx comments for more
+ * comments.
+ */
+
+ ++get_ctx()->atomic_nest_count;
+}
+EXPORT_SYMBOL(kcsan_nestable_atomic_begin);
+
+void kcsan_nestable_atomic_end(void)
+{
+ if (get_ctx()->atomic_nest_count-- == 0) {
+ /*
+ * Warn if kcsan_nestable_atomic_end() calls are unbalanced with
+ * kcsan_nestable_atomic_begin() calls, which causes
+ * atomic_nest_count to become negative and should not happen.
+ */
+ kcsan_nestable_atomic_begin(); /* restore to 0 */
+ kcsan_disable_current(); /* disable to generate warning */
+ WARN(1, "Unbalanced %s()", __func__);
+ kcsan_enable_current();
+ }
+}
+EXPORT_SYMBOL(kcsan_nestable_atomic_end);
+
+void kcsan_flat_atomic_begin(void)
+{
+ get_ctx()->in_flat_atomic = true;
+}
+EXPORT_SYMBOL(kcsan_flat_atomic_begin);
+
+void kcsan_flat_atomic_end(void)
+{
+ get_ctx()->in_flat_atomic = false;
+}
+EXPORT_SYMBOL(kcsan_flat_atomic_end);
+
+void kcsan_atomic_next(int n)
+{
+ get_ctx()->atomic_next = n;
+}
+EXPORT_SYMBOL(kcsan_atomic_next);
+
+void kcsan_set_access_mask(unsigned long mask)
+{
+ get_ctx()->access_mask = mask;
+}
+EXPORT_SYMBOL(kcsan_set_access_mask);
+
+struct kcsan_scoped_access *
+kcsan_begin_scoped_access(const volatile void *ptr, size_t size, int type,
+ struct kcsan_scoped_access *sa)
+{
+ struct kcsan_ctx *ctx = get_ctx();
+
+ __kcsan_check_access(ptr, size, type);
+
+ ctx->disable_count++; /* Disable KCSAN, in case list debugging is on. */
+
+ INIT_LIST_HEAD(&sa->list);
+ sa->ptr = ptr;
+ sa->size = size;
+ sa->type = type;
+
+ if (!ctx->scoped_accesses.prev) /* Lazy initialize list head. */
+ INIT_LIST_HEAD(&ctx->scoped_accesses);
+ list_add(&sa->list, &ctx->scoped_accesses);
+
+ ctx->disable_count--;
+ return sa;
+}
+EXPORT_SYMBOL(kcsan_begin_scoped_access);
+
+void kcsan_end_scoped_access(struct kcsan_scoped_access *sa)
+{
+ struct kcsan_ctx *ctx = get_ctx();
+
+ if (WARN(!ctx->scoped_accesses.prev, "Unbalanced %s()?", __func__))
+ return;
+
+ ctx->disable_count++; /* Disable KCSAN, in case list debugging is on. */
+
+ list_del(&sa->list);
+ if (list_empty(&ctx->scoped_accesses))
+ /*
+ * Ensure we do not enter kcsan_check_scoped_accesses()
+ * slow-path if unnecessary, and avoids requiring list_empty()
+ * in the fast-path (to avoid a READ_ONCE() and potential
+ * uaccess warning).
+ */
+ ctx->scoped_accesses.prev = NULL;
+
+ ctx->disable_count--;
+
+ __kcsan_check_access(sa->ptr, sa->size, sa->type);
+}
+EXPORT_SYMBOL(kcsan_end_scoped_access);
+
+void __kcsan_check_access(const volatile void *ptr, size_t size, int type)
+{
+ check_access(ptr, size, type);
+}
+EXPORT_SYMBOL(__kcsan_check_access);
+
+/*
+ * KCSAN uses the same instrumentation that is emitted by supported compilers
+ * for ThreadSanitizer (TSAN).
+ *
+ * When enabled, the compiler emits instrumentation calls (the functions
+ * prefixed with "__tsan" below) for all loads and stores that it generated;
+ * inline asm is not instrumented.
+ *
+ * Note that, not all supported compiler versions distinguish aligned/unaligned
+ * accesses, but e.g. recent versions of Clang do. We simply alias the unaligned
+ * version to the generic version, which can handle both.
+ */
+
+#define DEFINE_TSAN_READ_WRITE(size) \
+ void __tsan_read##size(void *ptr) \
+ { \
+ check_access(ptr, size, 0); \
+ } \
+ EXPORT_SYMBOL(__tsan_read##size); \
+ void __tsan_unaligned_read##size(void *ptr) \
+ __alias(__tsan_read##size); \
+ EXPORT_SYMBOL(__tsan_unaligned_read##size); \
+ void __tsan_write##size(void *ptr) \
+ { \
+ check_access(ptr, size, KCSAN_ACCESS_WRITE); \
+ } \
+ EXPORT_SYMBOL(__tsan_write##size); \
+ void __tsan_unaligned_write##size(void *ptr) \
+ __alias(__tsan_write##size); \
+ EXPORT_SYMBOL(__tsan_unaligned_write##size)
+
+DEFINE_TSAN_READ_WRITE(1);
+DEFINE_TSAN_READ_WRITE(2);
+DEFINE_TSAN_READ_WRITE(4);
+DEFINE_TSAN_READ_WRITE(8);
+DEFINE_TSAN_READ_WRITE(16);
+
+void __tsan_read_range(void *ptr, size_t size)
+{
+ check_access(ptr, size, 0);
+}
+EXPORT_SYMBOL(__tsan_read_range);
+
+void __tsan_write_range(void *ptr, size_t size)
+{
+ check_access(ptr, size, KCSAN_ACCESS_WRITE);
+}
+EXPORT_SYMBOL(__tsan_write_range);
+
+/*
+ * Use of explicit volatile is generally disallowed [1], however, volatile is
+ * still used in various concurrent context, whether in low-level
+ * synchronization primitives or for legacy reasons.
+ * [1] https://lwn.net/Articles/233479/
+ *
+ * We only consider volatile accesses atomic if they are aligned and would pass
+ * the size-check of compiletime_assert_rwonce_type().
+ */
+#define DEFINE_TSAN_VOLATILE_READ_WRITE(size) \
+ void __tsan_volatile_read##size(void *ptr) \
+ { \
+ const bool is_atomic = size <= sizeof(long long) && \
+ IS_ALIGNED((unsigned long)ptr, size); \
+ if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS) && is_atomic) \
+ return; \
+ check_access(ptr, size, is_atomic ? KCSAN_ACCESS_ATOMIC : 0); \
+ } \
+ EXPORT_SYMBOL(__tsan_volatile_read##size); \
+ void __tsan_unaligned_volatile_read##size(void *ptr) \
+ __alias(__tsan_volatile_read##size); \
+ EXPORT_SYMBOL(__tsan_unaligned_volatile_read##size); \
+ void __tsan_volatile_write##size(void *ptr) \
+ { \
+ const bool is_atomic = size <= sizeof(long long) && \
+ IS_ALIGNED((unsigned long)ptr, size); \
+ if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS) && is_atomic) \
+ return; \
+ check_access(ptr, size, \
+ KCSAN_ACCESS_WRITE | \
+ (is_atomic ? KCSAN_ACCESS_ATOMIC : 0)); \
+ } \
+ EXPORT_SYMBOL(__tsan_volatile_write##size); \
+ void __tsan_unaligned_volatile_write##size(void *ptr) \
+ __alias(__tsan_volatile_write##size); \
+ EXPORT_SYMBOL(__tsan_unaligned_volatile_write##size)
+
+DEFINE_TSAN_VOLATILE_READ_WRITE(1);
+DEFINE_TSAN_VOLATILE_READ_WRITE(2);
+DEFINE_TSAN_VOLATILE_READ_WRITE(4);
+DEFINE_TSAN_VOLATILE_READ_WRITE(8);
+DEFINE_TSAN_VOLATILE_READ_WRITE(16);
+
+/*
+ * The below are not required by KCSAN, but can still be emitted by the
+ * compiler.
+ */
+void __tsan_func_entry(void *call_pc)
+{
+}
+EXPORT_SYMBOL(__tsan_func_entry);
+void __tsan_func_exit(void)
+{
+}
+EXPORT_SYMBOL(__tsan_func_exit);
+void __tsan_init(void)
+{
+}
+EXPORT_SYMBOL(__tsan_init);
diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c
new file mode 100644
index 000000000000..023e49c58d55
--- /dev/null
+++ b/kernel/kcsan/debugfs.c
@@ -0,0 +1,349 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/atomic.h>
+#include <linux/bsearch.h>
+#include <linux/bug.h>
+#include <linux/debugfs.h>
+#include <linux/init.h>
+#include <linux/kallsyms.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/sort.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
+
+#include "kcsan.h"
+
+/*
+ * Statistics counters.
+ */
+static atomic_long_t counters[KCSAN_COUNTER_COUNT];
+
+/*
+ * Addresses for filtering functions from reporting. This list can be used as a
+ * whitelist or blacklist.
+ */
+static struct {
+ unsigned long *addrs; /* array of addresses */
+ size_t size; /* current size */
+ int used; /* number of elements used */
+ bool sorted; /* if elements are sorted */
+ bool whitelist; /* if list is a blacklist or whitelist */
+} report_filterlist = {
+ .addrs = NULL,
+ .size = 8, /* small initial size */
+ .used = 0,
+ .sorted = false,
+ .whitelist = false, /* default is blacklist */
+};
+static DEFINE_SPINLOCK(report_filterlist_lock);
+
+static const char *counter_to_name(enum kcsan_counter_id id)
+{
+ switch (id) {
+ case KCSAN_COUNTER_USED_WATCHPOINTS: return "used_watchpoints";
+ case KCSAN_COUNTER_SETUP_WATCHPOINTS: return "setup_watchpoints";
+ case KCSAN_COUNTER_DATA_RACES: return "data_races";
+ case KCSAN_COUNTER_ASSERT_FAILURES: return "assert_failures";
+ case KCSAN_COUNTER_NO_CAPACITY: return "no_capacity";
+ case KCSAN_COUNTER_REPORT_RACES: return "report_races";
+ case KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN: return "races_unknown_origin";
+ case KCSAN_COUNTER_UNENCODABLE_ACCESSES: return "unencodable_accesses";
+ case KCSAN_COUNTER_ENCODING_FALSE_POSITIVES: return "encoding_false_positives";
+ case KCSAN_COUNTER_COUNT:
+ BUG();
+ }
+ return NULL;
+}
+
+void kcsan_counter_inc(enum kcsan_counter_id id)
+{
+ atomic_long_inc(&counters[id]);
+}
+
+void kcsan_counter_dec(enum kcsan_counter_id id)
+{
+ atomic_long_dec(&counters[id]);
+}
+
+/*
+ * The microbenchmark allows benchmarking KCSAN core runtime only. To run
+ * multiple threads, pipe 'microbench=<iters>' from multiple tasks into the
+ * debugfs file. This will not generate any conflicts, and tests fast-path only.
+ */
+static noinline void microbenchmark(unsigned long iters)
+{
+ const struct kcsan_ctx ctx_save = current->kcsan_ctx;
+ const bool was_enabled = READ_ONCE(kcsan_enabled);
+ cycles_t cycles;
+
+ /* We may have been called from an atomic region; reset context. */
+ memset(&current->kcsan_ctx, 0, sizeof(current->kcsan_ctx));
+ /*
+ * Disable to benchmark fast-path for all accesses, and (expected
+ * negligible) call into slow-path, but never set up watchpoints.
+ */
+ WRITE_ONCE(kcsan_enabled, false);
+
+ pr_info("KCSAN: %s begin | iters: %lu\n", __func__, iters);
+
+ cycles = get_cycles();
+ while (iters--) {
+ unsigned long addr = iters & ((PAGE_SIZE << 8) - 1);
+ int type = !(iters & 0x7f) ? KCSAN_ACCESS_ATOMIC :
+ (!(iters & 0xf) ? KCSAN_ACCESS_WRITE : 0);
+ __kcsan_check_access((void *)addr, sizeof(long), type);
+ }
+ cycles = get_cycles() - cycles;
+
+ pr_info("KCSAN: %s end | cycles: %llu\n", __func__, cycles);
+
+ WRITE_ONCE(kcsan_enabled, was_enabled);
+ /* restore context */
+ current->kcsan_ctx = ctx_save;
+}
+
+/*
+ * Simple test to create conflicting accesses. Write 'test=<iters>' to KCSAN's
+ * debugfs file from multiple tasks to generate real conflicts and show reports.
+ */
+static long test_dummy;
+static long test_flags;
+static long test_scoped;
+static noinline void test_thread(unsigned long iters)
+{
+ const long CHANGE_BITS = 0xff00ff00ff00ff00L;
+ const struct kcsan_ctx ctx_save = current->kcsan_ctx;
+ cycles_t cycles;
+
+ /* We may have been called from an atomic region; reset context. */
+ memset(&current->kcsan_ctx, 0, sizeof(current->kcsan_ctx));
+
+ pr_info("KCSAN: %s begin | iters: %lu\n", __func__, iters);
+ pr_info("test_dummy@%px, test_flags@%px, test_scoped@%px,\n",
+ &test_dummy, &test_flags, &test_scoped);
+
+ cycles = get_cycles();
+ while (iters--) {
+ /* These all should generate reports. */
+ __kcsan_check_read(&test_dummy, sizeof(test_dummy));
+ ASSERT_EXCLUSIVE_WRITER(test_dummy);
+ ASSERT_EXCLUSIVE_ACCESS(test_dummy);
+
+ ASSERT_EXCLUSIVE_BITS(test_flags, ~CHANGE_BITS); /* no report */
+ __kcsan_check_read(&test_flags, sizeof(test_flags)); /* no report */
+
+ ASSERT_EXCLUSIVE_BITS(test_flags, CHANGE_BITS); /* report */
+ __kcsan_check_read(&test_flags, sizeof(test_flags)); /* no report */
+
+ /* not actually instrumented */
+ WRITE_ONCE(test_dummy, iters); /* to observe value-change */
+ __kcsan_check_write(&test_dummy, sizeof(test_dummy));
+
+ test_flags ^= CHANGE_BITS; /* generate value-change */
+ __kcsan_check_write(&test_flags, sizeof(test_flags));
+
+ BUG_ON(current->kcsan_ctx.scoped_accesses.prev);
+ {
+ /* Should generate reports anywhere in this block. */
+ ASSERT_EXCLUSIVE_WRITER_SCOPED(test_scoped);
+ ASSERT_EXCLUSIVE_ACCESS_SCOPED(test_scoped);
+ BUG_ON(!current->kcsan_ctx.scoped_accesses.prev);
+ /* Unrelated accesses. */
+ __kcsan_check_access(&cycles, sizeof(cycles), 0);
+ __kcsan_check_access(&cycles, sizeof(cycles), KCSAN_ACCESS_ATOMIC);
+ }
+ BUG_ON(current->kcsan_ctx.scoped_accesses.prev);
+ }
+ cycles = get_cycles() - cycles;
+
+ pr_info("KCSAN: %s end | cycles: %llu\n", __func__, cycles);
+
+ /* restore context */
+ current->kcsan_ctx = ctx_save;
+}
+
+static int cmp_filterlist_addrs(const void *rhs, const void *lhs)
+{
+ const unsigned long a = *(const unsigned long *)rhs;
+ const unsigned long b = *(const unsigned long *)lhs;
+
+ return a < b ? -1 : a == b ? 0 : 1;
+}
+
+bool kcsan_skip_report_debugfs(unsigned long func_addr)
+{
+ unsigned long symbolsize, offset;
+ unsigned long flags;
+ bool ret = false;
+
+ if (!kallsyms_lookup_size_offset(func_addr, &symbolsize, &offset))
+ return false;
+ func_addr -= offset; /* Get function start */
+
+ spin_lock_irqsave(&report_filterlist_lock, flags);
+ if (report_filterlist.used == 0)
+ goto out;
+
+ /* Sort array if it is unsorted, and then do a binary search. */
+ if (!report_filterlist.sorted) {
+ sort(report_filterlist.addrs, report_filterlist.used,
+ sizeof(unsigned long), cmp_filterlist_addrs, NULL);
+ report_filterlist.sorted = true;
+ }
+ ret = !!bsearch(&func_addr, report_filterlist.addrs,
+ report_filterlist.used, sizeof(unsigned long),
+ cmp_filterlist_addrs);
+ if (report_filterlist.whitelist)
+ ret = !ret;
+
+out:
+ spin_unlock_irqrestore(&report_filterlist_lock, flags);
+ return ret;
+}
+
+static void set_report_filterlist_whitelist(bool whitelist)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&report_filterlist_lock, flags);
+ report_filterlist.whitelist = whitelist;
+ spin_unlock_irqrestore(&report_filterlist_lock, flags);
+}
+
+/* Returns 0 on success, error-code otherwise. */
+static ssize_t insert_report_filterlist(const char *func)
+{
+ unsigned long flags;
+ unsigned long addr = kallsyms_lookup_name(func);
+ ssize_t ret = 0;
+
+ if (!addr) {
+ pr_err("KCSAN: could not find function: '%s'\n", func);
+ return -ENOENT;
+ }
+
+ spin_lock_irqsave(&report_filterlist_lock, flags);
+
+ if (report_filterlist.addrs == NULL) {
+ /* initial allocation */
+ report_filterlist.addrs =
+ kmalloc_array(report_filterlist.size,
+ sizeof(unsigned long), GFP_ATOMIC);
+ if (report_filterlist.addrs == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ } else if (report_filterlist.used == report_filterlist.size) {
+ /* resize filterlist */
+ size_t new_size = report_filterlist.size * 2;
+ unsigned long *new_addrs =
+ krealloc(report_filterlist.addrs,
+ new_size * sizeof(unsigned long), GFP_ATOMIC);
+
+ if (new_addrs == NULL) {
+ /* leave filterlist itself untouched */
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ report_filterlist.size = new_size;
+ report_filterlist.addrs = new_addrs;
+ }
+
+ /* Note: deduplicating should be done in userspace. */
+ report_filterlist.addrs[report_filterlist.used++] =
+ kallsyms_lookup_name(func);
+ report_filterlist.sorted = false;
+
+out:
+ spin_unlock_irqrestore(&report_filterlist_lock, flags);
+
+ return ret;
+}
+
+static int show_info(struct seq_file *file, void *v)
+{
+ int i;
+ unsigned long flags;
+
+ /* show stats */
+ seq_printf(file, "enabled: %i\n", READ_ONCE(kcsan_enabled));
+ for (i = 0; i < KCSAN_COUNTER_COUNT; ++i)
+ seq_printf(file, "%s: %ld\n", counter_to_name(i),
+ atomic_long_read(&counters[i]));
+
+ /* show filter functions, and filter type */
+ spin_lock_irqsave(&report_filterlist_lock, flags);
+ seq_printf(file, "\n%s functions: %s\n",
+ report_filterlist.whitelist ? "whitelisted" : "blacklisted",
+ report_filterlist.used == 0 ? "none" : "");
+ for (i = 0; i < report_filterlist.used; ++i)
+ seq_printf(file, " %ps\n", (void *)report_filterlist.addrs[i]);
+ spin_unlock_irqrestore(&report_filterlist_lock, flags);
+
+ return 0;
+}
+
+static int debugfs_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, show_info, NULL);
+}
+
+static ssize_t
+debugfs_write(struct file *file, const char __user *buf, size_t count, loff_t *off)
+{
+ char kbuf[KSYM_NAME_LEN];
+ char *arg;
+ int read_len = count < (sizeof(kbuf) - 1) ? count : (sizeof(kbuf) - 1);
+
+ if (copy_from_user(kbuf, buf, read_len))
+ return -EFAULT;
+ kbuf[read_len] = '\0';
+ arg = strstrip(kbuf);
+
+ if (!strcmp(arg, "on")) {
+ WRITE_ONCE(kcsan_enabled, true);
+ } else if (!strcmp(arg, "off")) {
+ WRITE_ONCE(kcsan_enabled, false);
+ } else if (!strncmp(arg, "microbench=", sizeof("microbench=") - 1)) {
+ unsigned long iters;
+
+ if (kstrtoul(&arg[sizeof("microbench=") - 1], 0, &iters))
+ return -EINVAL;
+ microbenchmark(iters);
+ } else if (!strncmp(arg, "test=", sizeof("test=") - 1)) {
+ unsigned long iters;
+
+ if (kstrtoul(&arg[sizeof("test=") - 1], 0, &iters))
+ return -EINVAL;
+ test_thread(iters);
+ } else if (!strcmp(arg, "whitelist")) {
+ set_report_filterlist_whitelist(true);
+ } else if (!strcmp(arg, "blacklist")) {
+ set_report_filterlist_whitelist(false);
+ } else if (arg[0] == '!') {
+ ssize_t ret = insert_report_filterlist(&arg[1]);
+
+ if (ret < 0)
+ return ret;
+ } else {
+ return -EINVAL;
+ }
+
+ return count;
+}
+
+static const struct file_operations debugfs_ops =
+{
+ .read = seq_read,
+ .open = debugfs_open,
+ .write = debugfs_write,
+ .release = single_release
+};
+
+void __init kcsan_debugfs_init(void)
+{
+ debugfs_create_file("kcsan", 0644, NULL, NULL, &debugfs_ops);
+}
diff --git a/kernel/kcsan/encoding.h b/kernel/kcsan/encoding.h
new file mode 100644
index 000000000000..f03562aaf2eb
--- /dev/null
+++ b/kernel/kcsan/encoding.h
@@ -0,0 +1,95 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _KERNEL_KCSAN_ENCODING_H
+#define _KERNEL_KCSAN_ENCODING_H
+
+#include <linux/bits.h>
+#include <linux/log2.h>
+#include <linux/mm.h>
+
+#include "kcsan.h"
+
+#define SLOT_RANGE PAGE_SIZE
+
+#define INVALID_WATCHPOINT 0
+#define CONSUMED_WATCHPOINT 1
+
+/*
+ * The maximum useful size of accesses for which we set up watchpoints is the
+ * max range of slots we check on an access.
+ */
+#define MAX_ENCODABLE_SIZE (SLOT_RANGE * (1 + KCSAN_CHECK_ADJACENT))
+
+/*
+ * Number of bits we use to store size info.
+ */
+#define WATCHPOINT_SIZE_BITS bits_per(MAX_ENCODABLE_SIZE)
+/*
+ * This encoding for addresses discards the upper (1 for is-write + SIZE_BITS);
+ * however, most 64-bit architectures do not use the full 64-bit address space.
+ * Also, in order for a false positive to be observable 2 things need to happen:
+ *
+ * 1. different addresses but with the same encoded address race;
+ * 2. and both map onto the same watchpoint slots;
+ *
+ * Both these are assumed to be very unlikely. However, in case it still happens
+ * happens, the report logic will filter out the false positive (see report.c).
+ */
+#define WATCHPOINT_ADDR_BITS (BITS_PER_LONG-1 - WATCHPOINT_SIZE_BITS)
+
+/*
+ * Masks to set/retrieve the encoded data.
+ */
+#define WATCHPOINT_WRITE_MASK BIT(BITS_PER_LONG-1)
+#define WATCHPOINT_SIZE_MASK \
+ GENMASK(BITS_PER_LONG-2, BITS_PER_LONG-2 - WATCHPOINT_SIZE_BITS)
+#define WATCHPOINT_ADDR_MASK \
+ GENMASK(BITS_PER_LONG-3 - WATCHPOINT_SIZE_BITS, 0)
+
+static inline bool check_encodable(unsigned long addr, size_t size)
+{
+ return size <= MAX_ENCODABLE_SIZE;
+}
+
+static inline long
+encode_watchpoint(unsigned long addr, size_t size, bool is_write)
+{
+ return (long)((is_write ? WATCHPOINT_WRITE_MASK : 0) |
+ (size << WATCHPOINT_ADDR_BITS) |
+ (addr & WATCHPOINT_ADDR_MASK));
+}
+
+static __always_inline bool decode_watchpoint(long watchpoint,
+ unsigned long *addr_masked,
+ size_t *size,
+ bool *is_write)
+{
+ if (watchpoint == INVALID_WATCHPOINT ||
+ watchpoint == CONSUMED_WATCHPOINT)
+ return false;
+
+ *addr_masked = (unsigned long)watchpoint & WATCHPOINT_ADDR_MASK;
+ *size = ((unsigned long)watchpoint & WATCHPOINT_SIZE_MASK) >> WATCHPOINT_ADDR_BITS;
+ *is_write = !!((unsigned long)watchpoint & WATCHPOINT_WRITE_MASK);
+
+ return true;
+}
+
+/*
+ * Return watchpoint slot for an address.
+ */
+static __always_inline int watchpoint_slot(unsigned long addr)
+{
+ return (addr / PAGE_SIZE) % CONFIG_KCSAN_NUM_WATCHPOINTS;
+}
+
+static __always_inline bool matching_access(unsigned long addr1, size_t size1,
+ unsigned long addr2, size_t size2)
+{
+ unsigned long end_range1 = addr1 + size1 - 1;
+ unsigned long end_range2 = addr2 + size2 - 1;
+
+ return addr1 <= end_range2 && addr2 <= end_range1;
+}
+
+#endif /* _KERNEL_KCSAN_ENCODING_H */
diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h
new file mode 100644
index 000000000000..763d6d08d94b
--- /dev/null
+++ b/kernel/kcsan/kcsan.h
@@ -0,0 +1,142 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * The Kernel Concurrency Sanitizer (KCSAN) infrastructure. For more info please
+ * see Documentation/dev-tools/kcsan.rst.
+ */
+
+#ifndef _KERNEL_KCSAN_KCSAN_H
+#define _KERNEL_KCSAN_KCSAN_H
+
+#include <linux/kcsan.h>
+
+/* The number of adjacent watchpoints to check. */
+#define KCSAN_CHECK_ADJACENT 1
+#define NUM_SLOTS (1 + 2*KCSAN_CHECK_ADJACENT)
+
+extern unsigned int kcsan_udelay_task;
+extern unsigned int kcsan_udelay_interrupt;
+
+/*
+ * Globally enable and disable KCSAN.
+ */
+extern bool kcsan_enabled;
+
+/*
+ * Initialize debugfs file.
+ */
+void kcsan_debugfs_init(void);
+
+enum kcsan_counter_id {
+ /*
+ * Number of watchpoints currently in use.
+ */
+ KCSAN_COUNTER_USED_WATCHPOINTS,
+
+ /*
+ * Total number of watchpoints set up.
+ */
+ KCSAN_COUNTER_SETUP_WATCHPOINTS,
+
+ /*
+ * Total number of data races.
+ */
+ KCSAN_COUNTER_DATA_RACES,
+
+ /*
+ * Total number of ASSERT failures due to races. If the observed race is
+ * due to two conflicting ASSERT type accesses, then both will be
+ * counted.
+ */
+ KCSAN_COUNTER_ASSERT_FAILURES,
+
+ /*
+ * Number of times no watchpoints were available.
+ */
+ KCSAN_COUNTER_NO_CAPACITY,
+
+ /*
+ * A thread checking a watchpoint raced with another checking thread;
+ * only one will be reported.
+ */
+ KCSAN_COUNTER_REPORT_RACES,
+
+ /*
+ * Observed data value change, but writer thread unknown.
+ */
+ KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN,
+
+ /*
+ * The access cannot be encoded to a valid watchpoint.
+ */
+ KCSAN_COUNTER_UNENCODABLE_ACCESSES,
+
+ /*
+ * Watchpoint encoding caused a watchpoint to fire on mismatching
+ * accesses.
+ */
+ KCSAN_COUNTER_ENCODING_FALSE_POSITIVES,
+
+ KCSAN_COUNTER_COUNT, /* number of counters */
+};
+
+/*
+ * Increment/decrement counter with given id; avoid calling these in fast-path.
+ */
+extern void kcsan_counter_inc(enum kcsan_counter_id id);
+extern void kcsan_counter_dec(enum kcsan_counter_id id);
+
+/*
+ * Returns true if data races in the function symbol that maps to func_addr
+ * (offsets are ignored) should *not* be reported.
+ */
+extern bool kcsan_skip_report_debugfs(unsigned long func_addr);
+
+/*
+ * Value-change states.
+ */
+enum kcsan_value_change {
+ /*
+ * Did not observe a value-change, however, it is valid to report the
+ * race, depending on preferences.
+ */
+ KCSAN_VALUE_CHANGE_MAYBE,
+
+ /*
+ * Did not observe a value-change, and it is invalid to report the race.
+ */
+ KCSAN_VALUE_CHANGE_FALSE,
+
+ /*
+ * The value was observed to change, and the race should be reported.
+ */
+ KCSAN_VALUE_CHANGE_TRUE,
+};
+
+enum kcsan_report_type {
+ /*
+ * The thread that set up the watchpoint and briefly stalled was
+ * signalled that another thread triggered the watchpoint.
+ */
+ KCSAN_REPORT_RACE_SIGNAL,
+
+ /*
+ * A thread found and consumed a matching watchpoint.
+ */
+ KCSAN_REPORT_CONSUMED_WATCHPOINT,
+
+ /*
+ * No other thread was observed to race with the access, but the data
+ * value before and after the stall differs.
+ */
+ KCSAN_REPORT_RACE_UNKNOWN_ORIGIN,
+};
+
+/*
+ * Print a race report from thread that encountered the race.
+ */
+extern void kcsan_report(const volatile void *ptr, size_t size, int access_type,
+ enum kcsan_value_change value_change,
+ enum kcsan_report_type type, int watchpoint_idx);
+
+#endif /* _KERNEL_KCSAN_KCSAN_H */
diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c
new file mode 100644
index 000000000000..ac5f8345bae9
--- /dev/null
+++ b/kernel/kcsan/report.c
@@ -0,0 +1,634 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/debug_locks.h>
+#include <linux/delay.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/lockdep.h>
+#include <linux/preempt.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/stacktrace.h>
+
+#include "kcsan.h"
+#include "encoding.h"
+
+/*
+ * Max. number of stack entries to show in the report.
+ */
+#define NUM_STACK_ENTRIES 64
+
+/* Common access info. */
+struct access_info {
+ const volatile void *ptr;
+ size_t size;
+ int access_type;
+ int task_pid;
+ int cpu_id;
+};
+
+/*
+ * Other thread info: communicated from other racing thread to thread that set
+ * up the watchpoint, which then prints the complete report atomically.
+ */
+struct other_info {
+ struct access_info ai;
+ unsigned long stack_entries[NUM_STACK_ENTRIES];
+ int num_stack_entries;
+
+ /*
+ * Optionally pass @current. Typically we do not need to pass @current
+ * via @other_info since just @task_pid is sufficient. Passing @current
+ * has additional overhead.
+ *
+ * To safely pass @current, we must either use get_task_struct/
+ * put_task_struct, or stall the thread that populated @other_info.
+ *
+ * We cannot rely on get_task_struct/put_task_struct in case
+ * release_report() races with a task being released, and would have to
+ * free it in release_report(). This may result in deadlock if we want
+ * to use KCSAN on the allocators.
+ *
+ * Since we also want to reliably print held locks for
+ * CONFIG_KCSAN_VERBOSE, the current implementation stalls the thread
+ * that populated @other_info until it has been consumed.
+ */
+ struct task_struct *task;
+};
+
+/*
+ * To never block any producers of struct other_info, we need as many elements
+ * as we have watchpoints (upper bound on concurrent races to report).
+ */
+static struct other_info other_infos[CONFIG_KCSAN_NUM_WATCHPOINTS + NUM_SLOTS-1];
+
+/*
+ * Information about reported races; used to rate limit reporting.
+ */
+struct report_time {
+ /*
+ * The last time the race was reported.
+ */
+ unsigned long time;
+
+ /*
+ * The frames of the 2 threads; if only 1 thread is known, one frame
+ * will be 0.
+ */
+ unsigned long frame1;
+ unsigned long frame2;
+};
+
+/*
+ * Since we also want to be able to debug allocators with KCSAN, to avoid
+ * deadlock, report_times cannot be dynamically resized with krealloc in
+ * rate_limit_report.
+ *
+ * Therefore, we use a fixed-size array, which at most will occupy a page. This
+ * still adequately rate limits reports, assuming that a) number of unique data
+ * races is not excessive, and b) occurrence of unique races within the
+ * same time window is limited.
+ */
+#define REPORT_TIMES_MAX (PAGE_SIZE / sizeof(struct report_time))
+#define REPORT_TIMES_SIZE \
+ (CONFIG_KCSAN_REPORT_ONCE_IN_MS > REPORT_TIMES_MAX ? \
+ REPORT_TIMES_MAX : \
+ CONFIG_KCSAN_REPORT_ONCE_IN_MS)
+static struct report_time report_times[REPORT_TIMES_SIZE];
+
+/*
+ * Spinlock serializing report generation, and access to @other_infos. Although
+ * it could make sense to have a finer-grained locking story for @other_infos,
+ * report generation needs to be serialized either way, so not much is gained.
+ */
+static DEFINE_RAW_SPINLOCK(report_lock);
+
+/*
+ * Checks if the race identified by thread frames frame1 and frame2 has
+ * been reported since (now - KCSAN_REPORT_ONCE_IN_MS).
+ */
+static bool rate_limit_report(unsigned long frame1, unsigned long frame2)
+{
+ struct report_time *use_entry = &report_times[0];
+ unsigned long invalid_before;
+ int i;
+
+ BUILD_BUG_ON(CONFIG_KCSAN_REPORT_ONCE_IN_MS != 0 && REPORT_TIMES_SIZE == 0);
+
+ if (CONFIG_KCSAN_REPORT_ONCE_IN_MS == 0)
+ return false;
+
+ invalid_before = jiffies - msecs_to_jiffies(CONFIG_KCSAN_REPORT_ONCE_IN_MS);
+
+ /* Check if a matching race report exists. */
+ for (i = 0; i < REPORT_TIMES_SIZE; ++i) {
+ struct report_time *rt = &report_times[i];
+
+ /*
+ * Must always select an entry for use to store info as we
+ * cannot resize report_times; at the end of the scan, use_entry
+ * will be the oldest entry, which ideally also happened before
+ * KCSAN_REPORT_ONCE_IN_MS ago.
+ */
+ if (time_before(rt->time, use_entry->time))
+ use_entry = rt;
+
+ /*
+ * Initially, no need to check any further as this entry as well
+ * as following entries have never been used.
+ */
+ if (rt->time == 0)
+ break;
+
+ /* Check if entry expired. */
+ if (time_before(rt->time, invalid_before))
+ continue; /* before KCSAN_REPORT_ONCE_IN_MS ago */
+
+ /* Reported recently, check if race matches. */
+ if ((rt->frame1 == frame1 && rt->frame2 == frame2) ||
+ (rt->frame1 == frame2 && rt->frame2 == frame1))
+ return true;
+ }
+
+ use_entry->time = jiffies;
+ use_entry->frame1 = frame1;
+ use_entry->frame2 = frame2;
+ return false;
+}
+
+/*
+ * Special rules to skip reporting.
+ */
+static bool
+skip_report(enum kcsan_value_change value_change, unsigned long top_frame)
+{
+ /* Should never get here if value_change==FALSE. */
+ WARN_ON_ONCE(value_change == KCSAN_VALUE_CHANGE_FALSE);
+
+ /*
+ * The first call to skip_report always has value_change==TRUE, since we
+ * cannot know the value written of an instrumented access. For the 2nd
+ * call there are 6 cases with CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY:
+ *
+ * 1. read watchpoint, conflicting write (value_change==TRUE): report;
+ * 2. read watchpoint, conflicting write (value_change==MAYBE): skip;
+ * 3. write watchpoint, conflicting write (value_change==TRUE): report;
+ * 4. write watchpoint, conflicting write (value_change==MAYBE): skip;
+ * 5. write watchpoint, conflicting read (value_change==MAYBE): skip;
+ * 6. write watchpoint, conflicting read (value_change==TRUE): report;
+ *
+ * Cases 1-4 are intuitive and expected; case 5 ensures we do not report
+ * data races where the write may have rewritten the same value; case 6
+ * is possible either if the size is larger than what we check value
+ * changes for or the access type is KCSAN_ACCESS_ASSERT.
+ */
+ if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY) &&
+ value_change == KCSAN_VALUE_CHANGE_MAYBE) {
+ /*
+ * The access is a write, but the data value did not change.
+ *
+ * We opt-out of this filter for certain functions at request of
+ * maintainers.
+ */
+ char buf[64];
+ int len = scnprintf(buf, sizeof(buf), "%ps", (void *)top_frame);
+
+ if (!strnstr(buf, "rcu_", len) &&
+ !strnstr(buf, "_rcu", len) &&
+ !strnstr(buf, "_srcu", len))
+ return true;
+ }
+
+ return kcsan_skip_report_debugfs(top_frame);
+}
+
+static const char *get_access_type(int type)
+{
+ if (type & KCSAN_ACCESS_ASSERT) {
+ if (type & KCSAN_ACCESS_SCOPED) {
+ if (type & KCSAN_ACCESS_WRITE)
+ return "assert no accesses (scoped)";
+ else
+ return "assert no writes (scoped)";
+ } else {
+ if (type & KCSAN_ACCESS_WRITE)
+ return "assert no accesses";
+ else
+ return "assert no writes";
+ }
+ }
+
+ switch (type) {
+ case 0:
+ return "read";
+ case KCSAN_ACCESS_ATOMIC:
+ return "read (marked)";
+ case KCSAN_ACCESS_WRITE:
+ return "write";
+ case KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC:
+ return "write (marked)";
+ case KCSAN_ACCESS_SCOPED:
+ return "read (scoped)";
+ case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_ATOMIC:
+ return "read (marked, scoped)";
+ case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE:
+ return "write (scoped)";
+ case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC:
+ return "write (marked, scoped)";
+ default:
+ BUG();
+ }
+}
+
+static const char *get_bug_type(int type)
+{
+ return (type & KCSAN_ACCESS_ASSERT) != 0 ? "assert: race" : "data-race";
+}
+
+/* Return thread description: in task or interrupt. */
+static const char *get_thread_desc(int task_id)
+{
+ if (task_id != -1) {
+ static char buf[32]; /* safe: protected by report_lock */
+
+ snprintf(buf, sizeof(buf), "task %i", task_id);
+ return buf;
+ }
+ return "interrupt";
+}
+
+/* Helper to skip KCSAN-related functions in stack-trace. */
+static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries)
+{
+ char buf[64];
+ char *cur;
+ int len, skip;
+
+ for (skip = 0; skip < num_entries; ++skip) {
+ len = scnprintf(buf, sizeof(buf), "%ps", (void *)stack_entries[skip]);
+
+ /* Never show tsan_* or {read,write}_once_size. */
+ if (strnstr(buf, "tsan_", len) ||
+ strnstr(buf, "_once_size", len))
+ continue;
+
+ cur = strnstr(buf, "kcsan_", len);
+ if (cur) {
+ cur += sizeof("kcsan_") - 1;
+ if (strncmp(cur, "test", sizeof("test") - 1))
+ continue; /* KCSAN runtime function. */
+ /* KCSAN related test. */
+ }
+
+ /*
+ * No match for runtime functions -- @skip entries to skip to
+ * get to first frame of interest.
+ */
+ break;
+ }
+
+ return skip;
+}
+
+/* Compares symbolized strings of addr1 and addr2. */
+static int sym_strcmp(void *addr1, void *addr2)
+{
+ char buf1[64];
+ char buf2[64];
+
+ snprintf(buf1, sizeof(buf1), "%pS", addr1);
+ snprintf(buf2, sizeof(buf2), "%pS", addr2);
+
+ return strncmp(buf1, buf2, sizeof(buf1));
+}
+
+static void print_verbose_info(struct task_struct *task)
+{
+ if (!task)
+ return;
+
+ pr_err("\n");
+ debug_show_held_locks(task);
+ print_irqtrace_events(task);
+}
+
+/*
+ * Returns true if a report was generated, false otherwise.
+ */
+static bool print_report(enum kcsan_value_change value_change,
+ enum kcsan_report_type type,
+ const struct access_info *ai,
+ const struct other_info *other_info)
+{
+ unsigned long stack_entries[NUM_STACK_ENTRIES] = { 0 };
+ int num_stack_entries = stack_trace_save(stack_entries, NUM_STACK_ENTRIES, 1);
+ int skipnr = get_stack_skipnr(stack_entries, num_stack_entries);
+ unsigned long this_frame = stack_entries[skipnr];
+ unsigned long other_frame = 0;
+ int other_skipnr = 0; /* silence uninit warnings */
+
+ /*
+ * Must check report filter rules before starting to print.
+ */
+ if (skip_report(KCSAN_VALUE_CHANGE_TRUE, stack_entries[skipnr]))
+ return false;
+
+ if (type == KCSAN_REPORT_RACE_SIGNAL) {
+ other_skipnr = get_stack_skipnr(other_info->stack_entries,
+ other_info->num_stack_entries);
+ other_frame = other_info->stack_entries[other_skipnr];
+
+ /* @value_change is only known for the other thread */
+ if (skip_report(value_change, other_frame))
+ return false;
+ }
+
+ if (rate_limit_report(this_frame, other_frame))
+ return false;
+
+ /* Print report header. */
+ pr_err("==================================================================\n");
+ switch (type) {
+ case KCSAN_REPORT_RACE_SIGNAL: {
+ int cmp;
+
+ /*
+ * Order functions lexographically for consistent bug titles.
+ * Do not print offset of functions to keep title short.
+ */
+ cmp = sym_strcmp((void *)other_frame, (void *)this_frame);
+ pr_err("BUG: KCSAN: %s in %ps / %ps\n",
+ get_bug_type(ai->access_type | other_info->ai.access_type),
+ (void *)(cmp < 0 ? other_frame : this_frame),
+ (void *)(cmp < 0 ? this_frame : other_frame));
+ } break;
+
+ case KCSAN_REPORT_RACE_UNKNOWN_ORIGIN:
+ pr_err("BUG: KCSAN: %s in %pS\n", get_bug_type(ai->access_type),
+ (void *)this_frame);
+ break;
+
+ default:
+ BUG();
+ }
+
+ pr_err("\n");
+
+ /* Print information about the racing accesses. */
+ switch (type) {
+ case KCSAN_REPORT_RACE_SIGNAL:
+ pr_err("%s to 0x%px of %zu bytes by %s on cpu %i:\n",
+ get_access_type(other_info->ai.access_type), other_info->ai.ptr,
+ other_info->ai.size, get_thread_desc(other_info->ai.task_pid),
+ other_info->ai.cpu_id);
+
+ /* Print the other thread's stack trace. */
+ stack_trace_print(other_info->stack_entries + other_skipnr,
+ other_info->num_stack_entries - other_skipnr,
+ 0);
+
+ if (IS_ENABLED(CONFIG_KCSAN_VERBOSE))
+ print_verbose_info(other_info->task);
+
+ pr_err("\n");
+ pr_err("%s to 0x%px of %zu bytes by %s on cpu %i:\n",
+ get_access_type(ai->access_type), ai->ptr, ai->size,
+ get_thread_desc(ai->task_pid), ai->cpu_id);
+ break;
+
+ case KCSAN_REPORT_RACE_UNKNOWN_ORIGIN:
+ pr_err("race at unknown origin, with %s to 0x%px of %zu bytes by %s on cpu %i:\n",
+ get_access_type(ai->access_type), ai->ptr, ai->size,
+ get_thread_desc(ai->task_pid), ai->cpu_id);
+ break;
+
+ default:
+ BUG();
+ }
+ /* Print stack trace of this thread. */
+ stack_trace_print(stack_entries + skipnr, num_stack_entries - skipnr,
+ 0);
+
+ if (IS_ENABLED(CONFIG_KCSAN_VERBOSE))
+ print_verbose_info(current);
+
+ /* Print report footer. */
+ pr_err("\n");
+ pr_err("Reported by Kernel Concurrency Sanitizer on:\n");
+ dump_stack_print_info(KERN_DEFAULT);
+ pr_err("==================================================================\n");
+
+ return true;
+}
+
+static void release_report(unsigned long *flags, struct other_info *other_info)
+{
+ if (other_info)
+ /*
+ * Use size to denote valid/invalid, since KCSAN entirely
+ * ignores 0-sized accesses.
+ */
+ other_info->ai.size = 0;
+
+ raw_spin_unlock_irqrestore(&report_lock, *flags);
+}
+
+/*
+ * Sets @other_info->task and awaits consumption of @other_info.
+ *
+ * Precondition: report_lock is held.
+ * Postcondition: report_lock is held.
+ */
+static void set_other_info_task_blocking(unsigned long *flags,
+ const struct access_info *ai,
+ struct other_info *other_info)
+{
+ /*
+ * We may be instrumenting a code-path where current->state is already
+ * something other than TASK_RUNNING.
+ */
+ const bool is_running = current->state == TASK_RUNNING;
+ /*
+ * To avoid deadlock in case we are in an interrupt here and this is a
+ * race with a task on the same CPU (KCSAN_INTERRUPT_WATCHER), provide a
+ * timeout to ensure this works in all contexts.
+ *
+ * Await approximately the worst case delay of the reporting thread (if
+ * we are not interrupted).
+ */
+ int timeout = max(kcsan_udelay_task, kcsan_udelay_interrupt);
+
+ other_info->task = current;
+ do {
+ if (is_running) {
+ /*
+ * Let lockdep know the real task is sleeping, to print
+ * the held locks (recall we turned lockdep off, so
+ * locking/unlocking @report_lock won't be recorded).
+ */
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ }
+ raw_spin_unlock_irqrestore(&report_lock, *flags);
+ /*
+ * We cannot call schedule() since we also cannot reliably
+ * determine if sleeping here is permitted -- see in_atomic().
+ */
+
+ udelay(1);
+ raw_spin_lock_irqsave(&report_lock, *flags);
+ if (timeout-- < 0) {
+ /*
+ * Abort. Reset @other_info->task to NULL, since it
+ * appears the other thread is still going to consume
+ * it. It will result in no verbose info printed for
+ * this task.
+ */
+ other_info->task = NULL;
+ break;
+ }
+ /*
+ * If invalid, or @ptr nor @current matches, then @other_info
+ * has been consumed and we may continue. If not, retry.
+ */
+ } while (other_info->ai.size && other_info->ai.ptr == ai->ptr &&
+ other_info->task == current);
+ if (is_running)
+ set_current_state(TASK_RUNNING);
+}
+
+/* Populate @other_info; requires that the provided @other_info not in use. */
+static void prepare_report_producer(unsigned long *flags,
+ const struct access_info *ai,
+ struct other_info *other_info)
+{
+ raw_spin_lock_irqsave(&report_lock, *flags);
+
+ /*
+ * The same @other_infos entry cannot be used concurrently, because
+ * there is a one-to-one mapping to watchpoint slots (@watchpoints in
+ * core.c), and a watchpoint is only released for reuse after reporting
+ * is done by the consumer of @other_info. Therefore, it is impossible
+ * for another concurrent prepare_report_producer() to set the same
+ * @other_info, and are guaranteed exclusivity for the @other_infos
+ * entry pointed to by @other_info.
+ *
+ * To check this property holds, size should never be non-zero here,
+ * because every consumer of struct other_info resets size to 0 in
+ * release_report().
+ */
+ WARN_ON(other_info->ai.size);
+
+ other_info->ai = *ai;
+ other_info->num_stack_entries = stack_trace_save(other_info->stack_entries, NUM_STACK_ENTRIES, 2);
+
+ if (IS_ENABLED(CONFIG_KCSAN_VERBOSE))
+ set_other_info_task_blocking(flags, ai, other_info);
+
+ raw_spin_unlock_irqrestore(&report_lock, *flags);
+}
+
+/* Awaits producer to fill @other_info and then returns. */
+static bool prepare_report_consumer(unsigned long *flags,
+ const struct access_info *ai,
+ struct other_info *other_info)
+{
+
+ raw_spin_lock_irqsave(&report_lock, *flags);
+ while (!other_info->ai.size) { /* Await valid @other_info. */
+ raw_spin_unlock_irqrestore(&report_lock, *flags);
+ cpu_relax();
+ raw_spin_lock_irqsave(&report_lock, *flags);
+ }
+
+ /* Should always have a matching access based on watchpoint encoding. */
+ if (WARN_ON(!matching_access((unsigned long)other_info->ai.ptr & WATCHPOINT_ADDR_MASK, other_info->ai.size,
+ (unsigned long)ai->ptr & WATCHPOINT_ADDR_MASK, ai->size)))
+ goto discard;
+
+ if (!matching_access((unsigned long)other_info->ai.ptr, other_info->ai.size,
+ (unsigned long)ai->ptr, ai->size)) {
+ /*
+ * If the actual accesses to not match, this was a false
+ * positive due to watchpoint encoding.
+ */
+ kcsan_counter_inc(KCSAN_COUNTER_ENCODING_FALSE_POSITIVES);
+ goto discard;
+ }
+
+ return true;
+
+discard:
+ release_report(flags, other_info);
+ return false;
+}
+
+/*
+ * Depending on the report type either sets @other_info and returns false, or
+ * awaits @other_info and returns true. If @other_info is not required for the
+ * report type, simply acquires @report_lock and returns true.
+ */
+static noinline bool prepare_report(unsigned long *flags,
+ enum kcsan_report_type type,
+ const struct access_info *ai,
+ struct other_info *other_info)
+{
+ switch (type) {
+ case KCSAN_REPORT_CONSUMED_WATCHPOINT:
+ prepare_report_producer(flags, ai, other_info);
+ return false;
+ case KCSAN_REPORT_RACE_SIGNAL:
+ return prepare_report_consumer(flags, ai, other_info);
+ default:
+ /* @other_info not required; just acquire @report_lock. */
+ raw_spin_lock_irqsave(&report_lock, *flags);
+ return true;
+ }
+}
+
+void kcsan_report(const volatile void *ptr, size_t size, int access_type,
+ enum kcsan_value_change value_change,
+ enum kcsan_report_type type, int watchpoint_idx)
+{
+ unsigned long flags = 0;
+ const struct access_info ai = {
+ .ptr = ptr,
+ .size = size,
+ .access_type = access_type,
+ .task_pid = in_task() ? task_pid_nr(current) : -1,
+ .cpu_id = raw_smp_processor_id()
+ };
+ struct other_info *other_info = type == KCSAN_REPORT_RACE_UNKNOWN_ORIGIN
+ ? NULL : &other_infos[watchpoint_idx];
+
+ kcsan_disable_current();
+ if (WARN_ON(watchpoint_idx < 0 || watchpoint_idx >= ARRAY_SIZE(other_infos)))
+ goto out;
+
+ /*
+ * With TRACE_IRQFLAGS, lockdep's IRQ trace state becomes corrupted if
+ * we do not turn off lockdep here; this could happen due to recursion
+ * into lockdep via KCSAN if we detect a race in utilities used by
+ * lockdep.
+ */
+ lockdep_off();
+
+ if (prepare_report(&flags, type, &ai, other_info)) {
+ /*
+ * Never report if value_change is FALSE, only if we it is
+ * either TRUE or MAYBE. In case of MAYBE, further filtering may
+ * be done once we know the full stack trace in print_report().
+ */
+ bool reported = value_change != KCSAN_VALUE_CHANGE_FALSE &&
+ print_report(value_change, type, &ai, other_info);
+
+ if (reported && panic_on_warn)
+ panic("panic_on_warn set ...\n");
+
+ release_report(&flags, other_info);
+ }
+
+ lockdep_on();
+out:
+ kcsan_enable_current();
+}
diff --git a/kernel/kcsan/test.c b/kernel/kcsan/test.c
new file mode 100644
index 000000000000..d26a052d3383
--- /dev/null
+++ b/kernel/kcsan/test.c
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/printk.h>
+#include <linux/random.h>
+#include <linux/types.h>
+
+#include "encoding.h"
+
+#define ITERS_PER_TEST 2000
+
+/* Test requirements. */
+static bool test_requires(void)
+{
+ /* random should be initialized for the below tests */
+ return prandom_u32() + prandom_u32() != 0;
+}
+
+/*
+ * Test watchpoint encode and decode: check that encoding some access's info,
+ * and then subsequent decode preserves the access's info.
+ */
+static bool test_encode_decode(void)
+{
+ int i;
+
+ for (i = 0; i < ITERS_PER_TEST; ++i) {
+ size_t size = prandom_u32_max(MAX_ENCODABLE_SIZE) + 1;
+ bool is_write = !!prandom_u32_max(2);
+ unsigned long addr;
+
+ prandom_bytes(&addr, sizeof(addr));
+ if (WARN_ON(!check_encodable(addr, size)))
+ return false;
+
+ /* Encode and decode */
+ {
+ const long encoded_watchpoint =
+ encode_watchpoint(addr, size, is_write);
+ unsigned long verif_masked_addr;
+ size_t verif_size;
+ bool verif_is_write;
+
+ /* Check special watchpoints */
+ if (WARN_ON(decode_watchpoint(
+ INVALID_WATCHPOINT, &verif_masked_addr,
+ &verif_size, &verif_is_write)))
+ return false;
+ if (WARN_ON(decode_watchpoint(
+ CONSUMED_WATCHPOINT, &verif_masked_addr,
+ &verif_size, &verif_is_write)))
+ return false;
+
+ /* Check decoding watchpoint returns same data */
+ if (WARN_ON(!decode_watchpoint(
+ encoded_watchpoint, &verif_masked_addr,
+ &verif_size, &verif_is_write)))
+ return false;
+ if (WARN_ON(verif_masked_addr !=
+ (addr & WATCHPOINT_ADDR_MASK)))
+ goto fail;
+ if (WARN_ON(verif_size != size))
+ goto fail;
+ if (WARN_ON(is_write != verif_is_write))
+ goto fail;
+
+ continue;
+fail:
+ pr_err("%s fail: %s %zu bytes @ %lx -> encoded: %lx -> %s %zu bytes @ %lx\n",
+ __func__, is_write ? "write" : "read", size,
+ addr, encoded_watchpoint,
+ verif_is_write ? "write" : "read", verif_size,
+ verif_masked_addr);
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/* Test access matching function. */
+static bool test_matching_access(void)
+{
+ if (WARN_ON(!matching_access(10, 1, 10, 1)))
+ return false;
+ if (WARN_ON(!matching_access(10, 2, 11, 1)))
+ return false;
+ if (WARN_ON(!matching_access(10, 1, 9, 2)))
+ return false;
+ if (WARN_ON(matching_access(10, 1, 11, 1)))
+ return false;
+ if (WARN_ON(matching_access(9, 1, 10, 1)))
+ return false;
+
+ /*
+ * An access of size 0 could match another access, as demonstrated here.
+ * Rather than add more comparisons to 'matching_access()', which would
+ * end up in the fast-path for *all* checks, check_access() simply
+ * returns for all accesses of size 0.
+ */
+ if (WARN_ON(!matching_access(8, 8, 12, 0)))
+ return false;
+
+ return true;
+}
+
+static int __init kcsan_selftest(void)
+{
+ int passed = 0;
+ int total = 0;
+
+#define RUN_TEST(do_test) \
+ do { \
+ ++total; \
+ if (do_test()) \
+ ++passed; \
+ else \
+ pr_err("KCSAN selftest: " #do_test " failed"); \
+ } while (0)
+
+ RUN_TEST(test_requires);
+ RUN_TEST(test_encode_decode);
+ RUN_TEST(test_matching_access);
+
+ pr_info("KCSAN selftest: %d/%d tests passed\n", passed, total);
+ if (passed != total)
+ panic("KCSAN selftests failed");
+ return 0;
+}
+postcore_initcall(kcsan_selftest);
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index faa74d5f6941..09cc78df53c6 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -181,34 +181,19 @@ void kimage_file_post_load_cleanup(struct kimage *image)
static int
kimage_validate_signature(struct kimage *image)
{
- const char *reason;
int ret;
ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
image->kernel_buf_len);
- switch (ret) {
- case 0:
- break;
+ if (ret) {
- /* Certain verification errors are non-fatal if we're not
- * checking errors, provided we aren't mandating that there
- * must be a valid signature.
- */
- case -ENODATA:
- reason = "kexec of unsigned image";
- goto decide;
- case -ENOPKG:
- reason = "kexec of image with unsupported crypto";
- goto decide;
- case -ENOKEY:
- reason = "kexec of image with unavailable key";
- decide:
if (IS_ENABLED(CONFIG_KEXEC_SIG_FORCE)) {
- pr_notice("%s rejected\n", reason);
+ pr_notice("Enforced kernel signature verification failed (%d).\n", ret);
return ret;
}
- /* If IMA is guaranteed to appraise a signature on the kexec
+ /*
+ * If IMA is guaranteed to appraise a signature on the kexec
* image, permit it even if the kernel is otherwise locked
* down.
*/
@@ -216,17 +201,10 @@ kimage_validate_signature(struct kimage *image)
security_locked_down(LOCKDOWN_KEXEC))
return -EPERM;
- return 0;
-
- /* All other errors are fatal, including nomem, unparseable
- * signatures and signature check failures - even if signatures
- * aren't required.
- */
- default:
- pr_notice("kernel signature verification failed (%d).\n", ret);
+ pr_debug("kernel signature verification failed (%d).\n", ret);
}
- return ret;
+ return 0;
}
#endif
@@ -540,6 +518,11 @@ static int locate_mem_hole_callback(struct resource *res, void *arg)
unsigned long sz = end - start + 1;
/* Returning 0 will take to next memory range */
+
+ /* Don't use memory that will be detected and handled by a driver. */
+ if (res->flags & IORESOURCE_MEM_DRIVER_MANAGED)
+ return 0;
+
if (sz < kbuf->memsz)
return 0;
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3f310df4a693..4a904cc56d68 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -46,6 +46,11 @@
static int kprobes_initialized;
+/* kprobe_table can be accessed by
+ * - Normal hlist traversal and RCU add/del under kprobe_mutex is held.
+ * Or
+ * - RCU hlist traversal under disabling preempt (breakpoint handlers)
+ */
static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
@@ -326,7 +331,8 @@ struct kprobe *get_kprobe(void *addr)
struct kprobe *p;
head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
- hlist_for_each_entry_rcu(p, head, hlist) {
+ hlist_for_each_entry_rcu(p, head, hlist,
+ lockdep_is_held(&kprobe_mutex)) {
if (p->addr == addr)
return p;
}
@@ -586,11 +592,12 @@ static void kprobe_optimizer(struct work_struct *work)
mutex_unlock(&module_mutex);
mutex_unlock(&text_mutex);
cpus_read_unlock();
- mutex_unlock(&kprobe_mutex);
/* Step 5: Kick optimizer again if needed */
if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
kick_kprobe_optimizer();
+
+ mutex_unlock(&kprobe_mutex);
}
/* Wait for completing optimization and unoptimization */
@@ -668,8 +675,6 @@ static void force_unoptimize_kprobe(struct optimized_kprobe *op)
lockdep_assert_cpus_held();
arch_unoptimize_kprobe(op);
op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
- if (kprobe_disabled(&op->kp))
- arch_disarm_kprobe(&op->kp);
}
/* Unoptimize a kprobe if p is optimized */
@@ -849,7 +854,7 @@ static void optimize_all_kprobes(void)
kprobes_allow_optimization = true;
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
- hlist_for_each_entry_rcu(p, head, hlist)
+ hlist_for_each_entry(p, head, hlist)
if (!kprobe_disabled(p))
optimize_kprobe(p);
}
@@ -876,7 +881,7 @@ static void unoptimize_all_kprobes(void)
kprobes_allow_optimization = false;
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
- hlist_for_each_entry_rcu(p, head, hlist) {
+ hlist_for_each_entry(p, head, hlist) {
if (!kprobe_disabled(p))
unoptimize_kprobe(p, false);
}
@@ -892,7 +897,7 @@ static void unoptimize_all_kprobes(void)
static DEFINE_MUTEX(kprobe_sysctl_mutex);
int sysctl_kprobes_optimization;
int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length,
+ void *buffer, size_t *length,
loff_t *ppos)
{
int ret;
@@ -1236,6 +1241,26 @@ __releases(hlist_lock)
}
NOKPROBE_SYMBOL(kretprobe_table_unlock);
+struct kprobe kprobe_busy = {
+ .addr = (void *) get_kprobe,
+};
+
+void kprobe_busy_begin(void)
+{
+ struct kprobe_ctlblk *kcb;
+
+ preempt_disable();
+ __this_cpu_write(current_kprobe, &kprobe_busy);
+ kcb = get_kprobe_ctlblk();
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+}
+
+void kprobe_busy_end(void)
+{
+ __this_cpu_write(current_kprobe, NULL);
+ preempt_enable();
+}
+
/*
* This function is called from finish_task_switch when task tk becomes dead,
* so that we can recycle any function-return probe instances associated
@@ -1253,6 +1278,8 @@ void kprobe_flush_task(struct task_struct *tk)
/* Early boot. kretprobe_table_locks not yet initialized. */
return;
+ kprobe_busy_begin();
+
INIT_HLIST_HEAD(&empty_rp);
hash = hash_ptr(tk, KPROBE_HASH_BITS);
head = &kretprobe_inst_table[hash];
@@ -1266,6 +1293,8 @@ void kprobe_flush_task(struct task_struct *tk)
hlist_del(&ri->hlist);
kfree(ri);
}
+
+ kprobe_busy_end();
}
NOKPROBE_SYMBOL(kprobe_flush_task);
@@ -1499,12 +1528,14 @@ static struct kprobe *__get_valid_kprobe(struct kprobe *p)
{
struct kprobe *ap, *list_p;
+ lockdep_assert_held(&kprobe_mutex);
+
ap = get_kprobe(p->addr);
if (unlikely(!ap))
return NULL;
if (p != ap) {
- list_for_each_entry_rcu(list_p, &ap->list, list)
+ list_for_each_entry(list_p, &ap->list, list)
if (list_p == p)
/* kprobe p is a valid probe */
goto valid;
@@ -1669,7 +1700,9 @@ static int aggr_kprobe_disabled(struct kprobe *ap)
{
struct kprobe *kp;
- list_for_each_entry_rcu(kp, &ap->list, list)
+ lockdep_assert_held(&kprobe_mutex);
+
+ list_for_each_entry(kp, &ap->list, list)
if (!kprobe_disabled(kp))
/*
* There is an active probe on the list.
@@ -1748,7 +1781,7 @@ static int __unregister_kprobe_top(struct kprobe *p)
else {
/* If disabling probe has special handlers, update aggrprobe */
if (p->post_handler && !kprobe_gone(p)) {
- list_for_each_entry_rcu(list_p, &ap->list, list) {
+ list_for_each_entry(list_p, &ap->list, list) {
if ((list_p != p) && (list_p->post_handler))
goto noclean;
}
@@ -2062,13 +2095,15 @@ static void kill_kprobe(struct kprobe *p)
{
struct kprobe *kp;
+ lockdep_assert_held(&kprobe_mutex);
+
p->flags |= KPROBE_FLAG_GONE;
if (kprobe_aggrprobe(p)) {
/*
* If this is an aggr_kprobe, we have to list all the
* chained probes and mark them GONE.
*/
- list_for_each_entry_rcu(kp, &p->list, list)
+ list_for_each_entry(kp, &p->list, list)
kp->flags |= KPROBE_FLAG_GONE;
p->post_handler = NULL;
kill_optimized_kprobe(p);
@@ -2312,7 +2347,7 @@ static int kprobes_module_callback(struct notifier_block *nb,
mutex_lock(&kprobe_mutex);
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
- hlist_for_each_entry_rcu(p, head, hlist)
+ hlist_for_each_entry(p, head, hlist)
if (within_module_init((unsigned long)p->addr, mod) ||
(checkcore &&
within_module_core((unsigned long)p->addr, mod))) {
@@ -2475,24 +2510,14 @@ static int show_kprobe_addr(struct seq_file *pi, void *v)
return 0;
}
-static const struct seq_operations kprobes_seq_ops = {
+static const struct seq_operations kprobes_sops = {
.start = kprobe_seq_start,
.next = kprobe_seq_next,
.stop = kprobe_seq_stop,
.show = show_kprobe_addr
};
-static int kprobes_open(struct inode *inode, struct file *filp)
-{
- return seq_open(filp, &kprobes_seq_ops);
-}
-
-static const struct file_operations debugfs_kprobes_operations = {
- .open = kprobes_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
+DEFINE_SEQ_ATTRIBUTE(kprobes);
/* kprobes/blacklist -- shows which functions can not be probed */
static void *kprobe_blacklist_seq_start(struct seq_file *m, loff_t *pos)
@@ -2529,24 +2554,13 @@ static void kprobe_blacklist_seq_stop(struct seq_file *f, void *v)
mutex_unlock(&kprobe_mutex);
}
-static const struct seq_operations kprobe_blacklist_seq_ops = {
+static const struct seq_operations kprobe_blacklist_sops = {
.start = kprobe_blacklist_seq_start,
.next = kprobe_blacklist_seq_next,
.stop = kprobe_blacklist_seq_stop,
.show = kprobe_blacklist_seq_show,
};
-
-static int kprobe_blacklist_open(struct inode *inode, struct file *filp)
-{
- return seq_open(filp, &kprobe_blacklist_seq_ops);
-}
-
-static const struct file_operations debugfs_kprobe_blacklist_ops = {
- .open = kprobe_blacklist_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
+DEFINE_SEQ_ATTRIBUTE(kprobe_blacklist);
static int arm_all_kprobes(void)
{
@@ -2571,7 +2585,7 @@ static int arm_all_kprobes(void)
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
/* Arm all kprobes on a best-effort basis */
- hlist_for_each_entry_rcu(p, head, hlist) {
+ hlist_for_each_entry(p, head, hlist) {
if (!kprobe_disabled(p)) {
err = arm_kprobe(p);
if (err) {
@@ -2614,7 +2628,7 @@ static int disarm_all_kprobes(void)
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
/* Disarm all kprobes on a best-effort basis */
- hlist_for_each_entry_rcu(p, head, hlist) {
+ hlist_for_each_entry(p, head, hlist) {
if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) {
err = disarm_kprobe(p, false);
if (err) {
@@ -2705,13 +2719,12 @@ static int __init debugfs_kprobe_init(void)
dir = debugfs_create_dir("kprobes", NULL);
- debugfs_create_file("list", 0400, dir, NULL,
- &debugfs_kprobes_operations);
+ debugfs_create_file("list", 0400, dir, NULL, &kprobes_fops);
debugfs_create_file("enabled", 0600, dir, &value, &fops_kp);
debugfs_create_file("blacklist", 0400, dir, NULL,
- &debugfs_kprobe_blacklist_ops);
+ &kprobe_blacklist_fops);
return 0;
}
diff --git a/kernel/kthread.c b/kernel/kthread.c
index bfbfa481be3a..132f84a5fde3 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1,13 +1,17 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Kernel thread helper functions.
* Copyright (C) 2004 IBM Corporation, Rusty Russell.
+ * Copyright (C) 2009 Red Hat, Inc.
*
* Creation is done via kthreadd, so that we get a clean environment
* even if we're invoked from userspace (think modprobe, hotplug cpu,
* etc.).
*/
#include <uapi/linux/sched/types.h>
+#include <linux/mm.h>
+#include <linux/mmu_context.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/kthread.h>
#include <linux/completion.h>
@@ -25,6 +29,7 @@
#include <linux/numa.h>
#include <trace/events/sched.h>
+
static DEFINE_SPINLOCK(kthread_create_lock);
static LIST_HEAD(kthread_create_list);
struct task_struct *kthreadd_task;
@@ -46,7 +51,9 @@ struct kthread_create_info
struct kthread {
unsigned long flags;
unsigned int cpu;
+ int (*threadfn)(void *);
void *data;
+ mm_segment_t oldfs;
struct completion parked;
struct completion exited;
#ifdef CONFIG_BLK_CGROUP
@@ -153,6 +160,20 @@ bool kthread_freezable_should_stop(bool *was_frozen)
EXPORT_SYMBOL_GPL(kthread_freezable_should_stop);
/**
+ * kthread_func - return the function specified on kthread creation
+ * @task: kthread task in question
+ *
+ * Returns NULL if the task is not a kthread.
+ */
+void *kthread_func(struct task_struct *task)
+{
+ if (task->flags & PF_KTHREAD)
+ return to_kthread(task)->threadfn;
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(kthread_func);
+
+/**
* kthread_data - return data value specified on kthread creation
* @task: kthread task in question
*
@@ -164,6 +185,7 @@ void *kthread_data(struct task_struct *task)
{
return to_kthread(task)->data;
}
+EXPORT_SYMBOL_GPL(kthread_data);
/**
* kthread_probe_data - speculative version of kthread_data()
@@ -179,7 +201,7 @@ void *kthread_probe_data(struct task_struct *task)
struct kthread *kthread = to_kthread(task);
void *data = NULL;
- probe_kernel_read(&data, &kthread->data, sizeof(data));
+ copy_from_kernel_nofault(&data, &kthread->data, sizeof(data));
return data;
}
@@ -244,6 +266,7 @@ static int kthread(void *_create)
do_exit(-ENOMEM);
}
+ self->threadfn = threadfn;
self->data = data;
init_completion(&self->exited);
init_completion(&self->parked);
@@ -1203,6 +1226,61 @@ void kthread_destroy_worker(struct kthread_worker *worker)
}
EXPORT_SYMBOL(kthread_destroy_worker);
+/**
+ * kthread_use_mm - make the calling kthread operate on an address space
+ * @mm: address space to operate on
+ */
+void kthread_use_mm(struct mm_struct *mm)
+{
+ struct mm_struct *active_mm;
+ struct task_struct *tsk = current;
+
+ WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
+ WARN_ON_ONCE(tsk->mm);
+
+ task_lock(tsk);
+ active_mm = tsk->active_mm;
+ if (active_mm != mm) {
+ mmgrab(mm);
+ tsk->active_mm = mm;
+ }
+ tsk->mm = mm;
+ switch_mm(active_mm, mm, tsk);
+ task_unlock(tsk);
+#ifdef finish_arch_post_lock_switch
+ finish_arch_post_lock_switch();
+#endif
+
+ if (active_mm != mm)
+ mmdrop(active_mm);
+
+ to_kthread(tsk)->oldfs = get_fs();
+ set_fs(USER_DS);
+}
+EXPORT_SYMBOL_GPL(kthread_use_mm);
+
+/**
+ * kthread_unuse_mm - reverse the effect of kthread_use_mm()
+ * @mm: address space to operate on
+ */
+void kthread_unuse_mm(struct mm_struct *mm)
+{
+ struct task_struct *tsk = current;
+
+ WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
+ WARN_ON_ONCE(!tsk->mm);
+
+ set_fs(to_kthread(tsk)->oldfs);
+
+ task_lock(tsk);
+ sync_mm_rss(mm);
+ tsk->mm = NULL;
+ /* active_mm is still 'mm' */
+ enter_lazy_tlb(mm, tsk);
+ task_unlock(tsk);
+}
+EXPORT_SYMBOL_GPL(kthread_unuse_mm);
+
#ifdef CONFIG_BLK_CGROUP
/**
* kthread_associate_blkcg - associate blkcg to current kthread
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 8d1c15832e55..166d7bf49666 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -269,8 +269,8 @@ static int __init init_lstats_procfs(void)
return 0;
}
-int sysctl_latencytop(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+int sysctl_latencytop(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
int err;
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index c3512e7e0801..f76fdb925532 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -191,18 +191,21 @@ static int klp_find_object_symbol(const char *objname, const char *name,
return -EINVAL;
}
-static int klp_resolve_symbols(Elf_Shdr *relasec, struct module *pmod)
+static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab,
+ unsigned int symndx, Elf_Shdr *relasec,
+ const char *sec_objname)
{
- int i, cnt, vmlinux, ret;
- char objname[MODULE_NAME_LEN];
- char symname[KSYM_NAME_LEN];
- char *strtab = pmod->core_kallsyms.strtab;
+ int i, cnt, ret;
+ char sym_objname[MODULE_NAME_LEN];
+ char sym_name[KSYM_NAME_LEN];
Elf_Rela *relas;
Elf_Sym *sym;
unsigned long sympos, addr;
+ bool sym_vmlinux;
+ bool sec_vmlinux = !strcmp(sec_objname, "vmlinux");
/*
- * Since the field widths for objname and symname in the sscanf()
+ * Since the field widths for sym_objname and sym_name in the sscanf()
* call are hard-coded and correspond to MODULE_NAME_LEN and
* KSYM_NAME_LEN respectively, we must make sure that MODULE_NAME_LEN
* and KSYM_NAME_LEN have the values we expect them to have.
@@ -216,27 +219,40 @@ static int klp_resolve_symbols(Elf_Shdr *relasec, struct module *pmod)
relas = (Elf_Rela *) relasec->sh_addr;
/* For each rela in this klp relocation section */
for (i = 0; i < relasec->sh_size / sizeof(Elf_Rela); i++) {
- sym = pmod->core_kallsyms.symtab + ELF_R_SYM(relas[i].r_info);
+ sym = (Elf64_Sym *)sechdrs[symndx].sh_addr + ELF_R_SYM(relas[i].r_info);
if (sym->st_shndx != SHN_LIVEPATCH) {
pr_err("symbol %s is not marked as a livepatch symbol\n",
strtab + sym->st_name);
return -EINVAL;
}
- /* Format: .klp.sym.objname.symname,sympos */
+ /* Format: .klp.sym.sym_objname.sym_name,sympos */
cnt = sscanf(strtab + sym->st_name,
".klp.sym.%55[^.].%127[^,],%lu",
- objname, symname, &sympos);
+ sym_objname, sym_name, &sympos);
if (cnt != 3) {
pr_err("symbol %s has an incorrectly formatted name\n",
strtab + sym->st_name);
return -EINVAL;
}
+ sym_vmlinux = !strcmp(sym_objname, "vmlinux");
+
+ /*
+ * Prevent module-specific KLP rela sections from referencing
+ * vmlinux symbols. This helps prevent ordering issues with
+ * module special section initializations. Presumably such
+ * symbols are exported and normal relas can be used instead.
+ */
+ if (!sec_vmlinux && sym_vmlinux) {
+ pr_err("invalid access to vmlinux symbol '%s' from module-specific livepatch relocation section",
+ sym_name);
+ return -EINVAL;
+ }
+
/* klp_find_object_symbol() treats a NULL objname as vmlinux */
- vmlinux = !strcmp(objname, "vmlinux");
- ret = klp_find_object_symbol(vmlinux ? NULL : objname,
- symname, sympos, &addr);
+ ret = klp_find_object_symbol(sym_vmlinux ? NULL : sym_objname,
+ sym_name, sympos, &addr);
if (ret)
return ret;
@@ -246,54 +262,59 @@ static int klp_resolve_symbols(Elf_Shdr *relasec, struct module *pmod)
return 0;
}
-static int klp_write_object_relocations(struct module *pmod,
- struct klp_object *obj)
+/*
+ * At a high-level, there are two types of klp relocation sections: those which
+ * reference symbols which live in vmlinux; and those which reference symbols
+ * which live in other modules. This function is called for both types:
+ *
+ * 1) When a klp module itself loads, the module code calls this function to
+ * write vmlinux-specific klp relocations (.klp.rela.vmlinux.* sections).
+ * These relocations are written to the klp module text to allow the patched
+ * code/data to reference unexported vmlinux symbols. They're written as
+ * early as possible to ensure that other module init code (.e.g.,
+ * jump_label_apply_nops) can access any unexported vmlinux symbols which
+ * might be referenced by the klp module's special sections.
+ *
+ * 2) When a to-be-patched module loads -- or is already loaded when a
+ * corresponding klp module loads -- klp code calls this function to write
+ * module-specific klp relocations (.klp.rela.{module}.* sections). These
+ * are written to the klp module text to allow the patched code/data to
+ * reference symbols which live in the to-be-patched module or one of its
+ * module dependencies. Exported symbols are supported, in addition to
+ * unexported symbols, in order to enable late module patching, which allows
+ * the to-be-patched module to be loaded and patched sometime *after* the
+ * klp module is loaded.
+ */
+int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs,
+ const char *shstrtab, const char *strtab,
+ unsigned int symndx, unsigned int secndx,
+ const char *objname)
{
- int i, cnt, ret = 0;
- const char *objname, *secname;
+ int cnt, ret;
char sec_objname[MODULE_NAME_LEN];
- Elf_Shdr *sec;
+ Elf_Shdr *sec = sechdrs + secndx;
- if (WARN_ON(!klp_is_object_loaded(obj)))
+ /*
+ * Format: .klp.rela.sec_objname.section_name
+ * See comment in klp_resolve_symbols() for an explanation
+ * of the selected field width value.
+ */
+ cnt = sscanf(shstrtab + sec->sh_name, ".klp.rela.%55[^.]",
+ sec_objname);
+ if (cnt != 1) {
+ pr_err("section %s has an incorrectly formatted name\n",
+ shstrtab + sec->sh_name);
return -EINVAL;
+ }
- objname = klp_is_module(obj) ? obj->name : "vmlinux";
-
- /* For each klp relocation section */
- for (i = 1; i < pmod->klp_info->hdr.e_shnum; i++) {
- sec = pmod->klp_info->sechdrs + i;
- secname = pmod->klp_info->secstrings + sec->sh_name;
- if (!(sec->sh_flags & SHF_RELA_LIVEPATCH))
- continue;
-
- /*
- * Format: .klp.rela.sec_objname.section_name
- * See comment in klp_resolve_symbols() for an explanation
- * of the selected field width value.
- */
- cnt = sscanf(secname, ".klp.rela.%55[^.]", sec_objname);
- if (cnt != 1) {
- pr_err("section %s has an incorrectly formatted name\n",
- secname);
- ret = -EINVAL;
- break;
- }
-
- if (strcmp(objname, sec_objname))
- continue;
-
- ret = klp_resolve_symbols(sec, pmod);
- if (ret)
- break;
+ if (strcmp(objname ? objname : "vmlinux", sec_objname))
+ return 0;
- ret = apply_relocate_add(pmod->klp_info->sechdrs,
- pmod->core_kallsyms.strtab,
- pmod->klp_info->symndx, i, pmod);
- if (ret)
- break;
- }
+ ret = klp_resolve_symbols(sechdrs, strtab, symndx, sec, sec_objname);
+ if (ret)
+ return ret;
- return ret;
+ return apply_relocate_add(sechdrs, strtab, symndx, secndx, pmod);
}
/*
@@ -724,10 +745,27 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func)
func->old_sympos ? func->old_sympos : 1);
}
-/* Arches may override this to finish any remaining arch-specific tasks */
-void __weak arch_klp_init_object_loaded(struct klp_patch *patch,
- struct klp_object *obj)
+static int klp_apply_object_relocs(struct klp_patch *patch,
+ struct klp_object *obj)
{
+ int i, ret;
+ struct klp_modinfo *info = patch->mod->klp_info;
+
+ for (i = 1; i < info->hdr.e_shnum; i++) {
+ Elf_Shdr *sec = info->sechdrs + i;
+
+ if (!(sec->sh_flags & SHF_RELA_LIVEPATCH))
+ continue;
+
+ ret = klp_apply_section_relocs(patch->mod, info->sechdrs,
+ info->secstrings,
+ patch->mod->core_kallsyms.strtab,
+ info->symndx, i, obj->name);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
}
/* parts of the initialization that is done only when the object is loaded */
@@ -737,21 +775,18 @@ static int klp_init_object_loaded(struct klp_patch *patch,
struct klp_func *func;
int ret;
- mutex_lock(&text_mutex);
-
- module_disable_ro(patch->mod);
- ret = klp_write_object_relocations(patch->mod, obj);
- if (ret) {
- module_enable_ro(patch->mod, true);
- mutex_unlock(&text_mutex);
- return ret;
+ if (klp_is_module(obj)) {
+ /*
+ * Only write module-specific relocations here
+ * (.klp.rela.{module}.*). vmlinux-specific relocations were
+ * written earlier during the initialization of the klp module
+ * itself.
+ */
+ ret = klp_apply_object_relocs(patch, obj);
+ if (ret)
+ return ret;
}
- arch_klp_init_object_loaded(patch, obj);
- module_enable_ro(patch->mod, true);
-
- mutex_unlock(&text_mutex);
-
klp_for_each_func(obj, func) {
ret = klp_find_object_symbol(obj->name, func->old_name,
func->old_sympos,
@@ -1139,6 +1174,11 @@ int klp_module_coming(struct module *mod)
if (WARN_ON(mod->state != MODULE_STATE_COMING))
return -EINVAL;
+ if (!strcmp(mod->name, "vmlinux")) {
+ pr_err("vmlinux.ko: invalid module name");
+ return -EINVAL;
+ }
+
mutex_lock(&klp_mutex);
/*
* Each module has to know that klp_module_coming()
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 45452facff3b..6d11cfb9b41f 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -5,6 +5,9 @@ KCOV_INSTRUMENT := n
obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
+# Avoid recursion lockdep -> KCSAN -> ... -> lockdep.
+KCSAN_SANITIZE_lockdep.o := n
+
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_lockdep_proc.o = $(CC_FLAGS_FTRACE)
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index dd3cc0854c32..29a8de4c50b9 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -393,7 +393,7 @@ void lockdep_init_task(struct task_struct *task)
task->lockdep_recursion = 0;
}
-static inline void lockdep_recursion_finish(void)
+static __always_inline void lockdep_recursion_finish(void)
{
if (WARN_ON_ONCE(--current->lockdep_recursion))
current->lockdep_recursion = 0;
@@ -801,7 +801,7 @@ static int count_matching_names(struct lock_class *new_class)
}
/* used from NMI context -- must be lockless */
-static inline struct lock_class *
+static __always_inline struct lock_class *
look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass)
{
struct lockdep_subclass_key *key;
@@ -3616,13 +3616,10 @@ mark_held_locks(struct task_struct *curr, enum lock_usage_bit base_bit)
/*
* Hardirqs will be enabled:
*/
-static void __trace_hardirqs_on_caller(unsigned long ip)
+static void __trace_hardirqs_on_caller(void)
{
struct task_struct *curr = current;
- /* we'll do an OFF -> ON transition: */
- curr->hardirqs_enabled = 1;
-
/*
* We are going to turn hardirqs on, so set the
* usage bit for all held locks:
@@ -3635,15 +3632,19 @@ static void __trace_hardirqs_on_caller(unsigned long ip)
* this bit from being set before)
*/
if (curr->softirqs_enabled)
- if (!mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ))
- return;
-
- curr->hardirq_enable_ip = ip;
- curr->hardirq_enable_event = ++curr->irq_events;
- debug_atomic_inc(hardirqs_on_events);
+ mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ);
}
-void lockdep_hardirqs_on(unsigned long ip)
+/**
+ * lockdep_hardirqs_on_prepare - Prepare for enabling interrupts
+ * @ip: Caller address
+ *
+ * Invoked before a possible transition to RCU idle from exit to user or
+ * guest mode. This ensures that all RCU operations are done before RCU
+ * stops watching. After the RCU transition lockdep_hardirqs_on() has to be
+ * invoked to set the final state.
+ */
+void lockdep_hardirqs_on_prepare(unsigned long ip)
{
if (unlikely(!debug_locks || current->lockdep_recursion))
return;
@@ -3679,20 +3680,62 @@ void lockdep_hardirqs_on(unsigned long ip)
if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
return;
+ current->hardirq_chain_key = current->curr_chain_key;
+
current->lockdep_recursion++;
- __trace_hardirqs_on_caller(ip);
+ __trace_hardirqs_on_caller();
lockdep_recursion_finish();
}
-NOKPROBE_SYMBOL(lockdep_hardirqs_on);
+EXPORT_SYMBOL_GPL(lockdep_hardirqs_on_prepare);
+
+void noinstr lockdep_hardirqs_on(unsigned long ip)
+{
+ struct task_struct *curr = current;
+
+ if (unlikely(!debug_locks || curr->lockdep_recursion))
+ return;
+
+ if (curr->hardirqs_enabled) {
+ /*
+ * Neither irq nor preemption are disabled here
+ * so this is racy by nature but losing one hit
+ * in a stat is not a big deal.
+ */
+ __debug_atomic_inc(redundant_hardirqs_on);
+ return;
+ }
+
+ /*
+ * We're enabling irqs and according to our state above irqs weren't
+ * already enabled, yet we find the hardware thinks they are in fact
+ * enabled.. someone messed up their IRQ state tracing.
+ */
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return;
+
+ /*
+ * Ensure the lock stack remained unchanged between
+ * lockdep_hardirqs_on_prepare() and lockdep_hardirqs_on().
+ */
+ DEBUG_LOCKS_WARN_ON(current->hardirq_chain_key !=
+ current->curr_chain_key);
+
+ /* we'll do an OFF -> ON transition: */
+ curr->hardirqs_enabled = 1;
+ curr->hardirq_enable_ip = ip;
+ curr->hardirq_enable_event = ++curr->irq_events;
+ debug_atomic_inc(hardirqs_on_events);
+}
+EXPORT_SYMBOL_GPL(lockdep_hardirqs_on);
/*
* Hardirqs were disabled:
*/
-void lockdep_hardirqs_off(unsigned long ip)
+void noinstr lockdep_hardirqs_off(unsigned long ip)
{
struct task_struct *curr = current;
- if (unlikely(!debug_locks || current->lockdep_recursion))
+ if (unlikely(!debug_locks || curr->lockdep_recursion))
return;
/*
@@ -3710,10 +3753,11 @@ void lockdep_hardirqs_off(unsigned long ip)
curr->hardirq_disable_ip = ip;
curr->hardirq_disable_event = ++curr->irq_events;
debug_atomic_inc(hardirqs_off_events);
- } else
+ } else {
debug_atomic_inc(redundant_hardirqs_off);
+ }
}
-NOKPROBE_SYMBOL(lockdep_hardirqs_off);
+EXPORT_SYMBOL_GPL(lockdep_hardirqs_off);
/*
* Softirqs will be enabled:
@@ -4380,7 +4424,7 @@ static void print_unlock_imbalance_bug(struct task_struct *curr,
curr->comm, task_pid_nr(curr));
print_lockdep_cache(lock);
pr_cont(") at:\n");
- print_ip_sym(ip);
+ print_ip_sym(KERN_WARNING, ip);
pr_warn("but there are no more locks to release!\n");
pr_warn("\nother info that might help us debug this:\n");
lockdep_print_held_locks(curr);
@@ -4389,8 +4433,8 @@ static void print_unlock_imbalance_bug(struct task_struct *curr,
dump_stack();
}
-static int match_held_lock(const struct held_lock *hlock,
- const struct lockdep_map *lock)
+static noinstr int match_held_lock(const struct held_lock *hlock,
+ const struct lockdep_map *lock)
{
if (hlock->instance == lock)
return 1;
@@ -4677,7 +4721,7 @@ __lock_release(struct lockdep_map *lock, unsigned long ip)
return 0;
}
-static nokprobe_inline
+static __always_inline
int __lock_is_held(const struct lockdep_map *lock, int read)
{
struct task_struct *curr = current;
@@ -4937,7 +4981,7 @@ void lock_release(struct lockdep_map *lock, unsigned long ip)
}
EXPORT_SYMBOL_GPL(lock_release);
-int lock_is_held_type(const struct lockdep_map *lock, int read)
+noinstr int lock_is_held_type(const struct lockdep_map *lock, int read)
{
unsigned long flags;
int ret = 0;
@@ -5031,7 +5075,7 @@ static void print_lock_contention_bug(struct task_struct *curr,
curr->comm, task_pid_nr(curr));
print_lockdep_cache(lock);
pr_cont(") at:\n");
- print_ip_sym(ip);
+ print_ip_sym(KERN_WARNING, ip);
pr_warn("but there are no locks held!\n");
pr_warn("\nother info that might help us debug this:\n");
lockdep_print_held_locks(curr);
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index fd4fe1f5b458..36e69100e8e0 100644
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -125,7 +125,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
printk("\n%s/%d's [blocked] stackdump:\n\n",
task->comm, task_pid_nr(task));
- show_stack(task, NULL);
+ show_stack(task, NULL, KERN_DEFAULT);
printk("\n%s/%d's [current] stackdump:\n\n",
current->comm, task_pid_nr(current));
dump_stack();
diff --git a/kernel/module.c b/kernel/module.c
index 72ed2b3a6ee2..bee1c25ca5c5 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -4,6 +4,9 @@
Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
*/
+
+#define INCLUDE_VERMAGIC
+
#include <linux/export.h>
#include <linux/extable.h>
#include <linux/moduleloader.h>
@@ -1943,7 +1946,6 @@ static void mod_sysfs_teardown(struct module *mod)
mod_sysfs_fini(mod);
}
-#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX
/*
* LKM RO/NX protection: protect module's text/ro-data
* from modification and any data from execution.
@@ -1957,6 +1959,14 @@ static void mod_sysfs_teardown(struct module *mod)
*
* These values are always page-aligned (as is base)
*/
+
+/*
+ * Since some arches are moving towards PAGE_KERNEL module allocations instead
+ * of PAGE_KERNEL_EXEC, keep frob_text() and module_enable_x() outside of the
+ * CONFIG_STRICT_MODULE_RWX block below because they are needed regardless of
+ * whether we are strict.
+ */
+#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX
static void frob_text(const struct module_layout *layout,
int (*set_memory)(unsigned long start, int num_pages))
{
@@ -1966,6 +1976,15 @@ static void frob_text(const struct module_layout *layout,
layout->text_size >> PAGE_SHIFT);
}
+static void module_enable_x(const struct module *mod)
+{
+ frob_text(&mod->core_layout, set_memory_x);
+ frob_text(&mod->init_layout, set_memory_x);
+}
+#else /* !CONFIG_ARCH_HAS_STRICT_MODULE_RWX */
+static void module_enable_x(const struct module *mod) { }
+#endif /* CONFIG_ARCH_HAS_STRICT_MODULE_RWX */
+
#ifdef CONFIG_STRICT_MODULE_RWX
static void frob_rodata(const struct module_layout *layout,
int (*set_memory)(unsigned long start, int num_pages))
@@ -1997,20 +2016,7 @@ static void frob_writable_data(const struct module_layout *layout,
(layout->size - layout->ro_after_init_size) >> PAGE_SHIFT);
}
-/* livepatching wants to disable read-only so it can frob module. */
-void module_disable_ro(const struct module *mod)
-{
- if (!rodata_enabled)
- return;
-
- frob_text(&mod->core_layout, set_memory_rw);
- frob_rodata(&mod->core_layout, set_memory_rw);
- frob_ro_after_init(&mod->core_layout, set_memory_rw);
- frob_text(&mod->init_layout, set_memory_rw);
- frob_rodata(&mod->init_layout, set_memory_rw);
-}
-
-void module_enable_ro(const struct module *mod, bool after_init)
+static void module_enable_ro(const struct module *mod, bool after_init)
{
if (!rodata_enabled)
return;
@@ -2036,19 +2042,29 @@ static void module_enable_nx(const struct module *mod)
frob_writable_data(&mod->init_layout, set_memory_nx);
}
+static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
+ char *secstrings, struct module *mod)
+{
+ const unsigned long shf_wx = SHF_WRITE|SHF_EXECINSTR;
+ int i;
+
+ for (i = 0; i < hdr->e_shnum; i++) {
+ if ((sechdrs[i].sh_flags & shf_wx) == shf_wx)
+ return -ENOEXEC;
+ }
+
+ return 0;
+}
+
#else /* !CONFIG_STRICT_MODULE_RWX */
static void module_enable_nx(const struct module *mod) { }
-#endif /* CONFIG_STRICT_MODULE_RWX */
-static void module_enable_x(const struct module *mod)
+static void module_enable_ro(const struct module *mod, bool after_init) {}
+static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
+ char *secstrings, struct module *mod)
{
- frob_text(&mod->core_layout, set_memory_x);
- frob_text(&mod->init_layout, set_memory_x);
+ return 0;
}
-#else /* !CONFIG_ARCH_HAS_STRICT_MODULE_RWX */
-static void module_enable_nx(const struct module *mod) { }
-static void module_enable_x(const struct module *mod) { }
-#endif /* CONFIG_ARCH_HAS_STRICT_MODULE_RWX */
-
+#endif /* CONFIG_STRICT_MODULE_RWX */
#ifdef CONFIG_LIVEPATCH
/*
@@ -2334,11 +2350,13 @@ static int apply_relocations(struct module *mod, const struct load_info *info)
if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC))
continue;
- /* Livepatch relocation sections are applied by livepatch */
if (info->sechdrs[i].sh_flags & SHF_RELA_LIVEPATCH)
- continue;
-
- if (info->sechdrs[i].sh_type == SHT_REL)
+ err = klp_apply_section_relocs(mod, info->sechdrs,
+ info->secstrings,
+ info->strtab,
+ info->index.sym, i,
+ NULL);
+ else if (info->sechdrs[i].sh_type == SHT_REL)
err = apply_relocate(info->sechdrs, info->strtab,
info->index.sym, i, mod);
else if (info->sechdrs[i].sh_type == SHT_RELA)
@@ -2400,7 +2418,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
if ((s->sh_flags & masks[m][0]) != masks[m][0]
|| (s->sh_flags & masks[m][1])
|| s->sh_entsize != ~0UL
- || strstarts(sname, ".init"))
+ || module_init_section(sname))
continue;
s->sh_entsize = get_offset(mod, &mod->core_layout.size, s, i);
pr_debug("\t%s\n", sname);
@@ -2433,7 +2451,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
if ((s->sh_flags & masks[m][0]) != masks[m][0]
|| (s->sh_flags & masks[m][1])
|| s->sh_entsize != ~0UL
- || !strstarts(sname, ".init"))
+ || !module_init_section(sname))
continue;
s->sh_entsize = (get_offset(mod, &mod->init_layout.size, s, i)
| INIT_OFFSET_MASK);
@@ -2765,7 +2783,14 @@ static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug)
void * __weak module_alloc(unsigned long size)
{
- return vmalloc_exec(size);
+ return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+ GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
+ NUMA_NO_NODE, __builtin_return_address(0));
+}
+
+bool __weak module_init_section(const char *name)
+{
+ return strstarts(name, ".init");
}
bool __weak module_exit_section(const char *name)
@@ -2946,8 +2971,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len,
return err;
/* Suck in entire file: we'll want most of it. */
- info->hdr = __vmalloc(info->len,
- GFP_KERNEL | __GFP_NOWARN, PAGE_KERNEL);
+ info->hdr = __vmalloc(info->len, GFP_KERNEL | __GFP_NOWARN);
if (!info->hdr)
return -ENOMEM;
@@ -3322,12 +3346,6 @@ static int check_module_license_and_versions(struct module *mod)
static void flush_module_icache(const struct module *mod)
{
- mm_segment_t old_fs;
-
- /* flush the icache in correct context */
- old_fs = get_fs();
- set_fs(KERNEL_DS);
-
/*
* Flush the instruction cache, since we've played with text.
* Do it before processing of module parameters, so the module
@@ -3339,8 +3357,6 @@ static void flush_module_icache(const struct module *mod)
+ mod->init_layout.size);
flush_icache_range((unsigned long)mod->core_layout.base,
(unsigned long)mod->core_layout.base + mod->core_layout.size);
-
- set_fs(old_fs);
}
int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
@@ -3388,6 +3404,11 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
if (err < 0)
return ERR_PTR(err);
+ err = module_enforce_rwx_sections(info->hdr, info->sechdrs,
+ info->secstrings, info->mod);
+ if (err < 0)
+ return ERR_PTR(err);
+
/* We will do a special allocation for per-cpu sections later. */
info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC;
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 5989bbb93039..84c987dfbe03 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -519,7 +519,6 @@ NOKPROBE_SYMBOL(notify_die);
int register_die_notifier(struct notifier_block *nb)
{
- vmalloc_sync_mappings();
return atomic_notifier_chain_register(&die_chain, nb);
}
EXPORT_SYMBOL_GPL(register_die_notifier);
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index ed9882108cd2..cd356630a311 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -19,6 +19,8 @@
#include <net/net_namespace.h>
#include <linux/ipc_namespace.h>
#include <linux/time_namespace.h>
+#include <linux/fs_struct.h>
+#include <linux/proc_fs.h>
#include <linux/proc_ns.h>
#include <linux/file.h>
#include <linux/syscalls.h>
@@ -257,37 +259,296 @@ void exit_task_namespaces(struct task_struct *p)
switch_task_namespaces(p, NULL);
}
-SYSCALL_DEFINE2(setns, int, fd, int, nstype)
+static int check_setns_flags(unsigned long flags)
{
- struct task_struct *tsk = current;
- struct nsproxy *new_nsproxy;
- struct file *file;
- struct ns_common *ns;
- int err;
+ if (!flags || (flags & ~(CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
+ CLONE_NEWNET | CLONE_NEWUSER | CLONE_NEWPID |
+ CLONE_NEWCGROUP)))
+ return -EINVAL;
- file = proc_ns_fget(fd);
- if (IS_ERR(file))
- return PTR_ERR(file);
+#ifndef CONFIG_USER_NS
+ if (flags & CLONE_NEWUSER)
+ return -EINVAL;
+#endif
+#ifndef CONFIG_PID_NS
+ if (flags & CLONE_NEWPID)
+ return -EINVAL;
+#endif
+#ifndef CONFIG_UTS_NS
+ if (flags & CLONE_NEWUTS)
+ return -EINVAL;
+#endif
+#ifndef CONFIG_IPC_NS
+ if (flags & CLONE_NEWIPC)
+ return -EINVAL;
+#endif
+#ifndef CONFIG_CGROUPS
+ if (flags & CLONE_NEWCGROUP)
+ return -EINVAL;
+#endif
+#ifndef CONFIG_NET_NS
+ if (flags & CLONE_NEWNET)
+ return -EINVAL;
+#endif
- err = -EINVAL;
- ns = get_proc_ns(file_inode(file));
- if (nstype && (ns->ops->type != nstype))
- goto out;
+ return 0;
+}
+
+static void put_nsset(struct nsset *nsset)
+{
+ unsigned flags = nsset->flags;
+
+ if (flags & CLONE_NEWUSER)
+ put_cred(nsset_cred(nsset));
+ /*
+ * We only created a temporary copy if we attached to more than just
+ * the mount namespace.
+ */
+ if (nsset->fs && (flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS))
+ free_fs_struct(nsset->fs);
+ if (nsset->nsproxy)
+ free_nsproxy(nsset->nsproxy);
+}
+
+static int prepare_nsset(unsigned flags, struct nsset *nsset)
+{
+ struct task_struct *me = current;
+
+ nsset->nsproxy = create_new_namespaces(0, me, current_user_ns(), me->fs);
+ if (IS_ERR(nsset->nsproxy))
+ return PTR_ERR(nsset->nsproxy);
- new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
- if (IS_ERR(new_nsproxy)) {
- err = PTR_ERR(new_nsproxy);
+ if (flags & CLONE_NEWUSER)
+ nsset->cred = prepare_creds();
+ else
+ nsset->cred = current_cred();
+ if (!nsset->cred)
goto out;
+
+ /* Only create a temporary copy of fs_struct if we really need to. */
+ if (flags == CLONE_NEWNS) {
+ nsset->fs = me->fs;
+ } else if (flags & CLONE_NEWNS) {
+ nsset->fs = copy_fs_struct(me->fs);
+ if (!nsset->fs)
+ goto out;
}
- err = ns->ops->install(new_nsproxy, ns);
- if (err) {
- free_nsproxy(new_nsproxy);
- goto out;
+ nsset->flags = flags;
+ return 0;
+
+out:
+ put_nsset(nsset);
+ return -ENOMEM;
+}
+
+static inline int validate_ns(struct nsset *nsset, struct ns_common *ns)
+{
+ return ns->ops->install(nsset, ns);
+}
+
+/*
+ * This is the inverse operation to unshare().
+ * Ordering is equivalent to the standard ordering used everywhere else
+ * during unshare and process creation. The switch to the new set of
+ * namespaces occurs at the point of no return after installation of
+ * all requested namespaces was successful in commit_nsset().
+ */
+static int validate_nsset(struct nsset *nsset, struct pid *pid)
+{
+ int ret = 0;
+ unsigned flags = nsset->flags;
+ struct user_namespace *user_ns = NULL;
+ struct pid_namespace *pid_ns = NULL;
+ struct nsproxy *nsp;
+ struct task_struct *tsk;
+
+ /* Take a "snapshot" of the target task's namespaces. */
+ rcu_read_lock();
+ tsk = pid_task(pid, PIDTYPE_PID);
+ if (!tsk) {
+ rcu_read_unlock();
+ return -ESRCH;
+ }
+
+ if (!ptrace_may_access(tsk, PTRACE_MODE_READ_REALCREDS)) {
+ rcu_read_unlock();
+ return -EPERM;
+ }
+
+ task_lock(tsk);
+ nsp = tsk->nsproxy;
+ if (nsp)
+ get_nsproxy(nsp);
+ task_unlock(tsk);
+ if (!nsp) {
+ rcu_read_unlock();
+ return -ESRCH;
+ }
+
+#ifdef CONFIG_PID_NS
+ if (flags & CLONE_NEWPID) {
+ pid_ns = task_active_pid_ns(tsk);
+ if (unlikely(!pid_ns)) {
+ rcu_read_unlock();
+ ret = -ESRCH;
+ goto out;
+ }
+ get_pid_ns(pid_ns);
+ }
+#endif
+
+#ifdef CONFIG_USER_NS
+ if (flags & CLONE_NEWUSER)
+ user_ns = get_user_ns(__task_cred(tsk)->user_ns);
+#endif
+ rcu_read_unlock();
+
+ /*
+ * Install requested namespaces. The caller will have
+ * verified earlier that the requested namespaces are
+ * supported on this kernel. We don't report errors here
+ * if a namespace is requested that isn't supported.
+ */
+#ifdef CONFIG_USER_NS
+ if (flags & CLONE_NEWUSER) {
+ ret = validate_ns(nsset, &user_ns->ns);
+ if (ret)
+ goto out;
+ }
+#endif
+
+ if (flags & CLONE_NEWNS) {
+ ret = validate_ns(nsset, from_mnt_ns(nsp->mnt_ns));
+ if (ret)
+ goto out;
+ }
+
+#ifdef CONFIG_UTS_NS
+ if (flags & CLONE_NEWUTS) {
+ ret = validate_ns(nsset, &nsp->uts_ns->ns);
+ if (ret)
+ goto out;
+ }
+#endif
+
+#ifdef CONFIG_IPC_NS
+ if (flags & CLONE_NEWIPC) {
+ ret = validate_ns(nsset, &nsp->ipc_ns->ns);
+ if (ret)
+ goto out;
+ }
+#endif
+
+#ifdef CONFIG_PID_NS
+ if (flags & CLONE_NEWPID) {
+ ret = validate_ns(nsset, &pid_ns->ns);
+ if (ret)
+ goto out;
+ }
+#endif
+
+#ifdef CONFIG_CGROUPS
+ if (flags & CLONE_NEWCGROUP) {
+ ret = validate_ns(nsset, &nsp->cgroup_ns->ns);
+ if (ret)
+ goto out;
+ }
+#endif
+
+#ifdef CONFIG_NET_NS
+ if (flags & CLONE_NEWNET) {
+ ret = validate_ns(nsset, &nsp->net_ns->ns);
+ if (ret)
+ goto out;
+ }
+#endif
+
+out:
+ if (pid_ns)
+ put_pid_ns(pid_ns);
+ if (nsp)
+ put_nsproxy(nsp);
+ put_user_ns(user_ns);
+
+ return ret;
+}
+
+/*
+ * This is the point of no return. There are just a few namespaces
+ * that do some actual work here and it's sufficiently minimal that
+ * a separate ns_common operation seems unnecessary for now.
+ * Unshare is doing the same thing. If we'll end up needing to do
+ * more in a given namespace or a helper here is ultimately not
+ * exported anymore a simple commit handler for each namespace
+ * should be added to ns_common.
+ */
+static void commit_nsset(struct nsset *nsset)
+{
+ unsigned flags = nsset->flags;
+ struct task_struct *me = current;
+
+#ifdef CONFIG_USER_NS
+ if (flags & CLONE_NEWUSER) {
+ /* transfer ownership */
+ commit_creds(nsset_cred(nsset));
+ nsset->cred = NULL;
+ }
+#endif
+
+ /* We only need to commit if we have used a temporary fs_struct. */
+ if ((flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS)) {
+ set_fs_root(me->fs, &nsset->fs->root);
+ set_fs_pwd(me->fs, &nsset->fs->pwd);
}
- switch_task_namespaces(tsk, new_nsproxy);
- perf_event_namespaces(tsk);
+#ifdef CONFIG_IPC_NS
+ if (flags & CLONE_NEWIPC)
+ exit_sem(me);
+#endif
+
+ /* transfer ownership */
+ switch_task_namespaces(me, nsset->nsproxy);
+ nsset->nsproxy = NULL;
+}
+
+SYSCALL_DEFINE2(setns, int, fd, int, flags)
+{
+ struct file *file;
+ struct ns_common *ns = NULL;
+ struct nsset nsset = {};
+ int err = 0;
+
+ file = fget(fd);
+ if (!file)
+ return -EBADF;
+
+ if (proc_ns_file(file)) {
+ ns = get_proc_ns(file_inode(file));
+ if (flags && (ns->ops->type != flags))
+ err = -EINVAL;
+ flags = ns->ops->type;
+ } else if (!IS_ERR(pidfd_pid(file))) {
+ err = check_setns_flags(flags);
+ } else {
+ err = -EINVAL;
+ }
+ if (err)
+ goto out;
+
+ err = prepare_nsset(flags, &nsset);
+ if (err)
+ goto out;
+
+ if (proc_ns_file(file))
+ err = validate_ns(&nsset, ns);
+ else
+ err = validate_nsset(&nsset, file->private_data);
+ if (!err) {
+ commit_nsset(&nsset);
+ perf_event_namespaces(current);
+ }
+ put_nsset(&nsset);
out:
fput(file);
return err;
diff --git a/kernel/padata.c b/kernel/padata.c
index aae789896616..4373f7adaa40 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -7,6 +7,9 @@
* Copyright (C) 2008, 2009 secunet Security Networks AG
* Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com>
*
+ * Copyright (c) 2020 Oracle and/or its affiliates.
+ * Author: Daniel Jordan <daniel.m.jordan@oracle.com>
+ *
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
@@ -21,6 +24,7 @@
* 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
*/
+#include <linux/completion.h>
#include <linux/export.h>
#include <linux/cpumask.h>
#include <linux/err.h>
@@ -31,11 +35,30 @@
#include <linux/slab.h>
#include <linux/sysfs.h>
#include <linux/rcupdate.h>
-#include <linux/module.h>
-#define MAX_OBJ_NUM 1000
+#define PADATA_WORK_ONSTACK 1 /* Work's memory is on stack */
+
+struct padata_work {
+ struct work_struct pw_work;
+ struct list_head pw_list; /* padata_free_works linkage */
+ void *pw_data;
+};
+
+static DEFINE_SPINLOCK(padata_works_lock);
+static struct padata_work *padata_works;
+static LIST_HEAD(padata_free_works);
+
+struct padata_mt_job_state {
+ spinlock_t lock;
+ struct completion completion;
+ struct padata_mt_job *job;
+ int nworks;
+ int nworks_fini;
+ unsigned long chunk_size;
+};
static void padata_free_pd(struct parallel_data *pd);
+static void __init padata_mt_helper(struct work_struct *work);
static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
{
@@ -59,30 +82,82 @@ static int padata_cpu_hash(struct parallel_data *pd, unsigned int seq_nr)
return padata_index_to_cpu(pd, cpu_index);
}
-static void padata_parallel_worker(struct work_struct *parallel_work)
+static struct padata_work *padata_work_alloc(void)
{
- struct padata_parallel_queue *pqueue;
- LIST_HEAD(local_list);
+ struct padata_work *pw;
- local_bh_disable();
- pqueue = container_of(parallel_work,
- struct padata_parallel_queue, work);
+ lockdep_assert_held(&padata_works_lock);
- spin_lock(&pqueue->parallel.lock);
- list_replace_init(&pqueue->parallel.list, &local_list);
- spin_unlock(&pqueue->parallel.lock);
+ if (list_empty(&padata_free_works))
+ return NULL; /* No more work items allowed to be queued. */
- while (!list_empty(&local_list)) {
- struct padata_priv *padata;
+ pw = list_first_entry(&padata_free_works, struct padata_work, pw_list);
+ list_del(&pw->pw_list);
+ return pw;
+}
- padata = list_entry(local_list.next,
- struct padata_priv, list);
+static void padata_work_init(struct padata_work *pw, work_func_t work_fn,
+ void *data, int flags)
+{
+ if (flags & PADATA_WORK_ONSTACK)
+ INIT_WORK_ONSTACK(&pw->pw_work, work_fn);
+ else
+ INIT_WORK(&pw->pw_work, work_fn);
+ pw->pw_data = data;
+}
- list_del_init(&padata->list);
+static int __init padata_work_alloc_mt(int nworks, void *data,
+ struct list_head *head)
+{
+ int i;
- padata->parallel(padata);
+ spin_lock(&padata_works_lock);
+ /* Start at 1 because the current task participates in the job. */
+ for (i = 1; i < nworks; ++i) {
+ struct padata_work *pw = padata_work_alloc();
+
+ if (!pw)
+ break;
+ padata_work_init(pw, padata_mt_helper, data, 0);
+ list_add(&pw->pw_list, head);
}
+ spin_unlock(&padata_works_lock);
+
+ return i;
+}
+static void padata_work_free(struct padata_work *pw)
+{
+ lockdep_assert_held(&padata_works_lock);
+ list_add(&pw->pw_list, &padata_free_works);
+}
+
+static void __init padata_works_free(struct list_head *works)
+{
+ struct padata_work *cur, *next;
+
+ if (list_empty(works))
+ return;
+
+ spin_lock(&padata_works_lock);
+ list_for_each_entry_safe(cur, next, works, pw_list) {
+ list_del(&cur->pw_list);
+ padata_work_free(cur);
+ }
+ spin_unlock(&padata_works_lock);
+}
+
+static void padata_parallel_worker(struct work_struct *parallel_work)
+{
+ struct padata_work *pw = container_of(parallel_work, struct padata_work,
+ pw_work);
+ struct padata_priv *padata = pw->pw_data;
+
+ local_bh_disable();
+ padata->parallel(padata);
+ spin_lock(&padata_works_lock);
+ padata_work_free(pw);
+ spin_unlock(&padata_works_lock);
local_bh_enable();
}
@@ -106,9 +181,9 @@ int padata_do_parallel(struct padata_shell *ps,
struct padata_priv *padata, int *cb_cpu)
{
struct padata_instance *pinst = ps->pinst;
- int i, cpu, cpu_index, target_cpu, err;
- struct padata_parallel_queue *queue;
+ int i, cpu, cpu_index, err;
struct parallel_data *pd;
+ struct padata_work *pw;
rcu_read_lock_bh();
@@ -136,25 +211,25 @@ int padata_do_parallel(struct padata_shell *ps,
if ((pinst->flags & PADATA_RESET))
goto out;
- if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM)
- goto out;
-
- err = 0;
atomic_inc(&pd->refcnt);
padata->pd = pd;
padata->cb_cpu = *cb_cpu;
- padata->seq_nr = atomic_inc_return(&pd->seq_nr);
- target_cpu = padata_cpu_hash(pd, padata->seq_nr);
- padata->cpu = target_cpu;
- queue = per_cpu_ptr(pd->pqueue, target_cpu);
-
- spin_lock(&queue->parallel.lock);
- list_add_tail(&padata->list, &queue->parallel.list);
- spin_unlock(&queue->parallel.lock);
+ rcu_read_unlock_bh();
- queue_work(pinst->parallel_wq, &queue->work);
+ spin_lock(&padata_works_lock);
+ padata->seq_nr = ++pd->seq_nr;
+ pw = padata_work_alloc();
+ spin_unlock(&padata_works_lock);
+ if (pw) {
+ padata_work_init(pw, padata_parallel_worker, padata, 0);
+ queue_work(pinst->parallel_wq, &pw->pw_work);
+ } else {
+ /* Maximum works limit exceeded, run in the current task. */
+ padata->parallel(padata);
+ }
+ return 0;
out:
rcu_read_unlock_bh();
@@ -260,7 +335,7 @@ static void padata_reorder(struct parallel_data *pd)
*
* Ensure reorder queue is read after pd->lock is dropped so we see
* new objects from another task in padata_do_serial. Pairs with
- * smp_mb__after_atomic in padata_do_serial.
+ * smp_mb in padata_do_serial.
*/
smp_mb();
@@ -325,8 +400,9 @@ static void padata_serial_worker(struct work_struct *serial_work)
void padata_do_serial(struct padata_priv *padata)
{
struct parallel_data *pd = padata->pd;
+ int hashed_cpu = padata_cpu_hash(pd, padata->seq_nr);
struct padata_parallel_queue *pqueue = per_cpu_ptr(pd->pqueue,
- padata->cpu);
+ hashed_cpu);
struct padata_priv *cur;
spin_lock(&pqueue->reorder.lock);
@@ -342,7 +418,7 @@ void padata_do_serial(struct padata_priv *padata)
* with the trylock of pd->lock in padata_reorder. Pairs with smp_mb
* in padata_reorder.
*/
- smp_mb__after_atomic();
+ smp_mb();
padata_reorder(pd);
}
@@ -387,6 +463,98 @@ out:
return err;
}
+static void __init padata_mt_helper(struct work_struct *w)
+{
+ struct padata_work *pw = container_of(w, struct padata_work, pw_work);
+ struct padata_mt_job_state *ps = pw->pw_data;
+ struct padata_mt_job *job = ps->job;
+ bool done;
+
+ spin_lock(&ps->lock);
+
+ while (job->size > 0) {
+ unsigned long start, size, end;
+
+ start = job->start;
+ /* So end is chunk size aligned if enough work remains. */
+ size = roundup(start + 1, ps->chunk_size) - start;
+ size = min(size, job->size);
+ end = start + size;
+
+ job->start = end;
+ job->size -= size;
+
+ spin_unlock(&ps->lock);
+ job->thread_fn(start, end, job->fn_arg);
+ spin_lock(&ps->lock);
+ }
+
+ ++ps->nworks_fini;
+ done = (ps->nworks_fini == ps->nworks);
+ spin_unlock(&ps->lock);
+
+ if (done)
+ complete(&ps->completion);
+}
+
+/**
+ * padata_do_multithreaded - run a multithreaded job
+ * @job: Description of the job.
+ *
+ * See the definition of struct padata_mt_job for more details.
+ */
+void __init padata_do_multithreaded(struct padata_mt_job *job)
+{
+ /* In case threads finish at different times. */
+ static const unsigned long load_balance_factor = 4;
+ struct padata_work my_work, *pw;
+ struct padata_mt_job_state ps;
+ LIST_HEAD(works);
+ int nworks;
+
+ if (job->size == 0)
+ return;
+
+ /* Ensure at least one thread when size < min_chunk. */
+ nworks = max(job->size / job->min_chunk, 1ul);
+ nworks = min(nworks, job->max_threads);
+
+ if (nworks == 1) {
+ /* Single thread, no coordination needed, cut to the chase. */
+ job->thread_fn(job->start, job->start + job->size, job->fn_arg);
+ return;
+ }
+
+ spin_lock_init(&ps.lock);
+ init_completion(&ps.completion);
+ ps.job = job;
+ ps.nworks = padata_work_alloc_mt(nworks, &ps, &works);
+ ps.nworks_fini = 0;
+
+ /*
+ * Chunk size is the amount of work a helper does per call to the
+ * thread function. Load balance large jobs between threads by
+ * increasing the number of chunks, guarantee at least the minimum
+ * chunk size from the caller, and honor the caller's alignment.
+ */
+ ps.chunk_size = job->size / (ps.nworks * load_balance_factor);
+ ps.chunk_size = max(ps.chunk_size, job->min_chunk);
+ ps.chunk_size = roundup(ps.chunk_size, job->align);
+
+ list_for_each_entry(pw, &works, pw_list)
+ queue_work(system_unbound_wq, &pw->pw_work);
+
+ /* Use the current thread, which saves starting a workqueue worker. */
+ padata_work_init(&my_work, padata_mt_helper, &ps, PADATA_WORK_ONSTACK);
+ padata_mt_helper(&my_work.pw_work);
+
+ /* Wait for all the helpers to finish. */
+ wait_for_completion(&ps.completion);
+
+ destroy_work_on_stack(&my_work.pw_work);
+ padata_works_free(&works);
+}
+
static void __padata_list_init(struct padata_list *pd_list)
{
INIT_LIST_HEAD(&pd_list->list);
@@ -417,8 +585,6 @@ static void padata_init_pqueues(struct parallel_data *pd)
pqueue = per_cpu_ptr(pd->pqueue, cpu);
__padata_list_init(&pqueue->reorder);
- __padata_list_init(&pqueue->parallel);
- INIT_WORK(&pqueue->work, padata_parallel_worker);
atomic_set(&pqueue->num_obj, 0);
}
}
@@ -452,7 +618,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps)
padata_init_pqueues(pd);
padata_init_squeues(pd);
- atomic_set(&pd->seq_nr, -1);
+ pd->seq_nr = -1;
atomic_set(&pd->refcnt, 1);
spin_lock_init(&pd->lock);
pd->cpu = cpumask_first(pd->cpumask.pcpu);
@@ -1052,32 +1218,41 @@ void padata_free_shell(struct padata_shell *ps)
}
EXPORT_SYMBOL(padata_free_shell);
-#ifdef CONFIG_HOTPLUG_CPU
-
-static __init int padata_driver_init(void)
+void __init padata_init(void)
{
+ unsigned int i, possible_cpus;
+#ifdef CONFIG_HOTPLUG_CPU
int ret;
ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "padata:online",
padata_cpu_online, NULL);
if (ret < 0)
- return ret;
+ goto err;
hp_online = ret;
ret = cpuhp_setup_state_multi(CPUHP_PADATA_DEAD, "padata:dead",
NULL, padata_cpu_dead);
- if (ret < 0) {
- cpuhp_remove_multi_state(hp_online);
- return ret;
- }
- return 0;
-}
-module_init(padata_driver_init);
+ if (ret < 0)
+ goto remove_online_state;
+#endif
-static __exit void padata_driver_exit(void)
-{
+ possible_cpus = num_possible_cpus();
+ padata_works = kmalloc_array(possible_cpus, sizeof(struct padata_work),
+ GFP_KERNEL);
+ if (!padata_works)
+ goto remove_dead_state;
+
+ for (i = 0; i < possible_cpus; ++i)
+ list_add(&padata_works[i].pw_list, &padata_free_works);
+
+ return;
+
+remove_dead_state:
+#ifdef CONFIG_HOTPLUG_CPU
cpuhp_remove_multi_state(CPUHP_PADATA_DEAD);
+remove_online_state:
cpuhp_remove_multi_state(hp_online);
-}
-module_exit(padata_driver_exit);
+err:
#endif
+ pr_warn("padata: initialization failed\n");
+}
diff --git a/kernel/panic.c b/kernel/panic.c
index b69ee9e76cb2..e2157ca387c8 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -36,6 +36,14 @@
#define PANIC_TIMER_STEP 100
#define PANIC_BLINK_SPD 18
+#ifdef CONFIG_SMP
+/*
+ * Should we dump all CPUs backtraces in an oops event?
+ * Defaults to 0, can be changed via sysctl.
+ */
+unsigned int __read_mostly sysctl_oops_all_cpu_backtrace;
+#endif /* CONFIG_SMP */
+
int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE;
static unsigned long tainted_mask =
IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0;
@@ -44,6 +52,8 @@ static int pause_on_oops_flag;
static DEFINE_SPINLOCK(pause_on_oops_lock);
bool crash_kexec_post_notifiers;
int panic_on_warn __read_mostly;
+unsigned long panic_on_taint;
+bool panic_on_taint_nousertaint = false;
int panic_timeout = CONFIG_PANIC_TIMEOUT;
EXPORT_SYMBOL_GPL(panic_timeout);
@@ -434,6 +444,11 @@ void add_taint(unsigned flag, enum lockdep_ok lockdep_ok)
pr_warn("Disabling lock debugging due to kernel taint\n");
set_bit(flag, &tainted_mask);
+
+ if (tainted_mask & panic_on_taint) {
+ panic_on_taint = 0;
+ panic("panic_on_taint set ...");
+ }
}
EXPORT_SYMBOL(add_taint);
@@ -515,6 +530,9 @@ void oops_enter(void)
/* can't trust the integrity of the kernel anymore: */
debug_locks_off();
do_oops_enter_exit();
+
+ if (sysctl_oops_all_cpu_backtrace)
+ trigger_all_cpu_backtrace();
}
/*
@@ -662,10 +680,12 @@ device_initcall(register_warn_debugfs);
* Called when gcc's -fstack-protector feature is used, and
* gcc detects corruption of the on-stack canary value
*/
-__visible void __stack_chk_fail(void)
+__visible noinstr void __stack_chk_fail(void)
{
+ instrumentation_begin();
panic("stack-protector: Kernel stack is corrupted in: %pB",
__builtin_return_address(0));
+ instrumentation_end();
}
EXPORT_SYMBOL(__stack_chk_fail);
@@ -686,3 +706,30 @@ static int __init oops_setup(char *s)
return 0;
}
early_param("oops", oops_setup);
+
+static int __init panic_on_taint_setup(char *s)
+{
+ char *taint_str;
+
+ if (!s)
+ return -EINVAL;
+
+ taint_str = strsep(&s, ",");
+ if (kstrtoul(taint_str, 16, &panic_on_taint))
+ return -EINVAL;
+
+ /* make sure panic_on_taint doesn't hold out-of-range TAINT flags */
+ panic_on_taint &= TAINT_FLAGS_MAX;
+
+ if (!panic_on_taint)
+ return -EINVAL;
+
+ if (s && !strcmp(s, "nousertaint"))
+ panic_on_taint_nousertaint = true;
+
+ pr_info("panic_on_taint: bitmask=0x%lx nousertaint_mode=%sabled\n",
+ panic_on_taint, panic_on_taint_nousertaint ? "en" : "dis");
+
+ return 0;
+}
+early_param("panic_on_taint", panic_on_taint_setup);
diff --git a/kernel/pid.c b/kernel/pid.c
index c835b844aca7..f1496b757162 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -363,6 +363,25 @@ void change_pid(struct task_struct *task, enum pid_type type,
attach_pid(task, type);
}
+void exchange_tids(struct task_struct *left, struct task_struct *right)
+{
+ struct pid *pid1 = left->thread_pid;
+ struct pid *pid2 = right->thread_pid;
+ struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID];
+ struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID];
+
+ /* Swap the single entry tid lists */
+ hlists_swap_heads_rcu(head1, head2);
+
+ /* Swap the per task_struct pid */
+ rcu_assign_pointer(left->thread_pid, pid2);
+ rcu_assign_pointer(right->thread_pid, pid1);
+
+ /* Swap the cached value */
+ WRITE_ONCE(left->pid, pid_nr(pid2));
+ WRITE_ONCE(right->pid, pid_nr(pid1));
+}
+
/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
void transfer_pid(struct task_struct *old, struct task_struct *new,
enum pid_type type)
@@ -476,8 +495,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
rcu_read_lock();
if (!ns)
ns = task_active_pid_ns(current);
- if (likely(pid_alive(task)))
- nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns);
+ nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns);
rcu_read_unlock();
return nr;
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 01f8ba32cc0c..0e5ac162c3a8 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -263,7 +263,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
#ifdef CONFIG_CHECKPOINT_RESTORE
static int pid_ns_ctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct pid_namespace *pid_ns = task_active_pid_ns(current);
struct ctl_table tmp = *table;
@@ -378,13 +378,14 @@ static void pidns_put(struct ns_common *ns)
put_pid_ns(to_pid_ns(ns));
}
-static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+static int pidns_install(struct nsset *nsset, struct ns_common *ns)
{
+ struct nsproxy *nsproxy = nsset->nsproxy;
struct pid_namespace *active = task_active_pid_ns(current);
struct pid_namespace *ancestor, *new = to_pid_ns(ns);
if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
- !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
return -EPERM;
/*
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index c208566c844b..a7320f07689d 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -3,7 +3,7 @@ config SUSPEND
bool "Suspend to RAM and standby"
depends on ARCH_SUSPEND_POSSIBLE
default y
- ---help---
+ help
Allow the system to enter sleep states in which main memory is
powered and thus its contents are preserved, such as the
suspend-to-RAM state (e.g. the ACPI S3 state).
@@ -42,7 +42,7 @@ config HIBERNATION
select LZO_COMPRESS
select LZO_DECOMPRESS
select CRC32
- ---help---
+ help
Enable the suspend to disk (STD) functionality, which is usually
called "hibernation" in user interfaces. STD checkpoints the
system and powers it off; and restores that checkpoint on reboot.
@@ -80,11 +80,23 @@ config HIBERNATION
For more information take a look at <file:Documentation/power/swsusp.rst>.
+config HIBERNATION_SNAPSHOT_DEV
+ bool "Userspace snapshot device"
+ depends on HIBERNATION
+ default y
+ help
+ Device used by the uswsusp tools.
+
+ Say N if no snapshotting from userspace is needed, this also
+ reduces the attack surface of the kernel.
+
+ If in doubt, say Y.
+
config PM_STD_PARTITION
string "Default resume partition"
depends on HIBERNATION
default ""
- ---help---
+ help
The default resume partition is the partition that the suspend-
to-disk implementation will look for a suspended disk image.
@@ -119,7 +131,7 @@ config PM_SLEEP_SMP_NONZERO_CPU
def_bool y
depends on PM_SLEEP_SMP
depends on ARCH_SUSPEND_NONZERO_CPU
- ---help---
+ help
If an arch can suspend (for suspend, hibernate, kexec, etc) on a
non-zero numbered CPU, it may define ARCH_SUSPEND_NONZERO_CPU. This
will allow nohz_full mask to include CPU0.
@@ -128,7 +140,7 @@ config PM_AUTOSLEEP
bool "Opportunistic sleep"
depends on PM_SLEEP
default n
- ---help---
+ help
Allow the kernel to trigger a system transition into a global sleep
state automatically whenever there are no active wakeup sources.
@@ -136,7 +148,7 @@ config PM_WAKELOCKS
bool "User space wakeup sources interface"
depends on PM_SLEEP
default n
- ---help---
+ help
Allow user space to create, activate and deactivate wakeup source
objects with the help of a sysfs-based interface.
@@ -153,7 +165,7 @@ config PM_WAKELOCKS_GC
config PM
bool "Device power management core functionality"
- ---help---
+ help
Enable functionality allowing I/O devices to be put into energy-saving
(low power) states, for example after a specified period of inactivity
(autosuspended), and woken up in response to a hardware-generated
@@ -167,7 +179,7 @@ config PM
config PM_DEBUG
bool "Power Management Debug Support"
depends on PM
- ---help---
+ help
This option enables various debugging support in the Power Management
code. This is helpful when debugging and reporting PM bugs, like
suspend support.
@@ -175,7 +187,7 @@ config PM_DEBUG
config PM_ADVANCED_DEBUG
bool "Extra PM attributes in sysfs for low-level debugging/testing"
depends on PM_DEBUG
- ---help---
+ help
Add extra sysfs attributes allowing one to access some Power Management
fields of device objects from user space. If you are not a kernel
developer interested in debugging/testing Power Management, say "no".
@@ -183,7 +195,7 @@ config PM_ADVANCED_DEBUG
config PM_TEST_SUSPEND
bool "Test suspend/resume and wakealarm during bootup"
depends on SUSPEND && PM_DEBUG && RTC_CLASS=y
- ---help---
+ help
This option will let you suspend your machine during bootup, and
make it wake up a few seconds later using an RTC wakeup alarm.
Enable this with a kernel parameter like "test_suspend=mem".
@@ -198,7 +210,7 @@ config PM_SLEEP_DEBUG
config DPM_WATCHDOG
bool "Device suspend/resume watchdog"
depends on PM_DEBUG && PSTORE && EXPERT
- ---help---
+ help
Sets up a watchdog timer to capture drivers that are
locked up attempting to suspend/resume a device.
A detected lockup causes system panic with message
@@ -231,7 +243,7 @@ config PM_TRACE_RTC
depends on PM_SLEEP_DEBUG
depends on X86
select PM_TRACE
- ---help---
+ help
This enables some cheesy code to save the last PM event point in the
RTC across reboots, so that you can debug a machine that just hangs
during suspend (or more commonly, during resume).
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index e7e47d9be1e5..5899260a8bef 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -10,7 +10,8 @@ obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o
obj-$(CONFIG_FREEZER) += process.o
obj-$(CONFIG_SUSPEND) += suspend.o
obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
-obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o
+obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o
+obj-$(CONFIG_HIBERNATION_SNAPSHOT_DEV) += user.o
obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o
obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 30bd28d1d418..02ec716a4927 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -67,6 +67,18 @@ bool freezer_test_done;
static const struct platform_hibernation_ops *hibernation_ops;
+static atomic_t hibernate_atomic = ATOMIC_INIT(1);
+
+bool hibernate_acquire(void)
+{
+ return atomic_add_unless(&hibernate_atomic, -1, 0);
+}
+
+void hibernate_release(void)
+{
+ atomic_inc(&hibernate_atomic);
+}
+
bool hibernation_available(void)
{
return nohibernate == 0 && !security_locked_down(LOCKDOWN_HIBERNATION);
@@ -704,7 +716,7 @@ int hibernate(void)
lock_system_sleep();
/* The snapshot device should not be opened while we're running */
- if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
+ if (!hibernate_acquire()) {
error = -EBUSY;
goto Unlock;
}
@@ -775,7 +787,7 @@ int hibernate(void)
Exit:
__pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL);
pm_restore_console();
- atomic_inc(&snapshot_device_available);
+ hibernate_release();
Unlock:
unlock_system_sleep();
pr_info("hibernation exit\n");
@@ -880,7 +892,7 @@ static int software_resume(void)
goto Unlock;
/* The snapshot device should not be opened while we're running */
- if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
+ if (!hibernate_acquire()) {
error = -EBUSY;
swsusp_close(FMODE_READ);
goto Unlock;
@@ -911,7 +923,7 @@ static int software_resume(void)
__pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);
pm_restore_console();
pr_info("resume failed (%d)\n", error);
- atomic_inc(&snapshot_device_available);
+ hibernate_release();
/* For success case, the suspend path will release the lock */
Unlock:
mutex_unlock(&system_transition_mutex);
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 7cdc64dc2373..ba2094db6294 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -154,8 +154,8 @@ extern int snapshot_write_next(struct snapshot_handle *handle);
extern void snapshot_write_finalize(struct snapshot_handle *handle);
extern int snapshot_image_loaded(struct snapshot_handle *handle);
-/* If unset, the snapshot device cannot be open. */
-extern atomic_t snapshot_device_available;
+extern bool hibernate_acquire(void);
+extern void hibernate_release(void);
extern sector_t alloc_swapdev_block(int swap);
extern void free_all_swap_pages(int swap);
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 6d475281c730..562aa0e450ed 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -29,7 +29,7 @@ static void handle_poweroff(int key)
schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work);
}
-static struct sysrq_key_op sysrq_poweroff_op = {
+static const struct sysrq_key_op sysrq_poweroff_op = {
.handler = handle_poweroff,
.help_msg = "poweroff(o)",
.action_msg = "Power Off",
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 659800157b17..881128b9351e 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -34,7 +34,6 @@
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
-#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/io.h>
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index ca0fcb5ced71..01e2858b5fe3 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -1590,7 +1590,7 @@ int swsusp_unmark(void)
}
#endif
-static int swsusp_header_init(void)
+static int __init swsusp_header_init(void)
{
swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL);
if (!swsusp_header)
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 7959449765d9..d5eedc2baa2a 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -35,9 +35,13 @@ static struct snapshot_data {
bool ready;
bool platform_support;
bool free_bitmaps;
+ struct inode *bd_inode;
} snapshot_state;
-atomic_t snapshot_device_available = ATOMIC_INIT(1);
+int is_hibernate_resume_dev(const struct inode *bd_inode)
+{
+ return hibernation_available() && snapshot_state.bd_inode == bd_inode;
+}
static int snapshot_open(struct inode *inode, struct file *filp)
{
@@ -49,13 +53,13 @@ static int snapshot_open(struct inode *inode, struct file *filp)
lock_system_sleep();
- if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
+ if (!hibernate_acquire()) {
error = -EBUSY;
goto Unlock;
}
if ((filp->f_flags & O_ACCMODE) == O_RDWR) {
- atomic_inc(&snapshot_device_available);
+ hibernate_release();
error = -ENOSYS;
goto Unlock;
}
@@ -92,11 +96,12 @@ static int snapshot_open(struct inode *inode, struct file *filp)
__pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);
}
if (error)
- atomic_inc(&snapshot_device_available);
+ hibernate_release();
data->frozen = false;
data->ready = false;
data->platform_support = false;
+ data->bd_inode = NULL;
Unlock:
unlock_system_sleep();
@@ -112,6 +117,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)
swsusp_free();
data = filp->private_data;
+ data->bd_inode = NULL;
free_all_swap_pages(data->swap);
if (data->frozen) {
pm_restore_gfp_mask();
@@ -122,7 +128,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)
}
pm_notifier_call_chain(data->mode == O_RDONLY ?
PM_POST_HIBERNATION : PM_POST_RESTORE);
- atomic_inc(&snapshot_device_available);
+ hibernate_release();
unlock_system_sleep();
@@ -204,6 +210,7 @@ struct compat_resume_swap_area {
static int snapshot_set_swap_area(struct snapshot_data *data,
void __user *argp)
{
+ struct block_device *bdev;
sector_t offset;
dev_t swdev;
@@ -234,9 +241,12 @@ static int snapshot_set_swap_area(struct snapshot_data *data,
data->swap = -1;
return -EINVAL;
}
- data->swap = swap_type_of(swdev, offset, NULL);
+ data->swap = swap_type_of(swdev, offset, &bdev);
if (data->swap < 0)
return -ENODEV;
+
+ data->bd_inode = bdev->bd_inode;
+ bdput(bdev);
return 0;
}
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 9fdd6a42ad6a..b71eaf5f5a86 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -35,7 +35,6 @@
#include <linux/memblock.h>
#include <linux/syscalls.h>
#include <linux/crash_core.h>
-#include <linux/kdb.h>
#include <linux/ratelimit.h>
#include <linux/kmsg_dump.h>
#include <linux/syslog.h>
@@ -173,7 +172,7 @@ __setup("printk.devkmsg=", control_devkmsg);
char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit";
int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
char old_str[DEVKMSG_STR_MAX_SIZE];
unsigned int old;
@@ -975,16 +974,6 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
user->idx = log_next_idx;
user->seq = log_next_seq;
break;
- case SEEK_CUR:
- /*
- * It isn't supported due to the record nature of this
- * interface: _SET _DATA and _END point to very specific
- * record positions, while _CUR would be more useful in case
- * of a byte-based log. Because of that, return the default
- * errno value for invalid seek operation.
- */
- ret = -ESPIPE;
- break;
default:
ret = -EINVAL;
}
@@ -2047,18 +2036,7 @@ EXPORT_SYMBOL(vprintk);
int vprintk_default(const char *fmt, va_list args)
{
- int r;
-
-#ifdef CONFIG_KGDB_KDB
- /* Allow to pass printk() to kdb but avoid a recursion. */
- if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) {
- r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
- return r;
- }
-#endif
- r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
-
- return r;
+ return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
}
EXPORT_SYMBOL_GPL(vprintk_default);
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index 4242403316bb..50aeae770434 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -6,6 +6,7 @@
#include <linux/preempt.h>
#include <linux/spinlock.h>
#include <linux/debug_locks.h>
+#include <linux/kdb.h>
#include <linux/smp.h>
#include <linux/cpumask.h>
#include <linux/irq_work.h>
@@ -360,6 +361,12 @@ void __printk_safe_exit(void)
__printf(1, 0) int vprintk_func(const char *fmt, va_list args)
{
+#ifdef CONFIG_KGDB_KDB
+ /* Allow to pass printk() to kdb but avoid a recursion. */
+ if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0))
+ return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
+#endif
+
/*
* Try to use the main logbuf even in NMI. But avoid calling console
* drivers that might have their own locks.
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index c716eadc7617..6c6569e0586c 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -250,7 +250,7 @@ static noinstr void rcu_dynticks_eqs_enter(void)
* next idle sojourn.
*/
rcu_dynticks_task_trace_enter(); // Before ->dynticks update!
- seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
+ seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
// RCU is no longer watching. Better be in extended quiescent state!
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
(seq & RCU_DYNTICK_CTRL_CTR));
@@ -274,13 +274,13 @@ static noinstr void rcu_dynticks_eqs_exit(void)
* and we also must force ordering with the next RCU read-side
* critical section.
*/
- seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
+ seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
// RCU is now watching. Better not be in an extended quiescent state!
rcu_dynticks_task_trace_exit(); // After ->dynticks update!
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
!(seq & RCU_DYNTICK_CTRL_CTR));
if (seq & RCU_DYNTICK_CTRL_MASK) {
- atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdp->dynticks);
+ arch_atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdp->dynticks);
smp_mb__after_atomic(); /* _exit after clearing mask. */
}
}
@@ -313,7 +313,7 @@ static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void)
{
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- return !(atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR);
+ return !(arch_atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR);
}
/*
@@ -633,6 +633,10 @@ static noinstr void rcu_eqs_enter(bool user)
do_nocb_deferred_wakeup(rdp);
rcu_prepare_for_idle();
rcu_preempt_deferred_qs(current);
+
+ // instrumentation for the noinstr rcu_dynticks_eqs_enter()
+ instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
+
instrumentation_end();
WRITE_ONCE(rdp->dynticks_nesting, 0); /* Avoid irq-access tearing. */
// RCU is watching here ...
@@ -692,6 +696,7 @@ noinstr void rcu_nmi_exit(void)
{
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
+ instrumentation_begin();
/*
* Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
* (We are exiting an NMI handler, so RCU better be paying attention
@@ -705,7 +710,6 @@ noinstr void rcu_nmi_exit(void)
* leave it in non-RCU-idle state.
*/
if (rdp->dynticks_nmi_nesting != 1) {
- instrumentation_begin();
trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2,
atomic_read(&rdp->dynticks));
WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */
@@ -714,13 +718,15 @@ noinstr void rcu_nmi_exit(void)
return;
}
- instrumentation_begin();
/* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks));
WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
if (!in_nmi())
rcu_prepare_for_idle();
+
+ // instrumentation for the noinstr rcu_dynticks_eqs_enter()
+ instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
instrumentation_end();
// RCU is watching here ...
@@ -838,6 +844,10 @@ static void noinstr rcu_eqs_exit(bool user)
rcu_dynticks_eqs_exit();
// ... but is watching here.
instrumentation_begin();
+
+ // instrumentation for the noinstr rcu_dynticks_eqs_exit()
+ instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
+
rcu_cleanup_after_idle();
trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks));
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
@@ -983,13 +993,21 @@ noinstr void rcu_nmi_enter(void)
if (!in_nmi())
rcu_cleanup_after_idle();
+ instrumentation_begin();
+ // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs()
+ instrument_atomic_read(&rdp->dynticks, sizeof(rdp->dynticks));
+ // instrumentation for the noinstr rcu_dynticks_eqs_exit()
+ instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
+
incby = 1;
} else if (!in_nmi()) {
instrumentation_begin();
rcu_irq_enter_check_tick();
instrumentation_end();
+ } else {
+ instrumentation_begin();
}
- instrumentation_begin();
+
trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
rdp->dynticks_nmi_nesting,
rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks));
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index ae76bd329582..54a6dba0280d 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -807,7 +807,7 @@ static void sysrq_show_rcu(int key)
show_rcu_gp_kthreads();
}
-static struct sysrq_key_op sysrq_rcudump_op = {
+static const struct sysrq_key_op sysrq_rcudump_op = {
.handler = sysrq_show_rcu,
.help_msg = "show-rcu(y)",
.action_msg = "Show RCU tree",
diff --git a/kernel/relay.c b/kernel/relay.c
index ade14fb7ce2e..72fe443ea78f 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1,7 +1,7 @@
/*
* Public API and common code for kernel->userspace relay file support.
*
- * See Documentation/filesystems/relay.txt for an overview.
+ * See Documentation/filesystems/relay.rst for an overview.
*
* Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
* Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com)
@@ -91,7 +91,7 @@ static void relay_free_page_array(struct page **array)
*
* Returns 0 if ok, negative on error
*
- * Caller should already have grabbed mmap_sem.
+ * Caller should already have grabbed mmap_lock.
*/
static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
{
@@ -581,6 +581,11 @@ struct rchan *relay_open(const char *base_filename,
return NULL;
chan->buf = alloc_percpu(struct rchan_buf *);
+ if (!chan->buf) {
+ kfree(chan);
+ return NULL;
+ }
+
chan->version = RELAYFS_CHANNEL_VERSION;
chan->n_subbufs = n_subbufs;
chan->subbuf_size = subbuf_size;
@@ -991,14 +996,14 @@ static void relay_file_read_consume(struct rchan_buf *buf,
/*
* relay_file_read_avail - boolean, are there unconsumed bytes available?
*/
-static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
+static int relay_file_read_avail(struct rchan_buf *buf)
{
size_t subbuf_size = buf->chan->subbuf_size;
size_t n_subbufs = buf->chan->n_subbufs;
size_t produced = buf->subbufs_produced;
size_t consumed = buf->subbufs_consumed;
- relay_file_read_consume(buf, read_pos, 0);
+ relay_file_read_consume(buf, 0, 0);
consumed = buf->subbufs_consumed;
@@ -1059,23 +1064,20 @@ static size_t relay_file_read_subbuf_avail(size_t read_pos,
/**
* relay_file_read_start_pos - find the first available byte to read
- * @read_pos: file read position
* @buf: relay channel buffer
*
- * If the @read_pos is in the middle of padding, return the
+ * If the read_pos is in the middle of padding, return the
* position of the first actually available byte, otherwise
* return the original value.
*/
-static size_t relay_file_read_start_pos(size_t read_pos,
- struct rchan_buf *buf)
+static size_t relay_file_read_start_pos(struct rchan_buf *buf)
{
size_t read_subbuf, padding, padding_start, padding_end;
size_t subbuf_size = buf->chan->subbuf_size;
size_t n_subbufs = buf->chan->n_subbufs;
size_t consumed = buf->subbufs_consumed % n_subbufs;
+ size_t read_pos = consumed * subbuf_size + buf->bytes_consumed;
- if (!read_pos)
- read_pos = consumed * subbuf_size + buf->bytes_consumed;
read_subbuf = read_pos / subbuf_size;
padding = buf->padding[read_subbuf];
padding_start = (read_subbuf + 1) * subbuf_size - padding;
@@ -1131,10 +1133,10 @@ static ssize_t relay_file_read(struct file *filp,
do {
void *from;
- if (!relay_file_read_avail(buf, *ppos))
+ if (!relay_file_read_avail(buf))
break;
- read_start = relay_file_read_start_pos(*ppos, buf);
+ read_start = relay_file_read_start_pos(buf);
avail = relay_file_read_subbuf_avail(read_start, buf);
if (!avail)
break;
@@ -1177,10 +1179,9 @@ static void relay_pipe_buf_release(struct pipe_inode_info *pipe,
}
static const struct pipe_buf_operations relay_pipe_buf_ops = {
- .confirm = generic_pipe_buf_confirm,
- .release = relay_pipe_buf_release,
- .steal = generic_pipe_buf_steal,
- .get = generic_pipe_buf_get,
+ .release = relay_pipe_buf_release,
+ .try_steal = generic_pipe_buf_try_steal,
+ .get = generic_pipe_buf_get,
};
static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i)
diff --git a/kernel/resource.c b/kernel/resource.c
index 76036a41143b..841737bbda9e 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1126,6 +1126,7 @@ struct resource * __request_region(struct resource *parent,
{
DECLARE_WAITQUEUE(wait, current);
struct resource *res = alloc_resource(GFP_KERNEL);
+ struct resource *orig_parent = parent;
if (!res)
return NULL;
@@ -1176,6 +1177,10 @@ struct resource * __request_region(struct resource *parent,
break;
}
write_unlock(&resource_lock);
+
+ if (res && orig_parent == &iomem_resource)
+ revoke_devmem(res);
+
return res;
}
EXPORT_SYMBOL(__request_region);
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 21fb5a5662b5..5fc9c9b70862 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -7,6 +7,12 @@ endif
# that is not a function of syscall inputs. E.g. involuntary context switches.
KCOV_INSTRUMENT := n
+# There are numerous data races here, however, most of them are due to plain accesses.
+# This would make it even harder for syzbot to find reproducers, because these
+# bugs trigger without specific input. Disable by default, but should re-enable
+# eventually.
+KCSAN_SANITIZE := n
+
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
# needed for x86 only. Why this used to be enabled for all architectures is beyond
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a05b85b3e0f7..ca5db40392d4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -11,6 +11,7 @@
#include <linux/nospec.h>
#include <linux/kcov.h>
+#include <linux/scs.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
@@ -20,6 +21,7 @@
#include "../smpboot.h"
#include "pelt.h"
+#include "smp.h"
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
@@ -219,6 +221,13 @@ void update_rq_clock(struct rq *rq)
update_rq_clock_task(rq, delta);
}
+static inline void
+rq_csd_init(struct rq *rq, call_single_data_t *csd, smp_call_func_t func)
+{
+ csd->flags = 0;
+ csd->func = func;
+ csd->info = rq;
+}
#ifdef CONFIG_SCHED_HRTICK
/*
@@ -314,16 +323,14 @@ void hrtick_start(struct rq *rq, u64 delay)
hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
HRTIMER_MODE_REL_PINNED_HARD);
}
+
#endif /* CONFIG_SMP */
static void hrtick_rq_init(struct rq *rq)
{
#ifdef CONFIG_SMP
- rq->hrtick_csd.flags = 0;
- rq->hrtick_csd.func = __hrtick_start;
- rq->hrtick_csd.info = rq;
+ rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start);
#endif
-
hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
rq->hrtick_timer.function = hrtick;
}
@@ -632,29 +639,23 @@ void wake_up_nohz_cpu(int cpu)
wake_up_idle_cpu(cpu);
}
-static inline bool got_nohz_idle_kick(void)
+static void nohz_csd_func(void *info)
{
- int cpu = smp_processor_id();
-
- if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
- return false;
-
- if (idle_cpu(cpu) && !need_resched())
- return true;
+ struct rq *rq = info;
+ int cpu = cpu_of(rq);
+ unsigned int flags;
/*
- * We can't run Idle Load Balance on this CPU for this time so we
- * cancel it and clear NOHZ_BALANCE_KICK
+ * Release the rq::nohz_csd.
*/
- atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
- return false;
-}
-
-#else /* CONFIG_NO_HZ_COMMON */
+ flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
+ WARN_ON(!(flags & NOHZ_KICK_MASK));
-static inline bool got_nohz_idle_kick(void)
-{
- return false;
+ rq->idle_balance = idle_cpu(cpu);
+ if (rq->idle_balance && !need_resched()) {
+ rq->nohz_idle_balance = flags;
+ raise_softirq_irqoff(SCHED_SOFTIRQ);
+ }
}
#endif /* CONFIG_NO_HZ_COMMON */
@@ -1110,8 +1111,7 @@ static void uclamp_update_root_tg(void) { }
#endif
int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
bool update_root_tg = false;
int old_min, old_max;
@@ -1539,7 +1539,7 @@ static int migration_cpu_stop(void *data)
* __migrate_task() such that we will not miss enforcing cpus_ptr
* during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
*/
- sched_ttwu_pending();
+ flush_smp_call_function_from_idle();
raw_spin_lock(&p->pi_lock);
rq_lock(rq, &rf);
@@ -1637,7 +1637,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
goto out;
}
- if (cpumask_equal(p->cpus_ptr, new_mask))
+ if (cpumask_equal(&p->cpus_mask, new_mask))
goto out;
/*
@@ -2273,75 +2273,63 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
}
#ifdef CONFIG_SMP
-void sched_ttwu_pending(void)
+void sched_ttwu_pending(void *arg)
{
+ struct llist_node *llist = arg;
struct rq *rq = this_rq();
- struct llist_node *llist = llist_del_all(&rq->wake_list);
struct task_struct *p, *t;
struct rq_flags rf;
if (!llist)
return;
+ /*
+ * rq::ttwu_pending racy indication of out-standing wakeups.
+ * Races such that false-negatives are possible, since they
+ * are shorter lived that false-positives would be.
+ */
+ WRITE_ONCE(rq->ttwu_pending, 0);
+
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
- llist_for_each_entry_safe(p, t, llist, wake_entry)
+ llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
+ if (WARN_ON_ONCE(p->on_cpu))
+ smp_cond_load_acquire(&p->on_cpu, !VAL);
+
+ if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
+ set_task_cpu(p, cpu_of(rq));
+
ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
+ }
rq_unlock_irqrestore(rq, &rf);
}
-void scheduler_ipi(void)
+void send_call_function_single_ipi(int cpu)
{
- /*
- * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
- * TIF_NEED_RESCHED remotely (for the first time) will also send
- * this IPI.
- */
- preempt_fold_need_resched();
-
- if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
- return;
-
- /*
- * Not all reschedule IPI handlers call irq_enter/irq_exit, since
- * traditionally all their work was done from the interrupt return
- * path. Now that we actually do some work, we need to make sure
- * we do call them.
- *
- * Some archs already do call them, luckily irq_enter/exit nest
- * properly.
- *
- * Arguably we should visit all archs and update all handlers,
- * however a fair share of IPIs are still resched only so this would
- * somewhat pessimize the simple resched case.
- */
- irq_enter();
- sched_ttwu_pending();
+ struct rq *rq = cpu_rq(cpu);
- /*
- * Check if someone kicked us for doing the nohz idle load balance.
- */
- if (unlikely(got_nohz_idle_kick())) {
- this_rq()->idle_balance = 1;
- raise_softirq_irqoff(SCHED_SOFTIRQ);
- }
- irq_exit();
+ if (!set_nr_if_polling(rq->idle))
+ arch_send_call_function_single_ipi(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
}
-static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
+/*
+ * Queue a task on the target CPUs wake_list and wake the CPU via IPI if
+ * necessary. The wakee CPU on receipt of the IPI will queue the task
+ * via sched_ttwu_wakeup() for activation so the wakee incurs the cost
+ * of the wakeup instead of the waker.
+ */
+static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
struct rq *rq = cpu_rq(cpu);
p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
- if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
- if (!set_nr_if_polling(rq->idle))
- smp_send_reschedule(cpu);
- else
- trace_sched_wake_idle_without_ipi(cpu);
- }
+ WRITE_ONCE(rq->ttwu_pending, 1);
+ __smp_call_single_queue(cpu, &p->wake_entry.llist);
}
void wake_up_if_idle(int cpu)
@@ -2372,6 +2360,41 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
{
return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
+
+static inline bool ttwu_queue_cond(int cpu, int wake_flags)
+{
+ /*
+ * If the CPU does not share cache, then queue the task on the
+ * remote rqs wakelist to avoid accessing remote data.
+ */
+ if (!cpus_share_cache(smp_processor_id(), cpu))
+ return true;
+
+ /*
+ * If the task is descheduling and the only running task on the
+ * CPU then use the wakelist to offload the task activation to
+ * the soon-to-be-idle CPU as the current CPU is likely busy.
+ * nr_running is checked to avoid unnecessary task stacking.
+ */
+ if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1)
+ return true;
+
+ return false;
+}
+
+static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
+{
+ if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
+ if (WARN_ON_ONCE(cpu == smp_processor_id()))
+ return false;
+
+ sched_clock_cpu(cpu); /* Sync clocks across CPUs */
+ __ttwu_queue_wakelist(p, cpu, wake_flags);
+ return true;
+ }
+
+ return false;
+}
#endif /* CONFIG_SMP */
static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
@@ -2380,11 +2403,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
struct rq_flags rf;
#if defined(CONFIG_SMP)
- if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
- sched_clock_cpu(cpu); /* Sync clocks across CPUs */
- ttwu_queue_remote(p, cpu, wake_flags);
+ if (ttwu_queue_wakelist(p, cpu, wake_flags))
return;
- }
#endif
rq_lock(rq, &rf);
@@ -2518,7 +2538,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
goto out;
success = 1;
- cpu = task_cpu(p);
trace_sched_waking(p);
p->state = TASK_RUNNING;
trace_sched_wakeup(p);
@@ -2540,7 +2559,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
/* We're going to change ->state: */
success = 1;
- cpu = task_cpu(p);
/*
* Ensure we load p->on_rq _after_ p->state, otherwise it would
@@ -2568,7 +2586,15 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
if (p->on_rq && ttwu_remote(p, wake_flags))
goto unlock;
+ if (p->in_iowait) {
+ delayacct_blkio_end(p);
+ atomic_dec(&task_rq(p)->nr_iowait);
+ }
+
#ifdef CONFIG_SMP
+ p->sched_contributes_to_load = !!task_contributes_to_load(p);
+ p->state = TASK_WAKING;
+
/*
* Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
* possible to, falsely, observe p->on_cpu == 0.
@@ -2592,6 +2618,29 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
/*
* If the owning (remote) CPU is still in the middle of schedule() with
+ * this task as prev, considering queueing p on the remote CPUs wake_list
+ * which potentially sends an IPI instead of spinning on p->on_cpu to
+ * let the waker make forward progress. This is safe because IRQs are
+ * disabled and the IPI will deliver after on_cpu is cleared.
+ *
+ * Ensure we load task_cpu(p) after p->on_cpu:
+ *
+ * set_task_cpu(p, cpu);
+ * STORE p->cpu = @cpu
+ * __schedule() (switch to task 'p')
+ * LOCK rq->lock
+ * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu)
+ * STORE p->on_cpu = 1 LOAD p->cpu
+ *
+ * to ensure we observe the correct CPU on which the task is currently
+ * scheduling.
+ */
+ if (smp_load_acquire(&p->on_cpu) &&
+ ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
+ goto unlock;
+
+ /*
+ * If the owning (remote) CPU is still in the middle of schedule() with
* this task as prev, wait until its done referencing the task.
*
* Pairs with the smp_store_release() in finish_task().
@@ -2601,28 +2650,14 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
*/
smp_cond_load_acquire(&p->on_cpu, !VAL);
- p->sched_contributes_to_load = !!task_contributes_to_load(p);
- p->state = TASK_WAKING;
-
- if (p->in_iowait) {
- delayacct_blkio_end(p);
- atomic_dec(&task_rq(p)->nr_iowait);
- }
-
cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
if (task_cpu(p) != cpu) {
wake_flags |= WF_MIGRATED;
psi_ttwu_dequeue(p);
set_task_cpu(p, cpu);
}
-
-#else /* CONFIG_SMP */
-
- if (p->in_iowait) {
- delayacct_blkio_end(p);
- atomic_dec(&task_rq(p)->nr_iowait);
- }
-
+#else
+ cpu = task_cpu(p);
#endif /* CONFIG_SMP */
ttwu_queue(p, cpu, wake_flags);
@@ -2630,7 +2665,7 @@ unlock:
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
out:
if (success)
- ttwu_stat(p, cpu, wake_flags);
+ ttwu_stat(p, task_cpu(p), wake_flags);
preempt_enable();
return success;
@@ -2750,6 +2785,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->capture_control = NULL;
#endif
init_numa_balancing(clone_flags, p);
+#ifdef CONFIG_SMP
+ p->wake_entry.u_flags = CSD_TYPE_TTWU;
+#endif
}
DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -2766,7 +2804,7 @@ void set_numabalancing_state(bool enabled)
#ifdef CONFIG_PROC_SYSCTL
int sysctl_numa_balancing(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
int err;
@@ -2840,8 +2878,8 @@ static void __init init_schedstats(void)
}
#ifdef CONFIG_PROC_SYSCTL
-int sysctl_schedstats(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
int err;
@@ -3907,8 +3945,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
&& in_atomic_preempt_off()) {
pr_err("Preemption disabled at:");
- print_ip_sym(preempt_disable_ip);
- pr_cont("\n");
+ print_ip_sym(KERN_ERR, preempt_disable_ip);
}
if (panic_on_warn)
panic("scheduling while atomic\n");
@@ -3925,6 +3962,9 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
#ifdef CONFIG_SCHED_STACK_END_CHECK
if (task_stack_end_corrupted(prev))
panic("corrupted stack end detected inside scheduler\n");
+
+ if (task_scs_end_corrupted(prev))
+ panic("corrupted shadow stack detected inside scheduler\n");
#endif
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
@@ -3947,6 +3987,28 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
schedstat_inc(this_rq()->sched_count);
}
+static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
+ struct rq_flags *rf)
+{
+#ifdef CONFIG_SMP
+ const struct sched_class *class;
+ /*
+ * We must do the balancing pass before put_prev_task(), such
+ * that when we release the rq->lock the task is in the same
+ * state as before we took rq->lock.
+ *
+ * We can terminate the balance pass as soon as we know there is
+ * a runnable task of @class priority or higher.
+ */
+ for_class_range(class, prev->sched_class, &idle_sched_class) {
+ if (class->balance(rq, prev, rf))
+ break;
+ }
+#endif
+
+ put_prev_task(rq, prev);
+}
+
/*
* Pick up the highest-prio task:
*/
@@ -3980,22 +4042,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
}
restart:
-#ifdef CONFIG_SMP
- /*
- * We must do the balancing pass before put_next_task(), such
- * that when we release the rq->lock the task is in the same
- * state as before we took rq->lock.
- *
- * We can terminate the balance pass as soon as we know there is
- * a runnable task of @class priority or higher.
- */
- for_class_range(class, prev->sched_class, &idle_sched_class) {
- if (class->balance(rq, prev, rf))
- break;
- }
-#endif
-
- put_prev_task(rq, prev);
+ put_prev_task_balance(rq, prev, rf);
for_each_class(class) {
p = class->pick_next_task(rq);
@@ -4509,7 +4556,8 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
*/
if (dl_prio(prio)) {
if (!dl_prio(p->normal_prio) ||
- (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
+ (pi_task && dl_prio(pi_task->prio) &&
+ dl_entity_preempt(&pi_task->dl, &p->dl))) {
p->dl.dl_boosted = 1;
queue_flag |= ENQUEUE_REPLENISH;
} else
@@ -4685,7 +4733,7 @@ int idle_cpu(int cpu)
return 0;
#ifdef CONFIG_SMP
- if (!llist_empty(&rq->wake_list))
+ if (rq->ttwu_pending)
return 0;
#endif
@@ -6001,7 +6049,7 @@ void sched_show_task(struct task_struct *p)
(unsigned long)task_thread_info(p)->flags);
print_worker_info(KERN_INFO, p);
- show_stack(p, NULL);
+ show_stack(p, NULL, KERN_INFO);
put_task_stack(p);
}
EXPORT_SYMBOL_GPL(sched_show_task);
@@ -6088,6 +6136,7 @@ void init_idle(struct task_struct *idle, int cpu)
idle->se.exec_start = sched_clock();
idle->flags |= PF_IDLE;
+ scs_task_reset(idle);
kasan_unpoison_task_stack(idle);
#ifdef CONFIG_SMP
@@ -6238,13 +6287,14 @@ void idle_task_exit(void)
struct mm_struct *mm = current->active_mm;
BUG_ON(cpu_online(smp_processor_id()));
+ BUG_ON(current != this_rq()->idle);
if (mm != &init_mm) {
switch_mm(mm, &init_mm, current);
- current->active_mm = &init_mm;
finish_arch_post_lock_switch();
}
- mmdrop(mm);
+
+ /* finish_cpu(), as ran on the BP, will clean up the active_mm state */
}
/*
@@ -6534,7 +6584,6 @@ int sched_cpu_dying(unsigned int cpu)
struct rq_flags rf;
/* Handle pending wakeups and then migrate everything off */
- sched_ttwu_pending();
sched_tick_stop(cpu);
rq_lock_irqsave(rq, &rf);
@@ -6637,6 +6686,8 @@ void __init sched_init(void)
root_task_group.cfs_rq = (struct cfs_rq **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
+ root_task_group.shares = ROOT_TASK_GROUP_LOAD;
+ init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
root_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -6689,7 +6740,6 @@ void __init sched_init(void)
init_rt_rq(&rq->rt);
init_dl_rq(&rq->dl);
#ifdef CONFIG_FAIR_GROUP_SCHED
- root_task_group.shares = ROOT_TASK_GROUP_LOAD;
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
/*
@@ -6711,7 +6761,6 @@ void __init sched_init(void)
* We achieve this by letting root_task_group's tasks sit
* directly in rq->cfs (i.e root_task_group->se[] = NULL).
*/
- init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -6739,6 +6788,8 @@ void __init sched_init(void)
#ifdef CONFIG_NO_HZ_COMMON
rq->last_blocked_load_update_tick = jiffies;
atomic_set(&rq->nohz_flags, 0);
+
+ rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
#endif
#endif /* CONFIG_SMP */
hrtick_rq_init(rq);
@@ -6843,8 +6894,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
&& !preempt_count_equals(preempt_offset)) {
pr_err("Preemption disabled at:");
- print_ip_sym(preempt_disable_ip);
- pr_cont("\n");
+ print_ip_sym(KERN_ERR, preempt_disable_ip);
}
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
@@ -7433,6 +7483,8 @@ static DEFINE_MUTEX(cfs_constraints_mutex);
const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
+/* More than 203 days if BW_SHIFT equals 20. */
+static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
@@ -7461,6 +7513,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
return -EINVAL;
/*
+ * Bound quota to defend quota against overflow during bandwidth shift.
+ */
+ if (quota != RUNTIME_INF && quota > max_cfs_runtime)
+ return -EINVAL;
+
+ /*
* Prevent race between setting of cfs_rq->runtime_enabled and
* unthrottle_offline_cfs_rqs().
*/
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 9fbb10383434..941c28cf9738 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -5,6 +5,7 @@
* Based on the work by Paul Menage (menage@google.com) and Balbir Singh
* (balbir@in.ibm.com).
*/
+#include <asm/irq_regs.h>
#include "sched.h"
/* Time spent by the tasks of the CPU accounting group executing in ... */
@@ -339,7 +340,7 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
{
struct cpuacct *ca;
int index = CPUACCT_STAT_SYSTEM;
- struct pt_regs *regs = task_pt_regs(tsk);
+ struct pt_regs *regs = get_irq_regs() ? : task_pt_regs(tsk);
if (regs && user_mode(regs))
index = CPUACCT_STAT_USER;
@@ -347,7 +348,7 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
rcu_read_lock();
for (ca = task_ca(tsk); ca; ca = parent_ca(ca))
- this_cpu_ptr(ca->cpuusage)->usages[index] += cputime;
+ __this_cpu_add(ca->cpuusage->usages[index], cputime);
rcu_read_unlock();
}
@@ -363,7 +364,7 @@ void cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
rcu_read_lock();
for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca))
- this_cpu_ptr(ca->cpustat)->cpustat[index] += val;
+ __this_cpu_add(ca->cpustat->cpustat[index], val);
rcu_read_unlock();
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 504d2f51b0d6..f63f337c7147 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2692,6 +2692,7 @@ void __dl_clear_params(struct task_struct *p)
dl_se->dl_bw = 0;
dl_se->dl_density = 0;
+ dl_se->dl_boosted = 0;
dl_se->dl_throttled = 0;
dl_se->dl_yielded = 0;
dl_se->dl_non_contending = 0;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 239970b991c0..36c54265bb2b 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -258,7 +258,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax);
- set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, proc_dointvec_minmax);
set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax);
set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring);
/* &table[8] is terminator */
@@ -437,7 +437,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
else
SEQ_printf(m, " %c", task_state_to_char(p));
- SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
+ SEQ_printf(m, " %15s %5d %9Ld.%06ld %9Ld %5d ",
p->comm, task_pid_nr(p),
SPLIT_NS(p->se.vruntime),
(long long)(p->nvcsw + p->nivcsw),
@@ -464,10 +464,10 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
SEQ_printf(m, "\n");
SEQ_printf(m, "runnable tasks:\n");
- SEQ_printf(m, " S task PID tree-key switches prio"
+ SEQ_printf(m, " S task PID tree-key switches prio"
" wait-time sum-exec sum-sleep\n");
SEQ_printf(m, "-------------------------------------------------------"
- "----------------------------------------------------\n");
+ "------------------------------------------------------\n");
rcu_read_lock();
for_each_process_thread(g, p) {
@@ -638,7 +638,6 @@ do { \
P(nr_running);
P(nr_switches);
- P(nr_load_updates);
P(nr_uninterruptible);
PN(next_balance);
SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index da3e5b54715b..658aa7a2ae6f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -191,7 +191,7 @@ static void update_sysctl(void)
#undef SET_SYSCTL
}
-void sched_init_granularity(void)
+void __init sched_init_granularity(void)
{
update_sysctl();
}
@@ -645,8 +645,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
*/
int sched_proc_update_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
unsigned int factor = get_update_sysctl_factor();
@@ -807,7 +806,7 @@ void post_init_entity_util_avg(struct task_struct *p)
}
}
- sa->runnable_avg = cpu_scale;
+ sa->runnable_avg = sa->util_avg;
if (p->sched_class != &fair_sched_class) {
/*
@@ -1094,7 +1093,7 @@ struct numa_group {
* more by CPU use than by memory faults.
*/
unsigned long *faults_cpu;
- unsigned long faults[0];
+ unsigned long faults[];
};
/*
@@ -2771,7 +2770,7 @@ static void task_numa_work(struct callback_head *work)
return;
- if (!down_read_trylock(&mm->mmap_sem))
+ if (!mmap_read_trylock(mm))
return;
vma = find_vma(mm, start);
if (!vma) {
@@ -2839,7 +2838,7 @@ out:
mm->numa_scan_offset = start;
else
reset_ptenuma_scan(p);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
/*
* Make sure tasks use at least 32x as much time to run other code
@@ -3441,52 +3440,46 @@ static inline void
update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
+ /*
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+ * See ___update_load_avg() for details.
+ */
+ u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
/* Nothing to update */
if (!delta)
return;
- /*
- * The relation between sum and avg is:
- *
- * LOAD_AVG_MAX - 1024 + sa->period_contrib
- *
- * however, the PELT windows are not aligned between grq and gse.
- */
-
/* Set new sched_entity's utilization */
se->avg.util_avg = gcfs_rq->avg.util_avg;
- se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
+ se->avg.util_sum = se->avg.util_avg * divider;
/* Update parent cfs_rq utilization */
add_positive(&cfs_rq->avg.util_avg, delta);
- cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
+ cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
}
static inline void
update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
+ /*
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+ * See ___update_load_avg() for details.
+ */
+ u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
/* Nothing to update */
if (!delta)
return;
- /*
- * The relation between sum and avg is:
- *
- * LOAD_AVG_MAX - 1024 + sa->period_contrib
- *
- * however, the PELT windows are not aligned between grq and gse.
- */
-
/* Set new sched_entity's runnable */
se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
- se->avg.runnable_sum = se->avg.runnable_avg * LOAD_AVG_MAX;
+ se->avg.runnable_sum = se->avg.runnable_avg * divider;
/* Update parent cfs_rq runnable */
add_positive(&cfs_rq->avg.runnable_avg, delta);
- cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * LOAD_AVG_MAX;
+ cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
}
static inline void
@@ -3496,19 +3489,26 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
unsigned long load_avg;
u64 load_sum = 0;
s64 delta_sum;
+ u32 divider;
if (!runnable_sum)
return;
gcfs_rq->prop_runnable_sum = 0;
+ /*
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+ * See ___update_load_avg() for details.
+ */
+ divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
+
if (runnable_sum >= 0) {
/*
* Add runnable; clip at LOAD_AVG_MAX. Reflects that until
* the CPU is saturated running == runnable.
*/
runnable_sum += se->avg.load_sum;
- runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX);
+ runnable_sum = min_t(long, runnable_sum, divider);
} else {
/*
* Estimate the new unweighted runnable_sum of the gcfs_rq by
@@ -3533,7 +3533,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
runnable_sum = max(runnable_sum, running_sum);
load_sum = (s64)se_weight(se) * runnable_sum;
- load_avg = div_s64(load_sum, LOAD_AVG_MAX);
+ load_avg = div_s64(load_sum, divider);
delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
delta_avg = load_avg - se->avg.load_avg;
@@ -3697,6 +3697,10 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
*/
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ /*
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+ * See ___update_load_avg() for details.
+ */
u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
/*
@@ -3873,6 +3877,8 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
return cfs_rq->avg.load_avg;
}
+static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
+
static inline unsigned long task_util(struct task_struct *p)
{
return READ_ONCE(p->se.avg.util_avg);
@@ -4054,7 +4060,7 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline void
detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
-static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
+static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
{
return 0;
}
@@ -4588,16 +4594,16 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
}
/* returns 0 on failure to allocate runtime */
-static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
+ struct cfs_rq *cfs_rq, u64 target_runtime)
{
- struct task_group *tg = cfs_rq->tg;
- struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
- u64 amount = 0, min_amount;
+ u64 min_amount, amount = 0;
+
+ lockdep_assert_held(&cfs_b->lock);
/* note: this is a positive sum as runtime_remaining <= 0 */
- min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
+ min_amount = target_runtime - cfs_rq->runtime_remaining;
- raw_spin_lock(&cfs_b->lock);
if (cfs_b->quota == RUNTIME_INF)
amount = min_amount;
else {
@@ -4609,13 +4615,25 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
cfs_b->idle = 0;
}
}
- raw_spin_unlock(&cfs_b->lock);
cfs_rq->runtime_remaining += amount;
return cfs_rq->runtime_remaining > 0;
}
+/* returns 0 on failure to allocate runtime */
+static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+ int ret;
+
+ raw_spin_lock(&cfs_b->lock);
+ ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
+ raw_spin_unlock(&cfs_b->lock);
+
+ return ret;
+}
+
static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
/* dock delta_exec before expiring quota (as it could span periods) */
@@ -4704,13 +4722,33 @@ static int tg_throttle_down(struct task_group *tg, void *data)
return 0;
}
-static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
+static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
long task_delta, idle_task_delta, dequeue = 1;
- bool empty;
+
+ raw_spin_lock(&cfs_b->lock);
+ /* This will start the period timer if necessary */
+ if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
+ /*
+ * We have raced with bandwidth becoming available, and if we
+ * actually throttled the timer might not unthrottle us for an
+ * entire period. We additionally needed to make sure that any
+ * subsequent check_cfs_rq_runtime calls agree not to throttle
+ * us, as we may commit to do cfs put_prev+pick_next, so we ask
+ * for 1ns of runtime rather than just check cfs_b.
+ */
+ dequeue = 0;
+ } else {
+ list_add_tail_rcu(&cfs_rq->throttled_list,
+ &cfs_b->throttled_cfs_rq);
+ }
+ raw_spin_unlock(&cfs_b->lock);
+
+ if (!dequeue)
+ return false; /* Throttle no longer required. */
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
@@ -4744,29 +4782,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
if (!se)
sub_nr_running(rq, task_delta);
- cfs_rq->throttled = 1;
- cfs_rq->throttled_clock = rq_clock(rq);
- raw_spin_lock(&cfs_b->lock);
- empty = list_empty(&cfs_b->throttled_cfs_rq);
-
/*
- * Add to the _head_ of the list, so that an already-started
- * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is
- * not running add to the tail so that later runqueues don't get starved.
+ * Note: distribution will already see us throttled via the
+ * throttled-list. rq->lock protects completion.
*/
- if (cfs_b->distribute_running)
- list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
- else
- list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
-
- /*
- * If we're the first throttled task, make sure the bandwidth
- * timer is running.
- */
- if (empty)
- start_cfs_bandwidth(cfs_b);
-
- raw_spin_unlock(&cfs_b->lock);
+ cfs_rq->throttled = 1;
+ cfs_rq->throttled_clock = rq_clock(rq);
+ return true;
}
void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
@@ -4933,14 +4955,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
/*
* This check is repeated as we release cfs_b->lock while we unthrottle.
*/
- while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
- cfs_b->distribute_running = 1;
+ while (throttled && cfs_b->runtime > 0) {
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
/* we can't nest cfs_b->lock while distributing bandwidth */
distribute_cfs_runtime(cfs_b);
raw_spin_lock_irqsave(&cfs_b->lock, flags);
- cfs_b->distribute_running = 0;
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
}
@@ -5054,10 +5074,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
/* confirm we're still not at a refresh boundary */
raw_spin_lock_irqsave(&cfs_b->lock, flags);
cfs_b->slack_started = false;
- if (cfs_b->distribute_running) {
- raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
- return;
- }
if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
@@ -5067,9 +5083,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
runtime = cfs_b->runtime;
- if (runtime)
- cfs_b->distribute_running = 1;
-
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
if (!runtime)
@@ -5078,7 +5091,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
distribute_cfs_runtime(cfs_b);
raw_spin_lock_irqsave(&cfs_b->lock, flags);
- cfs_b->distribute_running = 0;
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
}
@@ -5139,8 +5151,7 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
if (cfs_rq_throttled(cfs_rq))
return true;
- throttle_cfs_rq(cfs_rq);
- return true;
+ return throttle_cfs_rq(cfs_rq);
}
static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
@@ -5170,6 +5181,8 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
if (!overrun)
break;
+ idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
+
if (++count > 3) {
u64 new, old = ktime_to_ns(cfs_b->period);
@@ -5199,8 +5212,6 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
/* reset count so we don't come right back in here */
count = 0;
}
-
- idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
}
if (idle)
cfs_b->period_active = 0;
@@ -5221,7 +5232,6 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
cfs_b->period_timer.function = sched_cfs_period_timer;
hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
cfs_b->slack_timer.function = sched_cfs_slack_timer;
- cfs_b->distribute_running = 0;
cfs_b->slack_started = false;
}
@@ -5506,28 +5516,27 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
list_add_leaf_cfs_rq(cfs_rq);
}
-enqueue_throttle:
- if (!se) {
- add_nr_running(rq, 1);
- /*
- * Since new tasks are assigned an initial util_avg equal to
- * half of the spare capacity of their CPU, tiny tasks have the
- * ability to cross the overutilized threshold, which will
- * result in the load balancer ruining all the task placement
- * done by EAS. As a way to mitigate that effect, do not account
- * for the first enqueue operation of new tasks during the
- * overutilized flag detection.
- *
- * A better way of solving this problem would be to wait for
- * the PELT signals of tasks to converge before taking them
- * into account, but that is not straightforward to implement,
- * and the following generally works well enough in practice.
- */
- if (flags & ENQUEUE_WAKEUP)
- update_overutilized_status(rq);
+ /* At this point se is NULL and we are at root level*/
+ add_nr_running(rq, 1);
- }
+ /*
+ * Since new tasks are assigned an initial util_avg equal to
+ * half of the spare capacity of their CPU, tiny tasks have the
+ * ability to cross the overutilized threshold, which will
+ * result in the load balancer ruining all the task placement
+ * done by EAS. As a way to mitigate that effect, do not account
+ * for the first enqueue operation of new tasks during the
+ * overutilized flag detection.
+ *
+ * A better way of solving this problem would be to wait for
+ * the PELT signals of tasks to converge before taking them
+ * into account, but that is not straightforward to implement,
+ * and the following generally works well enough in practice.
+ */
+ if (flags & ENQUEUE_WAKEUP)
+ update_overutilized_status(rq);
+enqueue_throttle:
if (cfs_bandwidth_used()) {
/*
* When bandwidth control is enabled; the cfs_rq_throttled()
@@ -5737,7 +5746,7 @@ static int wake_wide(struct task_struct *p)
{
unsigned int master = current->wakee_flips;
unsigned int slave = p->wakee_flips;
- int factor = this_cpu_read(sd_llc_size);
+ int factor = __this_cpu_read(sd_llc_size);
if (master < slave)
swap(master, slave);
@@ -5846,8 +5855,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
}
static struct sched_group *
-find_idlest_group(struct sched_domain *sd, struct task_struct *p,
- int this_cpu, int sd_flag);
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
/*
* find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
@@ -5930,7 +5938,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
continue;
}
- group = find_idlest_group(sd, p, cpu, sd_flag);
+ group = find_idlest_group(sd, p, cpu);
if (!group) {
sd = sd->child;
continue;
@@ -6671,9 +6679,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
rcu_read_lock();
for_each_domain(cpu, tmp) {
- if (!(tmp->flags & SD_LOAD_BALANCE))
- break;
-
/*
* If both 'cpu' and 'prev_cpu' are part of this domain,
* cpu is a valid SD_WAKE_AFFINE target.
@@ -8584,7 +8589,7 @@ static int idle_cpu_without(int cpu, struct task_struct *p)
*/
#ifdef CONFIG_SMP
- if (!llist_empty(&rq->wake_list))
+ if (rq->ttwu_pending)
return 0;
#endif
@@ -8702,8 +8707,7 @@ static bool update_pick_idlest(struct sched_group *idlest,
* Assumes p is allowed on at least one CPU in sd.
*/
static struct sched_group *
-find_idlest_group(struct sched_domain *sd, struct task_struct *p,
- int this_cpu, int sd_flag)
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
{
struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
struct sg_lb_stats local_sgs, tmp_sgs;
@@ -9434,7 +9438,7 @@ static int active_load_balance_cpu_stop(void *data);
static int should_we_balance(struct lb_env *env)
{
struct sched_group *sg = env->sd->groups;
- int cpu, balance_cpu = -1;
+ int cpu;
/*
* Ensure the balancing environment is consistent; can happen
@@ -9455,18 +9459,12 @@ static int should_we_balance(struct lb_env *env)
if (!idle_cpu(cpu))
continue;
- balance_cpu = cpu;
- break;
+ /* Are we the first idle CPU? */
+ return cpu == env->dst_cpu;
}
- if (balance_cpu == -1)
- balance_cpu = group_balance_cpu(sg);
-
- /*
- * First idle CPU or the first CPU(busiest) in this sched group
- * is eligible for doing load balancing at this and above domains.
- */
- return balance_cpu == env->dst_cpu;
+ /* Are we the first CPU of this group ? */
+ return group_balance_cpu(sg) == env->dst_cpu;
}
/*
@@ -9819,9 +9817,8 @@ static int active_load_balance_cpu_stop(void *data)
/* Search for an sd spanning us and the target CPU. */
rcu_read_lock();
for_each_domain(target_cpu, sd) {
- if ((sd->flags & SD_LOAD_BALANCE) &&
- cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
- break;
+ if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
+ break;
}
if (likely(sd)) {
@@ -9910,9 +9907,6 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
}
max_cost += sd->max_newidle_lb_cost;
- if (!(sd->flags & SD_LOAD_BALANCE))
- continue;
-
/*
* Stop the load balance at this level. There is another
* CPU in our sched group which is doing load balancing more
@@ -10029,17 +10023,20 @@ static void kick_ilb(unsigned int flags)
if (ilb_cpu >= nr_cpu_ids)
return;
+ /*
+ * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets
+ * the first flag owns it; cleared by nohz_csd_func().
+ */
flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
if (flags & NOHZ_KICK_MASK)
return;
/*
- * Use smp_send_reschedule() instead of resched_cpu().
- * This way we generate a sched IPI on the target CPU which
+ * This way we generate an IPI on the target CPU which
* is idle. And the softirq performing nohz idle load balance
* will be run before returning from the IPI.
*/
- smp_send_reschedule(ilb_cpu);
+ smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
}
/*
@@ -10377,20 +10374,14 @@ abort:
*/
static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
{
- int this_cpu = this_rq->cpu;
- unsigned int flags;
+ unsigned int flags = this_rq->nohz_idle_balance;
- if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
+ if (!flags)
return false;
- if (idle != CPU_IDLE) {
- atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
- return false;
- }
+ this_rq->nohz_idle_balance = 0;
- /* could be _relaxed() */
- flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
- if (!(flags & NOHZ_KICK_MASK))
+ if (idle != CPU_IDLE)
return false;
_nohz_idle_balance(this_rq, flags, idle);
@@ -10450,7 +10441,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
* 0 - failed, no new tasks
* > 0 - success, new (fair) tasks present
*/
-int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
+static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
{
unsigned long next_balance = jiffies + HZ;
int this_cpu = this_rq->cpu;
@@ -10501,9 +10492,6 @@ int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
int continue_balancing = 1;
u64 t0, domain_cost;
- if (!(sd->flags & SD_LOAD_BALANCE))
- continue;
-
if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
update_next_balance(sd, &next_balance);
break;
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index b743bf38f08f..1ae95b9150d3 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -96,6 +96,15 @@ void __cpuidle default_idle_call(void)
}
}
+static int call_cpuidle_s2idle(struct cpuidle_driver *drv,
+ struct cpuidle_device *dev)
+{
+ if (current_clr_polling_and_test())
+ return -EBUSY;
+
+ return cpuidle_enter_s2idle(drv, dev);
+}
+
static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
int next_state)
{
@@ -171,11 +180,9 @@ static void cpuidle_idle_call(void)
if (idle_should_enter_s2idle()) {
rcu_idle_enter();
- entered_state = cpuidle_enter_s2idle(drv, dev);
- if (entered_state > 0) {
- local_irq_enable();
+ entered_state = call_cpuidle_s2idle(drv, dev);
+ if (entered_state > 0)
goto exit_idle;
- }
rcu_idle_exit();
@@ -289,7 +296,11 @@ static void do_idle(void)
*/
smp_mb__after_atomic();
- sched_ttwu_pending();
+ /*
+ * RCU relies on this call to be done outside of an RCU read-side
+ * critical section.
+ */
+ flush_smp_call_function_from_idle();
schedule_idle();
if (unlikely(klp_patch_pending(current)))
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index b647d04d9c8b..b4b1ff96642f 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -237,6 +237,30 @@ ___update_load_sum(u64 now, struct sched_avg *sa,
return 1;
}
+/*
+ * When syncing *_avg with *_sum, we must take into account the current
+ * position in the PELT segment otherwise the remaining part of the segment
+ * will be considered as idle time whereas it's not yet elapsed and this will
+ * generate unwanted oscillation in the range [1002..1024[.
+ *
+ * The max value of *_sum varies with the position in the time segment and is
+ * equals to :
+ *
+ * LOAD_AVG_MAX*y + sa->period_contrib
+ *
+ * which can be simplified into:
+ *
+ * LOAD_AVG_MAX - 1024 + sa->period_contrib
+ *
+ * because LOAD_AVG_MAX*y == LOAD_AVG_MAX-1024
+ *
+ * The same care must be taken when a sched entity is added, updated or
+ * removed from a cfs_rq and we need to update sched_avg. Scheduler entities
+ * and the cfs rq, to which they are attached, have the same position in the
+ * time segment because they use the same clock. This means that we can use
+ * the period_contrib of cfs_rq when updating the sched_avg of a sched_entity
+ * if it's more convenient.
+ */
static __always_inline void
___update_load_avg(struct sched_avg *sa, unsigned long load)
{
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index df11d88c9895..f395ddb75f38 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -9,6 +9,8 @@
int sched_rr_timeslice = RR_TIMESLICE;
int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
+/* More than 4 hours if BW_SHIFT equals 20. */
+static const u64 max_rt_runtime = MAX_BW;
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
@@ -2585,6 +2587,12 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
if (rt_period == 0)
return -EINVAL;
+ /*
+ * Bound quota to defend quota against overflow during bandwidth shift.
+ */
+ if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
+ return -EINVAL;
+
mutex_lock(&rt_constraints_mutex);
err = __rt_schedulable(tg, rt_period, rt_runtime);
if (err)
@@ -2702,7 +2710,9 @@ static int sched_rt_global_validate(void)
return -EINVAL;
if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
- (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
+ ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
+ ((u64)sysctl_sched_rt_runtime *
+ NSEC_PER_USEC > max_rt_runtime)))
return -EINVAL;
return 0;
@@ -2714,9 +2724,8 @@ static void sched_rt_do_global(void)
def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
}
-int sched_rt_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
int old_period, old_runtime;
static DEFINE_MUTEX(mutex);
@@ -2754,9 +2763,8 @@ undo:
return ret;
}
-int sched_rr_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
int ret;
static DEFINE_MUTEX(mutex);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index db3a57675ccf..877fb08eb1b0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -349,7 +349,6 @@ struct cfs_bandwidth {
u8 idle;
u8 period_active;
- u8 distribute_running;
u8 slack_started;
struct hrtimer period_timer;
struct hrtimer slack_timer;
@@ -890,12 +889,15 @@ struct rq {
#ifdef CONFIG_SMP
unsigned long last_blocked_load_update_tick;
unsigned int has_blocked_load;
+ call_single_data_t nohz_csd;
#endif /* CONFIG_SMP */
unsigned int nohz_tick_stopped;
- atomic_t nohz_flags;
+ atomic_t nohz_flags;
#endif /* CONFIG_NO_HZ_COMMON */
- unsigned long nr_load_updates;
+#ifdef CONFIG_SMP
+ unsigned int ttwu_pending;
+#endif
u64 nr_switches;
#ifdef CONFIG_UCLAMP_TASK
@@ -951,6 +953,7 @@ struct rq {
struct callback_head *balance_callback;
+ unsigned char nohz_idle_balance;
unsigned char idle_balance;
unsigned long misfit_task_load;
@@ -979,7 +982,7 @@ struct rq {
/* This is used to determine avg_idle's max value */
u64 max_idle_balance_cost;
-#endif
+#endif /* CONFIG_SMP */
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
@@ -1020,10 +1023,6 @@ struct rq {
unsigned int ttwu_local;
#endif
-#ifdef CONFIG_SMP
- struct llist_head wake_list;
-#endif
-
#ifdef CONFIG_CPU_IDLE
/* Must be inspected within a rcu lock section */
struct cpuidle_state *idle_state;
@@ -1367,8 +1366,6 @@ queue_balance_callback(struct rq *rq,
rq->balance_callback = head;
}
-extern void sched_ttwu_pending(void);
-
#define rcu_dereference_check_sched_domain(p) \
rcu_dereference_check((p), \
lockdep_is_held(&sched_domains_mutex))
@@ -1461,7 +1458,7 @@ struct sched_group {
* by attaching extra space to the end of the structure,
* depending on how many CPUs the kernel has booted up with)
*/
- unsigned long cpumask[0];
+ unsigned long cpumask[];
};
static inline struct cpumask *sched_group_span(struct sched_group *sg)
@@ -1504,15 +1501,11 @@ static inline void unregister_sched_domain_sysctl(void)
}
#endif
-extern int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
-
-#else
-
-static inline void sched_ttwu_pending(void) { }
+extern void flush_smp_call_function_from_idle(void);
-static inline int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { return 0; }
-
-#endif /* CONFIG_SMP */
+#else /* !CONFIG_SMP: */
+static inline void flush_smp_call_function_from_idle(void) { }
+#endif
#include "stats.h"
#include "autogroup.h"
@@ -1688,7 +1681,8 @@ static inline int task_on_rq_migrating(struct task_struct *p)
*/
#define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */
#define WF_FORK 0x02 /* Child wakeup after fork */
-#define WF_MIGRATED 0x4 /* Internal use, task got migrated */
+#define WF_MIGRATED 0x04 /* Internal use, task got migrated */
+#define WF_ON_CPU 0x08 /* Wakee is on_cpu */
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
@@ -1918,6 +1912,8 @@ extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
#define BW_SHIFT 20
#define BW_UNIT (1 << BW_SHIFT)
#define RATIO_SHIFT 8
+#define MAX_BW_BITS (64 - BW_SHIFT)
+#define MAX_BW ((1ULL << MAX_BW_BITS) - 1)
unsigned long to_ratio(u64 period, u64 runtime);
extern void init_entity_runnable_average(struct sched_entity *se);
diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h
new file mode 100644
index 000000000000..9620e323162c
--- /dev/null
+++ b/kernel/sched/smp.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Scheduler internal SMP callback types and methods between the scheduler
+ * and other internal parts of the core kernel:
+ */
+
+extern void sched_ttwu_pending(void *arg);
+
+extern void send_call_function_single_ipi(int cpu);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 8344757bba6e..ba81187bb7af 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -33,14 +33,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpumask_clear(groupmask);
printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
-
- if (!(sd->flags & SD_LOAD_BALANCE)) {
- printk("does not load-balance\n");
- if (sd->parent)
- printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
- return -1;
- }
-
printk(KERN_CONT "span=%*pbl level=%s\n",
cpumask_pr_args(sched_domain_span(sd)), sd->name);
@@ -151,8 +143,7 @@ static int sd_degenerate(struct sched_domain *sd)
return 1;
/* Following flags need at least 2 groups */
- if (sd->flags & (SD_LOAD_BALANCE |
- SD_BALANCE_NEWIDLE |
+ if (sd->flags & (SD_BALANCE_NEWIDLE |
SD_BALANCE_FORK |
SD_BALANCE_EXEC |
SD_SHARE_CPUCAPACITY |
@@ -183,15 +174,14 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
/* Flags needing groups don't count if only 1 group in parent */
if (parent->groups == parent->groups->next) {
- pflags &= ~(SD_LOAD_BALANCE |
- SD_BALANCE_NEWIDLE |
- SD_BALANCE_FORK |
- SD_BALANCE_EXEC |
- SD_ASYM_CPUCAPACITY |
- SD_SHARE_CPUCAPACITY |
- SD_SHARE_PKG_RESOURCES |
- SD_PREFER_SIBLING |
- SD_SHARE_POWERDOMAIN);
+ pflags &= ~(SD_BALANCE_NEWIDLE |
+ SD_BALANCE_FORK |
+ SD_BALANCE_EXEC |
+ SD_ASYM_CPUCAPACITY |
+ SD_SHARE_CPUCAPACITY |
+ SD_SHARE_PKG_RESOURCES |
+ SD_PREFER_SIBLING |
+ SD_SHARE_POWERDOMAIN);
if (nr_node_ids == 1)
pflags &= ~SD_SERIALIZE;
}
@@ -209,7 +199,7 @@ bool sched_energy_update;
#ifdef CONFIG_PROC_SYSCTL
int sched_energy_aware_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret, state;
@@ -1351,8 +1341,7 @@ sd_init(struct sched_domain_topology_level *tl,
.cache_nice_tries = 0,
- .flags = 1*SD_LOAD_BALANCE
- | 1*SD_BALANCE_NEWIDLE
+ .flags = 1*SD_BALANCE_NEWIDLE
| 1*SD_BALANCE_EXEC
| 1*SD_BALANCE_FORK
| 0*SD_BALANCE_WAKE
diff --git a/kernel/scs.c b/kernel/scs.c
new file mode 100644
index 000000000000..5d4d9bbdec36
--- /dev/null
+++ b/kernel/scs.c
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shadow Call Stack support.
+ *
+ * Copyright (C) 2019 Google LLC
+ */
+
+#include <linux/kasan.h>
+#include <linux/mm.h>
+#include <linux/scs.h>
+#include <linux/slab.h>
+#include <linux/vmstat.h>
+
+static struct kmem_cache *scs_cache;
+
+static void __scs_account(void *s, int account)
+{
+ struct page *scs_page = virt_to_page(s);
+
+ mod_zone_page_state(page_zone(scs_page), NR_KERNEL_SCS_KB,
+ account * (SCS_SIZE / SZ_1K));
+}
+
+static void *scs_alloc(int node)
+{
+ void *s = kmem_cache_alloc_node(scs_cache, GFP_SCS, node);
+
+ if (!s)
+ return NULL;
+
+ *__scs_magic(s) = SCS_END_MAGIC;
+
+ /*
+ * Poison the allocation to catch unintentional accesses to
+ * the shadow stack when KASAN is enabled.
+ */
+ kasan_poison_object_data(scs_cache, s);
+ __scs_account(s, 1);
+ return s;
+}
+
+static void scs_free(void *s)
+{
+ __scs_account(s, -1);
+ kasan_unpoison_object_data(scs_cache, s);
+ kmem_cache_free(scs_cache, s);
+}
+
+void __init scs_init(void)
+{
+ scs_cache = kmem_cache_create("scs_cache", SCS_SIZE, 0, 0, NULL);
+}
+
+int scs_prepare(struct task_struct *tsk, int node)
+{
+ void *s = scs_alloc(node);
+
+ if (!s)
+ return -ENOMEM;
+
+ task_scs(tsk) = task_scs_sp(tsk) = s;
+ return 0;
+}
+
+static void scs_check_usage(struct task_struct *tsk)
+{
+ static unsigned long highest;
+
+ unsigned long *p, prev, curr = highest, used = 0;
+
+ if (!IS_ENABLED(CONFIG_DEBUG_STACK_USAGE))
+ return;
+
+ for (p = task_scs(tsk); p < __scs_magic(tsk); ++p) {
+ if (!READ_ONCE_NOCHECK(*p))
+ break;
+ used += sizeof(*p);
+ }
+
+ while (used > curr) {
+ prev = cmpxchg_relaxed(&highest, curr, used);
+
+ if (prev == curr) {
+ pr_info("%s (%d): highest shadow stack usage: %lu bytes\n",
+ tsk->comm, task_pid_nr(tsk), used);
+ break;
+ }
+
+ curr = prev;
+ }
+}
+
+void scs_release(struct task_struct *tsk)
+{
+ void *s = task_scs(tsk);
+
+ if (!s)
+ return;
+
+ WARN(task_scs_end_corrupted(tsk),
+ "corrupted shadow stack detected when freeing task\n");
+ scs_check_usage(tsk);
+ scs_free(s);
+}
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 55a6184f5990..d653d8426de9 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -1776,7 +1776,7 @@ static void audit_actions_logged(u32 actions_logged, u32 old_actions_logged,
}
static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write,
- void __user *buffer, size_t *lenp,
+ void *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
diff --git a/kernel/signal.c b/kernel/signal.c
index 284fc1600063..ee22ec78fd6d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2529,9 +2529,6 @@ bool get_signal(struct ksignal *ksig)
struct signal_struct *signal = current->signal;
int signr;
- if (unlikely(current->task_works))
- task_work_run();
-
if (unlikely(uprobe_deny_signal()))
return false;
@@ -2544,6 +2541,13 @@ bool get_signal(struct ksignal *ksig)
relock:
spin_lock_irq(&sighand->siglock);
+ current->jobctl &= ~JOBCTL_TASK_WORK;
+ if (unlikely(current->task_works)) {
+ spin_unlock_irq(&sighand->siglock);
+ task_work_run();
+ goto relock;
+ }
+
/*
* Every stopped thread goes here after wakeup. Check to see if
* we should notify the parent, prepare_signal(SIGCONT) encodes
@@ -3235,94 +3239,94 @@ int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from)
}
#ifdef CONFIG_COMPAT
-int copy_siginfo_to_user32(struct compat_siginfo __user *to,
- const struct kernel_siginfo *from)
-#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION)
-{
- return __copy_siginfo_to_user32(to, from, in_x32_syscall());
-}
-int __copy_siginfo_to_user32(struct compat_siginfo __user *to,
- const struct kernel_siginfo *from, bool x32_ABI)
-#endif
+/**
+ * copy_siginfo_to_external32 - copy a kernel siginfo into a compat user siginfo
+ * @to: compat siginfo destination
+ * @from: kernel siginfo source
+ *
+ * Note: This function does not work properly for the SIGCHLD on x32, but
+ * fortunately it doesn't have to. The only valid callers for this function are
+ * copy_siginfo_to_user32, which is overriden for x32 and the coredump code.
+ * The latter does not care because SIGCHLD will never cause a coredump.
+ */
+void copy_siginfo_to_external32(struct compat_siginfo *to,
+ const struct kernel_siginfo *from)
{
- struct compat_siginfo new;
- memset(&new, 0, sizeof(new));
+ memset(to, 0, sizeof(*to));
- new.si_signo = from->si_signo;
- new.si_errno = from->si_errno;
- new.si_code = from->si_code;
+ to->si_signo = from->si_signo;
+ to->si_errno = from->si_errno;
+ to->si_code = from->si_code;
switch(siginfo_layout(from->si_signo, from->si_code)) {
case SIL_KILL:
- new.si_pid = from->si_pid;
- new.si_uid = from->si_uid;
+ to->si_pid = from->si_pid;
+ to->si_uid = from->si_uid;
break;
case SIL_TIMER:
- new.si_tid = from->si_tid;
- new.si_overrun = from->si_overrun;
- new.si_int = from->si_int;
+ to->si_tid = from->si_tid;
+ to->si_overrun = from->si_overrun;
+ to->si_int = from->si_int;
break;
case SIL_POLL:
- new.si_band = from->si_band;
- new.si_fd = from->si_fd;
+ to->si_band = from->si_band;
+ to->si_fd = from->si_fd;
break;
case SIL_FAULT:
- new.si_addr = ptr_to_compat(from->si_addr);
+ to->si_addr = ptr_to_compat(from->si_addr);
#ifdef __ARCH_SI_TRAPNO
- new.si_trapno = from->si_trapno;
+ to->si_trapno = from->si_trapno;
#endif
break;
case SIL_FAULT_MCEERR:
- new.si_addr = ptr_to_compat(from->si_addr);
+ to->si_addr = ptr_to_compat(from->si_addr);
#ifdef __ARCH_SI_TRAPNO
- new.si_trapno = from->si_trapno;
+ to->si_trapno = from->si_trapno;
#endif
- new.si_addr_lsb = from->si_addr_lsb;
+ to->si_addr_lsb = from->si_addr_lsb;
break;
case SIL_FAULT_BNDERR:
- new.si_addr = ptr_to_compat(from->si_addr);
+ to->si_addr = ptr_to_compat(from->si_addr);
#ifdef __ARCH_SI_TRAPNO
- new.si_trapno = from->si_trapno;
+ to->si_trapno = from->si_trapno;
#endif
- new.si_lower = ptr_to_compat(from->si_lower);
- new.si_upper = ptr_to_compat(from->si_upper);
+ to->si_lower = ptr_to_compat(from->si_lower);
+ to->si_upper = ptr_to_compat(from->si_upper);
break;
case SIL_FAULT_PKUERR:
- new.si_addr = ptr_to_compat(from->si_addr);
+ to->si_addr = ptr_to_compat(from->si_addr);
#ifdef __ARCH_SI_TRAPNO
- new.si_trapno = from->si_trapno;
+ to->si_trapno = from->si_trapno;
#endif
- new.si_pkey = from->si_pkey;
+ to->si_pkey = from->si_pkey;
break;
case SIL_CHLD:
- new.si_pid = from->si_pid;
- new.si_uid = from->si_uid;
- new.si_status = from->si_status;
-#ifdef CONFIG_X86_X32_ABI
- if (x32_ABI) {
- new._sifields._sigchld_x32._utime = from->si_utime;
- new._sifields._sigchld_x32._stime = from->si_stime;
- } else
-#endif
- {
- new.si_utime = from->si_utime;
- new.si_stime = from->si_stime;
- }
+ to->si_pid = from->si_pid;
+ to->si_uid = from->si_uid;
+ to->si_status = from->si_status;
+ to->si_utime = from->si_utime;
+ to->si_stime = from->si_stime;
break;
case SIL_RT:
- new.si_pid = from->si_pid;
- new.si_uid = from->si_uid;
- new.si_int = from->si_int;
+ to->si_pid = from->si_pid;
+ to->si_uid = from->si_uid;
+ to->si_int = from->si_int;
break;
case SIL_SYS:
- new.si_call_addr = ptr_to_compat(from->si_call_addr);
- new.si_syscall = from->si_syscall;
- new.si_arch = from->si_arch;
+ to->si_call_addr = ptr_to_compat(from->si_call_addr);
+ to->si_syscall = from->si_syscall;
+ to->si_arch = from->si_arch;
break;
}
+}
+int __copy_siginfo_to_user32(struct compat_siginfo __user *to,
+ const struct kernel_siginfo *from)
+{
+ struct compat_siginfo new;
+
+ copy_siginfo_to_external32(&new, from);
if (copy_to_user(to, &new, sizeof(struct compat_siginfo)))
return -EFAULT;
-
return 0;
}
diff --git a/kernel/smp.c b/kernel/smp.c
index 84303197caf9..aa17eedff5be 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -22,11 +22,9 @@
#include <linux/hypervisor.h>
#include "smpboot.h"
+#include "sched/smp.h"
-enum {
- CSD_FLAG_LOCK = 0x01,
- CSD_FLAG_SYNCHRONOUS = 0x02,
-};
+#define CSD_TYPE(_csd) ((_csd)->flags & CSD_FLAG_TYPE_MASK)
struct call_function_data {
call_single_data_t __percpu *csd;
@@ -84,6 +82,7 @@ int smpcfd_dying_cpu(unsigned int cpu)
* still pending.
*/
flush_smp_call_function_queue(false);
+ irq_work_run();
return 0;
}
@@ -134,15 +133,33 @@ static __always_inline void csd_unlock(call_single_data_t *csd)
static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);
+void __smp_call_single_queue(int cpu, struct llist_node *node)
+{
+ /*
+ * The list addition should be visible before sending the IPI
+ * handler locks the list to pull the entry off it because of
+ * normal cache coherency rules implied by spinlocks.
+ *
+ * If IPIs can go out of order to the cache coherency protocol
+ * in an architecture, sufficient synchronisation should be added
+ * to arch code to make it appear to obey cache coherency WRT
+ * locking and barrier primitives. Generic code isn't really
+ * equipped to do the right thing...
+ */
+ if (llist_add(node, &per_cpu(call_single_queue, cpu)))
+ send_call_function_single_ipi(cpu);
+}
+
/*
* Insert a previously allocated call_single_data_t element
* for execution on the given CPU. data must already have
* ->func, ->info, and ->flags set.
*/
-static int generic_exec_single(int cpu, call_single_data_t *csd,
- smp_call_func_t func, void *info)
+static int generic_exec_single(int cpu, call_single_data_t *csd)
{
if (cpu == smp_processor_id()) {
+ smp_call_func_t func = csd->func;
+ void *info = csd->info;
unsigned long flags;
/*
@@ -156,28 +173,12 @@ static int generic_exec_single(int cpu, call_single_data_t *csd,
return 0;
}
-
if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) {
csd_unlock(csd);
return -ENXIO;
}
- csd->func = func;
- csd->info = info;
-
- /*
- * The list addition should be visible before sending the IPI
- * handler locks the list to pull the entry off it because of
- * normal cache coherency rules implied by spinlocks.
- *
- * If IPIs can go out of order to the cache coherency protocol
- * in an architecture, sufficient synchronisation should be added
- * to arch code to make it appear to obey cache coherency WRT
- * locking and barrier primitives. Generic code isn't really
- * equipped to do the right thing...
- */
- if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
- arch_send_call_function_single_ipi(cpu);
+ __smp_call_single_queue(cpu, &csd->llist);
return 0;
}
@@ -209,9 +210,9 @@ void generic_smp_call_function_single_interrupt(void)
*/
static void flush_smp_call_function_queue(bool warn_cpu_offline)
{
- struct llist_head *head;
- struct llist_node *entry;
call_single_data_t *csd, *csd_next;
+ struct llist_node *entry, *prev;
+ struct llist_head *head;
static bool warned;
lockdep_assert_irqs_disabled();
@@ -230,32 +231,99 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
* We don't have to use the _safe() variant here
* because we are not invoking the IPI handlers yet.
*/
- llist_for_each_entry(csd, entry, llist)
- pr_warn("IPI callback %pS sent to offline CPU\n",
- csd->func);
+ llist_for_each_entry(csd, entry, llist) {
+ switch (CSD_TYPE(csd)) {
+ case CSD_TYPE_ASYNC:
+ case CSD_TYPE_SYNC:
+ case CSD_TYPE_IRQ_WORK:
+ pr_warn("IPI callback %pS sent to offline CPU\n",
+ csd->func);
+ break;
+
+ case CSD_TYPE_TTWU:
+ pr_warn("IPI task-wakeup sent to offline CPU\n");
+ break;
+
+ default:
+ pr_warn("IPI callback, unknown type %d, sent to offline CPU\n",
+ CSD_TYPE(csd));
+ break;
+ }
+ }
}
+ /*
+ * First; run all SYNC callbacks, people are waiting for us.
+ */
+ prev = NULL;
llist_for_each_entry_safe(csd, csd_next, entry, llist) {
- smp_call_func_t func = csd->func;
- void *info = csd->info;
-
/* Do we wait until *after* callback? */
- if (csd->flags & CSD_FLAG_SYNCHRONOUS) {
+ if (CSD_TYPE(csd) == CSD_TYPE_SYNC) {
+ smp_call_func_t func = csd->func;
+ void *info = csd->info;
+
+ if (prev) {
+ prev->next = &csd_next->llist;
+ } else {
+ entry = &csd_next->llist;
+ }
+
func(info);
csd_unlock(csd);
} else {
- csd_unlock(csd);
- func(info);
+ prev = &csd->llist;
}
}
+ if (!entry)
+ return;
+
/*
- * Handle irq works queued remotely by irq_work_queue_on().
- * Smp functions above are typically synchronous so they
- * better run first since some other CPUs may be busy waiting
- * for them.
+ * Second; run all !SYNC callbacks.
*/
- irq_work_run();
+ prev = NULL;
+ llist_for_each_entry_safe(csd, csd_next, entry, llist) {
+ int type = CSD_TYPE(csd);
+
+ if (type != CSD_TYPE_TTWU) {
+ if (prev) {
+ prev->next = &csd_next->llist;
+ } else {
+ entry = &csd_next->llist;
+ }
+
+ if (type == CSD_TYPE_ASYNC) {
+ smp_call_func_t func = csd->func;
+ void *info = csd->info;
+
+ csd_unlock(csd);
+ func(info);
+ } else if (type == CSD_TYPE_IRQ_WORK) {
+ irq_work_single(csd);
+ }
+
+ } else {
+ prev = &csd->llist;
+ }
+ }
+
+ /*
+ * Third; only CSD_TYPE_TTWU is left, issue those.
+ */
+ if (entry)
+ sched_ttwu_pending(entry);
+}
+
+void flush_smp_call_function_from_idle(void)
+{
+ unsigned long flags;
+
+ if (llist_empty(this_cpu_ptr(&call_single_queue)))
+ return;
+
+ local_irq_save(flags);
+ flush_smp_call_function_queue(true);
+ local_irq_restore(flags);
}
/*
@@ -271,7 +339,7 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
{
call_single_data_t *csd;
call_single_data_t csd_stack = {
- .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS,
+ .flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC,
};
int this_cpu;
int err;
@@ -305,7 +373,10 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
csd_lock(csd);
}
- err = generic_exec_single(cpu, csd, func, info);
+ csd->func = func;
+ csd->info = info;
+
+ err = generic_exec_single(cpu, csd);
if (wait)
csd_lock_wait(csd);
@@ -351,7 +422,7 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd)
csd->flags = CSD_FLAG_LOCK;
smp_wmb();
- err = generic_exec_single(cpu, csd, csd->func, csd->info);
+ err = generic_exec_single(cpu, csd);
out:
preempt_enable();
@@ -466,7 +537,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
csd_lock(csd);
if (wait)
- csd->flags |= CSD_FLAG_SYNCHRONOUS;
+ csd->flags |= CSD_TYPE_SYNC;
csd->func = func;
csd->info = info;
if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
diff --git a/kernel/softirq.c b/kernel/softirq.c
index a47c6dd57452..c4201b7f42b1 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -339,12 +339,11 @@ asmlinkage __visible void do_softirq(void)
local_irq_restore(flags);
}
-/*
- * Enter an interrupt context.
+/**
+ * irq_enter_rcu - Enter an interrupt context with RCU watching
*/
-void irq_enter(void)
+void irq_enter_rcu(void)
{
- rcu_irq_enter();
if (is_idle_task(current) && !in_interrupt()) {
/*
* Prevent raise_softirq from needlessly waking up ksoftirqd
@@ -354,10 +353,18 @@ void irq_enter(void)
tick_irq_enter();
_local_bh_enable();
}
-
__irq_enter();
}
+/**
+ * irq_enter - Enter an interrupt context including RCU update
+ */
+void irq_enter(void)
+{
+ rcu_irq_enter();
+ irq_enter_rcu();
+}
+
static inline void invoke_softirq(void)
{
if (ksoftirqd_running(local_softirq_pending()))
@@ -397,10 +404,7 @@ static inline void tick_irq_exit(void)
#endif
}
-/*
- * Exit an interrupt context. Process softirqs if needed and possible:
- */
-void irq_exit(void)
+static inline void __irq_exit_rcu(void)
{
#ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED
local_irq_disable();
@@ -413,6 +417,28 @@ void irq_exit(void)
invoke_softirq();
tick_irq_exit();
+}
+
+/**
+ * irq_exit_rcu() - Exit an interrupt context without updating RCU
+ *
+ * Also processes softirqs if needed and possible.
+ */
+void irq_exit_rcu(void)
+{
+ __irq_exit_rcu();
+ /* must be last! */
+ lockdep_hardirq_exit();
+}
+
+/**
+ * irq_exit - Exit an interrupt context, update RCU and lockdep
+ *
+ * Also processes softirqs if needed and possible.
+ */
+void irq_exit(void)
+{
+ __irq_exit_rcu();
rcu_irq_exit();
/* must be last! */
lockdep_hardirq_exit();
diff --git a/kernel/sys.c b/kernel/sys.c
index d325f3ab624a..00a96746e28a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -393,6 +393,10 @@ long __sys_setregid(gid_t rgid, gid_t egid)
new->sgid = new->egid;
new->fsgid = new->egid;
+ retval = security_task_fix_setgid(new, old, LSM_SETID_RE);
+ if (retval < 0)
+ goto error;
+
return commit_creds(new);
error:
@@ -435,6 +439,10 @@ long __sys_setgid(gid_t gid)
else
goto error;
+ retval = security_task_fix_setgid(new, old, LSM_SETID_ID);
+ if (retval < 0)
+ goto error;
+
return commit_creds(new);
error:
@@ -756,6 +764,10 @@ long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
new->sgid = ksgid;
new->fsgid = new->egid;
+ retval = security_task_fix_setgid(new, old, LSM_SETID_RES);
+ if (retval < 0)
+ goto error;
+
return commit_creds(new);
error:
@@ -862,7 +874,8 @@ long __sys_setfsgid(gid_t gid)
ns_capable(old->user_ns, CAP_SETGID)) {
if (!gid_eq(kgid, old->fsgid)) {
new->fsgid = kgid;
- goto change_okay;
+ if (security_task_fix_setgid(new,old,LSM_SETID_FS) == 0)
+ goto change_okay;
}
}
@@ -1846,7 +1859,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
if (exe_file) {
struct vm_area_struct *vma;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (!vma->vm_file)
continue;
@@ -1855,7 +1868,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
goto exit_err;
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
fput(exe_file);
}
@@ -1869,7 +1882,7 @@ exit:
fdput(exe);
return err;
exit_err:
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
fput(exe_file);
goto exit;
}
@@ -2007,10 +2020,10 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
}
/*
- * arg_lock protects concurent updates but we still need mmap_sem for
+ * arg_lock protects concurent updates but we still need mmap_lock for
* read to exclude races with sys_brk.
*/
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
/*
* We don't validate if these members are pointing to
@@ -2049,7 +2062,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
if (prctl_map.auxv_size)
memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv));
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return 0;
}
#endif /* CONFIG_CHECKPOINT_RESTORE */
@@ -2122,10 +2135,10 @@ static int prctl_set_mm(int opt, unsigned long addr,
/*
* arg_lock protects concurent updates of arg boundaries, we need
- * mmap_sem for a) concurrent sys_brk, b) finding VMA for addr
+ * mmap_lock for a) concurrent sys_brk, b) finding VMA for addr
* validation.
*/
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
vma = find_vma(mm, addr);
spin_lock(&mm->arg_lock);
@@ -2217,7 +2230,7 @@ static int prctl_set_mm(int opt, unsigned long addr,
error = 0;
out:
spin_unlock(&mm->arg_lock);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return error;
}
@@ -2262,7 +2275,7 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which,
return -EINVAL;
}
-#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LESS_THROTTLE)
+#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE)
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
unsigned long, arg4, unsigned long, arg5)
@@ -2442,13 +2455,13 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_SET_THP_DISABLE:
if (arg3 || arg4 || arg5)
return -EINVAL;
- if (down_write_killable(&me->mm->mmap_sem))
+ if (mmap_write_lock_killable(me->mm))
return -EINTR;
if (arg2)
set_bit(MMF_DISABLE_THP, &me->mm->flags);
else
clear_bit(MMF_DISABLE_THP, &me->mm->flags);
- up_write(&me->mm->mmap_sem);
+ mmap_write_unlock(me->mm);
break;
case PR_MPX_ENABLE_MANAGEMENT:
case PR_MPX_DISABLE_MANAGEMENT:
@@ -2634,6 +2647,7 @@ struct compat_sysinfo {
COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
{
struct sysinfo s;
+ struct compat_sysinfo s_32;
do_sysinfo(&s);
@@ -2658,23 +2672,23 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
s.freehigh >>= bitcount;
}
- if (!access_ok(info, sizeof(struct compat_sysinfo)) ||
- __put_user(s.uptime, &info->uptime) ||
- __put_user(s.loads[0], &info->loads[0]) ||
- __put_user(s.loads[1], &info->loads[1]) ||
- __put_user(s.loads[2], &info->loads[2]) ||
- __put_user(s.totalram, &info->totalram) ||
- __put_user(s.freeram, &info->freeram) ||
- __put_user(s.sharedram, &info->sharedram) ||
- __put_user(s.bufferram, &info->bufferram) ||
- __put_user(s.totalswap, &info->totalswap) ||
- __put_user(s.freeswap, &info->freeswap) ||
- __put_user(s.procs, &info->procs) ||
- __put_user(s.totalhigh, &info->totalhigh) ||
- __put_user(s.freehigh, &info->freehigh) ||
- __put_user(s.mem_unit, &info->mem_unit))
+ memset(&s_32, 0, sizeof(s_32));
+ s_32.uptime = s.uptime;
+ s_32.loads[0] = s.loads[0];
+ s_32.loads[1] = s.loads[1];
+ s_32.loads[2] = s.loads[2];
+ s_32.totalram = s.totalram;
+ s_32.freeram = s.freeram;
+ s_32.sharedram = s.sharedram;
+ s_32.bufferram = s.bufferram;
+ s_32.totalswap = s.totalswap;
+ s_32.freeswap = s.freeswap;
+ s_32.procs = s.procs;
+ s_32.totalhigh = s.totalhigh;
+ s_32.freehigh = s.freehigh;
+ s_32.mem_unit = s.mem_unit;
+ if (copy_to_user(info, &s_32, sizeof(s_32)))
return -EFAULT;
-
return 0;
}
#endif /* CONFIG_COMPAT */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8a176d8727a3..db1ce7af2563 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -68,6 +68,9 @@
#include <linux/bpf.h>
#include <linux/mount.h>
#include <linux/userfaultfd_k.h>
+#include <linux/coredump.h>
+#include <linux/latencytop.h>
+#include <linux/pid.h>
#include "../lib/kstrtox.h"
@@ -103,22 +106,6 @@
#if defined(CONFIG_SYSCTL)
-/* External variables not in a header file. */
-extern int suid_dumpable;
-#ifdef CONFIG_COREDUMP
-extern int core_uses_pid;
-extern char core_pattern[];
-extern unsigned int core_pipe_limit;
-#endif
-extern int pid_max;
-extern int pid_max_min, pid_max_max;
-extern int percpu_pagelist_fraction;
-extern int latencytop_enabled;
-extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max;
-#ifndef CONFIG_MMU
-extern int sysctl_nr_trim_pages;
-#endif
-
/* Constants used for minimum and maximum */
#ifdef CONFIG_LOCKUP_DETECTOR
static int sixty = 60;
@@ -131,6 +118,7 @@ static unsigned long zero_ul;
static unsigned long one_ul = 1;
static unsigned long long_max = LONG_MAX;
static int one_hundred = 100;
+static int two_hundred = 200;
static int one_thousand = 1000;
#ifdef CONFIG_PRINTK
static int ten_thousand = 10000;
@@ -160,24 +148,6 @@ static unsigned long hung_task_timeout_max = (LONG_MAX/HZ);
#ifdef CONFIG_INOTIFY_USER
#include <linux/inotify.h>
#endif
-#ifdef CONFIG_SPARC
-#endif
-
-#ifdef CONFIG_PARISC
-extern int pwrsw_enabled;
-#endif
-
-#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW
-extern int unaligned_enabled;
-#endif
-
-#ifdef CONFIG_IA64
-extern int unaligned_dump_stack;
-#endif
-
-#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN
-extern int no_unaligned_warning;
-#endif
#ifdef CONFIG_PROC_SYSCTL
@@ -207,102 +177,1480 @@ enum sysctl_writes_mode {
};
static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT;
+#endif /* CONFIG_PROC_SYSCTL */
+
+#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
+ defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
+int sysctl_legacy_va_layout;
+#endif
+
+#ifdef CONFIG_SCHED_DEBUG
+static int min_sched_granularity_ns = 100000; /* 100 usecs */
+static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
+static int min_wakeup_granularity_ns; /* 0 usecs */
+static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
+#ifdef CONFIG_SMP
+static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
+static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_SCHED_DEBUG */
-static int proc_do_cad_pid(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
-static int proc_taint(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
#ifdef CONFIG_COMPACTION
-static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table,
- int write, void __user *buffer,
- size_t *lenp, loff_t *ppos);
+static int min_extfrag_threshold;
+static int max_extfrag_threshold = 1000;
#endif
+
+#endif /* CONFIG_SYSCTL */
+
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL)
+static int bpf_stats_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ struct static_key *key = (struct static_key *)table->data;
+ static int saved_val;
+ int val, ret;
+ struct ctl_table tmp = {
+ .data = &val,
+ .maxlen = sizeof(val),
+ .mode = table->mode,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ };
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ mutex_lock(&bpf_stats_enabled_mutex);
+ val = saved_val;
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ if (write && !ret && val != saved_val) {
+ if (val)
+ static_key_slow_inc(key);
+ else
+ static_key_slow_dec(key);
+ saved_val = val;
+ }
+ mutex_unlock(&bpf_stats_enabled_mutex);
+ return ret;
+}
#endif
+/*
+ * /proc/sys support
+ */
+
+#ifdef CONFIG_PROC_SYSCTL
+
+static int _proc_do_string(char *data, int maxlen, int write,
+ char *buffer, size_t *lenp, loff_t *ppos)
+{
+ size_t len;
+ char c, *p;
+
+ if (!data || !maxlen || !*lenp) {
+ *lenp = 0;
+ return 0;
+ }
+
+ if (write) {
+ if (sysctl_writes_strict == SYSCTL_WRITES_STRICT) {
+ /* Only continue writes not past the end of buffer. */
+ len = strlen(data);
+ if (len > maxlen - 1)
+ len = maxlen - 1;
+
+ if (*ppos > len)
+ return 0;
+ len = *ppos;
+ } else {
+ /* Start writing from beginning of buffer. */
+ len = 0;
+ }
+
+ *ppos += *lenp;
+ p = buffer;
+ while ((p - buffer) < *lenp && len < maxlen - 1) {
+ c = *(p++);
+ if (c == 0 || c == '\n')
+ break;
+ data[len++] = c;
+ }
+ data[len] = 0;
+ } else {
+ len = strlen(data);
+ if (len > maxlen)
+ len = maxlen;
+
+ if (*ppos > len) {
+ *lenp = 0;
+ return 0;
+ }
+
+ data += *ppos;
+ len -= *ppos;
+
+ if (len > *lenp)
+ len = *lenp;
+ if (len)
+ memcpy(buffer, data, len);
+ if (len < *lenp) {
+ buffer[len] = '\n';
+ len++;
+ }
+ *lenp = len;
+ *ppos += len;
+ }
+ return 0;
+}
+
+static void warn_sysctl_write(struct ctl_table *table)
+{
+ pr_warn_once("%s wrote to %s when file position was not 0!\n"
+ "This will not be supported in the future. To silence this\n"
+ "warning, set kernel.sysctl_writes_strict = -1\n",
+ current->comm, table->procname);
+}
+
+/**
+ * proc_first_pos_non_zero_ignore - check if first position is allowed
+ * @ppos: file position
+ * @table: the sysctl table
+ *
+ * Returns true if the first position is non-zero and the sysctl_writes_strict
+ * mode indicates this is not allowed for numeric input types. String proc
+ * handlers can ignore the return value.
+ */
+static bool proc_first_pos_non_zero_ignore(loff_t *ppos,
+ struct ctl_table *table)
+{
+ if (!*ppos)
+ return false;
+
+ switch (sysctl_writes_strict) {
+ case SYSCTL_WRITES_STRICT:
+ return true;
+ case SYSCTL_WRITES_WARN:
+ warn_sysctl_write(table);
+ return false;
+ default:
+ return false;
+ }
+}
+
+/**
+ * proc_dostring - read a string sysctl
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes a string from/to the user buffer. If the kernel
+ * buffer provided is not large enough to hold the string, the
+ * string is truncated. The copied string is %NULL-terminated.
+ * If the string is being read by the user process, it is copied
+ * and a newline '\n' is added. It is truncated if the buffer is
+ * not large enough.
+ *
+ * Returns 0 on success.
+ */
+int proc_dostring(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ if (write)
+ proc_first_pos_non_zero_ignore(ppos, table);
+
+ return _proc_do_string(table->data, table->maxlen, write, buffer, lenp,
+ ppos);
+}
+
+static size_t proc_skip_spaces(char **buf)
+{
+ size_t ret;
+ char *tmp = skip_spaces(*buf);
+ ret = tmp - *buf;
+ *buf = tmp;
+ return ret;
+}
+
+static void proc_skip_char(char **buf, size_t *size, const char v)
+{
+ while (*size) {
+ if (**buf != v)
+ break;
+ (*size)--;
+ (*buf)++;
+ }
+}
+
+/**
+ * strtoul_lenient - parse an ASCII formatted integer from a buffer and only
+ * fail on overflow
+ *
+ * @cp: kernel buffer containing the string to parse
+ * @endp: pointer to store the trailing characters
+ * @base: the base to use
+ * @res: where the parsed integer will be stored
+ *
+ * In case of success 0 is returned and @res will contain the parsed integer,
+ * @endp will hold any trailing characters.
+ * This function will fail the parse on overflow. If there wasn't an overflow
+ * the function will defer the decision what characters count as invalid to the
+ * caller.
+ */
+static int strtoul_lenient(const char *cp, char **endp, unsigned int base,
+ unsigned long *res)
+{
+ unsigned long long result;
+ unsigned int rv;
+
+ cp = _parse_integer_fixup_radix(cp, &base);
+ rv = _parse_integer(cp, base, &result);
+ if ((rv & KSTRTOX_OVERFLOW) || (result != (unsigned long)result))
+ return -ERANGE;
+
+ cp += rv;
+
+ if (endp)
+ *endp = (char *)cp;
+
+ *res = (unsigned long)result;
+ return 0;
+}
+
+#define TMPBUFLEN 22
+/**
+ * proc_get_long - reads an ASCII formatted integer from a user buffer
+ *
+ * @buf: a kernel buffer
+ * @size: size of the kernel buffer
+ * @val: this is where the number will be stored
+ * @neg: set to %TRUE if number is negative
+ * @perm_tr: a vector which contains the allowed trailers
+ * @perm_tr_len: size of the perm_tr vector
+ * @tr: pointer to store the trailer character
+ *
+ * In case of success %0 is returned and @buf and @size are updated with
+ * the amount of bytes read. If @tr is non-NULL and a trailing
+ * character exists (size is non-zero after returning from this
+ * function), @tr is updated with the trailing character.
+ */
+static int proc_get_long(char **buf, size_t *size,
+ unsigned long *val, bool *neg,
+ const char *perm_tr, unsigned perm_tr_len, char *tr)
+{
+ int len;
+ char *p, tmp[TMPBUFLEN];
+
+ if (!*size)
+ return -EINVAL;
+
+ len = *size;
+ if (len > TMPBUFLEN - 1)
+ len = TMPBUFLEN - 1;
+
+ memcpy(tmp, *buf, len);
+
+ tmp[len] = 0;
+ p = tmp;
+ if (*p == '-' && *size > 1) {
+ *neg = true;
+ p++;
+ } else
+ *neg = false;
+ if (!isdigit(*p))
+ return -EINVAL;
+
+ if (strtoul_lenient(p, &p, 0, val))
+ return -EINVAL;
+
+ len = p - tmp;
+
+ /* We don't know if the next char is whitespace thus we may accept
+ * invalid integers (e.g. 1234...a) or two integers instead of one
+ * (e.g. 123...1). So lets not allow such large numbers. */
+ if (len == TMPBUFLEN - 1)
+ return -EINVAL;
+
+ if (len < *size && perm_tr_len && !memchr(perm_tr, *p, perm_tr_len))
+ return -EINVAL;
+
+ if (tr && (len < *size))
+ *tr = *p;
+
+ *buf += len;
+ *size -= len;
+
+ return 0;
+}
+
+/**
+ * proc_put_long - converts an integer to a decimal ASCII formatted string
+ *
+ * @buf: the user buffer
+ * @size: the size of the user buffer
+ * @val: the integer to be converted
+ * @neg: sign of the number, %TRUE for negative
+ *
+ * In case of success @buf and @size are updated with the amount of bytes
+ * written.
+ */
+static void proc_put_long(void **buf, size_t *size, unsigned long val, bool neg)
+{
+ int len;
+ char tmp[TMPBUFLEN], *p = tmp;
+
+ sprintf(p, "%s%lu", neg ? "-" : "", val);
+ len = strlen(tmp);
+ if (len > *size)
+ len = *size;
+ memcpy(*buf, tmp, len);
+ *size -= len;
+ *buf += len;
+}
+#undef TMPBUFLEN
+
+static void proc_put_char(void **buf, size_t *size, char c)
+{
+ if (*size) {
+ char **buffer = (char **)buf;
+ **buffer = c;
+
+ (*size)--;
+ (*buffer)++;
+ *buf = *buffer;
+ }
+}
+
+static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+{
+ if (write) {
+ if (*negp) {
+ if (*lvalp > (unsigned long) INT_MAX + 1)
+ return -EINVAL;
+ *valp = -*lvalp;
+ } else {
+ if (*lvalp > (unsigned long) INT_MAX)
+ return -EINVAL;
+ *valp = *lvalp;
+ }
+ } else {
+ int val = *valp;
+ if (val < 0) {
+ *negp = true;
+ *lvalp = -(unsigned long)val;
+ } else {
+ *negp = false;
+ *lvalp = (unsigned long)val;
+ }
+ }
+ return 0;
+}
+
+static int do_proc_douintvec_conv(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data)
+{
+ if (write) {
+ if (*lvalp > UINT_MAX)
+ return -EINVAL;
+ *valp = *lvalp;
+ } else {
+ unsigned int val = *valp;
+ *lvalp = (unsigned long)val;
+ }
+ return 0;
+}
+
+static const char proc_wspace_sep[] = { ' ', '\t', '\n' };
+
+static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
+ int write, void *buffer,
+ size_t *lenp, loff_t *ppos,
+ int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
+ int write, void *data),
+ void *data)
+{
+ int *i, vleft, first = 1, err = 0;
+ size_t left;
+ char *p;
+
+ if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
+ *lenp = 0;
+ return 0;
+ }
+
+ i = (int *) tbl_data;
+ vleft = table->maxlen / sizeof(*i);
+ left = *lenp;
+
+ if (!conv)
+ conv = do_proc_dointvec_conv;
+
+ if (write) {
+ if (proc_first_pos_non_zero_ignore(ppos, table))
+ goto out;
+
+ if (left > PAGE_SIZE - 1)
+ left = PAGE_SIZE - 1;
+ p = buffer;
+ }
+
+ for (; left && vleft--; i++, first=0) {
+ unsigned long lval;
+ bool neg;
+
+ if (write) {
+ left -= proc_skip_spaces(&p);
+
+ if (!left)
+ break;
+ err = proc_get_long(&p, &left, &lval, &neg,
+ proc_wspace_sep,
+ sizeof(proc_wspace_sep), NULL);
+ if (err)
+ break;
+ if (conv(&neg, &lval, i, 1, data)) {
+ err = -EINVAL;
+ break;
+ }
+ } else {
+ if (conv(&neg, &lval, i, 0, data)) {
+ err = -EINVAL;
+ break;
+ }
+ if (!first)
+ proc_put_char(&buffer, &left, '\t');
+ proc_put_long(&buffer, &left, lval, neg);
+ }
+ }
+
+ if (!write && !first && left && !err)
+ proc_put_char(&buffer, &left, '\n');
+ if (write && !err && left)
+ left -= proc_skip_spaces(&p);
+ if (write && first)
+ return err ? : -EINVAL;
+ *lenp -= left;
+out:
+ *ppos += *lenp;
+ return err;
+}
+
+static int do_proc_dointvec(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos,
+ int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
+ int write, void *data),
+ void *data)
+{
+ return __do_proc_dointvec(table->data, table, write,
+ buffer, lenp, ppos, conv, data);
+}
+
+static int do_proc_douintvec_w(unsigned int *tbl_data,
+ struct ctl_table *table,
+ void *buffer,
+ size_t *lenp, loff_t *ppos,
+ int (*conv)(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data),
+ void *data)
+{
+ unsigned long lval;
+ int err = 0;
+ size_t left;
+ bool neg;
+ char *p = buffer;
+
+ left = *lenp;
+
+ if (proc_first_pos_non_zero_ignore(ppos, table))
+ goto bail_early;
+
+ if (left > PAGE_SIZE - 1)
+ left = PAGE_SIZE - 1;
+
+ left -= proc_skip_spaces(&p);
+ if (!left) {
+ err = -EINVAL;
+ goto out_free;
+ }
+
+ err = proc_get_long(&p, &left, &lval, &neg,
+ proc_wspace_sep,
+ sizeof(proc_wspace_sep), NULL);
+ if (err || neg) {
+ err = -EINVAL;
+ goto out_free;
+ }
+
+ if (conv(&lval, tbl_data, 1, data)) {
+ err = -EINVAL;
+ goto out_free;
+ }
+
+ if (!err && left)
+ left -= proc_skip_spaces(&p);
+
+out_free:
+ if (err)
+ return -EINVAL;
+
+ return 0;
+
+ /* This is in keeping with old __do_proc_dointvec() */
+bail_early:
+ *ppos += *lenp;
+ return err;
+}
+
+static int do_proc_douintvec_r(unsigned int *tbl_data, void *buffer,
+ size_t *lenp, loff_t *ppos,
+ int (*conv)(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data),
+ void *data)
+{
+ unsigned long lval;
+ int err = 0;
+ size_t left;
+
+ left = *lenp;
+
+ if (conv(&lval, tbl_data, 0, data)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ proc_put_long(&buffer, &left, lval, false);
+ if (!left)
+ goto out;
+
+ proc_put_char(&buffer, &left, '\n');
+
+out:
+ *lenp -= left;
+ *ppos += *lenp;
+
+ return err;
+}
+
+static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table,
+ int write, void *buffer,
+ size_t *lenp, loff_t *ppos,
+ int (*conv)(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data),
+ void *data)
+{
+ unsigned int *i, vleft;
+
+ if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
+ *lenp = 0;
+ return 0;
+ }
+
+ i = (unsigned int *) tbl_data;
+ vleft = table->maxlen / sizeof(*i);
+
+ /*
+ * Arrays are not supported, keep this simple. *Do not* add
+ * support for them.
+ */
+ if (vleft != 1) {
+ *lenp = 0;
+ return -EINVAL;
+ }
+
+ if (!conv)
+ conv = do_proc_douintvec_conv;
+
+ if (write)
+ return do_proc_douintvec_w(i, table, buffer, lenp, ppos,
+ conv, data);
+ return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data);
+}
+
+static int do_proc_douintvec(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos,
+ int (*conv)(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data),
+ void *data)
+{
+ return __do_proc_douintvec(table->data, table, write,
+ buffer, lenp, ppos, conv, data);
+}
+
+/**
+ * proc_dointvec - read a vector of integers
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL);
+}
+
+#ifdef CONFIG_COMPACTION
+static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table,
+ int write, void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret, old;
+
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT) || !write)
+ return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+ old = *(int *)table->data;
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (ret)
+ return ret;
+ if (old != *(int *)table->data)
+ pr_warn_once("sysctl attribute %s changed by %s[%d]\n",
+ table->procname, current->comm,
+ task_pid_nr(current));
+ return ret;
+}
+#endif
+
+/**
+ * proc_douintvec - read a vector of unsigned integers
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer
+ * values from/to the user buffer, treated as an ASCII string.
+ *
+ * Returns 0 on success.
+ */
+int proc_douintvec(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ return do_proc_douintvec(table, write, buffer, lenp, ppos,
+ do_proc_douintvec_conv, NULL);
+}
+
+/*
+ * Taint values can only be increased
+ * This means we can safely use a temporary.
+ */
+static int proc_taint(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ unsigned long tmptaint = get_taint();
+ int err;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ t = *table;
+ t.data = &tmptaint;
+ err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+
+ if (write) {
+ int i;
+
+ /*
+ * If we are relying on panic_on_taint not producing
+ * false positives due to userspace input, bail out
+ * before setting the requested taint flags.
+ */
+ if (panic_on_taint_nousertaint && (tmptaint & panic_on_taint))
+ return -EINVAL;
+
+ /*
+ * Poor man's atomic or. Not worth adding a primitive
+ * to everyone's atomic.h for this
+ */
+ for (i = 0; i < TAINT_FLAGS_COUNT; i++)
+ if ((1UL << i) & tmptaint)
+ add_taint(i, LOCKDEP_STILL_OK);
+ }
+
+ return err;
+}
+
#ifdef CONFIG_PRINTK
static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+}
+#endif
+
+/**
+ * struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure
+ * @min: pointer to minimum allowable value
+ * @max: pointer to maximum allowable value
+ *
+ * The do_proc_dointvec_minmax_conv_param structure provides the
+ * minimum and maximum values for doing range checking for those sysctl
+ * parameters that use the proc_dointvec_minmax() handler.
+ */
+struct do_proc_dointvec_minmax_conv_param {
+ int *min;
+ int *max;
+};
+
+static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+{
+ int tmp, ret;
+ struct do_proc_dointvec_minmax_conv_param *param = data;
+ /*
+ * If writing, first do so via a temporary local int so we can
+ * bounds-check it before touching *valp.
+ */
+ int *ip = write ? &tmp : valp;
+
+ ret = do_proc_dointvec_conv(negp, lvalp, ip, write, data);
+ if (ret)
+ return ret;
+
+ if (write) {
+ if ((param->min && *param->min > tmp) ||
+ (param->max && *param->max < tmp))
+ return -EINVAL;
+ *valp = tmp;
+ }
+
+ return 0;
+}
+
+/**
+ * proc_dointvec_minmax - read a vector of integers with min/max values
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ *
+ * This routine will ensure the values are within the range specified by
+ * table->extra1 (min) and table->extra2 (max).
+ *
+ * Returns 0 on success or -EINVAL on write when the range check fails.
+ */
+int proc_dointvec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct do_proc_dointvec_minmax_conv_param param = {
+ .min = (int *) table->extra1,
+ .max = (int *) table->extra2,
+ };
+ return do_proc_dointvec(table, write, buffer, lenp, ppos,
+ do_proc_dointvec_minmax_conv, &param);
+}
+
+/**
+ * struct do_proc_douintvec_minmax_conv_param - proc_douintvec_minmax() range checking structure
+ * @min: pointer to minimum allowable value
+ * @max: pointer to maximum allowable value
+ *
+ * The do_proc_douintvec_minmax_conv_param structure provides the
+ * minimum and maximum values for doing range checking for those sysctl
+ * parameters that use the proc_douintvec_minmax() handler.
+ */
+struct do_proc_douintvec_minmax_conv_param {
+ unsigned int *min;
+ unsigned int *max;
+};
+
+static int do_proc_douintvec_minmax_conv(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data)
+{
+ int ret;
+ unsigned int tmp;
+ struct do_proc_douintvec_minmax_conv_param *param = data;
+ /* write via temporary local uint for bounds-checking */
+ unsigned int *up = write ? &tmp : valp;
+
+ ret = do_proc_douintvec_conv(lvalp, up, write, data);
+ if (ret)
+ return ret;
+
+ if (write) {
+ if ((param->min && *param->min > tmp) ||
+ (param->max && *param->max < tmp))
+ return -ERANGE;
+
+ *valp = tmp;
+ }
+
+ return 0;
+}
+
+/**
+ * proc_douintvec_minmax - read a vector of unsigned ints with min/max values
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer
+ * values from/to the user buffer, treated as an ASCII string. Negative
+ * strings are not allowed.
+ *
+ * This routine will ensure the values are within the range specified by
+ * table->extra1 (min) and table->extra2 (max). There is a final sanity
+ * check for UINT_MAX to avoid having to support wrap around uses from
+ * userspace.
+ *
+ * Returns 0 on success or -ERANGE on write when the range check fails.
+ */
+int proc_douintvec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct do_proc_douintvec_minmax_conv_param param = {
+ .min = (unsigned int *) table->extra1,
+ .max = (unsigned int *) table->extra2,
+ };
+ return do_proc_douintvec(table, write, buffer, lenp, ppos,
+ do_proc_douintvec_minmax_conv, &param);
+}
+
+static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data)
+{
+ if (write) {
+ unsigned int val;
+
+ val = round_pipe_size(*lvalp);
+ if (val == 0)
+ return -EINVAL;
+
+ *valp = val;
+ } else {
+ unsigned int val = *valp;
+ *lvalp = (unsigned long) val;
+ }
+
+ return 0;
+}
+
+static int proc_dopipe_max_size(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_douintvec(table, write, buffer, lenp, ppos,
+ do_proc_dopipe_max_size_conv, NULL);
+}
+
+static void validate_coredump_safety(void)
+{
+#ifdef CONFIG_COREDUMP
+ if (suid_dumpable == SUID_DUMP_ROOT &&
+ core_pattern[0] != '/' && core_pattern[0] != '|') {
+ printk(KERN_WARNING
+"Unsafe core_pattern used with fs.suid_dumpable=2.\n"
+"Pipe handler or fully qualified core dump path required.\n"
+"Set kernel.core_pattern before fs.suid_dumpable.\n"
+ );
+ }
#endif
+}
static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (!error)
+ validate_coredump_safety();
+ return error;
+}
+
#ifdef CONFIG_COREDUMP
static int proc_dostring_coredump(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int error = proc_dostring(table, write, buffer, lenp, ppos);
+ if (!error)
+ validate_coredump_safety();
+ return error;
+}
#endif
-static int proc_dopipe_max_size(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
#ifdef CONFIG_MAGIC_SYSRQ
static int sysrq_sysctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
-#endif
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int tmp, ret;
-static struct ctl_table kern_table[];
-static struct ctl_table vm_table[];
-static struct ctl_table fs_table[];
-static struct ctl_table debug_table[];
-static struct ctl_table dev_table[];
-extern struct ctl_table random_table[];
-#ifdef CONFIG_EPOLL
-extern struct ctl_table epoll_table[];
-#endif
+ tmp = sysrq_mask();
-#ifdef CONFIG_FW_LOADER_USER_HELPER
-extern struct ctl_table firmware_config_table[];
-#endif
+ ret = __do_proc_dointvec(&tmp, table, write, buffer,
+ lenp, ppos, NULL, NULL);
+ if (ret || !write)
+ return ret;
-#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
- defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
-int sysctl_legacy_va_layout;
+ if (write)
+ sysrq_toggle_support(tmp);
+
+ return 0;
+}
#endif
-/* The default sysctl tables: */
+static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table,
+ int write, void *buffer, size_t *lenp, loff_t *ppos,
+ unsigned long convmul, unsigned long convdiv)
+{
+ unsigned long *i, *min, *max;
+ int vleft, first = 1, err = 0;
+ size_t left;
+ char *p;
-static struct ctl_table sysctl_base_table[] = {
- {
- .procname = "kernel",
- .mode = 0555,
- .child = kern_table,
- },
- {
- .procname = "vm",
- .mode = 0555,
- .child = vm_table,
- },
- {
- .procname = "fs",
- .mode = 0555,
- .child = fs_table,
- },
- {
- .procname = "debug",
- .mode = 0555,
- .child = debug_table,
- },
- {
- .procname = "dev",
- .mode = 0555,
- .child = dev_table,
- },
- { }
-};
+ if (!data || !table->maxlen || !*lenp || (*ppos && !write)) {
+ *lenp = 0;
+ return 0;
+ }
-#ifdef CONFIG_SCHED_DEBUG
-static int min_sched_granularity_ns = 100000; /* 100 usecs */
-static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
-static int min_wakeup_granularity_ns; /* 0 usecs */
-static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
-#ifdef CONFIG_SMP
-static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
-static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
-#endif /* CONFIG_SMP */
-#endif /* CONFIG_SCHED_DEBUG */
+ i = (unsigned long *) data;
+ min = (unsigned long *) table->extra1;
+ max = (unsigned long *) table->extra2;
+ vleft = table->maxlen / sizeof(unsigned long);
+ left = *lenp;
-#ifdef CONFIG_COMPACTION
-static int min_extfrag_threshold;
-static int max_extfrag_threshold = 1000;
-#endif
+ if (write) {
+ if (proc_first_pos_non_zero_ignore(ppos, table))
+ goto out;
+
+ if (left > PAGE_SIZE - 1)
+ left = PAGE_SIZE - 1;
+ p = buffer;
+ }
+
+ for (; left && vleft--; i++, first = 0) {
+ unsigned long val;
+
+ if (write) {
+ bool neg;
+
+ left -= proc_skip_spaces(&p);
+ if (!left)
+ break;
+
+ err = proc_get_long(&p, &left, &val, &neg,
+ proc_wspace_sep,
+ sizeof(proc_wspace_sep), NULL);
+ if (err)
+ break;
+ if (neg)
+ continue;
+ val = convmul * val / convdiv;
+ if ((min && val < *min) || (max && val > *max)) {
+ err = -EINVAL;
+ break;
+ }
+ *i = val;
+ } else {
+ val = convdiv * (*i) / convmul;
+ if (!first)
+ proc_put_char(&buffer, &left, '\t');
+ proc_put_long(&buffer, &left, val, false);
+ }
+ }
+
+ if (!write && !first && left && !err)
+ proc_put_char(&buffer, &left, '\n');
+ if (write && !err)
+ left -= proc_skip_spaces(&p);
+ if (write && first)
+ return err ? : -EINVAL;
+ *lenp -= left;
+out:
+ *ppos += *lenp;
+ return err;
+}
+
+static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul,
+ unsigned long convdiv)
+{
+ return __do_proc_doulongvec_minmax(table->data, table, write,
+ buffer, lenp, ppos, convmul, convdiv);
+}
+
+/**
+ * proc_doulongvec_minmax - read a vector of long integers with min/max values
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long
+ * values from/to the user buffer, treated as an ASCII string.
+ *
+ * This routine will ensure the values are within the range specified by
+ * table->extra1 (min) and table->extra2 (max).
+ *
+ * Returns 0 on success.
+ */
+int proc_doulongvec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l);
+}
+
+/**
+ * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long
+ * values from/to the user buffer, treated as an ASCII string. The values
+ * are treated as milliseconds, and converted to jiffies when they are stored.
+ *
+ * This routine will ensure the values are within the range specified by
+ * table->extra1 (min) and table->extra2 (max).
+ *
+ * Returns 0 on success.
+ */
+int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_doulongvec_minmax(table, write, buffer,
+ lenp, ppos, HZ, 1000l);
+}
+
+
+static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+{
+ if (write) {
+ if (*lvalp > INT_MAX / HZ)
+ return 1;
+ *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ);
+ } else {
+ int val = *valp;
+ unsigned long lval;
+ if (val < 0) {
+ *negp = true;
+ lval = -(unsigned long)val;
+ } else {
+ *negp = false;
+ lval = (unsigned long)val;
+ }
+ *lvalp = lval / HZ;
+ }
+ return 0;
+}
+
+static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+{
+ if (write) {
+ if (USER_HZ < HZ && *lvalp > (LONG_MAX / HZ) * USER_HZ)
+ return 1;
+ *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp);
+ } else {
+ int val = *valp;
+ unsigned long lval;
+ if (val < 0) {
+ *negp = true;
+ lval = -(unsigned long)val;
+ } else {
+ *negp = false;
+ lval = (unsigned long)val;
+ }
+ *lvalp = jiffies_to_clock_t(lval);
+ }
+ return 0;
+}
+
+static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+{
+ if (write) {
+ unsigned long jif = msecs_to_jiffies(*negp ? -*lvalp : *lvalp);
+
+ if (jif > INT_MAX)
+ return 1;
+ *valp = (int)jif;
+ } else {
+ int val = *valp;
+ unsigned long lval;
+ if (val < 0) {
+ *negp = true;
+ lval = -(unsigned long)val;
+ } else {
+ *negp = false;
+ lval = (unsigned long)val;
+ }
+ *lvalp = jiffies_to_msecs(lval);
+ }
+ return 0;
+}
+
+/**
+ * proc_dointvec_jiffies - read a vector of integers as seconds
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in seconds, and are converted into
+ * jiffies.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_jiffies(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_dointvec(table,write,buffer,lenp,ppos,
+ do_proc_dointvec_jiffies_conv,NULL);
+}
+
+/**
+ * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: pointer to the file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in 1/USER_HZ seconds, and
+ * are converted into jiffies.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_dointvec(table,write,buffer,lenp,ppos,
+ do_proc_dointvec_userhz_jiffies_conv,NULL);
+}
+
+/**
+ * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ * @ppos: the current position in the file
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in 1/1000 seconds, and
+ * are converted into jiffies.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ return do_proc_dointvec(table, write, buffer, lenp, ppos,
+ do_proc_dointvec_ms_jiffies_conv, NULL);
+}
+
+static int proc_do_cad_pid(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ struct pid *new_pid;
+ pid_t tmp;
+ int r;
+
+ tmp = pid_vnr(cad_pid);
+
+ r = __do_proc_dointvec(&tmp, table, write, buffer,
+ lenp, ppos, NULL, NULL);
+ if (r || !write)
+ return r;
+
+ new_pid = find_get_pid(tmp);
+ if (!new_pid)
+ return -ESRCH;
+
+ put_pid(xchg(&cad_pid, new_pid));
+ return 0;
+}
+
+/**
+ * proc_do_large_bitmap - read/write from/to a large bitmap
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * The bitmap is stored at table->data and the bitmap length (in bits)
+ * in table->maxlen.
+ *
+ * We use a range comma separated format (e.g. 1,3-4,10-10) so that
+ * large bitmaps may be represented in a compact manner. Writing into
+ * the file will clear the bitmap then update it with the given input.
+ *
+ * Returns 0 on success.
+ */
+int proc_do_large_bitmap(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int err = 0;
+ bool first = 1;
+ size_t left = *lenp;
+ unsigned long bitmap_len = table->maxlen;
+ unsigned long *bitmap = *(unsigned long **) table->data;
+ unsigned long *tmp_bitmap = NULL;
+ char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c;
+
+ if (!bitmap || !bitmap_len || !left || (*ppos && !write)) {
+ *lenp = 0;
+ return 0;
+ }
+
+ if (write) {
+ char *p = buffer;
+ size_t skipped = 0;
+
+ if (left > PAGE_SIZE - 1) {
+ left = PAGE_SIZE - 1;
+ /* How much of the buffer we'll skip this pass */
+ skipped = *lenp - left;
+ }
+
+ tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL);
+ if (!tmp_bitmap)
+ return -ENOMEM;
+ proc_skip_char(&p, &left, '\n');
+ while (!err && left) {
+ unsigned long val_a, val_b;
+ bool neg;
+ size_t saved_left;
+
+ /* In case we stop parsing mid-number, we can reset */
+ saved_left = left;
+ err = proc_get_long(&p, &left, &val_a, &neg, tr_a,
+ sizeof(tr_a), &c);
+ /*
+ * If we consumed the entirety of a truncated buffer or
+ * only one char is left (may be a "-"), then stop here,
+ * reset, & come back for more.
+ */
+ if ((left <= 1) && skipped) {
+ left = saved_left;
+ break;
+ }
+
+ if (err)
+ break;
+ if (val_a >= bitmap_len || neg) {
+ err = -EINVAL;
+ break;
+ }
+
+ val_b = val_a;
+ if (left) {
+ p++;
+ left--;
+ }
+
+ if (c == '-') {
+ err = proc_get_long(&p, &left, &val_b,
+ &neg, tr_b, sizeof(tr_b),
+ &c);
+ /*
+ * If we consumed all of a truncated buffer or
+ * then stop here, reset, & come back for more.
+ */
+ if (!left && skipped) {
+ left = saved_left;
+ break;
+ }
+
+ if (err)
+ break;
+ if (val_b >= bitmap_len || neg ||
+ val_a > val_b) {
+ err = -EINVAL;
+ break;
+ }
+ if (left) {
+ p++;
+ left--;
+ }
+ }
+
+ bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1);
+ first = 0;
+ proc_skip_char(&p, &left, '\n');
+ }
+ left += skipped;
+ } else {
+ unsigned long bit_a, bit_b = 0;
+
+ while (left) {
+ bit_a = find_next_bit(bitmap, bitmap_len, bit_b);
+ if (bit_a >= bitmap_len)
+ break;
+ bit_b = find_next_zero_bit(bitmap, bitmap_len,
+ bit_a + 1) - 1;
+
+ if (!first)
+ proc_put_char(&buffer, &left, ',');
+ proc_put_long(&buffer, &left, bit_a, false);
+ if (bit_a != bit_b) {
+ proc_put_char(&buffer, &left, '-');
+ proc_put_long(&buffer, &left, bit_b, false);
+ }
+
+ first = 0; bit_b++;
+ }
+ proc_put_char(&buffer, &left, '\n');
+ }
+
+ if (!err) {
+ if (write) {
+ if (*ppos)
+ bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len);
+ else
+ bitmap_copy(bitmap, tmp_bitmap, bitmap_len);
+ }
+ *lenp -= left;
+ *ppos += *lenp;
+ }
+
+ bitmap_free(tmp_bitmap);
+ return err;
+}
+
+#else /* CONFIG_PROC_SYSCTL */
+
+int proc_dostring(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_dointvec(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_douintvec(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_dointvec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_douintvec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_dointvec_jiffies(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_doulongvec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_do_large_bitmap(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+#endif /* CONFIG_PROC_SYSCTL */
+
+#if defined(CONFIG_SYSCTL)
+int proc_do_static_key(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct static_key *key = (struct static_key *)table->data;
+ static DEFINE_MUTEX(static_key_mutex);
+ int val, ret;
+ struct ctl_table tmp = {
+ .data = &val,
+ .maxlen = sizeof(val),
+ .mode = table->mode,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ };
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ mutex_lock(&static_key_mutex);
+ val = static_key_enabled(key);
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ if (write && !ret) {
+ if (val)
+ static_key_enable(key);
+ else
+ static_key_disable(key);
+ }
+ mutex_unlock(&static_key_mutex);
+ return ret;
+}
static struct ctl_table kern_table[] = {
{
@@ -613,7 +1961,7 @@ static struct ctl_table kern_table[] = {
.procname = "soft-power",
.data = &pwrsw_enabled,
.maxlen = sizeof (int),
- .mode = 0644,
+ .mode = 0644,
.proc_handler = proc_dointvec,
},
#endif
@@ -801,6 +2149,17 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
+#ifdef CONFIG_SMP
+ {
+ .procname = "oops_all_cpu_backtrace",
+ .data = &sysctl_oops_all_cpu_backtrace,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif /* CONFIG_SMP */
{
.procname = "pid_max",
.data = &pid_max,
@@ -994,30 +2353,32 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
-#if defined(CONFIG_X86)
+
+#if (defined(CONFIG_X86_32) || defined(CONFIG_PARISC)) && \
+ defined(CONFIG_DEBUG_STACKOVERFLOW)
{
- .procname = "panic_on_unrecovered_nmi",
- .data = &panic_on_unrecovered_nmi,
+ .procname = "panic_on_stackoverflow",
+ .data = &sysctl_panic_on_stackoverflow,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#endif
+#if defined(CONFIG_X86)
{
- .procname = "panic_on_io_nmi",
- .data = &panic_on_io_nmi,
+ .procname = "panic_on_unrecovered_nmi",
+ .data = &panic_on_unrecovered_nmi,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
-#ifdef CONFIG_DEBUG_STACKOVERFLOW
{
- .procname = "panic_on_stackoverflow",
- .data = &sysctl_panic_on_stackoverflow,
+ .procname = "panic_on_io_nmi",
+ .data = &panic_on_io_nmi,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
-#endif
{
.procname = "bootloader_type",
.data = &bootloader_type,
@@ -1072,7 +2433,7 @@ static struct ctl_table kern_table[] = {
.procname = "ignore-unaligned-usertrap",
.data = &no_unaligned_warning,
.maxlen = sizeof (int),
- .mode = 0644,
+ .mode = 0644,
.proc_handler = proc_dointvec,
},
#endif
@@ -1086,6 +2447,17 @@ static struct ctl_table kern_table[] = {
},
#endif
#ifdef CONFIG_DETECT_HUNG_TASK
+#ifdef CONFIG_SMP
+ {
+ .procname = "hung_task_all_cpu_backtrace",
+ .data = &sysctl_hung_task_all_cpu_backtrace,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif /* CONFIG_SMP */
{
.procname = "hung_task_panic",
.data = &sysctl_hung_task_panic,
@@ -1244,7 +2616,7 @@ static struct ctl_table kern_table[] = {
.data = &bpf_stats_enabled_key.key,
.maxlen = sizeof(bpf_stats_enabled_key),
.mode = 0644,
- .proc_handler = proc_do_static_key,
+ .proc_handler = bpf_stats_handler,
},
#endif
#if defined(CONFIG_TREE_RCU)
@@ -1320,7 +2692,7 @@ static struct ctl_table vm_table[] = {
.proc_handler = overcommit_kbytes_handler,
},
{
- .procname = "page-cluster",
+ .procname = "page-cluster",
.data = &page_cluster,
.maxlen = sizeof(int),
.mode = 0644,
@@ -1391,7 +2763,7 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
- .extra2 = &one_hundred,
+ .extra2 = &two_hundred,
},
#ifdef CONFIG_HUGETLB_PAGE
{
@@ -1491,7 +2863,7 @@ static struct ctl_table vm_table[] = {
.data = &watermark_boost_factor,
.maxlen = sizeof(watermark_boost_factor),
.mode = 0644,
- .proc_handler = watermark_boost_factor_sysctl_handler,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
},
{
@@ -1959,6 +3331,35 @@ static struct ctl_table dev_table[] = {
{ }
};
+static struct ctl_table sysctl_base_table[] = {
+ {
+ .procname = "kernel",
+ .mode = 0555,
+ .child = kern_table,
+ },
+ {
+ .procname = "vm",
+ .mode = 0555,
+ .child = vm_table,
+ },
+ {
+ .procname = "fs",
+ .mode = 0555,
+ .child = fs_table,
+ },
+ {
+ .procname = "debug",
+ .mode = 0555,
+ .child = debug_table,
+ },
+ {
+ .procname = "dev",
+ .mode = 0555,
+ .child = dev_table,
+ },
+ { }
+};
+
int __init sysctl_init(void)
{
struct ctl_table_header *hdr;
@@ -1967,1474 +3368,7 @@ int __init sysctl_init(void)
kmemleak_not_leak(hdr);
return 0;
}
-
#endif /* CONFIG_SYSCTL */
-
-/*
- * /proc/sys support
- */
-
-#ifdef CONFIG_PROC_SYSCTL
-
-static int _proc_do_string(char *data, int maxlen, int write,
- char __user *buffer,
- size_t *lenp, loff_t *ppos)
-{
- size_t len;
- char __user *p;
- char c;
-
- if (!data || !maxlen || !*lenp) {
- *lenp = 0;
- return 0;
- }
-
- if (write) {
- if (sysctl_writes_strict == SYSCTL_WRITES_STRICT) {
- /* Only continue writes not past the end of buffer. */
- len = strlen(data);
- if (len > maxlen - 1)
- len = maxlen - 1;
-
- if (*ppos > len)
- return 0;
- len = *ppos;
- } else {
- /* Start writing from beginning of buffer. */
- len = 0;
- }
-
- *ppos += *lenp;
- p = buffer;
- while ((p - buffer) < *lenp && len < maxlen - 1) {
- if (get_user(c, p++))
- return -EFAULT;
- if (c == 0 || c == '\n')
- break;
- data[len++] = c;
- }
- data[len] = 0;
- } else {
- len = strlen(data);
- if (len > maxlen)
- len = maxlen;
-
- if (*ppos > len) {
- *lenp = 0;
- return 0;
- }
-
- data += *ppos;
- len -= *ppos;
-
- if (len > *lenp)
- len = *lenp;
- if (len)
- if (copy_to_user(buffer, data, len))
- return -EFAULT;
- if (len < *lenp) {
- if (put_user('\n', buffer + len))
- return -EFAULT;
- len++;
- }
- *lenp = len;
- *ppos += len;
- }
- return 0;
-}
-
-static void warn_sysctl_write(struct ctl_table *table)
-{
- pr_warn_once("%s wrote to %s when file position was not 0!\n"
- "This will not be supported in the future. To silence this\n"
- "warning, set kernel.sysctl_writes_strict = -1\n",
- current->comm, table->procname);
-}
-
-/**
- * proc_first_pos_non_zero_ignore - check if first position is allowed
- * @ppos: file position
- * @table: the sysctl table
- *
- * Returns true if the first position is non-zero and the sysctl_writes_strict
- * mode indicates this is not allowed for numeric input types. String proc
- * handlers can ignore the return value.
- */
-static bool proc_first_pos_non_zero_ignore(loff_t *ppos,
- struct ctl_table *table)
-{
- if (!*ppos)
- return false;
-
- switch (sysctl_writes_strict) {
- case SYSCTL_WRITES_STRICT:
- return true;
- case SYSCTL_WRITES_WARN:
- warn_sysctl_write(table);
- return false;
- default:
- return false;
- }
-}
-
-/**
- * proc_dostring - read a string sysctl
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * Reads/writes a string from/to the user buffer. If the kernel
- * buffer provided is not large enough to hold the string, the
- * string is truncated. The copied string is %NULL-terminated.
- * If the string is being read by the user process, it is copied
- * and a newline '\n' is added. It is truncated if the buffer is
- * not large enough.
- *
- * Returns 0 on success.
- */
-int proc_dostring(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- if (write)
- proc_first_pos_non_zero_ignore(ppos, table);
-
- return _proc_do_string((char *)(table->data), table->maxlen, write,
- (char __user *)buffer, lenp, ppos);
-}
-
-static size_t proc_skip_spaces(char **buf)
-{
- size_t ret;
- char *tmp = skip_spaces(*buf);
- ret = tmp - *buf;
- *buf = tmp;
- return ret;
-}
-
-static void proc_skip_char(char **buf, size_t *size, const char v)
-{
- while (*size) {
- if (**buf != v)
- break;
- (*size)--;
- (*buf)++;
- }
-}
-
-/**
- * strtoul_lenient - parse an ASCII formatted integer from a buffer and only
- * fail on overflow
- *
- * @cp: kernel buffer containing the string to parse
- * @endp: pointer to store the trailing characters
- * @base: the base to use
- * @res: where the parsed integer will be stored
- *
- * In case of success 0 is returned and @res will contain the parsed integer,
- * @endp will hold any trailing characters.
- * This function will fail the parse on overflow. If there wasn't an overflow
- * the function will defer the decision what characters count as invalid to the
- * caller.
- */
-static int strtoul_lenient(const char *cp, char **endp, unsigned int base,
- unsigned long *res)
-{
- unsigned long long result;
- unsigned int rv;
-
- cp = _parse_integer_fixup_radix(cp, &base);
- rv = _parse_integer(cp, base, &result);
- if ((rv & KSTRTOX_OVERFLOW) || (result != (unsigned long)result))
- return -ERANGE;
-
- cp += rv;
-
- if (endp)
- *endp = (char *)cp;
-
- *res = (unsigned long)result;
- return 0;
-}
-
-#define TMPBUFLEN 22
-/**
- * proc_get_long - reads an ASCII formatted integer from a user buffer
- *
- * @buf: a kernel buffer
- * @size: size of the kernel buffer
- * @val: this is where the number will be stored
- * @neg: set to %TRUE if number is negative
- * @perm_tr: a vector which contains the allowed trailers
- * @perm_tr_len: size of the perm_tr vector
- * @tr: pointer to store the trailer character
- *
- * In case of success %0 is returned and @buf and @size are updated with
- * the amount of bytes read. If @tr is non-NULL and a trailing
- * character exists (size is non-zero after returning from this
- * function), @tr is updated with the trailing character.
- */
-static int proc_get_long(char **buf, size_t *size,
- unsigned long *val, bool *neg,
- const char *perm_tr, unsigned perm_tr_len, char *tr)
-{
- int len;
- char *p, tmp[TMPBUFLEN];
-
- if (!*size)
- return -EINVAL;
-
- len = *size;
- if (len > TMPBUFLEN - 1)
- len = TMPBUFLEN - 1;
-
- memcpy(tmp, *buf, len);
-
- tmp[len] = 0;
- p = tmp;
- if (*p == '-' && *size > 1) {
- *neg = true;
- p++;
- } else
- *neg = false;
- if (!isdigit(*p))
- return -EINVAL;
-
- if (strtoul_lenient(p, &p, 0, val))
- return -EINVAL;
-
- len = p - tmp;
-
- /* We don't know if the next char is whitespace thus we may accept
- * invalid integers (e.g. 1234...a) or two integers instead of one
- * (e.g. 123...1). So lets not allow such large numbers. */
- if (len == TMPBUFLEN - 1)
- return -EINVAL;
-
- if (len < *size && perm_tr_len && !memchr(perm_tr, *p, perm_tr_len))
- return -EINVAL;
-
- if (tr && (len < *size))
- *tr = *p;
-
- *buf += len;
- *size -= len;
-
- return 0;
-}
-
-/**
- * proc_put_long - converts an integer to a decimal ASCII formatted string
- *
- * @buf: the user buffer
- * @size: the size of the user buffer
- * @val: the integer to be converted
- * @neg: sign of the number, %TRUE for negative
- *
- * In case of success %0 is returned and @buf and @size are updated with
- * the amount of bytes written.
- */
-static int proc_put_long(void __user **buf, size_t *size, unsigned long val,
- bool neg)
-{
- int len;
- char tmp[TMPBUFLEN], *p = tmp;
-
- sprintf(p, "%s%lu", neg ? "-" : "", val);
- len = strlen(tmp);
- if (len > *size)
- len = *size;
- if (copy_to_user(*buf, tmp, len))
- return -EFAULT;
- *size -= len;
- *buf += len;
- return 0;
-}
-#undef TMPBUFLEN
-
-static int proc_put_char(void __user **buf, size_t *size, char c)
-{
- if (*size) {
- char __user **buffer = (char __user **)buf;
- if (put_user(c, *buffer))
- return -EFAULT;
- (*size)--, (*buffer)++;
- *buf = *buffer;
- }
- return 0;
-}
-
-static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
- int *valp,
- int write, void *data)
-{
- if (write) {
- if (*negp) {
- if (*lvalp > (unsigned long) INT_MAX + 1)
- return -EINVAL;
- *valp = -*lvalp;
- } else {
- if (*lvalp > (unsigned long) INT_MAX)
- return -EINVAL;
- *valp = *lvalp;
- }
- } else {
- int val = *valp;
- if (val < 0) {
- *negp = true;
- *lvalp = -(unsigned long)val;
- } else {
- *negp = false;
- *lvalp = (unsigned long)val;
- }
- }
- return 0;
-}
-
-static int do_proc_douintvec_conv(unsigned long *lvalp,
- unsigned int *valp,
- int write, void *data)
-{
- if (write) {
- if (*lvalp > UINT_MAX)
- return -EINVAL;
- *valp = *lvalp;
- } else {
- unsigned int val = *valp;
- *lvalp = (unsigned long)val;
- }
- return 0;
-}
-
-static const char proc_wspace_sep[] = { ' ', '\t', '\n' };
-
-static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
- int write, void __user *buffer,
- size_t *lenp, loff_t *ppos,
- int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
- int write, void *data),
- void *data)
-{
- int *i, vleft, first = 1, err = 0;
- size_t left;
- char *kbuf = NULL, *p;
-
- if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
- *lenp = 0;
- return 0;
- }
-
- i = (int *) tbl_data;
- vleft = table->maxlen / sizeof(*i);
- left = *lenp;
-
- if (!conv)
- conv = do_proc_dointvec_conv;
-
- if (write) {
- if (proc_first_pos_non_zero_ignore(ppos, table))
- goto out;
-
- if (left > PAGE_SIZE - 1)
- left = PAGE_SIZE - 1;
- p = kbuf = memdup_user_nul(buffer, left);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
- }
-
- for (; left && vleft--; i++, first=0) {
- unsigned long lval;
- bool neg;
-
- if (write) {
- left -= proc_skip_spaces(&p);
-
- if (!left)
- break;
- err = proc_get_long(&p, &left, &lval, &neg,
- proc_wspace_sep,
- sizeof(proc_wspace_sep), NULL);
- if (err)
- break;
- if (conv(&neg, &lval, i, 1, data)) {
- err = -EINVAL;
- break;
- }
- } else {
- if (conv(&neg, &lval, i, 0, data)) {
- err = -EINVAL;
- break;
- }
- if (!first)
- err = proc_put_char(&buffer, &left, '\t');
- if (err)
- break;
- err = proc_put_long(&buffer, &left, lval, neg);
- if (err)
- break;
- }
- }
-
- if (!write && !first && left && !err)
- err = proc_put_char(&buffer, &left, '\n');
- if (write && !err && left)
- left -= proc_skip_spaces(&p);
- if (write) {
- kfree(kbuf);
- if (first)
- return err ? : -EINVAL;
- }
- *lenp -= left;
-out:
- *ppos += *lenp;
- return err;
-}
-
-static int do_proc_dointvec(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos,
- int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
- int write, void *data),
- void *data)
-{
- return __do_proc_dointvec(table->data, table, write,
- buffer, lenp, ppos, conv, data);
-}
-
-static int do_proc_douintvec_w(unsigned int *tbl_data,
- struct ctl_table *table,
- void __user *buffer,
- size_t *lenp, loff_t *ppos,
- int (*conv)(unsigned long *lvalp,
- unsigned int *valp,
- int write, void *data),
- void *data)
-{
- unsigned long lval;
- int err = 0;
- size_t left;
- bool neg;
- char *kbuf = NULL, *p;
-
- left = *lenp;
-
- if (proc_first_pos_non_zero_ignore(ppos, table))
- goto bail_early;
-
- if (left > PAGE_SIZE - 1)
- left = PAGE_SIZE - 1;
-
- p = kbuf = memdup_user_nul(buffer, left);
- if (IS_ERR(kbuf))
- return -EINVAL;
-
- left -= proc_skip_spaces(&p);
- if (!left) {
- err = -EINVAL;
- goto out_free;
- }
-
- err = proc_get_long(&p, &left, &lval, &neg,
- proc_wspace_sep,
- sizeof(proc_wspace_sep), NULL);
- if (err || neg) {
- err = -EINVAL;
- goto out_free;
- }
-
- if (conv(&lval, tbl_data, 1, data)) {
- err = -EINVAL;
- goto out_free;
- }
-
- if (!err && left)
- left -= proc_skip_spaces(&p);
-
-out_free:
- kfree(kbuf);
- if (err)
- return -EINVAL;
-
- return 0;
-
- /* This is in keeping with old __do_proc_dointvec() */
-bail_early:
- *ppos += *lenp;
- return err;
-}
-
-static int do_proc_douintvec_r(unsigned int *tbl_data, void __user *buffer,
- size_t *lenp, loff_t *ppos,
- int (*conv)(unsigned long *lvalp,
- unsigned int *valp,
- int write, void *data),
- void *data)
-{
- unsigned long lval;
- int err = 0;
- size_t left;
-
- left = *lenp;
-
- if (conv(&lval, tbl_data, 0, data)) {
- err = -EINVAL;
- goto out;
- }
-
- err = proc_put_long(&buffer, &left, lval, false);
- if (err || !left)
- goto out;
-
- err = proc_put_char(&buffer, &left, '\n');
-
-out:
- *lenp -= left;
- *ppos += *lenp;
-
- return err;
-}
-
-static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table,
- int write, void __user *buffer,
- size_t *lenp, loff_t *ppos,
- int (*conv)(unsigned long *lvalp,
- unsigned int *valp,
- int write, void *data),
- void *data)
-{
- unsigned int *i, vleft;
-
- if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
- *lenp = 0;
- return 0;
- }
-
- i = (unsigned int *) tbl_data;
- vleft = table->maxlen / sizeof(*i);
-
- /*
- * Arrays are not supported, keep this simple. *Do not* add
- * support for them.
- */
- if (vleft != 1) {
- *lenp = 0;
- return -EINVAL;
- }
-
- if (!conv)
- conv = do_proc_douintvec_conv;
-
- if (write)
- return do_proc_douintvec_w(i, table, buffer, lenp, ppos,
- conv, data);
- return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data);
-}
-
-static int do_proc_douintvec(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos,
- int (*conv)(unsigned long *lvalp,
- unsigned int *valp,
- int write, void *data),
- void *data)
-{
- return __do_proc_douintvec(table->data, table, write,
- buffer, lenp, ppos, conv, data);
-}
-
-/**
- * proc_dointvec - read a vector of integers
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
- * values from/to the user buffer, treated as an ASCII string.
- *
- * Returns 0 on success.
- */
-int proc_dointvec(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL);
-}
-
-#ifdef CONFIG_COMPACTION
-static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table,
- int write, void __user *buffer,
- size_t *lenp, loff_t *ppos)
-{
- int ret, old;
-
- if (!IS_ENABLED(CONFIG_PREEMPT_RT) || !write)
- return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-
- old = *(int *)table->data;
- ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (ret)
- return ret;
- if (old != *(int *)table->data)
- pr_warn_once("sysctl attribute %s changed by %s[%d]\n",
- table->procname, current->comm,
- task_pid_nr(current));
- return ret;
-}
-#endif
-
-/**
- * proc_douintvec - read a vector of unsigned integers
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer
- * values from/to the user buffer, treated as an ASCII string.
- *
- * Returns 0 on success.
- */
-int proc_douintvec(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return do_proc_douintvec(table, write, buffer, lenp, ppos,
- do_proc_douintvec_conv, NULL);
-}
-
-/*
- * Taint values can only be increased
- * This means we can safely use a temporary.
- */
-static int proc_taint(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- struct ctl_table t;
- unsigned long tmptaint = get_taint();
- int err;
-
- if (write && !capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- t = *table;
- t.data = &tmptaint;
- err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
- if (err < 0)
- return err;
-
- if (write) {
- /*
- * Poor man's atomic or. Not worth adding a primitive
- * to everyone's atomic.h for this
- */
- int i;
- for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) {
- if ((tmptaint >> i) & 1)
- add_taint(i, LOCKDEP_STILL_OK);
- }
- }
-
- return err;
-}
-
-#ifdef CONFIG_PRINTK
-static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- if (write && !capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-}
-#endif
-
-/**
- * struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure
- * @min: pointer to minimum allowable value
- * @max: pointer to maximum allowable value
- *
- * The do_proc_dointvec_minmax_conv_param structure provides the
- * minimum and maximum values for doing range checking for those sysctl
- * parameters that use the proc_dointvec_minmax() handler.
- */
-struct do_proc_dointvec_minmax_conv_param {
- int *min;
- int *max;
-};
-
-static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
- int *valp,
- int write, void *data)
-{
- int tmp, ret;
- struct do_proc_dointvec_minmax_conv_param *param = data;
- /*
- * If writing, first do so via a temporary local int so we can
- * bounds-check it before touching *valp.
- */
- int *ip = write ? &tmp : valp;
-
- ret = do_proc_dointvec_conv(negp, lvalp, ip, write, data);
- if (ret)
- return ret;
-
- if (write) {
- if ((param->min && *param->min > tmp) ||
- (param->max && *param->max < tmp))
- return -EINVAL;
- *valp = tmp;
- }
-
- return 0;
-}
-
-/**
- * proc_dointvec_minmax - read a vector of integers with min/max values
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
- * values from/to the user buffer, treated as an ASCII string.
- *
- * This routine will ensure the values are within the range specified by
- * table->extra1 (min) and table->extra2 (max).
- *
- * Returns 0 on success or -EINVAL on write when the range check fails.
- */
-int proc_dointvec_minmax(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- struct do_proc_dointvec_minmax_conv_param param = {
- .min = (int *) table->extra1,
- .max = (int *) table->extra2,
- };
- return do_proc_dointvec(table, write, buffer, lenp, ppos,
- do_proc_dointvec_minmax_conv, &param);
-}
-
-/**
- * struct do_proc_douintvec_minmax_conv_param - proc_douintvec_minmax() range checking structure
- * @min: pointer to minimum allowable value
- * @max: pointer to maximum allowable value
- *
- * The do_proc_douintvec_minmax_conv_param structure provides the
- * minimum and maximum values for doing range checking for those sysctl
- * parameters that use the proc_douintvec_minmax() handler.
- */
-struct do_proc_douintvec_minmax_conv_param {
- unsigned int *min;
- unsigned int *max;
-};
-
-static int do_proc_douintvec_minmax_conv(unsigned long *lvalp,
- unsigned int *valp,
- int write, void *data)
-{
- int ret;
- unsigned int tmp;
- struct do_proc_douintvec_minmax_conv_param *param = data;
- /* write via temporary local uint for bounds-checking */
- unsigned int *up = write ? &tmp : valp;
-
- ret = do_proc_douintvec_conv(lvalp, up, write, data);
- if (ret)
- return ret;
-
- if (write) {
- if ((param->min && *param->min > tmp) ||
- (param->max && *param->max < tmp))
- return -ERANGE;
-
- *valp = tmp;
- }
-
- return 0;
-}
-
-/**
- * proc_douintvec_minmax - read a vector of unsigned ints with min/max values
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer
- * values from/to the user buffer, treated as an ASCII string. Negative
- * strings are not allowed.
- *
- * This routine will ensure the values are within the range specified by
- * table->extra1 (min) and table->extra2 (max). There is a final sanity
- * check for UINT_MAX to avoid having to support wrap around uses from
- * userspace.
- *
- * Returns 0 on success or -ERANGE on write when the range check fails.
- */
-int proc_douintvec_minmax(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- struct do_proc_douintvec_minmax_conv_param param = {
- .min = (unsigned int *) table->extra1,
- .max = (unsigned int *) table->extra2,
- };
- return do_proc_douintvec(table, write, buffer, lenp, ppos,
- do_proc_douintvec_minmax_conv, &param);
-}
-
-static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
- unsigned int *valp,
- int write, void *data)
-{
- if (write) {
- unsigned int val;
-
- val = round_pipe_size(*lvalp);
- if (val == 0)
- return -EINVAL;
-
- *valp = val;
- } else {
- unsigned int val = *valp;
- *lvalp = (unsigned long) val;
- }
-
- return 0;
-}
-
-static int proc_dopipe_max_size(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return do_proc_douintvec(table, write, buffer, lenp, ppos,
- do_proc_dopipe_max_size_conv, NULL);
-}
-
-static void validate_coredump_safety(void)
-{
-#ifdef CONFIG_COREDUMP
- if (suid_dumpable == SUID_DUMP_ROOT &&
- core_pattern[0] != '/' && core_pattern[0] != '|') {
- printk(KERN_WARNING
-"Unsafe core_pattern used with fs.suid_dumpable=2.\n"
-"Pipe handler or fully qualified core dump path required.\n"
-"Set kernel.core_pattern before fs.suid_dumpable.\n"
- );
- }
-#endif
-}
-
-static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (!error)
- validate_coredump_safety();
- return error;
-}
-
-#ifdef CONFIG_COREDUMP
-static int proc_dostring_coredump(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- int error = proc_dostring(table, write, buffer, lenp, ppos);
- if (!error)
- validate_coredump_safety();
- return error;
-}
-#endif
-
-#ifdef CONFIG_MAGIC_SYSRQ
-static int sysrq_sysctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- int tmp, ret;
-
- tmp = sysrq_mask();
-
- ret = __do_proc_dointvec(&tmp, table, write, buffer,
- lenp, ppos, NULL, NULL);
- if (ret || !write)
- return ret;
-
- if (write)
- sysrq_toggle_support(tmp);
-
- return 0;
-}
-#endif
-
-static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos,
- unsigned long convmul,
- unsigned long convdiv)
-{
- unsigned long *i, *min, *max;
- int vleft, first = 1, err = 0;
- size_t left;
- char *kbuf = NULL, *p;
-
- if (!data || !table->maxlen || !*lenp || (*ppos && !write)) {
- *lenp = 0;
- return 0;
- }
-
- i = (unsigned long *) data;
- min = (unsigned long *) table->extra1;
- max = (unsigned long *) table->extra2;
- vleft = table->maxlen / sizeof(unsigned long);
- left = *lenp;
-
- if (write) {
- if (proc_first_pos_non_zero_ignore(ppos, table))
- goto out;
-
- if (left > PAGE_SIZE - 1)
- left = PAGE_SIZE - 1;
- p = kbuf = memdup_user_nul(buffer, left);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
- }
-
- for (; left && vleft--; i++, first = 0) {
- unsigned long val;
-
- if (write) {
- bool neg;
-
- left -= proc_skip_spaces(&p);
- if (!left)
- break;
-
- err = proc_get_long(&p, &left, &val, &neg,
- proc_wspace_sep,
- sizeof(proc_wspace_sep), NULL);
- if (err)
- break;
- if (neg)
- continue;
- val = convmul * val / convdiv;
- if ((min && val < *min) || (max && val > *max)) {
- err = -EINVAL;
- break;
- }
- *i = val;
- } else {
- val = convdiv * (*i) / convmul;
- if (!first) {
- err = proc_put_char(&buffer, &left, '\t');
- if (err)
- break;
- }
- err = proc_put_long(&buffer, &left, val, false);
- if (err)
- break;
- }
- }
-
- if (!write && !first && left && !err)
- err = proc_put_char(&buffer, &left, '\n');
- if (write && !err)
- left -= proc_skip_spaces(&p);
- if (write) {
- kfree(kbuf);
- if (first)
- return err ? : -EINVAL;
- }
- *lenp -= left;
-out:
- *ppos += *lenp;
- return err;
-}
-
-static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos,
- unsigned long convmul,
- unsigned long convdiv)
-{
- return __do_proc_doulongvec_minmax(table->data, table, write,
- buffer, lenp, ppos, convmul, convdiv);
-}
-
-/**
- * proc_doulongvec_minmax - read a vector of long integers with min/max values
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long
- * values from/to the user buffer, treated as an ASCII string.
- *
- * This routine will ensure the values are within the range specified by
- * table->extra1 (min) and table->extra2 (max).
- *
- * Returns 0 on success.
- */
-int proc_doulongvec_minmax(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l);
-}
-
-/**
- * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long
- * values from/to the user buffer, treated as an ASCII string. The values
- * are treated as milliseconds, and converted to jiffies when they are stored.
- *
- * This routine will ensure the values are within the range specified by
- * table->extra1 (min) and table->extra2 (max).
- *
- * Returns 0 on success.
- */
-int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
-{
- return do_proc_doulongvec_minmax(table, write, buffer,
- lenp, ppos, HZ, 1000l);
-}
-
-
-static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
- int *valp,
- int write, void *data)
-{
- if (write) {
- if (*lvalp > INT_MAX / HZ)
- return 1;
- *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ);
- } else {
- int val = *valp;
- unsigned long lval;
- if (val < 0) {
- *negp = true;
- lval = -(unsigned long)val;
- } else {
- *negp = false;
- lval = (unsigned long)val;
- }
- *lvalp = lval / HZ;
- }
- return 0;
-}
-
-static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp,
- int *valp,
- int write, void *data)
-{
- if (write) {
- if (USER_HZ < HZ && *lvalp > (LONG_MAX / HZ) * USER_HZ)
- return 1;
- *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp);
- } else {
- int val = *valp;
- unsigned long lval;
- if (val < 0) {
- *negp = true;
- lval = -(unsigned long)val;
- } else {
- *negp = false;
- lval = (unsigned long)val;
- }
- *lvalp = jiffies_to_clock_t(lval);
- }
- return 0;
-}
-
-static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
- int *valp,
- int write, void *data)
-{
- if (write) {
- unsigned long jif = msecs_to_jiffies(*negp ? -*lvalp : *lvalp);
-
- if (jif > INT_MAX)
- return 1;
- *valp = (int)jif;
- } else {
- int val = *valp;
- unsigned long lval;
- if (val < 0) {
- *negp = true;
- lval = -(unsigned long)val;
- } else {
- *negp = false;
- lval = (unsigned long)val;
- }
- *lvalp = jiffies_to_msecs(lval);
- }
- return 0;
-}
-
-/**
- * proc_dointvec_jiffies - read a vector of integers as seconds
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
- * values from/to the user buffer, treated as an ASCII string.
- * The values read are assumed to be in seconds, and are converted into
- * jiffies.
- *
- * Returns 0 on success.
- */
-int proc_dointvec_jiffies(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return do_proc_dointvec(table,write,buffer,lenp,ppos,
- do_proc_dointvec_jiffies_conv,NULL);
-}
-
-/**
- * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: pointer to the file position
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
- * values from/to the user buffer, treated as an ASCII string.
- * The values read are assumed to be in 1/USER_HZ seconds, and
- * are converted into jiffies.
- *
- * Returns 0 on success.
- */
-int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return do_proc_dointvec(table,write,buffer,lenp,ppos,
- do_proc_dointvec_userhz_jiffies_conv,NULL);
-}
-
-/**
- * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- * @ppos: the current position in the file
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
- * values from/to the user buffer, treated as an ASCII string.
- * The values read are assumed to be in 1/1000 seconds, and
- * are converted into jiffies.
- *
- * Returns 0 on success.
- */
-int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return do_proc_dointvec(table, write, buffer, lenp, ppos,
- do_proc_dointvec_ms_jiffies_conv, NULL);
-}
-
-static int proc_do_cad_pid(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- struct pid *new_pid;
- pid_t tmp;
- int r;
-
- tmp = pid_vnr(cad_pid);
-
- r = __do_proc_dointvec(&tmp, table, write, buffer,
- lenp, ppos, NULL, NULL);
- if (r || !write)
- return r;
-
- new_pid = find_get_pid(tmp);
- if (!new_pid)
- return -ESRCH;
-
- put_pid(xchg(&cad_pid, new_pid));
- return 0;
-}
-
-/**
- * proc_do_large_bitmap - read/write from/to a large bitmap
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * The bitmap is stored at table->data and the bitmap length (in bits)
- * in table->maxlen.
- *
- * We use a range comma separated format (e.g. 1,3-4,10-10) so that
- * large bitmaps may be represented in a compact manner. Writing into
- * the file will clear the bitmap then update it with the given input.
- *
- * Returns 0 on success.
- */
-int proc_do_large_bitmap(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- int err = 0;
- bool first = 1;
- size_t left = *lenp;
- unsigned long bitmap_len = table->maxlen;
- unsigned long *bitmap = *(unsigned long **) table->data;
- unsigned long *tmp_bitmap = NULL;
- char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c;
-
- if (!bitmap || !bitmap_len || !left || (*ppos && !write)) {
- *lenp = 0;
- return 0;
- }
-
- if (write) {
- char *kbuf, *p;
- size_t skipped = 0;
-
- if (left > PAGE_SIZE - 1) {
- left = PAGE_SIZE - 1;
- /* How much of the buffer we'll skip this pass */
- skipped = *lenp - left;
- }
-
- p = kbuf = memdup_user_nul(buffer, left);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
-
- tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL);
- if (!tmp_bitmap) {
- kfree(kbuf);
- return -ENOMEM;
- }
- proc_skip_char(&p, &left, '\n');
- while (!err && left) {
- unsigned long val_a, val_b;
- bool neg;
- size_t saved_left;
-
- /* In case we stop parsing mid-number, we can reset */
- saved_left = left;
- err = proc_get_long(&p, &left, &val_a, &neg, tr_a,
- sizeof(tr_a), &c);
- /*
- * If we consumed the entirety of a truncated buffer or
- * only one char is left (may be a "-"), then stop here,
- * reset, & come back for more.
- */
- if ((left <= 1) && skipped) {
- left = saved_left;
- break;
- }
-
- if (err)
- break;
- if (val_a >= bitmap_len || neg) {
- err = -EINVAL;
- break;
- }
-
- val_b = val_a;
- if (left) {
- p++;
- left--;
- }
-
- if (c == '-') {
- err = proc_get_long(&p, &left, &val_b,
- &neg, tr_b, sizeof(tr_b),
- &c);
- /*
- * If we consumed all of a truncated buffer or
- * then stop here, reset, & come back for more.
- */
- if (!left && skipped) {
- left = saved_left;
- break;
- }
-
- if (err)
- break;
- if (val_b >= bitmap_len || neg ||
- val_a > val_b) {
- err = -EINVAL;
- break;
- }
- if (left) {
- p++;
- left--;
- }
- }
-
- bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1);
- first = 0;
- proc_skip_char(&p, &left, '\n');
- }
- kfree(kbuf);
- left += skipped;
- } else {
- unsigned long bit_a, bit_b = 0;
-
- while (left) {
- bit_a = find_next_bit(bitmap, bitmap_len, bit_b);
- if (bit_a >= bitmap_len)
- break;
- bit_b = find_next_zero_bit(bitmap, bitmap_len,
- bit_a + 1) - 1;
-
- if (!first) {
- err = proc_put_char(&buffer, &left, ',');
- if (err)
- break;
- }
- err = proc_put_long(&buffer, &left, bit_a, false);
- if (err)
- break;
- if (bit_a != bit_b) {
- err = proc_put_char(&buffer, &left, '-');
- if (err)
- break;
- err = proc_put_long(&buffer, &left, bit_b, false);
- if (err)
- break;
- }
-
- first = 0; bit_b++;
- }
- if (!err)
- err = proc_put_char(&buffer, &left, '\n');
- }
-
- if (!err) {
- if (write) {
- if (*ppos)
- bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len);
- else
- bitmap_copy(bitmap, tmp_bitmap, bitmap_len);
- }
- *lenp -= left;
- *ppos += *lenp;
- }
-
- bitmap_free(tmp_bitmap);
- return err;
-}
-
-#else /* CONFIG_PROC_SYSCTL */
-
-int proc_dostring(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_dointvec(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_douintvec(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_dointvec_minmax(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_douintvec_minmax(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_dointvec_jiffies(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_doulongvec_minmax(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_do_large_bitmap(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-#endif /* CONFIG_PROC_SYSCTL */
-
-#if defined(CONFIG_SYSCTL)
-int proc_do_static_key(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
-{
- struct static_key *key = (struct static_key *)table->data;
- static DEFINE_MUTEX(static_key_mutex);
- int val, ret;
- struct ctl_table tmp = {
- .data = &val,
- .maxlen = sizeof(val),
- .mode = table->mode,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- };
-
- if (write && !capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- mutex_lock(&static_key_mutex);
- val = static_key_enabled(key);
- ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
- if (write && !ret) {
- if (val)
- static_key_enable(key);
- else
- static_key_disable(key);
- }
- mutex_unlock(&static_key_mutex);
- return ret;
-}
-#endif
/*
* No sense putting this after each symbol definition, twice,
* exception granted :-)
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 825f28259a19..5c0848ca1287 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -25,9 +25,10 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */
* 0 if succeeds or -ESRCH.
*/
int
-task_work_add(struct task_struct *task, struct callback_head *work, bool notify)
+task_work_add(struct task_struct *task, struct callback_head *work, int notify)
{
struct callback_head *head;
+ unsigned long flags;
do {
head = READ_ONCE(task->task_works);
@@ -36,8 +37,19 @@ task_work_add(struct task_struct *task, struct callback_head *work, bool notify)
work->next = head;
} while (cmpxchg(&task->task_works, head, work) != head);
- if (notify)
+ switch (notify) {
+ case TWA_RESUME:
set_notify_resume(task);
+ break;
+ case TWA_SIGNAL:
+ if (lock_task_sighand(task, &flags)) {
+ task->jobctl |= JOBCTL_TASK_WORK;
+ signal_wake_up(task, 0);
+ unlock_task_sighand(task, &flags);
+ }
+ break;
+ }
+
return 0;
}
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 7cb09c4cf21c..02441ead3c3b 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -928,14 +928,12 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
clocksource_arch_init(cs);
-#ifdef CONFIG_GENERIC_VDSO_CLOCK_MODE
if (cs->vdso_clock_mode < 0 ||
cs->vdso_clock_mode >= VDSO_CLOCKMODE_MAX) {
pr_warn("clocksource %s registered with invalid VDSO mode %d. Disabling VDSO support.\n",
cs->name, cs->vdso_clock_mode);
cs->vdso_clock_mode = VDSO_CLOCKMODE_NONE;
}
-#endif
/* Initialize mult/shift and max_idle_ns */
__clocksource_update_freq_scale(cs, scale, freq);
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index 53bce347cd50..5d9fc22d836a 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -280,8 +280,9 @@ static void timens_put(struct ns_common *ns)
put_time_ns(to_time_ns(ns));
}
-static int timens_install(struct nsproxy *nsproxy, struct ns_common *new)
+static int timens_install(struct nsset *nsset, struct ns_common *new)
{
+ struct nsproxy *nsproxy = nsset->nsproxy;
struct time_namespace *ns = to_time_ns(new);
int err;
@@ -289,7 +290,7 @@ static int timens_install(struct nsproxy *nsproxy, struct ns_common *new)
return -EUSERS;
if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
- !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
return -EPERM;
timens_set_vvar_page(current, ns);
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 2fd3b3fa68bf..165117996ea0 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -47,85 +47,65 @@ void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
/*
* Functions for validating access to tasks.
*/
-static struct task_struct *lookup_task(const pid_t pid, bool thread,
- bool gettime)
+static struct pid *pid_for_clock(const clockid_t clock, bool gettime)
{
- struct task_struct *p;
+ const bool thread = !!CPUCLOCK_PERTHREAD(clock);
+ const pid_t upid = CPUCLOCK_PID(clock);
+ struct pid *pid;
+
+ if (CPUCLOCK_WHICH(clock) >= CPUCLOCK_MAX)
+ return NULL;
/*
* If the encoded PID is 0, then the timer is targeted at current
* or the process to which current belongs.
*/
- if (!pid)
- return thread ? current : current->group_leader;
+ if (upid == 0)
+ return thread ? task_pid(current) : task_tgid(current);
- p = find_task_by_vpid(pid);
- if (!p)
- return p;
-
- if (thread)
- return same_thread_group(p, current) ? p : NULL;
+ pid = find_vpid(upid);
+ if (!pid)
+ return NULL;
- if (gettime) {
- /*
- * For clock_gettime(PROCESS) the task does not need to be
- * the actual group leader. tsk->sighand gives
- * access to the group's clock.
- *
- * Timers need the group leader because they take a
- * reference on it and store the task pointer until the
- * timer is destroyed.
- */
- return (p == current || thread_group_leader(p)) ? p : NULL;
+ if (thread) {
+ struct task_struct *tsk = pid_task(pid, PIDTYPE_PID);
+ return (tsk && same_thread_group(tsk, current)) ? pid : NULL;
}
/*
- * For processes require that p is group leader.
+ * For clock_gettime(PROCESS) allow finding the process by
+ * with the pid of the current task. The code needs the tgid
+ * of the process so that pid_task(pid, PIDTYPE_TGID) can be
+ * used to find the process.
*/
- return has_group_leader_pid(p) ? p : NULL;
+ if (gettime && (pid == task_pid(current)))
+ return task_tgid(current);
+
+ /*
+ * For processes require that pid identifies a process.
+ */
+ return pid_has_task(pid, PIDTYPE_TGID) ? pid : NULL;
}
-static struct task_struct *__get_task_for_clock(const clockid_t clock,
- bool getref, bool gettime)
+static inline int validate_clock_permissions(const clockid_t clock)
{
- const bool thread = !!CPUCLOCK_PERTHREAD(clock);
- const pid_t pid = CPUCLOCK_PID(clock);
- struct task_struct *p;
-
- if (CPUCLOCK_WHICH(clock) >= CPUCLOCK_MAX)
- return NULL;
+ int ret;
rcu_read_lock();
- p = lookup_task(pid, thread, gettime);
- if (p && getref)
- get_task_struct(p);
+ ret = pid_for_clock(clock, false) ? 0 : -EINVAL;
rcu_read_unlock();
- return p;
-}
-
-static inline struct task_struct *get_task_for_clock(const clockid_t clock)
-{
- return __get_task_for_clock(clock, true, false);
-}
-static inline struct task_struct *get_task_for_clock_get(const clockid_t clock)
-{
- return __get_task_for_clock(clock, true, true);
-}
-
-static inline int validate_clock_permissions(const clockid_t clock)
-{
- return __get_task_for_clock(clock, false, false) ? 0 : -EINVAL;
+ return ret;
}
-static inline enum pid_type cpu_timer_pid_type(struct k_itimer *timer)
+static inline enum pid_type clock_pid_type(const clockid_t clock)
{
- return CPUCLOCK_PERTHREAD(timer->it_clock) ? PIDTYPE_PID : PIDTYPE_TGID;
+ return CPUCLOCK_PERTHREAD(clock) ? PIDTYPE_PID : PIDTYPE_TGID;
}
static inline struct task_struct *cpu_timer_task_rcu(struct k_itimer *timer)
{
- return pid_task(timer->it.cpu.pid, cpu_timer_pid_type(timer));
+ return pid_task(timer->it.cpu.pid, clock_pid_type(timer->it_clock));
}
/*
@@ -373,15 +353,18 @@ static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp)
struct task_struct *tsk;
u64 t;
- tsk = get_task_for_clock_get(clock);
- if (!tsk)
+ rcu_read_lock();
+ tsk = pid_task(pid_for_clock(clock, true), clock_pid_type(clock));
+ if (!tsk) {
+ rcu_read_unlock();
return -EINVAL;
+ }
if (CPUCLOCK_PERTHREAD(clock))
t = cpu_clock_sample(clkid, tsk);
else
t = cpu_clock_sample_group(clkid, tsk, false);
- put_task_struct(tsk);
+ rcu_read_unlock();
*tp = ns_to_timespec64(t);
return 0;
@@ -394,19 +377,19 @@ static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp)
*/
static int posix_cpu_timer_create(struct k_itimer *new_timer)
{
- struct task_struct *p = get_task_for_clock(new_timer->it_clock);
+ struct pid *pid;
- if (!p)
+ rcu_read_lock();
+ pid = pid_for_clock(new_timer->it_clock, false);
+ if (!pid) {
+ rcu_read_unlock();
return -EINVAL;
+ }
new_timer->kclock = &clock_posix_cpu;
timerqueue_init(&new_timer->it.cpu.node);
- new_timer->it.cpu.pid = get_task_pid(p, cpu_timer_pid_type(new_timer));
- /*
- * get_task_for_clock() took a reference on @p. Drop it as the timer
- * holds a reference on the pid of @p.
- */
- put_task_struct(p);
+ new_timer->it.cpu.pid = get_pid(pid);
+ rcu_read_unlock();
return 0;
}
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 9ebaab13339d..d20d489841c8 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -953,7 +953,7 @@ EXPORT_SYMBOL_GPL(ktime_get_real_seconds);
* but without the sequence counter protect. This internal function
* is called just when timekeeping lock is already held.
*/
-time64_t __ktime_get_real_seconds(void)
+noinstr time64_t __ktime_get_real_seconds(void)
{
struct timekeeper *tk = &tk_core.timekeeper;
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index a5221abb4594..398e6eadb861 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -249,8 +249,7 @@ void timers_update_nohz(void)
}
int timer_migration_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 24876faac753..a4020c0b4508 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -249,15 +249,6 @@ config TRACE_PREEMPT_TOGGLE
Enables hooks which will be called when preemption is first disabled,
and last enabled.
-config PREEMPTIRQ_EVENTS
- bool "Enable trace events for preempt and irq disable/enable"
- select TRACE_IRQFLAGS
- select TRACE_PREEMPT_TOGGLE if PREEMPTION
- select GENERIC_TRACER
- default n
- help
- Enable tracing of disable and enable events for preemption and irqs.
-
config IRQSOFF_TRACER
bool "Interrupts-off Latency Tracer"
default n
@@ -614,12 +605,30 @@ config TRACING_MAP
generally used outside of that context, and is normally
selected by tracers that use it.
+config SYNTH_EVENTS
+ bool "Synthetic trace events"
+ select TRACING
+ select DYNAMIC_EVENTS
+ default n
+ help
+ Synthetic events are user-defined trace events that can be
+ used to combine data from other trace events or in fact any
+ data source. Synthetic events can be generated indirectly
+ via the trace() action of histogram triggers or directly
+ by way of an in-kernel API.
+
+ See Documentation/trace/events.rst or
+ Documentation/trace/histogram.rst for details and examples.
+
+ If in doubt, say N.
+
config HIST_TRIGGERS
bool "Histogram triggers"
depends on ARCH_HAVE_NMI_SAFE_CMPXCHG
select TRACING_MAP
select TRACING
select DYNAMIC_EVENTS
+ select SYNTH_EVENTS
default n
help
Hist triggers allow one or more arbitrary trace event fields
@@ -815,7 +824,7 @@ config PREEMPTIRQ_DELAY_TEST
config SYNTH_EVENT_GEN_TEST
tristate "Test module for in-kernel synthetic event generation"
- depends on HIST_TRIGGERS
+ depends on SYNTH_EVENTS
help
This option creates a test module to check the base
functionality of in-kernel synthetic event definition and
@@ -838,6 +847,29 @@ config KPROBE_EVENT_GEN_TEST
If unsure, say N.
+config HIST_TRIGGERS_DEBUG
+ bool "Hist trigger debug support"
+ depends on HIST_TRIGGERS
+ help
+ Add "hist_debug" file for each event, which when read will
+ dump out a bunch of internal details about the hist triggers
+ defined on that event.
+
+ The hist_debug file serves a couple of purposes:
+
+ - Helps developers verify that nothing is broken.
+
+ - Provides educational information to support the details
+ of the hist trigger internals as described by
+ Documentation/trace/histogram-design.rst.
+
+ The hist_debug output only covers the data structures
+ related to the histogram definitions themselves and doesn't
+ display the internals of map buckets or variable values of
+ running histograms.
+
+ If unsure, say N.
+
endif # FTRACE
endif # TRACING_SUPPORT
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index f9dcd19165fa..6575bb0a0434 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -6,6 +6,9 @@ ifdef CONFIG_FUNCTION_TRACER
ORIG_CFLAGS := $(KBUILD_CFLAGS)
KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS))
+# Avoid recursion due to instrumentation.
+KCSAN_SANITIZE := n
+
ifdef CONFIG_FTRACE_SELFTEST
# selftest needs instrumentation
CFLAGS_trace_selftest_dynamic.o = $(CC_FLAGS_FTRACE)
@@ -72,6 +75,7 @@ endif
obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o
+obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o
obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o
obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o
obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index ca39dc3230cb..5ef0484513ec 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -3,6 +3,9 @@
* Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
*
*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blktrace_api.h>
@@ -170,10 +173,10 @@ void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg,
if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
blkcg = NULL;
#ifdef CONFIG_BLK_CGROUP
- trace_note(bt, 0, BLK_TN_MESSAGE, buf, n,
+ trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n,
blkcg ? cgroup_id(blkcg->css.cgroup) : 1);
#else
- trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, 0);
+ trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, 0);
#endif
local_irq_restore(flags);
}
@@ -344,7 +347,8 @@ static int __blk_trace_remove(struct request_queue *q)
{
struct blk_trace *bt;
- bt = xchg(&q->blk_trace, NULL);
+ bt = rcu_replace_pointer(q->blk_trace, NULL,
+ lockdep_is_held(&q->blk_trace_mutex));
if (!bt)
return -EINVAL;
@@ -494,6 +498,17 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
*/
strreplace(buts->name, '/', '_');
+ /*
+ * bdev can be NULL, as with scsi-generic, this is a helpful as
+ * we can be.
+ */
+ if (rcu_dereference_protected(q->blk_trace,
+ lockdep_is_held(&q->blk_trace_mutex))) {
+ pr_warn("Concurrent blktraces are not allowed on %s\n",
+ buts->name);
+ return -EBUSY;
+ }
+
bt = kzalloc(sizeof(*bt), GFP_KERNEL);
if (!bt)
return -ENOMEM;
@@ -543,10 +558,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
bt->pid = buts->pid;
bt->trace_state = Blktrace_setup;
- ret = -EBUSY;
- if (cmpxchg(&q->blk_trace, NULL, bt))
- goto err;
-
+ rcu_assign_pointer(q->blk_trace, bt);
get_probe_ref();
ret = 0;
@@ -885,10 +897,10 @@ static void blk_add_trace_bio_bounce(void *ignore,
}
static void blk_add_trace_bio_complete(void *ignore,
- struct request_queue *q, struct bio *bio,
- int error)
+ struct request_queue *q, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
+ blk_add_trace_bio(q, bio, BLK_TA_COMPLETE,
+ blk_status_to_errno(bio->bi_status));
}
static void blk_add_trace_bio_backmerge(void *ignore,
@@ -995,8 +1007,10 @@ static void blk_add_trace_split(void *ignore,
__blk_add_trace(bt, bio->bi_iter.bi_sector,
bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf,
- BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu),
- &rpdu, blk_trace_bio_get_cgid(q, bio));
+ BLK_TA_SPLIT,
+ blk_status_to_errno(bio->bi_status),
+ sizeof(rpdu), &rpdu,
+ blk_trace_bio_get_cgid(q, bio));
}
rcu_read_unlock();
}
@@ -1033,7 +1047,8 @@ static void blk_add_trace_bio_remap(void *ignore,
r.sector_from = cpu_to_be64(from);
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
- bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status,
+ bio_op(bio), bio->bi_opf, BLK_TA_REMAP,
+ blk_status_to_errno(bio->bi_status),
sizeof(r), &r, blk_trace_bio_get_cgid(q, bio));
rcu_read_unlock();
}
@@ -1253,21 +1268,10 @@ static inline __u16 t_error(const struct trace_entry *ent)
static __u64 get_pdu_int(const struct trace_entry *ent, bool has_cg)
{
- const __u64 *val = pdu_start(ent, has_cg);
+ const __be64 *val = pdu_start(ent, has_cg);
return be64_to_cpu(*val);
}
-static void get_pdu_remap(const struct trace_entry *ent,
- struct blk_io_trace_remap *r, bool has_cg)
-{
- const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg);
- __u64 sector_from = __r->sector_from;
-
- r->device_from = be32_to_cpu(__r->device_from);
- r->device_to = be32_to_cpu(__r->device_to);
- r->sector_from = be64_to_cpu(sector_from);
-}
-
typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act,
bool has_cg);
@@ -1407,13 +1411,13 @@ static void blk_log_with_error(struct trace_seq *s,
static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
- struct blk_io_trace_remap r = { .device_from = 0, };
+ const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg);
- get_pdu_remap(ent, &r, has_cg);
trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
t_sector(ent), t_sec(ent),
- MAJOR(r.device_from), MINOR(r.device_from),
- (unsigned long long)r.sector_from);
+ MAJOR(be32_to_cpu(__r->device_from)),
+ MINOR(be32_to_cpu(__r->device_from)),
+ be64_to_cpu(__r->sector_from));
}
static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
@@ -1637,7 +1641,8 @@ static int blk_trace_remove_queue(struct request_queue *q)
{
struct blk_trace *bt;
- bt = xchg(&q->blk_trace, NULL);
+ bt = rcu_replace_pointer(q->blk_trace, NULL,
+ lockdep_is_held(&q->blk_trace_mutex));
if (bt == NULL)
return -EINVAL;
@@ -1669,10 +1674,7 @@ static int blk_trace_setup_queue(struct request_queue *q,
blk_trace_setup_lba(bt, bdev);
- ret = -EBUSY;
- if (cmpxchg(&q->blk_trace, NULL, bt))
- goto free_bt;
-
+ rcu_assign_pointer(q->blk_trace, bt);
get_probe_ref();
return 0;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 92ba69b716dc..7bc3d6175868 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -136,18 +136,24 @@ static const struct bpf_func_proto bpf_override_return_proto = {
};
#endif
-BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size,
- const void __user *, unsafe_ptr)
+static __always_inline int
+bpf_probe_read_user_common(void *dst, u32 size, const void __user *unsafe_ptr)
{
- int ret = probe_user_read(dst, unsafe_ptr, size);
+ int ret;
+ ret = copy_from_user_nofault(dst, unsafe_ptr, size);
if (unlikely(ret < 0))
memset(dst, 0, size);
-
return ret;
}
-static const struct bpf_func_proto bpf_probe_read_user_proto = {
+BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size,
+ const void __user *, unsafe_ptr)
+{
+ return bpf_probe_read_user_common(dst, size, unsafe_ptr);
+}
+
+const struct bpf_func_proto bpf_probe_read_user_proto = {
.func = bpf_probe_read_user,
.gpl_only = true,
.ret_type = RET_INTEGER,
@@ -156,18 +162,25 @@ static const struct bpf_func_proto bpf_probe_read_user_proto = {
.arg3_type = ARG_ANYTHING,
};
-BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size,
- const void __user *, unsafe_ptr)
+static __always_inline int
+bpf_probe_read_user_str_common(void *dst, u32 size,
+ const void __user *unsafe_ptr)
{
- int ret = strncpy_from_unsafe_user(dst, unsafe_ptr, size);
+ int ret;
+ ret = strncpy_from_user_nofault(dst, unsafe_ptr, size);
if (unlikely(ret < 0))
memset(dst, 0, size);
-
return ret;
}
-static const struct bpf_func_proto bpf_probe_read_user_str_proto = {
+BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size,
+ const void __user *, unsafe_ptr)
+{
+ return bpf_probe_read_user_str_common(dst, size, unsafe_ptr);
+}
+
+const struct bpf_func_proto bpf_probe_read_user_str_proto = {
.func = bpf_probe_read_user_str,
.gpl_only = true,
.ret_type = RET_INTEGER,
@@ -177,28 +190,28 @@ static const struct bpf_func_proto bpf_probe_read_user_str_proto = {
};
static __always_inline int
-bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr,
- const bool compat)
+bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr)
{
int ret = security_locked_down(LOCKDOWN_BPF_READ);
if (unlikely(ret < 0))
- goto out;
- ret = compat ? probe_kernel_read(dst, unsafe_ptr, size) :
- probe_kernel_read_strict(dst, unsafe_ptr, size);
+ goto fail;
+ ret = copy_from_kernel_nofault(dst, unsafe_ptr, size);
if (unlikely(ret < 0))
-out:
- memset(dst, 0, size);
+ goto fail;
+ return ret;
+fail:
+ memset(dst, 0, size);
return ret;
}
BPF_CALL_3(bpf_probe_read_kernel, void *, dst, u32, size,
const void *, unsafe_ptr)
{
- return bpf_probe_read_kernel_common(dst, size, unsafe_ptr, false);
+ return bpf_probe_read_kernel_common(dst, size, unsafe_ptr);
}
-static const struct bpf_func_proto bpf_probe_read_kernel_proto = {
+const struct bpf_func_proto bpf_probe_read_kernel_proto = {
.func = bpf_probe_read_kernel,
.gpl_only = true,
.ret_type = RET_INTEGER,
@@ -207,53 +220,40 @@ static const struct bpf_func_proto bpf_probe_read_kernel_proto = {
.arg3_type = ARG_ANYTHING,
};
-BPF_CALL_3(bpf_probe_read_compat, void *, dst, u32, size,
- const void *, unsafe_ptr)
-{
- return bpf_probe_read_kernel_common(dst, size, unsafe_ptr, true);
-}
-
-static const struct bpf_func_proto bpf_probe_read_compat_proto = {
- .func = bpf_probe_read_compat,
- .gpl_only = true,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_UNINIT_MEM,
- .arg2_type = ARG_CONST_SIZE_OR_ZERO,
- .arg3_type = ARG_ANYTHING,
-};
-
static __always_inline int
-bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr,
- const bool compat)
+bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr)
{
int ret = security_locked_down(LOCKDOWN_BPF_READ);
if (unlikely(ret < 0))
- goto out;
+ goto fail;
+
/*
- * The strncpy_from_unsafe_*() call will likely not fill the entire
- * buffer, but that's okay in this circumstance as we're probing
+ * The strncpy_from_kernel_nofault() call will likely not fill the
+ * entire buffer, but that's okay in this circumstance as we're probing
* arbitrary memory anyway similar to bpf_probe_read_*() and might
* as well probe the stack. Thus, memory is explicitly cleared
* only in error case, so that improper users ignoring return
* code altogether don't copy garbage; otherwise length of string
* is returned that can be used for bpf_perf_event_output() et al.
*/
- ret = compat ? strncpy_from_unsafe(dst, unsafe_ptr, size) :
- strncpy_from_unsafe_strict(dst, unsafe_ptr, size);
+ ret = strncpy_from_kernel_nofault(dst, unsafe_ptr, size);
if (unlikely(ret < 0))
-out:
- memset(dst, 0, size);
+ goto fail;
+
+ return ret;
+fail:
+ memset(dst, 0, size);
return ret;
}
BPF_CALL_3(bpf_probe_read_kernel_str, void *, dst, u32, size,
const void *, unsafe_ptr)
{
- return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr, false);
+ return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr);
}
-static const struct bpf_func_proto bpf_probe_read_kernel_str_proto = {
+const struct bpf_func_proto bpf_probe_read_kernel_str_proto = {
.func = bpf_probe_read_kernel_str,
.gpl_only = true,
.ret_type = RET_INTEGER,
@@ -262,10 +262,34 @@ static const struct bpf_func_proto bpf_probe_read_kernel_str_proto = {
.arg3_type = ARG_ANYTHING,
};
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+BPF_CALL_3(bpf_probe_read_compat, void *, dst, u32, size,
+ const void *, unsafe_ptr)
+{
+ if ((unsigned long)unsafe_ptr < TASK_SIZE) {
+ return bpf_probe_read_user_common(dst, size,
+ (__force void __user *)unsafe_ptr);
+ }
+ return bpf_probe_read_kernel_common(dst, size, unsafe_ptr);
+}
+
+static const struct bpf_func_proto bpf_probe_read_compat_proto = {
+ .func = bpf_probe_read_compat,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_ANYTHING,
+};
+
BPF_CALL_3(bpf_probe_read_compat_str, void *, dst, u32, size,
const void *, unsafe_ptr)
{
- return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr, true);
+ if ((unsigned long)unsafe_ptr < TASK_SIZE) {
+ return bpf_probe_read_user_str_common(dst, size,
+ (__force void __user *)unsafe_ptr);
+ }
+ return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr);
}
static const struct bpf_func_proto bpf_probe_read_compat_str_proto = {
@@ -276,6 +300,7 @@ static const struct bpf_func_proto bpf_probe_read_compat_str_proto = {
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
};
+#endif /* CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE */
BPF_CALL_3(bpf_probe_write_user, void __user *, unsafe_ptr, const void *, src,
u32, size)
@@ -301,7 +326,7 @@ BPF_CALL_3(bpf_probe_write_user, void __user *, unsafe_ptr, const void *, src,
if (unlikely(!nmi_uaccess_okay()))
return -EPERM;
- return probe_user_write(unsafe_ptr, src, size);
+ return copy_to_user_nofault(unsafe_ptr, src, size);
}
static const struct bpf_func_proto bpf_probe_write_user_proto = {
@@ -315,12 +340,40 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = {
static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
{
+ if (!capable(CAP_SYS_ADMIN))
+ return NULL;
+
pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!",
current->comm, task_pid_nr(current));
return &bpf_probe_write_user_proto;
}
+static void bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype,
+ size_t bufsz)
+{
+ void __user *user_ptr = (__force void __user *)unsafe_ptr;
+
+ buf[0] = 0;
+
+ switch (fmt_ptype) {
+ case 's':
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if ((unsigned long)unsafe_ptr < TASK_SIZE) {
+ strncpy_from_user_nofault(buf, user_ptr, bufsz);
+ break;
+ }
+ fallthrough;
+#endif
+ case 'k':
+ strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz);
+ break;
+ case 'u':
+ strncpy_from_user_nofault(buf, user_ptr, bufsz);
+ break;
+ }
+}
+
/*
* Only limited trace_printk() conversion specifiers allowed:
* %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %pks %pus %s
@@ -403,24 +456,8 @@ fmt_str:
break;
}
- buf[0] = 0;
- switch (fmt_ptype) {
- case 's':
-#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
- strncpy_from_unsafe(buf, unsafe_ptr,
- sizeof(buf));
- break;
-#endif
- case 'k':
- strncpy_from_unsafe_strict(buf, unsafe_ptr,
- sizeof(buf));
- break;
- case 'u':
- strncpy_from_unsafe_user(buf,
- (__force void __user *)unsafe_ptr,
- sizeof(buf));
- break;
- }
+ bpf_trace_copy_string(buf, unsafe_ptr, fmt_ptype,
+ sizeof(buf));
goto fmt_next;
}
@@ -487,6 +524,214 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
return &bpf_trace_printk_proto;
}
+#define MAX_SEQ_PRINTF_VARARGS 12
+#define MAX_SEQ_PRINTF_MAX_MEMCPY 6
+#define MAX_SEQ_PRINTF_STR_LEN 128
+
+struct bpf_seq_printf_buf {
+ char buf[MAX_SEQ_PRINTF_MAX_MEMCPY][MAX_SEQ_PRINTF_STR_LEN];
+};
+static DEFINE_PER_CPU(struct bpf_seq_printf_buf, bpf_seq_printf_buf);
+static DEFINE_PER_CPU(int, bpf_seq_printf_buf_used);
+
+BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
+ const void *, data, u32, data_len)
+{
+ int err = -EINVAL, fmt_cnt = 0, memcpy_cnt = 0;
+ int i, buf_used, copy_size, num_args;
+ u64 params[MAX_SEQ_PRINTF_VARARGS];
+ struct bpf_seq_printf_buf *bufs;
+ const u64 *args = data;
+
+ buf_used = this_cpu_inc_return(bpf_seq_printf_buf_used);
+ if (WARN_ON_ONCE(buf_used > 1)) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ bufs = this_cpu_ptr(&bpf_seq_printf_buf);
+
+ /*
+ * bpf_check()->check_func_arg()->check_stack_boundary()
+ * guarantees that fmt points to bpf program stack,
+ * fmt_size bytes of it were initialized and fmt_size > 0
+ */
+ if (fmt[--fmt_size] != 0)
+ goto out;
+
+ if (data_len & 7)
+ goto out;
+
+ for (i = 0; i < fmt_size; i++) {
+ if (fmt[i] == '%') {
+ if (fmt[i + 1] == '%')
+ i++;
+ else if (!data || !data_len)
+ goto out;
+ }
+ }
+
+ num_args = data_len / 8;
+
+ /* check format string for allowed specifiers */
+ for (i = 0; i < fmt_size; i++) {
+ /* only printable ascii for now. */
+ if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (fmt[i] != '%')
+ continue;
+
+ if (fmt[i + 1] == '%') {
+ i++;
+ continue;
+ }
+
+ if (fmt_cnt >= MAX_SEQ_PRINTF_VARARGS) {
+ err = -E2BIG;
+ goto out;
+ }
+
+ if (fmt_cnt >= num_args) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
+ i++;
+
+ /* skip optional "[0 +-][num]" width formating field */
+ while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' ||
+ fmt[i] == ' ')
+ i++;
+ if (fmt[i] >= '1' && fmt[i] <= '9') {
+ i++;
+ while (fmt[i] >= '0' && fmt[i] <= '9')
+ i++;
+ }
+
+ if (fmt[i] == 's') {
+ void *unsafe_ptr;
+
+ /* try our best to copy */
+ if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) {
+ err = -E2BIG;
+ goto out;
+ }
+
+ unsafe_ptr = (void *)(long)args[fmt_cnt];
+ err = strncpy_from_kernel_nofault(bufs->buf[memcpy_cnt],
+ unsafe_ptr, MAX_SEQ_PRINTF_STR_LEN);
+ if (err < 0)
+ bufs->buf[memcpy_cnt][0] = '\0';
+ params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt];
+
+ fmt_cnt++;
+ memcpy_cnt++;
+ continue;
+ }
+
+ if (fmt[i] == 'p') {
+ if (fmt[i + 1] == 0 ||
+ fmt[i + 1] == 'K' ||
+ fmt[i + 1] == 'x') {
+ /* just kernel pointers */
+ params[fmt_cnt] = args[fmt_cnt];
+ fmt_cnt++;
+ continue;
+ }
+
+ /* only support "%pI4", "%pi4", "%pI6" and "%pi6". */
+ if (fmt[i + 1] != 'i' && fmt[i + 1] != 'I') {
+ err = -EINVAL;
+ goto out;
+ }
+ if (fmt[i + 2] != '4' && fmt[i + 2] != '6') {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) {
+ err = -E2BIG;
+ goto out;
+ }
+
+
+ copy_size = (fmt[i + 2] == '4') ? 4 : 16;
+
+ err = copy_from_kernel_nofault(bufs->buf[memcpy_cnt],
+ (void *) (long) args[fmt_cnt],
+ copy_size);
+ if (err < 0)
+ memset(bufs->buf[memcpy_cnt], 0, copy_size);
+ params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt];
+
+ i += 2;
+ fmt_cnt++;
+ memcpy_cnt++;
+ continue;
+ }
+
+ if (fmt[i] == 'l') {
+ i++;
+ if (fmt[i] == 'l')
+ i++;
+ }
+
+ if (fmt[i] != 'i' && fmt[i] != 'd' &&
+ fmt[i] != 'u' && fmt[i] != 'x') {
+ err = -EINVAL;
+ goto out;
+ }
+
+ params[fmt_cnt] = args[fmt_cnt];
+ fmt_cnt++;
+ }
+
+ /* Maximumly we can have MAX_SEQ_PRINTF_VARARGS parameter, just give
+ * all of them to seq_printf().
+ */
+ seq_printf(m, fmt, params[0], params[1], params[2], params[3],
+ params[4], params[5], params[6], params[7], params[8],
+ params[9], params[10], params[11]);
+
+ err = seq_has_overflowed(m) ? -EOVERFLOW : 0;
+out:
+ this_cpu_dec(bpf_seq_printf_buf_used);
+ return err;
+}
+
+static int bpf_seq_printf_btf_ids[5];
+static const struct bpf_func_proto bpf_seq_printf_proto = {
+ .func = bpf_seq_printf,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_PTR_TO_MEM_OR_NULL,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
+ .btf_id = bpf_seq_printf_btf_ids,
+};
+
+BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len)
+{
+ return seq_write(m, data, len) ? -EOVERFLOW : 0;
+}
+
+static int bpf_seq_write_btf_ids[5];
+static const struct bpf_func_proto bpf_seq_write_proto = {
+ .func = bpf_seq_write,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .btf_id = bpf_seq_write_btf_ids,
+};
+
static __always_inline int
get_map_perf_counter(struct bpf_map *map, u64 flags,
u64 *value, u64 *enabled, u64 *running)
@@ -698,7 +943,7 @@ BPF_CALL_0(bpf_get_current_task)
return (long) current;
}
-static const struct bpf_func_proto bpf_get_current_task_proto = {
+const struct bpf_func_proto bpf_get_current_task_proto = {
.func = bpf_get_current_task,
.gpl_only = true,
.ret_type = RET_INTEGER,
@@ -827,6 +1072,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_map_peek_elem_proto;
case BPF_FUNC_ktime_get_ns:
return &bpf_ktime_get_ns_proto;
+ case BPF_FUNC_ktime_get_boot_ns:
+ return &bpf_ktime_get_boot_ns_proto;
case BPF_FUNC_tail_call:
return &bpf_tail_call_proto;
case BPF_FUNC_get_current_pid_tgid:
@@ -877,6 +1124,16 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_perf_event_read_value_proto;
case BPF_FUNC_get_ns_current_pid_tgid:
return &bpf_get_ns_current_pid_tgid_proto;
+ case BPF_FUNC_ringbuf_output:
+ return &bpf_ringbuf_output_proto;
+ case BPF_FUNC_ringbuf_reserve:
+ return &bpf_ringbuf_reserve_proto;
+ case BPF_FUNC_ringbuf_submit:
+ return &bpf_ringbuf_submit_proto;
+ case BPF_FUNC_ringbuf_discard:
+ return &bpf_ringbuf_discard_proto;
+ case BPF_FUNC_ringbuf_query:
+ return &bpf_ringbuf_query_proto;
default:
return NULL;
}
@@ -1246,7 +1503,7 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
}
}
-static const struct bpf_func_proto *
+const struct bpf_func_proto *
tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
@@ -1256,6 +1513,14 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_xdp_output:
return &bpf_xdp_output_proto;
#endif
+ case BPF_FUNC_seq_printf:
+ return prog->expected_attach_type == BPF_TRACE_ITER ?
+ &bpf_seq_printf_proto :
+ NULL;
+ case BPF_FUNC_seq_write:
+ return prog->expected_attach_type == BPF_TRACE_ITER ?
+ &bpf_seq_write_proto :
+ NULL;
default:
return raw_tp_prog_func_proto(func_id, prog);
}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b5765aeea698..1903b80db6eb 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2016,16 +2016,16 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec)
{
unsigned long ip = rec ? rec->ip : 0;
+ pr_info("------------[ ftrace bug ]------------\n");
+
switch (failed) {
case -EFAULT:
- FTRACE_WARN_ON_ONCE(1);
pr_info("ftrace faulted on modifying ");
- print_ip_sym(ip);
+ print_ip_sym(KERN_INFO, ip);
break;
case -EINVAL:
- FTRACE_WARN_ON_ONCE(1);
pr_info("ftrace failed to modify ");
- print_ip_sym(ip);
+ print_ip_sym(KERN_INFO, ip);
print_ip_ins(" actual: ", (unsigned char *)ip);
pr_cont("\n");
if (ftrace_expected) {
@@ -2034,14 +2034,12 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec)
}
break;
case -EPERM:
- FTRACE_WARN_ON_ONCE(1);
pr_info("ftrace faulted on writing ");
- print_ip_sym(ip);
+ print_ip_sym(KERN_INFO, ip);
break;
default:
- FTRACE_WARN_ON_ONCE(1);
pr_info("ftrace faulted on unknown error ");
- print_ip_sym(ip);
+ print_ip_sym(KERN_INFO, ip);
}
print_bug_type();
if (rec) {
@@ -2066,6 +2064,8 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec)
ip = ftrace_get_addr_curr(rec);
pr_cont("\n expected tramp: %lx\n", ip);
}
+
+ FTRACE_WARN_ON_ONCE(1);
}
static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
@@ -2260,7 +2260,7 @@ ftrace_find_tramp_ops_next(struct dyn_ftrace *rec,
if (hash_contains_ip(ip, op->func_hash))
return op;
- }
+ }
return NULL;
}
@@ -3599,7 +3599,7 @@ static int t_show(struct seq_file *m, void *v)
if (direct)
seq_printf(m, "\n\tdirect-->%pS", (void *)direct);
}
- }
+ }
seq_putc(m, '\n');
@@ -7151,6 +7151,10 @@ static int pid_open(struct inode *inode, struct file *file, int type)
case TRACE_NO_PIDS:
seq_ops = &ftrace_no_pid_sops;
break;
+ default:
+ trace_array_put(tr);
+ WARN_ON_ONCE(1);
+ return -EINVAL;
}
ret = seq_open(file, seq_ops);
@@ -7229,6 +7233,10 @@ pid_write(struct file *filp, const char __user *ubuf,
other_pids = rcu_dereference_protected(tr->function_pids,
lockdep_is_held(&ftrace_lock));
break;
+ default:
+ ret = -EINVAL;
+ WARN_ON_ONCE(1);
+ goto out;
}
ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b8e1ca48be50..00867ff82412 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2427,7 +2427,7 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
if (unlikely(info->add_timestamp)) {
bool abs = ring_buffer_time_stamp_abs(cpu_buffer->buffer);
- event = rb_add_time_stamp(event, info->delta, abs);
+ event = rb_add_time_stamp(event, abs ? info->delta : delta, abs);
length -= RB_LEN_TIME_EXTEND;
delta = 0;
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 29615f15a820..bb62269724d5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1299,8 +1299,11 @@ EXPORT_SYMBOL_GPL(tracing_off);
void disable_trace_on_warning(void)
{
- if (__disable_trace_on_warning)
+ if (__disable_trace_on_warning) {
+ trace_array_printk_buf(global_trace.array_buffer.buffer, _THIS_IP_,
+ "Disabling tracing due to warning\n");
tracing_off();
+ }
}
/**
@@ -2662,7 +2665,7 @@ static void output_printk(struct trace_event_buffer *fbuffer)
}
int tracepoint_printk_sysctl(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
+ void *buffer, size_t *lenp,
loff_t *ppos)
{
int save_tracepoint_printk;
@@ -3567,7 +3570,6 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
void tracing_iter_reset(struct trace_iterator *iter, int cpu)
{
- struct ring_buffer_event *event;
struct ring_buffer_iter *buf_iter;
unsigned long entries = 0;
u64 ts;
@@ -3585,7 +3587,7 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
* that a reset never took place on a cpu. This is evident
* by the timestamp being before the start of the buffer.
*/
- while ((event = ring_buffer_iter_peek(buf_iter, &ts))) {
+ while (ring_buffer_iter_peek(buf_iter, &ts)) {
if (ts >= iter->array_buffer->time_start)
break;
entries++;
@@ -6305,13 +6307,6 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,
__free_page(spd->pages[idx]);
}
-static const struct pipe_buf_operations tracing_pipe_buf_ops = {
- .confirm = generic_pipe_buf_confirm,
- .release = generic_pipe_buf_release,
- .steal = generic_pipe_buf_steal,
- .get = generic_pipe_buf_get,
-};
-
static size_t
tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
{
@@ -6373,7 +6368,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
.partial = partial_def,
.nr_pages = 0, /* This gets updated below. */
.nr_pages_max = PIPE_DEF_BUFFERS,
- .ops = &tracing_pipe_buf_ops,
+ .ops = &default_pipe_buf_ops,
.spd_release = tracing_spd_release_pipe,
};
ssize_t ret;
@@ -7582,9 +7577,7 @@ static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe,
/* Pipe buffer operations for a buffer. */
static const struct pipe_buf_operations buffer_pipe_buf_ops = {
- .confirm = generic_pipe_buf_confirm,
.release = buffer_pipe_buf_release,
- .steal = generic_pipe_buf_nosteal,
.get = buffer_pipe_buf_get,
};
@@ -8527,18 +8520,6 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
allocate_snapshot = false;
#endif
- /*
- * Because of some magic with the way alloc_percpu() works on
- * x86_64, we need to synchronize the pgd of all the tables,
- * otherwise the trace events that happen in x86_64 page fault
- * handlers can't cope with accessing the chance that a
- * alloc_percpu()'d memory might be touched in the page fault trace
- * event. Oh, and we need to audit all other alloc_percpu() and vmalloc()
- * calls in tracing, because something might get triggered within a
- * page fault trace event!
- */
- vmalloc_sync_mappings();
-
return 0;
}
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 4eb1d004d5f2..13db4000af3f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -61,6 +61,9 @@ enum trace_type {
#undef __field_desc
#define __field_desc(type, container, item)
+#undef __field_packed
+#define __field_packed(type, container, item)
+
#undef __array
#define __array(type, item, size) type item[size];
@@ -1661,6 +1664,7 @@ extern struct list_head ftrace_events;
extern const struct file_operations event_trigger_fops;
extern const struct file_operations event_hist_fops;
+extern const struct file_operations event_hist_debug_fops;
extern const struct file_operations event_inject_fops;
#ifdef CONFIG_HIST_TRIGGERS
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 9de29bb45a27..fa0fc08c6ef8 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -101,12 +101,16 @@ trace_boot_add_kprobe_event(struct xbc_node *node, const char *event)
kprobe_event_cmd_init(&cmd, buf, MAX_BUF_LEN);
ret = kprobe_event_gen_cmd_start(&cmd, event, val);
- if (ret)
+ if (ret) {
+ pr_err("Failed to generate probe: %s\n", buf);
break;
+ }
ret = kprobe_event_gen_cmd_end(&cmd);
- if (ret)
+ if (ret) {
pr_err("Failed to add probe: %s\n", buf);
+ break;
+ }
}
return ret;
@@ -120,7 +124,7 @@ trace_boot_add_kprobe_event(struct xbc_node *node, const char *event)
}
#endif
-#ifdef CONFIG_HIST_TRIGGERS
+#ifdef CONFIG_SYNTH_EVENTS
static int __init
trace_boot_add_synth_event(struct xbc_node *node, const char *event)
{
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index a523da0dae0a..18c4a58aff79 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -78,8 +78,8 @@ FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry,
F_STRUCT(
__field_struct( struct ftrace_graph_ent, graph_ent )
- __field_desc( unsigned long, graph_ent, func )
- __field_desc( int, graph_ent, depth )
+ __field_packed( unsigned long, graph_ent, func )
+ __field_packed( int, graph_ent, depth )
),
F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth)
@@ -92,11 +92,11 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
F_STRUCT(
__field_struct( struct ftrace_graph_ret, ret )
- __field_desc( unsigned long, ret, func )
- __field_desc( unsigned long, ret, overrun )
- __field_desc( unsigned long long, ret, calltime)
- __field_desc( unsigned long long, ret, rettime )
- __field_desc( int, ret, depth )
+ __field_packed( unsigned long, ret, func )
+ __field_packed( unsigned long, ret, overrun )
+ __field_packed( unsigned long long, ret, calltime)
+ __field_packed( unsigned long long, ret, rettime )
+ __field_packed( int, ret, depth )
),
F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d",
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 242f59e7f17d..f6f55682d3e2 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2209,6 +2209,10 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
trace_create_file("hist", 0444, file->dir, file,
&event_hist_fops);
#endif
+#ifdef CONFIG_HIST_TRIGGERS_DEBUG
+ trace_create_file("hist_debug", 0444, file->dir, file,
+ &event_hist_debug_fops);
+#endif
trace_create_file("format", 0444, file->dir, call,
&ftrace_event_format_fops);
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index fcab11cc6833..0b933546142e 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -19,13 +19,7 @@
#include <trace/events/mmflags.h>
#include "tracing_map.h"
-#include "trace.h"
-#include "trace_dynevent.h"
-
-#define SYNTH_SYSTEM "synthetic"
-#define SYNTH_FIELDS_MAX 32
-
-#define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */
+#include "trace_synth.h"
#define ERRORS \
C(NONE, "No error"), \
@@ -380,69 +374,6 @@ struct hist_trigger_data {
unsigned int n_save_var_str;
};
-static int create_synth_event(int argc, const char **argv);
-static int synth_event_show(struct seq_file *m, struct dyn_event *ev);
-static int synth_event_release(struct dyn_event *ev);
-static bool synth_event_is_busy(struct dyn_event *ev);
-static bool synth_event_match(const char *system, const char *event,
- int argc, const char **argv, struct dyn_event *ev);
-
-static struct dyn_event_operations synth_event_ops = {
- .create = create_synth_event,
- .show = synth_event_show,
- .is_busy = synth_event_is_busy,
- .free = synth_event_release,
- .match = synth_event_match,
-};
-
-struct synth_field {
- char *type;
- char *name;
- size_t size;
- unsigned int offset;
- bool is_signed;
- bool is_string;
-};
-
-struct synth_event {
- struct dyn_event devent;
- int ref;
- char *name;
- struct synth_field **fields;
- unsigned int n_fields;
- unsigned int n_u64;
- struct trace_event_class class;
- struct trace_event_call call;
- struct tracepoint *tp;
- struct module *mod;
-};
-
-static bool is_synth_event(struct dyn_event *ev)
-{
- return ev->ops == &synth_event_ops;
-}
-
-static struct synth_event *to_synth_event(struct dyn_event *ev)
-{
- return container_of(ev, struct synth_event, devent);
-}
-
-static bool synth_event_is_busy(struct dyn_event *ev)
-{
- struct synth_event *event = to_synth_event(ev);
-
- return event->ref != 0;
-}
-
-static bool synth_event_match(const char *system, const char *event,
- int argc, const char **argv, struct dyn_event *ev)
-{
- struct synth_event *sev = to_synth_event(ev);
-
- return strcmp(sev->name, event) == 0 &&
- (!system || strcmp(system, SYNTH_SYSTEM) == 0);
-}
-
struct action_data;
typedef void (*action_fn_t) (struct hist_trigger_data *hist_data,
@@ -589,6 +520,7 @@ static struct track_data *track_data_alloc(unsigned int key_len,
track_data_free(data);
return ERR_PTR(-ENOMEM);
}
+
data->elt.private_data = elt_data;
elt_data->comm = kzalloc(TASK_COMM_LEN, GFP_KERNEL);
@@ -621,7 +553,6 @@ static void last_cmd_set(struct trace_event_file *file, char *str)
if (file) {
call = file->event_call;
-
system = call->class->system;
if (system) {
name = trace_event_name(call);
@@ -646,510 +577,6 @@ static void hist_err_clear(void)
last_cmd_loc[0] = '\0';
}
-struct synth_trace_event {
- struct trace_entry ent;
- u64 fields[];
-};
-
-static int synth_event_define_fields(struct trace_event_call *call)
-{
- struct synth_trace_event trace;
- int offset = offsetof(typeof(trace), fields);
- struct synth_event *event = call->data;
- unsigned int i, size, n_u64;
- char *name, *type;
- bool is_signed;
- int ret = 0;
-
- for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
- size = event->fields[i]->size;
- is_signed = event->fields[i]->is_signed;
- type = event->fields[i]->type;
- name = event->fields[i]->name;
- ret = trace_define_field(call, type, name, offset, size,
- is_signed, FILTER_OTHER);
- if (ret)
- break;
-
- event->fields[i]->offset = n_u64;
-
- if (event->fields[i]->is_string) {
- offset += STR_VAR_LEN_MAX;
- n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
- } else {
- offset += sizeof(u64);
- n_u64++;
- }
- }
-
- event->n_u64 = n_u64;
-
- return ret;
-}
-
-static bool synth_field_signed(char *type)
-{
- if (str_has_prefix(type, "u"))
- return false;
- if (strcmp(type, "gfp_t") == 0)
- return false;
-
- return true;
-}
-
-static int synth_field_is_string(char *type)
-{
- if (strstr(type, "char[") != NULL)
- return true;
-
- return false;
-}
-
-static int synth_field_string_size(char *type)
-{
- char buf[4], *end, *start;
- unsigned int len;
- int size, err;
-
- start = strstr(type, "char[");
- if (start == NULL)
- return -EINVAL;
- start += sizeof("char[") - 1;
-
- end = strchr(type, ']');
- if (!end || end < start)
- return -EINVAL;
-
- len = end - start;
- if (len > 3)
- return -EINVAL;
-
- strncpy(buf, start, len);
- buf[len] = '\0';
-
- err = kstrtouint(buf, 0, &size);
- if (err)
- return err;
-
- if (size > STR_VAR_LEN_MAX)
- return -EINVAL;
-
- return size;
-}
-
-static int synth_field_size(char *type)
-{
- int size = 0;
-
- if (strcmp(type, "s64") == 0)
- size = sizeof(s64);
- else if (strcmp(type, "u64") == 0)
- size = sizeof(u64);
- else if (strcmp(type, "s32") == 0)
- size = sizeof(s32);
- else if (strcmp(type, "u32") == 0)
- size = sizeof(u32);
- else if (strcmp(type, "s16") == 0)
- size = sizeof(s16);
- else if (strcmp(type, "u16") == 0)
- size = sizeof(u16);
- else if (strcmp(type, "s8") == 0)
- size = sizeof(s8);
- else if (strcmp(type, "u8") == 0)
- size = sizeof(u8);
- else if (strcmp(type, "char") == 0)
- size = sizeof(char);
- else if (strcmp(type, "unsigned char") == 0)
- size = sizeof(unsigned char);
- else if (strcmp(type, "int") == 0)
- size = sizeof(int);
- else if (strcmp(type, "unsigned int") == 0)
- size = sizeof(unsigned int);
- else if (strcmp(type, "long") == 0)
- size = sizeof(long);
- else if (strcmp(type, "unsigned long") == 0)
- size = sizeof(unsigned long);
- else if (strcmp(type, "pid_t") == 0)
- size = sizeof(pid_t);
- else if (strcmp(type, "gfp_t") == 0)
- size = sizeof(gfp_t);
- else if (synth_field_is_string(type))
- size = synth_field_string_size(type);
-
- return size;
-}
-
-static const char *synth_field_fmt(char *type)
-{
- const char *fmt = "%llu";
-
- if (strcmp(type, "s64") == 0)
- fmt = "%lld";
- else if (strcmp(type, "u64") == 0)
- fmt = "%llu";
- else if (strcmp(type, "s32") == 0)
- fmt = "%d";
- else if (strcmp(type, "u32") == 0)
- fmt = "%u";
- else if (strcmp(type, "s16") == 0)
- fmt = "%d";
- else if (strcmp(type, "u16") == 0)
- fmt = "%u";
- else if (strcmp(type, "s8") == 0)
- fmt = "%d";
- else if (strcmp(type, "u8") == 0)
- fmt = "%u";
- else if (strcmp(type, "char") == 0)
- fmt = "%d";
- else if (strcmp(type, "unsigned char") == 0)
- fmt = "%u";
- else if (strcmp(type, "int") == 0)
- fmt = "%d";
- else if (strcmp(type, "unsigned int") == 0)
- fmt = "%u";
- else if (strcmp(type, "long") == 0)
- fmt = "%ld";
- else if (strcmp(type, "unsigned long") == 0)
- fmt = "%lu";
- else if (strcmp(type, "pid_t") == 0)
- fmt = "%d";
- else if (strcmp(type, "gfp_t") == 0)
- fmt = "%x";
- else if (synth_field_is_string(type))
- fmt = "%s";
-
- return fmt;
-}
-
-static void print_synth_event_num_val(struct trace_seq *s,
- char *print_fmt, char *name,
- int size, u64 val, char *space)
-{
- switch (size) {
- case 1:
- trace_seq_printf(s, print_fmt, name, (u8)val, space);
- break;
-
- case 2:
- trace_seq_printf(s, print_fmt, name, (u16)val, space);
- break;
-
- case 4:
- trace_seq_printf(s, print_fmt, name, (u32)val, space);
- break;
-
- default:
- trace_seq_printf(s, print_fmt, name, val, space);
- break;
- }
-}
-
-static enum print_line_t print_synth_event(struct trace_iterator *iter,
- int flags,
- struct trace_event *event)
-{
- struct trace_array *tr = iter->tr;
- struct trace_seq *s = &iter->seq;
- struct synth_trace_event *entry;
- struct synth_event *se;
- unsigned int i, n_u64;
- char print_fmt[32];
- const char *fmt;
-
- entry = (struct synth_trace_event *)iter->ent;
- se = container_of(event, struct synth_event, call.event);
-
- trace_seq_printf(s, "%s: ", se->name);
-
- for (i = 0, n_u64 = 0; i < se->n_fields; i++) {
- if (trace_seq_has_overflowed(s))
- goto end;
-
- fmt = synth_field_fmt(se->fields[i]->type);
-
- /* parameter types */
- if (tr && tr->trace_flags & TRACE_ITER_VERBOSE)
- trace_seq_printf(s, "%s ", fmt);
-
- snprintf(print_fmt, sizeof(print_fmt), "%%s=%s%%s", fmt);
-
- /* parameter values */
- if (se->fields[i]->is_string) {
- trace_seq_printf(s, print_fmt, se->fields[i]->name,
- (char *)&entry->fields[n_u64],
- i == se->n_fields - 1 ? "" : " ");
- n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
- } else {
- struct trace_print_flags __flags[] = {
- __def_gfpflag_names, {-1, NULL} };
- char *space = (i == se->n_fields - 1 ? "" : " ");
-
- print_synth_event_num_val(s, print_fmt,
- se->fields[i]->name,
- se->fields[i]->size,
- entry->fields[n_u64],
- space);
-
- if (strcmp(se->fields[i]->type, "gfp_t") == 0) {
- trace_seq_puts(s, " (");
- trace_print_flags_seq(s, "|",
- entry->fields[n_u64],
- __flags);
- trace_seq_putc(s, ')');
- }
- n_u64++;
- }
- }
-end:
- trace_seq_putc(s, '\n');
-
- return trace_handle_return(s);
-}
-
-static struct trace_event_functions synth_event_funcs = {
- .trace = print_synth_event
-};
-
-static notrace void trace_event_raw_event_synth(void *__data,
- u64 *var_ref_vals,
- unsigned int *var_ref_idx)
-{
- struct trace_event_file *trace_file = __data;
- struct synth_trace_event *entry;
- struct trace_event_buffer fbuffer;
- struct trace_buffer *buffer;
- struct synth_event *event;
- unsigned int i, n_u64, val_idx;
- int fields_size = 0;
-
- event = trace_file->event_call->data;
-
- if (trace_trigger_soft_disabled(trace_file))
- return;
-
- fields_size = event->n_u64 * sizeof(u64);
-
- /*
- * Avoid ring buffer recursion detection, as this event
- * is being performed within another event.
- */
- buffer = trace_file->tr->array_buffer.buffer;
- ring_buffer_nest_start(buffer);
-
- entry = trace_event_buffer_reserve(&fbuffer, trace_file,
- sizeof(*entry) + fields_size);
- if (!entry)
- goto out;
-
- for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
- val_idx = var_ref_idx[i];
- if (event->fields[i]->is_string) {
- char *str_val = (char *)(long)var_ref_vals[val_idx];
- char *str_field = (char *)&entry->fields[n_u64];
-
- strscpy(str_field, str_val, STR_VAR_LEN_MAX);
- n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
- } else {
- struct synth_field *field = event->fields[i];
- u64 val = var_ref_vals[val_idx];
-
- switch (field->size) {
- case 1:
- *(u8 *)&entry->fields[n_u64] = (u8)val;
- break;
-
- case 2:
- *(u16 *)&entry->fields[n_u64] = (u16)val;
- break;
-
- case 4:
- *(u32 *)&entry->fields[n_u64] = (u32)val;
- break;
-
- default:
- entry->fields[n_u64] = val;
- break;
- }
- n_u64++;
- }
- }
-
- trace_event_buffer_commit(&fbuffer);
-out:
- ring_buffer_nest_end(buffer);
-}
-
-static void free_synth_event_print_fmt(struct trace_event_call *call)
-{
- if (call) {
- kfree(call->print_fmt);
- call->print_fmt = NULL;
- }
-}
-
-static int __set_synth_event_print_fmt(struct synth_event *event,
- char *buf, int len)
-{
- const char *fmt;
- int pos = 0;
- int i;
-
- /* When len=0, we just calculate the needed length */
-#define LEN_OR_ZERO (len ? len - pos : 0)
-
- pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
- for (i = 0; i < event->n_fields; i++) {
- fmt = synth_field_fmt(event->fields[i]->type);
- pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s%s",
- event->fields[i]->name, fmt,
- i == event->n_fields - 1 ? "" : ", ");
- }
- pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
-
- for (i = 0; i < event->n_fields; i++) {
- pos += snprintf(buf + pos, LEN_OR_ZERO,
- ", REC->%s", event->fields[i]->name);
- }
-
-#undef LEN_OR_ZERO
-
- /* return the length of print_fmt */
- return pos;
-}
-
-static int set_synth_event_print_fmt(struct trace_event_call *call)
-{
- struct synth_event *event = call->data;
- char *print_fmt;
- int len;
-
- /* First: called with 0 length to calculate the needed length */
- len = __set_synth_event_print_fmt(event, NULL, 0);
-
- print_fmt = kmalloc(len + 1, GFP_KERNEL);
- if (!print_fmt)
- return -ENOMEM;
-
- /* Second: actually write the @print_fmt */
- __set_synth_event_print_fmt(event, print_fmt, len + 1);
- call->print_fmt = print_fmt;
-
- return 0;
-}
-
-static void free_synth_field(struct synth_field *field)
-{
- kfree(field->type);
- kfree(field->name);
- kfree(field);
-}
-
-static struct synth_field *parse_synth_field(int argc, const char **argv,
- int *consumed)
-{
- struct synth_field *field;
- const char *prefix = NULL, *field_type = argv[0], *field_name, *array;
- int len, ret = 0;
-
- if (field_type[0] == ';')
- field_type++;
-
- if (!strcmp(field_type, "unsigned")) {
- if (argc < 3)
- return ERR_PTR(-EINVAL);
- prefix = "unsigned ";
- field_type = argv[1];
- field_name = argv[2];
- *consumed = 3;
- } else {
- field_name = argv[1];
- *consumed = 2;
- }
-
- field = kzalloc(sizeof(*field), GFP_KERNEL);
- if (!field)
- return ERR_PTR(-ENOMEM);
-
- len = strlen(field_name);
- array = strchr(field_name, '[');
- if (array)
- len -= strlen(array);
- else if (field_name[len - 1] == ';')
- len--;
-
- field->name = kmemdup_nul(field_name, len, GFP_KERNEL);
- if (!field->name) {
- ret = -ENOMEM;
- goto free;
- }
-
- if (field_type[0] == ';')
- field_type++;
- len = strlen(field_type) + 1;
- if (array)
- len += strlen(array);
- if (prefix)
- len += strlen(prefix);
-
- field->type = kzalloc(len, GFP_KERNEL);
- if (!field->type) {
- ret = -ENOMEM;
- goto free;
- }
- if (prefix)
- strcat(field->type, prefix);
- strcat(field->type, field_type);
- if (array) {
- strcat(field->type, array);
- if (field->type[len - 1] == ';')
- field->type[len - 1] = '\0';
- }
-
- field->size = synth_field_size(field->type);
- if (!field->size) {
- ret = -EINVAL;
- goto free;
- }
-
- if (synth_field_is_string(field->type))
- field->is_string = true;
-
- field->is_signed = synth_field_signed(field->type);
-
- out:
- return field;
- free:
- free_synth_field(field);
- field = ERR_PTR(ret);
- goto out;
-}
-
-static void free_synth_tracepoint(struct tracepoint *tp)
-{
- if (!tp)
- return;
-
- kfree(tp->name);
- kfree(tp);
-}
-
-static struct tracepoint *alloc_synth_tracepoint(char *name)
-{
- struct tracepoint *tp;
-
- tp = kzalloc(sizeof(*tp), GFP_KERNEL);
- if (!tp)
- return ERR_PTR(-ENOMEM);
-
- tp->name = kstrdup(name, GFP_KERNEL);
- if (!tp->name) {
- kfree(tp);
- return ERR_PTR(-ENOMEM);
- }
-
- return tp;
-}
-
typedef void (*synth_probe_func_t) (void *__data, u64 *var_ref_vals,
unsigned int *var_ref_idx);
@@ -1177,145 +604,6 @@ static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals,
}
}
-static struct synth_event *find_synth_event(const char *name)
-{
- struct dyn_event *pos;
- struct synth_event *event;
-
- for_each_dyn_event(pos) {
- if (!is_synth_event(pos))
- continue;
- event = to_synth_event(pos);
- if (strcmp(event->name, name) == 0)
- return event;
- }
-
- return NULL;
-}
-
-static struct trace_event_fields synth_event_fields_array[] = {
- { .type = TRACE_FUNCTION_TYPE,
- .define_fields = synth_event_define_fields },
- {}
-};
-
-static int register_synth_event(struct synth_event *event)
-{
- struct trace_event_call *call = &event->call;
- int ret = 0;
-
- event->call.class = &event->class;
- event->class.system = kstrdup(SYNTH_SYSTEM, GFP_KERNEL);
- if (!event->class.system) {
- ret = -ENOMEM;
- goto out;
- }
-
- event->tp = alloc_synth_tracepoint(event->name);
- if (IS_ERR(event->tp)) {
- ret = PTR_ERR(event->tp);
- event->tp = NULL;
- goto out;
- }
-
- INIT_LIST_HEAD(&call->class->fields);
- call->event.funcs = &synth_event_funcs;
- call->class->fields_array = synth_event_fields_array;
-
- ret = register_trace_event(&call->event);
- if (!ret) {
- ret = -ENODEV;
- goto out;
- }
- call->flags = TRACE_EVENT_FL_TRACEPOINT;
- call->class->reg = trace_event_reg;
- call->class->probe = trace_event_raw_event_synth;
- call->data = event;
- call->tp = event->tp;
-
- ret = trace_add_event_call(call);
- if (ret) {
- pr_warn("Failed to register synthetic event: %s\n",
- trace_event_name(call));
- goto err;
- }
-
- ret = set_synth_event_print_fmt(call);
- if (ret < 0) {
- trace_remove_event_call(call);
- goto err;
- }
- out:
- return ret;
- err:
- unregister_trace_event(&call->event);
- goto out;
-}
-
-static int unregister_synth_event(struct synth_event *event)
-{
- struct trace_event_call *call = &event->call;
- int ret;
-
- ret = trace_remove_event_call(call);
-
- return ret;
-}
-
-static void free_synth_event(struct synth_event *event)
-{
- unsigned int i;
-
- if (!event)
- return;
-
- for (i = 0; i < event->n_fields; i++)
- free_synth_field(event->fields[i]);
-
- kfree(event->fields);
- kfree(event->name);
- kfree(event->class.system);
- free_synth_tracepoint(event->tp);
- free_synth_event_print_fmt(&event->call);
- kfree(event);
-}
-
-static struct synth_event *alloc_synth_event(const char *name, int n_fields,
- struct synth_field **fields)
-{
- struct synth_event *event;
- unsigned int i;
-
- event = kzalloc(sizeof(*event), GFP_KERNEL);
- if (!event) {
- event = ERR_PTR(-ENOMEM);
- goto out;
- }
-
- event->name = kstrdup(name, GFP_KERNEL);
- if (!event->name) {
- kfree(event);
- event = ERR_PTR(-ENOMEM);
- goto out;
- }
-
- event->fields = kcalloc(n_fields, sizeof(*event->fields), GFP_KERNEL);
- if (!event->fields) {
- free_synth_event(event);
- event = ERR_PTR(-ENOMEM);
- goto out;
- }
-
- dyn_event_init(&event->devent, &synth_event_ops);
-
- for (i = 0; i < n_fields; i++)
- event->fields[i] = fields[i];
-
- event->n_fields = n_fields;
- out:
- return event;
-}
-
static void action_trace(struct hist_trigger_data *hist_data,
struct tracing_map_elt *elt, void *rec,
struct ring_buffer_event *rbe, void *key,
@@ -1331,1056 +619,6 @@ struct hist_var_data {
struct hist_trigger_data *hist_data;
};
-static int synth_event_check_arg_fn(void *data)
-{
- struct dynevent_arg_pair *arg_pair = data;
- int size;
-
- size = synth_field_size((char *)arg_pair->lhs);
-
- return size ? 0 : -EINVAL;
-}
-
-/**
- * synth_event_add_field - Add a new field to a synthetic event cmd
- * @cmd: A pointer to the dynevent_cmd struct representing the new event
- * @type: The type of the new field to add
- * @name: The name of the new field to add
- *
- * Add a new field to a synthetic event cmd object. Field ordering is in
- * the same order the fields are added.
- *
- * See synth_field_size() for available types. If field_name contains
- * [n] the field is considered to be an array.
- *
- * Return: 0 if successful, error otherwise.
- */
-int synth_event_add_field(struct dynevent_cmd *cmd, const char *type,
- const char *name)
-{
- struct dynevent_arg_pair arg_pair;
- int ret;
-
- if (cmd->type != DYNEVENT_TYPE_SYNTH)
- return -EINVAL;
-
- if (!type || !name)
- return -EINVAL;
-
- dynevent_arg_pair_init(&arg_pair, 0, ';');
-
- arg_pair.lhs = type;
- arg_pair.rhs = name;
-
- ret = dynevent_arg_pair_add(cmd, &arg_pair, synth_event_check_arg_fn);
- if (ret)
- return ret;
-
- if (++cmd->n_fields > SYNTH_FIELDS_MAX)
- ret = -EINVAL;
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(synth_event_add_field);
-
-/**
- * synth_event_add_field_str - Add a new field to a synthetic event cmd
- * @cmd: A pointer to the dynevent_cmd struct representing the new event
- * @type_name: The type and name of the new field to add, as a single string
- *
- * Add a new field to a synthetic event cmd object, as a single
- * string. The @type_name string is expected to be of the form 'type
- * name', which will be appended by ';'. No sanity checking is done -
- * what's passed in is assumed to already be well-formed. Field
- * ordering is in the same order the fields are added.
- *
- * See synth_field_size() for available types. If field_name contains
- * [n] the field is considered to be an array.
- *
- * Return: 0 if successful, error otherwise.
- */
-int synth_event_add_field_str(struct dynevent_cmd *cmd, const char *type_name)
-{
- struct dynevent_arg arg;
- int ret;
-
- if (cmd->type != DYNEVENT_TYPE_SYNTH)
- return -EINVAL;
-
- if (!type_name)
- return -EINVAL;
-
- dynevent_arg_init(&arg, ';');
-
- arg.str = type_name;
-
- ret = dynevent_arg_add(cmd, &arg, NULL);
- if (ret)
- return ret;
-
- if (++cmd->n_fields > SYNTH_FIELDS_MAX)
- ret = -EINVAL;
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(synth_event_add_field_str);
-
-/**
- * synth_event_add_fields - Add multiple fields to a synthetic event cmd
- * @cmd: A pointer to the dynevent_cmd struct representing the new event
- * @fields: An array of type/name field descriptions
- * @n_fields: The number of field descriptions contained in the fields array
- *
- * Add a new set of fields to a synthetic event cmd object. The event
- * fields that will be defined for the event should be passed in as an
- * array of struct synth_field_desc, and the number of elements in the
- * array passed in as n_fields. Field ordering will retain the
- * ordering given in the fields array.
- *
- * See synth_field_size() for available types. If field_name contains
- * [n] the field is considered to be an array.
- *
- * Return: 0 if successful, error otherwise.
- */
-int synth_event_add_fields(struct dynevent_cmd *cmd,
- struct synth_field_desc *fields,
- unsigned int n_fields)
-{
- unsigned int i;
- int ret = 0;
-
- for (i = 0; i < n_fields; i++) {
- if (fields[i].type == NULL || fields[i].name == NULL) {
- ret = -EINVAL;
- break;
- }
-
- ret = synth_event_add_field(cmd, fields[i].type, fields[i].name);
- if (ret)
- break;
- }
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(synth_event_add_fields);
-
-/**
- * __synth_event_gen_cmd_start - Start a synthetic event command from arg list
- * @cmd: A pointer to the dynevent_cmd struct representing the new event
- * @name: The name of the synthetic event
- * @mod: The module creating the event, NULL if not created from a module
- * @args: Variable number of arg (pairs), one pair for each field
- *
- * NOTE: Users normally won't want to call this function directly, but
- * rather use the synth_event_gen_cmd_start() wrapper, which
- * automatically adds a NULL to the end of the arg list. If this
- * function is used directly, make sure the last arg in the variable
- * arg list is NULL.
- *
- * Generate a synthetic event command to be executed by
- * synth_event_gen_cmd_end(). This function can be used to generate
- * the complete command or only the first part of it; in the latter
- * case, synth_event_add_field(), synth_event_add_field_str(), or
- * synth_event_add_fields() can be used to add more fields following
- * this.
- *
- * There should be an even number variable args, each pair consisting
- * of a type followed by a field name.
- *
- * See synth_field_size() for available types. If field_name contains
- * [n] the field is considered to be an array.
- *
- * Return: 0 if successful, error otherwise.
- */
-int __synth_event_gen_cmd_start(struct dynevent_cmd *cmd, const char *name,
- struct module *mod, ...)
-{
- struct dynevent_arg arg;
- va_list args;
- int ret;
-
- cmd->event_name = name;
- cmd->private_data = mod;
-
- if (cmd->type != DYNEVENT_TYPE_SYNTH)
- return -EINVAL;
-
- dynevent_arg_init(&arg, 0);
- arg.str = name;
- ret = dynevent_arg_add(cmd, &arg, NULL);
- if (ret)
- return ret;
-
- va_start(args, mod);
- for (;;) {
- const char *type, *name;
-
- type = va_arg(args, const char *);
- if (!type)
- break;
- name = va_arg(args, const char *);
- if (!name)
- break;
-
- if (++cmd->n_fields > SYNTH_FIELDS_MAX) {
- ret = -EINVAL;
- break;
- }
-
- ret = synth_event_add_field(cmd, type, name);
- if (ret)
- break;
- }
- va_end(args);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(__synth_event_gen_cmd_start);
-
-/**
- * synth_event_gen_cmd_array_start - Start synthetic event command from an array
- * @cmd: A pointer to the dynevent_cmd struct representing the new event
- * @name: The name of the synthetic event
- * @fields: An array of type/name field descriptions
- * @n_fields: The number of field descriptions contained in the fields array
- *
- * Generate a synthetic event command to be executed by
- * synth_event_gen_cmd_end(). This function can be used to generate
- * the complete command or only the first part of it; in the latter
- * case, synth_event_add_field(), synth_event_add_field_str(), or
- * synth_event_add_fields() can be used to add more fields following
- * this.
- *
- * The event fields that will be defined for the event should be
- * passed in as an array of struct synth_field_desc, and the number of
- * elements in the array passed in as n_fields. Field ordering will
- * retain the ordering given in the fields array.
- *
- * See synth_field_size() for available types. If field_name contains
- * [n] the field is considered to be an array.
- *
- * Return: 0 if successful, error otherwise.
- */
-int synth_event_gen_cmd_array_start(struct dynevent_cmd *cmd, const char *name,
- struct module *mod,
- struct synth_field_desc *fields,
- unsigned int n_fields)
-{
- struct dynevent_arg arg;
- unsigned int i;
- int ret = 0;
-
- cmd->event_name = name;
- cmd->private_data = mod;
-
- if (cmd->type != DYNEVENT_TYPE_SYNTH)
- return -EINVAL;
-
- if (n_fields > SYNTH_FIELDS_MAX)
- return -EINVAL;
-
- dynevent_arg_init(&arg, 0);
- arg.str = name;
- ret = dynevent_arg_add(cmd, &arg, NULL);
- if (ret)
- return ret;
-
- for (i = 0; i < n_fields; i++) {
- if (fields[i].type == NULL || fields[i].name == NULL)
- return -EINVAL;
-
- ret = synth_event_add_field(cmd, fields[i].type, fields[i].name);
- if (ret)
- break;
- }
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(synth_event_gen_cmd_array_start);
-
-static int __create_synth_event(int argc, const char *name, const char **argv)
-{
- struct synth_field *field, *fields[SYNTH_FIELDS_MAX];
- struct synth_event *event = NULL;
- int i, consumed = 0, n_fields = 0, ret = 0;
-
- /*
- * Argument syntax:
- * - Add synthetic event: <event_name> field[;field] ...
- * - Remove synthetic event: !<event_name> field[;field] ...
- * where 'field' = type field_name
- */
-
- if (name[0] == '\0' || argc < 1)
- return -EINVAL;
-
- mutex_lock(&event_mutex);
-
- event = find_synth_event(name);
- if (event) {
- ret = -EEXIST;
- goto out;
- }
-
- for (i = 0; i < argc - 1; i++) {
- if (strcmp(argv[i], ";") == 0)
- continue;
- if (n_fields == SYNTH_FIELDS_MAX) {
- ret = -EINVAL;
- goto err;
- }
-
- field = parse_synth_field(argc - i, &argv[i], &consumed);
- if (IS_ERR(field)) {
- ret = PTR_ERR(field);
- goto err;
- }
- fields[n_fields++] = field;
- i += consumed - 1;
- }
-
- if (i < argc && strcmp(argv[i], ";") != 0) {
- ret = -EINVAL;
- goto err;
- }
-
- event = alloc_synth_event(name, n_fields, fields);
- if (IS_ERR(event)) {
- ret = PTR_ERR(event);
- event = NULL;
- goto err;
- }
- ret = register_synth_event(event);
- if (!ret)
- dyn_event_add(&event->devent);
- else
- free_synth_event(event);
- out:
- mutex_unlock(&event_mutex);
-
- return ret;
- err:
- for (i = 0; i < n_fields; i++)
- free_synth_field(fields[i]);
-
- goto out;
-}
-
-/**
- * synth_event_create - Create a new synthetic event
- * @name: The name of the new sythetic event
- * @fields: An array of type/name field descriptions
- * @n_fields: The number of field descriptions contained in the fields array
- * @mod: The module creating the event, NULL if not created from a module
- *
- * Create a new synthetic event with the given name under the
- * trace/events/synthetic/ directory. The event fields that will be
- * defined for the event should be passed in as an array of struct
- * synth_field_desc, and the number elements in the array passed in as
- * n_fields. Field ordering will retain the ordering given in the
- * fields array.
- *
- * If the new synthetic event is being created from a module, the mod
- * param must be non-NULL. This will ensure that the trace buffer
- * won't contain unreadable events.
- *
- * The new synth event should be deleted using synth_event_delete()
- * function. The new synthetic event can be generated from modules or
- * other kernel code using trace_synth_event() and related functions.
- *
- * Return: 0 if successful, error otherwise.
- */
-int synth_event_create(const char *name, struct synth_field_desc *fields,
- unsigned int n_fields, struct module *mod)
-{
- struct dynevent_cmd cmd;
- char *buf;
- int ret;
-
- buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
-
- synth_event_cmd_init(&cmd, buf, MAX_DYNEVENT_CMD_LEN);
-
- ret = synth_event_gen_cmd_array_start(&cmd, name, mod,
- fields, n_fields);
- if (ret)
- goto out;
-
- ret = synth_event_gen_cmd_end(&cmd);
- out:
- kfree(buf);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(synth_event_create);
-
-static int destroy_synth_event(struct synth_event *se)
-{
- int ret;
-
- if (se->ref)
- ret = -EBUSY;
- else {
- ret = unregister_synth_event(se);
- if (!ret) {
- dyn_event_remove(&se->devent);
- free_synth_event(se);
- }
- }
-
- return ret;
-}
-
-/**
- * synth_event_delete - Delete a synthetic event
- * @event_name: The name of the new sythetic event
- *
- * Delete a synthetic event that was created with synth_event_create().
- *
- * Return: 0 if successful, error otherwise.
- */
-int synth_event_delete(const char *event_name)
-{
- struct synth_event *se = NULL;
- struct module *mod = NULL;
- int ret = -ENOENT;
-
- mutex_lock(&event_mutex);
- se = find_synth_event(event_name);
- if (se) {
- mod = se->mod;
- ret = destroy_synth_event(se);
- }
- mutex_unlock(&event_mutex);
-
- if (mod) {
- mutex_lock(&trace_types_lock);
- /*
- * It is safest to reset the ring buffer if the module
- * being unloaded registered any events that were
- * used. The only worry is if a new module gets
- * loaded, and takes on the same id as the events of
- * this module. When printing out the buffer, traced
- * events left over from this module may be passed to
- * the new module events and unexpected results may
- * occur.
- */
- tracing_reset_all_online_cpus();
- mutex_unlock(&trace_types_lock);
- }
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(synth_event_delete);
-
-static int create_or_delete_synth_event(int argc, char **argv)
-{
- const char *name = argv[0];
- int ret;
-
- /* trace_run_command() ensures argc != 0 */
- if (name[0] == '!') {
- ret = synth_event_delete(name + 1);
- return ret;
- }
-
- ret = __create_synth_event(argc - 1, name, (const char **)argv + 1);
- return ret == -ECANCELED ? -EINVAL : ret;
-}
-
-static int synth_event_run_command(struct dynevent_cmd *cmd)
-{
- struct synth_event *se;
- int ret;
-
- ret = trace_run_command(cmd->seq.buffer, create_or_delete_synth_event);
- if (ret)
- return ret;
-
- se = find_synth_event(cmd->event_name);
- if (WARN_ON(!se))
- return -ENOENT;
-
- se->mod = cmd->private_data;
-
- return ret;
-}
-
-/**
- * synth_event_cmd_init - Initialize a synthetic event command object
- * @cmd: A pointer to the dynevent_cmd struct representing the new event
- * @buf: A pointer to the buffer used to build the command
- * @maxlen: The length of the buffer passed in @buf
- *
- * Initialize a synthetic event command object. Use this before
- * calling any of the other dyenvent_cmd functions.
- */
-void synth_event_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen)
-{
- dynevent_cmd_init(cmd, buf, maxlen, DYNEVENT_TYPE_SYNTH,
- synth_event_run_command);
-}
-EXPORT_SYMBOL_GPL(synth_event_cmd_init);
-
-static inline int
-__synth_event_trace_start(struct trace_event_file *file,
- struct synth_event_trace_state *trace_state)
-{
- int entry_size, fields_size = 0;
- int ret = 0;
-
- memset(trace_state, '\0', sizeof(*trace_state));
-
- /*
- * Normal event tracing doesn't get called at all unless the
- * ENABLED bit is set (which attaches the probe thus allowing
- * this code to be called, etc). Because this is called
- * directly by the user, we don't have that but we still need
- * to honor not logging when disabled. For the the iterated
- * trace case, we save the enabed state upon start and just
- * ignore the following data calls.
- */
- if (!(file->flags & EVENT_FILE_FL_ENABLED) ||
- trace_trigger_soft_disabled(file)) {
- trace_state->disabled = true;
- ret = -ENOENT;
- goto out;
- }
-
- trace_state->event = file->event_call->data;
-
- fields_size = trace_state->event->n_u64 * sizeof(u64);
-
- /*
- * Avoid ring buffer recursion detection, as this event
- * is being performed within another event.
- */
- trace_state->buffer = file->tr->array_buffer.buffer;
- ring_buffer_nest_start(trace_state->buffer);
-
- entry_size = sizeof(*trace_state->entry) + fields_size;
- trace_state->entry = trace_event_buffer_reserve(&trace_state->fbuffer,
- file,
- entry_size);
- if (!trace_state->entry) {
- ring_buffer_nest_end(trace_state->buffer);
- ret = -EINVAL;
- }
-out:
- return ret;
-}
-
-static inline void
-__synth_event_trace_end(struct synth_event_trace_state *trace_state)
-{
- trace_event_buffer_commit(&trace_state->fbuffer);
-
- ring_buffer_nest_end(trace_state->buffer);
-}
-
-/**
- * synth_event_trace - Trace a synthetic event
- * @file: The trace_event_file representing the synthetic event
- * @n_vals: The number of values in vals
- * @args: Variable number of args containing the event values
- *
- * Trace a synthetic event using the values passed in the variable
- * argument list.
- *
- * The argument list should be a list 'n_vals' u64 values. The number
- * of vals must match the number of field in the synthetic event, and
- * must be in the same order as the synthetic event fields.
- *
- * All vals should be cast to u64, and string vals are just pointers
- * to strings, cast to u64. Strings will be copied into space
- * reserved in the event for the string, using these pointers.
- *
- * Return: 0 on success, err otherwise.
- */
-int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...)
-{
- struct synth_event_trace_state state;
- unsigned int i, n_u64;
- va_list args;
- int ret;
-
- ret = __synth_event_trace_start(file, &state);
- if (ret) {
- if (ret == -ENOENT)
- ret = 0; /* just disabled, not really an error */
- return ret;
- }
-
- if (n_vals != state.event->n_fields) {
- ret = -EINVAL;
- goto out;
- }
-
- va_start(args, n_vals);
- for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) {
- u64 val;
-
- val = va_arg(args, u64);
-
- if (state.event->fields[i]->is_string) {
- char *str_val = (char *)(long)val;
- char *str_field = (char *)&state.entry->fields[n_u64];
-
- strscpy(str_field, str_val, STR_VAR_LEN_MAX);
- n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
- } else {
- struct synth_field *field = state.event->fields[i];
-
- switch (field->size) {
- case 1:
- *(u8 *)&state.entry->fields[n_u64] = (u8)val;
- break;
-
- case 2:
- *(u16 *)&state.entry->fields[n_u64] = (u16)val;
- break;
-
- case 4:
- *(u32 *)&state.entry->fields[n_u64] = (u32)val;
- break;
-
- default:
- state.entry->fields[n_u64] = val;
- break;
- }
- n_u64++;
- }
- }
- va_end(args);
-out:
- __synth_event_trace_end(&state);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(synth_event_trace);
-
-/**
- * synth_event_trace_array - Trace a synthetic event from an array
- * @file: The trace_event_file representing the synthetic event
- * @vals: Array of values
- * @n_vals: The number of values in vals
- *
- * Trace a synthetic event using the values passed in as 'vals'.
- *
- * The 'vals' array is just an array of 'n_vals' u64. The number of
- * vals must match the number of field in the synthetic event, and
- * must be in the same order as the synthetic event fields.
- *
- * All vals should be cast to u64, and string vals are just pointers
- * to strings, cast to u64. Strings will be copied into space
- * reserved in the event for the string, using these pointers.
- *
- * Return: 0 on success, err otherwise.
- */
-int synth_event_trace_array(struct trace_event_file *file, u64 *vals,
- unsigned int n_vals)
-{
- struct synth_event_trace_state state;
- unsigned int i, n_u64;
- int ret;
-
- ret = __synth_event_trace_start(file, &state);
- if (ret) {
- if (ret == -ENOENT)
- ret = 0; /* just disabled, not really an error */
- return ret;
- }
-
- if (n_vals != state.event->n_fields) {
- ret = -EINVAL;
- goto out;
- }
-
- for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) {
- if (state.event->fields[i]->is_string) {
- char *str_val = (char *)(long)vals[i];
- char *str_field = (char *)&state.entry->fields[n_u64];
-
- strscpy(str_field, str_val, STR_VAR_LEN_MAX);
- n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
- } else {
- struct synth_field *field = state.event->fields[i];
- u64 val = vals[i];
-
- switch (field->size) {
- case 1:
- *(u8 *)&state.entry->fields[n_u64] = (u8)val;
- break;
-
- case 2:
- *(u16 *)&state.entry->fields[n_u64] = (u16)val;
- break;
-
- case 4:
- *(u32 *)&state.entry->fields[n_u64] = (u32)val;
- break;
-
- default:
- state.entry->fields[n_u64] = val;
- break;
- }
- n_u64++;
- }
- }
-out:
- __synth_event_trace_end(&state);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(synth_event_trace_array);
-
-/**
- * synth_event_trace_start - Start piecewise synthetic event trace
- * @file: The trace_event_file representing the synthetic event
- * @trace_state: A pointer to object tracking the piecewise trace state
- *
- * Start the trace of a synthetic event field-by-field rather than all
- * at once.
- *
- * This function 'opens' an event trace, which means space is reserved
- * for the event in the trace buffer, after which the event's
- * individual field values can be set through either
- * synth_event_add_next_val() or synth_event_add_val().
- *
- * A pointer to a trace_state object is passed in, which will keep
- * track of the current event trace state until the event trace is
- * closed (and the event finally traced) using
- * synth_event_trace_end().
- *
- * Note that synth_event_trace_end() must be called after all values
- * have been added for each event trace, regardless of whether adding
- * all field values succeeded or not.
- *
- * Note also that for a given event trace, all fields must be added
- * using either synth_event_add_next_val() or synth_event_add_val()
- * but not both together or interleaved.
- *
- * Return: 0 on success, err otherwise.
- */
-int synth_event_trace_start(struct trace_event_file *file,
- struct synth_event_trace_state *trace_state)
-{
- int ret;
-
- if (!trace_state)
- return -EINVAL;
-
- ret = __synth_event_trace_start(file, trace_state);
- if (ret == -ENOENT)
- ret = 0; /* just disabled, not really an error */
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(synth_event_trace_start);
-
-static int __synth_event_add_val(const char *field_name, u64 val,
- struct synth_event_trace_state *trace_state)
-{
- struct synth_field *field = NULL;
- struct synth_trace_event *entry;
- struct synth_event *event;
- int i, ret = 0;
-
- if (!trace_state) {
- ret = -EINVAL;
- goto out;
- }
-
- /* can't mix add_next_synth_val() with add_synth_val() */
- if (field_name) {
- if (trace_state->add_next) {
- ret = -EINVAL;
- goto out;
- }
- trace_state->add_name = true;
- } else {
- if (trace_state->add_name) {
- ret = -EINVAL;
- goto out;
- }
- trace_state->add_next = true;
- }
-
- if (trace_state->disabled)
- goto out;
-
- event = trace_state->event;
- if (trace_state->add_name) {
- for (i = 0; i < event->n_fields; i++) {
- field = event->fields[i];
- if (strcmp(field->name, field_name) == 0)
- break;
- }
- if (!field) {
- ret = -EINVAL;
- goto out;
- }
- } else {
- if (trace_state->cur_field >= event->n_fields) {
- ret = -EINVAL;
- goto out;
- }
- field = event->fields[trace_state->cur_field++];
- }
-
- entry = trace_state->entry;
- if (field->is_string) {
- char *str_val = (char *)(long)val;
- char *str_field;
-
- if (!str_val) {
- ret = -EINVAL;
- goto out;
- }
-
- str_field = (char *)&entry->fields[field->offset];
- strscpy(str_field, str_val, STR_VAR_LEN_MAX);
- } else {
- switch (field->size) {
- case 1:
- *(u8 *)&trace_state->entry->fields[field->offset] = (u8)val;
- break;
-
- case 2:
- *(u16 *)&trace_state->entry->fields[field->offset] = (u16)val;
- break;
-
- case 4:
- *(u32 *)&trace_state->entry->fields[field->offset] = (u32)val;
- break;
-
- default:
- trace_state->entry->fields[field->offset] = val;
- break;
- }
- }
- out:
- return ret;
-}
-
-/**
- * synth_event_add_next_val - Add the next field's value to an open synth trace
- * @val: The value to set the next field to
- * @trace_state: A pointer to object tracking the piecewise trace state
- *
- * Set the value of the next field in an event that's been opened by
- * synth_event_trace_start().
- *
- * The val param should be the value cast to u64. If the value points
- * to a string, the val param should be a char * cast to u64.
- *
- * This function assumes all the fields in an event are to be set one
- * after another - successive calls to this function are made, one for
- * each field, in the order of the fields in the event, until all
- * fields have been set. If you'd rather set each field individually
- * without regard to ordering, synth_event_add_val() can be used
- * instead.
- *
- * Note however that synth_event_add_next_val() and
- * synth_event_add_val() can't be intermixed for a given event trace -
- * one or the other but not both can be used at the same time.
- *
- * Note also that synth_event_trace_end() must be called after all
- * values have been added for each event trace, regardless of whether
- * adding all field values succeeded or not.
- *
- * Return: 0 on success, err otherwise.
- */
-int synth_event_add_next_val(u64 val,
- struct synth_event_trace_state *trace_state)
-{
- return __synth_event_add_val(NULL, val, trace_state);
-}
-EXPORT_SYMBOL_GPL(synth_event_add_next_val);
-
-/**
- * synth_event_add_val - Add a named field's value to an open synth trace
- * @field_name: The name of the synthetic event field value to set
- * @val: The value to set the next field to
- * @trace_state: A pointer to object tracking the piecewise trace state
- *
- * Set the value of the named field in an event that's been opened by
- * synth_event_trace_start().
- *
- * The val param should be the value cast to u64. If the value points
- * to a string, the val param should be a char * cast to u64.
- *
- * This function looks up the field name, and if found, sets the field
- * to the specified value. This lookup makes this function more
- * expensive than synth_event_add_next_val(), so use that or the
- * none-piecewise synth_event_trace() instead if efficiency is more
- * important.
- *
- * Note however that synth_event_add_next_val() and
- * synth_event_add_val() can't be intermixed for a given event trace -
- * one or the other but not both can be used at the same time.
- *
- * Note also that synth_event_trace_end() must be called after all
- * values have been added for each event trace, regardless of whether
- * adding all field values succeeded or not.
- *
- * Return: 0 on success, err otherwise.
- */
-int synth_event_add_val(const char *field_name, u64 val,
- struct synth_event_trace_state *trace_state)
-{
- return __synth_event_add_val(field_name, val, trace_state);
-}
-EXPORT_SYMBOL_GPL(synth_event_add_val);
-
-/**
- * synth_event_trace_end - End piecewise synthetic event trace
- * @trace_state: A pointer to object tracking the piecewise trace state
- *
- * End the trace of a synthetic event opened by
- * synth_event_trace__start().
- *
- * This function 'closes' an event trace, which basically means that
- * it commits the reserved event and cleans up other loose ends.
- *
- * A pointer to a trace_state object is passed in, which will keep
- * track of the current event trace state opened with
- * synth_event_trace_start().
- *
- * Note that this function must be called after all values have been
- * added for each event trace, regardless of whether adding all field
- * values succeeded or not.
- *
- * Return: 0 on success, err otherwise.
- */
-int synth_event_trace_end(struct synth_event_trace_state *trace_state)
-{
- if (!trace_state)
- return -EINVAL;
-
- __synth_event_trace_end(trace_state);
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(synth_event_trace_end);
-
-static int create_synth_event(int argc, const char **argv)
-{
- const char *name = argv[0];
- int len;
-
- if (name[0] != 's' || name[1] != ':')
- return -ECANCELED;
- name += 2;
-
- /* This interface accepts group name prefix */
- if (strchr(name, '/')) {
- len = str_has_prefix(name, SYNTH_SYSTEM "/");
- if (len == 0)
- return -EINVAL;
- name += len;
- }
- return __create_synth_event(argc - 1, name, argv + 1);
-}
-
-static int synth_event_release(struct dyn_event *ev)
-{
- struct synth_event *event = to_synth_event(ev);
- int ret;
-
- if (event->ref)
- return -EBUSY;
-
- ret = unregister_synth_event(event);
- if (ret)
- return ret;
-
- dyn_event_remove(ev);
- free_synth_event(event);
- return 0;
-}
-
-static int __synth_event_show(struct seq_file *m, struct synth_event *event)
-{
- struct synth_field *field;
- unsigned int i;
-
- seq_printf(m, "%s\t", event->name);
-
- for (i = 0; i < event->n_fields; i++) {
- field = event->fields[i];
-
- /* parameter values */
- seq_printf(m, "%s %s%s", field->type, field->name,
- i == event->n_fields - 1 ? "" : "; ");
- }
-
- seq_putc(m, '\n');
-
- return 0;
-}
-
-static int synth_event_show(struct seq_file *m, struct dyn_event *ev)
-{
- struct synth_event *event = to_synth_event(ev);
-
- seq_printf(m, "s:%s/", event->class.system);
-
- return __synth_event_show(m, event);
-}
-
-static int synth_events_seq_show(struct seq_file *m, void *v)
-{
- struct dyn_event *ev = v;
-
- if (!is_synth_event(ev))
- return 0;
-
- return __synth_event_show(m, to_synth_event(ev));
-}
-
-static const struct seq_operations synth_events_seq_op = {
- .start = dyn_event_seq_start,
- .next = dyn_event_seq_next,
- .stop = dyn_event_seq_stop,
- .show = synth_events_seq_show,
-};
-
-static int synth_events_open(struct inode *inode, struct file *file)
-{
- int ret;
-
- ret = security_locked_down(LOCKDOWN_TRACEFS);
- if (ret)
- return ret;
-
- if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
- ret = dyn_events_release_all(&synth_event_ops);
- if (ret < 0)
- return ret;
- }
-
- return seq_open(file, &synth_events_seq_op);
-}
-
-static ssize_t synth_events_write(struct file *file,
- const char __user *buffer,
- size_t count, loff_t *ppos)
-{
- return trace_parse_run_command(file, buffer, count, ppos,
- create_or_delete_synth_event);
-}
-
-static const struct file_operations synth_events_fops = {
- .open = synth_events_open,
- .write = synth_events_write,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
static u64 hist_field_timestamp(struct hist_field *hist_field,
struct tracing_map_elt *elt,
struct ring_buffer_event *rbe,
@@ -6491,6 +4729,279 @@ const struct file_operations event_hist_fops = {
.release = single_release,
};
+#ifdef CONFIG_HIST_TRIGGERS_DEBUG
+static void hist_field_debug_show_flags(struct seq_file *m,
+ unsigned long flags)
+{
+ seq_puts(m, " flags:\n");
+
+ if (flags & HIST_FIELD_FL_KEY)
+ seq_puts(m, " HIST_FIELD_FL_KEY\n");
+ else if (flags & HIST_FIELD_FL_HITCOUNT)
+ seq_puts(m, " VAL: HIST_FIELD_FL_HITCOUNT\n");
+ else if (flags & HIST_FIELD_FL_VAR)
+ seq_puts(m, " HIST_FIELD_FL_VAR\n");
+ else if (flags & HIST_FIELD_FL_VAR_REF)
+ seq_puts(m, " HIST_FIELD_FL_VAR_REF\n");
+ else
+ seq_puts(m, " VAL: normal u64 value\n");
+
+ if (flags & HIST_FIELD_FL_ALIAS)
+ seq_puts(m, " HIST_FIELD_FL_ALIAS\n");
+}
+
+static int hist_field_debug_show(struct seq_file *m,
+ struct hist_field *field, unsigned long flags)
+{
+ if ((field->flags & flags) != flags) {
+ seq_printf(m, "ERROR: bad flags - %lx\n", flags);
+ return -EINVAL;
+ }
+
+ hist_field_debug_show_flags(m, field->flags);
+ if (field->field)
+ seq_printf(m, " ftrace_event_field name: %s\n",
+ field->field->name);
+
+ if (field->flags & HIST_FIELD_FL_VAR) {
+ seq_printf(m, " var.name: %s\n", field->var.name);
+ seq_printf(m, " var.idx (into tracing_map_elt.vars[]): %u\n",
+ field->var.idx);
+ }
+
+ if (field->flags & HIST_FIELD_FL_ALIAS)
+ seq_printf(m, " var_ref_idx (into hist_data->var_refs[]): %u\n",
+ field->var_ref_idx);
+
+ if (field->flags & HIST_FIELD_FL_VAR_REF) {
+ seq_printf(m, " name: %s\n", field->name);
+ seq_printf(m, " var.idx (into tracing_map_elt.vars[]): %u\n",
+ field->var.idx);
+ seq_printf(m, " var.hist_data: %p\n", field->var.hist_data);
+ seq_printf(m, " var_ref_idx (into hist_data->var_refs[]): %u\n",
+ field->var_ref_idx);
+ if (field->system)
+ seq_printf(m, " system: %s\n", field->system);
+ if (field->event_name)
+ seq_printf(m, " event_name: %s\n", field->event_name);
+ }
+
+ seq_printf(m, " type: %s\n", field->type);
+ seq_printf(m, " size: %u\n", field->size);
+ seq_printf(m, " is_signed: %u\n", field->is_signed);
+
+ return 0;
+}
+
+static int field_var_debug_show(struct seq_file *m,
+ struct field_var *field_var, unsigned int i,
+ bool save_vars)
+{
+ const char *vars_name = save_vars ? "save_vars" : "field_vars";
+ struct hist_field *field;
+ int ret = 0;
+
+ seq_printf(m, "\n hist_data->%s[%d]:\n", vars_name, i);
+
+ field = field_var->var;
+
+ seq_printf(m, "\n %s[%d].var:\n", vars_name, i);
+
+ hist_field_debug_show_flags(m, field->flags);
+ seq_printf(m, " var.name: %s\n", field->var.name);
+ seq_printf(m, " var.idx (into tracing_map_elt.vars[]): %u\n",
+ field->var.idx);
+
+ field = field_var->val;
+
+ seq_printf(m, "\n %s[%d].val:\n", vars_name, i);
+ if (field->field)
+ seq_printf(m, " ftrace_event_field name: %s\n",
+ field->field->name);
+ else {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ seq_printf(m, " type: %s\n", field->type);
+ seq_printf(m, " size: %u\n", field->size);
+ seq_printf(m, " is_signed: %u\n", field->is_signed);
+out:
+ return ret;
+}
+
+static int hist_action_debug_show(struct seq_file *m,
+ struct action_data *data, int i)
+{
+ int ret = 0;
+
+ if (data->handler == HANDLER_ONMAX ||
+ data->handler == HANDLER_ONCHANGE) {
+ seq_printf(m, "\n hist_data->actions[%d].track_data.var_ref:\n", i);
+ ret = hist_field_debug_show(m, data->track_data.var_ref,
+ HIST_FIELD_FL_VAR_REF);
+ if (ret)
+ goto out;
+
+ seq_printf(m, "\n hist_data->actions[%d].track_data.track_var:\n", i);
+ ret = hist_field_debug_show(m, data->track_data.track_var,
+ HIST_FIELD_FL_VAR);
+ if (ret)
+ goto out;
+ }
+
+ if (data->handler == HANDLER_ONMATCH) {
+ seq_printf(m, "\n hist_data->actions[%d].match_data.event_system: %s\n",
+ i, data->match_data.event_system);
+ seq_printf(m, " hist_data->actions[%d].match_data.event: %s\n",
+ i, data->match_data.event);
+ }
+out:
+ return ret;
+}
+
+static int hist_actions_debug_show(struct seq_file *m,
+ struct hist_trigger_data *hist_data)
+{
+ int i, ret = 0;
+
+ if (hist_data->n_actions)
+ seq_puts(m, "\n action tracking variables (for onmax()/onchange()/onmatch()):\n");
+
+ for (i = 0; i < hist_data->n_actions; i++) {
+ struct action_data *action = hist_data->actions[i];
+
+ ret = hist_action_debug_show(m, action, i);
+ if (ret)
+ goto out;
+ }
+
+ if (hist_data->n_save_vars)
+ seq_puts(m, "\n save action variables (save() params):\n");
+
+ for (i = 0; i < hist_data->n_save_vars; i++) {
+ ret = field_var_debug_show(m, hist_data->save_vars[i], i, true);
+ if (ret)
+ goto out;
+ }
+out:
+ return ret;
+}
+
+static void hist_trigger_debug_show(struct seq_file *m,
+ struct event_trigger_data *data, int n)
+{
+ struct hist_trigger_data *hist_data;
+ int i, ret;
+
+ if (n > 0)
+ seq_puts(m, "\n\n");
+
+ seq_puts(m, "# event histogram\n#\n# trigger info: ");
+ data->ops->print(m, data->ops, data);
+ seq_puts(m, "#\n\n");
+
+ hist_data = data->private_data;
+
+ seq_printf(m, "hist_data: %p\n\n", hist_data);
+ seq_printf(m, " n_vals: %u\n", hist_data->n_vals);
+ seq_printf(m, " n_keys: %u\n", hist_data->n_keys);
+ seq_printf(m, " n_fields: %u\n", hist_data->n_fields);
+
+ seq_puts(m, "\n val fields:\n\n");
+
+ seq_puts(m, " hist_data->fields[0]:\n");
+ ret = hist_field_debug_show(m, hist_data->fields[0],
+ HIST_FIELD_FL_HITCOUNT);
+ if (ret)
+ return;
+
+ for (i = 1; i < hist_data->n_vals; i++) {
+ seq_printf(m, "\n hist_data->fields[%d]:\n", i);
+ ret = hist_field_debug_show(m, hist_data->fields[i], 0);
+ if (ret)
+ return;
+ }
+
+ seq_puts(m, "\n key fields:\n");
+
+ for (i = hist_data->n_vals; i < hist_data->n_fields; i++) {
+ seq_printf(m, "\n hist_data->fields[%d]:\n", i);
+ ret = hist_field_debug_show(m, hist_data->fields[i],
+ HIST_FIELD_FL_KEY);
+ if (ret)
+ return;
+ }
+
+ if (hist_data->n_var_refs)
+ seq_puts(m, "\n variable reference fields:\n");
+
+ for (i = 0; i < hist_data->n_var_refs; i++) {
+ seq_printf(m, "\n hist_data->var_refs[%d]:\n", i);
+ ret = hist_field_debug_show(m, hist_data->var_refs[i],
+ HIST_FIELD_FL_VAR_REF);
+ if (ret)
+ return;
+ }
+
+ if (hist_data->n_field_vars)
+ seq_puts(m, "\n field variables:\n");
+
+ for (i = 0; i < hist_data->n_field_vars; i++) {
+ ret = field_var_debug_show(m, hist_data->field_vars[i], i, false);
+ if (ret)
+ return;
+ }
+
+ ret = hist_actions_debug_show(m, hist_data);
+ if (ret)
+ return;
+}
+
+static int hist_debug_show(struct seq_file *m, void *v)
+{
+ struct event_trigger_data *data;
+ struct trace_event_file *event_file;
+ int n = 0, ret = 0;
+
+ mutex_lock(&event_mutex);
+
+ event_file = event_file_data(m->private);
+ if (unlikely(!event_file)) {
+ ret = -ENODEV;
+ goto out_unlock;
+ }
+
+ list_for_each_entry(data, &event_file->triggers, list) {
+ if (data->cmd_ops->trigger_type == ETT_EVENT_HIST)
+ hist_trigger_debug_show(m, data, n++);
+ }
+
+ out_unlock:
+ mutex_unlock(&event_mutex);
+
+ return ret;
+}
+
+static int event_hist_debug_open(struct inode *inode, struct file *file)
+{
+ int ret;
+
+ ret = security_locked_down(LOCKDOWN_TRACEFS);
+ if (ret)
+ return ret;
+
+ return single_open(file, hist_debug_show, file);
+}
+
+const struct file_operations event_hist_debug_fops = {
+ .open = event_hist_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif
+
static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
{
const char *field_name = hist_field_name(hist_field, 0);
@@ -7393,37 +5904,3 @@ __init int register_trigger_hist_enable_disable_cmds(void)
return ret;
}
-
-static __init int trace_events_hist_init(void)
-{
- struct dentry *entry = NULL;
- struct dentry *d_tracer;
- int err = 0;
-
- err = dyn_event_register(&synth_event_ops);
- if (err) {
- pr_warn("Could not register synth_event_ops\n");
- return err;
- }
-
- d_tracer = tracing_init_dentry();
- if (IS_ERR(d_tracer)) {
- err = PTR_ERR(d_tracer);
- goto err;
- }
-
- entry = tracefs_create_file("synthetic_events", 0644, d_tracer,
- NULL, &synth_events_fops);
- if (!entry) {
- err = -ENODEV;
- goto err;
- }
-
- return err;
- err:
- pr_warn("Could not create tracefs 'synthetic_events' entry\n");
-
- return err;
-}
-
-fs_initcall(trace_events_hist_init);
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
new file mode 100644
index 000000000000..c6cca0d1d584
--- /dev/null
+++ b/kernel/trace/trace_events_synth.c
@@ -0,0 +1,1789 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * trace_events_synth - synthetic trace events
+ *
+ * Copyright (C) 2015, 2020 Tom Zanussi <tom.zanussi@linux.intel.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/security.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/stacktrace.h>
+#include <linux/rculist.h>
+#include <linux/tracefs.h>
+
+/* for gfp flag names */
+#include <linux/trace_events.h>
+#include <trace/events/mmflags.h>
+
+#include "trace_synth.h"
+
+static int create_synth_event(int argc, const char **argv);
+static int synth_event_show(struct seq_file *m, struct dyn_event *ev);
+static int synth_event_release(struct dyn_event *ev);
+static bool synth_event_is_busy(struct dyn_event *ev);
+static bool synth_event_match(const char *system, const char *event,
+ int argc, const char **argv, struct dyn_event *ev);
+
+static struct dyn_event_operations synth_event_ops = {
+ .create = create_synth_event,
+ .show = synth_event_show,
+ .is_busy = synth_event_is_busy,
+ .free = synth_event_release,
+ .match = synth_event_match,
+};
+
+static bool is_synth_event(struct dyn_event *ev)
+{
+ return ev->ops == &synth_event_ops;
+}
+
+static struct synth_event *to_synth_event(struct dyn_event *ev)
+{
+ return container_of(ev, struct synth_event, devent);
+}
+
+static bool synth_event_is_busy(struct dyn_event *ev)
+{
+ struct synth_event *event = to_synth_event(ev);
+
+ return event->ref != 0;
+}
+
+static bool synth_event_match(const char *system, const char *event,
+ int argc, const char **argv, struct dyn_event *ev)
+{
+ struct synth_event *sev = to_synth_event(ev);
+
+ return strcmp(sev->name, event) == 0 &&
+ (!system || strcmp(system, SYNTH_SYSTEM) == 0);
+}
+
+struct synth_trace_event {
+ struct trace_entry ent;
+ u64 fields[];
+};
+
+static int synth_event_define_fields(struct trace_event_call *call)
+{
+ struct synth_trace_event trace;
+ int offset = offsetof(typeof(trace), fields);
+ struct synth_event *event = call->data;
+ unsigned int i, size, n_u64;
+ char *name, *type;
+ bool is_signed;
+ int ret = 0;
+
+ for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
+ size = event->fields[i]->size;
+ is_signed = event->fields[i]->is_signed;
+ type = event->fields[i]->type;
+ name = event->fields[i]->name;
+ ret = trace_define_field(call, type, name, offset, size,
+ is_signed, FILTER_OTHER);
+ if (ret)
+ break;
+
+ event->fields[i]->offset = n_u64;
+
+ if (event->fields[i]->is_string) {
+ offset += STR_VAR_LEN_MAX;
+ n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
+ } else {
+ offset += sizeof(u64);
+ n_u64++;
+ }
+ }
+
+ event->n_u64 = n_u64;
+
+ return ret;
+}
+
+static bool synth_field_signed(char *type)
+{
+ if (str_has_prefix(type, "u"))
+ return false;
+ if (strcmp(type, "gfp_t") == 0)
+ return false;
+
+ return true;
+}
+
+static int synth_field_is_string(char *type)
+{
+ if (strstr(type, "char[") != NULL)
+ return true;
+
+ return false;
+}
+
+static int synth_field_string_size(char *type)
+{
+ char buf[4], *end, *start;
+ unsigned int len;
+ int size, err;
+
+ start = strstr(type, "char[");
+ if (start == NULL)
+ return -EINVAL;
+ start += sizeof("char[") - 1;
+
+ end = strchr(type, ']');
+ if (!end || end < start)
+ return -EINVAL;
+
+ len = end - start;
+ if (len > 3)
+ return -EINVAL;
+
+ strncpy(buf, start, len);
+ buf[len] = '\0';
+
+ err = kstrtouint(buf, 0, &size);
+ if (err)
+ return err;
+
+ if (size > STR_VAR_LEN_MAX)
+ return -EINVAL;
+
+ return size;
+}
+
+static int synth_field_size(char *type)
+{
+ int size = 0;
+
+ if (strcmp(type, "s64") == 0)
+ size = sizeof(s64);
+ else if (strcmp(type, "u64") == 0)
+ size = sizeof(u64);
+ else if (strcmp(type, "s32") == 0)
+ size = sizeof(s32);
+ else if (strcmp(type, "u32") == 0)
+ size = sizeof(u32);
+ else if (strcmp(type, "s16") == 0)
+ size = sizeof(s16);
+ else if (strcmp(type, "u16") == 0)
+ size = sizeof(u16);
+ else if (strcmp(type, "s8") == 0)
+ size = sizeof(s8);
+ else if (strcmp(type, "u8") == 0)
+ size = sizeof(u8);
+ else if (strcmp(type, "char") == 0)
+ size = sizeof(char);
+ else if (strcmp(type, "unsigned char") == 0)
+ size = sizeof(unsigned char);
+ else if (strcmp(type, "int") == 0)
+ size = sizeof(int);
+ else if (strcmp(type, "unsigned int") == 0)
+ size = sizeof(unsigned int);
+ else if (strcmp(type, "long") == 0)
+ size = sizeof(long);
+ else if (strcmp(type, "unsigned long") == 0)
+ size = sizeof(unsigned long);
+ else if (strcmp(type, "pid_t") == 0)
+ size = sizeof(pid_t);
+ else if (strcmp(type, "gfp_t") == 0)
+ size = sizeof(gfp_t);
+ else if (synth_field_is_string(type))
+ size = synth_field_string_size(type);
+
+ return size;
+}
+
+static const char *synth_field_fmt(char *type)
+{
+ const char *fmt = "%llu";
+
+ if (strcmp(type, "s64") == 0)
+ fmt = "%lld";
+ else if (strcmp(type, "u64") == 0)
+ fmt = "%llu";
+ else if (strcmp(type, "s32") == 0)
+ fmt = "%d";
+ else if (strcmp(type, "u32") == 0)
+ fmt = "%u";
+ else if (strcmp(type, "s16") == 0)
+ fmt = "%d";
+ else if (strcmp(type, "u16") == 0)
+ fmt = "%u";
+ else if (strcmp(type, "s8") == 0)
+ fmt = "%d";
+ else if (strcmp(type, "u8") == 0)
+ fmt = "%u";
+ else if (strcmp(type, "char") == 0)
+ fmt = "%d";
+ else if (strcmp(type, "unsigned char") == 0)
+ fmt = "%u";
+ else if (strcmp(type, "int") == 0)
+ fmt = "%d";
+ else if (strcmp(type, "unsigned int") == 0)
+ fmt = "%u";
+ else if (strcmp(type, "long") == 0)
+ fmt = "%ld";
+ else if (strcmp(type, "unsigned long") == 0)
+ fmt = "%lu";
+ else if (strcmp(type, "pid_t") == 0)
+ fmt = "%d";
+ else if (strcmp(type, "gfp_t") == 0)
+ fmt = "%x";
+ else if (synth_field_is_string(type))
+ fmt = "%s";
+
+ return fmt;
+}
+
+static void print_synth_event_num_val(struct trace_seq *s,
+ char *print_fmt, char *name,
+ int size, u64 val, char *space)
+{
+ switch (size) {
+ case 1:
+ trace_seq_printf(s, print_fmt, name, (u8)val, space);
+ break;
+
+ case 2:
+ trace_seq_printf(s, print_fmt, name, (u16)val, space);
+ break;
+
+ case 4:
+ trace_seq_printf(s, print_fmt, name, (u32)val, space);
+ break;
+
+ default:
+ trace_seq_printf(s, print_fmt, name, val, space);
+ break;
+ }
+}
+
+static enum print_line_t print_synth_event(struct trace_iterator *iter,
+ int flags,
+ struct trace_event *event)
+{
+ struct trace_array *tr = iter->tr;
+ struct trace_seq *s = &iter->seq;
+ struct synth_trace_event *entry;
+ struct synth_event *se;
+ unsigned int i, n_u64;
+ char print_fmt[32];
+ const char *fmt;
+
+ entry = (struct synth_trace_event *)iter->ent;
+ se = container_of(event, struct synth_event, call.event);
+
+ trace_seq_printf(s, "%s: ", se->name);
+
+ for (i = 0, n_u64 = 0; i < se->n_fields; i++) {
+ if (trace_seq_has_overflowed(s))
+ goto end;
+
+ fmt = synth_field_fmt(se->fields[i]->type);
+
+ /* parameter types */
+ if (tr && tr->trace_flags & TRACE_ITER_VERBOSE)
+ trace_seq_printf(s, "%s ", fmt);
+
+ snprintf(print_fmt, sizeof(print_fmt), "%%s=%s%%s", fmt);
+
+ /* parameter values */
+ if (se->fields[i]->is_string) {
+ trace_seq_printf(s, print_fmt, se->fields[i]->name,
+ (char *)&entry->fields[n_u64],
+ i == se->n_fields - 1 ? "" : " ");
+ n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
+ } else {
+ struct trace_print_flags __flags[] = {
+ __def_gfpflag_names, {-1, NULL} };
+ char *space = (i == se->n_fields - 1 ? "" : " ");
+
+ print_synth_event_num_val(s, print_fmt,
+ se->fields[i]->name,
+ se->fields[i]->size,
+ entry->fields[n_u64],
+ space);
+
+ if (strcmp(se->fields[i]->type, "gfp_t") == 0) {
+ trace_seq_puts(s, " (");
+ trace_print_flags_seq(s, "|",
+ entry->fields[n_u64],
+ __flags);
+ trace_seq_putc(s, ')');
+ }
+ n_u64++;
+ }
+ }
+end:
+ trace_seq_putc(s, '\n');
+
+ return trace_handle_return(s);
+}
+
+static struct trace_event_functions synth_event_funcs = {
+ .trace = print_synth_event
+};
+
+static notrace void trace_event_raw_event_synth(void *__data,
+ u64 *var_ref_vals,
+ unsigned int *var_ref_idx)
+{
+ struct trace_event_file *trace_file = __data;
+ struct synth_trace_event *entry;
+ struct trace_event_buffer fbuffer;
+ struct trace_buffer *buffer;
+ struct synth_event *event;
+ unsigned int i, n_u64, val_idx;
+ int fields_size = 0;
+
+ event = trace_file->event_call->data;
+
+ if (trace_trigger_soft_disabled(trace_file))
+ return;
+
+ fields_size = event->n_u64 * sizeof(u64);
+
+ /*
+ * Avoid ring buffer recursion detection, as this event
+ * is being performed within another event.
+ */
+ buffer = trace_file->tr->array_buffer.buffer;
+ ring_buffer_nest_start(buffer);
+
+ entry = trace_event_buffer_reserve(&fbuffer, trace_file,
+ sizeof(*entry) + fields_size);
+ if (!entry)
+ goto out;
+
+ for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
+ val_idx = var_ref_idx[i];
+ if (event->fields[i]->is_string) {
+ char *str_val = (char *)(long)var_ref_vals[val_idx];
+ char *str_field = (char *)&entry->fields[n_u64];
+
+ strscpy(str_field, str_val, STR_VAR_LEN_MAX);
+ n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
+ } else {
+ struct synth_field *field = event->fields[i];
+ u64 val = var_ref_vals[val_idx];
+
+ switch (field->size) {
+ case 1:
+ *(u8 *)&entry->fields[n_u64] = (u8)val;
+ break;
+
+ case 2:
+ *(u16 *)&entry->fields[n_u64] = (u16)val;
+ break;
+
+ case 4:
+ *(u32 *)&entry->fields[n_u64] = (u32)val;
+ break;
+
+ default:
+ entry->fields[n_u64] = val;
+ break;
+ }
+ n_u64++;
+ }
+ }
+
+ trace_event_buffer_commit(&fbuffer);
+out:
+ ring_buffer_nest_end(buffer);
+}
+
+static void free_synth_event_print_fmt(struct trace_event_call *call)
+{
+ if (call) {
+ kfree(call->print_fmt);
+ call->print_fmt = NULL;
+ }
+}
+
+static int __set_synth_event_print_fmt(struct synth_event *event,
+ char *buf, int len)
+{
+ const char *fmt;
+ int pos = 0;
+ int i;
+
+ /* When len=0, we just calculate the needed length */
+#define LEN_OR_ZERO (len ? len - pos : 0)
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
+ for (i = 0; i < event->n_fields; i++) {
+ fmt = synth_field_fmt(event->fields[i]->type);
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s%s",
+ event->fields[i]->name, fmt,
+ i == event->n_fields - 1 ? "" : ", ");
+ }
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
+
+ for (i = 0; i < event->n_fields; i++) {
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ ", REC->%s", event->fields[i]->name);
+ }
+
+#undef LEN_OR_ZERO
+
+ /* return the length of print_fmt */
+ return pos;
+}
+
+static int set_synth_event_print_fmt(struct trace_event_call *call)
+{
+ struct synth_event *event = call->data;
+ char *print_fmt;
+ int len;
+
+ /* First: called with 0 length to calculate the needed length */
+ len = __set_synth_event_print_fmt(event, NULL, 0);
+
+ print_fmt = kmalloc(len + 1, GFP_KERNEL);
+ if (!print_fmt)
+ return -ENOMEM;
+
+ /* Second: actually write the @print_fmt */
+ __set_synth_event_print_fmt(event, print_fmt, len + 1);
+ call->print_fmt = print_fmt;
+
+ return 0;
+}
+
+static void free_synth_field(struct synth_field *field)
+{
+ kfree(field->type);
+ kfree(field->name);
+ kfree(field);
+}
+
+static struct synth_field *parse_synth_field(int argc, const char **argv,
+ int *consumed)
+{
+ struct synth_field *field;
+ const char *prefix = NULL, *field_type = argv[0], *field_name, *array;
+ int len, ret = 0;
+
+ if (field_type[0] == ';')
+ field_type++;
+
+ if (!strcmp(field_type, "unsigned")) {
+ if (argc < 3)
+ return ERR_PTR(-EINVAL);
+ prefix = "unsigned ";
+ field_type = argv[1];
+ field_name = argv[2];
+ *consumed = 3;
+ } else {
+ field_name = argv[1];
+ *consumed = 2;
+ }
+
+ field = kzalloc(sizeof(*field), GFP_KERNEL);
+ if (!field)
+ return ERR_PTR(-ENOMEM);
+
+ len = strlen(field_name);
+ array = strchr(field_name, '[');
+ if (array)
+ len -= strlen(array);
+ else if (field_name[len - 1] == ';')
+ len--;
+
+ field->name = kmemdup_nul(field_name, len, GFP_KERNEL);
+ if (!field->name) {
+ ret = -ENOMEM;
+ goto free;
+ }
+
+ if (field_type[0] == ';')
+ field_type++;
+ len = strlen(field_type) + 1;
+ if (array)
+ len += strlen(array);
+ if (prefix)
+ len += strlen(prefix);
+
+ field->type = kzalloc(len, GFP_KERNEL);
+ if (!field->type) {
+ ret = -ENOMEM;
+ goto free;
+ }
+ if (prefix)
+ strcat(field->type, prefix);
+ strcat(field->type, field_type);
+ if (array) {
+ strcat(field->type, array);
+ if (field->type[len - 1] == ';')
+ field->type[len - 1] = '\0';
+ }
+
+ field->size = synth_field_size(field->type);
+ if (!field->size) {
+ ret = -EINVAL;
+ goto free;
+ }
+
+ if (synth_field_is_string(field->type))
+ field->is_string = true;
+
+ field->is_signed = synth_field_signed(field->type);
+
+ out:
+ return field;
+ free:
+ free_synth_field(field);
+ field = ERR_PTR(ret);
+ goto out;
+}
+
+static void free_synth_tracepoint(struct tracepoint *tp)
+{
+ if (!tp)
+ return;
+
+ kfree(tp->name);
+ kfree(tp);
+}
+
+static struct tracepoint *alloc_synth_tracepoint(char *name)
+{
+ struct tracepoint *tp;
+
+ tp = kzalloc(sizeof(*tp), GFP_KERNEL);
+ if (!tp)
+ return ERR_PTR(-ENOMEM);
+
+ tp->name = kstrdup(name, GFP_KERNEL);
+ if (!tp->name) {
+ kfree(tp);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return tp;
+}
+
+struct synth_event *find_synth_event(const char *name)
+{
+ struct dyn_event *pos;
+ struct synth_event *event;
+
+ for_each_dyn_event(pos) {
+ if (!is_synth_event(pos))
+ continue;
+ event = to_synth_event(pos);
+ if (strcmp(event->name, name) == 0)
+ return event;
+ }
+
+ return NULL;
+}
+
+static struct trace_event_fields synth_event_fields_array[] = {
+ { .type = TRACE_FUNCTION_TYPE,
+ .define_fields = synth_event_define_fields },
+ {}
+};
+
+static int register_synth_event(struct synth_event *event)
+{
+ struct trace_event_call *call = &event->call;
+ int ret = 0;
+
+ event->call.class = &event->class;
+ event->class.system = kstrdup(SYNTH_SYSTEM, GFP_KERNEL);
+ if (!event->class.system) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ event->tp = alloc_synth_tracepoint(event->name);
+ if (IS_ERR(event->tp)) {
+ ret = PTR_ERR(event->tp);
+ event->tp = NULL;
+ goto out;
+ }
+
+ INIT_LIST_HEAD(&call->class->fields);
+ call->event.funcs = &synth_event_funcs;
+ call->class->fields_array = synth_event_fields_array;
+
+ ret = register_trace_event(&call->event);
+ if (!ret) {
+ ret = -ENODEV;
+ goto out;
+ }
+ call->flags = TRACE_EVENT_FL_TRACEPOINT;
+ call->class->reg = trace_event_reg;
+ call->class->probe = trace_event_raw_event_synth;
+ call->data = event;
+ call->tp = event->tp;
+
+ ret = trace_add_event_call(call);
+ if (ret) {
+ pr_warn("Failed to register synthetic event: %s\n",
+ trace_event_name(call));
+ goto err;
+ }
+
+ ret = set_synth_event_print_fmt(call);
+ if (ret < 0) {
+ trace_remove_event_call(call);
+ goto err;
+ }
+ out:
+ return ret;
+ err:
+ unregister_trace_event(&call->event);
+ goto out;
+}
+
+static int unregister_synth_event(struct synth_event *event)
+{
+ struct trace_event_call *call = &event->call;
+ int ret;
+
+ ret = trace_remove_event_call(call);
+
+ return ret;
+}
+
+static void free_synth_event(struct synth_event *event)
+{
+ unsigned int i;
+
+ if (!event)
+ return;
+
+ for (i = 0; i < event->n_fields; i++)
+ free_synth_field(event->fields[i]);
+
+ kfree(event->fields);
+ kfree(event->name);
+ kfree(event->class.system);
+ free_synth_tracepoint(event->tp);
+ free_synth_event_print_fmt(&event->call);
+ kfree(event);
+}
+
+static struct synth_event *alloc_synth_event(const char *name, int n_fields,
+ struct synth_field **fields)
+{
+ struct synth_event *event;
+ unsigned int i;
+
+ event = kzalloc(sizeof(*event), GFP_KERNEL);
+ if (!event) {
+ event = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ event->name = kstrdup(name, GFP_KERNEL);
+ if (!event->name) {
+ kfree(event);
+ event = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ event->fields = kcalloc(n_fields, sizeof(*event->fields), GFP_KERNEL);
+ if (!event->fields) {
+ free_synth_event(event);
+ event = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ dyn_event_init(&event->devent, &synth_event_ops);
+
+ for (i = 0; i < n_fields; i++)
+ event->fields[i] = fields[i];
+
+ event->n_fields = n_fields;
+ out:
+ return event;
+}
+
+static int synth_event_check_arg_fn(void *data)
+{
+ struct dynevent_arg_pair *arg_pair = data;
+ int size;
+
+ size = synth_field_size((char *)arg_pair->lhs);
+
+ return size ? 0 : -EINVAL;
+}
+
+/**
+ * synth_event_add_field - Add a new field to a synthetic event cmd
+ * @cmd: A pointer to the dynevent_cmd struct representing the new event
+ * @type: The type of the new field to add
+ * @name: The name of the new field to add
+ *
+ * Add a new field to a synthetic event cmd object. Field ordering is in
+ * the same order the fields are added.
+ *
+ * See synth_field_size() for available types. If field_name contains
+ * [n] the field is considered to be an array.
+ *
+ * Return: 0 if successful, error otherwise.
+ */
+int synth_event_add_field(struct dynevent_cmd *cmd, const char *type,
+ const char *name)
+{
+ struct dynevent_arg_pair arg_pair;
+ int ret;
+
+ if (cmd->type != DYNEVENT_TYPE_SYNTH)
+ return -EINVAL;
+
+ if (!type || !name)
+ return -EINVAL;
+
+ dynevent_arg_pair_init(&arg_pair, 0, ';');
+
+ arg_pair.lhs = type;
+ arg_pair.rhs = name;
+
+ ret = dynevent_arg_pair_add(cmd, &arg_pair, synth_event_check_arg_fn);
+ if (ret)
+ return ret;
+
+ if (++cmd->n_fields > SYNTH_FIELDS_MAX)
+ ret = -EINVAL;
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(synth_event_add_field);
+
+/**
+ * synth_event_add_field_str - Add a new field to a synthetic event cmd
+ * @cmd: A pointer to the dynevent_cmd struct representing the new event
+ * @type_name: The type and name of the new field to add, as a single string
+ *
+ * Add a new field to a synthetic event cmd object, as a single
+ * string. The @type_name string is expected to be of the form 'type
+ * name', which will be appended by ';'. No sanity checking is done -
+ * what's passed in is assumed to already be well-formed. Field
+ * ordering is in the same order the fields are added.
+ *
+ * See synth_field_size() for available types. If field_name contains
+ * [n] the field is considered to be an array.
+ *
+ * Return: 0 if successful, error otherwise.
+ */
+int synth_event_add_field_str(struct dynevent_cmd *cmd, const char *type_name)
+{
+ struct dynevent_arg arg;
+ int ret;
+
+ if (cmd->type != DYNEVENT_TYPE_SYNTH)
+ return -EINVAL;
+
+ if (!type_name)
+ return -EINVAL;
+
+ dynevent_arg_init(&arg, ';');
+
+ arg.str = type_name;
+
+ ret = dynevent_arg_add(cmd, &arg, NULL);
+ if (ret)
+ return ret;
+
+ if (++cmd->n_fields > SYNTH_FIELDS_MAX)
+ ret = -EINVAL;
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(synth_event_add_field_str);
+
+/**
+ * synth_event_add_fields - Add multiple fields to a synthetic event cmd
+ * @cmd: A pointer to the dynevent_cmd struct representing the new event
+ * @fields: An array of type/name field descriptions
+ * @n_fields: The number of field descriptions contained in the fields array
+ *
+ * Add a new set of fields to a synthetic event cmd object. The event
+ * fields that will be defined for the event should be passed in as an
+ * array of struct synth_field_desc, and the number of elements in the
+ * array passed in as n_fields. Field ordering will retain the
+ * ordering given in the fields array.
+ *
+ * See synth_field_size() for available types. If field_name contains
+ * [n] the field is considered to be an array.
+ *
+ * Return: 0 if successful, error otherwise.
+ */
+int synth_event_add_fields(struct dynevent_cmd *cmd,
+ struct synth_field_desc *fields,
+ unsigned int n_fields)
+{
+ unsigned int i;
+ int ret = 0;
+
+ for (i = 0; i < n_fields; i++) {
+ if (fields[i].type == NULL || fields[i].name == NULL) {
+ ret = -EINVAL;
+ break;
+ }
+
+ ret = synth_event_add_field(cmd, fields[i].type, fields[i].name);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(synth_event_add_fields);
+
+/**
+ * __synth_event_gen_cmd_start - Start a synthetic event command from arg list
+ * @cmd: A pointer to the dynevent_cmd struct representing the new event
+ * @name: The name of the synthetic event
+ * @mod: The module creating the event, NULL if not created from a module
+ * @args: Variable number of arg (pairs), one pair for each field
+ *
+ * NOTE: Users normally won't want to call this function directly, but
+ * rather use the synth_event_gen_cmd_start() wrapper, which
+ * automatically adds a NULL to the end of the arg list. If this
+ * function is used directly, make sure the last arg in the variable
+ * arg list is NULL.
+ *
+ * Generate a synthetic event command to be executed by
+ * synth_event_gen_cmd_end(). This function can be used to generate
+ * the complete command or only the first part of it; in the latter
+ * case, synth_event_add_field(), synth_event_add_field_str(), or
+ * synth_event_add_fields() can be used to add more fields following
+ * this.
+ *
+ * There should be an even number variable args, each pair consisting
+ * of a type followed by a field name.
+ *
+ * See synth_field_size() for available types. If field_name contains
+ * [n] the field is considered to be an array.
+ *
+ * Return: 0 if successful, error otherwise.
+ */
+int __synth_event_gen_cmd_start(struct dynevent_cmd *cmd, const char *name,
+ struct module *mod, ...)
+{
+ struct dynevent_arg arg;
+ va_list args;
+ int ret;
+
+ cmd->event_name = name;
+ cmd->private_data = mod;
+
+ if (cmd->type != DYNEVENT_TYPE_SYNTH)
+ return -EINVAL;
+
+ dynevent_arg_init(&arg, 0);
+ arg.str = name;
+ ret = dynevent_arg_add(cmd, &arg, NULL);
+ if (ret)
+ return ret;
+
+ va_start(args, mod);
+ for (;;) {
+ const char *type, *name;
+
+ type = va_arg(args, const char *);
+ if (!type)
+ break;
+ name = va_arg(args, const char *);
+ if (!name)
+ break;
+
+ if (++cmd->n_fields > SYNTH_FIELDS_MAX) {
+ ret = -EINVAL;
+ break;
+ }
+
+ ret = synth_event_add_field(cmd, type, name);
+ if (ret)
+ break;
+ }
+ va_end(args);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__synth_event_gen_cmd_start);
+
+/**
+ * synth_event_gen_cmd_array_start - Start synthetic event command from an array
+ * @cmd: A pointer to the dynevent_cmd struct representing the new event
+ * @name: The name of the synthetic event
+ * @fields: An array of type/name field descriptions
+ * @n_fields: The number of field descriptions contained in the fields array
+ *
+ * Generate a synthetic event command to be executed by
+ * synth_event_gen_cmd_end(). This function can be used to generate
+ * the complete command or only the first part of it; in the latter
+ * case, synth_event_add_field(), synth_event_add_field_str(), or
+ * synth_event_add_fields() can be used to add more fields following
+ * this.
+ *
+ * The event fields that will be defined for the event should be
+ * passed in as an array of struct synth_field_desc, and the number of
+ * elements in the array passed in as n_fields. Field ordering will
+ * retain the ordering given in the fields array.
+ *
+ * See synth_field_size() for available types. If field_name contains
+ * [n] the field is considered to be an array.
+ *
+ * Return: 0 if successful, error otherwise.
+ */
+int synth_event_gen_cmd_array_start(struct dynevent_cmd *cmd, const char *name,
+ struct module *mod,
+ struct synth_field_desc *fields,
+ unsigned int n_fields)
+{
+ struct dynevent_arg arg;
+ unsigned int i;
+ int ret = 0;
+
+ cmd->event_name = name;
+ cmd->private_data = mod;
+
+ if (cmd->type != DYNEVENT_TYPE_SYNTH)
+ return -EINVAL;
+
+ if (n_fields > SYNTH_FIELDS_MAX)
+ return -EINVAL;
+
+ dynevent_arg_init(&arg, 0);
+ arg.str = name;
+ ret = dynevent_arg_add(cmd, &arg, NULL);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < n_fields; i++) {
+ if (fields[i].type == NULL || fields[i].name == NULL)
+ return -EINVAL;
+
+ ret = synth_event_add_field(cmd, fields[i].type, fields[i].name);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(synth_event_gen_cmd_array_start);
+
+static int __create_synth_event(int argc, const char *name, const char **argv)
+{
+ struct synth_field *field, *fields[SYNTH_FIELDS_MAX];
+ struct synth_event *event = NULL;
+ int i, consumed = 0, n_fields = 0, ret = 0;
+
+ /*
+ * Argument syntax:
+ * - Add synthetic event: <event_name> field[;field] ...
+ * - Remove synthetic event: !<event_name> field[;field] ...
+ * where 'field' = type field_name
+ */
+
+ if (name[0] == '\0' || argc < 1)
+ return -EINVAL;
+
+ mutex_lock(&event_mutex);
+
+ event = find_synth_event(name);
+ if (event) {
+ ret = -EEXIST;
+ goto out;
+ }
+
+ for (i = 0; i < argc - 1; i++) {
+ if (strcmp(argv[i], ";") == 0)
+ continue;
+ if (n_fields == SYNTH_FIELDS_MAX) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ field = parse_synth_field(argc - i, &argv[i], &consumed);
+ if (IS_ERR(field)) {
+ ret = PTR_ERR(field);
+ goto err;
+ }
+ fields[n_fields++] = field;
+ i += consumed - 1;
+ }
+
+ if (i < argc && strcmp(argv[i], ";") != 0) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ event = alloc_synth_event(name, n_fields, fields);
+ if (IS_ERR(event)) {
+ ret = PTR_ERR(event);
+ event = NULL;
+ goto err;
+ }
+ ret = register_synth_event(event);
+ if (!ret)
+ dyn_event_add(&event->devent);
+ else
+ free_synth_event(event);
+ out:
+ mutex_unlock(&event_mutex);
+
+ return ret;
+ err:
+ for (i = 0; i < n_fields; i++)
+ free_synth_field(fields[i]);
+
+ goto out;
+}
+
+/**
+ * synth_event_create - Create a new synthetic event
+ * @name: The name of the new sythetic event
+ * @fields: An array of type/name field descriptions
+ * @n_fields: The number of field descriptions contained in the fields array
+ * @mod: The module creating the event, NULL if not created from a module
+ *
+ * Create a new synthetic event with the given name under the
+ * trace/events/synthetic/ directory. The event fields that will be
+ * defined for the event should be passed in as an array of struct
+ * synth_field_desc, and the number elements in the array passed in as
+ * n_fields. Field ordering will retain the ordering given in the
+ * fields array.
+ *
+ * If the new synthetic event is being created from a module, the mod
+ * param must be non-NULL. This will ensure that the trace buffer
+ * won't contain unreadable events.
+ *
+ * The new synth event should be deleted using synth_event_delete()
+ * function. The new synthetic event can be generated from modules or
+ * other kernel code using trace_synth_event() and related functions.
+ *
+ * Return: 0 if successful, error otherwise.
+ */
+int synth_event_create(const char *name, struct synth_field_desc *fields,
+ unsigned int n_fields, struct module *mod)
+{
+ struct dynevent_cmd cmd;
+ char *buf;
+ int ret;
+
+ buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ synth_event_cmd_init(&cmd, buf, MAX_DYNEVENT_CMD_LEN);
+
+ ret = synth_event_gen_cmd_array_start(&cmd, name, mod,
+ fields, n_fields);
+ if (ret)
+ goto out;
+
+ ret = synth_event_gen_cmd_end(&cmd);
+ out:
+ kfree(buf);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(synth_event_create);
+
+static int destroy_synth_event(struct synth_event *se)
+{
+ int ret;
+
+ if (se->ref)
+ ret = -EBUSY;
+ else {
+ ret = unregister_synth_event(se);
+ if (!ret) {
+ dyn_event_remove(&se->devent);
+ free_synth_event(se);
+ }
+ }
+
+ return ret;
+}
+
+/**
+ * synth_event_delete - Delete a synthetic event
+ * @event_name: The name of the new sythetic event
+ *
+ * Delete a synthetic event that was created with synth_event_create().
+ *
+ * Return: 0 if successful, error otherwise.
+ */
+int synth_event_delete(const char *event_name)
+{
+ struct synth_event *se = NULL;
+ struct module *mod = NULL;
+ int ret = -ENOENT;
+
+ mutex_lock(&event_mutex);
+ se = find_synth_event(event_name);
+ if (se) {
+ mod = se->mod;
+ ret = destroy_synth_event(se);
+ }
+ mutex_unlock(&event_mutex);
+
+ if (mod) {
+ mutex_lock(&trace_types_lock);
+ /*
+ * It is safest to reset the ring buffer if the module
+ * being unloaded registered any events that were
+ * used. The only worry is if a new module gets
+ * loaded, and takes on the same id as the events of
+ * this module. When printing out the buffer, traced
+ * events left over from this module may be passed to
+ * the new module events and unexpected results may
+ * occur.
+ */
+ tracing_reset_all_online_cpus();
+ mutex_unlock(&trace_types_lock);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(synth_event_delete);
+
+static int create_or_delete_synth_event(int argc, char **argv)
+{
+ const char *name = argv[0];
+ int ret;
+
+ /* trace_run_command() ensures argc != 0 */
+ if (name[0] == '!') {
+ ret = synth_event_delete(name + 1);
+ return ret;
+ }
+
+ ret = __create_synth_event(argc - 1, name, (const char **)argv + 1);
+ return ret == -ECANCELED ? -EINVAL : ret;
+}
+
+static int synth_event_run_command(struct dynevent_cmd *cmd)
+{
+ struct synth_event *se;
+ int ret;
+
+ ret = trace_run_command(cmd->seq.buffer, create_or_delete_synth_event);
+ if (ret)
+ return ret;
+
+ se = find_synth_event(cmd->event_name);
+ if (WARN_ON(!se))
+ return -ENOENT;
+
+ se->mod = cmd->private_data;
+
+ return ret;
+}
+
+/**
+ * synth_event_cmd_init - Initialize a synthetic event command object
+ * @cmd: A pointer to the dynevent_cmd struct representing the new event
+ * @buf: A pointer to the buffer used to build the command
+ * @maxlen: The length of the buffer passed in @buf
+ *
+ * Initialize a synthetic event command object. Use this before
+ * calling any of the other dyenvent_cmd functions.
+ */
+void synth_event_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen)
+{
+ dynevent_cmd_init(cmd, buf, maxlen, DYNEVENT_TYPE_SYNTH,
+ synth_event_run_command);
+}
+EXPORT_SYMBOL_GPL(synth_event_cmd_init);
+
+static inline int
+__synth_event_trace_start(struct trace_event_file *file,
+ struct synth_event_trace_state *trace_state)
+{
+ int entry_size, fields_size = 0;
+ int ret = 0;
+
+ memset(trace_state, '\0', sizeof(*trace_state));
+
+ /*
+ * Normal event tracing doesn't get called at all unless the
+ * ENABLED bit is set (which attaches the probe thus allowing
+ * this code to be called, etc). Because this is called
+ * directly by the user, we don't have that but we still need
+ * to honor not logging when disabled. For the the iterated
+ * trace case, we save the enabed state upon start and just
+ * ignore the following data calls.
+ */
+ if (!(file->flags & EVENT_FILE_FL_ENABLED) ||
+ trace_trigger_soft_disabled(file)) {
+ trace_state->disabled = true;
+ ret = -ENOENT;
+ goto out;
+ }
+
+ trace_state->event = file->event_call->data;
+
+ fields_size = trace_state->event->n_u64 * sizeof(u64);
+
+ /*
+ * Avoid ring buffer recursion detection, as this event
+ * is being performed within another event.
+ */
+ trace_state->buffer = file->tr->array_buffer.buffer;
+ ring_buffer_nest_start(trace_state->buffer);
+
+ entry_size = sizeof(*trace_state->entry) + fields_size;
+ trace_state->entry = trace_event_buffer_reserve(&trace_state->fbuffer,
+ file,
+ entry_size);
+ if (!trace_state->entry) {
+ ring_buffer_nest_end(trace_state->buffer);
+ ret = -EINVAL;
+ }
+out:
+ return ret;
+}
+
+static inline void
+__synth_event_trace_end(struct synth_event_trace_state *trace_state)
+{
+ trace_event_buffer_commit(&trace_state->fbuffer);
+
+ ring_buffer_nest_end(trace_state->buffer);
+}
+
+/**
+ * synth_event_trace - Trace a synthetic event
+ * @file: The trace_event_file representing the synthetic event
+ * @n_vals: The number of values in vals
+ * @args: Variable number of args containing the event values
+ *
+ * Trace a synthetic event using the values passed in the variable
+ * argument list.
+ *
+ * The argument list should be a list 'n_vals' u64 values. The number
+ * of vals must match the number of field in the synthetic event, and
+ * must be in the same order as the synthetic event fields.
+ *
+ * All vals should be cast to u64, and string vals are just pointers
+ * to strings, cast to u64. Strings will be copied into space
+ * reserved in the event for the string, using these pointers.
+ *
+ * Return: 0 on success, err otherwise.
+ */
+int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...)
+{
+ struct synth_event_trace_state state;
+ unsigned int i, n_u64;
+ va_list args;
+ int ret;
+
+ ret = __synth_event_trace_start(file, &state);
+ if (ret) {
+ if (ret == -ENOENT)
+ ret = 0; /* just disabled, not really an error */
+ return ret;
+ }
+
+ if (n_vals != state.event->n_fields) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ va_start(args, n_vals);
+ for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) {
+ u64 val;
+
+ val = va_arg(args, u64);
+
+ if (state.event->fields[i]->is_string) {
+ char *str_val = (char *)(long)val;
+ char *str_field = (char *)&state.entry->fields[n_u64];
+
+ strscpy(str_field, str_val, STR_VAR_LEN_MAX);
+ n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
+ } else {
+ struct synth_field *field = state.event->fields[i];
+
+ switch (field->size) {
+ case 1:
+ *(u8 *)&state.entry->fields[n_u64] = (u8)val;
+ break;
+
+ case 2:
+ *(u16 *)&state.entry->fields[n_u64] = (u16)val;
+ break;
+
+ case 4:
+ *(u32 *)&state.entry->fields[n_u64] = (u32)val;
+ break;
+
+ default:
+ state.entry->fields[n_u64] = val;
+ break;
+ }
+ n_u64++;
+ }
+ }
+ va_end(args);
+out:
+ __synth_event_trace_end(&state);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(synth_event_trace);
+
+/**
+ * synth_event_trace_array - Trace a synthetic event from an array
+ * @file: The trace_event_file representing the synthetic event
+ * @vals: Array of values
+ * @n_vals: The number of values in vals
+ *
+ * Trace a synthetic event using the values passed in as 'vals'.
+ *
+ * The 'vals' array is just an array of 'n_vals' u64. The number of
+ * vals must match the number of field in the synthetic event, and
+ * must be in the same order as the synthetic event fields.
+ *
+ * All vals should be cast to u64, and string vals are just pointers
+ * to strings, cast to u64. Strings will be copied into space
+ * reserved in the event for the string, using these pointers.
+ *
+ * Return: 0 on success, err otherwise.
+ */
+int synth_event_trace_array(struct trace_event_file *file, u64 *vals,
+ unsigned int n_vals)
+{
+ struct synth_event_trace_state state;
+ unsigned int i, n_u64;
+ int ret;
+
+ ret = __synth_event_trace_start(file, &state);
+ if (ret) {
+ if (ret == -ENOENT)
+ ret = 0; /* just disabled, not really an error */
+ return ret;
+ }
+
+ if (n_vals != state.event->n_fields) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) {
+ if (state.event->fields[i]->is_string) {
+ char *str_val = (char *)(long)vals[i];
+ char *str_field = (char *)&state.entry->fields[n_u64];
+
+ strscpy(str_field, str_val, STR_VAR_LEN_MAX);
+ n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
+ } else {
+ struct synth_field *field = state.event->fields[i];
+ u64 val = vals[i];
+
+ switch (field->size) {
+ case 1:
+ *(u8 *)&state.entry->fields[n_u64] = (u8)val;
+ break;
+
+ case 2:
+ *(u16 *)&state.entry->fields[n_u64] = (u16)val;
+ break;
+
+ case 4:
+ *(u32 *)&state.entry->fields[n_u64] = (u32)val;
+ break;
+
+ default:
+ state.entry->fields[n_u64] = val;
+ break;
+ }
+ n_u64++;
+ }
+ }
+out:
+ __synth_event_trace_end(&state);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(synth_event_trace_array);
+
+/**
+ * synth_event_trace_start - Start piecewise synthetic event trace
+ * @file: The trace_event_file representing the synthetic event
+ * @trace_state: A pointer to object tracking the piecewise trace state
+ *
+ * Start the trace of a synthetic event field-by-field rather than all
+ * at once.
+ *
+ * This function 'opens' an event trace, which means space is reserved
+ * for the event in the trace buffer, after which the event's
+ * individual field values can be set through either
+ * synth_event_add_next_val() or synth_event_add_val().
+ *
+ * A pointer to a trace_state object is passed in, which will keep
+ * track of the current event trace state until the event trace is
+ * closed (and the event finally traced) using
+ * synth_event_trace_end().
+ *
+ * Note that synth_event_trace_end() must be called after all values
+ * have been added for each event trace, regardless of whether adding
+ * all field values succeeded or not.
+ *
+ * Note also that for a given event trace, all fields must be added
+ * using either synth_event_add_next_val() or synth_event_add_val()
+ * but not both together or interleaved.
+ *
+ * Return: 0 on success, err otherwise.
+ */
+int synth_event_trace_start(struct trace_event_file *file,
+ struct synth_event_trace_state *trace_state)
+{
+ int ret;
+
+ if (!trace_state)
+ return -EINVAL;
+
+ ret = __synth_event_trace_start(file, trace_state);
+ if (ret == -ENOENT)
+ ret = 0; /* just disabled, not really an error */
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(synth_event_trace_start);
+
+static int __synth_event_add_val(const char *field_name, u64 val,
+ struct synth_event_trace_state *trace_state)
+{
+ struct synth_field *field = NULL;
+ struct synth_trace_event *entry;
+ struct synth_event *event;
+ int i, ret = 0;
+
+ if (!trace_state) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* can't mix add_next_synth_val() with add_synth_val() */
+ if (field_name) {
+ if (trace_state->add_next) {
+ ret = -EINVAL;
+ goto out;
+ }
+ trace_state->add_name = true;
+ } else {
+ if (trace_state->add_name) {
+ ret = -EINVAL;
+ goto out;
+ }
+ trace_state->add_next = true;
+ }
+
+ if (trace_state->disabled)
+ goto out;
+
+ event = trace_state->event;
+ if (trace_state->add_name) {
+ for (i = 0; i < event->n_fields; i++) {
+ field = event->fields[i];
+ if (strcmp(field->name, field_name) == 0)
+ break;
+ }
+ if (!field) {
+ ret = -EINVAL;
+ goto out;
+ }
+ } else {
+ if (trace_state->cur_field >= event->n_fields) {
+ ret = -EINVAL;
+ goto out;
+ }
+ field = event->fields[trace_state->cur_field++];
+ }
+
+ entry = trace_state->entry;
+ if (field->is_string) {
+ char *str_val = (char *)(long)val;
+ char *str_field;
+
+ if (!str_val) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ str_field = (char *)&entry->fields[field->offset];
+ strscpy(str_field, str_val, STR_VAR_LEN_MAX);
+ } else {
+ switch (field->size) {
+ case 1:
+ *(u8 *)&trace_state->entry->fields[field->offset] = (u8)val;
+ break;
+
+ case 2:
+ *(u16 *)&trace_state->entry->fields[field->offset] = (u16)val;
+ break;
+
+ case 4:
+ *(u32 *)&trace_state->entry->fields[field->offset] = (u32)val;
+ break;
+
+ default:
+ trace_state->entry->fields[field->offset] = val;
+ break;
+ }
+ }
+ out:
+ return ret;
+}
+
+/**
+ * synth_event_add_next_val - Add the next field's value to an open synth trace
+ * @val: The value to set the next field to
+ * @trace_state: A pointer to object tracking the piecewise trace state
+ *
+ * Set the value of the next field in an event that's been opened by
+ * synth_event_trace_start().
+ *
+ * The val param should be the value cast to u64. If the value points
+ * to a string, the val param should be a char * cast to u64.
+ *
+ * This function assumes all the fields in an event are to be set one
+ * after another - successive calls to this function are made, one for
+ * each field, in the order of the fields in the event, until all
+ * fields have been set. If you'd rather set each field individually
+ * without regard to ordering, synth_event_add_val() can be used
+ * instead.
+ *
+ * Note however that synth_event_add_next_val() and
+ * synth_event_add_val() can't be intermixed for a given event trace -
+ * one or the other but not both can be used at the same time.
+ *
+ * Note also that synth_event_trace_end() must be called after all
+ * values have been added for each event trace, regardless of whether
+ * adding all field values succeeded or not.
+ *
+ * Return: 0 on success, err otherwise.
+ */
+int synth_event_add_next_val(u64 val,
+ struct synth_event_trace_state *trace_state)
+{
+ return __synth_event_add_val(NULL, val, trace_state);
+}
+EXPORT_SYMBOL_GPL(synth_event_add_next_val);
+
+/**
+ * synth_event_add_val - Add a named field's value to an open synth trace
+ * @field_name: The name of the synthetic event field value to set
+ * @val: The value to set the next field to
+ * @trace_state: A pointer to object tracking the piecewise trace state
+ *
+ * Set the value of the named field in an event that's been opened by
+ * synth_event_trace_start().
+ *
+ * The val param should be the value cast to u64. If the value points
+ * to a string, the val param should be a char * cast to u64.
+ *
+ * This function looks up the field name, and if found, sets the field
+ * to the specified value. This lookup makes this function more
+ * expensive than synth_event_add_next_val(), so use that or the
+ * none-piecewise synth_event_trace() instead if efficiency is more
+ * important.
+ *
+ * Note however that synth_event_add_next_val() and
+ * synth_event_add_val() can't be intermixed for a given event trace -
+ * one or the other but not both can be used at the same time.
+ *
+ * Note also that synth_event_trace_end() must be called after all
+ * values have been added for each event trace, regardless of whether
+ * adding all field values succeeded or not.
+ *
+ * Return: 0 on success, err otherwise.
+ */
+int synth_event_add_val(const char *field_name, u64 val,
+ struct synth_event_trace_state *trace_state)
+{
+ return __synth_event_add_val(field_name, val, trace_state);
+}
+EXPORT_SYMBOL_GPL(synth_event_add_val);
+
+/**
+ * synth_event_trace_end - End piecewise synthetic event trace
+ * @trace_state: A pointer to object tracking the piecewise trace state
+ *
+ * End the trace of a synthetic event opened by
+ * synth_event_trace__start().
+ *
+ * This function 'closes' an event trace, which basically means that
+ * it commits the reserved event and cleans up other loose ends.
+ *
+ * A pointer to a trace_state object is passed in, which will keep
+ * track of the current event trace state opened with
+ * synth_event_trace_start().
+ *
+ * Note that this function must be called after all values have been
+ * added for each event trace, regardless of whether adding all field
+ * values succeeded or not.
+ *
+ * Return: 0 on success, err otherwise.
+ */
+int synth_event_trace_end(struct synth_event_trace_state *trace_state)
+{
+ if (!trace_state)
+ return -EINVAL;
+
+ __synth_event_trace_end(trace_state);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(synth_event_trace_end);
+
+static int create_synth_event(int argc, const char **argv)
+{
+ const char *name = argv[0];
+ int len;
+
+ if (name[0] != 's' || name[1] != ':')
+ return -ECANCELED;
+ name += 2;
+
+ /* This interface accepts group name prefix */
+ if (strchr(name, '/')) {
+ len = str_has_prefix(name, SYNTH_SYSTEM "/");
+ if (len == 0)
+ return -EINVAL;
+ name += len;
+ }
+ return __create_synth_event(argc - 1, name, argv + 1);
+}
+
+static int synth_event_release(struct dyn_event *ev)
+{
+ struct synth_event *event = to_synth_event(ev);
+ int ret;
+
+ if (event->ref)
+ return -EBUSY;
+
+ ret = unregister_synth_event(event);
+ if (ret)
+ return ret;
+
+ dyn_event_remove(ev);
+ free_synth_event(event);
+ return 0;
+}
+
+static int __synth_event_show(struct seq_file *m, struct synth_event *event)
+{
+ struct synth_field *field;
+ unsigned int i;
+
+ seq_printf(m, "%s\t", event->name);
+
+ for (i = 0; i < event->n_fields; i++) {
+ field = event->fields[i];
+
+ /* parameter values */
+ seq_printf(m, "%s %s%s", field->type, field->name,
+ i == event->n_fields - 1 ? "" : "; ");
+ }
+
+ seq_putc(m, '\n');
+
+ return 0;
+}
+
+static int synth_event_show(struct seq_file *m, struct dyn_event *ev)
+{
+ struct synth_event *event = to_synth_event(ev);
+
+ seq_printf(m, "s:%s/", event->class.system);
+
+ return __synth_event_show(m, event);
+}
+
+static int synth_events_seq_show(struct seq_file *m, void *v)
+{
+ struct dyn_event *ev = v;
+
+ if (!is_synth_event(ev))
+ return 0;
+
+ return __synth_event_show(m, to_synth_event(ev));
+}
+
+static const struct seq_operations synth_events_seq_op = {
+ .start = dyn_event_seq_start,
+ .next = dyn_event_seq_next,
+ .stop = dyn_event_seq_stop,
+ .show = synth_events_seq_show,
+};
+
+static int synth_events_open(struct inode *inode, struct file *file)
+{
+ int ret;
+
+ ret = security_locked_down(LOCKDOWN_TRACEFS);
+ if (ret)
+ return ret;
+
+ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
+ ret = dyn_events_release_all(&synth_event_ops);
+ if (ret < 0)
+ return ret;
+ }
+
+ return seq_open(file, &synth_events_seq_op);
+}
+
+static ssize_t synth_events_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ return trace_parse_run_command(file, buffer, count, ppos,
+ create_or_delete_synth_event);
+}
+
+static const struct file_operations synth_events_fops = {
+ .open = synth_events_open,
+ .write = synth_events_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static __init int trace_events_synth_init(void)
+{
+ struct dentry *entry = NULL;
+ struct dentry *d_tracer;
+ int err = 0;
+
+ err = dyn_event_register(&synth_event_ops);
+ if (err) {
+ pr_warn("Could not register synth_event_ops\n");
+ return err;
+ }
+
+ d_tracer = tracing_init_dentry();
+ if (IS_ERR(d_tracer)) {
+ err = PTR_ERR(d_tracer);
+ goto err;
+ }
+
+ entry = tracefs_create_file("synthetic_events", 0644, d_tracer,
+ NULL, &synth_events_fops);
+ if (!entry) {
+ err = -ENODEV;
+ goto err;
+ }
+
+ return err;
+ err:
+ pr_warn("Could not create tracefs 'synthetic_events' entry\n");
+
+ return err;
+}
+
+fs_initcall(trace_events_synth_init);
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 3a74736da363..f725802160c0 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -216,11 +216,17 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file)
int trigger_process_regex(struct trace_event_file *file, char *buff)
{
- char *command, *next = buff;
+ char *command, *next;
struct event_command *p;
int ret = -EINVAL;
+ next = buff = skip_spaces(buff);
command = strsep(&next, ": \t");
+ if (next) {
+ next = skip_spaces(next);
+ if (!*next)
+ next = NULL;
+ }
command = (command[0] != '!') ? command : command + 1;
mutex_lock(&trigger_cmd_mutex);
@@ -630,8 +636,14 @@ event_trigger_callback(struct event_command *cmd_ops,
int ret;
/* separate the trigger from the filter (t:n [if filter]) */
- if (param && isdigit(param[0]))
+ if (param && isdigit(param[0])) {
trigger = strsep(&param, " \t");
+ if (param) {
+ param = skip_spaces(param);
+ if (!*param)
+ param = NULL;
+ }
+ }
trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
@@ -1368,6 +1380,11 @@ int event_enable_trigger_func(struct event_command *cmd_ops,
trigger = strsep(&param, " \t");
if (!trigger)
return -EINVAL;
+ if (param) {
+ param = skip_spaces(param);
+ if (!*param)
+ param = NULL;
+ }
system = strsep(&trigger, ":");
if (!trigger)
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 77ce5a3b6773..70d3d0a09053 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -45,6 +45,9 @@ static int ftrace_event_register(struct trace_event_call *call,
#undef __field_desc
#define __field_desc(type, container, item) type item;
+#undef __field_packed
+#define __field_packed(type, container, item) type item;
+
#undef __array
#define __array(type, item, size) type item[size];
@@ -85,6 +88,13 @@ static void __always_unused ____ftrace_check_##name(void) \
.size = sizeof(_type), .align = __alignof__(_type), \
is_signed_type(_type), .filter_type = _filter_type },
+
+#undef __field_ext_packed
+#define __field_ext_packed(_type, _item, _filter_type) { \
+ .type = #_type, .name = #_item, \
+ .size = sizeof(_type), .align = 1, \
+ is_signed_type(_type), .filter_type = _filter_type },
+
#undef __field
#define __field(_type, _item) __field_ext(_type, _item, FILTER_OTHER)
@@ -94,6 +104,9 @@ static void __always_unused ____ftrace_check_##name(void) \
#undef __field_desc
#define __field_desc(_type, _container, _item) __field_ext(_type, _item, FILTER_OTHER)
+#undef __field_packed
+#define __field_packed(_type, _container, _item) __field_ext_packed(_type, _item, FILTER_OTHER)
+
#undef __array
#define __array(_type, _item, _len) { \
.type = #_type"["__stringify(_len)"]", .name = #_item, \
@@ -129,6 +142,9 @@ static struct trace_event_fields ftrace_event_fields_##name[] = { \
#undef __field_desc
#define __field_desc(type, container, item)
+#undef __field_packed
+#define __field_packed(type, container, item)
+
#undef __array
#define __array(type, item, len)
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 8a4c8d5c2c98..dd4dff71d89a 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -42,7 +42,7 @@ static int allocate_ftrace_ops(struct trace_array *tr)
if (!ops)
return -ENOMEM;
- /* Currently only the non stack verision is supported */
+ /* Currently only the non stack version is supported */
ops->func = function_trace_call;
ops->flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_PID;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 35989383ae11..aefb6065b508 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1202,35 +1202,41 @@ static const struct file_operations kprobe_profile_ops = {
/* Return the length of string -- including null terminal byte */
static nokprobe_inline int
+fetch_store_strlen_user(unsigned long addr)
+{
+ const void __user *uaddr = (__force const void __user *)addr;
+
+ return strnlen_user_nofault(uaddr, MAX_STRING_SIZE);
+}
+
+/* Return the length of string -- including null terminal byte */
+static nokprobe_inline int
fetch_store_strlen(unsigned long addr)
{
int ret, len = 0;
u8 c;
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if (addr < TASK_SIZE)
+ return fetch_store_strlen_user(addr);
+#endif
+
do {
- ret = probe_kernel_read(&c, (u8 *)addr + len, 1);
+ ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1);
len++;
} while (c && ret == 0 && len < MAX_STRING_SIZE);
return (ret < 0) ? ret : len;
}
-/* Return the length of string -- including null terminal byte */
-static nokprobe_inline int
-fetch_store_strlen_user(unsigned long addr)
-{
- const void __user *uaddr = (__force const void __user *)addr;
-
- return strnlen_unsafe_user(uaddr, MAX_STRING_SIZE);
-}
-
/*
- * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max
- * length and relative data location.
+ * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf
+ * with max length and relative data location.
*/
static nokprobe_inline int
-fetch_store_string(unsigned long addr, void *dest, void *base)
+fetch_store_string_user(unsigned long addr, void *dest, void *base)
{
+ const void __user *uaddr = (__force const void __user *)addr;
int maxlen = get_loc_len(*(u32 *)dest);
void *__dest;
long ret;
@@ -1240,11 +1246,7 @@ fetch_store_string(unsigned long addr, void *dest, void *base)
__dest = get_loc_data(dest, base);
- /*
- * Try to get string again, since the string can be changed while
- * probing.
- */
- ret = strncpy_from_unsafe(__dest, (void *)addr, maxlen);
+ ret = strncpy_from_user_nofault(__dest, uaddr, maxlen);
if (ret >= 0)
*(u32 *)dest = make_data_loc(ret, __dest - base);
@@ -1252,23 +1254,31 @@ fetch_store_string(unsigned long addr, void *dest, void *base)
}
/*
- * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf
- * with max length and relative data location.
+ * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max
+ * length and relative data location.
*/
static nokprobe_inline int
-fetch_store_string_user(unsigned long addr, void *dest, void *base)
+fetch_store_string(unsigned long addr, void *dest, void *base)
{
- const void __user *uaddr = (__force const void __user *)addr;
int maxlen = get_loc_len(*(u32 *)dest);
void *__dest;
long ret;
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if ((unsigned long)addr < TASK_SIZE)
+ return fetch_store_string_user(addr, dest, base);
+#endif
+
if (unlikely(!maxlen))
return -ENOMEM;
__dest = get_loc_data(dest, base);
- ret = strncpy_from_unsafe_user(__dest, uaddr, maxlen);
+ /*
+ * Try to get string again, since the string can be changed while
+ * probing.
+ */
+ ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen);
if (ret >= 0)
*(u32 *)dest = make_data_loc(ret, __dest - base);
@@ -1276,17 +1286,21 @@ fetch_store_string_user(unsigned long addr, void *dest, void *base)
}
static nokprobe_inline int
-probe_mem_read(void *dest, void *src, size_t size)
+probe_mem_read_user(void *dest, void *src, size_t size)
{
- return probe_kernel_read(dest, src, size);
+ const void __user *uaddr = (__force const void __user *)src;
+
+ return copy_from_user_nofault(dest, uaddr, size);
}
static nokprobe_inline int
-probe_mem_read_user(void *dest, void *src, size_t size)
+probe_mem_read(void *dest, void *src, size_t size)
{
- const void __user *uaddr = (__force const void __user *)src;
-
- return probe_user_read(dest, uaddr, size);
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if ((unsigned long)src < TASK_SIZE)
+ return probe_mem_read_user(dest, src, size);
+#endif
+ return copy_from_kernel_nofault(dest, src, size);
}
/* Note that we don't verify it, since the code does not come from user space */
@@ -1629,7 +1643,7 @@ int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type,
if (perf_type_tracepoint)
tk = find_trace_kprobe(pevent, group);
else
- tk = event->tp_event->data;
+ tk = trace_kprobe_primary_from_call(event->tp_event);
if (!tk)
return -EINVAL;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 9a121e147102..73976de7f8cc 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -393,7 +393,7 @@ static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
if (mm) {
const struct vm_area_struct *vma;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
vma = find_vma(mm, ip);
if (vma) {
file = vma->vm_file;
@@ -405,7 +405,7 @@ static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
trace_seq_printf(s, "[+0x%lx]",
ip - vmstart);
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
}
if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
trace_seq_printf(s, " <" IP_FMT ">", ip);
diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
index 4d8e99fdbbbe..f10073e62603 100644
--- a/kernel/trace/trace_preemptirq.c
+++ b/kernel/trace/trace_preemptirq.c
@@ -19,6 +19,24 @@
/* Per-cpu variable to prevent redundant calls when IRQs already off */
static DEFINE_PER_CPU(int, tracing_irq_cpu);
+/*
+ * Like trace_hardirqs_on() but without the lockdep invocation. This is
+ * used in the low level entry code where the ordering vs. RCU is important
+ * and lockdep uses a staged approach which splits the lockdep hardirq
+ * tracking into a RCU on and a RCU off section.
+ */
+void trace_hardirqs_on_prepare(void)
+{
+ if (this_cpu_read(tracing_irq_cpu)) {
+ if (!in_nmi())
+ trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1);
+ tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
+ this_cpu_write(tracing_irq_cpu, 0);
+ }
+}
+EXPORT_SYMBOL(trace_hardirqs_on_prepare);
+NOKPROBE_SYMBOL(trace_hardirqs_on_prepare);
+
void trace_hardirqs_on(void)
{
if (this_cpu_read(tracing_irq_cpu)) {
@@ -28,21 +46,41 @@ void trace_hardirqs_on(void)
this_cpu_write(tracing_irq_cpu, 0);
}
+ lockdep_hardirqs_on_prepare(CALLER_ADDR0);
lockdep_hardirqs_on(CALLER_ADDR0);
}
EXPORT_SYMBOL(trace_hardirqs_on);
NOKPROBE_SYMBOL(trace_hardirqs_on);
-void trace_hardirqs_off(void)
+/*
+ * Like trace_hardirqs_off() but without the lockdep invocation. This is
+ * used in the low level entry code where the ordering vs. RCU is important
+ * and lockdep uses a staged approach which splits the lockdep hardirq
+ * tracking into a RCU on and a RCU off section.
+ */
+void trace_hardirqs_off_finish(void)
{
if (!this_cpu_read(tracing_irq_cpu)) {
this_cpu_write(tracing_irq_cpu, 1);
tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
if (!in_nmi())
- trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+ trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1);
}
+}
+EXPORT_SYMBOL(trace_hardirqs_off_finish);
+NOKPROBE_SYMBOL(trace_hardirqs_off_finish);
+
+void trace_hardirqs_off(void)
+{
lockdep_hardirqs_off(CALLER_ADDR0);
+
+ if (!this_cpu_read(tracing_irq_cpu)) {
+ this_cpu_write(tracing_irq_cpu, 1);
+ tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
+ if (!in_nmi())
+ trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+ }
}
EXPORT_SYMBOL(trace_hardirqs_off);
NOKPROBE_SYMBOL(trace_hardirqs_off);
@@ -56,6 +94,7 @@ __visible void trace_hardirqs_on_caller(unsigned long caller_addr)
this_cpu_write(tracing_irq_cpu, 0);
}
+ lockdep_hardirqs_on_prepare(CALLER_ADDR0);
lockdep_hardirqs_on(CALLER_ADDR0);
}
EXPORT_SYMBOL(trace_hardirqs_on_caller);
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index ab8b6436d53f..d2867ccc6aca 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -639,8 +639,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
ret = -EINVAL;
goto fail;
}
- if ((code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM) ||
- parg->count) {
+ if ((code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM ||
+ code->op == FETCH_OP_DATA) || parg->count) {
/*
* IMM, DATA and COMM is pointing actual address, those
* must be kept, and if parg->count != 0, this is an
@@ -1006,7 +1006,7 @@ int trace_probe_init(struct trace_probe *tp, const char *event,
INIT_LIST_HEAD(&tp->event->class.fields);
INIT_LIST_HEAD(&tp->event->probes);
INIT_LIST_HEAD(&tp->list);
- list_add(&tp->event->probes, &tp->list);
+ list_add(&tp->list, &tp->event->probes);
call = trace_probe_event_call(tp);
call->class = &tp->event->class;
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index a0ff9e200ef6..a22b62813f8c 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -236,7 +236,7 @@ struct trace_probe_event {
struct trace_event_call call;
struct list_head files;
struct list_head probes;
- struct trace_uprobe_filter filter[0];
+ struct trace_uprobe_filter filter[];
};
struct trace_probe {
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index c557f42a9397..98bba4764c52 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -515,9 +515,8 @@ static const struct file_operations stack_trace_filter_fops = {
#endif /* CONFIG_DYNAMIC_FTRACE */
int
-stack_trace_sysctl(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+stack_trace_sysctl(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
int was_enabled;
int ret;
diff --git a/kernel/trace/trace_synth.h b/kernel/trace/trace_synth.h
new file mode 100644
index 000000000000..ac35c45207c4
--- /dev/null
+++ b/kernel/trace/trace_synth.h
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef __TRACE_SYNTH_H
+#define __TRACE_SYNTH_H
+
+#include "trace_dynevent.h"
+
+#define SYNTH_SYSTEM "synthetic"
+#define SYNTH_FIELDS_MAX 32
+
+#define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */
+
+struct synth_field {
+ char *type;
+ char *name;
+ size_t size;
+ unsigned int offset;
+ bool is_signed;
+ bool is_string;
+};
+
+struct synth_event {
+ struct dyn_event devent;
+ int ref;
+ char *name;
+ struct synth_field **fields;
+ unsigned int n_fields;
+ unsigned int n_u64;
+ struct trace_event_class class;
+ struct trace_event_call call;
+ struct tracepoint *tp;
+ struct module *mod;
+};
+
+extern struct synth_event *find_synth_event(const char *name);
+
+#endif /* __TRACE_SYNTH_H */
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 2a8e8e9c1c75..fdd47f99b18f 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1412,7 +1412,7 @@ int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type,
if (perf_type_tracepoint)
tu = find_probe_event(pevent, group);
else
- tu = event->tp_event->data;
+ tu = trace_uprobe_primary_from_call(event->tp_event);
if (!tu)
return -EINVAL;
diff --git a/kernel/umh.c b/kernel/umh.c
index 3474d6aa55d8..79f139a7ca03 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -641,7 +641,7 @@ int call_usermodehelper(const char *path, char **argv, char **envp, int wait)
EXPORT_SYMBOL(call_usermodehelper);
static int proc_cap_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
unsigned long cap_array[_KERNEL_CAPABILITY_U32S];
diff --git a/kernel/user.c b/kernel/user.c
index 5235d7f49982..b1635d94a1f2 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -82,7 +82,7 @@ EXPORT_SYMBOL_GPL(init_user_ns);
#define uidhashentry(uid) (uidhash_table + __uidhashfn((__kuid_val(uid))))
static struct kmem_cache *uid_cachep;
-struct hlist_head uidhash_table[UIDHASH_SZ];
+static struct hlist_head uidhash_table[UIDHASH_SZ];
/*
* The uidhash_lock is mostly taken from process context, but it is
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 8eadadc478f9..87804e0371fe 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -1253,7 +1253,7 @@ static void userns_put(struct ns_common *ns)
put_user_ns(to_user_ns(ns));
}
-static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+static int userns_install(struct nsset *nsset, struct ns_common *ns)
{
struct user_namespace *user_ns = to_user_ns(ns);
struct cred *cred;
@@ -1274,14 +1274,14 @@ static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns)
if (!ns_capable(user_ns, CAP_SYS_ADMIN))
return -EPERM;
- cred = prepare_creds();
+ cred = nsset_cred(nsset);
if (!cred)
- return -ENOMEM;
+ return -EINVAL;
put_user_ns(cred->user_ns);
set_cred_user_ns(cred, get_user_ns(user_ns));
- return commit_creds(cred);
+ return 0;
}
struct ns_common *ns_get_owner(struct ns_common *ns)
diff --git a/kernel/utsname.c b/kernel/utsname.c
index f0e491193009..e488d0e2ab45 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -140,12 +140,13 @@ static void utsns_put(struct ns_common *ns)
put_uts_ns(to_uts_ns(ns));
}
-static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new)
+static int utsns_install(struct nsset *nsset, struct ns_common *new)
{
+ struct nsproxy *nsproxy = nsset->nsproxy;
struct uts_namespace *ns = to_uts_ns(new);
if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
- !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
return -EPERM;
get_uts_ns(ns);
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 3732c888a949..4ca61d49885b 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -30,7 +30,7 @@ static void *get_uts(struct ctl_table *table)
* to observe. Should this be in kernel/sys.c ????
*/
static int proc_do_uts_string(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table uts_table;
int r;
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
new file mode 100644
index 000000000000..f74020f6bd9d
--- /dev/null
+++ b/kernel/watch_queue.c
@@ -0,0 +1,655 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Watch queue and general notification mechanism, built on pipes
+ *
+ * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * See Documentation/watch_queue.rst
+ */
+
+#define pr_fmt(fmt) "watchq: " fmt
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/printk.h>
+#include <linux/miscdevice.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/poll.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/file.h>
+#include <linux/security.h>
+#include <linux/cred.h>
+#include <linux/sched/signal.h>
+#include <linux/watch_queue.h>
+#include <linux/pipe_fs_i.h>
+
+MODULE_DESCRIPTION("Watch queue");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+
+#define WATCH_QUEUE_NOTE_SIZE 128
+#define WATCH_QUEUE_NOTES_PER_PAGE (PAGE_SIZE / WATCH_QUEUE_NOTE_SIZE)
+
+static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
+{
+ struct watch_queue *wqueue = (struct watch_queue *)buf->private;
+ struct page *page;
+ unsigned int bit;
+
+ /* We need to work out which note within the page this refers to, but
+ * the note might have been maximum size, so merely ANDing the offset
+ * off doesn't work. OTOH, the note must've been more than zero size.
+ */
+ bit = buf->offset + buf->len;
+ if ((bit & (WATCH_QUEUE_NOTE_SIZE - 1)) == 0)
+ bit -= WATCH_QUEUE_NOTE_SIZE;
+ bit /= WATCH_QUEUE_NOTE_SIZE;
+
+ page = buf->page;
+ bit += page->index;
+
+ set_bit(bit, wqueue->notes_bitmap);
+}
+
+// No try_steal function => no stealing
+#define watch_queue_pipe_buf_try_steal NULL
+
+/* New data written to a pipe may be appended to a buffer with this type. */
+static const struct pipe_buf_operations watch_queue_pipe_buf_ops = {
+ .release = watch_queue_pipe_buf_release,
+ .try_steal = watch_queue_pipe_buf_try_steal,
+ .get = generic_pipe_buf_get,
+};
+
+/*
+ * Post a notification to a watch queue.
+ */
+static bool post_one_notification(struct watch_queue *wqueue,
+ struct watch_notification *n)
+{
+ void *p;
+ struct pipe_inode_info *pipe = wqueue->pipe;
+ struct pipe_buffer *buf;
+ struct page *page;
+ unsigned int head, tail, mask, note, offset, len;
+ bool done = false;
+
+ if (!pipe)
+ return false;
+
+ spin_lock_irq(&pipe->rd_wait.lock);
+
+ if (wqueue->defunct)
+ goto out;
+
+ mask = pipe->ring_size - 1;
+ head = pipe->head;
+ tail = pipe->tail;
+ if (pipe_full(head, tail, pipe->ring_size))
+ goto lost;
+
+ note = find_first_bit(wqueue->notes_bitmap, wqueue->nr_notes);
+ if (note >= wqueue->nr_notes)
+ goto lost;
+
+ page = wqueue->notes[note / WATCH_QUEUE_NOTES_PER_PAGE];
+ offset = note % WATCH_QUEUE_NOTES_PER_PAGE * WATCH_QUEUE_NOTE_SIZE;
+ get_page(page);
+ len = n->info & WATCH_INFO_LENGTH;
+ p = kmap_atomic(page);
+ memcpy(p + offset, n, len);
+ kunmap_atomic(p);
+
+ buf = &pipe->bufs[head & mask];
+ buf->page = page;
+ buf->private = (unsigned long)wqueue;
+ buf->ops = &watch_queue_pipe_buf_ops;
+ buf->offset = offset;
+ buf->len = len;
+ buf->flags = PIPE_BUF_FLAG_WHOLE;
+ pipe->head = head + 1;
+
+ if (!test_and_clear_bit(note, wqueue->notes_bitmap)) {
+ spin_unlock_irq(&pipe->rd_wait.lock);
+ BUG();
+ }
+ wake_up_interruptible_sync_poll_locked(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
+ done = true;
+
+out:
+ spin_unlock_irq(&pipe->rd_wait.lock);
+ if (done)
+ kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+ return done;
+
+lost:
+ buf = &pipe->bufs[(head - 1) & mask];
+ buf->flags |= PIPE_BUF_FLAG_LOSS;
+ goto out;
+}
+
+/*
+ * Apply filter rules to a notification.
+ */
+static bool filter_watch_notification(const struct watch_filter *wf,
+ const struct watch_notification *n)
+{
+ const struct watch_type_filter *wt;
+ unsigned int st_bits = sizeof(wt->subtype_filter[0]) * 8;
+ unsigned int st_index = n->subtype / st_bits;
+ unsigned int st_bit = 1U << (n->subtype % st_bits);
+ int i;
+
+ if (!test_bit(n->type, wf->type_filter))
+ return false;
+
+ for (i = 0; i < wf->nr_filters; i++) {
+ wt = &wf->filters[i];
+ if (n->type == wt->type &&
+ (wt->subtype_filter[st_index] & st_bit) &&
+ (n->info & wt->info_mask) == wt->info_filter)
+ return true;
+ }
+
+ return false; /* If there is a filter, the default is to reject. */
+}
+
+/**
+ * __post_watch_notification - Post an event notification
+ * @wlist: The watch list to post the event to.
+ * @n: The notification record to post.
+ * @cred: The creds of the process that triggered the notification.
+ * @id: The ID to match on the watch.
+ *
+ * Post a notification of an event into a set of watch queues and let the users
+ * know.
+ *
+ * The size of the notification should be set in n->info & WATCH_INFO_LENGTH and
+ * should be in units of sizeof(*n).
+ */
+void __post_watch_notification(struct watch_list *wlist,
+ struct watch_notification *n,
+ const struct cred *cred,
+ u64 id)
+{
+ const struct watch_filter *wf;
+ struct watch_queue *wqueue;
+ struct watch *watch;
+
+ if (((n->info & WATCH_INFO_LENGTH) >> WATCH_INFO_LENGTH__SHIFT) == 0) {
+ WARN_ON(1);
+ return;
+ }
+
+ rcu_read_lock();
+
+ hlist_for_each_entry_rcu(watch, &wlist->watchers, list_node) {
+ if (watch->id != id)
+ continue;
+ n->info &= ~WATCH_INFO_ID;
+ n->info |= watch->info_id;
+
+ wqueue = rcu_dereference(watch->queue);
+ wf = rcu_dereference(wqueue->filter);
+ if (wf && !filter_watch_notification(wf, n))
+ continue;
+
+ if (security_post_notification(watch->cred, cred, n) < 0)
+ continue;
+
+ post_one_notification(wqueue, n);
+ }
+
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(__post_watch_notification);
+
+/*
+ * Allocate sufficient pages to preallocation for the requested number of
+ * notifications.
+ */
+long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes)
+{
+ struct watch_queue *wqueue = pipe->watch_queue;
+ struct page **pages;
+ unsigned long *bitmap;
+ unsigned long user_bufs;
+ unsigned int bmsize;
+ int ret, i, nr_pages;
+
+ if (!wqueue)
+ return -ENODEV;
+ if (wqueue->notes)
+ return -EBUSY;
+
+ if (nr_notes < 1 ||
+ nr_notes > 512) /* TODO: choose a better hard limit */
+ return -EINVAL;
+
+ nr_pages = (nr_notes + WATCH_QUEUE_NOTES_PER_PAGE - 1);
+ nr_pages /= WATCH_QUEUE_NOTES_PER_PAGE;
+ user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_pages);
+
+ if (nr_pages > pipe->max_usage &&
+ (too_many_pipe_buffers_hard(user_bufs) ||
+ too_many_pipe_buffers_soft(user_bufs)) &&
+ pipe_is_unprivileged_user()) {
+ ret = -EPERM;
+ goto error;
+ }
+
+ ret = pipe_resize_ring(pipe, nr_notes);
+ if (ret < 0)
+ goto error;
+
+ pages = kcalloc(sizeof(struct page *), nr_pages, GFP_KERNEL);
+ if (!pages)
+ goto error;
+
+ for (i = 0; i < nr_pages; i++) {
+ pages[i] = alloc_page(GFP_KERNEL);
+ if (!pages[i])
+ goto error_p;
+ pages[i]->index = i * WATCH_QUEUE_NOTES_PER_PAGE;
+ }
+
+ bmsize = (nr_notes + BITS_PER_LONG - 1) / BITS_PER_LONG;
+ bmsize *= sizeof(unsigned long);
+ bitmap = kmalloc(bmsize, GFP_KERNEL);
+ if (!bitmap)
+ goto error_p;
+
+ memset(bitmap, 0xff, bmsize);
+ wqueue->notes = pages;
+ wqueue->notes_bitmap = bitmap;
+ wqueue->nr_pages = nr_pages;
+ wqueue->nr_notes = nr_pages * WATCH_QUEUE_NOTES_PER_PAGE;
+ return 0;
+
+error_p:
+ for (i = 0; i < nr_pages; i++)
+ __free_page(pages[i]);
+ kfree(pages);
+error:
+ (void) account_pipe_buffers(pipe->user, nr_pages, pipe->nr_accounted);
+ return ret;
+}
+
+/*
+ * Set the filter on a watch queue.
+ */
+long watch_queue_set_filter(struct pipe_inode_info *pipe,
+ struct watch_notification_filter __user *_filter)
+{
+ struct watch_notification_type_filter *tf;
+ struct watch_notification_filter filter;
+ struct watch_type_filter *q;
+ struct watch_filter *wfilter;
+ struct watch_queue *wqueue = pipe->watch_queue;
+ int ret, nr_filter = 0, i;
+
+ if (!wqueue)
+ return -ENODEV;
+
+ if (!_filter) {
+ /* Remove the old filter */
+ wfilter = NULL;
+ goto set;
+ }
+
+ /* Grab the user's filter specification */
+ if (copy_from_user(&filter, _filter, sizeof(filter)) != 0)
+ return -EFAULT;
+ if (filter.nr_filters == 0 ||
+ filter.nr_filters > 16 ||
+ filter.__reserved != 0)
+ return -EINVAL;
+
+ tf = memdup_user(_filter->filters, filter.nr_filters * sizeof(*tf));
+ if (IS_ERR(tf))
+ return PTR_ERR(tf);
+
+ ret = -EINVAL;
+ for (i = 0; i < filter.nr_filters; i++) {
+ if ((tf[i].info_filter & ~tf[i].info_mask) ||
+ tf[i].info_mask & WATCH_INFO_LENGTH)
+ goto err_filter;
+ /* Ignore any unknown types */
+ if (tf[i].type >= sizeof(wfilter->type_filter) * 8)
+ continue;
+ nr_filter++;
+ }
+
+ /* Now we need to build the internal filter from only the relevant
+ * user-specified filters.
+ */
+ ret = -ENOMEM;
+ wfilter = kzalloc(struct_size(wfilter, filters, nr_filter), GFP_KERNEL);
+ if (!wfilter)
+ goto err_filter;
+ wfilter->nr_filters = nr_filter;
+
+ q = wfilter->filters;
+ for (i = 0; i < filter.nr_filters; i++) {
+ if (tf[i].type >= sizeof(wfilter->type_filter) * BITS_PER_LONG)
+ continue;
+
+ q->type = tf[i].type;
+ q->info_filter = tf[i].info_filter;
+ q->info_mask = tf[i].info_mask;
+ q->subtype_filter[0] = tf[i].subtype_filter[0];
+ __set_bit(q->type, wfilter->type_filter);
+ q++;
+ }
+
+ kfree(tf);
+set:
+ pipe_lock(pipe);
+ wfilter = rcu_replace_pointer(wqueue->filter, wfilter,
+ lockdep_is_held(&pipe->mutex));
+ pipe_unlock(pipe);
+ if (wfilter)
+ kfree_rcu(wfilter, rcu);
+ return 0;
+
+err_filter:
+ kfree(tf);
+ return ret;
+}
+
+static void __put_watch_queue(struct kref *kref)
+{
+ struct watch_queue *wqueue =
+ container_of(kref, struct watch_queue, usage);
+ struct watch_filter *wfilter;
+ int i;
+
+ for (i = 0; i < wqueue->nr_pages; i++)
+ __free_page(wqueue->notes[i]);
+
+ wfilter = rcu_access_pointer(wqueue->filter);
+ if (wfilter)
+ kfree_rcu(wfilter, rcu);
+ kfree_rcu(wqueue, rcu);
+}
+
+/**
+ * put_watch_queue - Dispose of a ref on a watchqueue.
+ * @wqueue: The watch queue to unref.
+ */
+void put_watch_queue(struct watch_queue *wqueue)
+{
+ kref_put(&wqueue->usage, __put_watch_queue);
+}
+EXPORT_SYMBOL(put_watch_queue);
+
+static void free_watch(struct rcu_head *rcu)
+{
+ struct watch *watch = container_of(rcu, struct watch, rcu);
+
+ put_watch_queue(rcu_access_pointer(watch->queue));
+ put_cred(watch->cred);
+}
+
+static void __put_watch(struct kref *kref)
+{
+ struct watch *watch = container_of(kref, struct watch, usage);
+
+ call_rcu(&watch->rcu, free_watch);
+}
+
+/*
+ * Discard a watch.
+ */
+static void put_watch(struct watch *watch)
+{
+ kref_put(&watch->usage, __put_watch);
+}
+
+/**
+ * init_watch_queue - Initialise a watch
+ * @watch: The watch to initialise.
+ * @wqueue: The queue to assign.
+ *
+ * Initialise a watch and set the watch queue.
+ */
+void init_watch(struct watch *watch, struct watch_queue *wqueue)
+{
+ kref_init(&watch->usage);
+ INIT_HLIST_NODE(&watch->list_node);
+ INIT_HLIST_NODE(&watch->queue_node);
+ rcu_assign_pointer(watch->queue, wqueue);
+}
+
+/**
+ * add_watch_to_object - Add a watch on an object to a watch list
+ * @watch: The watch to add
+ * @wlist: The watch list to add to
+ *
+ * @watch->queue must have been set to point to the queue to post notifications
+ * to and the watch list of the object to be watched. @watch->cred must also
+ * have been set to the appropriate credentials and a ref taken on them.
+ *
+ * The caller must pin the queue and the list both and must hold the list
+ * locked against racing watch additions/removals.
+ */
+int add_watch_to_object(struct watch *watch, struct watch_list *wlist)
+{
+ struct watch_queue *wqueue = rcu_access_pointer(watch->queue);
+ struct watch *w;
+
+ hlist_for_each_entry(w, &wlist->watchers, list_node) {
+ struct watch_queue *wq = rcu_access_pointer(w->queue);
+ if (wqueue == wq && watch->id == w->id)
+ return -EBUSY;
+ }
+
+ watch->cred = get_current_cred();
+ rcu_assign_pointer(watch->watch_list, wlist);
+
+ spin_lock_bh(&wqueue->lock);
+ kref_get(&wqueue->usage);
+ kref_get(&watch->usage);
+ hlist_add_head(&watch->queue_node, &wqueue->watches);
+ spin_unlock_bh(&wqueue->lock);
+
+ hlist_add_head(&watch->list_node, &wlist->watchers);
+ return 0;
+}
+EXPORT_SYMBOL(add_watch_to_object);
+
+/**
+ * remove_watch_from_object - Remove a watch or all watches from an object.
+ * @wlist: The watch list to remove from
+ * @wq: The watch queue of interest (ignored if @all is true)
+ * @id: The ID of the watch to remove (ignored if @all is true)
+ * @all: True to remove all objects
+ *
+ * Remove a specific watch or all watches from an object. A notification is
+ * sent to the watcher to tell them that this happened.
+ */
+int remove_watch_from_object(struct watch_list *wlist, struct watch_queue *wq,
+ u64 id, bool all)
+{
+ struct watch_notification_removal n;
+ struct watch_queue *wqueue;
+ struct watch *watch;
+ int ret = -EBADSLT;
+
+ rcu_read_lock();
+
+again:
+ spin_lock(&wlist->lock);
+ hlist_for_each_entry(watch, &wlist->watchers, list_node) {
+ if (all ||
+ (watch->id == id && rcu_access_pointer(watch->queue) == wq))
+ goto found;
+ }
+ spin_unlock(&wlist->lock);
+ goto out;
+
+found:
+ ret = 0;
+ hlist_del_init_rcu(&watch->list_node);
+ rcu_assign_pointer(watch->watch_list, NULL);
+ spin_unlock(&wlist->lock);
+
+ /* We now own the reference on watch that used to belong to wlist. */
+
+ n.watch.type = WATCH_TYPE_META;
+ n.watch.subtype = WATCH_META_REMOVAL_NOTIFICATION;
+ n.watch.info = watch->info_id | watch_sizeof(n.watch);
+ n.id = id;
+ if (id != 0)
+ n.watch.info = watch->info_id | watch_sizeof(n);
+
+ wqueue = rcu_dereference(watch->queue);
+
+ /* We don't need the watch list lock for the next bit as RCU is
+ * protecting *wqueue from deallocation.
+ */
+ if (wqueue) {
+ post_one_notification(wqueue, &n.watch);
+
+ spin_lock_bh(&wqueue->lock);
+
+ if (!hlist_unhashed(&watch->queue_node)) {
+ hlist_del_init_rcu(&watch->queue_node);
+ put_watch(watch);
+ }
+
+ spin_unlock_bh(&wqueue->lock);
+ }
+
+ if (wlist->release_watch) {
+ void (*release_watch)(struct watch *);
+
+ release_watch = wlist->release_watch;
+ rcu_read_unlock();
+ (*release_watch)(watch);
+ rcu_read_lock();
+ }
+ put_watch(watch);
+
+ if (all && !hlist_empty(&wlist->watchers))
+ goto again;
+out:
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL(remove_watch_from_object);
+
+/*
+ * Remove all the watches that are contributory to a queue. This has the
+ * potential to race with removal of the watches by the destruction of the
+ * objects being watched or with the distribution of notifications.
+ */
+void watch_queue_clear(struct watch_queue *wqueue)
+{
+ struct watch_list *wlist;
+ struct watch *watch;
+ bool release;
+
+ rcu_read_lock();
+ spin_lock_bh(&wqueue->lock);
+
+ /* Prevent new additions and prevent notifications from happening */
+ wqueue->defunct = true;
+
+ while (!hlist_empty(&wqueue->watches)) {
+ watch = hlist_entry(wqueue->watches.first, struct watch, queue_node);
+ hlist_del_init_rcu(&watch->queue_node);
+ /* We now own a ref on the watch. */
+ spin_unlock_bh(&wqueue->lock);
+
+ /* We can't do the next bit under the queue lock as we need to
+ * get the list lock - which would cause a deadlock if someone
+ * was removing from the opposite direction at the same time or
+ * posting a notification.
+ */
+ wlist = rcu_dereference(watch->watch_list);
+ if (wlist) {
+ void (*release_watch)(struct watch *);
+
+ spin_lock(&wlist->lock);
+
+ release = !hlist_unhashed(&watch->list_node);
+ if (release) {
+ hlist_del_init_rcu(&watch->list_node);
+ rcu_assign_pointer(watch->watch_list, NULL);
+
+ /* We now own a second ref on the watch. */
+ }
+
+ release_watch = wlist->release_watch;
+ spin_unlock(&wlist->lock);
+
+ if (release) {
+ if (release_watch) {
+ rcu_read_unlock();
+ /* This might need to call dput(), so
+ * we have to drop all the locks.
+ */
+ (*release_watch)(watch);
+ rcu_read_lock();
+ }
+ put_watch(watch);
+ }
+ }
+
+ put_watch(watch);
+ spin_lock_bh(&wqueue->lock);
+ }
+
+ spin_unlock_bh(&wqueue->lock);
+ rcu_read_unlock();
+}
+
+/**
+ * get_watch_queue - Get a watch queue from its file descriptor.
+ * @fd: The fd to query.
+ */
+struct watch_queue *get_watch_queue(int fd)
+{
+ struct pipe_inode_info *pipe;
+ struct watch_queue *wqueue = ERR_PTR(-EINVAL);
+ struct fd f;
+
+ f = fdget(fd);
+ if (f.file) {
+ pipe = get_pipe_info(f.file, false);
+ if (pipe && pipe->watch_queue) {
+ wqueue = pipe->watch_queue;
+ kref_get(&wqueue->usage);
+ }
+ fdput(f);
+ }
+
+ return wqueue;
+}
+EXPORT_SYMBOL(get_watch_queue);
+
+/*
+ * Initialise a watch queue
+ */
+int watch_queue_init(struct pipe_inode_info *pipe)
+{
+ struct watch_queue *wqueue;
+
+ wqueue = kzalloc(sizeof(*wqueue), GFP_KERNEL);
+ if (!wqueue)
+ return -ENOMEM;
+
+ wqueue->pipe = pipe;
+ kref_init(&wqueue->usage);
+ spin_lock_init(&wqueue->lock);
+ INIT_HLIST_HEAD(&wqueue->watches);
+
+ pipe->watch_queue = wqueue;
+ return 0;
+}
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index b6b1f54a7837..5abb5b22ad13 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -50,6 +50,11 @@ struct cpumask watchdog_cpumask __read_mostly;
unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
#ifdef CONFIG_HARDLOCKUP_DETECTOR
+
+# ifdef CONFIG_SMP
+int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
+# endif /* CONFIG_SMP */
+
/*
* Should we panic when a soft-lockup or hard-lockup occurs:
*/
@@ -82,16 +87,6 @@ static int __init hardlockup_panic_setup(char *str)
}
__setup("nmi_watchdog=", hardlockup_panic_setup);
-# ifdef CONFIG_SMP
-int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
-
-static int __init hardlockup_all_cpu_backtrace_setup(char *str)
-{
- sysctl_hardlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0);
- return 1;
-}
-__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
-# endif /* CONFIG_SMP */
#endif /* CONFIG_HARDLOCKUP_DETECTOR */
/*
@@ -163,6 +158,10 @@ static void lockup_detector_update_enable(void)
#define SOFTLOCKUP_RESET ULONG_MAX
+#ifdef CONFIG_SMP
+int __read_mostly sysctl_softlockup_all_cpu_backtrace;
+#endif
+
/* Global variables, exported for sysctl */
unsigned int __read_mostly softlockup_panic =
CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
@@ -178,13 +177,6 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
static unsigned long soft_lockup_nmi_warn;
-static int __init softlockup_panic_setup(char *str)
-{
- softlockup_panic = simple_strtoul(str, NULL, 0);
- return 1;
-}
-__setup("softlockup_panic=", softlockup_panic_setup);
-
static int __init nowatchdog_setup(char *str)
{
watchdog_user_enabled = 0;
@@ -206,17 +198,6 @@ static int __init watchdog_thresh_setup(char *str)
}
__setup("watchdog_thresh=", watchdog_thresh_setup);
-#ifdef CONFIG_SMP
-int __read_mostly sysctl_softlockup_all_cpu_backtrace;
-
-static int __init softlockup_all_cpu_backtrace_setup(char *str)
-{
- sysctl_softlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0);
- return 1;
-}
-__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
-#endif
-
static void __lockup_detector_cleanup(void);
/*
@@ -661,7 +642,7 @@ static void proc_watchdog_update(void)
* proc_soft_watchdog | soft_watchdog_user_enabled | SOFT_WATCHDOG_ENABLED
*/
static int proc_watchdog_common(int which, struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int err, old, *param = table->data;
@@ -688,7 +669,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
* /proc/sys/kernel/watchdog
*/
int proc_watchdog(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
return proc_watchdog_common(NMI_WATCHDOG_ENABLED|SOFT_WATCHDOG_ENABLED,
table, write, buffer, lenp, ppos);
@@ -698,7 +679,7 @@ int proc_watchdog(struct ctl_table *table, int write,
* /proc/sys/kernel/nmi_watchdog
*/
int proc_nmi_watchdog(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
if (!nmi_watchdog_available && write)
return -ENOTSUPP;
@@ -710,7 +691,7 @@ int proc_nmi_watchdog(struct ctl_table *table, int write,
* /proc/sys/kernel/soft_watchdog
*/
int proc_soft_watchdog(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
return proc_watchdog_common(SOFT_WATCHDOG_ENABLED,
table, write, buffer, lenp, ppos);
@@ -720,7 +701,7 @@ int proc_soft_watchdog(struct ctl_table *table, int write,
* /proc/sys/kernel/watchdog_thresh
*/
int proc_watchdog_thresh(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int err, old;
@@ -743,7 +724,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
* been brought online, if desired.
*/
int proc_watchdog_cpumask(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int err;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 891ccad5f271..c41c3c17b86a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -145,7 +145,7 @@ enum {
/* struct worker is defined in workqueue_internal.h */
struct worker_pool {
- spinlock_t lock; /* the pool lock */
+ raw_spinlock_t lock; /* the pool lock */
int cpu; /* I: the associated cpu */
int node; /* I: the associated node ID */
int id; /* I: pool ID */
@@ -300,8 +300,9 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;
static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */
static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */
-static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */
-static DECLARE_WAIT_QUEUE_HEAD(wq_manager_wait); /* wait for manager to go away */
+static DEFINE_RAW_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */
+/* wait for manager to go away */
+static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait);
static LIST_HEAD(workqueues); /* PR: list of all workqueues */
static bool workqueue_freezing; /* PL: have wqs started freezing? */
@@ -826,7 +827,7 @@ static struct worker *first_idle_worker(struct worker_pool *pool)
* Wake up the first idle worker of @pool.
*
* CONTEXT:
- * spin_lock_irq(pool->lock).
+ * raw_spin_lock_irq(pool->lock).
*/
static void wake_up_worker(struct worker_pool *pool)
{
@@ -881,7 +882,7 @@ void wq_worker_sleeping(struct task_struct *task)
return;
worker->sleeping = 1;
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
/*
* The counterpart of the following dec_and_test, implied mb,
@@ -900,7 +901,7 @@ void wq_worker_sleeping(struct task_struct *task)
if (next)
wake_up_process(next->task);
}
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
}
/**
@@ -911,7 +912,7 @@ void wq_worker_sleeping(struct task_struct *task)
* the scheduler to get a worker's last known identity.
*
* CONTEXT:
- * spin_lock_irq(rq->lock)
+ * raw_spin_lock_irq(rq->lock)
*
* This function is called during schedule() when a kworker is going
* to sleep. It's used by psi to identify aggregation workers during
@@ -942,7 +943,7 @@ work_func_t wq_worker_last_func(struct task_struct *task)
* Set @flags in @worker->flags and adjust nr_running accordingly.
*
* CONTEXT:
- * spin_lock_irq(pool->lock)
+ * raw_spin_lock_irq(pool->lock)
*/
static inline void worker_set_flags(struct worker *worker, unsigned int flags)
{
@@ -967,7 +968,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags)
* Clear @flags in @worker->flags and adjust nr_running accordingly.
*
* CONTEXT:
- * spin_lock_irq(pool->lock)
+ * raw_spin_lock_irq(pool->lock)
*/
static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
{
@@ -1015,7 +1016,7 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
* actually occurs, it should be easy to locate the culprit work function.
*
* CONTEXT:
- * spin_lock_irq(pool->lock).
+ * raw_spin_lock_irq(pool->lock).
*
* Return:
* Pointer to worker which is executing @work if found, %NULL
@@ -1050,7 +1051,7 @@ static struct worker *find_worker_executing_work(struct worker_pool *pool,
* nested inside outer list_for_each_entry_safe().
*
* CONTEXT:
- * spin_lock_irq(pool->lock).
+ * raw_spin_lock_irq(pool->lock).
*/
static void move_linked_works(struct work_struct *work, struct list_head *head,
struct work_struct **nextp)
@@ -1128,9 +1129,9 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq)
* As both pwqs and pools are RCU protected, the
* following lock operations are safe.
*/
- spin_lock_irq(&pwq->pool->lock);
+ raw_spin_lock_irq(&pwq->pool->lock);
put_pwq(pwq);
- spin_unlock_irq(&pwq->pool->lock);
+ raw_spin_unlock_irq(&pwq->pool->lock);
}
}
@@ -1163,7 +1164,7 @@ static void pwq_activate_first_delayed(struct pool_workqueue *pwq)
* decrement nr_in_flight of its pwq and handle workqueue flushing.
*
* CONTEXT:
- * spin_lock_irq(pool->lock).
+ * raw_spin_lock_irq(pool->lock).
*/
static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
{
@@ -1262,7 +1263,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
if (!pool)
goto fail;
- spin_lock(&pool->lock);
+ raw_spin_lock(&pool->lock);
/*
* work->data is guaranteed to point to pwq only while the work
* item is queued on pwq->wq, and both updating work->data to point
@@ -1291,11 +1292,11 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
/* work->data points to pwq iff queued, point to pool */
set_work_pool_and_keep_pending(work, pool->id);
- spin_unlock(&pool->lock);
+ raw_spin_unlock(&pool->lock);
rcu_read_unlock();
return 1;
}
- spin_unlock(&pool->lock);
+ raw_spin_unlock(&pool->lock);
fail:
rcu_read_unlock();
local_irq_restore(*flags);
@@ -1316,7 +1317,7 @@ fail:
* work_struct flags.
*
* CONTEXT:
- * spin_lock_irq(pool->lock).
+ * raw_spin_lock_irq(pool->lock).
*/
static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
struct list_head *head, unsigned int extra_flags)
@@ -1433,7 +1434,7 @@ retry:
if (last_pool && last_pool != pwq->pool) {
struct worker *worker;
- spin_lock(&last_pool->lock);
+ raw_spin_lock(&last_pool->lock);
worker = find_worker_executing_work(last_pool, work);
@@ -1441,11 +1442,11 @@ retry:
pwq = worker->current_pwq;
} else {
/* meh... not running there, queue here */
- spin_unlock(&last_pool->lock);
- spin_lock(&pwq->pool->lock);
+ raw_spin_unlock(&last_pool->lock);
+ raw_spin_lock(&pwq->pool->lock);
}
} else {
- spin_lock(&pwq->pool->lock);
+ raw_spin_lock(&pwq->pool->lock);
}
/*
@@ -1458,7 +1459,7 @@ retry:
*/
if (unlikely(!pwq->refcnt)) {
if (wq->flags & WQ_UNBOUND) {
- spin_unlock(&pwq->pool->lock);
+ raw_spin_unlock(&pwq->pool->lock);
cpu_relax();
goto retry;
}
@@ -1490,7 +1491,7 @@ retry:
insert_work(pwq, work, worklist, work_flags);
out:
- spin_unlock(&pwq->pool->lock);
+ raw_spin_unlock(&pwq->pool->lock);
rcu_read_unlock();
}
@@ -1759,7 +1760,7 @@ EXPORT_SYMBOL(queue_rcu_work);
* necessary.
*
* LOCKING:
- * spin_lock_irq(pool->lock).
+ * raw_spin_lock_irq(pool->lock).
*/
static void worker_enter_idle(struct worker *worker)
{
@@ -1799,7 +1800,7 @@ static void worker_enter_idle(struct worker *worker)
* @worker is leaving idle state. Update stats.
*
* LOCKING:
- * spin_lock_irq(pool->lock).
+ * raw_spin_lock_irq(pool->lock).
*/
static void worker_leave_idle(struct worker *worker)
{
@@ -1937,11 +1938,11 @@ static struct worker *create_worker(struct worker_pool *pool)
worker_attach_to_pool(worker, pool);
/* start the newly created worker */
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
worker->pool->nr_workers++;
worker_enter_idle(worker);
wake_up_process(worker->task);
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
return worker;
@@ -1960,7 +1961,7 @@ fail:
* be idle.
*
* CONTEXT:
- * spin_lock_irq(pool->lock).
+ * raw_spin_lock_irq(pool->lock).
*/
static void destroy_worker(struct worker *worker)
{
@@ -1986,7 +1987,7 @@ static void idle_worker_timeout(struct timer_list *t)
{
struct worker_pool *pool = from_timer(pool, t, idle_timer);
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
while (too_many_workers(pool)) {
struct worker *worker;
@@ -2004,7 +2005,7 @@ static void idle_worker_timeout(struct timer_list *t)
destroy_worker(worker);
}
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
}
static void send_mayday(struct work_struct *work)
@@ -2035,8 +2036,8 @@ static void pool_mayday_timeout(struct timer_list *t)
struct worker_pool *pool = from_timer(pool, t, mayday_timer);
struct work_struct *work;
- spin_lock_irq(&pool->lock);
- spin_lock(&wq_mayday_lock); /* for wq->maydays */
+ raw_spin_lock_irq(&pool->lock);
+ raw_spin_lock(&wq_mayday_lock); /* for wq->maydays */
if (need_to_create_worker(pool)) {
/*
@@ -2049,8 +2050,8 @@ static void pool_mayday_timeout(struct timer_list *t)
send_mayday(work);
}
- spin_unlock(&wq_mayday_lock);
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock(&wq_mayday_lock);
+ raw_spin_unlock_irq(&pool->lock);
mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
}
@@ -2069,7 +2070,7 @@ static void pool_mayday_timeout(struct timer_list *t)
* may_start_working() %true.
*
* LOCKING:
- * spin_lock_irq(pool->lock) which may be released and regrabbed
+ * raw_spin_lock_irq(pool->lock) which may be released and regrabbed
* multiple times. Does GFP_KERNEL allocations. Called only from
* manager.
*/
@@ -2078,7 +2079,7 @@ __releases(&pool->lock)
__acquires(&pool->lock)
{
restart:
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
/* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
@@ -2094,7 +2095,7 @@ restart:
}
del_timer_sync(&pool->mayday_timer);
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
/*
* This is necessary even after a new worker was just successfully
* created as @pool->lock was dropped and the new worker might have
@@ -2117,7 +2118,7 @@ restart:
* and may_start_working() is true.
*
* CONTEXT:
- * spin_lock_irq(pool->lock) which may be released and regrabbed
+ * raw_spin_lock_irq(pool->lock) which may be released and regrabbed
* multiple times. Does GFP_KERNEL allocations.
*
* Return:
@@ -2140,7 +2141,7 @@ static bool manage_workers(struct worker *worker)
pool->manager = NULL;
pool->flags &= ~POOL_MANAGER_ACTIVE;
- wake_up(&wq_manager_wait);
+ rcuwait_wake_up(&manager_wait);
return true;
}
@@ -2156,7 +2157,7 @@ static bool manage_workers(struct worker *worker)
* call this function to process a work.
*
* CONTEXT:
- * spin_lock_irq(pool->lock) which is released and regrabbed.
+ * raw_spin_lock_irq(pool->lock) which is released and regrabbed.
*/
static void process_one_work(struct worker *worker, struct work_struct *work)
__releases(&pool->lock)
@@ -2238,7 +2239,7 @@ __acquires(&pool->lock)
*/
set_work_pool_and_clear_pending(work, pool->id);
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
lock_map_acquire(&pwq->wq->lockdep_map);
lock_map_acquire(&lockdep_map);
@@ -2293,7 +2294,7 @@ __acquires(&pool->lock)
*/
cond_resched();
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
/* clear cpu intensive status */
if (unlikely(cpu_intensive))
@@ -2319,7 +2320,7 @@ __acquires(&pool->lock)
* fetches a work from the top and executes it.
*
* CONTEXT:
- * spin_lock_irq(pool->lock) which may be released and regrabbed
+ * raw_spin_lock_irq(pool->lock) which may be released and regrabbed
* multiple times.
*/
static void process_scheduled_works(struct worker *worker)
@@ -2361,11 +2362,11 @@ static int worker_thread(void *__worker)
/* tell the scheduler that this is a workqueue worker */
set_pf_worker(true);
woke_up:
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
/* am I supposed to die? */
if (unlikely(worker->flags & WORKER_DIE)) {
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
WARN_ON_ONCE(!list_empty(&worker->entry));
set_pf_worker(false);
@@ -2431,7 +2432,7 @@ sleep:
*/
worker_enter_idle(worker);
__set_current_state(TASK_IDLE);
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
schedule();
goto woke_up;
}
@@ -2485,7 +2486,7 @@ repeat:
should_stop = kthread_should_stop();
/* see whether any pwq is asking for help */
- spin_lock_irq(&wq_mayday_lock);
+ raw_spin_lock_irq(&wq_mayday_lock);
while (!list_empty(&wq->maydays)) {
struct pool_workqueue *pwq = list_first_entry(&wq->maydays,
@@ -2497,11 +2498,11 @@ repeat:
__set_current_state(TASK_RUNNING);
list_del_init(&pwq->mayday_node);
- spin_unlock_irq(&wq_mayday_lock);
+ raw_spin_unlock_irq(&wq_mayday_lock);
worker_attach_to_pool(rescuer, pool);
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
/*
* Slurp in all works issued via this workqueue and
@@ -2529,8 +2530,8 @@ repeat:
* being used to relieve memory pressure, don't
* incur MAYDAY_INTERVAL delay inbetween.
*/
- if (need_to_create_worker(pool)) {
- spin_lock(&wq_mayday_lock);
+ if (pwq->nr_active && need_to_create_worker(pool)) {
+ raw_spin_lock(&wq_mayday_lock);
/*
* Queue iff we aren't racing destruction
* and somebody else hasn't queued it already.
@@ -2539,7 +2540,7 @@ repeat:
get_pwq(pwq);
list_add_tail(&pwq->mayday_node, &wq->maydays);
}
- spin_unlock(&wq_mayday_lock);
+ raw_spin_unlock(&wq_mayday_lock);
}
}
@@ -2557,14 +2558,14 @@ repeat:
if (need_more_worker(pool))
wake_up_worker(pool);
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
worker_detach_from_pool(rescuer);
- spin_lock_irq(&wq_mayday_lock);
+ raw_spin_lock_irq(&wq_mayday_lock);
}
- spin_unlock_irq(&wq_mayday_lock);
+ raw_spin_unlock_irq(&wq_mayday_lock);
if (should_stop) {
__set_current_state(TASK_RUNNING);
@@ -2644,7 +2645,7 @@ static void wq_barrier_func(struct work_struct *work)
* underneath us, so we can't reliably determine pwq from @target.
*
* CONTEXT:
- * spin_lock_irq(pool->lock).
+ * raw_spin_lock_irq(pool->lock).
*/
static void insert_wq_barrier(struct pool_workqueue *pwq,
struct wq_barrier *barr,
@@ -2731,7 +2732,7 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
for_each_pwq(pwq, wq) {
struct worker_pool *pool = pwq->pool;
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
if (flush_color >= 0) {
WARN_ON_ONCE(pwq->flush_color != -1);
@@ -2748,7 +2749,7 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
pwq->work_color = work_color;
}
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
}
if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush))
@@ -2948,9 +2949,9 @@ reflush:
for_each_pwq(pwq, wq) {
bool drained;
- spin_lock_irq(&pwq->pool->lock);
+ raw_spin_lock_irq(&pwq->pool->lock);
drained = !pwq->nr_active && list_empty(&pwq->delayed_works);
- spin_unlock_irq(&pwq->pool->lock);
+ raw_spin_unlock_irq(&pwq->pool->lock);
if (drained)
continue;
@@ -2986,7 +2987,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
return false;
}
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
/* see the comment in try_to_grab_pending() with the same code */
pwq = get_work_pwq(work);
if (pwq) {
@@ -3002,7 +3003,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
check_flush_dependency(pwq->wq, work);
insert_wq_barrier(pwq, barr, work, worker);
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
/*
* Force a lock recursion deadlock when using flush_work() inside a
@@ -3021,7 +3022,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
rcu_read_unlock();
return true;
already_gone:
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
rcu_read_unlock();
return false;
}
@@ -3414,7 +3415,7 @@ static bool wqattrs_equal(const struct workqueue_attrs *a,
*/
static int init_worker_pool(struct worker_pool *pool)
{
- spin_lock_init(&pool->lock);
+ raw_spin_lock_init(&pool->lock);
pool->id = -1;
pool->cpu = -1;
pool->node = NUMA_NO_NODE;
@@ -3491,7 +3492,6 @@ static void rcu_free_wq(struct rcu_head *rcu)
else
free_workqueue_attrs(wq->unbound_attrs);
- kfree(wq->rescuer);
kfree(wq);
}
@@ -3504,6 +3504,18 @@ static void rcu_free_pool(struct rcu_head *rcu)
kfree(pool);
}
+/* This returns with the lock held on success (pool manager is inactive). */
+static bool wq_manager_inactive(struct worker_pool *pool)
+{
+ raw_spin_lock_irq(&pool->lock);
+
+ if (pool->flags & POOL_MANAGER_ACTIVE) {
+ raw_spin_unlock_irq(&pool->lock);
+ return false;
+ }
+ return true;
+}
+
/**
* put_unbound_pool - put a worker_pool
* @pool: worker_pool to put
@@ -3539,16 +3551,17 @@ static void put_unbound_pool(struct worker_pool *pool)
* Become the manager and destroy all workers. This prevents
* @pool's workers from blocking on attach_mutex. We're the last
* manager and @pool gets freed with the flag set.
+ * Because of how wq_manager_inactive() works, we will hold the
+ * spinlock after a successful wait.
*/
- spin_lock_irq(&pool->lock);
- wait_event_lock_irq(wq_manager_wait,
- !(pool->flags & POOL_MANAGER_ACTIVE), pool->lock);
+ rcuwait_wait_event(&manager_wait, wq_manager_inactive(pool),
+ TASK_UNINTERRUPTIBLE);
pool->flags |= POOL_MANAGER_ACTIVE;
while ((worker = first_idle_worker(pool)))
destroy_worker(worker);
WARN_ON(pool->nr_workers || pool->nr_idle);
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
mutex_lock(&wq_pool_attach_mutex);
if (!list_empty(&pool->workers))
@@ -3704,7 +3717,7 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq)
return;
/* this function can be called during early boot w/ irq disabled */
- spin_lock_irqsave(&pwq->pool->lock, flags);
+ raw_spin_lock_irqsave(&pwq->pool->lock, flags);
/*
* During [un]freezing, the caller is responsible for ensuring that
@@ -3727,7 +3740,7 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq)
pwq->max_active = 0;
}
- spin_unlock_irqrestore(&pwq->pool->lock, flags);
+ raw_spin_unlock_irqrestore(&pwq->pool->lock, flags);
}
/* initialize newly alloced @pwq which is associated with @wq and @pool */
@@ -4129,9 +4142,9 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
use_dfl_pwq:
mutex_lock(&wq->mutex);
- spin_lock_irq(&wq->dfl_pwq->pool->lock);
+ raw_spin_lock_irq(&wq->dfl_pwq->pool->lock);
get_pwq(wq->dfl_pwq);
- spin_unlock_irq(&wq->dfl_pwq->pool->lock);
+ raw_spin_unlock_irq(&wq->dfl_pwq->pool->lock);
old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq);
out_unlock:
mutex_unlock(&wq->mutex);
@@ -4208,8 +4221,8 @@ static int init_rescuer(struct workqueue_struct *wq)
rescuer->rescue_wq = wq;
rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name);
- ret = PTR_ERR_OR_ZERO(rescuer->task);
- if (ret) {
+ if (IS_ERR(rescuer->task)) {
+ ret = PTR_ERR(rescuer->task);
kfree(rescuer);
return ret;
}
@@ -4360,9 +4373,9 @@ void destroy_workqueue(struct workqueue_struct *wq)
struct worker *rescuer = wq->rescuer;
/* this prevents new queueing */
- spin_lock_irq(&wq_mayday_lock);
+ raw_spin_lock_irq(&wq_mayday_lock);
wq->rescuer = NULL;
- spin_unlock_irq(&wq_mayday_lock);
+ raw_spin_unlock_irq(&wq_mayday_lock);
/* rescuer will empty maydays list before exiting */
kthread_stop(rescuer->task);
@@ -4376,27 +4389,25 @@ void destroy_workqueue(struct workqueue_struct *wq)
mutex_lock(&wq_pool_mutex);
mutex_lock(&wq->mutex);
for_each_pwq(pwq, wq) {
- spin_lock_irq(&pwq->pool->lock);
+ raw_spin_lock_irq(&pwq->pool->lock);
if (WARN_ON(pwq_busy(pwq))) {
pr_warn("%s: %s has the following busy pwq\n",
__func__, wq->name);
show_pwq(pwq);
- spin_unlock_irq(&pwq->pool->lock);
+ raw_spin_unlock_irq(&pwq->pool->lock);
mutex_unlock(&wq->mutex);
mutex_unlock(&wq_pool_mutex);
show_workqueue_state();
return;
}
- spin_unlock_irq(&pwq->pool->lock);
+ raw_spin_unlock_irq(&pwq->pool->lock);
}
mutex_unlock(&wq->mutex);
- mutex_unlock(&wq_pool_mutex);
/*
* wq list is used to freeze wq, remove from list after
* flushing is complete in case freeze races us.
*/
- mutex_lock(&wq_pool_mutex);
list_del_rcu(&wq->list);
mutex_unlock(&wq_pool_mutex);
@@ -4558,10 +4569,10 @@ unsigned int work_busy(struct work_struct *work)
rcu_read_lock();
pool = get_work_pool(work);
if (pool) {
- spin_lock_irqsave(&pool->lock, flags);
+ raw_spin_lock_irqsave(&pool->lock, flags);
if (find_worker_executing_work(pool, work))
ret |= WORK_BUSY_RUNNING;
- spin_unlock_irqrestore(&pool->lock, flags);
+ raw_spin_unlock_irqrestore(&pool->lock, flags);
}
rcu_read_unlock();
@@ -4627,11 +4638,11 @@ void print_worker_info(const char *log_lvl, struct task_struct *task)
* Carefully copy the associated workqueue's workfn, name and desc.
* Keep the original last '\0' in case the original is garbage.
*/
- probe_kernel_read(&fn, &worker->current_func, sizeof(fn));
- probe_kernel_read(&pwq, &worker->current_pwq, sizeof(pwq));
- probe_kernel_read(&wq, &pwq->wq, sizeof(wq));
- probe_kernel_read(name, wq->name, sizeof(name) - 1);
- probe_kernel_read(desc, worker->desc, sizeof(desc) - 1);
+ copy_from_kernel_nofault(&fn, &worker->current_func, sizeof(fn));
+ copy_from_kernel_nofault(&pwq, &worker->current_pwq, sizeof(pwq));
+ copy_from_kernel_nofault(&wq, &pwq->wq, sizeof(wq));
+ copy_from_kernel_nofault(name, wq->name, sizeof(name) - 1);
+ copy_from_kernel_nofault(desc, worker->desc, sizeof(desc) - 1);
if (fn || name[0] || desc[0]) {
printk("%sWorkqueue: %s %ps", log_lvl, name, fn);
@@ -4768,10 +4779,10 @@ void show_workqueue_state(void)
pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags);
for_each_pwq(pwq, wq) {
- spin_lock_irqsave(&pwq->pool->lock, flags);
+ raw_spin_lock_irqsave(&pwq->pool->lock, flags);
if (pwq->nr_active || !list_empty(&pwq->delayed_works))
show_pwq(pwq);
- spin_unlock_irqrestore(&pwq->pool->lock, flags);
+ raw_spin_unlock_irqrestore(&pwq->pool->lock, flags);
/*
* We could be printing a lot from atomic context, e.g.
* sysrq-t -> show_workqueue_state(). Avoid triggering
@@ -4785,7 +4796,7 @@ void show_workqueue_state(void)
struct worker *worker;
bool first = true;
- spin_lock_irqsave(&pool->lock, flags);
+ raw_spin_lock_irqsave(&pool->lock, flags);
if (pool->nr_workers == pool->nr_idle)
goto next_pool;
@@ -4804,7 +4815,7 @@ void show_workqueue_state(void)
}
pr_cont("\n");
next_pool:
- spin_unlock_irqrestore(&pool->lock, flags);
+ raw_spin_unlock_irqrestore(&pool->lock, flags);
/*
* We could be printing a lot from atomic context, e.g.
* sysrq-t -> show_workqueue_state(). Avoid triggering
@@ -4834,7 +4845,7 @@ void wq_worker_comm(char *buf, size_t size, struct task_struct *task)
struct worker_pool *pool = worker->pool;
if (pool) {
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
/*
* ->desc tracks information (wq name or
* set_worker_desc()) for the latest execution. If
@@ -4848,7 +4859,7 @@ void wq_worker_comm(char *buf, size_t size, struct task_struct *task)
scnprintf(buf + off, size - off, "-%s",
worker->desc);
}
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
}
}
@@ -4879,7 +4890,7 @@ static void unbind_workers(int cpu)
for_each_cpu_worker_pool(pool, cpu) {
mutex_lock(&wq_pool_attach_mutex);
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
/*
* We've blocked all attach/detach operations. Make all workers
@@ -4893,7 +4904,7 @@ static void unbind_workers(int cpu)
pool->flags |= POOL_DISASSOCIATED;
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
mutex_unlock(&wq_pool_attach_mutex);
/*
@@ -4919,9 +4930,9 @@ static void unbind_workers(int cpu)
* worker blocking could lead to lengthy stalls. Kick off
* unbound chain execution of currently pending work items.
*/
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
wake_up_worker(pool);
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
}
}
@@ -4948,7 +4959,7 @@ static void rebind_workers(struct worker_pool *pool)
WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
pool->attrs->cpumask) < 0);
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
pool->flags &= ~POOL_DISASSOCIATED;
@@ -4987,7 +4998,7 @@ static void rebind_workers(struct worker_pool *pool)
WRITE_ONCE(worker->flags, worker_flags);
}
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
}
/**
@@ -5906,7 +5917,7 @@ void __init workqueue_init_early(void)
int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ;
int i, cpu;
- WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
+ BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(hk_flags));