summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile3
-rw-r--r--kernel/audit.c66
-rw-r--r--kernel/audit.h29
-rw-r--r--kernel/audit_watch.c14
-rw-r--r--kernel/auditsc.c32
-rw-r--r--kernel/bpf/Makefile8
-rw-r--r--kernel/bpf/arraymap.c88
-rw-r--r--kernel/bpf/bpf_lru_list.h3
-rw-r--r--kernel/bpf/cgroup.c37
-rw-r--r--kernel/bpf/core.c110
-rw-r--r--kernel/bpf/devmap.c409
-rw-r--r--kernel/bpf/hashtab.c111
-rw-r--r--kernel/bpf/inode.c16
-rw-r--r--kernel/bpf/lpm_trie.c9
-rw-r--r--kernel/bpf/map_in_map.c5
-rw-r--r--kernel/bpf/map_in_map.h1
-rw-r--r--kernel/bpf/sockmap.c873
-rw-r--r--kernel/bpf/stackmap.c8
-rw-r--r--kernel/bpf/syscall.c585
-rw-r--r--kernel/bpf/tnum.c180
-rw-r--r--kernel/bpf/verifier.c2440
-rw-r--r--kernel/cgroup/Makefile1
-rw-r--r--kernel/cgroup/cgroup-internal.h17
-rw-r--r--kernel/cgroup/cgroup-v1.c230
-rw-r--r--kernel/cgroup/cgroup.c1098
-rw-r--r--kernel/cgroup/cpuset.c96
-rw-r--r--kernel/cgroup/debug.c382
-rw-r--r--kernel/cgroup/freezer.c6
-rw-r--r--kernel/cgroup/pids.c1
-rw-r--r--kernel/compat.c398
-rw-r--r--kernel/configs/android-base.config1
-rw-r--r--kernel/cpu.c51
-rw-r--r--kernel/cpu_pm.c50
-rw-r--r--kernel/crash_core.c44
-rw-r--r--kernel/events/core.c281
-rw-r--r--kernel/events/internal.h6
-rw-r--r--kernel/events/ring_buffer.c31
-rw-r--r--kernel/events/uprobes.c2
-rw-r--r--kernel/exit.c334
-rw-r--r--kernel/extable.c5
-rw-r--r--kernel/fork.c81
-rw-r--r--kernel/futex.c70
-rw-r--r--kernel/groups.c35
-rw-r--r--kernel/irq/Kconfig9
-rw-r--r--kernel/irq/Makefile1
-rw-r--r--kernel/irq/affinity.c13
-rw-r--r--kernel/irq/chip.c131
-rw-r--r--kernel/irq/cpuhotplug.c9
-rw-r--r--kernel/irq/debugfs.c50
-rw-r--r--kernel/irq/internals.h16
-rw-r--r--kernel/irq/ipi.c4
-rw-r--r--kernel/irq/irq_sim.c164
-rw-r--r--kernel/irq/irqdesc.c1
-rw-r--r--kernel/irq/irqdomain.c260
-rw-r--r--kernel/irq/manage.c125
-rw-r--r--kernel/irq/pm.c2
-rw-r--r--kernel/irq/proc.c8
-rw-r--r--kernel/jump_label.c104
-rw-r--r--kernel/kallsyms.c10
-rw-r--r--kernel/kcmp.c57
-rw-r--r--kernel/kexec.c8
-rw-r--r--kernel/kexec_core.c51
-rw-r--r--kernel/kexec_file.c29
-rw-r--r--kernel/kexec_internal.h2
-rw-r--r--kernel/kmod.c75
-rw-r--r--kernel/kprobes.c42
-rw-r--r--kernel/ksysfs.c4
-rw-r--r--kernel/kthread.c1
-rw-r--r--kernel/locking/lockdep.c1008
-rw-r--r--kernel/locking/lockdep_internals.h2
-rw-r--r--kernel/locking/lockdep_proc.c4
-rw-r--r--kernel/locking/lockdep_states.h1
-rw-r--r--kernel/locking/osq_lock.c13
-rw-r--r--kernel/locking/qrwlock.c1
-rw-r--r--kernel/locking/qspinlock.c118
-rw-r--r--kernel/locking/qspinlock_paravirt.h27
-rw-r--r--kernel/locking/rtmutex.c1
-rw-r--r--kernel/locking/rtmutex_common.h29
-rw-r--r--kernel/locking/rwsem-spinlock.c41
-rw-r--r--kernel/locking/rwsem-xadd.c33
-rw-r--r--kernel/membarrier.c70
-rw-r--r--kernel/memremap.c78
-rw-r--r--kernel/module.c97
-rw-r--r--kernel/panic.c12
-rw-r--r--kernel/pid.c19
-rw-r--r--kernel/power/hibernate.c29
-rw-r--r--kernel/power/main.c66
-rw-r--r--kernel/power/power.h5
-rw-r--r--kernel/power/snapshot.c6
-rw-r--r--kernel/power/suspend.c184
-rw-r--r--kernel/power/suspend_test.c4
-rw-r--r--kernel/power/swap.c5
-rw-r--r--kernel/printk/printk.c3
-rw-r--r--kernel/rcu/Kconfig3
-rw-r--r--kernel/rcu/rcu.h128
-rw-r--r--kernel/rcu/rcu_segcblist.c108
-rw-r--r--kernel/rcu/rcu_segcblist.h28
-rw-r--r--kernel/rcu/rcuperf.c17
-rw-r--r--kernel/rcu/rcutorture.c83
-rw-r--r--kernel/rcu/srcutiny.c8
-rw-r--r--kernel/rcu/srcutree.c50
-rw-r--r--kernel/rcu/tiny.c2
-rw-r--r--kernel/rcu/tiny_plugin.h47
-rw-r--r--kernel/rcu/tree.c213
-rw-r--r--kernel/rcu/tree.h15
-rw-r--r--kernel/rcu/tree_exp.h2
-rw-r--r--kernel/rcu/tree_plugin.h238
-rw-r--r--kernel/rcu/update.c18
-rw-r--r--kernel/sched/Makefile1
-rw-r--r--kernel/sched/autogroup.c3
-rw-r--r--kernel/sched/completion.c30
-rw-r--r--kernel/sched/core.c66
-rw-r--r--kernel/sched/cpudeadline.c27
-rw-r--r--kernel/sched/cpufreq_schedutil.c103
-rw-r--r--kernel/sched/cpupri.c2
-rw-r--r--kernel/sched/cputime.c180
-rw-r--r--kernel/sched/deadline.c49
-rw-r--r--kernel/sched/debug.c83
-rw-r--r--kernel/sched/fair.c495
-rw-r--r--kernel/sched/idle.c8
-rw-r--r--kernel/sched/membarrier.c152
-rw-r--r--kernel/sched/rt.c2
-rw-r--r--kernel/sched/sched.h16
-rw-r--r--kernel/sched/swait.c6
-rw-r--r--kernel/sched/topology.c39
-rw-r--r--kernel/sched/wait.c7
-rw-r--r--kernel/seccomp.c16
-rw-r--r--kernel/signal.c168
-rw-r--r--kernel/smp.c32
-rw-r--r--kernel/sys.c122
-rw-r--r--kernel/sysctl.c335
-rw-r--r--kernel/sysctl_binary.c2
-rw-r--r--kernel/task_work.c8
-rw-r--r--kernel/time/alarmtimer.c21
-rw-r--r--kernel/time/hrtimer.c30
-rw-r--r--kernel/time/posix-cpu-timers.c22
-rw-r--r--kernel/time/posix-stubs.c96
-rw-r--r--kernel/time/posix-timers.c127
-rw-r--r--kernel/time/time.c58
-rw-r--r--kernel/time/timekeeping.c6
-rw-r--r--kernel/time/timekeeping_debug.c5
-rw-r--r--kernel/time/timer.c52
-rw-r--r--kernel/torture.c2
-rw-r--r--kernel/trace/Kconfig22
-rw-r--r--kernel/trace/blktrace.c261
-rw-r--r--kernel/trace/bpf_trace.c100
-rw-r--r--kernel/trace/ftrace.c415
-rw-r--r--kernel/trace/ring_buffer.c24
-rw-r--r--kernel/trace/ring_buffer_benchmark.c2
-rw-r--r--kernel/trace/trace.c489
-rw-r--r--kernel/trace/trace.h36
-rw-r--r--kernel/trace/trace_event_perf.c4
-rw-r--r--kernel/trace/trace_events.c66
-rw-r--r--kernel/trace/trace_events_filter.c4
-rw-r--r--kernel/trace/trace_kprobe.c15
-rw-r--r--kernel/trace/trace_output.c27
-rw-r--r--kernel/trace/trace_sched_switch.c72
-rw-r--r--kernel/trace/trace_stack.c6
-rw-r--r--kernel/trace/trace_syscalls.c57
-rw-r--r--kernel/trace/trace_uprobe.c2
-rw-r--r--kernel/trace/tracing_map.c11
-rw-r--r--kernel/up.c2
-rw-r--r--kernel/watchdog.c290
-rw-r--r--kernel/watchdog_hld.c96
-rw-r--r--kernel/workqueue.c89
165 files changed, 11906 insertions, 4794 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 72aa080f91f0..9c323a6daa46 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -82,7 +82,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_KGDB) += debug/
obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
-obj-$(CONFIG_HARDLOCKUP_DETECTOR) += watchdog_hld.o
+obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_hld.o
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
@@ -108,7 +108,6 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_JUMP_LABEL) += jump_label.o
obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
obj-$(CONFIG_TORTURE_TEST) += torture.o
-obj-$(CONFIG_MEMBARRIER) += membarrier.o
obj-$(CONFIG_HAS_IOMEM) += memremap.o
diff --git a/kernel/audit.c b/kernel/audit.c
index 4b7d49868ce1..be1c28fd4d57 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -575,12 +575,16 @@ static void kauditd_retry_skb(struct sk_buff *skb)
/**
* auditd_reset - Disconnect the auditd connection
+ * @ac: auditd connection state
*
* Description:
* Break the auditd/kauditd connection and move all the queued records into the
- * hold queue in case auditd reconnects.
+ * hold queue in case auditd reconnects. It is important to note that the @ac
+ * pointer should never be dereferenced inside this function as it may be NULL
+ * or invalid, you can only compare the memory address! If @ac is NULL then
+ * the connection will always be reset.
*/
-static void auditd_reset(void)
+static void auditd_reset(const struct auditd_connection *ac)
{
unsigned long flags;
struct sk_buff *skb;
@@ -590,17 +594,21 @@ static void auditd_reset(void)
spin_lock_irqsave(&auditd_conn_lock, flags);
ac_old = rcu_dereference_protected(auditd_conn,
lockdep_is_held(&auditd_conn_lock));
+ if (ac && ac != ac_old) {
+ /* someone already registered a new auditd connection */
+ spin_unlock_irqrestore(&auditd_conn_lock, flags);
+ return;
+ }
rcu_assign_pointer(auditd_conn, NULL);
spin_unlock_irqrestore(&auditd_conn_lock, flags);
if (ac_old)
call_rcu(&ac_old->rcu, auditd_conn_free);
- /* flush all of the main and retry queues to the hold queue */
+ /* flush the retry queue to the hold queue, but don't touch the main
+ * queue since we need to process that normally for multicast */
while ((skb = skb_dequeue(&audit_retry_queue)))
kauditd_hold_skb(skb);
- while ((skb = skb_dequeue(&audit_queue)))
- kauditd_hold_skb(skb);
}
/**
@@ -633,6 +641,7 @@ static int auditd_send_unicast_skb(struct sk_buff *skb)
ac = rcu_dereference(auditd_conn);
if (!ac) {
rcu_read_unlock();
+ kfree_skb(skb);
rc = -ECONNREFUSED;
goto err;
}
@@ -649,8 +658,8 @@ static int auditd_send_unicast_skb(struct sk_buff *skb)
return rc;
err:
- if (rc == -ECONNREFUSED)
- auditd_reset();
+ if (ac && rc == -ECONNREFUSED)
+ auditd_reset(ac);
return rc;
}
@@ -795,9 +804,9 @@ static int kauditd_thread(void *dummy)
rc = kauditd_send_queue(sk, portid,
&audit_hold_queue, UNICAST_RETRIES,
NULL, kauditd_rehold_skb);
- if (rc < 0) {
+ if (ac && rc < 0) {
sk = NULL;
- auditd_reset();
+ auditd_reset(ac);
goto main_queue;
}
@@ -805,9 +814,9 @@ static int kauditd_thread(void *dummy)
rc = kauditd_send_queue(sk, portid,
&audit_retry_queue, UNICAST_RETRIES,
NULL, kauditd_hold_skb);
- if (rc < 0) {
+ if (ac && rc < 0) {
sk = NULL;
- auditd_reset();
+ auditd_reset(ac);
goto main_queue;
}
@@ -815,12 +824,13 @@ main_queue:
/* process the main queue - do the multicast send and attempt
* unicast, dump failed record sends to the retry queue; if
* sk == NULL due to previous failures we will just do the
- * multicast send and move the record to the retry queue */
+ * multicast send and move the record to the hold queue */
rc = kauditd_send_queue(sk, portid, &audit_queue, 1,
kauditd_send_multicast_skb,
- kauditd_retry_skb);
- if (sk == NULL || rc < 0)
- auditd_reset();
+ (sk ?
+ kauditd_retry_skb : kauditd_hold_skb));
+ if (ac && rc < 0)
+ auditd_reset(ac);
sk = NULL;
/* drop our netns reference, no auditd sends past this line */
@@ -1230,7 +1240,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
auditd_pid, 1);
/* unregister the auditd connection */
- auditd_reset();
+ auditd_reset(NULL);
}
}
if (s.mask & AUDIT_STATUS_RATE_LIMIT) {
@@ -1652,7 +1662,7 @@ static inline void audit_get_stamp(struct audit_context *ctx,
struct timespec64 *t, unsigned int *serial)
{
if (!ctx || !auditsc_get_stamp(ctx, t, serial)) {
- ktime_get_real_ts64(t);
+ *t = current_kernel_time64();
*serial = audit_serial();
}
}
@@ -1823,7 +1833,7 @@ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
}
/**
- * audit_log_hex - convert a buffer to hex and append it to the audit skb
+ * audit_log_n_hex - convert a buffer to hex and append it to the audit skb
* @ab: the audit_buffer
* @buf: buffer to convert to hex
* @len: length of @buf to be converted
@@ -1999,22 +2009,10 @@ void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
{
- kernel_cap_t *perm = &name->fcap.permitted;
- kernel_cap_t *inh = &name->fcap.inheritable;
- int log = 0;
-
- if (!cap_isclear(*perm)) {
- audit_log_cap(ab, "cap_fp", perm);
- log = 1;
- }
- if (!cap_isclear(*inh)) {
- audit_log_cap(ab, "cap_fi", inh);
- log = 1;
- }
-
- if (log)
- audit_log_format(ab, " cap_fe=%d cap_fver=%x",
- name->fcap.fE, name->fcap_ver);
+ audit_log_cap(ab, "cap_fp", &name->fcap.permitted);
+ audit_log_cap(ab, "cap_fi", &name->fcap.inheritable);
+ audit_log_format(ab, " cap_fe=%d cap_fver=%x",
+ name->fcap.fE, name->fcap_ver);
}
static inline int audit_copy_fcaps(struct audit_names *name,
diff --git a/kernel/audit.h b/kernel/audit.h
index ddfce2ea4891..b331d9b83f63 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -68,6 +68,7 @@ struct audit_cap_data {
unsigned int fE; /* effective bit of file cap */
kernel_cap_t effective; /* effective set of process */
};
+ kernel_cap_t ambient;
};
/* When fs/namei.c:getname() is called, we store the pointer in name and bump
@@ -247,13 +248,13 @@ struct audit_netlink_list {
struct sk_buff_head q;
};
-int audit_send_list(void *);
+int audit_send_list(void *_dest);
extern int selinux_audit_rule_update(void);
extern struct mutex audit_filter_mutex;
-extern int audit_del_rule(struct audit_entry *);
-extern void audit_free_rule_rcu(struct rcu_head *);
+extern int audit_del_rule(struct audit_entry *entry);
+extern void audit_free_rule_rcu(struct rcu_head *head);
extern struct list_head audit_filter_list[];
extern struct audit_entry *audit_dupe_rule(struct audit_krule *old);
@@ -301,17 +302,17 @@ extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark
#endif /* CONFIG_AUDIT_WATCH */
#ifdef CONFIG_AUDIT_TREE
-extern struct audit_chunk *audit_tree_lookup(const struct inode *);
-extern void audit_put_chunk(struct audit_chunk *);
-extern bool audit_tree_match(struct audit_chunk *, struct audit_tree *);
-extern int audit_make_tree(struct audit_krule *, char *, u32);
-extern int audit_add_tree_rule(struct audit_krule *);
-extern int audit_remove_tree_rule(struct audit_krule *);
+extern struct audit_chunk *audit_tree_lookup(const struct inode *inode);
+extern void audit_put_chunk(struct audit_chunk *chunk);
+extern bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree);
+extern int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op);
+extern int audit_add_tree_rule(struct audit_krule *rule);
+extern int audit_remove_tree_rule(struct audit_krule *rule);
extern void audit_trim_trees(void);
extern int audit_tag_tree(char *old, char *new);
-extern const char *audit_tree_path(struct audit_tree *);
-extern void audit_put_tree(struct audit_tree *);
-extern void audit_kill_trees(struct list_head *);
+extern const char *audit_tree_path(struct audit_tree *tree);
+extern void audit_put_tree(struct audit_tree *tree);
+extern void audit_kill_trees(struct list_head *list);
#else
#define audit_remove_tree_rule(rule) BUG()
#define audit_add_tree_rule(rule) -EINVAL
@@ -323,7 +324,7 @@ extern void audit_kill_trees(struct list_head *);
#define audit_kill_trees(list) BUG()
#endif
-extern char *audit_unpack_string(void **, size_t *, size_t);
+extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len);
extern pid_t audit_sig_pid;
extern kuid_t audit_sig_uid;
@@ -333,7 +334,7 @@ extern int audit_filter(int msgtype, unsigned int listtype);
#ifdef CONFIG_AUDITSYSCALL
extern int audit_signal_info(int sig, struct task_struct *t);
-extern void audit_filter_inodes(struct task_struct *, struct audit_context *);
+extern void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx);
extern struct list_head *audit_killed_trees(void);
#else
#define audit_signal_info(s,t) AUDIT_DISABLED
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 62d686d96581..9eb8b3511636 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -66,7 +66,7 @@ static struct fsnotify_group *audit_watch_group;
/* fsnotify events we care about. */
#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
- FS_MOVE_SELF | FS_EVENT_ON_CHILD)
+ FS_MOVE_SELF | FS_EVENT_ON_CHILD | FS_UNMOUNT)
static void audit_free_parent(struct audit_parent *parent)
{
@@ -457,13 +457,15 @@ void audit_remove_watch_rule(struct audit_krule *krule)
list_del(&krule->rlist);
if (list_empty(&watch->rules)) {
+ /*
+ * audit_remove_watch() drops our reference to 'parent' which
+ * can get freed. Grab our own reference to be safe.
+ */
+ audit_get_parent(parent);
audit_remove_watch(watch);
-
- if (list_empty(&parent->watches)) {
- audit_get_parent(parent);
+ if (list_empty(&parent->watches))
fsnotify_destroy_mark(&parent->mark, audit_watch_group);
- audit_put_parent(parent);
- }
+ audit_put_parent(parent);
}
}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index bb724baa7ac9..aac1a41f82bd 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1261,6 +1261,7 @@ static void show_special(struct audit_context *context, int *call_panic)
audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable);
audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted);
audit_log_cap(ab, "cap_pe", &context->capset.cap.effective);
+ audit_log_cap(ab, "cap_pa", &context->capset.cap.ambient);
break;
case AUDIT_MMAP:
audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd,
@@ -1382,9 +1383,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
audit_log_cap(ab, "old_pp", &axs->old_pcap.permitted);
audit_log_cap(ab, "old_pi", &axs->old_pcap.inheritable);
audit_log_cap(ab, "old_pe", &axs->old_pcap.effective);
- audit_log_cap(ab, "new_pp", &axs->new_pcap.permitted);
- audit_log_cap(ab, "new_pi", &axs->new_pcap.inheritable);
- audit_log_cap(ab, "new_pe", &axs->new_pcap.effective);
+ audit_log_cap(ab, "old_pa", &axs->old_pcap.ambient);
+ audit_log_cap(ab, "pp", &axs->new_pcap.permitted);
+ audit_log_cap(ab, "pi", &axs->new_pcap.inheritable);
+ audit_log_cap(ab, "pe", &axs->new_pcap.effective);
+ audit_log_cap(ab, "pa", &axs->new_pcap.ambient);
break; }
}
@@ -1459,7 +1462,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
}
/**
- * audit_free - free a per-task audit context
+ * __audit_free - free a per-task audit context
* @tsk: task whose audit context block to free
*
* Called from copy_process and do_exit
@@ -1486,7 +1489,7 @@ void __audit_free(struct task_struct *tsk)
}
/**
- * audit_syscall_entry - fill in an audit record at syscall entry
+ * __audit_syscall_entry - fill in an audit record at syscall entry
* @major: major syscall type (function)
* @a1: additional syscall register 1
* @a2: additional syscall register 2
@@ -1533,14 +1536,14 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
return;
context->serial = 0;
- ktime_get_real_ts64(&context->ctime);
+ context->ctime = current_kernel_time64();
context->in_syscall = 1;
context->current_state = state;
context->ppid = 0;
}
/**
- * audit_syscall_exit - deallocate audit context after a system call
+ * __audit_syscall_exit - deallocate audit context after a system call
* @success: success value of the syscall
* @return_code: return value of the syscall
*
@@ -1702,7 +1705,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context,
}
/**
- * audit_reusename - fill out filename with info from existing entry
+ * __audit_reusename - fill out filename with info from existing entry
* @uptr: userland ptr to pathname
*
* Search the audit_names list for the current audit context. If there is an
@@ -1727,7 +1730,7 @@ __audit_reusename(const __user char *uptr)
}
/**
- * audit_getname - add a name to the list
+ * __audit_getname - add a name to the list
* @name: name to add
*
* Add a name to the list of audit names for this context.
@@ -2132,7 +2135,7 @@ void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
}
/**
- * audit_ipc_obj - record audit data for ipc object
+ * __audit_ipc_obj - record audit data for ipc object
* @ipcp: ipc permissions
*
*/
@@ -2148,7 +2151,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
}
/**
- * audit_ipc_set_perm - record audit data for new ipc permissions
+ * __audit_ipc_set_perm - record audit data for new ipc permissions
* @qbytes: msgq bytes
* @uid: msgq user id
* @gid: msgq group id
@@ -2177,7 +2180,7 @@ void __audit_bprm(struct linux_binprm *bprm)
/**
- * audit_socketcall - record audit data for sys_socketcall
+ * __audit_socketcall - record audit data for sys_socketcall
* @nargs: number of args, which should not be more than AUDITSC_ARGS.
* @args: args array
*
@@ -2208,7 +2211,7 @@ void __audit_fd_pair(int fd1, int fd2)
}
/**
- * audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto
+ * __audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto
* @len: data length in user space
* @a: data address in kernel space
*
@@ -2342,10 +2345,12 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
ax->old_pcap.permitted = old->cap_permitted;
ax->old_pcap.inheritable = old->cap_inheritable;
ax->old_pcap.effective = old->cap_effective;
+ ax->old_pcap.ambient = old->cap_ambient;
ax->new_pcap.permitted = new->cap_permitted;
ax->new_pcap.inheritable = new->cap_inheritable;
ax->new_pcap.effective = new->cap_effective;
+ ax->new_pcap.ambient = new->cap_ambient;
return 0;
}
@@ -2364,6 +2369,7 @@ void __audit_log_capset(const struct cred *new, const struct cred *old)
context->capset.cap.effective = new->cap_effective;
context->capset.cap.inheritable = new->cap_effective;
context->capset.cap.permitted = new->cap_permitted;
+ context->capset.cap.ambient = new->cap_ambient;
context->type = AUDIT_CAPSET;
}
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index e1e5e658f2db..897daa005b23 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,7 +1,13 @@
obj-y := core.o
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
+ifeq ($(CONFIG_NET),y)
+obj-$(CONFIG_BPF_SYSCALL) += devmap.o
+ifeq ($(CONFIG_STREAM_PARSER),y)
+obj-$(CONFIG_BPF_SYSCALL) += sockmap.o
+endif
+endif
ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 172dc8ee0e3b..98c0f00c3f5e 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -49,13 +49,15 @@ static int bpf_array_alloc_percpu(struct bpf_array *array)
static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
+ int numa_node = bpf_map_attr_numa_node(attr);
struct bpf_array *array;
u64 array_size;
u32 elem_size;
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
- attr->value_size == 0 || attr->map_flags)
+ attr->value_size == 0 || attr->map_flags & ~BPF_F_NUMA_NODE ||
+ (percpu && numa_node != NUMA_NO_NODE))
return ERR_PTR(-EINVAL);
if (attr->value_size > KMALLOC_MAX_SIZE)
@@ -77,7 +79,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
return ERR_PTR(-ENOMEM);
/* allocate all map elements and zero-initialize them */
- array = bpf_map_area_alloc(array_size);
+ array = bpf_map_area_alloc(array_size, numa_node);
if (!array)
return ERR_PTR(-ENOMEM);
@@ -87,6 +89,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
array->map.value_size = attr->value_size;
array->map.max_entries = attr->max_entries;
array->map.map_flags = attr->map_flags;
+ array->map.numa_node = numa_node;
array->elem_size = elem_size;
if (!percpu)
@@ -335,6 +338,26 @@ static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
}
/* only called from syscall */
+int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
+{
+ void **elem, *ptr;
+ int ret = 0;
+
+ if (!map->ops->map_fd_sys_lookup_elem)
+ return -ENOTSUPP;
+
+ rcu_read_lock();
+ elem = array_map_lookup_elem(map, key);
+ if (elem && (ptr = READ_ONCE(*elem)))
+ *value = map->ops->map_fd_sys_lookup_elem(ptr);
+ else
+ ret = -ENOENT;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/* only called from syscall */
int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
void *key, void *value, u64 map_flags)
{
@@ -400,6 +423,11 @@ static void prog_fd_array_put_ptr(void *ptr)
bpf_prog_put(ptr);
}
+static u32 prog_fd_array_sys_lookup_elem(void *ptr)
+{
+ return ((struct bpf_prog *)ptr)->aux->id;
+}
+
/* decrement refcnt of all bpf_progs that are stored in this map */
void bpf_fd_array_map_clear(struct bpf_map *map)
{
@@ -418,6 +446,7 @@ const struct bpf_map_ops prog_array_map_ops = {
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = prog_fd_array_get_ptr,
.map_fd_put_ptr = prog_fd_array_put_ptr,
+ .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
};
static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
@@ -452,38 +481,24 @@ static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee)
static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
struct file *map_file, int fd)
{
- const struct perf_event_attr *attr;
struct bpf_event_entry *ee;
struct perf_event *event;
struct file *perf_file;
+ u64 value;
perf_file = perf_event_get(fd);
if (IS_ERR(perf_file))
return perf_file;
+ ee = ERR_PTR(-EOPNOTSUPP);
event = perf_file->private_data;
- ee = ERR_PTR(-EINVAL);
-
- attr = perf_event_attrs(event);
- if (IS_ERR(attr) || attr->inherit)
+ if (perf_event_read_local(event, &value) == -EOPNOTSUPP)
goto err_out;
- switch (attr->type) {
- case PERF_TYPE_SOFTWARE:
- if (attr->config != PERF_COUNT_SW_BPF_OUTPUT)
- goto err_out;
- /* fall-through */
- case PERF_TYPE_RAW:
- case PERF_TYPE_HARDWARE:
- ee = bpf_event_entry_gen(perf_file, map_file);
- if (ee)
- return ee;
- ee = ERR_PTR(-ENOMEM);
- /* fall-through */
- default:
- break;
- }
-
+ ee = bpf_event_entry_gen(perf_file, map_file);
+ if (ee)
+ return ee;
+ ee = ERR_PTR(-ENOMEM);
err_out:
fput(perf_file);
return ee;
@@ -591,6 +606,31 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
return READ_ONCE(*inner_map);
}
+static u32 array_of_map_gen_lookup(struct bpf_map *map,
+ struct bpf_insn *insn_buf)
+{
+ u32 elem_size = round_up(map->value_size, 8);
+ struct bpf_insn *insn = insn_buf;
+ const int ret = BPF_REG_0;
+ const int map_ptr = BPF_REG_1;
+ const int index = BPF_REG_2;
+
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
+ *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
+ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
+ if (is_power_of_2(elem_size))
+ *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
+ else
+ *insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size);
+ *insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr);
+ *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
+ *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *insn++ = BPF_MOV64_IMM(ret, 0);
+
+ return insn - insn_buf;
+}
+
const struct bpf_map_ops array_of_maps_map_ops = {
.map_alloc = array_of_map_alloc,
.map_free = array_of_map_free,
@@ -599,4 +639,6 @@ const struct bpf_map_ops array_of_maps_map_ops = {
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = bpf_map_fd_get_ptr,
.map_fd_put_ptr = bpf_map_fd_put_ptr,
+ .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
+ .map_gen_lookup = array_of_map_gen_lookup,
};
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
index 5c35a98d02bf..7d4f89b7cb84 100644
--- a/kernel/bpf/bpf_lru_list.h
+++ b/kernel/bpf/bpf_lru_list.h
@@ -69,7 +69,8 @@ static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node)
/* ref is an approximation on access frequency. It does not
* have to be very accurate. Hence, no protection is used.
*/
- node->ref = 1;
+ if (!node->ref)
+ node->ref = 1;
}
int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index ea6033cba947..546113430049 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -236,3 +236,40 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
return ret;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
+
+/**
+ * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock
+ * @sk: socket to get cgroup from
+ * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
+ * sk with connection information (IP addresses, etc.) May not contain
+ * cgroup info if it is a req sock.
+ * @type: The type of program to be exectuted
+ *
+ * socket passed is expected to be of type INET or INET6.
+ *
+ * The program type passed in via @type must be suitable for sock_ops
+ * filtering. No further check is performed to assert that.
+ *
+ * This function will return %-EPERM if any if an attached program was found
+ * and if it returned != 1 during execution. In all other cases, 0 is returned.
+ */
+int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
+ struct bpf_sock_ops_kern *sock_ops,
+ enum bpf_attach_type type)
+{
+ struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ struct bpf_prog *prog;
+ int ret = 0;
+
+
+ rcu_read_lock();
+
+ prog = rcu_dereference(cgrp->bpf.effective[type]);
+ if (prog)
+ ret = BPF_PROG_RUN(prog, sock_ops) == 1 ? 0 : -EPERM;
+
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index dedf367f59bb..917cc04a0a94 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -595,9 +595,13 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
case BPF_JMP | BPF_JEQ | BPF_K:
case BPF_JMP | BPF_JNE | BPF_K:
case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JLT | BPF_K:
case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JLE | BPF_K:
case BPF_JMP | BPF_JSGT | BPF_K:
+ case BPF_JMP | BPF_JSLT | BPF_K:
case BPF_JMP | BPF_JSGE | BPF_K:
+ case BPF_JMP | BPF_JSLE | BPF_K:
case BPF_JMP | BPF_JSET | BPF_K:
/* Accommodate for extra offset in case of a backjump. */
off = from->off;
@@ -763,10 +767,10 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
*
* Decode and execute eBPF instructions.
*/
-static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
+static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn,
+ u64 *stack)
{
- u64 stack[MAX_BPF_STACK / sizeof(u64)];
- u64 regs[MAX_BPF_REG], tmp;
+ u64 tmp;
static const void *jumptable[256] = {
[0 ... 255] = &&default_label,
/* Now overwrite non-defaults ... */
@@ -824,7 +828,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
[BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
/* Call instruction */
[BPF_JMP | BPF_CALL] = &&JMP_CALL,
- [BPF_JMP | BPF_CALL | BPF_X] = &&JMP_TAIL_CALL,
+ [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
/* Jumps */
[BPF_JMP | BPF_JA] = &&JMP_JA,
[BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
@@ -833,12 +837,20 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
[BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K,
[BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X,
[BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K,
+ [BPF_JMP | BPF_JLT | BPF_X] = &&JMP_JLT_X,
+ [BPF_JMP | BPF_JLT | BPF_K] = &&JMP_JLT_K,
[BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X,
[BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K,
+ [BPF_JMP | BPF_JLE | BPF_X] = &&JMP_JLE_X,
+ [BPF_JMP | BPF_JLE | BPF_K] = &&JMP_JLE_K,
[BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X,
[BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K,
+ [BPF_JMP | BPF_JSLT | BPF_X] = &&JMP_JSLT_X,
+ [BPF_JMP | BPF_JSLT | BPF_K] = &&JMP_JSLT_K,
[BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X,
[BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K,
+ [BPF_JMP | BPF_JSLE | BPF_X] = &&JMP_JSLE_X,
+ [BPF_JMP | BPF_JSLE | BPF_K] = &&JMP_JSLE_K,
[BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X,
[BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K,
/* Program return */
@@ -874,9 +886,6 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
#define CONT ({ insn++; goto select_insn; })
#define CONT_JMP ({ insn++; goto select_insn; })
- FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
- ARG1 = (u64) (unsigned long) ctx;
-
select_insn:
goto *jumptable[insn->code];
@@ -1076,6 +1085,18 @@ out:
CONT_JMP;
}
CONT;
+ JMP_JLT_X:
+ if (DST < SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JLT_K:
+ if (DST < IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
JMP_JGE_X:
if (DST >= SRC) {
insn += insn->off;
@@ -1088,6 +1109,18 @@ out:
CONT_JMP;
}
CONT;
+ JMP_JLE_X:
+ if (DST <= SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JLE_K:
+ if (DST <= IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
JMP_JSGT_X:
if (((s64) DST) > ((s64) SRC)) {
insn += insn->off;
@@ -1100,6 +1133,18 @@ out:
CONT_JMP;
}
CONT;
+ JMP_JSLT_X:
+ if (((s64) DST) < ((s64) SRC)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSLT_K:
+ if (((s64) DST) < ((s64) IMM)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
JMP_JSGE_X:
if (((s64) DST) >= ((s64) SRC)) {
insn += insn->off;
@@ -1112,6 +1157,18 @@ out:
CONT_JMP;
}
CONT;
+ JMP_JSLE_X:
+ if (((s64) DST) <= ((s64) SRC)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSLE_K:
+ if (((s64) DST) <= ((s64) IMM)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
JMP_JSET_X:
if (DST & SRC) {
insn += insn->off;
@@ -1219,7 +1276,39 @@ load_byte:
WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code);
return 0;
}
-STACK_FRAME_NON_STANDARD(__bpf_prog_run); /* jump table */
+STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */
+
+#define PROG_NAME(stack_size) __bpf_prog_run##stack_size
+#define DEFINE_BPF_PROG_RUN(stack_size) \
+static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \
+{ \
+ u64 stack[stack_size / sizeof(u64)]; \
+ u64 regs[MAX_BPF_REG]; \
+\
+ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
+ ARG1 = (u64) (unsigned long) ctx; \
+ return ___bpf_prog_run(regs, insn, stack); \
+}
+
+#define EVAL1(FN, X) FN(X)
+#define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y)
+#define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y)
+#define EVAL4(FN, X, Y...) FN(X) EVAL3(FN, Y)
+#define EVAL5(FN, X, Y...) FN(X) EVAL4(FN, Y)
+#define EVAL6(FN, X, Y...) FN(X) EVAL5(FN, Y)
+
+EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192);
+EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384);
+EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512);
+
+#define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size),
+
+static unsigned int (*interpreters[])(const void *ctx,
+ const struct bpf_insn *insn) = {
+EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
+EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
+EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
+};
bool bpf_prog_array_compatible(struct bpf_array *array,
const struct bpf_prog *fp)
@@ -1268,7 +1357,9 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
*/
struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
{
- fp->bpf_func = (void *) __bpf_prog_run;
+ u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);
+
+ fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1];
/* eBPF JITs can rewrite the program in case constant
* blinding is active. However, in case of error during
@@ -1347,6 +1438,7 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
const struct bpf_func_proto bpf_get_current_comm_proto __weak;
+const struct bpf_func_proto bpf_sock_map_update_proto __weak;
const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
{
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
new file mode 100644
index 000000000000..ecf9f99ecc57
--- /dev/null
+++ b/kernel/bpf/devmap.c
@@ -0,0 +1,409 @@
+/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+/* Devmaps primary use is as a backend map for XDP BPF helper call
+ * bpf_redirect_map(). Because XDP is mostly concerned with performance we
+ * spent some effort to ensure the datapath with redirect maps does not use
+ * any locking. This is a quick note on the details.
+ *
+ * We have three possible paths to get into the devmap control plane bpf
+ * syscalls, bpf programs, and driver side xmit/flush operations. A bpf syscall
+ * will invoke an update, delete, or lookup operation. To ensure updates and
+ * deletes appear atomic from the datapath side xchg() is used to modify the
+ * netdev_map array. Then because the datapath does a lookup into the netdev_map
+ * array (read-only) from an RCU critical section we use call_rcu() to wait for
+ * an rcu grace period before free'ing the old data structures. This ensures the
+ * datapath always has a valid copy. However, the datapath does a "flush"
+ * operation that pushes any pending packets in the driver outside the RCU
+ * critical section. Each bpf_dtab_netdev tracks these pending operations using
+ * an atomic per-cpu bitmap. The bpf_dtab_netdev object will not be destroyed
+ * until all bits are cleared indicating outstanding flush operations have
+ * completed.
+ *
+ * BPF syscalls may race with BPF program calls on any of the update, delete
+ * or lookup operations. As noted above the xchg() operation also keep the
+ * netdev_map consistent in this case. From the devmap side BPF programs
+ * calling into these operations are the same as multiple user space threads
+ * making system calls.
+ *
+ * Finally, any of the above may race with a netdev_unregister notifier. The
+ * unregister notifier must search for net devices in the map structure that
+ * contain a reference to the net device and remove them. This is a two step
+ * process (a) dereference the bpf_dtab_netdev object in netdev_map and (b)
+ * check to see if the ifindex is the same as the net_device being removed.
+ * When removing the dev a cmpxchg() is used to ensure the correct dev is
+ * removed, in the case of a concurrent update or delete operation it is
+ * possible that the initially referenced dev is no longer in the map. As the
+ * notifier hook walks the map we know that new dev references can not be
+ * added by the user because core infrastructure ensures dev_get_by_index()
+ * calls will fail at this point.
+ */
+#include <linux/bpf.h>
+#include <linux/filter.h>
+
+struct bpf_dtab_netdev {
+ struct net_device *dev;
+ struct bpf_dtab *dtab;
+ unsigned int bit;
+ struct rcu_head rcu;
+};
+
+struct bpf_dtab {
+ struct bpf_map map;
+ struct bpf_dtab_netdev **netdev_map;
+ unsigned long __percpu *flush_needed;
+ struct list_head list;
+};
+
+static DEFINE_SPINLOCK(dev_map_lock);
+static LIST_HEAD(dev_map_list);
+
+static u64 dev_map_bitmap_size(const union bpf_attr *attr)
+{
+ return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
+}
+
+static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_dtab *dtab;
+ u64 cost;
+ int err;
+
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 || attr->key_size != 4 ||
+ attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+ return ERR_PTR(-EINVAL);
+
+ dtab = kzalloc(sizeof(*dtab), GFP_USER);
+ if (!dtab)
+ return ERR_PTR(-ENOMEM);
+
+ /* mandatory map attributes */
+ dtab->map.map_type = attr->map_type;
+ dtab->map.key_size = attr->key_size;
+ dtab->map.value_size = attr->value_size;
+ dtab->map.max_entries = attr->max_entries;
+ dtab->map.map_flags = attr->map_flags;
+ dtab->map.numa_node = bpf_map_attr_numa_node(attr);
+
+ /* make sure page count doesn't overflow */
+ cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
+ cost += dev_map_bitmap_size(attr) * num_possible_cpus();
+ if (cost >= U32_MAX - PAGE_SIZE)
+ goto free_dtab;
+
+ dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+ /* if map size is larger than memlock limit, reject it early */
+ err = bpf_map_precharge_memlock(dtab->map.pages);
+ if (err)
+ goto free_dtab;
+
+ /* A per cpu bitfield with a bit per possible net device */
+ dtab->flush_needed = __alloc_percpu(dev_map_bitmap_size(attr),
+ __alignof__(unsigned long));
+ if (!dtab->flush_needed)
+ goto free_dtab;
+
+ dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
+ sizeof(struct bpf_dtab_netdev *),
+ dtab->map.numa_node);
+ if (!dtab->netdev_map)
+ goto free_dtab;
+
+ spin_lock(&dev_map_lock);
+ list_add_tail_rcu(&dtab->list, &dev_map_list);
+ spin_unlock(&dev_map_lock);
+
+ return &dtab->map;
+free_dtab:
+ free_percpu(dtab->flush_needed);
+ kfree(dtab);
+ return ERR_PTR(-ENOMEM);
+}
+
+static void dev_map_free(struct bpf_map *map)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ int i, cpu;
+
+ /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+ * so the programs (can be more than one that used this map) were
+ * disconnected from events. Wait for outstanding critical sections in
+ * these programs to complete. The rcu critical section only guarantees
+ * no further reads against netdev_map. It does __not__ ensure pending
+ * flush operations (if any) are complete.
+ */
+
+ spin_lock(&dev_map_lock);
+ list_del_rcu(&dtab->list);
+ spin_unlock(&dev_map_lock);
+
+ synchronize_rcu();
+
+ /* To ensure all pending flush operations have completed wait for flush
+ * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
+ * Because the above synchronize_rcu() ensures the map is disconnected
+ * from the program we can assume no new bits will be set.
+ */
+ for_each_online_cpu(cpu) {
+ unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu);
+
+ while (!bitmap_empty(bitmap, dtab->map.max_entries))
+ cpu_relax();
+ }
+
+ for (i = 0; i < dtab->map.max_entries; i++) {
+ struct bpf_dtab_netdev *dev;
+
+ dev = dtab->netdev_map[i];
+ if (!dev)
+ continue;
+
+ dev_put(dev->dev);
+ kfree(dev);
+ }
+
+ free_percpu(dtab->flush_needed);
+ bpf_map_area_free(dtab->netdev_map);
+ kfree(dtab);
+}
+
+static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ u32 index = key ? *(u32 *)key : U32_MAX;
+ u32 *next = next_key;
+
+ if (index >= dtab->map.max_entries) {
+ *next = 0;
+ return 0;
+ }
+
+ if (index == dtab->map.max_entries - 1)
+ return -ENOENT;
+ *next = index + 1;
+ return 0;
+}
+
+void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
+
+ __set_bit(bit, bitmap);
+}
+
+/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
+ * from the driver before returning from its napi->poll() routine. The poll()
+ * routine is called either from busy_poll context or net_rx_action signaled
+ * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
+ * net device can be torn down. On devmap tear down we ensure the ctx bitmap
+ * is zeroed before completing to ensure all flush operations have completed.
+ */
+void __dev_map_flush(struct bpf_map *map)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
+ u32 bit;
+
+ for_each_set_bit(bit, bitmap, map->max_entries) {
+ struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
+ struct net_device *netdev;
+
+ /* This is possible if the dev entry is removed by user space
+ * between xdp redirect and flush op.
+ */
+ if (unlikely(!dev))
+ continue;
+
+ __clear_bit(bit, bitmap);
+ netdev = dev->dev;
+ if (likely(netdev->netdev_ops->ndo_xdp_flush))
+ netdev->netdev_ops->ndo_xdp_flush(netdev);
+ }
+}
+
+/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
+ * update happens in parallel here a dev_put wont happen until after reading the
+ * ifindex.
+ */
+struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ struct bpf_dtab_netdev *dev;
+
+ if (key >= map->max_entries)
+ return NULL;
+
+ dev = READ_ONCE(dtab->netdev_map[key]);
+ return dev ? dev->dev : NULL;
+}
+
+static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct net_device *dev = __dev_map_lookup_elem(map, *(u32 *)key);
+
+ return dev ? &dev->ifindex : NULL;
+}
+
+static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
+{
+ if (dev->dev->netdev_ops->ndo_xdp_flush) {
+ struct net_device *fl = dev->dev;
+ unsigned long *bitmap;
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu);
+ __clear_bit(dev->bit, bitmap);
+
+ fl->netdev_ops->ndo_xdp_flush(dev->dev);
+ }
+ }
+}
+
+static void __dev_map_entry_free(struct rcu_head *rcu)
+{
+ struct bpf_dtab_netdev *dev;
+
+ dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
+ dev_map_flush_old(dev);
+ dev_put(dev->dev);
+ kfree(dev);
+}
+
+static int dev_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ struct bpf_dtab_netdev *old_dev;
+ int k = *(u32 *)key;
+
+ if (k >= map->max_entries)
+ return -EINVAL;
+
+ /* Use call_rcu() here to ensure any rcu critical sections have
+ * completed, but this does not guarantee a flush has happened
+ * yet. Because driver side rcu_read_lock/unlock only protects the
+ * running XDP program. However, for pending flush operations the
+ * dev and ctx are stored in another per cpu map. And additionally,
+ * the driver tear down ensures all soft irqs are complete before
+ * removing the net device in the case of dev_put equals zero.
+ */
+ old_dev = xchg(&dtab->netdev_map[k], NULL);
+ if (old_dev)
+ call_rcu(&old_dev->rcu, __dev_map_entry_free);
+ return 0;
+}
+
+static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ struct net *net = current->nsproxy->net_ns;
+ struct bpf_dtab_netdev *dev, *old_dev;
+ u32 i = *(u32 *)key;
+ u32 ifindex = *(u32 *)value;
+
+ if (unlikely(map_flags > BPF_EXIST))
+ return -EINVAL;
+ if (unlikely(i >= dtab->map.max_entries))
+ return -E2BIG;
+ if (unlikely(map_flags == BPF_NOEXIST))
+ return -EEXIST;
+
+ if (!ifindex) {
+ dev = NULL;
+ } else {
+ dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
+ map->numa_node);
+ if (!dev)
+ return -ENOMEM;
+
+ dev->dev = dev_get_by_index(net, ifindex);
+ if (!dev->dev) {
+ kfree(dev);
+ return -EINVAL;
+ }
+
+ dev->bit = i;
+ dev->dtab = dtab;
+ }
+
+ /* Use call_rcu() here to ensure rcu critical sections have completed
+ * Remembering the driver side flush operation will happen before the
+ * net device is removed.
+ */
+ old_dev = xchg(&dtab->netdev_map[i], dev);
+ if (old_dev)
+ call_rcu(&old_dev->rcu, __dev_map_entry_free);
+
+ return 0;
+}
+
+const struct bpf_map_ops dev_map_ops = {
+ .map_alloc = dev_map_alloc,
+ .map_free = dev_map_free,
+ .map_get_next_key = dev_map_get_next_key,
+ .map_lookup_elem = dev_map_lookup_elem,
+ .map_update_elem = dev_map_update_elem,
+ .map_delete_elem = dev_map_delete_elem,
+};
+
+static int dev_map_notification(struct notifier_block *notifier,
+ ulong event, void *ptr)
+{
+ struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+ struct bpf_dtab *dtab;
+ int i;
+
+ switch (event) {
+ case NETDEV_UNREGISTER:
+ /* This rcu_read_lock/unlock pair is needed because
+ * dev_map_list is an RCU list AND to ensure a delete
+ * operation does not free a netdev_map entry while we
+ * are comparing it against the netdev being unregistered.
+ */
+ rcu_read_lock();
+ list_for_each_entry_rcu(dtab, &dev_map_list, list) {
+ for (i = 0; i < dtab->map.max_entries; i++) {
+ struct bpf_dtab_netdev *dev, *odev;
+
+ dev = READ_ONCE(dtab->netdev_map[i]);
+ if (!dev ||
+ dev->dev->ifindex != netdev->ifindex)
+ continue;
+ odev = cmpxchg(&dtab->netdev_map[i], dev, NULL);
+ if (dev == odev)
+ call_rcu(&dev->rcu,
+ __dev_map_entry_free);
+ }
+ }
+ rcu_read_unlock();
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block dev_map_notifier = {
+ .notifier_call = dev_map_notification,
+};
+
+static int __init dev_map_init(void)
+{
+ register_netdevice_notifier(&dev_map_notifier);
+ return 0;
+}
+
+subsys_initcall(dev_map_init);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 004334ea13ba..431126f31ea3 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -18,6 +18,9 @@
#include "bpf_lru_list.h"
#include "map_in_map.h"
+#define HTAB_CREATE_FLAG_MASK \
+ (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE)
+
struct bucket {
struct hlist_nulls_head head;
raw_spinlock_t lock;
@@ -138,7 +141,8 @@ static int prealloc_init(struct bpf_htab *htab)
if (!htab_is_percpu(htab) && !htab_is_lru(htab))
num_entries += num_possible_cpus();
- htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries);
+ htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries,
+ htab->map.numa_node);
if (!htab->elems)
return -ENOMEM;
@@ -233,6 +237,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
*/
bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
+ int numa_node = bpf_map_attr_numa_node(attr);
struct bpf_htab *htab;
int err, i;
u64 cost;
@@ -248,7 +253,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
*/
return ERR_PTR(-EPERM);
- if (attr->map_flags & ~(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU))
+ if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK)
/* reserved bits should not be used */
return ERR_PTR(-EINVAL);
@@ -258,6 +263,9 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
if (lru && !prealloc)
return ERR_PTR(-ENOTSUPP);
+ if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
+ return ERR_PTR(-EINVAL);
+
htab = kzalloc(sizeof(*htab), GFP_USER);
if (!htab)
return ERR_PTR(-ENOMEM);
@@ -268,6 +276,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
htab->map.value_size = attr->value_size;
htab->map.max_entries = attr->max_entries;
htab->map.map_flags = attr->map_flags;
+ htab->map.numa_node = numa_node;
/* check sanity of attributes.
* value_size == 0 may be allowed in the future to use map as a set
@@ -346,7 +355,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
err = -ENOMEM;
htab->buckets = bpf_map_area_alloc(htab->n_buckets *
- sizeof(struct bucket));
+ sizeof(struct bucket),
+ htab->map.numa_node);
if (!htab->buckets)
goto free_htab;
@@ -504,6 +514,29 @@ static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
return NULL;
}
+static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
+ struct bpf_insn *insn_buf)
+{
+ struct bpf_insn *insn = insn_buf;
+ const int ret = BPF_REG_0;
+ const int ref_reg = BPF_REG_1;
+
+ *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem);
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
+ *insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
+ offsetof(struct htab_elem, lru_node) +
+ offsetof(struct bpf_lru_node, ref));
+ *insn++ = BPF_JMP_IMM(BPF_JNE, ref_reg, 0, 1);
+ *insn++ = BPF_ST_MEM(BPF_B, ret,
+ offsetof(struct htab_elem, lru_node) +
+ offsetof(struct bpf_lru_node, ref),
+ 1);
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
+ offsetof(struct htab_elem, key) +
+ round_up(map->key_size, 8));
+ return insn - insn_buf;
+}
+
/* It is called from the bpf_lru_list when the LRU needs to delete
* older elements from the htab.
*/
@@ -652,12 +685,27 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
}
}
+static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
+{
+ return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS &&
+ BITS_PER_LONG == 64;
+}
+
+static u32 htab_size_value(const struct bpf_htab *htab, bool percpu)
+{
+ u32 size = htab->map.value_size;
+
+ if (percpu || fd_htab_map_needs_adjust(htab))
+ size = round_up(size, 8);
+ return size;
+}
+
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
void *value, u32 key_size, u32 hash,
bool percpu, bool onallcpus,
struct htab_elem *old_elem)
{
- u32 size = htab->map.value_size;
+ u32 size = htab_size_value(htab, percpu);
bool prealloc = htab_is_prealloc(htab);
struct htab_elem *l_new, **pl_new;
void __percpu *pptr;
@@ -689,16 +737,14 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
atomic_dec(&htab->count);
return ERR_PTR(-E2BIG);
}
- l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
+ l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
+ htab->map.numa_node);
if (!l_new)
return ERR_PTR(-ENOMEM);
}
memcpy(l_new->key, key, key_size);
if (percpu) {
- /* round up value_size to 8 bytes */
- size = round_up(size, 8);
-
if (prealloc) {
pptr = htab_elem_get_ptr(l_new, key_size);
} else {
@@ -1114,6 +1160,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_lookup_elem = htab_lru_map_lookup_elem,
.map_update_elem = htab_lru_map_update_elem,
.map_delete_elem = htab_lru_map_delete_elem,
+ .map_gen_lookup = htab_lru_map_gen_lookup,
};
/* Called from eBPF program */
@@ -1209,17 +1256,9 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr)
{
- struct bpf_map *map;
-
if (attr->value_size != sizeof(u32))
return ERR_PTR(-EINVAL);
-
- /* pointer is stored internally */
- attr->value_size = sizeof(void *);
- map = htab_map_alloc(attr);
- attr->value_size = sizeof(u32);
-
- return map;
+ return htab_map_alloc(attr);
}
static void fd_htab_map_free(struct bpf_map *map)
@@ -1244,6 +1283,26 @@ static void fd_htab_map_free(struct bpf_map *map)
}
/* only called from syscall */
+int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
+{
+ void **ptr;
+ int ret = 0;
+
+ if (!map->ops->map_fd_sys_lookup_elem)
+ return -ENOTSUPP;
+
+ rcu_read_lock();
+ ptr = htab_map_lookup_elem(map, key);
+ if (ptr)
+ *value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr));
+ else
+ ret = -ENOENT;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/* only called from syscall */
int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
void *key, void *value, u64 map_flags)
{
@@ -1291,6 +1350,22 @@ static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key)
return READ_ONCE(*inner_map);
}
+static u32 htab_of_map_gen_lookup(struct bpf_map *map,
+ struct bpf_insn *insn_buf)
+{
+ struct bpf_insn *insn = insn_buf;
+ const int ret = BPF_REG_0;
+
+ *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem);
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2);
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
+ offsetof(struct htab_elem, key) +
+ round_up(map->key_size, 8));
+ *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);
+
+ return insn - insn_buf;
+}
+
static void htab_of_map_free(struct bpf_map *map)
{
bpf_map_meta_free(map->inner_map_meta);
@@ -1305,4 +1380,6 @@ const struct bpf_map_ops htab_of_maps_map_ops = {
.map_delete_elem = htab_map_delete_elem,
.map_fd_get_ptr = bpf_map_fd_get_ptr,
.map_fd_put_ptr = bpf_map_fd_put_ptr,
+ .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
+ .map_gen_lookup = htab_of_map_gen_lookup,
};
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 9bbd33497d3d..e833ed914358 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -377,10 +377,22 @@ static void bpf_evict_inode(struct inode *inode)
bpf_any_put(inode->i_private, type);
}
+/*
+ * Display the mount options in /proc/mounts.
+ */
+static int bpf_show_options(struct seq_file *m, struct dentry *root)
+{
+ umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX;
+
+ if (mode != S_IRWXUGO)
+ seq_printf(m, ",mode=%o", mode);
+ return 0;
+}
+
static const struct super_operations bpf_super_ops = {
.statfs = simple_statfs,
.drop_inode = generic_delete_inode,
- .show_options = generic_show_options,
+ .show_options = bpf_show_options,
.evict_inode = bpf_evict_inode,
};
@@ -434,8 +446,6 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent)
struct inode *inode;
int ret;
- save_mount_options(sb, data);
-
ret = bpf_parse_options(data, &opts);
if (ret)
return ret;
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index b09185f0f17d..1b767844a76f 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -244,7 +244,8 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
if (value)
size += trie->map.value_size;
- node = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN);
+ node = kmalloc_node(size, GFP_ATOMIC | __GFP_NOWARN,
+ trie->map.numa_node);
if (!node)
return NULL;
@@ -405,6 +406,8 @@ static int trie_delete_elem(struct bpf_map *map, void *key)
#define LPM_KEY_SIZE_MAX LPM_KEY_SIZE(LPM_DATA_SIZE_MAX)
#define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN)
+#define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE)
+
static struct bpf_map *trie_alloc(union bpf_attr *attr)
{
struct lpm_trie *trie;
@@ -416,7 +419,8 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
/* check sanity of attributes */
if (attr->max_entries == 0 ||
- attr->map_flags != BPF_F_NO_PREALLOC ||
+ !(attr->map_flags & BPF_F_NO_PREALLOC) ||
+ attr->map_flags & ~LPM_CREATE_FLAG_MASK ||
attr->key_size < LPM_KEY_SIZE_MIN ||
attr->key_size > LPM_KEY_SIZE_MAX ||
attr->value_size < LPM_VAL_SIZE_MIN ||
@@ -433,6 +437,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
trie->map.value_size = attr->value_size;
trie->map.max_entries = attr->max_entries;
trie->map.map_flags = attr->map_flags;
+ trie->map.numa_node = bpf_map_attr_numa_node(attr);
trie->data_size = attr->key_size -
offsetof(struct bpf_lpm_trie_key, data);
trie->max_prefixlen = trie->data_size * 8;
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 59bcdf821ae4..1da574612bea 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -95,3 +95,8 @@ void bpf_map_fd_put_ptr(void *ptr)
*/
bpf_map_put(ptr);
}
+
+u32 bpf_map_fd_sys_lookup_elem(void *ptr)
+{
+ return ((struct bpf_map *)ptr)->id;
+}
diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h
index 177fadb689dc..6183db9ec08c 100644
--- a/kernel/bpf/map_in_map.h
+++ b/kernel/bpf/map_in_map.h
@@ -19,5 +19,6 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0,
void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file,
int ufd);
void bpf_map_fd_put_ptr(void *ptr);
+u32 bpf_map_fd_sys_lookup_elem(void *ptr);
#endif
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
new file mode 100644
index 000000000000..f6ffde9c6a68
--- /dev/null
+++ b/kernel/bpf/sockmap.c
@@ -0,0 +1,873 @@
+/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+/* A BPF sock_map is used to store sock objects. This is primarly used
+ * for doing socket redirect with BPF helper routines.
+ *
+ * A sock map may have BPF programs attached to it, currently a program
+ * used to parse packets and a program to provide a verdict and redirect
+ * decision on the packet are supported. Any programs attached to a sock
+ * map are inherited by sock objects when they are added to the map. If
+ * no BPF programs are attached the sock object may only be used for sock
+ * redirect.
+ *
+ * A sock object may be in multiple maps, but can only inherit a single
+ * parse or verdict program. If adding a sock object to a map would result
+ * in having multiple parsing programs the update will return an EBUSY error.
+ *
+ * For reference this program is similar to devmap used in XDP context
+ * reviewing these together may be useful. For an example please review
+ * ./samples/bpf/sockmap/.
+ */
+#include <linux/bpf.h>
+#include <net/sock.h>
+#include <linux/filter.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/kernel.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/workqueue.h>
+#include <linux/list.h>
+#include <net/strparser.h>
+
+struct bpf_stab {
+ struct bpf_map map;
+ struct sock **sock_map;
+ struct bpf_prog *bpf_parse;
+ struct bpf_prog *bpf_verdict;
+};
+
+enum smap_psock_state {
+ SMAP_TX_RUNNING,
+};
+
+struct smap_psock_map_entry {
+ struct list_head list;
+ struct sock **entry;
+};
+
+struct smap_psock {
+ struct rcu_head rcu;
+ /* refcnt is used inside sk_callback_lock */
+ u32 refcnt;
+
+ /* datapath variables */
+ struct sk_buff_head rxqueue;
+ bool strp_enabled;
+
+ /* datapath error path cache across tx work invocations */
+ int save_rem;
+ int save_off;
+ struct sk_buff *save_skb;
+
+ struct strparser strp;
+ struct bpf_prog *bpf_parse;
+ struct bpf_prog *bpf_verdict;
+ struct list_head maps;
+
+ /* Back reference used when sock callback trigger sockmap operations */
+ struct sock *sock;
+ unsigned long state;
+
+ struct work_struct tx_work;
+ struct work_struct gc_work;
+
+ void (*save_data_ready)(struct sock *sk);
+ void (*save_write_space)(struct sock *sk);
+ void (*save_state_change)(struct sock *sk);
+};
+
+static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
+{
+ return rcu_dereference_sk_user_data(sk);
+}
+
+static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
+{
+ struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict);
+ int rc;
+
+ if (unlikely(!prog))
+ return SK_DROP;
+
+ skb_orphan(skb);
+ skb->sk = psock->sock;
+ bpf_compute_data_end(skb);
+ rc = (*prog->bpf_func)(skb, prog->insnsi);
+ skb->sk = NULL;
+
+ return rc;
+}
+
+static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
+{
+ struct sock *sk;
+ int rc;
+
+ /* Because we use per cpu values to feed input from sock redirect
+ * in BPF program to do_sk_redirect_map() call we need to ensure we
+ * are not preempted. RCU read lock is not sufficient in this case
+ * with CONFIG_PREEMPT_RCU enabled so we must be explicit here.
+ */
+ preempt_disable();
+ rc = smap_verdict_func(psock, skb);
+ switch (rc) {
+ case SK_REDIRECT:
+ sk = do_sk_redirect_map();
+ preempt_enable();
+ if (likely(sk)) {
+ struct smap_psock *peer = smap_psock_sk(sk);
+
+ if (likely(peer &&
+ test_bit(SMAP_TX_RUNNING, &peer->state) &&
+ !sock_flag(sk, SOCK_DEAD) &&
+ sock_writeable(sk))) {
+ skb_set_owner_w(skb, sk);
+ skb_queue_tail(&peer->rxqueue, skb);
+ schedule_work(&peer->tx_work);
+ break;
+ }
+ }
+ /* Fall through and free skb otherwise */
+ case SK_DROP:
+ default:
+ if (rc != SK_REDIRECT)
+ preempt_enable();
+ kfree_skb(skb);
+ }
+}
+
+static void smap_report_sk_error(struct smap_psock *psock, int err)
+{
+ struct sock *sk = psock->sock;
+
+ sk->sk_err = err;
+ sk->sk_error_report(sk);
+}
+
+static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
+
+/* Called with lock_sock(sk) held */
+static void smap_state_change(struct sock *sk)
+{
+ struct smap_psock_map_entry *e, *tmp;
+ struct smap_psock *psock;
+ struct socket_wq *wq;
+ struct sock *osk;
+
+ rcu_read_lock();
+
+ /* Allowing transitions into an established syn_recv states allows
+ * for early binding sockets to a smap object before the connection
+ * is established.
+ */
+ switch (sk->sk_state) {
+ case TCP_SYN_SENT:
+ case TCP_SYN_RECV:
+ case TCP_ESTABLISHED:
+ break;
+ case TCP_CLOSE_WAIT:
+ case TCP_CLOSING:
+ case TCP_LAST_ACK:
+ case TCP_FIN_WAIT1:
+ case TCP_FIN_WAIT2:
+ case TCP_LISTEN:
+ break;
+ case TCP_CLOSE:
+ /* Only release if the map entry is in fact the sock in
+ * question. There is a case where the operator deletes
+ * the sock from the map, but the TCP sock is closed before
+ * the psock is detached. Use cmpxchg to verify correct
+ * sock is removed.
+ */
+ psock = smap_psock_sk(sk);
+ if (unlikely(!psock))
+ break;
+ write_lock_bh(&sk->sk_callback_lock);
+ list_for_each_entry_safe(e, tmp, &psock->maps, list) {
+ osk = cmpxchg(e->entry, sk, NULL);
+ if (osk == sk) {
+ list_del(&e->list);
+ smap_release_sock(psock, sk);
+ }
+ }
+ write_unlock_bh(&sk->sk_callback_lock);
+ break;
+ default:
+ psock = smap_psock_sk(sk);
+ if (unlikely(!psock))
+ break;
+ smap_report_sk_error(psock, EPIPE);
+ break;
+ }
+
+ wq = rcu_dereference(sk->sk_wq);
+ if (skwq_has_sleeper(wq))
+ wake_up_interruptible_all(&wq->wait);
+ rcu_read_unlock();
+}
+
+static void smap_read_sock_strparser(struct strparser *strp,
+ struct sk_buff *skb)
+{
+ struct smap_psock *psock;
+
+ rcu_read_lock();
+ psock = container_of(strp, struct smap_psock, strp);
+ smap_do_verdict(psock, skb);
+ rcu_read_unlock();
+}
+
+/* Called with lock held on socket */
+static void smap_data_ready(struct sock *sk)
+{
+ struct smap_psock *psock;
+
+ rcu_read_lock();
+ psock = smap_psock_sk(sk);
+ if (likely(psock)) {
+ write_lock_bh(&sk->sk_callback_lock);
+ strp_data_ready(&psock->strp);
+ write_unlock_bh(&sk->sk_callback_lock);
+ }
+ rcu_read_unlock();
+}
+
+static void smap_tx_work(struct work_struct *w)
+{
+ struct smap_psock *psock;
+ struct sk_buff *skb;
+ int rem, off, n;
+
+ psock = container_of(w, struct smap_psock, tx_work);
+
+ /* lock sock to avoid losing sk_socket at some point during loop */
+ lock_sock(psock->sock);
+ if (psock->save_skb) {
+ skb = psock->save_skb;
+ rem = psock->save_rem;
+ off = psock->save_off;
+ psock->save_skb = NULL;
+ goto start;
+ }
+
+ while ((skb = skb_dequeue(&psock->rxqueue))) {
+ rem = skb->len;
+ off = 0;
+start:
+ do {
+ if (likely(psock->sock->sk_socket))
+ n = skb_send_sock_locked(psock->sock,
+ skb, off, rem);
+ else
+ n = -EINVAL;
+ if (n <= 0) {
+ if (n == -EAGAIN) {
+ /* Retry when space is available */
+ psock->save_skb = skb;
+ psock->save_rem = rem;
+ psock->save_off = off;
+ goto out;
+ }
+ /* Hard errors break pipe and stop xmit */
+ smap_report_sk_error(psock, n ? -n : EPIPE);
+ clear_bit(SMAP_TX_RUNNING, &psock->state);
+ kfree_skb(skb);
+ goto out;
+ }
+ rem -= n;
+ off += n;
+ } while (rem);
+ kfree_skb(skb);
+ }
+out:
+ release_sock(psock->sock);
+}
+
+static void smap_write_space(struct sock *sk)
+{
+ struct smap_psock *psock;
+
+ rcu_read_lock();
+ psock = smap_psock_sk(sk);
+ if (likely(psock && test_bit(SMAP_TX_RUNNING, &psock->state)))
+ schedule_work(&psock->tx_work);
+ rcu_read_unlock();
+}
+
+static void smap_stop_sock(struct smap_psock *psock, struct sock *sk)
+{
+ if (!psock->strp_enabled)
+ return;
+ sk->sk_data_ready = psock->save_data_ready;
+ sk->sk_write_space = psock->save_write_space;
+ sk->sk_state_change = psock->save_state_change;
+ psock->save_data_ready = NULL;
+ psock->save_write_space = NULL;
+ psock->save_state_change = NULL;
+ strp_stop(&psock->strp);
+ psock->strp_enabled = false;
+}
+
+static void smap_destroy_psock(struct rcu_head *rcu)
+{
+ struct smap_psock *psock = container_of(rcu,
+ struct smap_psock, rcu);
+
+ /* Now that a grace period has passed there is no longer
+ * any reference to this sock in the sockmap so we can
+ * destroy the psock, strparser, and bpf programs. But,
+ * because we use workqueue sync operations we can not
+ * do it in rcu context
+ */
+ schedule_work(&psock->gc_work);
+}
+
+static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
+{
+ psock->refcnt--;
+ if (psock->refcnt)
+ return;
+
+ smap_stop_sock(psock, sock);
+ clear_bit(SMAP_TX_RUNNING, &psock->state);
+ rcu_assign_sk_user_data(sock, NULL);
+ call_rcu_sched(&psock->rcu, smap_destroy_psock);
+}
+
+static int smap_parse_func_strparser(struct strparser *strp,
+ struct sk_buff *skb)
+{
+ struct smap_psock *psock;
+ struct bpf_prog *prog;
+ int rc;
+
+ rcu_read_lock();
+ psock = container_of(strp, struct smap_psock, strp);
+ prog = READ_ONCE(psock->bpf_parse);
+
+ if (unlikely(!prog)) {
+ rcu_read_unlock();
+ return skb->len;
+ }
+
+ /* Attach socket for bpf program to use if needed we can do this
+ * because strparser clones the skb before handing it to a upper
+ * layer, meaning skb_orphan has been called. We NULL sk on the
+ * way out to ensure we don't trigger a BUG_ON in skb/sk operations
+ * later and because we are not charging the memory of this skb to
+ * any socket yet.
+ */
+ skb->sk = psock->sock;
+ bpf_compute_data_end(skb);
+ rc = (*prog->bpf_func)(skb, prog->insnsi);
+ skb->sk = NULL;
+ rcu_read_unlock();
+ return rc;
+}
+
+
+static int smap_read_sock_done(struct strparser *strp, int err)
+{
+ return err;
+}
+
+static int smap_init_sock(struct smap_psock *psock,
+ struct sock *sk)
+{
+ static const struct strp_callbacks cb = {
+ .rcv_msg = smap_read_sock_strparser,
+ .parse_msg = smap_parse_func_strparser,
+ .read_sock_done = smap_read_sock_done,
+ };
+
+ return strp_init(&psock->strp, sk, &cb);
+}
+
+static void smap_init_progs(struct smap_psock *psock,
+ struct bpf_stab *stab,
+ struct bpf_prog *verdict,
+ struct bpf_prog *parse)
+{
+ struct bpf_prog *orig_parse, *orig_verdict;
+
+ orig_parse = xchg(&psock->bpf_parse, parse);
+ orig_verdict = xchg(&psock->bpf_verdict, verdict);
+
+ if (orig_verdict)
+ bpf_prog_put(orig_verdict);
+ if (orig_parse)
+ bpf_prog_put(orig_parse);
+}
+
+static void smap_start_sock(struct smap_psock *psock, struct sock *sk)
+{
+ if (sk->sk_data_ready == smap_data_ready)
+ return;
+ psock->save_data_ready = sk->sk_data_ready;
+ psock->save_write_space = sk->sk_write_space;
+ psock->save_state_change = sk->sk_state_change;
+ sk->sk_data_ready = smap_data_ready;
+ sk->sk_write_space = smap_write_space;
+ sk->sk_state_change = smap_state_change;
+ psock->strp_enabled = true;
+}
+
+static void sock_map_remove_complete(struct bpf_stab *stab)
+{
+ bpf_map_area_free(stab->sock_map);
+ kfree(stab);
+}
+
+static void smap_gc_work(struct work_struct *w)
+{
+ struct smap_psock_map_entry *e, *tmp;
+ struct smap_psock *psock;
+
+ psock = container_of(w, struct smap_psock, gc_work);
+
+ /* no callback lock needed because we already detached sockmap ops */
+ if (psock->strp_enabled)
+ strp_done(&psock->strp);
+
+ cancel_work_sync(&psock->tx_work);
+ __skb_queue_purge(&psock->rxqueue);
+
+ /* At this point all strparser and xmit work must be complete */
+ if (psock->bpf_parse)
+ bpf_prog_put(psock->bpf_parse);
+ if (psock->bpf_verdict)
+ bpf_prog_put(psock->bpf_verdict);
+
+ list_for_each_entry_safe(e, tmp, &psock->maps, list) {
+ list_del(&e->list);
+ kfree(e);
+ }
+
+ sock_put(psock->sock);
+ kfree(psock);
+}
+
+static struct smap_psock *smap_init_psock(struct sock *sock,
+ struct bpf_stab *stab)
+{
+ struct smap_psock *psock;
+
+ psock = kzalloc_node(sizeof(struct smap_psock),
+ GFP_ATOMIC | __GFP_NOWARN,
+ stab->map.numa_node);
+ if (!psock)
+ return ERR_PTR(-ENOMEM);
+
+ psock->sock = sock;
+ skb_queue_head_init(&psock->rxqueue);
+ INIT_WORK(&psock->tx_work, smap_tx_work);
+ INIT_WORK(&psock->gc_work, smap_gc_work);
+ INIT_LIST_HEAD(&psock->maps);
+ psock->refcnt = 1;
+
+ rcu_assign_sk_user_data(sock, psock);
+ sock_hold(sock);
+ return psock;
+}
+
+static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_stab *stab;
+ int err = -EINVAL;
+ u64 cost;
+
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 || attr->key_size != 4 ||
+ attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+ return ERR_PTR(-EINVAL);
+
+ if (attr->value_size > KMALLOC_MAX_SIZE)
+ return ERR_PTR(-E2BIG);
+
+ stab = kzalloc(sizeof(*stab), GFP_USER);
+ if (!stab)
+ return ERR_PTR(-ENOMEM);
+
+ /* mandatory map attributes */
+ stab->map.map_type = attr->map_type;
+ stab->map.key_size = attr->key_size;
+ stab->map.value_size = attr->value_size;
+ stab->map.max_entries = attr->max_entries;
+ stab->map.map_flags = attr->map_flags;
+ stab->map.numa_node = bpf_map_attr_numa_node(attr);
+
+ /* make sure page count doesn't overflow */
+ cost = (u64) stab->map.max_entries * sizeof(struct sock *);
+ if (cost >= U32_MAX - PAGE_SIZE)
+ goto free_stab;
+
+ stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+ /* if map size is larger than memlock limit, reject it early */
+ err = bpf_map_precharge_memlock(stab->map.pages);
+ if (err)
+ goto free_stab;
+
+ err = -ENOMEM;
+ stab->sock_map = bpf_map_area_alloc(stab->map.max_entries *
+ sizeof(struct sock *),
+ stab->map.numa_node);
+ if (!stab->sock_map)
+ goto free_stab;
+
+ return &stab->map;
+free_stab:
+ kfree(stab);
+ return ERR_PTR(err);
+}
+
+static void smap_list_remove(struct smap_psock *psock, struct sock **entry)
+{
+ struct smap_psock_map_entry *e, *tmp;
+
+ list_for_each_entry_safe(e, tmp, &psock->maps, list) {
+ if (e->entry == entry) {
+ list_del(&e->list);
+ break;
+ }
+ }
+}
+
+static void sock_map_free(struct bpf_map *map)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+ int i;
+
+ synchronize_rcu();
+
+ /* At this point no update, lookup or delete operations can happen.
+ * However, be aware we can still get a socket state event updates,
+ * and data ready callabacks that reference the psock from sk_user_data
+ * Also psock worker threads are still in-flight. So smap_release_sock
+ * will only free the psock after cancel_sync on the worker threads
+ * and a grace period expire to ensure psock is really safe to remove.
+ */
+ rcu_read_lock();
+ for (i = 0; i < stab->map.max_entries; i++) {
+ struct smap_psock *psock;
+ struct sock *sock;
+
+ sock = xchg(&stab->sock_map[i], NULL);
+ if (!sock)
+ continue;
+
+ write_lock_bh(&sock->sk_callback_lock);
+ psock = smap_psock_sk(sock);
+ smap_list_remove(psock, &stab->sock_map[i]);
+ smap_release_sock(psock, sock);
+ write_unlock_bh(&sock->sk_callback_lock);
+ }
+ rcu_read_unlock();
+
+ if (stab->bpf_verdict)
+ bpf_prog_put(stab->bpf_verdict);
+ if (stab->bpf_parse)
+ bpf_prog_put(stab->bpf_parse);
+
+ sock_map_remove_complete(stab);
+}
+
+static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+ u32 i = key ? *(u32 *)key : U32_MAX;
+ u32 *next = (u32 *)next_key;
+
+ if (i >= stab->map.max_entries) {
+ *next = 0;
+ return 0;
+ }
+
+ if (i == stab->map.max_entries - 1)
+ return -ENOENT;
+
+ *next = i + 1;
+ return 0;
+}
+
+struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+
+ if (key >= map->max_entries)
+ return NULL;
+
+ return READ_ONCE(stab->sock_map[key]);
+}
+
+static int sock_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+ struct smap_psock *psock;
+ int k = *(u32 *)key;
+ struct sock *sock;
+
+ if (k >= map->max_entries)
+ return -EINVAL;
+
+ sock = xchg(&stab->sock_map[k], NULL);
+ if (!sock)
+ return -EINVAL;
+
+ write_lock_bh(&sock->sk_callback_lock);
+ psock = smap_psock_sk(sock);
+ if (!psock)
+ goto out;
+
+ if (psock->bpf_parse)
+ smap_stop_sock(psock, sock);
+ smap_list_remove(psock, &stab->sock_map[k]);
+ smap_release_sock(psock, sock);
+out:
+ write_unlock_bh(&sock->sk_callback_lock);
+ return 0;
+}
+
+/* Locking notes: Concurrent updates, deletes, and lookups are allowed and are
+ * done inside rcu critical sections. This ensures on updates that the psock
+ * will not be released via smap_release_sock() until concurrent updates/deletes
+ * complete. All operations operate on sock_map using cmpxchg and xchg
+ * operations to ensure we do not get stale references. Any reads into the
+ * map must be done with READ_ONCE() because of this.
+ *
+ * A psock is destroyed via call_rcu and after any worker threads are cancelled
+ * and syncd so we are certain all references from the update/lookup/delete
+ * operations as well as references in the data path are no longer in use.
+ *
+ * Psocks may exist in multiple maps, but only a single set of parse/verdict
+ * programs may be inherited from the maps it belongs to. A reference count
+ * is kept with the total number of references to the psock from all maps. The
+ * psock will not be released until this reaches zero. The psock and sock
+ * user data data use the sk_callback_lock to protect critical data structures
+ * from concurrent access. This allows us to avoid two updates from modifying
+ * the user data in sock and the lock is required anyways for modifying
+ * callbacks, we simply increase its scope slightly.
+ *
+ * Rules to follow,
+ * - psock must always be read inside RCU critical section
+ * - sk_user_data must only be modified inside sk_callback_lock and read
+ * inside RCU critical section.
+ * - psock->maps list must only be read & modified inside sk_callback_lock
+ * - sock_map must use READ_ONCE and (cmp)xchg operations
+ * - BPF verdict/parse programs must use READ_ONCE and xchg operations
+ */
+static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
+ struct bpf_map *map,
+ void *key, u64 flags)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+ struct smap_psock_map_entry *e = NULL;
+ struct bpf_prog *verdict, *parse;
+ struct sock *osock, *sock;
+ struct smap_psock *psock;
+ u32 i = *(u32 *)key;
+ int err;
+
+ if (unlikely(flags > BPF_EXIST))
+ return -EINVAL;
+
+ if (unlikely(i >= stab->map.max_entries))
+ return -E2BIG;
+
+ sock = READ_ONCE(stab->sock_map[i]);
+ if (flags == BPF_EXIST && !sock)
+ return -ENOENT;
+ else if (flags == BPF_NOEXIST && sock)
+ return -EEXIST;
+
+ sock = skops->sk;
+
+ /* 1. If sock map has BPF programs those will be inherited by the
+ * sock being added. If the sock is already attached to BPF programs
+ * this results in an error.
+ */
+ verdict = READ_ONCE(stab->bpf_verdict);
+ parse = READ_ONCE(stab->bpf_parse);
+
+ if (parse && verdict) {
+ /* bpf prog refcnt may be zero if a concurrent attach operation
+ * removes the program after the above READ_ONCE() but before
+ * we increment the refcnt. If this is the case abort with an
+ * error.
+ */
+ verdict = bpf_prog_inc_not_zero(stab->bpf_verdict);
+ if (IS_ERR(verdict))
+ return PTR_ERR(verdict);
+
+ parse = bpf_prog_inc_not_zero(stab->bpf_parse);
+ if (IS_ERR(parse)) {
+ bpf_prog_put(verdict);
+ return PTR_ERR(parse);
+ }
+ }
+
+ write_lock_bh(&sock->sk_callback_lock);
+ psock = smap_psock_sk(sock);
+
+ /* 2. Do not allow inheriting programs if psock exists and has
+ * already inherited programs. This would create confusion on
+ * which parser/verdict program is running. If no psock exists
+ * create one. Inside sk_callback_lock to ensure concurrent create
+ * doesn't update user data.
+ */
+ if (psock) {
+ if (READ_ONCE(psock->bpf_parse) && parse) {
+ err = -EBUSY;
+ goto out_progs;
+ }
+ psock->refcnt++;
+ } else {
+ psock = smap_init_psock(sock, stab);
+ if (IS_ERR(psock)) {
+ err = PTR_ERR(psock);
+ goto out_progs;
+ }
+
+ set_bit(SMAP_TX_RUNNING, &psock->state);
+ }
+
+ e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
+ if (!e) {
+ err = -ENOMEM;
+ goto out_progs;
+ }
+ e->entry = &stab->sock_map[i];
+
+ /* 3. At this point we have a reference to a valid psock that is
+ * running. Attach any BPF programs needed.
+ */
+ if (parse && verdict && !psock->strp_enabled) {
+ err = smap_init_sock(psock, sock);
+ if (err)
+ goto out_free;
+ smap_init_progs(psock, stab, verdict, parse);
+ smap_start_sock(psock, sock);
+ }
+
+ /* 4. Place psock in sockmap for use and stop any programs on
+ * the old sock assuming its not the same sock we are replacing
+ * it with. Because we can only have a single set of programs if
+ * old_sock has a strp we can stop it.
+ */
+ list_add_tail(&e->list, &psock->maps);
+ write_unlock_bh(&sock->sk_callback_lock);
+
+ osock = xchg(&stab->sock_map[i], sock);
+ if (osock) {
+ struct smap_psock *opsock = smap_psock_sk(osock);
+
+ write_lock_bh(&osock->sk_callback_lock);
+ if (osock != sock && parse)
+ smap_stop_sock(opsock, osock);
+ smap_list_remove(opsock, &stab->sock_map[i]);
+ smap_release_sock(opsock, osock);
+ write_unlock_bh(&osock->sk_callback_lock);
+ }
+ return 0;
+out_free:
+ smap_release_sock(psock, sock);
+out_progs:
+ if (verdict)
+ bpf_prog_put(verdict);
+ if (parse)
+ bpf_prog_put(parse);
+ write_unlock_bh(&sock->sk_callback_lock);
+ kfree(e);
+ return err;
+}
+
+int sock_map_attach_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+ struct bpf_prog *orig;
+
+ if (unlikely(map->map_type != BPF_MAP_TYPE_SOCKMAP))
+ return -EINVAL;
+
+ switch (type) {
+ case BPF_SK_SKB_STREAM_PARSER:
+ orig = xchg(&stab->bpf_parse, prog);
+ break;
+ case BPF_SK_SKB_STREAM_VERDICT:
+ orig = xchg(&stab->bpf_verdict, prog);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ if (orig)
+ bpf_prog_put(orig);
+
+ return 0;
+}
+
+static void *sock_map_lookup(struct bpf_map *map, void *key)
+{
+ return NULL;
+}
+
+static int sock_map_update_elem(struct bpf_map *map,
+ void *key, void *value, u64 flags)
+{
+ struct bpf_sock_ops_kern skops;
+ u32 fd = *(u32 *)value;
+ struct socket *socket;
+ int err;
+
+ socket = sockfd_lookup(fd, &err);
+ if (!socket)
+ return err;
+
+ skops.sk = socket->sk;
+ if (!skops.sk) {
+ fput(socket->file);
+ return -EINVAL;
+ }
+
+ err = sock_map_ctx_update_elem(&skops, map, key, flags);
+ fput(socket->file);
+ return err;
+}
+
+const struct bpf_map_ops sock_map_ops = {
+ .map_alloc = sock_map_alloc,
+ .map_free = sock_map_free,
+ .map_lookup_elem = sock_map_lookup,
+ .map_get_next_key = sock_map_get_next_key,
+ .map_update_elem = sock_map_update_elem,
+ .map_delete_elem = sock_map_delete_elem,
+};
+
+BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
+ struct bpf_map *, map, void *, key, u64, flags)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ return sock_map_ctx_update_elem(bpf_sock, map, key, flags);
+}
+
+const struct bpf_func_proto bpf_sock_map_update_proto = {
+ .func = bpf_sock_map_update,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_PTR_TO_MAP_KEY,
+ .arg4_type = ARG_ANYTHING,
+};
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 31147d730abf..135be433e9a0 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -31,7 +31,8 @@ static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
int err;
- smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries);
+ smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries,
+ smap->map.numa_node);
if (!smap->elems)
return -ENOMEM;
@@ -59,7 +60,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
if (!capable(CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
- if (attr->map_flags)
+ if (attr->map_flags & ~BPF_F_NUMA_NODE)
return ERR_PTR(-EINVAL);
/* check sanity of attributes */
@@ -75,7 +76,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
if (cost >= U32_MAX - PAGE_SIZE)
return ERR_PTR(-E2BIG);
- smap = bpf_map_area_alloc(cost);
+ smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
if (!smap)
return ERR_PTR(-ENOMEM);
@@ -91,6 +92,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
smap->map.map_flags = attr->map_flags;
smap->n_buckets = n_buckets;
smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+ smap->map.numa_node = bpf_map_attr_numa_node(attr);
err = bpf_map_precharge_memlock(smap->map.pages);
if (err)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 265a0d854e33..70ad8e220343 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -22,8 +22,20 @@
#include <linux/filter.h>
#include <linux/version.h>
#include <linux/kernel.h>
+#include <linux/idr.h>
+
+#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
+ (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
+ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
+ (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
+#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
+#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map))
DEFINE_PER_CPU(int, bpf_prog_active);
+static DEFINE_IDR(prog_idr);
+static DEFINE_SPINLOCK(prog_idr_lock);
+static DEFINE_IDR(map_idr);
+static DEFINE_SPINLOCK(map_idr_lock);
int sysctl_unprivileged_bpf_disabled __read_mostly;
@@ -36,6 +48,47 @@ static const struct bpf_map_ops * const bpf_map_types[] = {
#undef BPF_MAP_TYPE
};
+/*
+ * If we're handed a bigger struct than we know of, ensure all the unknown bits
+ * are 0 - i.e. new user-space does not rely on any kernel feature extensions
+ * we don't know about yet.
+ *
+ * There is a ToCToU between this function call and the following
+ * copy_from_user() call. However, this is not a concern since this function is
+ * meant to be a future-proofing of bits.
+ */
+static int check_uarg_tail_zero(void __user *uaddr,
+ size_t expected_size,
+ size_t actual_size)
+{
+ unsigned char __user *addr;
+ unsigned char __user *end;
+ unsigned char val;
+ int err;
+
+ if (unlikely(actual_size > PAGE_SIZE)) /* silly large */
+ return -E2BIG;
+
+ if (unlikely(!access_ok(VERIFY_READ, uaddr, actual_size)))
+ return -EFAULT;
+
+ if (actual_size <= expected_size)
+ return 0;
+
+ addr = uaddr + expected_size;
+ end = uaddr + actual_size;
+
+ for (; addr < end; addr++) {
+ err = get_user(val, addr);
+ if (err)
+ return err;
+ if (val)
+ return -E2BIG;
+ }
+
+ return 0;
+}
+
static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
{
struct bpf_map *map;
@@ -52,7 +105,7 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
return map;
}
-void *bpf_map_area_alloc(size_t size)
+void *bpf_map_area_alloc(size_t size, int numa_node)
{
/* We definitely need __GFP_NORETRY, so OOM killer doesn't
* trigger under memory pressure as we really just want to
@@ -62,12 +115,13 @@ void *bpf_map_area_alloc(size_t size)
void *area;
if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
- area = kmalloc(size, GFP_USER | flags);
+ area = kmalloc_node(size, GFP_USER | flags, numa_node);
if (area != NULL)
return area;
}
- return __vmalloc(size, GFP_KERNEL | flags, PAGE_KERNEL);
+ return __vmalloc_node_flags_caller(size, numa_node, GFP_KERNEL | flags,
+ __builtin_return_address(0));
}
void bpf_map_area_free(void *area)
@@ -114,6 +168,37 @@ static void bpf_map_uncharge_memlock(struct bpf_map *map)
free_uid(user);
}
+static int bpf_map_alloc_id(struct bpf_map *map)
+{
+ int id;
+
+ spin_lock_bh(&map_idr_lock);
+ id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC);
+ if (id > 0)
+ map->id = id;
+ spin_unlock_bh(&map_idr_lock);
+
+ if (WARN_ON_ONCE(!id))
+ return -ENOSPC;
+
+ return id > 0 ? 0 : id;
+}
+
+static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
+{
+ if (do_idr_lock)
+ spin_lock_bh(&map_idr_lock);
+ else
+ __acquire(&map_idr_lock);
+
+ idr_remove(&map_idr, map->id);
+
+ if (do_idr_lock)
+ spin_unlock_bh(&map_idr_lock);
+ else
+ __release(&map_idr_lock);
+}
+
/* called from workqueue */
static void bpf_map_free_deferred(struct work_struct *work)
{
@@ -135,14 +220,21 @@ static void bpf_map_put_uref(struct bpf_map *map)
/* decrement map refcnt and schedule it for freeing via workqueue
* (unrelying map implementation ops->map_free() might sleep)
*/
-void bpf_map_put(struct bpf_map *map)
+static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)
{
if (atomic_dec_and_test(&map->refcnt)) {
+ /* bpf_map_free_id() must be called first */
+ bpf_map_free_id(map, do_idr_lock);
INIT_WORK(&map->work, bpf_map_free_deferred);
schedule_work(&map->work);
}
}
+void bpf_map_put(struct bpf_map *map)
+{
+ __bpf_map_put(map, true);
+}
+
void bpf_map_put_with_uref(struct bpf_map *map)
{
bpf_map_put_uref(map);
@@ -166,10 +258,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
const struct bpf_map *map = filp->private_data;
const struct bpf_array *array;
u32 owner_prog_type = 0;
+ u32 owner_jited = 0;
if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
array = container_of(map, struct bpf_array, map);
owner_prog_type = array->owner_prog_type;
+ owner_jited = array->owner_jited;
}
seq_printf(m,
@@ -186,9 +280,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
map->map_flags,
map->pages * 1ULL << PAGE_SHIFT);
- if (owner_prog_type)
+ if (owner_prog_type) {
seq_printf(m, "owner_prog_type:\t%u\n",
owner_prog_type);
+ seq_printf(m, "owner_jited:\t%u\n",
+ owner_jited);
+ }
}
#endif
@@ -213,10 +310,11 @@ int bpf_map_new_fd(struct bpf_map *map)
offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
sizeof(attr->CMD##_LAST_FIELD)) != NULL
-#define BPF_MAP_CREATE_LAST_FIELD inner_map_fd
+#define BPF_MAP_CREATE_LAST_FIELD numa_node
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
+ int numa_node = bpf_map_attr_numa_node(attr);
struct bpf_map *map;
int err;
@@ -224,6 +322,11 @@ static int map_create(union bpf_attr *attr)
if (err)
return -EINVAL;
+ if (numa_node != NUMA_NO_NODE &&
+ ((unsigned int)numa_node >= nr_node_ids ||
+ !node_online(numa_node)))
+ return -EINVAL;
+
/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
map = find_and_alloc_map(attr);
if (IS_ERR(map))
@@ -236,11 +339,22 @@ static int map_create(union bpf_attr *attr)
if (err)
goto free_map_nouncharge;
- err = bpf_map_new_fd(map);
- if (err < 0)
- /* failed to allocate fd */
+ err = bpf_map_alloc_id(map);
+ if (err)
goto free_map;
+ err = bpf_map_new_fd(map);
+ if (err < 0) {
+ /* failed to allocate fd.
+ * bpf_map_put() is needed because the above
+ * bpf_map_alloc_id() has published the map
+ * to the userspace and the userspace may
+ * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID.
+ */
+ bpf_map_put(map);
+ return err;
+ }
+
trace_bpf_map_create(map, err);
return err;
@@ -295,6 +409,28 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
return map;
}
+/* map_idr_lock should have been held */
+static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map,
+ bool uref)
+{
+ int refold;
+
+ refold = __atomic_add_unless(&map->refcnt, 1, 0);
+
+ if (refold >= BPF_MAX_REFCNT) {
+ __bpf_map_put(map, false);
+ return ERR_PTR(-EBUSY);
+ }
+
+ if (!refold)
+ return ERR_PTR(-ENOENT);
+
+ if (uref)
+ atomic_inc(&map->usercnt);
+
+ return map;
+}
+
int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
{
return -ENOTSUPP;
@@ -322,19 +458,18 @@ static int map_lookup_elem(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
- err = -ENOMEM;
- key = kmalloc(map->key_size, GFP_USER);
- if (!key)
+ key = memdup_user(ukey, map->key_size);
+ if (IS_ERR(key)) {
+ err = PTR_ERR(key);
goto err_put;
-
- err = -EFAULT;
- if (copy_from_user(key, ukey, map->key_size) != 0)
- goto free_key;
+ }
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
value_size = round_up(map->value_size, 8) * num_possible_cpus();
+ else if (IS_FD_MAP(map))
+ value_size = sizeof(u32);
else
value_size = map->value_size;
@@ -350,9 +485,10 @@ static int map_lookup_elem(union bpf_attr *attr)
err = bpf_percpu_array_copy(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
err = bpf_stackmap_copy(map, key, value);
- } else if (map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS ||
- map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
- err = -ENOTSUPP;
+ } else if (IS_FD_ARRAY(map)) {
+ err = bpf_fd_array_map_lookup_elem(map, key, value);
+ } else if (IS_FD_HASH(map)) {
+ err = bpf_fd_htab_map_lookup_elem(map, key, value);
} else {
rcu_read_lock();
ptr = map->ops->map_lookup_elem(map, key);
@@ -402,14 +538,11 @@ static int map_update_elem(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
- err = -ENOMEM;
- key = kmalloc(map->key_size, GFP_USER);
- if (!key)
+ key = memdup_user(ukey, map->key_size);
+ if (IS_ERR(key)) {
+ err = PTR_ERR(key);
goto err_put;
-
- err = -EFAULT;
- if (copy_from_user(key, ukey, map->key_size) != 0)
- goto free_key;
+ }
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
@@ -488,14 +621,11 @@ static int map_delete_elem(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
- err = -ENOMEM;
- key = kmalloc(map->key_size, GFP_USER);
- if (!key)
+ key = memdup_user(ukey, map->key_size);
+ if (IS_ERR(key)) {
+ err = PTR_ERR(key);
goto err_put;
-
- err = -EFAULT;
- if (copy_from_user(key, ukey, map->key_size) != 0)
- goto free_key;
+ }
preempt_disable();
__this_cpu_inc(bpf_prog_active);
@@ -507,7 +637,6 @@ static int map_delete_elem(union bpf_attr *attr)
if (!err)
trace_bpf_map_delete_elem(map, ufd, key);
-free_key:
kfree(key);
err_put:
fdput(f);
@@ -536,14 +665,11 @@ static int map_get_next_key(union bpf_attr *attr)
return PTR_ERR(map);
if (ukey) {
- err = -ENOMEM;
- key = kmalloc(map->key_size, GFP_USER);
- if (!key)
+ key = memdup_user(ukey, map->key_size);
+ if (IS_ERR(key)) {
+ err = PTR_ERR(key);
goto err_put;
-
- err = -EFAULT;
- if (copy_from_user(key, ukey, map->key_size) != 0)
- goto free_key;
+ }
} else {
key = NULL;
}
@@ -650,6 +776,42 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
free_uid(user);
}
+static int bpf_prog_alloc_id(struct bpf_prog *prog)
+{
+ int id;
+
+ spin_lock_bh(&prog_idr_lock);
+ id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC);
+ if (id > 0)
+ prog->aux->id = id;
+ spin_unlock_bh(&prog_idr_lock);
+
+ /* id is in [1, INT_MAX) */
+ if (WARN_ON_ONCE(!id))
+ return -ENOSPC;
+
+ return id > 0 ? 0 : id;
+}
+
+static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
+{
+ /* cBPF to eBPF migrations are currently not in the idr store. */
+ if (!prog->aux->id)
+ return;
+
+ if (do_idr_lock)
+ spin_lock_bh(&prog_idr_lock);
+ else
+ __acquire(&prog_idr_lock);
+
+ idr_remove(&prog_idr, prog->aux->id);
+
+ if (do_idr_lock)
+ spin_unlock_bh(&prog_idr_lock);
+ else
+ __release(&prog_idr_lock);
+}
+
static void __bpf_prog_put_rcu(struct rcu_head *rcu)
{
struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
@@ -659,14 +821,21 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
bpf_prog_free(aux->prog);
}
-void bpf_prog_put(struct bpf_prog *prog)
+static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
{
if (atomic_dec_and_test(&prog->aux->refcnt)) {
trace_bpf_prog_put_rcu(prog);
+ /* bpf_prog_free_id() must be called first */
+ bpf_prog_free_id(prog, do_idr_lock);
bpf_prog_kallsyms_del(prog);
call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
}
}
+
+void bpf_prog_put(struct bpf_prog *prog)
+{
+ __bpf_prog_put(prog, true);
+}
EXPORT_SYMBOL_GPL(bpf_prog_put);
static int bpf_prog_release(struct inode *inode, struct file *filp)
@@ -748,6 +917,25 @@ struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
}
EXPORT_SYMBOL_GPL(bpf_prog_inc);
+/* prog_idr_lock should have been held */
+struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
+{
+ int refold;
+
+ refold = __atomic_add_unless(&prog->aux->refcnt, 1, 0);
+
+ if (refold >= BPF_MAX_REFCNT) {
+ __bpf_prog_put(prog, false);
+ return ERR_PTR(-EBUSY);
+ }
+
+ if (!refold)
+ return ERR_PTR(-ENOENT);
+
+ return prog;
+}
+EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
+
static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
{
struct fd f = fdget(ufd);
@@ -815,7 +1003,9 @@ static int bpf_prog_load(union bpf_attr *attr)
attr->kern_version != LINUX_VERSION_CODE)
return -EINVAL;
- if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN))
+ if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
+ type != BPF_PROG_TYPE_CGROUP_SKB &&
+ !capable(CAP_SYS_ADMIN))
return -EPERM;
/* plain bpf_prog allocation */
@@ -855,11 +1045,22 @@ static int bpf_prog_load(union bpf_attr *attr)
if (err < 0)
goto free_used_maps;
- err = bpf_prog_new_fd(prog);
- if (err < 0)
- /* failed to allocate fd */
+ err = bpf_prog_alloc_id(prog);
+ if (err)
goto free_used_maps;
+ err = bpf_prog_new_fd(prog);
+ if (err < 0) {
+ /* failed to allocate fd.
+ * bpf_prog_put() is needed because the above
+ * bpf_prog_alloc_id() has published the prog
+ * to the userspace and the userspace may
+ * have refcnt-ed it through BPF_PROG_GET_FD_BY_ID.
+ */
+ bpf_prog_put(prog);
+ return err;
+ }
+
bpf_prog_kallsyms_add(prog);
trace_bpf_prog_load(prog, err);
return err;
@@ -895,6 +1096,36 @@ static int bpf_obj_get(const union bpf_attr *attr)
#define BPF_PROG_ATTACH_LAST_FIELD attach_flags
+static int sockmap_get_from_fd(const union bpf_attr *attr)
+{
+ int ufd = attr->target_fd;
+ struct bpf_prog *prog;
+ struct bpf_map *map;
+ struct fd f;
+ int err;
+
+ f = fdget(ufd);
+ map = __bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ prog = bpf_prog_get_type(attr->attach_bpf_fd, BPF_PROG_TYPE_SK_SKB);
+ if (IS_ERR(prog)) {
+ fdput(f);
+ return PTR_ERR(prog);
+ }
+
+ err = sock_map_attach_prog(map, prog, attr->attach_type);
+ if (err) {
+ fdput(f);
+ bpf_prog_put(prog);
+ return err;
+ }
+
+ fdput(f);
+ return 0;
+}
+
static int bpf_prog_attach(const union bpf_attr *attr)
{
enum bpf_prog_type ptype;
@@ -919,6 +1150,12 @@ static int bpf_prog_attach(const union bpf_attr *attr)
case BPF_CGROUP_INET_SOCK_CREATE:
ptype = BPF_PROG_TYPE_CGROUP_SOCK;
break;
+ case BPF_CGROUP_SOCK_OPS:
+ ptype = BPF_PROG_TYPE_SOCK_OPS;
+ break;
+ case BPF_SK_SKB_STREAM_PARSER:
+ case BPF_SK_SKB_STREAM_VERDICT:
+ return sockmap_get_from_fd(attr);
default:
return -EINVAL;
}
@@ -959,6 +1196,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
case BPF_CGROUP_INET_INGRESS:
case BPF_CGROUP_INET_EGRESS:
case BPF_CGROUP_INET_SOCK_CREATE:
+ case BPF_CGROUP_SOCK_OPS:
cgrp = cgroup_get_from_fd(attr->target_fd);
if (IS_ERR(cgrp))
return PTR_ERR(cgrp);
@@ -973,6 +1211,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
return ret;
}
+
#endif /* CONFIG_CGROUP_BPF */
#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration
@@ -997,43 +1236,224 @@ static int bpf_prog_test_run(const union bpf_attr *attr,
return ret;
}
-SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
+#define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id
+
+static int bpf_obj_get_next_id(const union bpf_attr *attr,
+ union bpf_attr __user *uattr,
+ struct idr *idr,
+ spinlock_t *lock)
{
- union bpf_attr attr = {};
- int err;
+ u32 next_id = attr->start_id;
+ int err = 0;
- if (!capable(CAP_SYS_ADMIN) && sysctl_unprivileged_bpf_disabled)
+ if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX)
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ next_id++;
+ spin_lock_bh(lock);
+ if (!idr_get_next(idr, &next_id))
+ err = -ENOENT;
+ spin_unlock_bh(lock);
+
+ if (!err)
+ err = put_user(next_id, &uattr->next_id);
+
+ return err;
+}
+
+#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
+
+static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
+{
+ struct bpf_prog *prog;
+ u32 id = attr->prog_id;
+ int fd;
+
+ if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ spin_lock_bh(&prog_idr_lock);
+ prog = idr_find(&prog_idr, id);
+ if (prog)
+ prog = bpf_prog_inc_not_zero(prog);
+ else
+ prog = ERR_PTR(-ENOENT);
+ spin_unlock_bh(&prog_idr_lock);
+
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ fd = bpf_prog_new_fd(prog);
+ if (fd < 0)
+ bpf_prog_put(prog);
+
+ return fd;
+}
+
+#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD map_id
+
+static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
+{
+ struct bpf_map *map;
+ u32 id = attr->map_id;
+ int fd;
+
+ if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!access_ok(VERIFY_READ, uattr, 1))
+ spin_lock_bh(&map_idr_lock);
+ map = idr_find(&map_idr, id);
+ if (map)
+ map = bpf_map_inc_not_zero(map, true);
+ else
+ map = ERR_PTR(-ENOENT);
+ spin_unlock_bh(&map_idr_lock);
+
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ fd = bpf_map_new_fd(map);
+ if (fd < 0)
+ bpf_map_put(map);
+
+ return fd;
+}
+
+static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+ struct bpf_prog_info info = {};
+ u32 info_len = attr->info.info_len;
+ char __user *uinsns;
+ u32 ulen;
+ int err;
+
+ err = check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+ if (err)
+ return err;
+ info_len = min_t(u32, sizeof(info), info_len);
+
+ if (copy_from_user(&info, uinfo, info_len))
return -EFAULT;
- if (size > PAGE_SIZE) /* silly large */
- return -E2BIG;
+ info.type = prog->type;
+ info.id = prog->aux->id;
- /* If we're handed a bigger struct than we know of,
- * ensure all the unknown bits are 0 - i.e. new
- * user-space does not rely on any kernel feature
- * extensions we dont know about yet.
- */
- if (size > sizeof(attr)) {
- unsigned char __user *addr;
- unsigned char __user *end;
- unsigned char val;
-
- addr = (void __user *)uattr + sizeof(attr);
- end = (void __user *)uattr + size;
-
- for (; addr < end; addr++) {
- err = get_user(val, addr);
- if (err)
- return err;
- if (val)
- return -E2BIG;
- }
- size = sizeof(attr);
+ memcpy(info.tag, prog->tag, sizeof(prog->tag));
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ info.jited_prog_len = 0;
+ info.xlated_prog_len = 0;
+ goto done;
+ }
+
+ ulen = info.jited_prog_len;
+ info.jited_prog_len = prog->jited_len;
+ if (info.jited_prog_len && ulen) {
+ uinsns = u64_to_user_ptr(info.jited_prog_insns);
+ ulen = min_t(u32, info.jited_prog_len, ulen);
+ if (copy_to_user(uinsns, prog->bpf_func, ulen))
+ return -EFAULT;
+ }
+
+ ulen = info.xlated_prog_len;
+ info.xlated_prog_len = bpf_prog_insn_size(prog);
+ if (info.xlated_prog_len && ulen) {
+ uinsns = u64_to_user_ptr(info.xlated_prog_insns);
+ ulen = min_t(u32, info.xlated_prog_len, ulen);
+ if (copy_to_user(uinsns, prog->insnsi, ulen))
+ return -EFAULT;
}
+done:
+ if (copy_to_user(uinfo, &info, info_len) ||
+ put_user(info_len, &uattr->info.info_len))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int bpf_map_get_info_by_fd(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+ struct bpf_map_info info = {};
+ u32 info_len = attr->info.info_len;
+ int err;
+
+ err = check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+ if (err)
+ return err;
+ info_len = min_t(u32, sizeof(info), info_len);
+
+ info.type = map->map_type;
+ info.id = map->id;
+ info.key_size = map->key_size;
+ info.value_size = map->value_size;
+ info.max_entries = map->max_entries;
+ info.map_flags = map->map_flags;
+
+ if (copy_to_user(uinfo, &info, info_len) ||
+ put_user(info_len, &uattr->info.info_len))
+ return -EFAULT;
+
+ return 0;
+}
+
+#define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info
+
+static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ int ufd = attr->info.bpf_fd;
+ struct fd f;
+ int err;
+
+ if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD))
+ return -EINVAL;
+
+ f = fdget(ufd);
+ if (!f.file)
+ return -EBADFD;
+
+ if (f.file->f_op == &bpf_prog_fops)
+ err = bpf_prog_get_info_by_fd(f.file->private_data, attr,
+ uattr);
+ else if (f.file->f_op == &bpf_map_fops)
+ err = bpf_map_get_info_by_fd(f.file->private_data, attr,
+ uattr);
+ else
+ err = -EINVAL;
+
+ fdput(f);
+ return err;
+}
+
+SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
+{
+ union bpf_attr attr = {};
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN) && sysctl_unprivileged_bpf_disabled)
+ return -EPERM;
+
+ err = check_uarg_tail_zero(uattr, sizeof(attr), size);
+ if (err)
+ return err;
+ size = min_t(u32, size, sizeof(attr));
+
/* copy attributes from user space, may be less than sizeof(bpf_attr) */
if (copy_from_user(&attr, uattr, size) != 0)
return -EFAULT;
@@ -1074,6 +1494,23 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_PROG_TEST_RUN:
err = bpf_prog_test_run(&attr, uattr);
break;
+ case BPF_PROG_GET_NEXT_ID:
+ err = bpf_obj_get_next_id(&attr, uattr,
+ &prog_idr, &prog_idr_lock);
+ break;
+ case BPF_MAP_GET_NEXT_ID:
+ err = bpf_obj_get_next_id(&attr, uattr,
+ &map_idr, &map_idr_lock);
+ break;
+ case BPF_PROG_GET_FD_BY_ID:
+ err = bpf_prog_get_fd_by_id(&attr);
+ break;
+ case BPF_MAP_GET_FD_BY_ID:
+ err = bpf_map_get_fd_by_id(&attr);
+ break;
+ case BPF_OBJ_GET_INFO_BY_FD:
+ err = bpf_obj_get_info_by_fd(&attr, uattr);
+ break;
default:
err = -EINVAL;
break;
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
new file mode 100644
index 000000000000..1f4bf68c12db
--- /dev/null
+++ b/kernel/bpf/tnum.c
@@ -0,0 +1,180 @@
+/* tnum: tracked (or tristate) numbers
+ *
+ * A tnum tracks knowledge about the bits of a value. Each bit can be either
+ * known (0 or 1), or unknown (x). Arithmetic operations on tnums will
+ * propagate the unknown bits such that the tnum result represents all the
+ * possible results for possible values of the operands.
+ */
+#include <linux/kernel.h>
+#include <linux/tnum.h>
+
+#define TNUM(_v, _m) (struct tnum){.value = _v, .mask = _m}
+/* A completely unknown value */
+const struct tnum tnum_unknown = { .value = 0, .mask = -1 };
+
+struct tnum tnum_const(u64 value)
+{
+ return TNUM(value, 0);
+}
+
+struct tnum tnum_range(u64 min, u64 max)
+{
+ u64 chi = min ^ max, delta;
+ u8 bits = fls64(chi);
+
+ /* special case, needed because 1ULL << 64 is undefined */
+ if (bits > 63)
+ return tnum_unknown;
+ /* e.g. if chi = 4, bits = 3, delta = (1<<3) - 1 = 7.
+ * if chi = 0, bits = 0, delta = (1<<0) - 1 = 0, so we return
+ * constant min (since min == max).
+ */
+ delta = (1ULL << bits) - 1;
+ return TNUM(min & ~delta, delta);
+}
+
+struct tnum tnum_lshift(struct tnum a, u8 shift)
+{
+ return TNUM(a.value << shift, a.mask << shift);
+}
+
+struct tnum tnum_rshift(struct tnum a, u8 shift)
+{
+ return TNUM(a.value >> shift, a.mask >> shift);
+}
+
+struct tnum tnum_add(struct tnum a, struct tnum b)
+{
+ u64 sm, sv, sigma, chi, mu;
+
+ sm = a.mask + b.mask;
+ sv = a.value + b.value;
+ sigma = sm + sv;
+ chi = sigma ^ sv;
+ mu = chi | a.mask | b.mask;
+ return TNUM(sv & ~mu, mu);
+}
+
+struct tnum tnum_sub(struct tnum a, struct tnum b)
+{
+ u64 dv, alpha, beta, chi, mu;
+
+ dv = a.value - b.value;
+ alpha = dv + a.mask;
+ beta = dv - b.mask;
+ chi = alpha ^ beta;
+ mu = chi | a.mask | b.mask;
+ return TNUM(dv & ~mu, mu);
+}
+
+struct tnum tnum_and(struct tnum a, struct tnum b)
+{
+ u64 alpha, beta, v;
+
+ alpha = a.value | a.mask;
+ beta = b.value | b.mask;
+ v = a.value & b.value;
+ return TNUM(v, alpha & beta & ~v);
+}
+
+struct tnum tnum_or(struct tnum a, struct tnum b)
+{
+ u64 v, mu;
+
+ v = a.value | b.value;
+ mu = a.mask | b.mask;
+ return TNUM(v, mu & ~v);
+}
+
+struct tnum tnum_xor(struct tnum a, struct tnum b)
+{
+ u64 v, mu;
+
+ v = a.value ^ b.value;
+ mu = a.mask | b.mask;
+ return TNUM(v & ~mu, mu);
+}
+
+/* half-multiply add: acc += (unknown * mask * value).
+ * An intermediate step in the multiply algorithm.
+ */
+static struct tnum hma(struct tnum acc, u64 value, u64 mask)
+{
+ while (mask) {
+ if (mask & 1)
+ acc = tnum_add(acc, TNUM(0, value));
+ mask >>= 1;
+ value <<= 1;
+ }
+ return acc;
+}
+
+struct tnum tnum_mul(struct tnum a, struct tnum b)
+{
+ struct tnum acc;
+ u64 pi;
+
+ pi = a.value * b.value;
+ acc = hma(TNUM(pi, 0), a.mask, b.mask | b.value);
+ return hma(acc, b.mask, a.value);
+}
+
+/* Note that if a and b disagree - i.e. one has a 'known 1' where the other has
+ * a 'known 0' - this will return a 'known 1' for that bit.
+ */
+struct tnum tnum_intersect(struct tnum a, struct tnum b)
+{
+ u64 v, mu;
+
+ v = a.value | b.value;
+ mu = a.mask & b.mask;
+ return TNUM(v & ~mu, mu);
+}
+
+struct tnum tnum_cast(struct tnum a, u8 size)
+{
+ a.value &= (1ULL << (size * 8)) - 1;
+ a.mask &= (1ULL << (size * 8)) - 1;
+ return a;
+}
+
+bool tnum_is_aligned(struct tnum a, u64 size)
+{
+ if (!size)
+ return true;
+ return !((a.value | a.mask) & (size - 1));
+}
+
+bool tnum_in(struct tnum a, struct tnum b)
+{
+ if (b.mask & ~a.mask)
+ return false;
+ b.value &= ~a.mask;
+ return a.value == b.value;
+}
+
+int tnum_strn(char *str, size_t size, struct tnum a)
+{
+ return snprintf(str, size, "(%#llx; %#llx)", a.value, a.mask);
+}
+EXPORT_SYMBOL_GPL(tnum_strn);
+
+int tnum_sbin(char *str, size_t size, struct tnum a)
+{
+ size_t n;
+
+ for (n = 64; n; n--) {
+ if (n < size) {
+ if (a.mask & 1)
+ str[n - 1] = 'x';
+ else if (a.value & 1)
+ str[n - 1] = '1';
+ else
+ str[n - 1] = '0';
+ }
+ a.mask >>= 1;
+ a.value >>= 1;
+ }
+ str[min(size - 1, (size_t)64)] = 0;
+ return 64;
+}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a8a725697bed..d690c7dd1f1a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -61,12 +61,12 @@
* (and -20 constant is saved for further stack bounds checking).
* Meaning that this reg is a pointer to stack plus known immediate constant.
*
- * Most of the time the registers have UNKNOWN_VALUE type, which
+ * Most of the time the registers have SCALAR_VALUE type, which
* means the register has some value, but it's not a valid pointer.
- * (like pointer plus pointer becomes UNKNOWN_VALUE type)
+ * (like pointer plus pointer becomes SCALAR_VALUE type)
*
* When verifier sees load or store instructions the type of base register
- * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, FRAME_PTR. These are three pointer
+ * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, PTR_TO_STACK. These are three pointer
* types recognized by check_mem_access() function.
*
* PTR_TO_MAP_VALUE means that this register is pointing to 'map element value'
@@ -140,7 +140,7 @@ struct bpf_verifier_stack_elem {
struct bpf_verifier_stack_elem *next;
};
-#define BPF_COMPLEXITY_LIMIT_INSNS 98304
+#define BPF_COMPLEXITY_LIMIT_INSNS 131072
#define BPF_COMPLEXITY_LIMIT_STACK 1024
#define BPF_MAP_PTR_POISON ((void *)0xeB9F + POISON_POINTER_DELTA)
@@ -180,15 +180,12 @@ static __printf(1, 2) void verbose(const char *fmt, ...)
/* string representation of 'enum bpf_reg_type' */
static const char * const reg_type_str[] = {
[NOT_INIT] = "?",
- [UNKNOWN_VALUE] = "inv",
+ [SCALAR_VALUE] = "inv",
[PTR_TO_CTX] = "ctx",
[CONST_PTR_TO_MAP] = "map_ptr",
[PTR_TO_MAP_VALUE] = "map_value",
[PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
- [PTR_TO_MAP_VALUE_ADJ] = "map_value_adj",
- [FRAME_PTR] = "fp",
[PTR_TO_STACK] = "fp",
- [CONST_IMM] = "imm",
[PTR_TO_PACKET] = "pkt",
[PTR_TO_PACKET_END] = "pkt_end",
};
@@ -221,32 +218,52 @@ static void print_verifier_state(struct bpf_verifier_state *state)
if (t == NOT_INIT)
continue;
verbose(" R%d=%s", i, reg_type_str[t]);
- if (t == CONST_IMM || t == PTR_TO_STACK)
- verbose("%lld", reg->imm);
- else if (t == PTR_TO_PACKET)
- verbose("(id=%d,off=%d,r=%d)",
- reg->id, reg->off, reg->range);
- else if (t == UNKNOWN_VALUE && reg->imm)
- verbose("%lld", reg->imm);
- else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE ||
- t == PTR_TO_MAP_VALUE_OR_NULL ||
- t == PTR_TO_MAP_VALUE_ADJ)
- verbose("(ks=%d,vs=%d,id=%u)",
- reg->map_ptr->key_size,
- reg->map_ptr->value_size,
- reg->id);
- if (reg->min_value != BPF_REGISTER_MIN_RANGE)
- verbose(",min_value=%lld",
- (long long)reg->min_value);
- if (reg->max_value != BPF_REGISTER_MAX_RANGE)
- verbose(",max_value=%llu",
- (unsigned long long)reg->max_value);
- if (reg->min_align)
- verbose(",min_align=%u", reg->min_align);
- if (reg->aux_off)
- verbose(",aux_off=%u", reg->aux_off);
- if (reg->aux_off_align)
- verbose(",aux_off_align=%u", reg->aux_off_align);
+ if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
+ tnum_is_const(reg->var_off)) {
+ /* reg->off should be 0 for SCALAR_VALUE */
+ verbose("%lld", reg->var_off.value + reg->off);
+ } else {
+ verbose("(id=%d", reg->id);
+ if (t != SCALAR_VALUE)
+ verbose(",off=%d", reg->off);
+ if (t == PTR_TO_PACKET)
+ verbose(",r=%d", reg->range);
+ else if (t == CONST_PTR_TO_MAP ||
+ t == PTR_TO_MAP_VALUE ||
+ t == PTR_TO_MAP_VALUE_OR_NULL)
+ verbose(",ks=%d,vs=%d",
+ reg->map_ptr->key_size,
+ reg->map_ptr->value_size);
+ if (tnum_is_const(reg->var_off)) {
+ /* Typically an immediate SCALAR_VALUE, but
+ * could be a pointer whose offset is too big
+ * for reg->off
+ */
+ verbose(",imm=%llx", reg->var_off.value);
+ } else {
+ if (reg->smin_value != reg->umin_value &&
+ reg->smin_value != S64_MIN)
+ verbose(",smin_value=%lld",
+ (long long)reg->smin_value);
+ if (reg->smax_value != reg->umax_value &&
+ reg->smax_value != S64_MAX)
+ verbose(",smax_value=%lld",
+ (long long)reg->smax_value);
+ if (reg->umin_value != 0)
+ verbose(",umin_value=%llu",
+ (unsigned long long)reg->umin_value);
+ if (reg->umax_value != U64_MAX)
+ verbose(",umax_value=%llu",
+ (unsigned long long)reg->umax_value);
+ if (!tnum_is_unknown(reg->var_off)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(",var_off=%s", tn_buf);
+ }
+ }
+ verbose(")");
+ }
}
for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
if (state->stack_slot_type[i] == STACK_SPILL)
@@ -295,11 +312,15 @@ static const char *const bpf_jmp_string[16] = {
[BPF_JA >> 4] = "jmp",
[BPF_JEQ >> 4] = "==",
[BPF_JGT >> 4] = ">",
+ [BPF_JLT >> 4] = "<",
[BPF_JGE >> 4] = ">=",
+ [BPF_JLE >> 4] = "<=",
[BPF_JSET >> 4] = "&",
[BPF_JNE >> 4] = "!=",
[BPF_JSGT >> 4] = "s>",
+ [BPF_JSLT >> 4] = "s<",
[BPF_JSGE >> 4] = "s>=",
+ [BPF_JSLE >> 4] = "s<=",
[BPF_CALL >> 4] = "call",
[BPF_EXIT >> 4] = "exit",
};
@@ -463,55 +484,163 @@ static const int caller_saved[CALLER_SAVED_REGS] = {
BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
};
-static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno)
+static void __mark_reg_not_init(struct bpf_reg_state *reg);
+
+/* Mark the unknown part of a register (variable offset or scalar value) as
+ * known to have the value @imm.
+ */
+static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm)
{
- BUG_ON(regno >= MAX_BPF_REG);
+ reg->id = 0;
+ reg->var_off = tnum_const(imm);
+ reg->smin_value = (s64)imm;
+ reg->smax_value = (s64)imm;
+ reg->umin_value = imm;
+ reg->umax_value = imm;
+}
- memset(&regs[regno], 0, sizeof(regs[regno]));
- regs[regno].type = NOT_INIT;
- regs[regno].min_value = BPF_REGISTER_MIN_RANGE;
- regs[regno].max_value = BPF_REGISTER_MAX_RANGE;
+/* Mark the 'variable offset' part of a register as zero. This should be
+ * used only on registers holding a pointer type.
+ */
+static void __mark_reg_known_zero(struct bpf_reg_state *reg)
+{
+ __mark_reg_known(reg, 0);
}
-static void init_reg_state(struct bpf_reg_state *regs)
+static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno)
{
- int i;
+ if (WARN_ON(regno >= MAX_BPF_REG)) {
+ verbose("mark_reg_known_zero(regs, %u)\n", regno);
+ /* Something bad happened, let's kill all regs */
+ for (regno = 0; regno < MAX_BPF_REG; regno++)
+ __mark_reg_not_init(regs + regno);
+ return;
+ }
+ __mark_reg_known_zero(regs + regno);
+}
- for (i = 0; i < MAX_BPF_REG; i++)
- mark_reg_not_init(regs, i);
+/* Attempts to improve min/max values based on var_off information */
+static void __update_reg_bounds(struct bpf_reg_state *reg)
+{
+ /* min signed is max(sign bit) | min(other bits) */
+ reg->smin_value = max_t(s64, reg->smin_value,
+ reg->var_off.value | (reg->var_off.mask & S64_MIN));
+ /* max signed is min(sign bit) | max(other bits) */
+ reg->smax_value = min_t(s64, reg->smax_value,
+ reg->var_off.value | (reg->var_off.mask & S64_MAX));
+ reg->umin_value = max(reg->umin_value, reg->var_off.value);
+ reg->umax_value = min(reg->umax_value,
+ reg->var_off.value | reg->var_off.mask);
+}
- /* frame pointer */
- regs[BPF_REG_FP].type = FRAME_PTR;
+/* Uses signed min/max values to inform unsigned, and vice-versa */
+static void __reg_deduce_bounds(struct bpf_reg_state *reg)
+{
+ /* Learn sign from signed bounds.
+ * If we cannot cross the sign boundary, then signed and unsigned bounds
+ * are the same, so combine. This works even in the negative case, e.g.
+ * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff.
+ */
+ if (reg->smin_value >= 0 || reg->smax_value < 0) {
+ reg->smin_value = reg->umin_value = max_t(u64, reg->smin_value,
+ reg->umin_value);
+ reg->smax_value = reg->umax_value = min_t(u64, reg->smax_value,
+ reg->umax_value);
+ return;
+ }
+ /* Learn sign from unsigned bounds. Signed bounds cross the sign
+ * boundary, so we must be careful.
+ */
+ if ((s64)reg->umax_value >= 0) {
+ /* Positive. We can't learn anything from the smin, but smax
+ * is positive, hence safe.
+ */
+ reg->smin_value = reg->umin_value;
+ reg->smax_value = reg->umax_value = min_t(u64, reg->smax_value,
+ reg->umax_value);
+ } else if ((s64)reg->umin_value < 0) {
+ /* Negative. We can't learn anything from the smax, but smin
+ * is negative, hence safe.
+ */
+ reg->smin_value = reg->umin_value = max_t(u64, reg->smin_value,
+ reg->umin_value);
+ reg->smax_value = reg->umax_value;
+ }
+}
- /* 1st arg to a function */
- regs[BPF_REG_1].type = PTR_TO_CTX;
+/* Attempts to improve var_off based on unsigned min/max information */
+static void __reg_bound_offset(struct bpf_reg_state *reg)
+{
+ reg->var_off = tnum_intersect(reg->var_off,
+ tnum_range(reg->umin_value,
+ reg->umax_value));
+}
+
+/* Reset the min/max bounds of a register */
+static void __mark_reg_unbounded(struct bpf_reg_state *reg)
+{
+ reg->smin_value = S64_MIN;
+ reg->smax_value = S64_MAX;
+ reg->umin_value = 0;
+ reg->umax_value = U64_MAX;
}
-static void __mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno)
+/* Mark a register as having a completely unknown (scalar) value. */
+static void __mark_reg_unknown(struct bpf_reg_state *reg)
{
- regs[regno].type = UNKNOWN_VALUE;
- regs[regno].id = 0;
- regs[regno].imm = 0;
+ reg->type = SCALAR_VALUE;
+ reg->id = 0;
+ reg->off = 0;
+ reg->var_off = tnum_unknown;
+ __mark_reg_unbounded(reg);
}
-static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno)
+static void mark_reg_unknown(struct bpf_reg_state *regs, u32 regno)
{
- BUG_ON(regno >= MAX_BPF_REG);
- __mark_reg_unknown_value(regs, regno);
+ if (WARN_ON(regno >= MAX_BPF_REG)) {
+ verbose("mark_reg_unknown(regs, %u)\n", regno);
+ /* Something bad happened, let's kill all regs */
+ for (regno = 0; regno < MAX_BPF_REG; regno++)
+ __mark_reg_not_init(regs + regno);
+ return;
+ }
+ __mark_reg_unknown(regs + regno);
}
-static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno)
+static void __mark_reg_not_init(struct bpf_reg_state *reg)
{
- regs[regno].min_value = BPF_REGISTER_MIN_RANGE;
- regs[regno].max_value = BPF_REGISTER_MAX_RANGE;
- regs[regno].min_align = 0;
+ __mark_reg_unknown(reg);
+ reg->type = NOT_INIT;
}
-static void mark_reg_unknown_value_and_range(struct bpf_reg_state *regs,
- u32 regno)
+static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno)
{
- mark_reg_unknown_value(regs, regno);
- reset_reg_range_values(regs, regno);
+ if (WARN_ON(regno >= MAX_BPF_REG)) {
+ verbose("mark_reg_not_init(regs, %u)\n", regno);
+ /* Something bad happened, let's kill all regs */
+ for (regno = 0; regno < MAX_BPF_REG; regno++)
+ __mark_reg_not_init(regs + regno);
+ return;
+ }
+ __mark_reg_not_init(regs + regno);
+}
+
+static void init_reg_state(struct bpf_reg_state *regs)
+{
+ int i;
+
+ for (i = 0; i < MAX_BPF_REG; i++) {
+ mark_reg_not_init(regs, i);
+ regs[i].live = REG_LIVE_NONE;
+ }
+
+ /* frame pointer */
+ regs[BPF_REG_FP].type = PTR_TO_STACK;
+ mark_reg_known_zero(regs, BPF_REG_FP);
+
+ /* 1st arg to a function */
+ regs[BPF_REG_1].type = PTR_TO_CTX;
+ mark_reg_known_zero(regs, BPF_REG_1);
}
enum reg_arg_type {
@@ -520,9 +649,26 @@ enum reg_arg_type {
DST_OP_NO_MARK /* same as above, check only, don't mark */
};
-static int check_reg_arg(struct bpf_reg_state *regs, u32 regno,
+static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno)
+{
+ struct bpf_verifier_state *parent = state->parent;
+
+ while (parent) {
+ /* if read wasn't screened by an earlier write ... */
+ if (state->regs[regno].live & REG_LIVE_WRITTEN)
+ break;
+ /* ... then we depend on parent's value */
+ parent->regs[regno].live |= REG_LIVE_READ;
+ state = parent;
+ parent = state->parent;
+ }
+}
+
+static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
enum reg_arg_type t)
{
+ struct bpf_reg_state *regs = env->cur_state.regs;
+
if (regno >= MAX_BPF_REG) {
verbose("R%d is invalid\n", regno);
return -EINVAL;
@@ -534,43 +680,29 @@ static int check_reg_arg(struct bpf_reg_state *regs, u32 regno,
verbose("R%d !read_ok\n", regno);
return -EACCES;
}
+ mark_reg_read(&env->cur_state, regno);
} else {
/* check whether register used as dest operand can be written to */
if (regno == BPF_REG_FP) {
verbose("frame pointer is read only\n");
return -EACCES;
}
+ regs[regno].live |= REG_LIVE_WRITTEN;
if (t == DST_OP)
- mark_reg_unknown_value(regs, regno);
+ mark_reg_unknown(regs, regno);
}
return 0;
}
-static int bpf_size_to_bytes(int bpf_size)
-{
- if (bpf_size == BPF_W)
- return 4;
- else if (bpf_size == BPF_H)
- return 2;
- else if (bpf_size == BPF_B)
- return 1;
- else if (bpf_size == BPF_DW)
- return 8;
- else
- return -EINVAL;
-}
-
static bool is_spillable_regtype(enum bpf_reg_type type)
{
switch (type) {
case PTR_TO_MAP_VALUE:
case PTR_TO_MAP_VALUE_OR_NULL:
- case PTR_TO_MAP_VALUE_ADJ:
case PTR_TO_STACK:
case PTR_TO_CTX:
case PTR_TO_PACKET:
case PTR_TO_PACKET_END:
- case FRAME_PTR:
case CONST_PTR_TO_MAP:
return true;
default:
@@ -584,7 +716,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
static int check_stack_write(struct bpf_verifier_state *state, int off,
int size, int value_regno)
{
- int i;
+ int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE;
/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
* so it's aligned access and [off, off + size) are within stack limits
*/
@@ -599,15 +731,14 @@ static int check_stack_write(struct bpf_verifier_state *state, int off,
}
/* save register state */
- state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
- state->regs[value_regno];
+ state->spilled_regs[spi] = state->regs[value_regno];
+ state->spilled_regs[spi].live |= REG_LIVE_WRITTEN;
for (i = 0; i < BPF_REG_SIZE; i++)
state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL;
} else {
/* regular write of data into stack */
- state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
- (struct bpf_reg_state) {};
+ state->spilled_regs[spi] = (struct bpf_reg_state) {};
for (i = 0; i < size; i++)
state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC;
@@ -615,11 +746,26 @@ static int check_stack_write(struct bpf_verifier_state *state, int off,
return 0;
}
+static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slot)
+{
+ struct bpf_verifier_state *parent = state->parent;
+
+ while (parent) {
+ /* if read wasn't screened by an earlier write ... */
+ if (state->spilled_regs[slot].live & REG_LIVE_WRITTEN)
+ break;
+ /* ... then we depend on parent's value */
+ parent->spilled_regs[slot].live |= REG_LIVE_READ;
+ state = parent;
+ parent = state->parent;
+ }
+}
+
static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
int value_regno)
{
u8 *slot_type;
- int i;
+ int i, spi;
slot_type = &state->stack_slot_type[MAX_BPF_STACK + off];
@@ -635,10 +781,13 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
}
}
- if (value_regno >= 0)
+ spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE;
+
+ if (value_regno >= 0) {
/* restore register state from stack */
- state->regs[value_regno] =
- state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE];
+ state->regs[value_regno] = state->spilled_regs[spi];
+ mark_stack_slot_read(state, spi);
+ }
return 0;
} else {
for (i = 0; i < size; i++) {
@@ -650,14 +799,13 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
}
if (value_regno >= 0)
/* have read misc data from the stack */
- mark_reg_unknown_value_and_range(state->regs,
- value_regno);
+ mark_reg_unknown(state->regs, value_regno);
return 0;
}
}
/* check read/write into map element returned by bpf_map_lookup_elem() */
-static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
+static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
int size)
{
struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
@@ -670,49 +818,50 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
return 0;
}
-/* check read/write into an adjusted map element */
-static int check_map_access_adj(struct bpf_verifier_env *env, u32 regno,
+/* check read/write into a map element with possible variable offset */
+static int check_map_access(struct bpf_verifier_env *env, u32 regno,
int off, int size)
{
struct bpf_verifier_state *state = &env->cur_state;
struct bpf_reg_state *reg = &state->regs[regno];
int err;
- /* We adjusted the register to this map value, so we
- * need to change off and size to min_value and max_value
- * respectively to make sure our theoretical access will be
- * safe.
+ /* We may have adjusted the register to this map value, so we
+ * need to try adding each of min_value and max_value to off
+ * to make sure our theoretical access will be safe.
*/
if (log_level)
print_verifier_state(state);
- env->varlen_map_value_access = true;
/* The minimum value is only important with signed
* comparisons where we can't assume the floor of a
* value is 0. If we are using signed variables for our
* index'es we need to make sure that whatever we use
* will have a set floor within our range.
*/
- if (reg->min_value < 0) {
+ if (reg->smin_value < 0) {
verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
regno);
return -EACCES;
}
- err = check_map_access(env, regno, reg->min_value + off, size);
+ err = __check_map_access(env, regno, reg->smin_value + off, size);
if (err) {
- verbose("R%d min value is outside of the array range\n",
- regno);
+ verbose("R%d min value is outside of the array range\n", regno);
return err;
}
- /* If we haven't set a max value then we need to bail
- * since we can't be sure we won't do bad things.
+ /* If we haven't set a max value then we need to bail since we can't be
+ * sure we won't do bad things.
+ * If reg->umax_value + off could overflow, treat that as unbounded too.
*/
- if (reg->max_value == BPF_REGISTER_MAX_RANGE) {
+ if (reg->umax_value >= BPF_MAX_VAR_OFF) {
verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n",
regno);
return -EACCES;
}
- return check_map_access(env, regno, reg->max_value + off, size);
+ err = __check_map_access(env, regno, reg->umax_value + off, size);
+ if (err)
+ verbose("R%d max value is outside of the array range\n", regno);
+ return err;
}
#define MAX_PACKET_OFF 0xffff
@@ -732,6 +881,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
case BPF_PROG_TYPE_SCHED_ACT:
case BPF_PROG_TYPE_XDP:
case BPF_PROG_TYPE_LWT_XMIT:
+ case BPF_PROG_TYPE_SK_SKB:
if (meta)
return meta->pkt_access;
@@ -742,14 +892,13 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
}
}
-static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
- int size)
+static int __check_packet_access(struct bpf_verifier_env *env, u32 regno,
+ int off, int size)
{
struct bpf_reg_state *regs = env->cur_state.regs;
struct bpf_reg_state *reg = &regs[regno];
- off += reg->off;
- if (off < 0 || size <= 0 || off + size > reg->range) {
+ if (off < 0 || size <= 0 || (u64)off + size > reg->range) {
verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
off, size, regno, reg->id, reg->off, reg->range);
return -EACCES;
@@ -757,16 +906,58 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
return 0;
}
-/* check access to 'struct bpf_context' fields */
-static int check_ctx_access(struct bpf_verifier_env *env, int off, int size,
+static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
+ int size)
+{
+ struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *reg = &regs[regno];
+ int err;
+
+ /* We may have added a variable offset to the packet pointer; but any
+ * reg->range we have comes after that. We are only checking the fixed
+ * offset.
+ */
+
+ /* We don't allow negative numbers, because we aren't tracking enough
+ * detail to prove they're safe.
+ */
+ if (reg->smin_value < 0) {
+ verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+ regno);
+ return -EACCES;
+ }
+ err = __check_packet_access(env, regno, off, size);
+ if (err) {
+ verbose("R%d offset is outside of the packet\n", regno);
+ return err;
+ }
+ return err;
+}
+
+/* check access to 'struct bpf_context' fields. Supports fixed offsets only */
+static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
enum bpf_access_type t, enum bpf_reg_type *reg_type)
{
+ struct bpf_insn_access_aux info = {
+ .reg_type = *reg_type,
+ };
+
/* for analyzer ctx accesses are already validated and converted */
if (env->analyzer_ops)
return 0;
if (env->prog->aux->ops->is_valid_access &&
- env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) {
+ env->prog->aux->ops->is_valid_access(off, size, t, &info)) {
+ /* A non zero info.ctx_field_size indicates that this field is a
+ * candidate for later verifier transformation to load the whole
+ * field and then apply a mask when accessed with a narrower
+ * access than actual ctx access size. A zero info.ctx_field_size
+ * will only allow for whole field access and rejects any other
+ * type of narrower access.
+ */
+ env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
+ *reg_type = info.reg_type;
+
/* remember the offset of last byte accessed in ctx */
if (env->prog->aux->max_ctx_offset < off + size)
env->prog->aux->max_ctx_offset = off + size;
@@ -777,40 +968,30 @@ static int check_ctx_access(struct bpf_verifier_env *env, int off, int size,
return -EACCES;
}
-static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
+static bool __is_pointer_value(bool allow_ptr_leaks,
+ const struct bpf_reg_state *reg)
{
- if (env->allow_ptr_leaks)
+ if (allow_ptr_leaks)
return false;
- switch (env->cur_state.regs[regno].type) {
- case UNKNOWN_VALUE:
- case CONST_IMM:
- return false;
- default:
- return true;
- }
+ return reg->type != SCALAR_VALUE;
+}
+
+static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
+{
+ return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]);
}
static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
int off, int size, bool strict)
{
+ struct tnum reg_off;
int ip_align;
- int reg_off;
/* Byte size accesses are always allowed. */
if (!strict || size == 1)
return 0;
- reg_off = reg->off;
- if (reg->id) {
- if (reg->aux_off_align % size) {
- verbose("Packet access is only %u byte aligned, %d byte access not allowed\n",
- reg->aux_off_align, size);
- return -EACCES;
- }
- reg_off += reg->aux_off;
- }
-
/* For platforms that do not have a Kconfig enabling
* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS the value of
* NET_IP_ALIGN is universally set to '2'. And on platforms
@@ -820,20 +1001,37 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
* unconditional IP align value of '2'.
*/
ip_align = 2;
- if ((ip_align + reg_off + off) % size != 0) {
- verbose("misaligned packet access off %d+%d+%d size %d\n",
- ip_align, reg_off, off, size);
+
+ reg_off = tnum_add(reg->var_off, tnum_const(ip_align + reg->off + off));
+ if (!tnum_is_aligned(reg_off, size)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose("misaligned packet access off %d+%s+%d+%d size %d\n",
+ ip_align, tn_buf, reg->off, off, size);
return -EACCES;
}
return 0;
}
-static int check_val_ptr_alignment(const struct bpf_reg_state *reg,
- int size, bool strict)
+static int check_generic_ptr_alignment(const struct bpf_reg_state *reg,
+ const char *pointer_desc,
+ int off, int size, bool strict)
{
- if (strict && size != 1) {
- verbose("Unknown alignment. Only byte-sized access allowed in value access.\n");
+ struct tnum reg_off;
+
+ /* Byte size accesses are always allowed. */
+ if (!strict || size == 1)
+ return 0;
+
+ reg_off = tnum_add(reg->var_off, tnum_const(reg->off + off));
+ if (!tnum_is_aligned(reg_off, size)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose("misaligned %saccess off %s+%d+%d size %d\n",
+ pointer_desc, tn_buf, reg->off, off, size);
return -EACCES;
}
@@ -845,21 +1043,25 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
int off, int size)
{
bool strict = env->strict_alignment;
+ const char *pointer_desc = "";
switch (reg->type) {
case PTR_TO_PACKET:
+ /* special case, because of NET_IP_ALIGN */
return check_pkt_ptr_alignment(reg, off, size, strict);
- case PTR_TO_MAP_VALUE_ADJ:
- return check_val_ptr_alignment(reg, size, strict);
+ case PTR_TO_MAP_VALUE:
+ pointer_desc = "value ";
+ break;
+ case PTR_TO_CTX:
+ pointer_desc = "context ";
+ break;
+ case PTR_TO_STACK:
+ pointer_desc = "stack ";
+ break;
default:
- if (off % size != 0) {
- verbose("misaligned access off %d size %d\n",
- off, size);
- return -EACCES;
- }
-
- return 0;
+ break;
}
+ return check_generic_ptr_alignment(reg, pointer_desc, off, size, strict);
}
/* check whether memory at (regno + off) is accessible for t = (read | write)
@@ -868,7 +1070,7 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
* if t==write && value_regno==-1, some unknown value is stored into memory
* if t==read && value_regno==-1, don't care what we read from memory
*/
-static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
+static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, int off,
int bpf_size, enum bpf_access_type t,
int value_regno)
{
@@ -876,56 +1078,87 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
struct bpf_reg_state *reg = &state->regs[regno];
int size, err = 0;
- if (reg->type == PTR_TO_STACK)
- off += reg->imm;
-
size = bpf_size_to_bytes(bpf_size);
if (size < 0)
return size;
+ /* alignment checks will add in reg->off themselves */
err = check_ptr_alignment(env, reg, off, size);
if (err)
return err;
- if (reg->type == PTR_TO_MAP_VALUE ||
- reg->type == PTR_TO_MAP_VALUE_ADJ) {
+ /* for access checks, reg->off is just part of off */
+ off += reg->off;
+
+ if (reg->type == PTR_TO_MAP_VALUE) {
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
verbose("R%d leaks addr into map\n", value_regno);
return -EACCES;
}
- if (reg->type == PTR_TO_MAP_VALUE_ADJ)
- err = check_map_access_adj(env, regno, off, size);
- else
- err = check_map_access(env, regno, off, size);
+ err = check_map_access(env, regno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
- mark_reg_unknown_value_and_range(state->regs,
- value_regno);
+ mark_reg_unknown(state->regs, value_regno);
} else if (reg->type == PTR_TO_CTX) {
- enum bpf_reg_type reg_type = UNKNOWN_VALUE;
+ enum bpf_reg_type reg_type = SCALAR_VALUE;
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
verbose("R%d leaks addr into ctx\n", value_regno);
return -EACCES;
}
- err = check_ctx_access(env, off, size, t, &reg_type);
+ /* ctx accesses must be at a fixed offset, so that we can
+ * determine what type of data were returned.
+ */
+ if (!tnum_is_const(reg->var_off)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose("variable ctx access var_off=%s off=%d size=%d",
+ tn_buf, off, size);
+ return -EACCES;
+ }
+ off += reg->var_off.value;
+ err = check_ctx_access(env, insn_idx, off, size, t, &reg_type);
if (!err && t == BPF_READ && value_regno >= 0) {
- mark_reg_unknown_value_and_range(state->regs,
- value_regno);
- /* note that reg.[id|off|range] == 0 */
+ /* ctx access returns either a scalar, or a
+ * PTR_TO_PACKET[_END]. In the latter case, we know
+ * the offset is zero.
+ */
+ if (reg_type == SCALAR_VALUE)
+ mark_reg_unknown(state->regs, value_regno);
+ else
+ mark_reg_known_zero(state->regs, value_regno);
+ state->regs[value_regno].id = 0;
+ state->regs[value_regno].off = 0;
+ state->regs[value_regno].range = 0;
state->regs[value_regno].type = reg_type;
- state->regs[value_regno].aux_off = 0;
- state->regs[value_regno].aux_off_align = 0;
}
- } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) {
+ } else if (reg->type == PTR_TO_STACK) {
+ /* stack accesses must be at a fixed offset, so that we can
+ * determine what type of data were returned.
+ * See check_stack_read().
+ */
+ if (!tnum_is_const(reg->var_off)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose("variable stack access var_off=%s off=%d size=%d",
+ tn_buf, off, size);
+ return -EACCES;
+ }
+ off += reg->var_off.value;
if (off >= 0 || off < -MAX_BPF_STACK) {
verbose("invalid stack off=%d size=%d\n", off, size);
return -EACCES;
}
+
+ if (env->prog->aux->stack_depth < -off)
+ env->prog->aux->stack_depth = -off;
+
if (t == BPF_WRITE) {
if (!env->allow_ptr_leaks &&
state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL &&
@@ -937,7 +1170,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
} else {
err = check_stack_read(state, off, size, value_regno);
}
- } else if (state->regs[regno].type == PTR_TO_PACKET) {
+ } else if (reg->type == PTR_TO_PACKET) {
if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
verbose("cannot write into packet\n");
return -EACCES;
@@ -949,28 +1182,25 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
}
err = check_packet_access(env, regno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
- mark_reg_unknown_value_and_range(state->regs,
- value_regno);
+ mark_reg_unknown(state->regs, value_regno);
} else {
verbose("R%d invalid mem access '%s'\n",
regno, reg_type_str[reg->type]);
return -EACCES;
}
- if (!err && size <= 2 && value_regno >= 0 && env->allow_ptr_leaks &&
- state->regs[value_regno].type == UNKNOWN_VALUE) {
- /* 1 or 2 byte load zero-extends, determine the number of
- * zero upper bits. Not doing it fo 4 byte load, since
- * such values cannot be added to ptr_to_packet anyway.
- */
- state->regs[value_regno].imm = 64 - size * 8;
+ if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ &&
+ state->regs[value_regno].type == SCALAR_VALUE) {
+ /* b/h/w load zero-extends, mark upper bits as known 0 */
+ state->regs[value_regno].var_off = tnum_cast(
+ state->regs[value_regno].var_off, size);
+ __update_reg_bounds(&state->regs[value_regno]);
}
return err;
}
-static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn)
+static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
int err;
if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) ||
@@ -980,12 +1210,12 @@ static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check src1 operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
/* check src2 operand */
- err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
return err;
@@ -995,19 +1225,27 @@ static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check whether atomic_add can read the memory */
- err = check_mem_access(env, insn->dst_reg, insn->off,
+ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_READ, -1);
if (err)
return err;
/* check whether atomic_add can write into the same memory */
- return check_mem_access(env, insn->dst_reg, insn->off,
+ return check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_WRITE, -1);
}
+/* Does this register contain a constant zero? */
+static bool register_is_null(struct bpf_reg_state reg)
+{
+ return reg.type == SCALAR_VALUE && tnum_equals_const(reg.var_off, 0);
+}
+
/* when register 'regno' is passed into function that will read 'access_size'
* bytes from that pointer, make sure that it's within stack boundary
- * and all elements of stack are initialized
+ * and all elements of stack are initialized.
+ * Unlike most pointer bounds-checking functions, this one doesn't take an
+ * 'off' argument, so it has to add in reg->off itself.
*/
static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
int access_size, bool zero_size_allowed,
@@ -1018,9 +1256,9 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
int off, i;
if (regs[regno].type != PTR_TO_STACK) {
+ /* Allow zero-byte read from NULL, regardless of pointer type */
if (zero_size_allowed && access_size == 0 &&
- regs[regno].type == CONST_IMM &&
- regs[regno].imm == 0)
+ register_is_null(regs[regno]))
return 0;
verbose("R%d type=%s expected=%s\n", regno,
@@ -1029,7 +1267,15 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
return -EACCES;
}
- off = regs[regno].imm;
+ /* Only allow fixed-offset stack reads */
+ if (!tnum_is_const(regs[regno].var_off)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off);
+ verbose("invalid variable stack read R%d var_off=%s\n",
+ regno, tn_buf);
+ }
+ off = regs[regno].off + regs[regno].var_off.value;
if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
access_size <= 0) {
verbose("invalid stack type R%d off=%d access_size=%d\n",
@@ -1037,6 +1283,9 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
return -EACCES;
}
+ if (env->prog->aux->stack_depth < -off)
+ env->prog->aux->stack_depth = -off;
+
if (meta && meta->raw_mode) {
meta->access_size = access_size;
meta->regno = regno;
@@ -1057,16 +1306,14 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
int access_size, bool zero_size_allowed,
struct bpf_call_arg_meta *meta)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = env->cur_state.regs, *reg = &regs[regno];
- switch (regs[regno].type) {
+ switch (reg->type) {
case PTR_TO_PACKET:
- return check_packet_access(env, regno, 0, access_size);
+ return check_packet_access(env, regno, reg->off, access_size);
case PTR_TO_MAP_VALUE:
- return check_map_access(env, regno, 0, access_size);
- case PTR_TO_MAP_VALUE_ADJ:
- return check_map_access_adj(env, regno, 0, access_size);
- default: /* const_imm|ptr_to_stack or invalid ptr */
+ return check_map_access(env, regno, reg->off, access_size);
+ default: /* scalar_value|ptr_to_stack or invalid ptr */
return check_stack_boundary(env, regno, access_size,
zero_size_allowed, meta);
}
@@ -1083,10 +1330,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
if (arg_type == ARG_DONTCARE)
return 0;
- if (type == NOT_INIT) {
- verbose("R%d !read_ok\n", regno);
- return -EACCES;
- }
+ err = check_reg_arg(env, regno, SRC_OP);
+ if (err)
+ return err;
if (arg_type == ARG_ANYTHING) {
if (is_pointer_value(env, regno)) {
@@ -1109,11 +1355,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
goto err_type;
} else if (arg_type == ARG_CONST_SIZE ||
arg_type == ARG_CONST_SIZE_OR_ZERO) {
- expected_type = CONST_IMM;
- /* One exception. Allow UNKNOWN_VALUE registers when the
- * boundaries are known and don't cause unsafe memory accesses
- */
- if (type != UNKNOWN_VALUE && type != expected_type)
+ expected_type = SCALAR_VALUE;
+ if (type != expected_type)
goto err_type;
} else if (arg_type == ARG_CONST_MAP_PTR) {
expected_type = CONST_PTR_TO_MAP;
@@ -1127,13 +1370,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
arg_type == ARG_PTR_TO_UNINIT_MEM) {
expected_type = PTR_TO_STACK;
/* One exception here. In case function allows for NULL to be
- * passed in as argument, it's a CONST_IMM type. Final test
+ * passed in as argument, it's a SCALAR_VALUE type. Final test
* happens during stack boundary checking.
*/
- if (type == CONST_IMM && reg->imm == 0)
+ if (register_is_null(*reg))
/* final test in check_stack_boundary() */;
else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE &&
- type != PTR_TO_MAP_VALUE_ADJ && type != expected_type)
+ type != expected_type)
goto err_type;
meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
} else {
@@ -1159,7 +1402,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
return -EACCES;
}
if (type == PTR_TO_PACKET)
- err = check_packet_access(env, regno, 0,
+ err = check_packet_access(env, regno, reg->off,
meta->map_ptr->key_size);
else
err = check_stack_boundary(env, regno,
@@ -1175,7 +1418,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
return -EACCES;
}
if (type == PTR_TO_PACKET)
- err = check_packet_access(env, regno, 0,
+ err = check_packet_access(env, regno, reg->off,
meta->map_ptr->value_size);
else
err = check_stack_boundary(env, regno,
@@ -1195,10 +1438,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
return -EACCES;
}
- /* If the register is UNKNOWN_VALUE, the access check happens
- * using its boundaries. Otherwise, just use its imm
+ /* The register is SCALAR_VALUE; the access check
+ * happens using its boundaries.
*/
- if (type == UNKNOWN_VALUE) {
+
+ if (!tnum_is_const(reg->var_off))
/* For unprivileged variable accesses, disable raw
* mode so that the program is required to
* initialize all the memory that the helper could
@@ -1206,35 +1450,28 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
*/
meta = NULL;
- if (reg->min_value < 0) {
- verbose("R%d min value is negative, either use unsigned or 'var &= const'\n",
- regno);
- return -EACCES;
- }
-
- if (reg->min_value == 0) {
- err = check_helper_mem_access(env, regno - 1, 0,
- zero_size_allowed,
- meta);
- if (err)
- return err;
- }
+ if (reg->smin_value < 0) {
+ verbose("R%d min value is negative, either use unsigned or 'var &= const'\n",
+ regno);
+ return -EACCES;
+ }
- if (reg->max_value == BPF_REGISTER_MAX_RANGE) {
- verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
- regno);
- return -EACCES;
- }
- err = check_helper_mem_access(env, regno - 1,
- reg->max_value,
- zero_size_allowed, meta);
+ if (reg->umin_value == 0) {
+ err = check_helper_mem_access(env, regno - 1, 0,
+ zero_size_allowed,
+ meta);
if (err)
return err;
- } else {
- /* register is CONST_IMM */
- err = check_helper_mem_access(env, regno - 1, reg->imm,
- zero_size_allowed, meta);
}
+
+ if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
+ verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
+ regno);
+ return -EACCES;
+ }
+ err = check_helper_mem_access(env, regno - 1,
+ reg->umax_value,
+ zero_size_allowed, meta);
}
return err;
@@ -1269,10 +1506,25 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
func_id != BPF_FUNC_current_task_under_cgroup)
goto error;
break;
+ /* devmap returns a pointer to a live net_device ifindex that we cannot
+ * allow to be modified from bpf side. So do not allow lookup elements
+ * for now.
+ */
+ case BPF_MAP_TYPE_DEVMAP:
+ if (func_id != BPF_FUNC_redirect_map)
+ goto error;
+ break;
case BPF_MAP_TYPE_ARRAY_OF_MAPS:
case BPF_MAP_TYPE_HASH_OF_MAPS:
if (func_id != BPF_FUNC_map_lookup_elem)
goto error;
+ break;
+ case BPF_MAP_TYPE_SOCKMAP:
+ if (func_id != BPF_FUNC_sk_redirect_map &&
+ func_id != BPF_FUNC_sock_map_update &&
+ func_id != BPF_FUNC_map_delete_elem)
+ goto error;
+ break;
default:
break;
}
@@ -1297,6 +1549,18 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY)
goto error;
break;
+ case BPF_FUNC_redirect_map:
+ if (map->map_type != BPF_MAP_TYPE_DEVMAP)
+ goto error;
+ break;
+ case BPF_FUNC_sk_redirect_map:
+ if (map->map_type != BPF_MAP_TYPE_SOCKMAP)
+ goto error;
+ break;
+ case BPF_FUNC_sock_map_update:
+ if (map->map_type != BPF_MAP_TYPE_SOCKMAP)
+ goto error;
+ break;
default:
break;
}
@@ -1326,6 +1590,9 @@ static int check_raw_mode(const struct bpf_func_proto *fn)
return count > 1 ? -EINVAL : 0;
}
+/* Packet data might have moved, any old PTR_TO_PACKET[_END] are now invalid,
+ * so turn them into unknown SCALAR_VALUE.
+ */
static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
{
struct bpf_verifier_state *state = &env->cur_state;
@@ -1335,7 +1602,7 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
for (i = 0; i < MAX_BPF_REG; i++)
if (regs[i].type == PTR_TO_PACKET ||
regs[i].type == PTR_TO_PACKET_END)
- mark_reg_unknown_value(regs, i);
+ mark_reg_unknown(regs, i);
for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
if (state->stack_slot_type[i] != STACK_SPILL)
@@ -1344,8 +1611,7 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
if (reg->type != PTR_TO_PACKET &&
reg->type != PTR_TO_PACKET_END)
continue;
- reg->type = UNKNOWN_VALUE;
- reg->imm = 0;
+ __mark_reg_unknown(reg);
}
}
@@ -1414,25 +1680,30 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
* is inferred from register state.
*/
for (i = 0; i < meta.access_size; i++) {
- err = check_mem_access(env, meta.regno, i, BPF_B, BPF_WRITE, -1);
+ err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B, BPF_WRITE, -1);
if (err)
return err;
}
/* reset caller saved regs */
- for (i = 0; i < CALLER_SAVED_REGS; i++)
+ for (i = 0; i < CALLER_SAVED_REGS; i++) {
mark_reg_not_init(regs, caller_saved[i]);
+ check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
+ }
- /* update return register */
+ /* update return register (already marked as written above) */
if (fn->ret_type == RET_INTEGER) {
- regs[BPF_REG_0].type = UNKNOWN_VALUE;
+ /* sets type to SCALAR_VALUE */
+ mark_reg_unknown(regs, BPF_REG_0);
} else if (fn->ret_type == RET_VOID) {
regs[BPF_REG_0].type = NOT_INIT;
} else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
struct bpf_insn_aux_data *insn_aux;
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
- regs[BPF_REG_0].max_value = regs[BPF_REG_0].min_value = 0;
+ /* There is no offset yet applied, variable or fixed */
+ mark_reg_known_zero(regs, BPF_REG_0);
+ regs[BPF_REG_0].off = 0;
/* remember map_ptr, so that check_map_access()
* can check 'value_size' boundary of memory access
* to map element returned from bpf_map_lookup_elem()
@@ -1463,409 +1734,551 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
return 0;
}
-static int check_packet_ptr_add(struct bpf_verifier_env *env,
- struct bpf_insn *insn)
+static void coerce_reg_to_32(struct bpf_reg_state *reg)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
- struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
- struct bpf_reg_state *src_reg = &regs[insn->src_reg];
- struct bpf_reg_state tmp_reg;
- s32 imm;
-
- if (BPF_SRC(insn->code) == BPF_K) {
- /* pkt_ptr += imm */
- imm = insn->imm;
-
-add_imm:
- if (imm < 0) {
- verbose("addition of negative constant to packet pointer is not allowed\n");
- return -EACCES;
- }
- if (imm >= MAX_PACKET_OFF ||
- imm + dst_reg->off >= MAX_PACKET_OFF) {
- verbose("constant %d is too large to add to packet pointer\n",
- imm);
- return -EACCES;
- }
- /* a constant was added to pkt_ptr.
- * Remember it while keeping the same 'id'
- */
- dst_reg->off += imm;
- } else {
- bool had_id;
-
- if (src_reg->type == PTR_TO_PACKET) {
- /* R6=pkt(id=0,off=0,r=62) R7=imm22; r7 += r6 */
- tmp_reg = *dst_reg; /* save r7 state */
- *dst_reg = *src_reg; /* copy pkt_ptr state r6 into r7 */
- src_reg = &tmp_reg; /* pretend it's src_reg state */
- /* if the checks below reject it, the copy won't matter,
- * since we're rejecting the whole program. If all ok,
- * then imm22 state will be added to r7
- * and r7 will be pkt(id=0,off=22,r=62) while
- * r6 will stay as pkt(id=0,off=0,r=62)
- */
- }
+ /* clear high 32 bits */
+ reg->var_off = tnum_cast(reg->var_off, 4);
+ /* Update bounds */
+ __update_reg_bounds(reg);
+}
- if (src_reg->type == CONST_IMM) {
- /* pkt_ptr += reg where reg is known constant */
- imm = src_reg->imm;
- goto add_imm;
- }
- /* disallow pkt_ptr += reg
- * if reg is not uknown_value with guaranteed zero upper bits
- * otherwise pkt_ptr may overflow and addition will become
- * subtraction which is not allowed
- */
- if (src_reg->type != UNKNOWN_VALUE) {
- verbose("cannot add '%s' to ptr_to_packet\n",
- reg_type_str[src_reg->type]);
- return -EACCES;
- }
- if (src_reg->imm < 48) {
- verbose("cannot add integer value with %lld upper zero bits to ptr_to_packet\n",
- src_reg->imm);
- return -EACCES;
- }
+static bool signed_add_overflows(s64 a, s64 b)
+{
+ /* Do the add in u64, where overflow is well-defined */
+ s64 res = (s64)((u64)a + (u64)b);
- had_id = (dst_reg->id != 0);
+ if (b < 0)
+ return res > a;
+ return res < a;
+}
- /* dst_reg stays as pkt_ptr type and since some positive
- * integer value was added to the pointer, increment its 'id'
- */
- dst_reg->id = ++env->id_gen;
-
- /* something was added to pkt_ptr, set range to zero */
- dst_reg->aux_off += dst_reg->off;
- dst_reg->off = 0;
- dst_reg->range = 0;
- if (had_id)
- dst_reg->aux_off_align = min(dst_reg->aux_off_align,
- src_reg->min_align);
- else
- dst_reg->aux_off_align = src_reg->min_align;
- }
- return 0;
+static bool signed_sub_overflows(s64 a, s64 b)
+{
+ /* Do the sub in u64, where overflow is well-defined */
+ s64 res = (s64)((u64)a - (u64)b);
+
+ if (b < 0)
+ return res < a;
+ return res > a;
}
-static int evaluate_reg_alu(struct bpf_verifier_env *env, struct bpf_insn *insn)
+/* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off.
+ * Caller should also handle BPF_MOV case separately.
+ * If we return -EACCES, caller may want to try again treating pointer as a
+ * scalar. So we only emit a diagnostic if !env->allow_ptr_leaks.
+ */
+static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ const struct bpf_reg_state *ptr_reg,
+ const struct bpf_reg_state *off_reg)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
- struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
+ struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg;
+ bool known = tnum_is_const(off_reg->var_off);
+ s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value,
+ smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value;
+ u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value,
+ umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value;
u8 opcode = BPF_OP(insn->code);
- s64 imm_log2;
+ u32 dst = insn->dst_reg;
- /* for type == UNKNOWN_VALUE:
- * imm > 0 -> number of zero upper bits
- * imm == 0 -> don't track which is the same as all bits can be non-zero
- */
+ dst_reg = &regs[dst];
- if (BPF_SRC(insn->code) == BPF_X) {
- struct bpf_reg_state *src_reg = &regs[insn->src_reg];
-
- if (src_reg->type == UNKNOWN_VALUE && src_reg->imm > 0 &&
- dst_reg->imm && opcode == BPF_ADD) {
- /* dreg += sreg
- * where both have zero upper bits. Adding them
- * can only result making one more bit non-zero
- * in the larger value.
- * Ex. 0xffff (imm=48) + 1 (imm=63) = 0x10000 (imm=47)
- * 0xffff (imm=48) + 0xffff = 0x1fffe (imm=47)
- */
- dst_reg->imm = min(dst_reg->imm, src_reg->imm);
- dst_reg->imm--;
- return 0;
- }
- if (src_reg->type == CONST_IMM && src_reg->imm > 0 &&
- dst_reg->imm && opcode == BPF_ADD) {
- /* dreg += sreg
- * where dreg has zero upper bits and sreg is const.
- * Adding them can only result making one more bit
- * non-zero in the larger value.
- */
- imm_log2 = __ilog2_u64((long long)src_reg->imm);
- dst_reg->imm = min(dst_reg->imm, 63 - imm_log2);
- dst_reg->imm--;
- return 0;
- }
- /* all other cases non supported yet, just mark dst_reg */
- dst_reg->imm = 0;
- return 0;
+ if (WARN_ON_ONCE(known && (smin_val != smax_val))) {
+ print_verifier_state(&env->cur_state);
+ verbose("verifier internal error: known but bad sbounds\n");
+ return -EINVAL;
+ }
+ if (WARN_ON_ONCE(known && (umin_val != umax_val))) {
+ print_verifier_state(&env->cur_state);
+ verbose("verifier internal error: known but bad ubounds\n");
+ return -EINVAL;
+ }
+
+ if (BPF_CLASS(insn->code) != BPF_ALU64) {
+ /* 32-bit ALU ops on pointers produce (meaningless) scalars */
+ if (!env->allow_ptr_leaks)
+ verbose("R%d 32-bit pointer arithmetic prohibited\n",
+ dst);
+ return -EACCES;
}
- /* sign extend 32-bit imm into 64-bit to make sure that
- * negative values occupy bit 63. Note ilog2() would have
- * been incorrect, since sizeof(insn->imm) == 4
+ if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
+ if (!env->allow_ptr_leaks)
+ verbose("R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
+ dst);
+ return -EACCES;
+ }
+ if (ptr_reg->type == CONST_PTR_TO_MAP) {
+ if (!env->allow_ptr_leaks)
+ verbose("R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
+ dst);
+ return -EACCES;
+ }
+ if (ptr_reg->type == PTR_TO_PACKET_END) {
+ if (!env->allow_ptr_leaks)
+ verbose("R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
+ dst);
+ return -EACCES;
+ }
+
+ /* In case of 'scalar += pointer', dst_reg inherits pointer type and id.
+ * The id may be overwritten later if we create a new variable offset.
*/
- imm_log2 = __ilog2_u64((long long)insn->imm);
+ dst_reg->type = ptr_reg->type;
+ dst_reg->id = ptr_reg->id;
- if (dst_reg->imm && opcode == BPF_LSH) {
- /* reg <<= imm
- * if reg was a result of 2 byte load, then its imm == 48
- * which means that upper 48 bits are zero and shifting this reg
- * left by 4 would mean that upper 44 bits are still zero
+ switch (opcode) {
+ case BPF_ADD:
+ /* We can take a fixed offset as long as it doesn't overflow
+ * the s32 'off' field
*/
- dst_reg->imm -= insn->imm;
- } else if (dst_reg->imm && opcode == BPF_MUL) {
- /* reg *= imm
- * if multiplying by 14 subtract 4
- * This is conservative calculation of upper zero bits.
- * It's not trying to special case insn->imm == 1 or 0 cases
+ if (known && (ptr_reg->off + smin_val ==
+ (s64)(s32)(ptr_reg->off + smin_val))) {
+ /* pointer += K. Accumulate it into fixed offset */
+ dst_reg->smin_value = smin_ptr;
+ dst_reg->smax_value = smax_ptr;
+ dst_reg->umin_value = umin_ptr;
+ dst_reg->umax_value = umax_ptr;
+ dst_reg->var_off = ptr_reg->var_off;
+ dst_reg->off = ptr_reg->off + smin_val;
+ dst_reg->range = ptr_reg->range;
+ break;
+ }
+ /* A new variable offset is created. Note that off_reg->off
+ * == 0, since it's a scalar.
+ * dst_reg gets the pointer type and since some positive
+ * integer value was added to the pointer, give it a new 'id'
+ * if it's a PTR_TO_PACKET.
+ * this creates a new 'base' pointer, off_reg (variable) gets
+ * added into the variable offset, and we copy the fixed offset
+ * from ptr_reg.
*/
- dst_reg->imm -= imm_log2 + 1;
- } else if (opcode == BPF_AND) {
- /* reg &= imm */
- dst_reg->imm = 63 - imm_log2;
- } else if (dst_reg->imm && opcode == BPF_ADD) {
- /* reg += imm */
- dst_reg->imm = min(dst_reg->imm, 63 - imm_log2);
- dst_reg->imm--;
- } else if (opcode == BPF_RSH) {
- /* reg >>= imm
- * which means that after right shift, upper bits will be zero
- * note that verifier already checked that
- * 0 <= imm < 64 for shift insn
+ if (signed_add_overflows(smin_ptr, smin_val) ||
+ signed_add_overflows(smax_ptr, smax_val)) {
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ } else {
+ dst_reg->smin_value = smin_ptr + smin_val;
+ dst_reg->smax_value = smax_ptr + smax_val;
+ }
+ if (umin_ptr + umin_val < umin_ptr ||
+ umax_ptr + umax_val < umax_ptr) {
+ dst_reg->umin_value = 0;
+ dst_reg->umax_value = U64_MAX;
+ } else {
+ dst_reg->umin_value = umin_ptr + umin_val;
+ dst_reg->umax_value = umax_ptr + umax_val;
+ }
+ dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off);
+ dst_reg->off = ptr_reg->off;
+ if (ptr_reg->type == PTR_TO_PACKET) {
+ dst_reg->id = ++env->id_gen;
+ /* something was added to pkt_ptr, set range to zero */
+ dst_reg->range = 0;
+ }
+ break;
+ case BPF_SUB:
+ if (dst_reg == off_reg) {
+ /* scalar -= pointer. Creates an unknown scalar */
+ if (!env->allow_ptr_leaks)
+ verbose("R%d tried to subtract pointer from scalar\n",
+ dst);
+ return -EACCES;
+ }
+ /* We don't allow subtraction from FP, because (according to
+ * test_verifier.c test "invalid fp arithmetic", JITs might not
+ * be able to deal with it.
*/
- dst_reg->imm += insn->imm;
- if (unlikely(dst_reg->imm > 64))
- /* some dumb code did:
- * r2 = *(u32 *)mem;
- * r2 >>= 32;
- * and all bits are zero now */
- dst_reg->imm = 64;
- } else {
- /* all other alu ops, means that we don't know what will
- * happen to the value, mark it with unknown number of zero bits
+ if (ptr_reg->type == PTR_TO_STACK) {
+ if (!env->allow_ptr_leaks)
+ verbose("R%d subtraction from stack pointer prohibited\n",
+ dst);
+ return -EACCES;
+ }
+ if (known && (ptr_reg->off - smin_val ==
+ (s64)(s32)(ptr_reg->off - smin_val))) {
+ /* pointer -= K. Subtract it from fixed offset */
+ dst_reg->smin_value = smin_ptr;
+ dst_reg->smax_value = smax_ptr;
+ dst_reg->umin_value = umin_ptr;
+ dst_reg->umax_value = umax_ptr;
+ dst_reg->var_off = ptr_reg->var_off;
+ dst_reg->id = ptr_reg->id;
+ dst_reg->off = ptr_reg->off - smin_val;
+ dst_reg->range = ptr_reg->range;
+ break;
+ }
+ /* A new variable offset is created. If the subtrahend is known
+ * nonnegative, then any reg->range we had before is still good.
*/
- dst_reg->imm = 0;
- }
-
- if (dst_reg->imm < 0) {
- /* all 64 bits of the register can contain non-zero bits
- * and such value cannot be added to ptr_to_packet, since it
- * may overflow, mark it as unknown to avoid further eval
+ if (signed_sub_overflows(smin_ptr, smax_val) ||
+ signed_sub_overflows(smax_ptr, smin_val)) {
+ /* Overflow possible, we know nothing */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ } else {
+ dst_reg->smin_value = smin_ptr - smax_val;
+ dst_reg->smax_value = smax_ptr - smin_val;
+ }
+ if (umin_ptr < umax_val) {
+ /* Overflow possible, we know nothing */
+ dst_reg->umin_value = 0;
+ dst_reg->umax_value = U64_MAX;
+ } else {
+ /* Cannot overflow (as long as bounds are consistent) */
+ dst_reg->umin_value = umin_ptr - umax_val;
+ dst_reg->umax_value = umax_ptr - umin_val;
+ }
+ dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off);
+ dst_reg->off = ptr_reg->off;
+ if (ptr_reg->type == PTR_TO_PACKET) {
+ dst_reg->id = ++env->id_gen;
+ /* something was added to pkt_ptr, set range to zero */
+ if (smin_val < 0)
+ dst_reg->range = 0;
+ }
+ break;
+ case BPF_AND:
+ case BPF_OR:
+ case BPF_XOR:
+ /* bitwise ops on pointers are troublesome, prohibit for now.
+ * (However, in principle we could allow some cases, e.g.
+ * ptr &= ~3 which would reduce min_value by 3.)
*/
- dst_reg->imm = 0;
- }
- return 0;
-}
-
-static int evaluate_reg_imm_alu(struct bpf_verifier_env *env,
- struct bpf_insn *insn)
-{
- struct bpf_reg_state *regs = env->cur_state.regs;
- struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
- struct bpf_reg_state *src_reg = &regs[insn->src_reg];
- u8 opcode = BPF_OP(insn->code);
- u64 dst_imm = dst_reg->imm;
-
- /* dst_reg->type == CONST_IMM here. Simulate execution of insns
- * containing ALU ops. Don't care about overflow or negative
- * values, just add/sub/... them; registers are in u64.
- */
- if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K) {
- dst_imm += insn->imm;
- } else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM) {
- dst_imm += src_reg->imm;
- } else if (opcode == BPF_SUB && BPF_SRC(insn->code) == BPF_K) {
- dst_imm -= insn->imm;
- } else if (opcode == BPF_SUB && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM) {
- dst_imm -= src_reg->imm;
- } else if (opcode == BPF_MUL && BPF_SRC(insn->code) == BPF_K) {
- dst_imm *= insn->imm;
- } else if (opcode == BPF_MUL && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM) {
- dst_imm *= src_reg->imm;
- } else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_K) {
- dst_imm |= insn->imm;
- } else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM) {
- dst_imm |= src_reg->imm;
- } else if (opcode == BPF_AND && BPF_SRC(insn->code) == BPF_K) {
- dst_imm &= insn->imm;
- } else if (opcode == BPF_AND && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM) {
- dst_imm &= src_reg->imm;
- } else if (opcode == BPF_RSH && BPF_SRC(insn->code) == BPF_K) {
- dst_imm >>= insn->imm;
- } else if (opcode == BPF_RSH && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM) {
- dst_imm >>= src_reg->imm;
- } else if (opcode == BPF_LSH && BPF_SRC(insn->code) == BPF_K) {
- dst_imm <<= insn->imm;
- } else if (opcode == BPF_LSH && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM) {
- dst_imm <<= src_reg->imm;
- } else {
- mark_reg_unknown_value(regs, insn->dst_reg);
- goto out;
+ if (!env->allow_ptr_leaks)
+ verbose("R%d bitwise operator %s on pointer prohibited\n",
+ dst, bpf_alu_string[opcode >> 4]);
+ return -EACCES;
+ default:
+ /* other operators (e.g. MUL,LSH) produce non-pointer results */
+ if (!env->allow_ptr_leaks)
+ verbose("R%d pointer arithmetic with %s operator prohibited\n",
+ dst, bpf_alu_string[opcode >> 4]);
+ return -EACCES;
}
- dst_reg->imm = dst_imm;
-out:
+ __update_reg_bounds(dst_reg);
+ __reg_deduce_bounds(dst_reg);
+ __reg_bound_offset(dst_reg);
return 0;
}
-static void check_reg_overflow(struct bpf_reg_state *reg)
-{
- if (reg->max_value > BPF_REGISTER_MAX_RANGE)
- reg->max_value = BPF_REGISTER_MAX_RANGE;
- if (reg->min_value < BPF_REGISTER_MIN_RANGE ||
- reg->min_value > BPF_REGISTER_MAX_RANGE)
- reg->min_value = BPF_REGISTER_MIN_RANGE;
-}
-
-static u32 calc_align(u32 imm)
-{
- if (!imm)
- return 1U << 31;
- return imm - ((imm - 1) & imm);
-}
-
-static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
- struct bpf_insn *insn)
+static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state src_reg)
{
- struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg;
- s64 min_val = BPF_REGISTER_MIN_RANGE;
- u64 max_val = BPF_REGISTER_MAX_RANGE;
+ struct bpf_reg_state *regs = env->cur_state.regs;
u8 opcode = BPF_OP(insn->code);
- u32 dst_align, src_align;
-
- dst_reg = &regs[insn->dst_reg];
- src_align = 0;
- if (BPF_SRC(insn->code) == BPF_X) {
- check_reg_overflow(&regs[insn->src_reg]);
- min_val = regs[insn->src_reg].min_value;
- max_val = regs[insn->src_reg].max_value;
-
- /* If the source register is a random pointer then the
- * min_value/max_value values represent the range of the known
- * accesses into that value, not the actual min/max value of the
- * register itself. In this case we have to reset the reg range
- * values so we know it is not safe to look at.
- */
- if (regs[insn->src_reg].type != CONST_IMM &&
- regs[insn->src_reg].type != UNKNOWN_VALUE) {
- min_val = BPF_REGISTER_MIN_RANGE;
- max_val = BPF_REGISTER_MAX_RANGE;
- src_align = 0;
- } else {
- src_align = regs[insn->src_reg].min_align;
- }
- } else if (insn->imm < BPF_REGISTER_MAX_RANGE &&
- (s64)insn->imm > BPF_REGISTER_MIN_RANGE) {
- min_val = max_val = insn->imm;
- src_align = calc_align(insn->imm);
- }
-
- dst_align = dst_reg->min_align;
-
- /* We don't know anything about what was done to this register, mark it
- * as unknown.
- */
- if (min_val == BPF_REGISTER_MIN_RANGE &&
- max_val == BPF_REGISTER_MAX_RANGE) {
- reset_reg_range_values(regs, insn->dst_reg);
- return;
+ bool src_known, dst_known;
+ s64 smin_val, smax_val;
+ u64 umin_val, umax_val;
+
+ if (BPF_CLASS(insn->code) != BPF_ALU64) {
+ /* 32-bit ALU ops are (32,32)->64 */
+ coerce_reg_to_32(dst_reg);
+ coerce_reg_to_32(&src_reg);
}
-
- /* If one of our values was at the end of our ranges then we can't just
- * do our normal operations to the register, we need to set the values
- * to the min/max since they are undefined.
- */
- if (min_val == BPF_REGISTER_MIN_RANGE)
- dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
- if (max_val == BPF_REGISTER_MAX_RANGE)
- dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
+ smin_val = src_reg.smin_value;
+ smax_val = src_reg.smax_value;
+ umin_val = src_reg.umin_value;
+ umax_val = src_reg.umax_value;
+ src_known = tnum_is_const(src_reg.var_off);
+ dst_known = tnum_is_const(dst_reg->var_off);
switch (opcode) {
case BPF_ADD:
- if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
- dst_reg->min_value += min_val;
- if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
- dst_reg->max_value += max_val;
- dst_reg->min_align = min(src_align, dst_align);
+ if (signed_add_overflows(dst_reg->smin_value, smin_val) ||
+ signed_add_overflows(dst_reg->smax_value, smax_val)) {
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ } else {
+ dst_reg->smin_value += smin_val;
+ dst_reg->smax_value += smax_val;
+ }
+ if (dst_reg->umin_value + umin_val < umin_val ||
+ dst_reg->umax_value + umax_val < umax_val) {
+ dst_reg->umin_value = 0;
+ dst_reg->umax_value = U64_MAX;
+ } else {
+ dst_reg->umin_value += umin_val;
+ dst_reg->umax_value += umax_val;
+ }
+ dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off);
break;
case BPF_SUB:
- if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
- dst_reg->min_value -= min_val;
- if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
- dst_reg->max_value -= max_val;
- dst_reg->min_align = min(src_align, dst_align);
+ if (signed_sub_overflows(dst_reg->smin_value, smax_val) ||
+ signed_sub_overflows(dst_reg->smax_value, smin_val)) {
+ /* Overflow possible, we know nothing */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ } else {
+ dst_reg->smin_value -= smax_val;
+ dst_reg->smax_value -= smin_val;
+ }
+ if (dst_reg->umin_value < umax_val) {
+ /* Overflow possible, we know nothing */
+ dst_reg->umin_value = 0;
+ dst_reg->umax_value = U64_MAX;
+ } else {
+ /* Cannot overflow (as long as bounds are consistent) */
+ dst_reg->umin_value -= umax_val;
+ dst_reg->umax_value -= umin_val;
+ }
+ dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off);
break;
case BPF_MUL:
- if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
- dst_reg->min_value *= min_val;
- if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
- dst_reg->max_value *= max_val;
- dst_reg->min_align = max(src_align, dst_align);
+ dst_reg->var_off = tnum_mul(dst_reg->var_off, src_reg.var_off);
+ if (smin_val < 0 || dst_reg->smin_value < 0) {
+ /* Ain't nobody got time to multiply that sign */
+ __mark_reg_unbounded(dst_reg);
+ __update_reg_bounds(dst_reg);
+ break;
+ }
+ /* Both values are positive, so we can work with unsigned and
+ * copy the result to signed (unless it exceeds S64_MAX).
+ */
+ if (umax_val > U32_MAX || dst_reg->umax_value > U32_MAX) {
+ /* Potential overflow, we know nothing */
+ __mark_reg_unbounded(dst_reg);
+ /* (except what we can learn from the var_off) */
+ __update_reg_bounds(dst_reg);
+ break;
+ }
+ dst_reg->umin_value *= umin_val;
+ dst_reg->umax_value *= umax_val;
+ if (dst_reg->umax_value > S64_MAX) {
+ /* Overflow possible, we know nothing */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ } else {
+ dst_reg->smin_value = dst_reg->umin_value;
+ dst_reg->smax_value = dst_reg->umax_value;
+ }
break;
case BPF_AND:
- /* Disallow AND'ing of negative numbers, ain't nobody got time
- * for that. Otherwise the minimum is 0 and the max is the max
- * value we could AND against.
+ if (src_known && dst_known) {
+ __mark_reg_known(dst_reg, dst_reg->var_off.value &
+ src_reg.var_off.value);
+ break;
+ }
+ /* We get our minimum from the var_off, since that's inherently
+ * bitwise. Our maximum is the minimum of the operands' maxima.
*/
- if (min_val < 0)
- dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
- else
- dst_reg->min_value = 0;
- dst_reg->max_value = max_val;
- dst_reg->min_align = max(src_align, dst_align);
+ dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg.var_off);
+ dst_reg->umin_value = dst_reg->var_off.value;
+ dst_reg->umax_value = min(dst_reg->umax_value, umax_val);
+ if (dst_reg->smin_value < 0 || smin_val < 0) {
+ /* Lose signed bounds when ANDing negative numbers,
+ * ain't nobody got time for that.
+ */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ } else {
+ /* ANDing two positives gives a positive, so safe to
+ * cast result into s64.
+ */
+ dst_reg->smin_value = dst_reg->umin_value;
+ dst_reg->smax_value = dst_reg->umax_value;
+ }
+ /* We may learn something more from the var_off */
+ __update_reg_bounds(dst_reg);
break;
- case BPF_LSH:
- /* Gotta have special overflow logic here, if we're shifting
- * more than MAX_RANGE then just assume we have an invalid
- * range.
+ case BPF_OR:
+ if (src_known && dst_known) {
+ __mark_reg_known(dst_reg, dst_reg->var_off.value |
+ src_reg.var_off.value);
+ break;
+ }
+ /* We get our maximum from the var_off, and our minimum is the
+ * maximum of the operands' minima
*/
- if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) {
- dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
- dst_reg->min_align = 1;
+ dst_reg->var_off = tnum_or(dst_reg->var_off, src_reg.var_off);
+ dst_reg->umin_value = max(dst_reg->umin_value, umin_val);
+ dst_reg->umax_value = dst_reg->var_off.value |
+ dst_reg->var_off.mask;
+ if (dst_reg->smin_value < 0 || smin_val < 0) {
+ /* Lose signed bounds when ORing negative numbers,
+ * ain't nobody got time for that.
+ */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
} else {
- if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
- dst_reg->min_value <<= min_val;
- if (!dst_reg->min_align)
- dst_reg->min_align = 1;
- dst_reg->min_align <<= min_val;
- }
- if (max_val > ilog2(BPF_REGISTER_MAX_RANGE))
- dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
- else if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
- dst_reg->max_value <<= max_val;
+ /* ORing two positives gives a positive, so safe to
+ * cast result into s64.
+ */
+ dst_reg->smin_value = dst_reg->umin_value;
+ dst_reg->smax_value = dst_reg->umax_value;
+ }
+ /* We may learn something more from the var_off */
+ __update_reg_bounds(dst_reg);
break;
- case BPF_RSH:
- /* RSH by a negative number is undefined, and the BPF_RSH is an
- * unsigned shift, so make the appropriate casts.
+ case BPF_LSH:
+ if (umax_val > 63) {
+ /* Shifts greater than 63 are undefined. This includes
+ * shifts by a negative number.
+ */
+ mark_reg_unknown(regs, insn->dst_reg);
+ break;
+ }
+ /* We lose all sign bit information (except what we can pick
+ * up from var_off)
*/
- if (min_val < 0 || dst_reg->min_value < 0) {
- dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ /* If we might shift our top bit out, then we know nothing */
+ if (dst_reg->umax_value > 1ULL << (63 - umax_val)) {
+ dst_reg->umin_value = 0;
+ dst_reg->umax_value = U64_MAX;
} else {
- dst_reg->min_value =
- (u64)(dst_reg->min_value) >> min_val;
+ dst_reg->umin_value <<= umin_val;
+ dst_reg->umax_value <<= umax_val;
+ }
+ if (src_known)
+ dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val);
+ else
+ dst_reg->var_off = tnum_lshift(tnum_unknown, umin_val);
+ /* We may learn something more from the var_off */
+ __update_reg_bounds(dst_reg);
+ break;
+ case BPF_RSH:
+ if (umax_val > 63) {
+ /* Shifts greater than 63 are undefined. This includes
+ * shifts by a negative number.
+ */
+ mark_reg_unknown(regs, insn->dst_reg);
+ break;
}
- if (min_val < 0) {
- dst_reg->min_align = 1;
+ /* BPF_RSH is an unsigned shift, so make the appropriate casts */
+ if (dst_reg->smin_value < 0) {
+ if (umin_val) {
+ /* Sign bit will be cleared */
+ dst_reg->smin_value = 0;
+ } else {
+ /* Lost sign bit information */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ }
} else {
- dst_reg->min_align >>= (u64) min_val;
- if (!dst_reg->min_align)
- dst_reg->min_align = 1;
+ dst_reg->smin_value =
+ (u64)(dst_reg->smin_value) >> umax_val;
}
- if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
- dst_reg->max_value >>= max_val;
+ if (src_known)
+ dst_reg->var_off = tnum_rshift(dst_reg->var_off,
+ umin_val);
+ else
+ dst_reg->var_off = tnum_rshift(tnum_unknown, umin_val);
+ dst_reg->umin_value >>= umax_val;
+ dst_reg->umax_value >>= umin_val;
+ /* We may learn something more from the var_off */
+ __update_reg_bounds(dst_reg);
break;
default:
- reset_reg_range_values(regs, insn->dst_reg);
+ mark_reg_unknown(regs, insn->dst_reg);
break;
}
- check_reg_overflow(dst_reg);
+ __reg_deduce_bounds(dst_reg);
+ __reg_bound_offset(dst_reg);
+ return 0;
+}
+
+/* Handles ALU ops other than BPF_END, BPF_NEG and BPF_MOV: computes new min/max
+ * and var_off.
+ */
+static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
+ struct bpf_insn *insn)
+{
+ struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg, *src_reg;
+ struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
+ u8 opcode = BPF_OP(insn->code);
+ int rc;
+
+ dst_reg = &regs[insn->dst_reg];
+ src_reg = NULL;
+ if (dst_reg->type != SCALAR_VALUE)
+ ptr_reg = dst_reg;
+ if (BPF_SRC(insn->code) == BPF_X) {
+ src_reg = &regs[insn->src_reg];
+ if (src_reg->type != SCALAR_VALUE) {
+ if (dst_reg->type != SCALAR_VALUE) {
+ /* Combining two pointers by any ALU op yields
+ * an arbitrary scalar.
+ */
+ if (!env->allow_ptr_leaks) {
+ verbose("R%d pointer %s pointer prohibited\n",
+ insn->dst_reg,
+ bpf_alu_string[opcode >> 4]);
+ return -EACCES;
+ }
+ mark_reg_unknown(regs, insn->dst_reg);
+ return 0;
+ } else {
+ /* scalar += pointer
+ * This is legal, but we have to reverse our
+ * src/dest handling in computing the range
+ */
+ rc = adjust_ptr_min_max_vals(env, insn,
+ src_reg, dst_reg);
+ if (rc == -EACCES && env->allow_ptr_leaks) {
+ /* scalar += unknown scalar */
+ __mark_reg_unknown(&off_reg);
+ return adjust_scalar_min_max_vals(
+ env, insn,
+ dst_reg, off_reg);
+ }
+ return rc;
+ }
+ } else if (ptr_reg) {
+ /* pointer += scalar */
+ rc = adjust_ptr_min_max_vals(env, insn,
+ dst_reg, src_reg);
+ if (rc == -EACCES && env->allow_ptr_leaks) {
+ /* unknown scalar += scalar */
+ __mark_reg_unknown(dst_reg);
+ return adjust_scalar_min_max_vals(
+ env, insn, dst_reg, *src_reg);
+ }
+ return rc;
+ }
+ } else {
+ /* Pretend the src is a reg with a known value, since we only
+ * need to be able to read from this state.
+ */
+ off_reg.type = SCALAR_VALUE;
+ __mark_reg_known(&off_reg, insn->imm);
+ src_reg = &off_reg;
+ if (ptr_reg) { /* pointer += K */
+ rc = adjust_ptr_min_max_vals(env, insn,
+ ptr_reg, src_reg);
+ if (rc == -EACCES && env->allow_ptr_leaks) {
+ /* unknown scalar += K */
+ __mark_reg_unknown(dst_reg);
+ return adjust_scalar_min_max_vals(
+ env, insn, dst_reg, off_reg);
+ }
+ return rc;
+ }
+ }
+
+ /* Got here implies adding two SCALAR_VALUEs */
+ if (WARN_ON_ONCE(ptr_reg)) {
+ print_verifier_state(&env->cur_state);
+ verbose("verifier internal error: unexpected ptr_reg\n");
+ return -EINVAL;
+ }
+ if (WARN_ON(!src_reg)) {
+ print_verifier_state(&env->cur_state);
+ verbose("verifier internal error: no src_reg\n");
+ return -EINVAL;
+ }
+ return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg);
}
/* check validity of 32-bit and 64-bit arithmetic operations */
static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
- struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg;
+ struct bpf_reg_state *regs = env->cur_state.regs;
u8 opcode = BPF_OP(insn->code);
int err;
@@ -1886,7 +2299,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check src operand */
- err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
return err;
@@ -1897,7 +2310,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check dest operand */
- err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ err = check_reg_arg(env, insn->dst_reg, DST_OP);
if (err)
return err;
@@ -1910,7 +2323,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check src operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
} else {
@@ -1921,15 +2334,10 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check dest operand */
- err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ err = check_reg_arg(env, insn->dst_reg, DST_OP);
if (err)
return err;
- /* we are setting our register to something new, we need to
- * reset its range values.
- */
- reset_reg_range_values(regs, insn->dst_reg);
-
if (BPF_SRC(insn->code) == BPF_X) {
if (BPF_CLASS(insn->code) == BPF_ALU64) {
/* case: R1 = R2
@@ -1937,22 +2345,24 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
*/
regs[insn->dst_reg] = regs[insn->src_reg];
} else {
+ /* R1 = (u32) R2 */
if (is_pointer_value(env, insn->src_reg)) {
verbose("R%d partial copy of pointer\n",
insn->src_reg);
return -EACCES;
}
- mark_reg_unknown_value(regs, insn->dst_reg);
+ mark_reg_unknown(regs, insn->dst_reg);
+ /* high 32 bits are known zero. */
+ regs[insn->dst_reg].var_off = tnum_cast(
+ regs[insn->dst_reg].var_off, 4);
+ __update_reg_bounds(&regs[insn->dst_reg]);
}
} else {
/* case: R = imm
* remember the value we stored into this reg
*/
- regs[insn->dst_reg].type = CONST_IMM;
- regs[insn->dst_reg].imm = insn->imm;
- regs[insn->dst_reg].max_value = insn->imm;
- regs[insn->dst_reg].min_value = insn->imm;
- regs[insn->dst_reg].min_align = calc_align(insn->imm);
+ regs[insn->dst_reg].type = SCALAR_VALUE;
+ __mark_reg_known(regs + insn->dst_reg, insn->imm);
}
} else if (opcode > BPF_END) {
@@ -1967,7 +2377,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return -EINVAL;
}
/* check src1 operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
} else {
@@ -1978,7 +2388,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check src2 operand */
- err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
return err;
@@ -1999,72 +2409,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check dest operand */
- err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK);
+ err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
if (err)
return err;
- dst_reg = &regs[insn->dst_reg];
-
- /* first we want to adjust our ranges. */
- adjust_reg_min_max_vals(env, insn);
-
- /* pattern match 'bpf_add Rx, imm' instruction */
- if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
- dst_reg->type == FRAME_PTR && BPF_SRC(insn->code) == BPF_K) {
- dst_reg->type = PTR_TO_STACK;
- dst_reg->imm = insn->imm;
- return 0;
- } else if (opcode == BPF_ADD &&
- BPF_CLASS(insn->code) == BPF_ALU64 &&
- dst_reg->type == PTR_TO_STACK &&
- ((BPF_SRC(insn->code) == BPF_X &&
- regs[insn->src_reg].type == CONST_IMM) ||
- BPF_SRC(insn->code) == BPF_K)) {
- if (BPF_SRC(insn->code) == BPF_X)
- dst_reg->imm += regs[insn->src_reg].imm;
- else
- dst_reg->imm += insn->imm;
- return 0;
- } else if (opcode == BPF_ADD &&
- BPF_CLASS(insn->code) == BPF_ALU64 &&
- (dst_reg->type == PTR_TO_PACKET ||
- (BPF_SRC(insn->code) == BPF_X &&
- regs[insn->src_reg].type == PTR_TO_PACKET))) {
- /* ptr_to_packet += K|X */
- return check_packet_ptr_add(env, insn);
- } else if (BPF_CLASS(insn->code) == BPF_ALU64 &&
- dst_reg->type == UNKNOWN_VALUE &&
- env->allow_ptr_leaks) {
- /* unknown += K|X */
- return evaluate_reg_alu(env, insn);
- } else if (BPF_CLASS(insn->code) == BPF_ALU64 &&
- dst_reg->type == CONST_IMM &&
- env->allow_ptr_leaks) {
- /* reg_imm += K|X */
- return evaluate_reg_imm_alu(env, insn);
- } else if (is_pointer_value(env, insn->dst_reg)) {
- verbose("R%d pointer arithmetic prohibited\n",
- insn->dst_reg);
- return -EACCES;
- } else if (BPF_SRC(insn->code) == BPF_X &&
- is_pointer_value(env, insn->src_reg)) {
- verbose("R%d pointer arithmetic prohibited\n",
- insn->src_reg);
- return -EACCES;
- }
-
- /* If we did pointer math on a map value then just set it to our
- * PTR_TO_MAP_VALUE_ADJ type so we can deal with any stores or
- * loads to this register appropriately, otherwise just mark the
- * register as unknown.
- */
- if (env->allow_ptr_leaks &&
- BPF_CLASS(insn->code) == BPF_ALU64 && opcode == BPF_ADD &&
- (dst_reg->type == PTR_TO_MAP_VALUE ||
- dst_reg->type == PTR_TO_MAP_VALUE_ADJ))
- dst_reg->type = PTR_TO_MAP_VALUE_ADJ;
- else
- mark_reg_unknown_value(regs, insn->dst_reg);
+ return adjust_reg_min_max_vals(env, insn);
}
return 0;
@@ -2076,27 +2425,48 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
struct bpf_reg_state *regs = state->regs, *reg;
int i;
- /* LLVM can generate two kind of checks:
+ if (dst_reg->off < 0)
+ /* This doesn't give us any range */
+ return;
+
+ if (dst_reg->umax_value > MAX_PACKET_OFF ||
+ dst_reg->umax_value + dst_reg->off > MAX_PACKET_OFF)
+ /* Risk of overflow. For instance, ptr + (1<<63) may be less
+ * than pkt_end, but that's because it's also less than pkt.
+ */
+ return;
+
+ /* LLVM can generate four kind of checks:
*
- * Type 1:
+ * Type 1/2:
*
* r2 = r3;
* r2 += 8;
* if (r2 > pkt_end) goto <handle exception>
* <access okay>
*
+ * r2 = r3;
+ * r2 += 8;
+ * if (r2 < pkt_end) goto <access okay>
+ * <handle exception>
+ *
* Where:
* r2 == dst_reg, pkt_end == src_reg
* r2=pkt(id=n,off=8,r=0)
* r3=pkt(id=n,off=0,r=0)
*
- * Type 2:
+ * Type 3/4:
*
* r2 = r3;
* r2 += 8;
* if (pkt_end >= r2) goto <access okay>
* <handle exception>
*
+ * r2 = r3;
+ * r2 += 8;
+ * if (pkt_end <= r2) goto <handle exception>
+ * <access okay>
+ *
* Where:
* pkt_end == dst_reg, r2 == src_reg
* r2=pkt(id=n,off=8,r=0)
@@ -2106,135 +2476,247 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
* so that range of bytes [r3, r3 + 8) is safe to access.
*/
+ /* If our ids match, then we must have the same max_value. And we
+ * don't care about the other reg's fixed offset, since if it's too big
+ * the range won't allow anything.
+ * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
+ */
for (i = 0; i < MAX_BPF_REG; i++)
if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id)
/* keep the maximum range already checked */
- regs[i].range = max(regs[i].range, dst_reg->off);
+ regs[i].range = max_t(u16, regs[i].range, dst_reg->off);
for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
if (state->stack_slot_type[i] != STACK_SPILL)
continue;
reg = &state->spilled_regs[i / BPF_REG_SIZE];
if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id)
- reg->range = max(reg->range, dst_reg->off);
+ reg->range = max_t(u16, reg->range, dst_reg->off);
}
}
/* Adjusts the register min/max values in the case that the dst_reg is the
* variable register that we are working on, and src_reg is a constant or we're
* simply doing a BPF_K check.
+ * In JEQ/JNE cases we also adjust the var_off values.
*/
static void reg_set_min_max(struct bpf_reg_state *true_reg,
struct bpf_reg_state *false_reg, u64 val,
u8 opcode)
{
+ /* If the dst_reg is a pointer, we can't learn anything about its
+ * variable offset from the compare (unless src_reg were a pointer into
+ * the same object, but we don't bother with that.
+ * Since false_reg and true_reg have the same type by construction, we
+ * only need to check one of them for pointerness.
+ */
+ if (__is_pointer_value(false, false_reg))
+ return;
+
switch (opcode) {
case BPF_JEQ:
/* If this is false then we know nothing Jon Snow, but if it is
* true then we know for sure.
*/
- true_reg->max_value = true_reg->min_value = val;
+ __mark_reg_known(true_reg, val);
break;
case BPF_JNE:
/* If this is true we know nothing Jon Snow, but if it is false
* we know the value for sure;
*/
- false_reg->max_value = false_reg->min_value = val;
+ __mark_reg_known(false_reg, val);
break;
case BPF_JGT:
- /* Unsigned comparison, the minimum value is 0. */
- false_reg->min_value = 0;
- /* fallthrough */
+ false_reg->umax_value = min(false_reg->umax_value, val);
+ true_reg->umin_value = max(true_reg->umin_value, val + 1);
+ break;
case BPF_JSGT:
- /* If this is false then we know the maximum val is val,
- * otherwise we know the min val is val+1.
- */
- false_reg->max_value = val;
- true_reg->min_value = val + 1;
+ false_reg->smax_value = min_t(s64, false_reg->smax_value, val);
+ true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1);
+ break;
+ case BPF_JLT:
+ false_reg->umin_value = max(false_reg->umin_value, val);
+ true_reg->umax_value = min(true_reg->umax_value, val - 1);
+ break;
+ case BPF_JSLT:
+ false_reg->smin_value = max_t(s64, false_reg->smin_value, val);
+ true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1);
break;
case BPF_JGE:
- /* Unsigned comparison, the minimum value is 0. */
- false_reg->min_value = 0;
- /* fallthrough */
+ false_reg->umax_value = min(false_reg->umax_value, val - 1);
+ true_reg->umin_value = max(true_reg->umin_value, val);
+ break;
case BPF_JSGE:
- /* If this is false then we know the maximum value is val - 1,
- * otherwise we know the mimimum value is val.
- */
- false_reg->max_value = val - 1;
- true_reg->min_value = val;
+ false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1);
+ true_reg->smin_value = max_t(s64, true_reg->smin_value, val);
+ break;
+ case BPF_JLE:
+ false_reg->umin_value = max(false_reg->umin_value, val + 1);
+ true_reg->umax_value = min(true_reg->umax_value, val);
+ break;
+ case BPF_JSLE:
+ false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1);
+ true_reg->smax_value = min_t(s64, true_reg->smax_value, val);
break;
default:
break;
}
- check_reg_overflow(false_reg);
- check_reg_overflow(true_reg);
+ __reg_deduce_bounds(false_reg);
+ __reg_deduce_bounds(true_reg);
+ /* We might have learned some bits from the bounds. */
+ __reg_bound_offset(false_reg);
+ __reg_bound_offset(true_reg);
+ /* Intersecting with the old var_off might have improved our bounds
+ * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
+ * then new var_off is (0; 0x7f...fc) which improves our umax.
+ */
+ __update_reg_bounds(false_reg);
+ __update_reg_bounds(true_reg);
}
-/* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg
- * is the variable reg.
+/* Same as above, but for the case that dst_reg holds a constant and src_reg is
+ * the variable reg.
*/
static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
struct bpf_reg_state *false_reg, u64 val,
u8 opcode)
{
+ if (__is_pointer_value(false, false_reg))
+ return;
+
switch (opcode) {
case BPF_JEQ:
/* If this is false then we know nothing Jon Snow, but if it is
* true then we know for sure.
*/
- true_reg->max_value = true_reg->min_value = val;
+ __mark_reg_known(true_reg, val);
break;
case BPF_JNE:
/* If this is true we know nothing Jon Snow, but if it is false
* we know the value for sure;
*/
- false_reg->max_value = false_reg->min_value = val;
+ __mark_reg_known(false_reg, val);
break;
case BPF_JGT:
- /* Unsigned comparison, the minimum value is 0. */
- true_reg->min_value = 0;
- /* fallthrough */
+ true_reg->umax_value = min(true_reg->umax_value, val - 1);
+ false_reg->umin_value = max(false_reg->umin_value, val);
+ break;
case BPF_JSGT:
- /*
- * If this is false, then the val is <= the register, if it is
- * true the register <= to the val.
- */
- false_reg->min_value = val;
- true_reg->max_value = val - 1;
+ true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1);
+ false_reg->smin_value = max_t(s64, false_reg->smin_value, val);
+ break;
+ case BPF_JLT:
+ true_reg->umin_value = max(true_reg->umin_value, val + 1);
+ false_reg->umax_value = min(false_reg->umax_value, val);
+ break;
+ case BPF_JSLT:
+ true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1);
+ false_reg->smax_value = min_t(s64, false_reg->smax_value, val);
break;
case BPF_JGE:
- /* Unsigned comparison, the minimum value is 0. */
- true_reg->min_value = 0;
- /* fallthrough */
+ true_reg->umax_value = min(true_reg->umax_value, val);
+ false_reg->umin_value = max(false_reg->umin_value, val + 1);
+ break;
case BPF_JSGE:
- /* If this is false then constant < register, if it is true then
- * the register < constant.
- */
- false_reg->min_value = val + 1;
- true_reg->max_value = val;
+ true_reg->smax_value = min_t(s64, true_reg->smax_value, val);
+ false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1);
+ break;
+ case BPF_JLE:
+ true_reg->umin_value = max(true_reg->umin_value, val);
+ false_reg->umax_value = min(false_reg->umax_value, val - 1);
+ break;
+ case BPF_JSLE:
+ true_reg->smin_value = max_t(s64, true_reg->smin_value, val);
+ false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1);
break;
default:
break;
}
- check_reg_overflow(false_reg);
- check_reg_overflow(true_reg);
+ __reg_deduce_bounds(false_reg);
+ __reg_deduce_bounds(true_reg);
+ /* We might have learned some bits from the bounds. */
+ __reg_bound_offset(false_reg);
+ __reg_bound_offset(true_reg);
+ /* Intersecting with the old var_off might have improved our bounds
+ * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
+ * then new var_off is (0; 0x7f...fc) which improves our umax.
+ */
+ __update_reg_bounds(false_reg);
+ __update_reg_bounds(true_reg);
+}
+
+/* Regs are known to be equal, so intersect their min/max/var_off */
+static void __reg_combine_min_max(struct bpf_reg_state *src_reg,
+ struct bpf_reg_state *dst_reg)
+{
+ src_reg->umin_value = dst_reg->umin_value = max(src_reg->umin_value,
+ dst_reg->umin_value);
+ src_reg->umax_value = dst_reg->umax_value = min(src_reg->umax_value,
+ dst_reg->umax_value);
+ src_reg->smin_value = dst_reg->smin_value = max(src_reg->smin_value,
+ dst_reg->smin_value);
+ src_reg->smax_value = dst_reg->smax_value = min(src_reg->smax_value,
+ dst_reg->smax_value);
+ src_reg->var_off = dst_reg->var_off = tnum_intersect(src_reg->var_off,
+ dst_reg->var_off);
+ /* We might have learned new bounds from the var_off. */
+ __update_reg_bounds(src_reg);
+ __update_reg_bounds(dst_reg);
+ /* We might have learned something about the sign bit. */
+ __reg_deduce_bounds(src_reg);
+ __reg_deduce_bounds(dst_reg);
+ /* We might have learned some bits from the bounds. */
+ __reg_bound_offset(src_reg);
+ __reg_bound_offset(dst_reg);
+ /* Intersecting with the old var_off might have improved our bounds
+ * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
+ * then new var_off is (0; 0x7f...fc) which improves our umax.
+ */
+ __update_reg_bounds(src_reg);
+ __update_reg_bounds(dst_reg);
+}
+
+static void reg_combine_min_max(struct bpf_reg_state *true_src,
+ struct bpf_reg_state *true_dst,
+ struct bpf_reg_state *false_src,
+ struct bpf_reg_state *false_dst,
+ u8 opcode)
+{
+ switch (opcode) {
+ case BPF_JEQ:
+ __reg_combine_min_max(true_src, true_dst);
+ break;
+ case BPF_JNE:
+ __reg_combine_min_max(false_src, false_dst);
+ break;
+ }
}
static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
- enum bpf_reg_type type)
+ bool is_null)
{
struct bpf_reg_state *reg = &regs[regno];
if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) {
- if (type == UNKNOWN_VALUE) {
- __mark_reg_unknown_value(regs, regno);
+ /* Old offset (both fixed and variable parts) should
+ * have been known-zero, because we don't allow pointer
+ * arithmetic on pointers that might be NULL.
+ */
+ if (WARN_ON_ONCE(reg->smin_value || reg->smax_value ||
+ !tnum_equals_const(reg->var_off, 0) ||
+ reg->off)) {
+ __mark_reg_known_zero(reg);
+ reg->off = 0;
+ }
+ if (is_null) {
+ reg->type = SCALAR_VALUE;
} else if (reg->map_ptr->inner_map_meta) {
reg->type = CONST_PTR_TO_MAP;
reg->map_ptr = reg->map_ptr->inner_map_meta;
} else {
- reg->type = type;
+ reg->type = PTR_TO_MAP_VALUE;
}
/* We don't need id from this point onwards anymore, thus we
* should better reset it, so that state pruning has chances
@@ -2248,19 +2730,19 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
* be folded together at some point.
*/
static void mark_map_regs(struct bpf_verifier_state *state, u32 regno,
- enum bpf_reg_type type)
+ bool is_null)
{
struct bpf_reg_state *regs = state->regs;
u32 id = regs[regno].id;
int i;
for (i = 0; i < MAX_BPF_REG; i++)
- mark_map_reg(regs, i, id, type);
+ mark_map_reg(regs, i, id, is_null);
for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
if (state->stack_slot_type[i] != STACK_SPILL)
continue;
- mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, id, type);
+ mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, id, is_null);
}
}
@@ -2272,7 +2754,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
u8 opcode = BPF_OP(insn->code);
int err;
- if (opcode > BPF_EXIT) {
+ if (opcode > BPF_JSLE) {
verbose("invalid BPF_JMP opcode %x\n", opcode);
return -EINVAL;
}
@@ -2284,7 +2766,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
}
/* check src1 operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
@@ -2301,7 +2783,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
}
/* check src2 operand */
- err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
return err;
@@ -2310,7 +2792,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
/* detect if R == 0 where R was initialized to zero earlier */
if (BPF_SRC(insn->code) == BPF_K &&
(opcode == BPF_JEQ || opcode == BPF_JNE) &&
- dst_reg->type == CONST_IMM && dst_reg->imm == insn->imm) {
+ dst_reg->type == SCALAR_VALUE &&
+ tnum_equals_const(dst_reg->var_off, insn->imm)) {
if (opcode == BPF_JEQ) {
/* if (imm == imm) goto pc+off;
* only follow the goto, ignore fall-through
@@ -2332,17 +2815,30 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
/* detect if we are comparing against a constant value so we can adjust
* our min/max values for our dst register.
+ * this is only legit if both are scalars (or pointers to the same
+ * object, I suppose, but we don't support that right now), because
+ * otherwise the different base pointers mean the offsets aren't
+ * comparable.
*/
if (BPF_SRC(insn->code) == BPF_X) {
- if (regs[insn->src_reg].type == CONST_IMM)
- reg_set_min_max(&other_branch->regs[insn->dst_reg],
- dst_reg, regs[insn->src_reg].imm,
- opcode);
- else if (dst_reg->type == CONST_IMM)
- reg_set_min_max_inv(&other_branch->regs[insn->src_reg],
- &regs[insn->src_reg], dst_reg->imm,
- opcode);
- } else {
+ if (dst_reg->type == SCALAR_VALUE &&
+ regs[insn->src_reg].type == SCALAR_VALUE) {
+ if (tnum_is_const(regs[insn->src_reg].var_off))
+ reg_set_min_max(&other_branch->regs[insn->dst_reg],
+ dst_reg, regs[insn->src_reg].var_off.value,
+ opcode);
+ else if (tnum_is_const(dst_reg->var_off))
+ reg_set_min_max_inv(&other_branch->regs[insn->src_reg],
+ &regs[insn->src_reg],
+ dst_reg->var_off.value, opcode);
+ else if (opcode == BPF_JEQ || opcode == BPF_JNE)
+ /* Comparing for equality, we can combine knowledge */
+ reg_combine_min_max(&other_branch->regs[insn->src_reg],
+ &other_branch->regs[insn->dst_reg],
+ &regs[insn->src_reg],
+ &regs[insn->dst_reg], opcode);
+ }
+ } else if (dst_reg->type == SCALAR_VALUE) {
reg_set_min_max(&other_branch->regs[insn->dst_reg],
dst_reg, insn->imm, opcode);
}
@@ -2354,18 +2850,24 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
/* Mark all identical map registers in each branch as either
* safe or unknown depending R == 0 or R != 0 conditional.
*/
- mark_map_regs(this_branch, insn->dst_reg,
- opcode == BPF_JEQ ? PTR_TO_MAP_VALUE : UNKNOWN_VALUE);
- mark_map_regs(other_branch, insn->dst_reg,
- opcode == BPF_JEQ ? UNKNOWN_VALUE : PTR_TO_MAP_VALUE);
+ mark_map_regs(this_branch, insn->dst_reg, opcode == BPF_JNE);
+ mark_map_regs(other_branch, insn->dst_reg, opcode == BPF_JEQ);
} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
dst_reg->type == PTR_TO_PACKET &&
regs[insn->src_reg].type == PTR_TO_PACKET_END) {
find_good_pkt_pointers(this_branch, dst_reg);
+ } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
+ dst_reg->type == PTR_TO_PACKET &&
+ regs[insn->src_reg].type == PTR_TO_PACKET_END) {
+ find_good_pkt_pointers(other_branch, dst_reg);
} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
dst_reg->type == PTR_TO_PACKET_END &&
regs[insn->src_reg].type == PTR_TO_PACKET) {
find_good_pkt_pointers(other_branch, &regs[insn->src_reg]);
+ } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
+ dst_reg->type == PTR_TO_PACKET_END &&
+ regs[insn->src_reg].type == PTR_TO_PACKET) {
+ find_good_pkt_pointers(this_branch, &regs[insn->src_reg]);
} else if (is_pointer_value(env, insn->dst_reg)) {
verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
return -EACCES;
@@ -2398,15 +2900,15 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
return -EINVAL;
}
- err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ err = check_reg_arg(env, insn->dst_reg, DST_OP);
if (err)
return err;
if (insn->src_reg == 0) {
u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
- regs[insn->dst_reg].type = CONST_IMM;
- regs[insn->dst_reg].imm = imm;
+ regs[insn->dst_reg].type = SCALAR_VALUE;
+ __mark_reg_known(&regs[insn->dst_reg], imm);
return 0;
}
@@ -2464,7 +2966,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check whether implicit source operand (register R6) is readable */
- err = check_reg_arg(regs, BPF_REG_6, SRC_OP);
+ err = check_reg_arg(env, BPF_REG_6, SRC_OP);
if (err)
return err;
@@ -2475,19 +2977,22 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (mode == BPF_IND) {
/* check explicit source operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
}
/* reset caller saved regs to unreadable */
- for (i = 0; i < CALLER_SAVED_REGS; i++)
+ for (i = 0; i < CALLER_SAVED_REGS; i++) {
mark_reg_not_init(regs, caller_saved[i]);
+ check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
+ }
/* mark destination R0 register as readable, since it contains
- * the value fetched from the packet
+ * the value fetched from the packet.
+ * Already marked as written above.
*/
- regs[BPF_REG_0].type = UNKNOWN_VALUE;
+ mark_reg_unknown(regs, BPF_REG_0);
return 0;
}
@@ -2690,57 +3195,144 @@ err_free:
return ret;
}
-/* the following conditions reduce the number of explored insns
- * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet
+/* check %cur's range satisfies %old's */
+static bool range_within(struct bpf_reg_state *old,
+ struct bpf_reg_state *cur)
+{
+ return old->umin_value <= cur->umin_value &&
+ old->umax_value >= cur->umax_value &&
+ old->smin_value <= cur->smin_value &&
+ old->smax_value >= cur->smax_value;
+}
+
+/* Maximum number of register states that can exist at once */
+#define ID_MAP_SIZE (MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE)
+struct idpair {
+ u32 old;
+ u32 cur;
+};
+
+/* If in the old state two registers had the same id, then they need to have
+ * the same id in the new state as well. But that id could be different from
+ * the old state, so we need to track the mapping from old to new ids.
+ * Once we have seen that, say, a reg with old id 5 had new id 9, any subsequent
+ * regs with old id 5 must also have new id 9 for the new state to be safe. But
+ * regs with a different old id could still have new id 9, we don't care about
+ * that.
+ * So we look through our idmap to see if this old id has been seen before. If
+ * so, we require the new id to match; otherwise, we add the id pair to the map.
*/
-static bool compare_ptrs_to_packet(struct bpf_verifier_env *env,
- struct bpf_reg_state *old,
- struct bpf_reg_state *cur)
+static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap)
{
- if (old->id != cur->id)
- return false;
+ unsigned int i;
+
+ for (i = 0; i < ID_MAP_SIZE; i++) {
+ if (!idmap[i].old) {
+ /* Reached an empty slot; haven't seen this id before */
+ idmap[i].old = old_id;
+ idmap[i].cur = cur_id;
+ return true;
+ }
+ if (idmap[i].old == old_id)
+ return idmap[i].cur == cur_id;
+ }
+ /* We ran out of idmap slots, which should be impossible */
+ WARN_ON_ONCE(1);
+ return false;
+}
- /* old ptr_to_packet is more conservative, since it allows smaller
- * range. Ex:
- * old(off=0,r=10) is equal to cur(off=0,r=20), because
- * old(off=0,r=10) means that with range=10 the verifier proceeded
- * further and found no issues with the program. Now we're in the same
- * spot with cur(off=0,r=20), so we're safe too, since anything further
- * will only be looking at most 10 bytes after this pointer.
- */
- if (old->off == cur->off && old->range < cur->range)
+/* Returns true if (rold safe implies rcur safe) */
+static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
+ struct idpair *idmap)
+{
+ if (!(rold->live & REG_LIVE_READ))
+ /* explored state didn't use this */
return true;
- /* old(off=20,r=10) is equal to cur(off=22,re=22 or 5 or 0)
- * since both cannot be used for packet access and safe(old)
- * pointer has smaller off that could be used for further
- * 'if (ptr > data_end)' check
- * Ex:
- * old(off=20,r=10) and cur(off=22,r=22) and cur(off=22,r=0) mean
- * that we cannot access the packet.
- * The safe range is:
- * [ptr, ptr + range - off)
- * so whenever off >=range, it means no safe bytes from this pointer.
- * When comparing old->off <= cur->off, it means that older code
- * went with smaller offset and that offset was later
- * used to figure out the safe range after 'if (ptr > data_end)' check
- * Say, 'old' state was explored like:
- * ... R3(off=0, r=0)
- * R4 = R3 + 20
- * ... now R4(off=20,r=0) <-- here
- * if (R4 > data_end)
- * ... R4(off=20,r=20), R3(off=0,r=20) and R3 can be used to access.
- * ... the code further went all the way to bpf_exit.
- * Now the 'cur' state at the mark 'here' has R4(off=30,r=0).
- * old_R4(off=20,r=0) equal to cur_R4(off=30,r=0), since if the verifier
- * goes further, such cur_R4 will give larger safe packet range after
- * 'if (R4 > data_end)' and all further insn were already good with r=20,
- * so they will be good with r=30 and we can prune the search.
- */
- if (!env->strict_alignment && old->off <= cur->off &&
- old->off >= old->range && cur->off >= cur->range)
+ if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, live)) == 0)
return true;
+ if (rold->type == NOT_INIT)
+ /* explored state can't have used this */
+ return true;
+ if (rcur->type == NOT_INIT)
+ return false;
+ switch (rold->type) {
+ case SCALAR_VALUE:
+ if (rcur->type == SCALAR_VALUE) {
+ /* new val must satisfy old val knowledge */
+ return range_within(rold, rcur) &&
+ tnum_in(rold->var_off, rcur->var_off);
+ } else {
+ /* if we knew anything about the old value, we're not
+ * equal, because we can't know anything about the
+ * scalar value of the pointer in the new value.
+ */
+ return rold->umin_value == 0 &&
+ rold->umax_value == U64_MAX &&
+ rold->smin_value == S64_MIN &&
+ rold->smax_value == S64_MAX &&
+ tnum_is_unknown(rold->var_off);
+ }
+ case PTR_TO_MAP_VALUE:
+ /* If the new min/max/var_off satisfy the old ones and
+ * everything else matches, we are OK.
+ * We don't care about the 'id' value, because nothing
+ * uses it for PTR_TO_MAP_VALUE (only for ..._OR_NULL)
+ */
+ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
+ range_within(rold, rcur) &&
+ tnum_in(rold->var_off, rcur->var_off);
+ case PTR_TO_MAP_VALUE_OR_NULL:
+ /* a PTR_TO_MAP_VALUE could be safe to use as a
+ * PTR_TO_MAP_VALUE_OR_NULL into the same map.
+ * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL-
+ * checked, doing so could have affected others with the same
+ * id, and we can't check for that because we lost the id when
+ * we converted to a PTR_TO_MAP_VALUE.
+ */
+ if (rcur->type != PTR_TO_MAP_VALUE_OR_NULL)
+ return false;
+ if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)))
+ return false;
+ /* Check our ids match any regs they're supposed to */
+ return check_ids(rold->id, rcur->id, idmap);
+ case PTR_TO_PACKET:
+ if (rcur->type != PTR_TO_PACKET)
+ return false;
+ /* We must have at least as much range as the old ptr
+ * did, so that any accesses which were safe before are
+ * still safe. This is true even if old range < old off,
+ * since someone could have accessed through (ptr - k), or
+ * even done ptr -= k in a register, to get a safe access.
+ */
+ if (rold->range > rcur->range)
+ return false;
+ /* If the offsets don't match, we can't trust our alignment;
+ * nor can we be sure that we won't fall out of range.
+ */
+ if (rold->off != rcur->off)
+ return false;
+ /* id relations must be preserved */
+ if (rold->id && !check_ids(rold->id, rcur->id, idmap))
+ return false;
+ /* new val must satisfy old val knowledge */
+ return range_within(rold, rcur) &&
+ tnum_in(rold->var_off, rcur->var_off);
+ case PTR_TO_CTX:
+ case CONST_PTR_TO_MAP:
+ case PTR_TO_STACK:
+ case PTR_TO_PACKET_END:
+ /* Only valid matches are exact, which memcmp() above
+ * would have accepted
+ */
+ default:
+ /* Don't know what's going on, just say it's not safe */
+ return false;
+ }
+
+ /* Shouldn't get here; if we do, say it's not safe */
+ WARN_ON_ONCE(1);
return false;
}
@@ -2774,44 +3366,18 @@ static bool states_equal(struct bpf_verifier_env *env,
struct bpf_verifier_state *old,
struct bpf_verifier_state *cur)
{
- bool varlen_map_access = env->varlen_map_value_access;
- struct bpf_reg_state *rold, *rcur;
+ struct idpair *idmap;
+ bool ret = false;
int i;
- for (i = 0; i < MAX_BPF_REG; i++) {
- rold = &old->regs[i];
- rcur = &cur->regs[i];
-
- if (memcmp(rold, rcur, sizeof(*rold)) == 0)
- continue;
-
- /* If the ranges were not the same, but everything else was and
- * we didn't do a variable access into a map then we are a-ok.
- */
- if (!varlen_map_access &&
- memcmp(rold, rcur, offsetofend(struct bpf_reg_state, id)) == 0)
- continue;
-
- /* If we didn't map access then again we don't care about the
- * mismatched range values and it's ok if our old type was
- * UNKNOWN and we didn't go to a NOT_INIT'ed reg.
- */
- if (rold->type == NOT_INIT ||
- (!varlen_map_access && rold->type == UNKNOWN_VALUE &&
- rcur->type != NOT_INIT))
- continue;
-
- /* Don't care about the reg->id in this case. */
- if (rold->type == PTR_TO_MAP_VALUE_OR_NULL &&
- rcur->type == PTR_TO_MAP_VALUE_OR_NULL &&
- rold->map_ptr == rcur->map_ptr)
- continue;
-
- if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET &&
- compare_ptrs_to_packet(env, rold, rcur))
- continue;
-
+ idmap = kcalloc(ID_MAP_SIZE, sizeof(struct idpair), GFP_KERNEL);
+ /* If we failed to allocate the idmap, just say it's not safe */
+ if (!idmap)
return false;
+
+ for (i = 0; i < MAX_BPF_REG; i++) {
+ if (!regsafe(&old->regs[i], &cur->regs[i], idmap))
+ goto out_free;
}
for (i = 0; i < MAX_BPF_STACK; i++) {
@@ -2823,33 +3389,104 @@ static bool states_equal(struct bpf_verifier_env *env,
* this verifier states are not equivalent,
* return false to continue verification of this path
*/
- return false;
+ goto out_free;
if (i % BPF_REG_SIZE)
continue;
- if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE],
- &cur->spilled_regs[i / BPF_REG_SIZE],
- sizeof(old->spilled_regs[0])))
- /* when explored and current stack slot types are
- * the same, check that stored pointers types
+ if (old->stack_slot_type[i] != STACK_SPILL)
+ continue;
+ if (!regsafe(&old->spilled_regs[i / BPF_REG_SIZE],
+ &cur->spilled_regs[i / BPF_REG_SIZE],
+ idmap))
+ /* when explored and current stack slot are both storing
+ * spilled registers, check that stored pointers types
* are the same as well.
* Ex: explored safe path could have stored
- * (bpf_reg_state) {.type = PTR_TO_STACK, .imm = -8}
+ * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8}
* but current path has stored:
- * (bpf_reg_state) {.type = PTR_TO_STACK, .imm = -16}
+ * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16}
* such verifier states are not equivalent.
* return false to continue verification of this path
*/
- return false;
+ goto out_free;
else
continue;
}
- return true;
+ ret = true;
+out_free:
+ kfree(idmap);
+ return ret;
+}
+
+/* A write screens off any subsequent reads; but write marks come from the
+ * straight-line code between a state and its parent. When we arrive at a
+ * jump target (in the first iteration of the propagate_liveness() loop),
+ * we didn't arrive by the straight-line code, so read marks in state must
+ * propagate to parent regardless of state's write marks.
+ */
+static bool do_propagate_liveness(const struct bpf_verifier_state *state,
+ struct bpf_verifier_state *parent)
+{
+ bool writes = parent == state->parent; /* Observe write marks */
+ bool touched = false; /* any changes made? */
+ int i;
+
+ if (!parent)
+ return touched;
+ /* Propagate read liveness of registers... */
+ BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
+ /* We don't need to worry about FP liveness because it's read-only */
+ for (i = 0; i < BPF_REG_FP; i++) {
+ if (parent->regs[i].live & REG_LIVE_READ)
+ continue;
+ if (writes && (state->regs[i].live & REG_LIVE_WRITTEN))
+ continue;
+ if (state->regs[i].live & REG_LIVE_READ) {
+ parent->regs[i].live |= REG_LIVE_READ;
+ touched = true;
+ }
+ }
+ /* ... and stack slots */
+ for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++) {
+ if (parent->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL)
+ continue;
+ if (state->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL)
+ continue;
+ if (parent->spilled_regs[i].live & REG_LIVE_READ)
+ continue;
+ if (writes && (state->spilled_regs[i].live & REG_LIVE_WRITTEN))
+ continue;
+ if (state->spilled_regs[i].live & REG_LIVE_READ) {
+ parent->spilled_regs[i].live |= REG_LIVE_READ;
+ touched = true;
+ }
+ }
+ return touched;
+}
+
+/* "parent" is "a state from which we reach the current state", but initially
+ * it is not the state->parent (i.e. "the state whose straight-line code leads
+ * to the current state"), instead it is the state that happened to arrive at
+ * a (prunable) equivalent of the current state. See comment above
+ * do_propagate_liveness() for consequences of this.
+ * This function is just a more efficient way of calling mark_reg_read() or
+ * mark_stack_slot_read() on each reg in "parent" that is read in "state",
+ * though it requires that parent != state->parent in the call arguments.
+ */
+static void propagate_liveness(const struct bpf_verifier_state *state,
+ struct bpf_verifier_state *parent)
+{
+ while (do_propagate_liveness(state, parent)) {
+ /* Something changed, so we need to feed those changes onward */
+ state = parent;
+ parent = state->parent;
+ }
}
static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
{
struct bpf_verifier_state_list *new_sl;
struct bpf_verifier_state_list *sl;
+ int i;
sl = env->explored_states[insn_idx];
if (!sl)
@@ -2859,11 +3496,20 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
return 0;
while (sl != STATE_LIST_MARK) {
- if (states_equal(env, &sl->state, &env->cur_state))
+ if (states_equal(env, &sl->state, &env->cur_state)) {
/* reached equivalent register/stack state,
- * prune the search
+ * prune the search.
+ * Registers read by the continuation are read by us.
+ * If we have any write marks in env->cur_state, they
+ * will prevent corresponding reads in the continuation
+ * from reaching our parent (an explored_state). Our
+ * own state will get the read marks recorded, but
+ * they'll be immediately forgotten as we're pruning
+ * this state and will pop a new one.
*/
+ propagate_liveness(&sl->state, &env->cur_state);
return 1;
+ }
sl = sl->next;
}
@@ -2881,6 +3527,19 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state));
new_sl->next = env->explored_states[insn_idx];
env->explored_states[insn_idx] = new_sl;
+ /* connect new state to parentage chain */
+ env->cur_state.parent = &new_sl->state;
+ /* clear write marks in current state: the writes we did are not writes
+ * our child did, so they don't screen off its reads from us.
+ * (There are no read marks in current state, because reads always mark
+ * their parent and current state never has children yet. Only
+ * explored_states can get read marks.)
+ */
+ for (i = 0; i < BPF_REG_FP; i++)
+ env->cur_state.regs[i].live = REG_LIVE_NONE;
+ for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++)
+ if (env->cur_state.stack_slot_type[i * BPF_REG_SIZE] == STACK_SPILL)
+ env->cur_state.spilled_regs[i].live = REG_LIVE_NONE;
return 0;
}
@@ -2904,8 +3563,8 @@ static int do_check(struct bpf_verifier_env *env)
bool do_print_state = false;
init_reg_state(regs);
+ state->parent = NULL;
insn_idx = 0;
- env->varlen_map_value_access = false;
for (;;) {
struct bpf_insn *insn;
u8 class;
@@ -2974,11 +3633,11 @@ static int do_check(struct bpf_verifier_env *env)
/* check for reserved fields is already done */
/* check src operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
- err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK);
+ err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
if (err)
return err;
@@ -2987,18 +3646,12 @@ static int do_check(struct bpf_verifier_env *env)
/* check that memory (src_reg + off) is readable,
* the state of dst_reg will be updated by this func
*/
- err = check_mem_access(env, insn->src_reg, insn->off,
+ err = check_mem_access(env, insn_idx, insn->src_reg, insn->off,
BPF_SIZE(insn->code), BPF_READ,
insn->dst_reg);
if (err)
return err;
- if (BPF_SIZE(insn->code) != BPF_W &&
- BPF_SIZE(insn->code) != BPF_DW) {
- insn_idx++;
- continue;
- }
-
prev_src_type = &env->insn_aux_data[insn_idx].ptr_type;
if (*prev_src_type == NOT_INIT) {
@@ -3026,7 +3679,7 @@ static int do_check(struct bpf_verifier_env *env)
enum bpf_reg_type *prev_dst_type, dst_reg_type;
if (BPF_MODE(insn->code) == BPF_XADD) {
- err = check_xadd(env, insn);
+ err = check_xadd(env, insn_idx, insn);
if (err)
return err;
insn_idx++;
@@ -3034,18 +3687,18 @@ static int do_check(struct bpf_verifier_env *env)
}
/* check src1 operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
/* check src2 operand */
- err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
return err;
dst_reg_type = regs[insn->dst_reg].type;
/* check that memory (dst_reg + off) is writeable */
- err = check_mem_access(env, insn->dst_reg, insn->off,
+ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_WRITE,
insn->src_reg);
if (err)
@@ -3069,12 +3722,12 @@ static int do_check(struct bpf_verifier_env *env)
return -EINVAL;
}
/* check src operand */
- err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
return err;
/* check that memory (dst_reg + off) is writeable */
- err = check_mem_access(env, insn->dst_reg, insn->off,
+ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_WRITE,
-1);
if (err)
@@ -3123,7 +3776,7 @@ static int do_check(struct bpf_verifier_env *env)
* of bpf_exit, which means that program wrote
* something into it earlier
*/
- err = check_reg_arg(regs, BPF_REG_0, SRC_OP);
+ err = check_reg_arg(env, BPF_REG_0, SRC_OP);
if (err)
return err;
@@ -3163,7 +3816,6 @@ process_bpf_exit:
verbose("invalid BPF_LD mode\n");
return -EINVAL;
}
- reset_reg_range_values(regs, insn->dst_reg);
} else {
verbose("unknown insn class %d\n", class);
return -EINVAL;
@@ -3172,7 +3824,8 @@ process_bpf_exit:
insn_idx++;
}
- verbose("processed %d insns\n", insn_processed);
+ verbose("processed %d insns, stack depth %d\n",
+ insn_processed, env->prog->aux->stack_depth);
return 0;
}
@@ -3372,11 +4025,13 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
static int convert_ctx_accesses(struct bpf_verifier_env *env)
{
const struct bpf_verifier_ops *ops = env->prog->aux->ops;
+ int i, cnt, size, ctx_field_size, delta = 0;
const int insn_cnt = env->prog->len;
struct bpf_insn insn_buf[16], *insn;
struct bpf_prog *new_prog;
enum bpf_access_type type;
- int i, cnt, delta = 0;
+ bool is_narrower_load;
+ u32 target_size;
if (ops->gen_prologue) {
cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
@@ -3416,12 +4071,52 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX)
continue;
- cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog);
- if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
+ ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size;
+ size = BPF_LDST_BYTES(insn);
+
+ /* If the read access is a narrower load of the field,
+ * convert to a 4/8-byte load, to minimum program type specific
+ * convert_ctx_access changes. If conversion is successful,
+ * we will apply proper mask to the result.
+ */
+ is_narrower_load = size < ctx_field_size;
+ if (is_narrower_load) {
+ u32 off = insn->off;
+ u8 size_code;
+
+ if (type == BPF_WRITE) {
+ verbose("bpf verifier narrow ctx access misconfigured\n");
+ return -EINVAL;
+ }
+
+ size_code = BPF_H;
+ if (ctx_field_size == 4)
+ size_code = BPF_W;
+ else if (ctx_field_size == 8)
+ size_code = BPF_DW;
+
+ insn->off = off & ~(ctx_field_size - 1);
+ insn->code = BPF_LDX | BPF_MEM | size_code;
+ }
+
+ target_size = 0;
+ cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog,
+ &target_size);
+ if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) ||
+ (ctx_field_size && !target_size)) {
verbose("bpf verifier is misconfigured\n");
return -EINVAL;
}
+ if (is_narrower_load && size < target_size) {
+ if (ctx_field_size <= 4)
+ insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
+ (1 << size * 8) - 1);
+ else
+ insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg,
+ (1 << size * 8) - 1);
+ }
+
new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
if (!new_prog)
return -ENOMEM;
@@ -3467,6 +4162,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
* the program array.
*/
prog->cb_access = 1;
+ env->prog->aux->stack_depth = MAX_BPF_STACK;
/* mark bpf_tail_call as different opcode to avoid
* conditional branch in the interpeter for every normal
@@ -3474,11 +4170,15 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
* that doesn't support bpf_tail_call yet
*/
insn->imm = 0;
- insn->code |= BPF_X;
+ insn->code = BPF_JMP | BPF_TAIL_CALL;
continue;
}
- if (ebpf_jit_enabled() && insn->imm == BPF_FUNC_map_lookup_elem) {
+ /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
+ * handlers are currently limited to 64 bit only.
+ */
+ if (ebpf_jit_enabled() && BITS_PER_LONG == 64 &&
+ insn->imm == BPF_FUNC_map_lookup_elem) {
map_ptr = env->insn_aux_data[i + delta].map_ptr;
if (map_ptr == BPF_MAP_PTR_POISON ||
!map_ptr->ops->map_gen_lookup)
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index 387348a40c64..ce693ccb8c58 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
obj-$(CONFIG_CGROUP_PIDS) += pids.o
obj-$(CONFIG_CGROUP_RDMA) += rdma.o
obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_CGROUP_DEBUG) += debug.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 00f4d6bf048f..5151ff256c29 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -33,6 +33,9 @@ struct cgroup_taskset {
struct list_head src_csets;
struct list_head dst_csets;
+ /* the number of tasks in the set */
+ int nr_tasks;
+
/* the subsys currently being processed */
int ssid;
@@ -153,6 +156,8 @@ static inline void get_css_set(struct css_set *cset)
bool cgroup_ssid_enabled(int ssid);
bool cgroup_on_dfl(const struct cgroup *cgrp);
+bool cgroup_is_thread_root(struct cgroup *cgrp);
+bool cgroup_is_threaded(struct cgroup *cgrp);
struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root);
struct cgroup *task_cgroup_from_root(struct task_struct *task,
@@ -170,7 +175,7 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
struct cgroup_root *root, unsigned long magic,
struct cgroup_namespace *ns);
-bool cgroup_may_migrate_to(struct cgroup *dst_cgrp);
+int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp);
void cgroup_migrate_finish(struct cgroup_mgctx *mgctx);
void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp,
struct cgroup_mgctx *mgctx);
@@ -180,10 +185,10 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
bool threadgroup);
-ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
- size_t nbytes, loff_t off, bool threadgroup);
-ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
- loff_t off);
+struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
+ __acquires(&cgroup_threadgroup_rwsem);
+void cgroup_procs_write_finish(struct task_struct *task)
+ __releases(&cgroup_threadgroup_rwsem);
void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
@@ -192,6 +197,8 @@ int cgroup_rmdir(struct kernfs_node *kn);
int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
struct kernfs_root *kf_root);
+int cgroup_task_count(const struct cgroup *cgrp);
+
/*
* namespace.c
*/
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 85d75152402d..024085daab1a 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -99,8 +99,9 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
if (cgroup_on_dfl(to))
return -EINVAL;
- if (!cgroup_may_migrate_to(to))
- return -EBUSY;
+ ret = cgroup_migrate_vet_dst(to);
+ if (ret)
+ return ret;
mutex_lock(&cgroup_mutex);
@@ -121,7 +122,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
* ->can_attach() fails.
*/
do {
- css_task_iter_start(&from->self, &it);
+ css_task_iter_start(&from->self, 0, &it);
task = css_task_iter_next(&it);
if (task)
get_task_struct(task);
@@ -334,19 +335,15 @@ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
/**
* cgroup_task_count - count the number of tasks in a cgroup.
* @cgrp: the cgroup in question
- *
- * Return the number of tasks in the cgroup. The returned number can be
- * higher than the actual number of tasks due to css_set references from
- * namespace roots and temporary usages.
*/
-static int cgroup_task_count(const struct cgroup *cgrp)
+int cgroup_task_count(const struct cgroup *cgrp)
{
int count = 0;
struct cgrp_cset_link *link;
spin_lock_irq(&css_set_lock);
list_for_each_entry(link, &cgrp->cset_links, cset_link)
- count += refcount_read(&link->cset->refcount);
+ count += link->cset->nr_tasks;
spin_unlock_irq(&css_set_lock);
return count;
}
@@ -377,7 +374,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
if (!array)
return -ENOMEM;
/* now, populate the array */
- css_task_iter_start(&cgrp->self, &it);
+ css_task_iter_start(&cgrp->self, 0, &it);
while ((tsk = css_task_iter_next(&it))) {
if (unlikely(n == length))
break;
@@ -514,10 +511,58 @@ static int cgroup_pidlist_show(struct seq_file *s, void *v)
return 0;
}
-static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
+static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off,
+ bool threadgroup)
{
- return __cgroup_procs_write(of, buf, nbytes, off, false);
+ struct cgroup *cgrp;
+ struct task_struct *task;
+ const struct cred *cred, *tcred;
+ ssize_t ret;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENODEV;
+
+ task = cgroup_procs_write_start(buf, threadgroup);
+ ret = PTR_ERR_OR_ZERO(task);
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * Even if we're attaching all tasks in the thread group, we only
+ * need to check permissions on one of them.
+ */
+ cred = current_cred();
+ tcred = get_task_cred(task);
+ if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
+ !uid_eq(cred->euid, tcred->uid) &&
+ !uid_eq(cred->euid, tcred->suid))
+ ret = -EACCES;
+ put_cred(tcred);
+ if (ret)
+ goto out_finish;
+
+ ret = cgroup_attach_task(cgrp, task, threadgroup);
+
+out_finish:
+ cgroup_procs_write_finish(task);
+out_unlock:
+ cgroup_kn_unlock(of->kn);
+
+ return ret ?: nbytes;
+}
+
+static ssize_t cgroup1_procs_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return __cgroup1_procs_write(of, buf, nbytes, off, true);
+}
+
+static ssize_t cgroup1_tasks_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return __cgroup1_procs_write(of, buf, nbytes, off, false);
}
static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
@@ -596,7 +641,7 @@ struct cftype cgroup1_base_files[] = {
.seq_stop = cgroup_pidlist_stop,
.seq_show = cgroup_pidlist_show,
.private = CGROUP_FILE_PROCS,
- .write = cgroup_procs_write,
+ .write = cgroup1_procs_write,
},
{
.name = "cgroup.clone_children",
@@ -615,7 +660,7 @@ struct cftype cgroup1_base_files[] = {
.seq_stop = cgroup_pidlist_stop,
.seq_show = cgroup_pidlist_show,
.private = CGROUP_FILE_TASKS,
- .write = cgroup_tasks_write,
+ .write = cgroup1_tasks_write,
},
{
.name = "notify_on_release",
@@ -705,7 +750,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
}
rcu_read_unlock();
- css_task_iter_start(&cgrp->self, &it);
+ css_task_iter_start(&cgrp->self, 0, &it);
while ((tsk = css_task_iter_next(&it))) {
switch (tsk->state) {
case TASK_RUNNING:
@@ -850,6 +895,8 @@ static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_roo
seq_puts(seq, ",noprefix");
if (root->flags & CGRP_ROOT_XATTR)
seq_puts(seq, ",xattr");
+ if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)
+ seq_puts(seq, ",cpuset_v2_mode");
spin_lock(&release_agent_path_lock);
if (strlen(root->release_agent_path))
@@ -904,6 +951,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
opts->cpuset_clone_children = true;
continue;
}
+ if (!strcmp(token, "cpuset_v2_mode")) {
+ opts->flags |= CGRP_ROOT_CPUSET_V2_MODE;
+ continue;
+ }
if (!strcmp(token, "xattr")) {
opts->flags |= CGRP_ROOT_XATTR;
continue;
@@ -1263,150 +1314,3 @@ static int __init cgroup_no_v1(char *str)
return 1;
}
__setup("cgroup_no_v1=", cgroup_no_v1);
-
-
-#ifdef CONFIG_CGROUP_DEBUG
-static struct cgroup_subsys_state *
-debug_css_alloc(struct cgroup_subsys_state *parent_css)
-{
- struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
-
- if (!css)
- return ERR_PTR(-ENOMEM);
-
- return css;
-}
-
-static void debug_css_free(struct cgroup_subsys_state *css)
-{
- kfree(css);
-}
-
-static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- return cgroup_task_count(css->cgroup);
-}
-
-static u64 current_css_set_read(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- return (u64)(unsigned long)current->cgroups;
-}
-
-static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- u64 count;
-
- rcu_read_lock();
- count = refcount_read(&task_css_set(current)->refcount);
- rcu_read_unlock();
- return count;
-}
-
-static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
-{
- struct cgrp_cset_link *link;
- struct css_set *cset;
- char *name_buf;
-
- name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
- if (!name_buf)
- return -ENOMEM;
-
- spin_lock_irq(&css_set_lock);
- rcu_read_lock();
- cset = rcu_dereference(current->cgroups);
- list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
- struct cgroup *c = link->cgrp;
-
- cgroup_name(c, name_buf, NAME_MAX + 1);
- seq_printf(seq, "Root %d group %s\n",
- c->root->hierarchy_id, name_buf);
- }
- rcu_read_unlock();
- spin_unlock_irq(&css_set_lock);
- kfree(name_buf);
- return 0;
-}
-
-#define MAX_TASKS_SHOWN_PER_CSS 25
-static int cgroup_css_links_read(struct seq_file *seq, void *v)
-{
- struct cgroup_subsys_state *css = seq_css(seq);
- struct cgrp_cset_link *link;
-
- spin_lock_irq(&css_set_lock);
- list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
- struct css_set *cset = link->cset;
- struct task_struct *task;
- int count = 0;
-
- seq_printf(seq, "css_set %pK\n", cset);
-
- list_for_each_entry(task, &cset->tasks, cg_list) {
- if (count++ > MAX_TASKS_SHOWN_PER_CSS)
- goto overflow;
- seq_printf(seq, " task %d\n", task_pid_vnr(task));
- }
-
- list_for_each_entry(task, &cset->mg_tasks, cg_list) {
- if (count++ > MAX_TASKS_SHOWN_PER_CSS)
- goto overflow;
- seq_printf(seq, " task %d\n", task_pid_vnr(task));
- }
- continue;
- overflow:
- seq_puts(seq, " ...\n");
- }
- spin_unlock_irq(&css_set_lock);
- return 0;
-}
-
-static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
-{
- return (!cgroup_is_populated(css->cgroup) &&
- !css_has_online_children(&css->cgroup->self));
-}
-
-static struct cftype debug_files[] = {
- {
- .name = "taskcount",
- .read_u64 = debug_taskcount_read,
- },
-
- {
- .name = "current_css_set",
- .read_u64 = current_css_set_read,
- },
-
- {
- .name = "current_css_set_refcount",
- .read_u64 = current_css_set_refcount_read,
- },
-
- {
- .name = "current_css_set_cg_links",
- .seq_show = current_css_set_cg_links_read,
- },
-
- {
- .name = "cgroup_css_links",
- .seq_show = cgroup_css_links_read,
- },
-
- {
- .name = "releasable",
- .read_u64 = releasable_read,
- },
-
- { } /* terminate */
-};
-
-struct cgroup_subsys debug_cgrp_subsys = {
- .css_alloc = debug_css_alloc,
- .css_free = debug_css_free,
- .legacy_cftypes = debug_files,
-};
-#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 8d4e85eae42c..d6551cd45238 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -162,6 +162,9 @@ static u16 cgrp_dfl_inhibit_ss_mask;
/* some controllers are implicitly enabled on the default hierarchy */
static u16 cgrp_dfl_implicit_ss_mask;
+/* some controllers can be threaded on the default hierarchy */
+static u16 cgrp_dfl_threaded_ss_mask;
+
/* The list of hierarchy roots */
LIST_HEAD(cgroup_roots);
static int cgroup_root_count;
@@ -316,13 +319,87 @@ static void cgroup_idr_remove(struct idr *idr, int id)
spin_unlock_bh(&cgroup_idr_lock);
}
-static struct cgroup *cgroup_parent(struct cgroup *cgrp)
+static bool cgroup_has_tasks(struct cgroup *cgrp)
{
- struct cgroup_subsys_state *parent_css = cgrp->self.parent;
+ return cgrp->nr_populated_csets;
+}
- if (parent_css)
- return container_of(parent_css, struct cgroup, self);
- return NULL;
+bool cgroup_is_threaded(struct cgroup *cgrp)
+{
+ return cgrp->dom_cgrp != cgrp;
+}
+
+/* can @cgrp host both domain and threaded children? */
+static bool cgroup_is_mixable(struct cgroup *cgrp)
+{
+ /*
+ * Root isn't under domain level resource control exempting it from
+ * the no-internal-process constraint, so it can serve as a thread
+ * root and a parent of resource domains at the same time.
+ */
+ return !cgroup_parent(cgrp);
+}
+
+/* can @cgrp become a thread root? should always be true for a thread root */
+static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
+{
+ /* mixables don't care */
+ if (cgroup_is_mixable(cgrp))
+ return true;
+
+ /* domain roots can't be nested under threaded */
+ if (cgroup_is_threaded(cgrp))
+ return false;
+
+ /* can only have either domain or threaded children */
+ if (cgrp->nr_populated_domain_children)
+ return false;
+
+ /* and no domain controllers can be enabled */
+ if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
+ return false;
+
+ return true;
+}
+
+/* is @cgrp root of a threaded subtree? */
+bool cgroup_is_thread_root(struct cgroup *cgrp)
+{
+ /* thread root should be a domain */
+ if (cgroup_is_threaded(cgrp))
+ return false;
+
+ /* a domain w/ threaded children is a thread root */
+ if (cgrp->nr_threaded_children)
+ return true;
+
+ /*
+ * A domain which has tasks and explicit threaded controllers
+ * enabled is a thread root.
+ */
+ if (cgroup_has_tasks(cgrp) &&
+ (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask))
+ return true;
+
+ return false;
+}
+
+/* a domain which isn't connected to the root w/o brekage can't be used */
+static bool cgroup_is_valid_domain(struct cgroup *cgrp)
+{
+ /* the cgroup itself can be a thread root */
+ if (cgroup_is_threaded(cgrp))
+ return false;
+
+ /* but the ancestors can't be unless mixable */
+ while ((cgrp = cgroup_parent(cgrp))) {
+ if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp))
+ return false;
+ if (cgroup_is_threaded(cgrp))
+ return false;
+ }
+
+ return true;
}
/* subsystems visibly enabled on a cgroup */
@@ -331,8 +408,14 @@ static u16 cgroup_control(struct cgroup *cgrp)
struct cgroup *parent = cgroup_parent(cgrp);
u16 root_ss_mask = cgrp->root->subsys_mask;
- if (parent)
- return parent->subtree_control;
+ if (parent) {
+ u16 ss_mask = parent->subtree_control;
+
+ /* threaded cgroups can only have threaded controllers */
+ if (cgroup_is_threaded(cgrp))
+ ss_mask &= cgrp_dfl_threaded_ss_mask;
+ return ss_mask;
+ }
if (cgroup_on_dfl(cgrp))
root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
@@ -345,8 +428,14 @@ static u16 cgroup_ss_mask(struct cgroup *cgrp)
{
struct cgroup *parent = cgroup_parent(cgrp);
- if (parent)
- return parent->subtree_ss_mask;
+ if (parent) {
+ u16 ss_mask = parent->subtree_ss_mask;
+
+ /* threaded cgroups can only have threaded controllers */
+ if (cgroup_is_threaded(cgrp))
+ ss_mask &= cgrp_dfl_threaded_ss_mask;
+ return ss_mask;
+ }
return cgrp->root->subsys_mask;
}
@@ -436,22 +525,12 @@ out_unlock:
return css;
}
-static void __maybe_unused cgroup_get(struct cgroup *cgrp)
-{
- css_get(&cgrp->self);
-}
-
static void cgroup_get_live(struct cgroup *cgrp)
{
WARN_ON_ONCE(cgroup_is_dead(cgrp));
css_get(&cgrp->self);
}
-static bool cgroup_tryget(struct cgroup *cgrp)
-{
- return css_tryget(&cgrp->self);
-}
-
struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
{
struct cgroup *cgrp = of->kn->parent->priv;
@@ -560,9 +639,11 @@ EXPORT_SYMBOL_GPL(of_css);
*/
struct css_set init_css_set = {
.refcount = REFCOUNT_INIT(1),
+ .dom_cset = &init_css_set,
.tasks = LIST_HEAD_INIT(init_css_set.tasks),
.mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
.task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
+ .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
.mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
@@ -570,9 +651,19 @@ struct css_set init_css_set = {
static int css_set_count = 1; /* 1 for init_css_set */
+static bool css_set_threaded(struct css_set *cset)
+{
+ return cset->dom_cset != cset;
+}
+
/**
* css_set_populated - does a css_set contain any tasks?
* @cset: target css_set
+ *
+ * css_set_populated() should be the same as !!cset->nr_tasks at steady
+ * state. However, css_set_populated() can be called while a task is being
+ * added to or removed from the linked list before the nr_tasks is
+ * properly updated. Hence, we can't just look at ->nr_tasks here.
*/
static bool css_set_populated(struct css_set *cset)
{
@@ -582,39 +673,48 @@ static bool css_set_populated(struct css_set *cset)
}
/**
- * cgroup_update_populated - updated populated count of a cgroup
+ * cgroup_update_populated - update the populated count of a cgroup
* @cgrp: the target cgroup
* @populated: inc or dec populated count
*
* One of the css_sets associated with @cgrp is either getting its first
- * task or losing the last. Update @cgrp->populated_cnt accordingly. The
- * count is propagated towards root so that a given cgroup's populated_cnt
- * is zero iff the cgroup and all its descendants don't contain any tasks.
+ * task or losing the last. Update @cgrp->nr_populated_* accordingly. The
+ * count is propagated towards root so that a given cgroup's
+ * nr_populated_children is zero iff none of its descendants contain any
+ * tasks.
*
- * @cgrp's interface file "cgroup.populated" is zero if
- * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt
- * changes from or to zero, userland is notified that the content of the
- * interface file has changed. This can be used to detect when @cgrp and
- * its descendants become populated or empty.
+ * @cgrp's interface file "cgroup.populated" is zero if both
+ * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and
+ * 1 otherwise. When the sum changes from or to zero, userland is notified
+ * that the content of the interface file has changed. This can be used to
+ * detect when @cgrp and its descendants become populated or empty.
*/
static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
{
+ struct cgroup *child = NULL;
+ int adj = populated ? 1 : -1;
+
lockdep_assert_held(&css_set_lock);
do {
- bool trigger;
+ bool was_populated = cgroup_is_populated(cgrp);
- if (populated)
- trigger = !cgrp->populated_cnt++;
- else
- trigger = !--cgrp->populated_cnt;
+ if (!child) {
+ cgrp->nr_populated_csets += adj;
+ } else {
+ if (cgroup_is_threaded(child))
+ cgrp->nr_populated_threaded_children += adj;
+ else
+ cgrp->nr_populated_domain_children += adj;
+ }
- if (!trigger)
+ if (was_populated == cgroup_is_populated(cgrp))
break;
cgroup1_check_for_release(cgrp);
cgroup_file_notify(&cgrp->events_file);
+ child = cgrp;
cgrp = cgroup_parent(cgrp);
} while (cgrp);
}
@@ -625,7 +725,7 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
* @populated: whether @cset is populated or depopulated
*
* @cset is either getting the first task or losing the last. Update the
- * ->populated_cnt of all associated cgroups accordingly.
+ * populated counters of all associated cgroups accordingly.
*/
static void css_set_update_populated(struct css_set *cset, bool populated)
{
@@ -648,7 +748,7 @@ static void css_set_update_populated(struct css_set *cset, bool populated)
* css_set, @from_cset can be NULL. If @task is being disassociated
* instead of moved, @to_cset can be NULL.
*
- * This function automatically handles populated_cnt updates and
+ * This function automatically handles populated counter updates and
* css_task_iter adjustments but the caller is responsible for managing
* @from_cset and @to_cset's reference counts.
*/
@@ -732,6 +832,8 @@ void put_css_set_locked(struct css_set *cset)
if (!refcount_dec_and_test(&cset->refcount))
return;
+ WARN_ON_ONCE(!list_empty(&cset->threaded_csets));
+
/* This css_set is dead. unlink it and release cgroup and css refs */
for_each_subsys(ss, ssid) {
list_del(&cset->e_cset_node[ssid]);
@@ -748,6 +850,11 @@ void put_css_set_locked(struct css_set *cset)
kfree(link);
}
+ if (css_set_threaded(cset)) {
+ list_del(&cset->threaded_csets_node);
+ put_css_set_locked(cset->dom_cset);
+ }
+
kfree_rcu(cset, rcu_head);
}
@@ -766,6 +873,7 @@ static bool compare_css_sets(struct css_set *cset,
struct cgroup *new_cgrp,
struct cgroup_subsys_state *template[])
{
+ struct cgroup *new_dfl_cgrp;
struct list_head *l1, *l2;
/*
@@ -776,6 +884,16 @@ static bool compare_css_sets(struct css_set *cset,
if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
return false;
+
+ /* @cset's domain should match the default cgroup's */
+ if (cgroup_on_dfl(new_cgrp))
+ new_dfl_cgrp = new_cgrp;
+ else
+ new_dfl_cgrp = old_cset->dfl_cgrp;
+
+ if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp)
+ return false;
+
/*
* Compare cgroup pointers in order to distinguish between
* different cgroups in hierarchies. As different cgroups may
@@ -983,9 +1101,11 @@ static struct css_set *find_css_set(struct css_set *old_cset,
}
refcount_set(&cset->refcount, 1);
+ cset->dom_cset = cset;
INIT_LIST_HEAD(&cset->tasks);
INIT_LIST_HEAD(&cset->mg_tasks);
INIT_LIST_HEAD(&cset->task_iters);
+ INIT_LIST_HEAD(&cset->threaded_csets);
INIT_HLIST_NODE(&cset->hlist);
INIT_LIST_HEAD(&cset->cgrp_links);
INIT_LIST_HEAD(&cset->mg_preload_node);
@@ -1023,6 +1143,28 @@ static struct css_set *find_css_set(struct css_set *old_cset,
spin_unlock_irq(&css_set_lock);
+ /*
+ * If @cset should be threaded, look up the matching dom_cset and
+ * link them up. We first fully initialize @cset then look for the
+ * dom_cset. It's simpler this way and safe as @cset is guaranteed
+ * to stay empty until we return.
+ */
+ if (cgroup_is_threaded(cset->dfl_cgrp)) {
+ struct css_set *dcset;
+
+ dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp);
+ if (!dcset) {
+ put_css_set(cset);
+ return NULL;
+ }
+
+ spin_lock_irq(&css_set_lock);
+ cset->dom_cset = dcset;
+ list_add_tail(&cset->threaded_csets_node,
+ &dcset->threaded_csets);
+ spin_unlock_irq(&css_set_lock);
+ }
+
return cset;
}
@@ -1150,6 +1292,8 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
if (cset == &init_css_set) {
res = &root->cgrp;
+ } else if (root == &cgrp_dfl_root) {
+ res = cset->dfl_cgrp;
} else {
struct cgrp_cset_link *link;
@@ -1542,10 +1686,56 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
return len;
}
+static int parse_cgroup_root_flags(char *data, unsigned int *root_flags)
+{
+ char *token;
+
+ *root_flags = 0;
+
+ if (!data)
+ return 0;
+
+ while ((token = strsep(&data, ",")) != NULL) {
+ if (!strcmp(token, "nsdelegate")) {
+ *root_flags |= CGRP_ROOT_NS_DELEGATE;
+ continue;
+ }
+
+ pr_err("cgroup2: unknown option \"%s\"\n", token);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void apply_cgroup_root_flags(unsigned int root_flags)
+{
+ if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
+ if (root_flags & CGRP_ROOT_NS_DELEGATE)
+ cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
+ else
+ cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
+ }
+}
+
+static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
+{
+ if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
+ seq_puts(seq, ",nsdelegate");
+ return 0;
+}
+
static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
{
- pr_err("remount is not allowed\n");
- return -EINVAL;
+ unsigned int root_flags;
+ int ret;
+
+ ret = parse_cgroup_root_flags(data, &root_flags);
+ if (ret)
+ return ret;
+
+ apply_cgroup_root_flags(root_flags);
+ return 0;
}
/*
@@ -1598,6 +1788,7 @@ static void cgroup_enable_task_cg_lists(void)
css_set_update_populated(cset, true);
list_add_tail(&p->cg_list, &cset->tasks);
get_css_set(cset);
+ cset->nr_tasks++;
}
spin_unlock(&p->sighand->siglock);
} while_each_thread(g, p);
@@ -1618,6 +1809,9 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
mutex_init(&cgrp->pidlist_mutex);
cgrp->self.cgroup = cgrp;
cgrp->self.flags |= CSS_ONLINE;
+ cgrp->dom_cgrp = cgrp;
+ cgrp->max_descendants = INT_MAX;
+ cgrp->max_depth = INT_MAX;
for_each_subsys(ss, ssid)
INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
@@ -1685,7 +1879,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
&cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;
root->kf_root = kernfs_create_root(kf_sops,
- KERNFS_ROOT_CREATE_DEACTIVATED,
+ KERNFS_ROOT_CREATE_DEACTIVATED |
+ KERNFS_ROOT_SUPPORT_EXPORTOP,
root_cgrp);
if (IS_ERR(root->kf_root)) {
ret = PTR_ERR(root->kf_root);
@@ -1784,6 +1979,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
{
struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
struct dentry *dentry;
+ int ret;
get_cgroup_ns(ns);
@@ -1801,16 +1997,21 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
cgroup_enable_task_cg_lists();
if (fs_type == &cgroup2_fs_type) {
- if (data) {
- pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
+ unsigned int root_flags;
+
+ ret = parse_cgroup_root_flags(data, &root_flags);
+ if (ret) {
put_cgroup_ns(ns);
- return ERR_PTR(-EINVAL);
+ return ERR_PTR(ret);
}
+
cgrp_dfl_visible = true;
cgroup_get_live(&cgrp_dfl_root.cgrp);
dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
CGROUP2_SUPER_MAGIC, ns);
+ if (!IS_ERR(dentry))
+ apply_cgroup_root_flags(root_flags);
} else {
dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
CGROUP_SUPER_MAGIC, ns);
@@ -1948,6 +2149,8 @@ static void cgroup_migrate_add_task(struct task_struct *task,
if (!cset->mg_src_cgrp)
return;
+ mgctx->tset.nr_tasks++;
+
list_move_tail(&task->cg_list, &cset->mg_tasks);
if (list_empty(&cset->mg_node))
list_add_tail(&cset->mg_node,
@@ -2036,21 +2239,19 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
struct css_set *cset, *tmp_cset;
int ssid, failed_ssid, ret;
- /* methods shouldn't be called if no task is actually migrating */
- if (list_empty(&tset->src_csets))
- return 0;
-
/* check that we can legitimately attach to the cgroup */
- do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
- if (ss->can_attach) {
- tset->ssid = ssid;
- ret = ss->can_attach(tset);
- if (ret) {
- failed_ssid = ssid;
- goto out_cancel_attach;
+ if (tset->nr_tasks) {
+ do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
+ if (ss->can_attach) {
+ tset->ssid = ssid;
+ ret = ss->can_attach(tset);
+ if (ret) {
+ failed_ssid = ssid;
+ goto out_cancel_attach;
+ }
}
- }
- } while_each_subsys_mask();
+ } while_each_subsys_mask();
+ }
/*
* Now that we're guaranteed success, proceed to move all tasks to
@@ -2064,8 +2265,10 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
struct css_set *to_cset = cset->mg_dst_cset;
get_css_set(to_cset);
+ to_cset->nr_tasks++;
css_set_move_task(task, from_cset, to_cset, true);
put_css_set_locked(from_cset);
+ from_cset->nr_tasks--;
}
}
spin_unlock_irq(&css_set_lock);
@@ -2077,25 +2280,29 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
*/
tset->csets = &tset->dst_csets;
- do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
- if (ss->attach) {
- tset->ssid = ssid;
- ss->attach(tset);
- }
- } while_each_subsys_mask();
+ if (tset->nr_tasks) {
+ do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
+ if (ss->attach) {
+ tset->ssid = ssid;
+ ss->attach(tset);
+ }
+ } while_each_subsys_mask();
+ }
ret = 0;
goto out_release_tset;
out_cancel_attach:
- do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
- if (ssid == failed_ssid)
- break;
- if (ss->cancel_attach) {
- tset->ssid = ssid;
- ss->cancel_attach(tset);
- }
- } while_each_subsys_mask();
+ if (tset->nr_tasks) {
+ do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
+ if (ssid == failed_ssid)
+ break;
+ if (ss->cancel_attach) {
+ tset->ssid = ssid;
+ ss->cancel_attach(tset);
+ }
+ } while_each_subsys_mask();
+ }
out_release_tset:
spin_lock_irq(&css_set_lock);
list_splice_init(&tset->dst_csets, &tset->src_csets);
@@ -2108,17 +2315,40 @@ out_release_tset:
}
/**
- * cgroup_may_migrate_to - verify whether a cgroup can be migration destination
+ * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination
* @dst_cgrp: destination cgroup to test
*
- * On the default hierarchy, except for the root, subtree_control must be
- * zero for migration destination cgroups with tasks so that child cgroups
- * don't compete against tasks.
+ * On the default hierarchy, except for the mixable, (possible) thread root
+ * and threaded cgroups, subtree_control must be zero for migration
+ * destination cgroups with tasks so that child cgroups don't compete
+ * against tasks.
*/
-bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
+int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
{
- return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) ||
- !dst_cgrp->subtree_control;
+ /* v1 doesn't have any restriction */
+ if (!cgroup_on_dfl(dst_cgrp))
+ return 0;
+
+ /* verify @dst_cgrp can host resources */
+ if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
+ return -EOPNOTSUPP;
+
+ /* mixables don't care */
+ if (cgroup_is_mixable(dst_cgrp))
+ return 0;
+
+ /*
+ * If @dst_cgrp is already or can become a thread root or is
+ * threaded, it doesn't matter.
+ */
+ if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp))
+ return 0;
+
+ /* apply no-internal-process constraint */
+ if (dst_cgrp->subtree_control)
+ return -EBUSY;
+
+ return 0;
}
/**
@@ -2323,8 +2553,9 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
struct task_struct *task;
int ret;
- if (!cgroup_may_migrate_to(dst_cgrp))
- return -EBUSY;
+ ret = cgroup_migrate_vet_dst(dst_cgrp);
+ if (ret)
+ return ret;
/* look up all src csets */
spin_lock_irq(&css_set_lock);
@@ -2351,76 +2582,23 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
return ret;
}
-static int cgroup_procs_write_permission(struct task_struct *task,
- struct cgroup *dst_cgrp,
- struct kernfs_open_file *of)
-{
- int ret = 0;
-
- if (cgroup_on_dfl(dst_cgrp)) {
- struct super_block *sb = of->file->f_path.dentry->d_sb;
- struct cgroup *cgrp;
- struct inode *inode;
-
- spin_lock_irq(&css_set_lock);
- cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
- spin_unlock_irq(&css_set_lock);
-
- while (!cgroup_is_descendant(dst_cgrp, cgrp))
- cgrp = cgroup_parent(cgrp);
-
- ret = -ENOMEM;
- inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
- if (inode) {
- ret = inode_permission(inode, MAY_WRITE);
- iput(inode);
- }
- } else {
- const struct cred *cred = current_cred();
- const struct cred *tcred = get_task_cred(task);
-
- /*
- * even if we're attaching all tasks in the thread group,
- * we only need to check permissions on one of them.
- */
- if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
- !uid_eq(cred->euid, tcred->uid) &&
- !uid_eq(cred->euid, tcred->suid))
- ret = -EACCES;
- put_cred(tcred);
- }
-
- return ret;
-}
-
-/*
- * Find the task_struct of the task to attach by vpid and pass it along to the
- * function to attach either it or all tasks in its threadgroup. Will lock
- * cgroup_mutex and threadgroup.
- */
-ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
- size_t nbytes, loff_t off, bool threadgroup)
+struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
+ __acquires(&cgroup_threadgroup_rwsem)
{
struct task_struct *tsk;
- struct cgroup_subsys *ss;
- struct cgroup *cgrp;
pid_t pid;
- int ssid, ret;
if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
- return -EINVAL;
-
- cgrp = cgroup_kn_lock_live(of->kn, false);
- if (!cgrp)
- return -ENODEV;
+ return ERR_PTR(-EINVAL);
percpu_down_write(&cgroup_threadgroup_rwsem);
+
rcu_read_lock();
if (pid) {
tsk = find_task_by_vpid(pid);
if (!tsk) {
- ret = -ESRCH;
- goto out_unlock_rcu;
+ tsk = ERR_PTR(-ESRCH);
+ goto out_unlock_threadgroup;
}
} else {
tsk = current;
@@ -2436,35 +2614,33 @@ ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
* cgroup with no rt_runtime allocated. Just say no.
*/
if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
- ret = -EINVAL;
- goto out_unlock_rcu;
+ tsk = ERR_PTR(-EINVAL);
+ goto out_unlock_threadgroup;
}
get_task_struct(tsk);
+ goto out_unlock_rcu;
+
+out_unlock_threadgroup:
+ percpu_up_write(&cgroup_threadgroup_rwsem);
+out_unlock_rcu:
rcu_read_unlock();
+ return tsk;
+}
- ret = cgroup_procs_write_permission(tsk, cgrp, of);
- if (!ret)
- ret = cgroup_attach_task(cgrp, tsk, threadgroup);
+void cgroup_procs_write_finish(struct task_struct *task)
+ __releases(&cgroup_threadgroup_rwsem)
+{
+ struct cgroup_subsys *ss;
+ int ssid;
- put_task_struct(tsk);
- goto out_unlock_threadgroup;
+ /* release reference from cgroup_procs_write_start() */
+ put_task_struct(task);
-out_unlock_rcu:
- rcu_read_unlock();
-out_unlock_threadgroup:
percpu_up_write(&cgroup_threadgroup_rwsem);
for_each_subsys(ss, ssid)
if (ss->post_attach)
ss->post_attach();
- cgroup_kn_unlock(of->kn);
- return ret ?: nbytes;
-}
-
-ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
- loff_t off)
-{
- return __cgroup_procs_write(of, buf, nbytes, off, true);
}
static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
@@ -2807,6 +2983,46 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
cgroup_apply_control_disable(cgrp);
}
+static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable)
+{
+ u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;
+
+ /* if nothing is getting enabled, nothing to worry about */
+ if (!enable)
+ return 0;
+
+ /* can @cgrp host any resources? */
+ if (!cgroup_is_valid_domain(cgrp->dom_cgrp))
+ return -EOPNOTSUPP;
+
+ /* mixables don't care */
+ if (cgroup_is_mixable(cgrp))
+ return 0;
+
+ if (domain_enable) {
+ /* can't enable domain controllers inside a thread subtree */
+ if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp))
+ return -EOPNOTSUPP;
+ } else {
+ /*
+ * Threaded controllers can handle internal competitions
+ * and are always allowed inside a (prospective) thread
+ * subtree.
+ */
+ if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp))
+ return 0;
+ }
+
+ /*
+ * Controllers can't be enabled for a cgroup with tasks to avoid
+ * child cgroups competing against tasks.
+ */
+ if (cgroup_has_tasks(cgrp))
+ return -EBUSY;
+
+ return 0;
+}
+
/* change the enabled child controllers for a cgroup in the default hierarchy */
static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
@@ -2882,33 +3098,9 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
goto out_unlock;
}
- /*
- * Except for the root, subtree_control must be zero for a cgroup
- * with tasks so that child cgroups don't compete against tasks.
- */
- if (enable && cgroup_parent(cgrp)) {
- struct cgrp_cset_link *link;
-
- /*
- * Because namespaces pin csets too, @cgrp->cset_links
- * might not be empty even when @cgrp is empty. Walk and
- * verify each cset.
- */
- spin_lock_irq(&css_set_lock);
-
- ret = 0;
- list_for_each_entry(link, &cgrp->cset_links, cset_link) {
- if (css_set_populated(link->cset)) {
- ret = -EBUSY;
- break;
- }
- }
-
- spin_unlock_irq(&css_set_lock);
-
- if (ret)
- goto out_unlock;
- }
+ ret = cgroup_vet_subtree_control_enable(cgrp, enable);
+ if (ret)
+ goto out_unlock;
/* save and update control masks and prepare csses */
cgroup_save_control(cgrp);
@@ -2917,16 +3109,182 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
cgrp->subtree_control &= ~disable;
ret = cgroup_apply_control(cgrp);
-
cgroup_finalize_control(cgrp, ret);
+ if (ret)
+ goto out_unlock;
kernfs_activate(cgrp->kn);
- ret = 0;
out_unlock:
cgroup_kn_unlock(of->kn);
return ret ?: nbytes;
}
+/**
+ * cgroup_enable_threaded - make @cgrp threaded
+ * @cgrp: the target cgroup
+ *
+ * Called when "threaded" is written to the cgroup.type interface file and
+ * tries to make @cgrp threaded and join the parent's resource domain.
+ * This function is never called on the root cgroup as cgroup.type doesn't
+ * exist on it.
+ */
+static int cgroup_enable_threaded(struct cgroup *cgrp)
+{
+ struct cgroup *parent = cgroup_parent(cgrp);
+ struct cgroup *dom_cgrp = parent->dom_cgrp;
+ int ret;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ /* noop if already threaded */
+ if (cgroup_is_threaded(cgrp))
+ return 0;
+
+ /* we're joining the parent's domain, ensure its validity */
+ if (!cgroup_is_valid_domain(dom_cgrp) ||
+ !cgroup_can_be_thread_root(dom_cgrp))
+ return -EOPNOTSUPP;
+
+ /*
+ * The following shouldn't cause actual migrations and should
+ * always succeed.
+ */
+ cgroup_save_control(cgrp);
+
+ cgrp->dom_cgrp = dom_cgrp;
+ ret = cgroup_apply_control(cgrp);
+ if (!ret)
+ parent->nr_threaded_children++;
+ else
+ cgrp->dom_cgrp = cgrp;
+
+ cgroup_finalize_control(cgrp, ret);
+ return ret;
+}
+
+static int cgroup_type_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ if (cgroup_is_threaded(cgrp))
+ seq_puts(seq, "threaded\n");
+ else if (!cgroup_is_valid_domain(cgrp))
+ seq_puts(seq, "domain invalid\n");
+ else if (cgroup_is_thread_root(cgrp))
+ seq_puts(seq, "domain threaded\n");
+ else
+ seq_puts(seq, "domain\n");
+
+ return 0;
+}
+
+static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct cgroup *cgrp;
+ int ret;
+
+ /* only switching to threaded mode is supported */
+ if (strcmp(strstrip(buf), "threaded"))
+ return -EINVAL;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ /* threaded can only be enabled */
+ ret = cgroup_enable_threaded(cgrp);
+
+ cgroup_kn_unlock(of->kn);
+ return ret ?: nbytes;
+}
+
+static int cgroup_max_descendants_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ int descendants = READ_ONCE(cgrp->max_descendants);
+
+ if (descendants == INT_MAX)
+ seq_puts(seq, "max\n");
+ else
+ seq_printf(seq, "%d\n", descendants);
+
+ return 0;
+}
+
+static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup *cgrp;
+ int descendants;
+ ssize_t ret;
+
+ buf = strstrip(buf);
+ if (!strcmp(buf, "max")) {
+ descendants = INT_MAX;
+ } else {
+ ret = kstrtoint(buf, 0, &descendants);
+ if (ret)
+ return ret;
+ }
+
+ if (descendants < 0)
+ return -ERANGE;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ cgrp->max_descendants = descendants;
+
+ cgroup_kn_unlock(of->kn);
+
+ return nbytes;
+}
+
+static int cgroup_max_depth_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ int depth = READ_ONCE(cgrp->max_depth);
+
+ if (depth == INT_MAX)
+ seq_puts(seq, "max\n");
+ else
+ seq_printf(seq, "%d\n", depth);
+
+ return 0;
+}
+
+static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup *cgrp;
+ ssize_t ret;
+ int depth;
+
+ buf = strstrip(buf);
+ if (!strcmp(buf, "max")) {
+ depth = INT_MAX;
+ } else {
+ ret = kstrtoint(buf, 0, &depth);
+ if (ret)
+ return ret;
+ }
+
+ if (depth < 0)
+ return -ERANGE;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ cgrp->max_depth = depth;
+
+ cgroup_kn_unlock(of->kn);
+
+ return nbytes;
+}
+
static int cgroup_events_show(struct seq_file *seq, void *v)
{
seq_printf(seq, "populated %d\n",
@@ -2934,6 +3292,18 @@ static int cgroup_events_show(struct seq_file *seq, void *v)
return 0;
}
+static int cgroup_stat_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgroup = seq_css(seq)->cgroup;
+
+ seq_printf(seq, "nr_descendants %d\n",
+ cgroup->nr_descendants);
+ seq_printf(seq, "nr_dying_descendants %d\n",
+ cgroup->nr_dying_descendants);
+
+ return 0;
+}
+
static int cgroup_file_open(struct kernfs_open_file *of)
{
struct cftype *cft = of->kn->priv;
@@ -2954,11 +3324,23 @@ static void cgroup_file_release(struct kernfs_open_file *of)
static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
+ struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
struct cgroup *cgrp = of->kn->parent->priv;
struct cftype *cft = of->kn->priv;
struct cgroup_subsys_state *css;
int ret;
+ /*
+ * If namespaces are delegation boundaries, disallow writes to
+ * files in an non-init namespace root from inside the namespace
+ * except for the files explicitly marked delegatable -
+ * cgroup.procs and cgroup.subtree_control.
+ */
+ if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
+ !(cft->flags & CFTYPE_NS_DELEGATABLE) &&
+ ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
+ return -EPERM;
+
if (cft->write)
return cft->write(of, buf, nbytes, off);
@@ -3138,7 +3520,6 @@ restart:
static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
{
- LIST_HEAD(pending);
struct cgroup_subsys *ss = cfts[0].ss;
struct cgroup *root = &ss->root->cgrp;
struct cgroup_subsys_state *css;
@@ -3563,6 +3944,58 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
return ret;
}
+static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it)
+{
+ struct list_head *l;
+ struct cgrp_cset_link *link;
+ struct css_set *cset;
+
+ lockdep_assert_held(&css_set_lock);
+
+ /* find the next threaded cset */
+ if (it->tcset_pos) {
+ l = it->tcset_pos->next;
+
+ if (l != it->tcset_head) {
+ it->tcset_pos = l;
+ return container_of(l, struct css_set,
+ threaded_csets_node);
+ }
+
+ it->tcset_pos = NULL;
+ }
+
+ /* find the next cset */
+ l = it->cset_pos;
+ l = l->next;
+ if (l == it->cset_head) {
+ it->cset_pos = NULL;
+ return NULL;
+ }
+
+ if (it->ss) {
+ cset = container_of(l, struct css_set, e_cset_node[it->ss->id]);
+ } else {
+ link = list_entry(l, struct cgrp_cset_link, cset_link);
+ cset = link->cset;
+ }
+
+ it->cset_pos = l;
+
+ /* initialize threaded css_set walking */
+ if (it->flags & CSS_TASK_ITER_THREADED) {
+ if (it->cur_dcset)
+ put_css_set_locked(it->cur_dcset);
+ it->cur_dcset = cset;
+ get_css_set(cset);
+
+ it->tcset_head = &cset->threaded_csets;
+ it->tcset_pos = &cset->threaded_csets;
+ }
+
+ return cset;
+}
+
/**
* css_task_iter_advance_css_set - advance a task itererator to the next css_set
* @it: the iterator to advance
@@ -3571,32 +4004,19 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
*/
static void css_task_iter_advance_css_set(struct css_task_iter *it)
{
- struct list_head *l = it->cset_pos;
- struct cgrp_cset_link *link;
struct css_set *cset;
lockdep_assert_held(&css_set_lock);
/* Advance to the next non-empty css_set */
do {
- l = l->next;
- if (l == it->cset_head) {
- it->cset_pos = NULL;
+ cset = css_task_iter_next_css_set(it);
+ if (!cset) {
it->task_pos = NULL;
return;
}
-
- if (it->ss) {
- cset = container_of(l, struct css_set,
- e_cset_node[it->ss->id]);
- } else {
- link = list_entry(l, struct cgrp_cset_link, cset_link);
- cset = link->cset;
- }
} while (!css_set_populated(cset));
- it->cset_pos = l;
-
if (!list_empty(&cset->tasks))
it->task_pos = cset->tasks.next;
else
@@ -3636,6 +4056,7 @@ static void css_task_iter_advance(struct css_task_iter *it)
lockdep_assert_held(&css_set_lock);
WARN_ON_ONCE(!l);
+repeat:
/*
* Advance iterator to find next entry. cset->tasks is consumed
* first and then ->mg_tasks. After ->mg_tasks, we move onto the
@@ -3650,11 +4071,18 @@ static void css_task_iter_advance(struct css_task_iter *it)
css_task_iter_advance_css_set(it);
else
it->task_pos = l;
+
+ /* if PROCS, skip over tasks which aren't group leaders */
+ if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
+ !thread_group_leader(list_entry(it->task_pos, struct task_struct,
+ cg_list)))
+ goto repeat;
}
/**
* css_task_iter_start - initiate task iteration
* @css: the css to walk tasks of
+ * @flags: CSS_TASK_ITER_* flags
* @it: the task iterator to use
*
* Initiate iteration through the tasks of @css. The caller can call
@@ -3662,7 +4090,7 @@ static void css_task_iter_advance(struct css_task_iter *it)
* returns NULL. On completion of iteration, css_task_iter_end() must be
* called.
*/
-void css_task_iter_start(struct cgroup_subsys_state *css,
+void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
struct css_task_iter *it)
{
/* no one should try to iterate before mounting cgroups */
@@ -3673,6 +4101,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
spin_lock_irq(&css_set_lock);
it->ss = css->ss;
+ it->flags = flags;
if (it->ss)
it->cset_pos = &css->cgroup->e_csets[css->ss->id];
@@ -3730,6 +4159,9 @@ void css_task_iter_end(struct css_task_iter *it)
spin_unlock_irq(&css_set_lock);
}
+ if (it->cur_dcset)
+ put_css_set(it->cur_dcset);
+
if (it->cur_task)
put_task_struct(it->cur_task);
}
@@ -3746,16 +4178,12 @@ static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
{
struct kernfs_open_file *of = s->private;
struct css_task_iter *it = of->priv;
- struct task_struct *task;
- do {
- task = css_task_iter_next(it);
- } while (task && !thread_group_leader(task));
-
- return task;
+ return css_task_iter_next(it);
}
-static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
+static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
+ unsigned int iter_flags)
{
struct kernfs_open_file *of = s->private;
struct cgroup *cgrp = seq_css(s)->cgroup;
@@ -3773,25 +4201,171 @@ static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
if (!it)
return ERR_PTR(-ENOMEM);
of->priv = it;
- css_task_iter_start(&cgrp->self, it);
+ css_task_iter_start(&cgrp->self, iter_flags, it);
} else if (!(*pos)++) {
css_task_iter_end(it);
- css_task_iter_start(&cgrp->self, it);
+ css_task_iter_start(&cgrp->self, iter_flags, it);
}
return cgroup_procs_next(s, NULL, NULL);
}
+static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
+{
+ struct cgroup *cgrp = seq_css(s)->cgroup;
+
+ /*
+ * All processes of a threaded subtree belong to the domain cgroup
+ * of the subtree. Only threads can be distributed across the
+ * subtree. Reject reads on cgroup.procs in the subtree proper.
+ * They're always empty anyway.
+ */
+ if (cgroup_is_threaded(cgrp))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS |
+ CSS_TASK_ITER_THREADED);
+}
+
static int cgroup_procs_show(struct seq_file *s, void *v)
{
- seq_printf(s, "%d\n", task_tgid_vnr(v));
+ seq_printf(s, "%d\n", task_pid_vnr(v));
return 0;
}
+static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
+ struct cgroup *dst_cgrp,
+ struct super_block *sb)
+{
+ struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
+ struct cgroup *com_cgrp = src_cgrp;
+ struct inode *inode;
+ int ret;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ /* find the common ancestor */
+ while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
+ com_cgrp = cgroup_parent(com_cgrp);
+
+ /* %current should be authorized to migrate to the common ancestor */
+ inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
+ if (!inode)
+ return -ENOMEM;
+
+ ret = inode_permission(inode, MAY_WRITE);
+ iput(inode);
+ if (ret)
+ return ret;
+
+ /*
+ * If namespaces are delegation boundaries, %current must be able
+ * to see both source and destination cgroups from its namespace.
+ */
+ if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
+ (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) ||
+ !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp)))
+ return -ENOENT;
+
+ return 0;
+}
+
+static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup *src_cgrp, *dst_cgrp;
+ struct task_struct *task;
+ ssize_t ret;
+
+ dst_cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!dst_cgrp)
+ return -ENODEV;
+
+ task = cgroup_procs_write_start(buf, true);
+ ret = PTR_ERR_OR_ZERO(task);
+ if (ret)
+ goto out_unlock;
+
+ /* find the source cgroup */
+ spin_lock_irq(&css_set_lock);
+ src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
+ spin_unlock_irq(&css_set_lock);
+
+ ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
+ of->file->f_path.dentry->d_sb);
+ if (ret)
+ goto out_finish;
+
+ ret = cgroup_attach_task(dst_cgrp, task, true);
+
+out_finish:
+ cgroup_procs_write_finish(task);
+out_unlock:
+ cgroup_kn_unlock(of->kn);
+
+ return ret ?: nbytes;
+}
+
+static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
+{
+ return __cgroup_procs_start(s, pos, 0);
+}
+
+static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup *src_cgrp, *dst_cgrp;
+ struct task_struct *task;
+ ssize_t ret;
+
+ buf = strstrip(buf);
+
+ dst_cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!dst_cgrp)
+ return -ENODEV;
+
+ task = cgroup_procs_write_start(buf, false);
+ ret = PTR_ERR_OR_ZERO(task);
+ if (ret)
+ goto out_unlock;
+
+ /* find the source cgroup */
+ spin_lock_irq(&css_set_lock);
+ src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
+ spin_unlock_irq(&css_set_lock);
+
+ /* thread migrations follow the cgroup.procs delegation rule */
+ ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
+ of->file->f_path.dentry->d_sb);
+ if (ret)
+ goto out_finish;
+
+ /* and must be contained in the same domain */
+ ret = -EOPNOTSUPP;
+ if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)
+ goto out_finish;
+
+ ret = cgroup_attach_task(dst_cgrp, task, false);
+
+out_finish:
+ cgroup_procs_write_finish(task);
+out_unlock:
+ cgroup_kn_unlock(of->kn);
+
+ return ret ?: nbytes;
+}
+
/* cgroup core interface files for the default hierarchy */
static struct cftype cgroup_base_files[] = {
{
+ .name = "cgroup.type",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_type_show,
+ .write = cgroup_type_write,
+ },
+ {
.name = "cgroup.procs",
+ .flags = CFTYPE_NS_DELEGATABLE,
.file_offset = offsetof(struct cgroup, procs_file),
.release = cgroup_procs_release,
.seq_start = cgroup_procs_start,
@@ -3800,11 +4374,20 @@ static struct cftype cgroup_base_files[] = {
.write = cgroup_procs_write,
},
{
+ .name = "cgroup.threads",
+ .release = cgroup_procs_release,
+ .seq_start = cgroup_threads_start,
+ .seq_next = cgroup_procs_next,
+ .seq_show = cgroup_procs_show,
+ .write = cgroup_threads_write,
+ },
+ {
.name = "cgroup.controllers",
.seq_show = cgroup_controllers_show,
},
{
.name = "cgroup.subtree_control",
+ .flags = CFTYPE_NS_DELEGATABLE,
.seq_show = cgroup_subtree_control_show,
.write = cgroup_subtree_control_write,
},
@@ -3814,6 +4397,20 @@ static struct cftype cgroup_base_files[] = {
.file_offset = offsetof(struct cgroup, events_file),
.seq_show = cgroup_events_show,
},
+ {
+ .name = "cgroup.max.descendants",
+ .seq_show = cgroup_max_descendants_show,
+ .write = cgroup_max_descendants_write,
+ },
+ {
+ .name = "cgroup.max.depth",
+ .seq_show = cgroup_max_depth_show,
+ .write = cgroup_max_depth_write,
+ },
+ {
+ .name = "cgroup.stat",
+ .seq_show = cgroup_stat_show,
+ },
{ } /* terminate */
};
@@ -3913,9 +4510,15 @@ static void css_release_work_fn(struct work_struct *work)
if (ss->css_released)
ss->css_released(css);
} else {
+ struct cgroup *tcgrp;
+
/* cgroup release path */
trace_cgroup_release(cgrp);
+ for (tcgrp = cgroup_parent(cgrp); tcgrp;
+ tcgrp = cgroup_parent(tcgrp))
+ tcgrp->nr_dying_descendants--;
+
cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
cgrp->id = -1;
@@ -4002,9 +4605,6 @@ static void offline_css(struct cgroup_subsys_state *css)
if (!(css->flags & CSS_ONLINE))
return;
- if (ss->css_reset)
- ss->css_reset(css);
-
if (ss->css_offline)
ss->css_offline(css);
@@ -4114,9 +4714,13 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
cgrp->root = root;
cgrp->level = level;
- for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp))
+ for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
+ if (tcgrp != cgrp)
+ tcgrp->nr_descendants++;
+ }
+
if (notify_on_release(parent))
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -4157,6 +4761,29 @@ out_free_cgrp:
return ERR_PTR(ret);
}
+static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
+{
+ struct cgroup *cgroup;
+ int ret = false;
+ int level = 1;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) {
+ if (cgroup->nr_descendants >= cgroup->max_descendants)
+ goto fail;
+
+ if (level > cgroup->max_depth)
+ goto fail;
+
+ level++;
+ }
+
+ ret = true;
+fail:
+ return ret;
+}
+
int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
{
struct cgroup *parent, *cgrp;
@@ -4171,6 +4798,11 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
if (!parent)
return -ENODEV;
+ if (!cgroup_check_hierarchy_limits(parent)) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+
cgrp = cgroup_create(parent);
if (IS_ERR(cgrp)) {
ret = PTR_ERR(cgrp);
@@ -4322,6 +4954,7 @@ static void kill_css(struct cgroup_subsys_state *css)
static int cgroup_destroy_locked(struct cgroup *cgrp)
__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
+ struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
struct cgroup_subsys_state *css;
struct cgrp_cset_link *link;
int ssid;
@@ -4366,7 +4999,15 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
*/
kernfs_remove(cgrp->kn);
- cgroup1_check_for_release(cgroup_parent(cgrp));
+ if (parent && cgroup_is_threaded(cgrp))
+ parent->nr_threaded_children--;
+
+ for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
+ tcgrp->nr_descendants--;
+ tcgrp->nr_dying_descendants++;
+ }
+
+ cgroup1_check_for_release(parent);
/* put the base reference */
percpu_ref_kill(&cgrp->self.refcnt);
@@ -4393,6 +5034,7 @@ int cgroup_rmdir(struct kernfs_node *kn)
}
static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
+ .show_options = cgroup_show_options,
.remount_fs = cgroup_remount,
.mkdir = cgroup_mkdir,
.rmdir = cgroup_rmdir,
@@ -4560,11 +5202,17 @@ int __init cgroup_init(void)
cgrp_dfl_root.subsys_mask |= 1 << ss->id;
+ /* implicit controllers must be threaded too */
+ WARN_ON(ss->implicit_on_dfl && !ss->threaded);
+
if (ss->implicit_on_dfl)
cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
else if (!ss->dfl_cftypes)
cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;
+ if (ss->threaded)
+ cgrp_dfl_threaded_ss_mask |= 1 << ss->id;
+
if (ss->dfl_cftypes == ss->legacy_cftypes) {
WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
} else {
@@ -4574,6 +5222,10 @@ int __init cgroup_init(void)
if (ss->bind)
ss->bind(init_css_set.subsys[ssid]);
+
+ mutex_lock(&cgroup_mutex);
+ css_populate_dir(init_css_set.subsys[ssid]);
+ mutex_unlock(&cgroup_mutex);
}
/* init_css_set.subsys[] has been updated, re-hash */
@@ -4605,6 +5257,18 @@ static int __init cgroup_wq_init(void)
}
core_initcall(cgroup_wq_init);
+void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
+ char *buf, size_t buflen)
+{
+ struct kernfs_node *kn;
+
+ kn = kernfs_get_node_by_id(cgrp_dfl_root.kf_root, id);
+ if (!kn)
+ return;
+ kernfs_path(kn, buf, buflen);
+ kernfs_put(kn);
+}
+
/*
* proc_cgroup_show()
* - Print task's cgroup paths into seq_file, one line for each hierarchy
@@ -4789,6 +5453,7 @@ void cgroup_post_fork(struct task_struct *child)
cset = task_css_set(current);
if (list_empty(&child->cg_list)) {
get_css_set(cset);
+ cset->nr_tasks++;
css_set_move_task(child, NULL, cset, false);
}
spin_unlock_irq(&css_set_lock);
@@ -4838,6 +5503,7 @@ void cgroup_exit(struct task_struct *tsk)
if (!list_empty(&tsk->cg_list)) {
spin_lock_irq(&css_set_lock);
css_set_move_task(tsk, cset, NULL, false);
+ cset->nr_tasks--;
spin_unlock_irq(&css_set_lock);
} else {
get_css_set(cset);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index ae643412948a..67230ecf2ce1 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -56,6 +56,7 @@
#include <linux/time64.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>
+#include <linux/oom.h>
#include <linux/uaccess.h>
#include <linux/atomic.h>
@@ -63,6 +64,7 @@
#include <linux/cgroup.h>
#include <linux/wait.h>
+DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
/* See "Frequency meter" comments, below. */
@@ -299,6 +301,16 @@ static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
/*
+ * Cgroup v2 behavior is used when on default hierarchy or the
+ * cgroup_v2_mode flag is set.
+ */
+static inline bool is_in_v2_mode(void)
+{
+ return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
+ (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
+}
+
+/*
* This is ugly, but preserves the userspace API for existing cpuset
* users. If someone tries to mount the "cpuset" filesystem, we
* silently switch it to mount "cgroup" instead
@@ -488,8 +500,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
/* On legacy hiearchy, we must be a subset of our parent cpuset. */
ret = -EACCES;
- if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
- !is_cpuset_subset(trial, par))
+ if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
goto out;
/*
@@ -576,6 +587,13 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
rcu_read_unlock();
}
+/* Must be called with cpuset_mutex held. */
+static inline int nr_cpusets(void)
+{
+ /* jump label reference count + the top-level cpuset */
+ return static_key_count(&cpusets_enabled_key.key) + 1;
+}
+
/*
* generate_sched_domains()
*
@@ -861,7 +879,7 @@ static void update_tasks_cpumask(struct cpuset *cs)
struct css_task_iter it;
struct task_struct *task;
- css_task_iter_start(&cs->css, &it);
+ css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it)))
set_cpus_allowed_ptr(task, cs->effective_cpus);
css_task_iter_end(&it);
@@ -895,8 +913,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
* If it becomes empty, inherit the effective mask of the
* parent, which is guaranteed to have some CPUs.
*/
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
- cpumask_empty(new_cpus))
+ if (is_in_v2_mode() && cpumask_empty(new_cpus))
cpumask_copy(new_cpus, parent->effective_cpus);
/* Skip the whole subtree if the cpumask remains the same. */
@@ -913,7 +930,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
cpumask_copy(cp->effective_cpus, new_cpus);
spin_unlock_irq(&callback_lock);
- WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ WARN_ON(!is_in_v2_mode() &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
update_tasks_cpumask(cp);
@@ -1038,40 +1055,25 @@ static void cpuset_post_attach(void)
* @tsk: the task to change
* @newmems: new nodes that the task will be set
*
- * In order to avoid seeing no nodes if the old and new nodes are disjoint,
- * we structure updates as setting all new allowed nodes, then clearing newly
- * disallowed ones.
+ * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed
+ * and rebind an eventual tasks' mempolicy. If the task is allocating in
+ * parallel, it might temporarily see an empty intersection, which results in
+ * a seqlock check and retry before OOM or allocation failure.
*/
static void cpuset_change_task_nodemask(struct task_struct *tsk,
nodemask_t *newmems)
{
- bool need_loop;
-
task_lock(tsk);
- /*
- * Determine if a loop is necessary if another thread is doing
- * read_mems_allowed_begin(). If at least one node remains unchanged and
- * tsk does not have a mempolicy, then an empty nodemask will not be
- * possible when mems_allowed is larger than a word.
- */
- need_loop = task_has_mempolicy(tsk) ||
- !nodes_intersects(*newmems, tsk->mems_allowed);
- if (need_loop) {
- local_irq_disable();
- write_seqcount_begin(&tsk->mems_allowed_seq);
- }
+ local_irq_disable();
+ write_seqcount_begin(&tsk->mems_allowed_seq);
nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
- mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
-
- mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
+ mpol_rebind_task(tsk, newmems);
tsk->mems_allowed = *newmems;
- if (need_loop) {
- write_seqcount_end(&tsk->mems_allowed_seq);
- local_irq_enable();
- }
+ write_seqcount_end(&tsk->mems_allowed_seq);
+ local_irq_enable();
task_unlock(tsk);
}
@@ -1106,7 +1108,7 @@ static void update_tasks_nodemask(struct cpuset *cs)
* It's ok if we rebind the same mm twice; mpol_rebind_mm()
* is idempotent. Also migrate pages in each mm to new nodes.
*/
- css_task_iter_start(&cs->css, &it);
+ css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it))) {
struct mm_struct *mm;
bool migrate;
@@ -1164,8 +1166,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
* If it becomes empty, inherit the effective mask of the
* parent, which is guaranteed to have some MEMs.
*/
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
- nodes_empty(*new_mems))
+ if (is_in_v2_mode() && nodes_empty(*new_mems))
*new_mems = parent->effective_mems;
/* Skip the whole subtree if the nodemask remains the same. */
@@ -1182,7 +1183,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
cp->effective_mems = *new_mems;
spin_unlock_irq(&callback_lock);
- WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ WARN_ON(!is_in_v2_mode() &&
!nodes_equal(cp->mems_allowed, cp->effective_mems));
update_tasks_nodemask(cp);
@@ -1299,7 +1300,7 @@ static void update_tasks_flags(struct cpuset *cs)
struct css_task_iter it;
struct task_struct *task;
- css_task_iter_start(&cs->css, &it);
+ css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it)))
cpuset_update_task_spread_flag(cs, task);
css_task_iter_end(&it);
@@ -1474,7 +1475,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
/* allow moving tasks into an empty cpuset if on default hierarchy */
ret = -ENOSPC;
- if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ if (!is_in_v2_mode() &&
(cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
goto out_unlock;
@@ -1906,6 +1907,7 @@ static struct cftype files[] = {
{
.name = "memory_pressure",
.read_u64 = cpuset_read_u64,
+ .private = FILE_MEMORY_PRESSURE,
},
{
@@ -1992,7 +1994,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cpuset_inc();
spin_lock_irq(&callback_lock);
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
+ if (is_in_v2_mode()) {
cpumask_copy(cs->effective_cpus, parent->effective_cpus);
cs->effective_mems = parent->effective_mems;
}
@@ -2069,7 +2071,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
mutex_lock(&cpuset_mutex);
spin_lock_irq(&callback_lock);
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
+ if (is_in_v2_mode()) {
cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
top_cpuset.mems_allowed = node_possible_map;
} else {
@@ -2263,7 +2265,7 @@ retry:
cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
mems_updated = !nodes_equal(new_mems, cs->effective_mems);
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
+ if (is_in_v2_mode())
hotplug_update_tasks(cs, &new_cpus, &new_mems,
cpus_updated, mems_updated);
else
@@ -2294,7 +2296,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
static cpumask_t new_cpus;
static nodemask_t new_mems;
bool cpus_updated, mems_updated;
- bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
+ bool on_dfl = is_in_v2_mode();
mutex_lock(&cpuset_mutex);
@@ -2357,13 +2359,7 @@ void cpuset_update_active_cpus(void)
* We're inside cpu hotplug critical region which usually nests
* inside cgroup synchronization. Bounce actual hotplug processing
* to a work item to avoid reverse locking order.
- *
- * We still need to do partition_sched_domains() synchronously;
- * otherwise, the scheduler will get confused and put tasks to the
- * dead CPU. Fall back to the default single domain.
- * cpuset_hotplug_workfn() will rebuild it as necessary.
*/
- partition_sched_domains(1, NULL, NULL);
schedule_work(&cpuset_hotplug_work);
}
@@ -2512,12 +2508,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* If we're in interrupt, yes, we can always allocate. If @node is set in
* current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this
* node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
- * yes. If current has access to memory reserves due to TIF_MEMDIE, yes.
+ * yes. If current has access to memory reserves as an oom victim, yes.
* Otherwise, no.
*
* GFP_USER allocations are marked with the __GFP_HARDWALL bit,
* and do not allow allocations outside the current tasks cpuset
- * unless the task has been OOM killed as is marked TIF_MEMDIE.
+ * unless the task has been OOM killed.
* GFP_KERNEL allocations are not so marked, so can escape to the
* nearest enclosing hardwalled ancestor cpuset.
*
@@ -2540,7 +2536,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* affect that:
* in_interrupt - any node ok (current task context irrelevant)
* GFP_ATOMIC - any node ok
- * TIF_MEMDIE - any node ok
+ * tsk_is_oom_victim - any node ok
* GFP_KERNEL - any node in enclosing hardwalled cpuset ok
* GFP_USER - only nodes in current tasks mems allowed ok.
*/
@@ -2558,7 +2554,7 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
* Allow tasks that have access to memory reserves because they have
* been OOM killed to get memory anywhere.
*/
- if (unlikely(test_thread_flag(TIF_MEMDIE)))
+ if (unlikely(tsk_is_oom_victim(current)))
return true;
if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
return false;
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
new file mode 100644
index 000000000000..f661b4cc5efd
--- /dev/null
+++ b/kernel/cgroup/debug.c
@@ -0,0 +1,382 @@
+/*
+ * Debug controller
+ *
+ * WARNING: This controller is for cgroup core debugging only.
+ * Its interfaces are unstable and subject to changes at any time.
+ */
+#include <linux/ctype.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+#include "cgroup-internal.h"
+
+static struct cgroup_subsys_state *
+debug_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
+
+ if (!css)
+ return ERR_PTR(-ENOMEM);
+
+ return css;
+}
+
+static void debug_css_free(struct cgroup_subsys_state *css)
+{
+ kfree(css);
+}
+
+/*
+ * debug_taskcount_read - return the number of tasks in a cgroup.
+ * @cgrp: the cgroup in question
+ */
+static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return cgroup_task_count(css->cgroup);
+}
+
+static int current_css_set_read(struct seq_file *seq, void *v)
+{
+ struct kernfs_open_file *of = seq->private;
+ struct css_set *cset;
+ struct cgroup_subsys *ss;
+ struct cgroup_subsys_state *css;
+ int i, refcnt;
+
+ if (!cgroup_kn_lock_live(of->kn, false))
+ return -ENODEV;
+
+ spin_lock_irq(&css_set_lock);
+ rcu_read_lock();
+ cset = rcu_dereference(current->cgroups);
+ refcnt = refcount_read(&cset->refcount);
+ seq_printf(seq, "css_set %pK %d", cset, refcnt);
+ if (refcnt > cset->nr_tasks)
+ seq_printf(seq, " +%d", refcnt - cset->nr_tasks);
+ seq_puts(seq, "\n");
+
+ /*
+ * Print the css'es stored in the current css_set.
+ */
+ for_each_subsys(ss, i) {
+ css = cset->subsys[ss->id];
+ if (!css)
+ continue;
+ seq_printf(seq, "%2d: %-4s\t- %lx[%d]\n", ss->id, ss->name,
+ (unsigned long)css, css->id);
+ }
+ rcu_read_unlock();
+ spin_unlock_irq(&css_set_lock);
+ cgroup_kn_unlock(of->kn);
+ return 0;
+}
+
+static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ u64 count;
+
+ rcu_read_lock();
+ count = refcount_read(&task_css_set(current)->refcount);
+ rcu_read_unlock();
+ return count;
+}
+
+static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
+{
+ struct cgrp_cset_link *link;
+ struct css_set *cset;
+ char *name_buf;
+
+ name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+ if (!name_buf)
+ return -ENOMEM;
+
+ spin_lock_irq(&css_set_lock);
+ rcu_read_lock();
+ cset = rcu_dereference(current->cgroups);
+ list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
+ struct cgroup *c = link->cgrp;
+
+ cgroup_name(c, name_buf, NAME_MAX + 1);
+ seq_printf(seq, "Root %d group %s\n",
+ c->root->hierarchy_id, name_buf);
+ }
+ rcu_read_unlock();
+ spin_unlock_irq(&css_set_lock);
+ kfree(name_buf);
+ return 0;
+}
+
+#define MAX_TASKS_SHOWN_PER_CSS 25
+static int cgroup_css_links_read(struct seq_file *seq, void *v)
+{
+ struct cgroup_subsys_state *css = seq_css(seq);
+ struct cgrp_cset_link *link;
+ int dead_cnt = 0, extra_refs = 0, threaded_csets = 0;
+
+ spin_lock_irq(&css_set_lock);
+
+ list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
+ struct css_set *cset = link->cset;
+ struct task_struct *task;
+ int count = 0;
+ int refcnt = refcount_read(&cset->refcount);
+
+ /*
+ * Print out the proc_cset and threaded_cset relationship
+ * and highlight difference between refcount and task_count.
+ */
+ seq_printf(seq, "css_set %pK", cset);
+ if (rcu_dereference_protected(cset->dom_cset, 1) != cset) {
+ threaded_csets++;
+ seq_printf(seq, "=>%pK", cset->dom_cset);
+ }
+ if (!list_empty(&cset->threaded_csets)) {
+ struct css_set *tcset;
+ int idx = 0;
+
+ list_for_each_entry(tcset, &cset->threaded_csets,
+ threaded_csets_node) {
+ seq_puts(seq, idx ? "," : "<=");
+ seq_printf(seq, "%pK", tcset);
+ idx++;
+ }
+ } else {
+ seq_printf(seq, " %d", refcnt);
+ if (refcnt - cset->nr_tasks > 0) {
+ int extra = refcnt - cset->nr_tasks;
+
+ seq_printf(seq, " +%d", extra);
+ /*
+ * Take out the one additional reference in
+ * init_css_set.
+ */
+ if (cset == &init_css_set)
+ extra--;
+ extra_refs += extra;
+ }
+ }
+ seq_puts(seq, "\n");
+
+ list_for_each_entry(task, &cset->tasks, cg_list) {
+ if (count++ <= MAX_TASKS_SHOWN_PER_CSS)
+ seq_printf(seq, " task %d\n",
+ task_pid_vnr(task));
+ }
+
+ list_for_each_entry(task, &cset->mg_tasks, cg_list) {
+ if (count++ <= MAX_TASKS_SHOWN_PER_CSS)
+ seq_printf(seq, " task %d\n",
+ task_pid_vnr(task));
+ }
+ /* show # of overflowed tasks */
+ if (count > MAX_TASKS_SHOWN_PER_CSS)
+ seq_printf(seq, " ... (%d)\n",
+ count - MAX_TASKS_SHOWN_PER_CSS);
+
+ if (cset->dead) {
+ seq_puts(seq, " [dead]\n");
+ dead_cnt++;
+ }
+
+ WARN_ON(count != cset->nr_tasks);
+ }
+ spin_unlock_irq(&css_set_lock);
+
+ if (!dead_cnt && !extra_refs && !threaded_csets)
+ return 0;
+
+ seq_puts(seq, "\n");
+ if (threaded_csets)
+ seq_printf(seq, "threaded css_sets = %d\n", threaded_csets);
+ if (extra_refs)
+ seq_printf(seq, "extra references = %d\n", extra_refs);
+ if (dead_cnt)
+ seq_printf(seq, "dead css_sets = %d\n", dead_cnt);
+
+ return 0;
+}
+
+static int cgroup_subsys_states_read(struct seq_file *seq, void *v)
+{
+ struct kernfs_open_file *of = seq->private;
+ struct cgroup *cgrp;
+ struct cgroup_subsys *ss;
+ struct cgroup_subsys_state *css;
+ char pbuf[16];
+ int i;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENODEV;
+
+ for_each_subsys(ss, i) {
+ css = rcu_dereference_check(cgrp->subsys[ss->id], true);
+ if (!css)
+ continue;
+
+ pbuf[0] = '\0';
+
+ /* Show the parent CSS if applicable*/
+ if (css->parent)
+ snprintf(pbuf, sizeof(pbuf) - 1, " P=%d",
+ css->parent->id);
+ seq_printf(seq, "%2d: %-4s\t- %lx[%d] %d%s\n", ss->id, ss->name,
+ (unsigned long)css, css->id,
+ atomic_read(&css->online_cnt), pbuf);
+ }
+
+ cgroup_kn_unlock(of->kn);
+ return 0;
+}
+
+static void cgroup_masks_read_one(struct seq_file *seq, const char *name,
+ u16 mask)
+{
+ struct cgroup_subsys *ss;
+ int ssid;
+ bool first = true;
+
+ seq_printf(seq, "%-17s: ", name);
+ for_each_subsys(ss, ssid) {
+ if (!(mask & (1 << ssid)))
+ continue;
+ if (!first)
+ seq_puts(seq, ", ");
+ seq_puts(seq, ss->name);
+ first = false;
+ }
+ seq_putc(seq, '\n');
+}
+
+static int cgroup_masks_read(struct seq_file *seq, void *v)
+{
+ struct kernfs_open_file *of = seq->private;
+ struct cgroup *cgrp;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENODEV;
+
+ cgroup_masks_read_one(seq, "subtree_control", cgrp->subtree_control);
+ cgroup_masks_read_one(seq, "subtree_ss_mask", cgrp->subtree_ss_mask);
+
+ cgroup_kn_unlock(of->kn);
+ return 0;
+}
+
+static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ return (!cgroup_is_populated(css->cgroup) &&
+ !css_has_online_children(&css->cgroup->self));
+}
+
+static struct cftype debug_legacy_files[] = {
+ {
+ .name = "taskcount",
+ .read_u64 = debug_taskcount_read,
+ },
+
+ {
+ .name = "current_css_set",
+ .seq_show = current_css_set_read,
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ },
+
+ {
+ .name = "current_css_set_refcount",
+ .read_u64 = current_css_set_refcount_read,
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ },
+
+ {
+ .name = "current_css_set_cg_links",
+ .seq_show = current_css_set_cg_links_read,
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ },
+
+ {
+ .name = "cgroup_css_links",
+ .seq_show = cgroup_css_links_read,
+ },
+
+ {
+ .name = "cgroup_subsys_states",
+ .seq_show = cgroup_subsys_states_read,
+ },
+
+ {
+ .name = "cgroup_masks",
+ .seq_show = cgroup_masks_read,
+ },
+
+ {
+ .name = "releasable",
+ .read_u64 = releasable_read,
+ },
+
+ { } /* terminate */
+};
+
+static struct cftype debug_files[] = {
+ {
+ .name = "taskcount",
+ .read_u64 = debug_taskcount_read,
+ },
+
+ {
+ .name = "current_css_set",
+ .seq_show = current_css_set_read,
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ },
+
+ {
+ .name = "current_css_set_refcount",
+ .read_u64 = current_css_set_refcount_read,
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ },
+
+ {
+ .name = "current_css_set_cg_links",
+ .seq_show = current_css_set_cg_links_read,
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ },
+
+ {
+ .name = "css_links",
+ .seq_show = cgroup_css_links_read,
+ },
+
+ {
+ .name = "csses",
+ .seq_show = cgroup_subsys_states_read,
+ },
+
+ {
+ .name = "masks",
+ .seq_show = cgroup_masks_read,
+ },
+
+ { } /* terminate */
+};
+
+struct cgroup_subsys debug_cgrp_subsys = {
+ .css_alloc = debug_css_alloc,
+ .css_free = debug_css_free,
+ .legacy_cftypes = debug_legacy_files,
+};
+
+/*
+ * On v2, debug is an implicit controller enabled by "cgroup_debug" boot
+ * parameter.
+ */
+static int __init enable_cgroup_debug(char *str)
+{
+ debug_cgrp_subsys.dfl_cftypes = debug_files;
+ debug_cgrp_subsys.implicit_on_dfl = true;
+ debug_cgrp_subsys.threaded = true;
+ return 1;
+}
+__setup("cgroup_debug", enable_cgroup_debug);
diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c
index 1b72d56edce5..08236798d173 100644
--- a/kernel/cgroup/freezer.c
+++ b/kernel/cgroup/freezer.c
@@ -268,7 +268,7 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
rcu_read_unlock();
/* are all tasks frozen? */
- css_task_iter_start(css, &it);
+ css_task_iter_start(css, 0, &it);
while ((task = css_task_iter_next(&it))) {
if (freezing(task)) {
@@ -320,7 +320,7 @@ static void freeze_cgroup(struct freezer *freezer)
struct css_task_iter it;
struct task_struct *task;
- css_task_iter_start(&freezer->css, &it);
+ css_task_iter_start(&freezer->css, 0, &it);
while ((task = css_task_iter_next(&it)))
freeze_task(task);
css_task_iter_end(&it);
@@ -331,7 +331,7 @@ static void unfreeze_cgroup(struct freezer *freezer)
struct css_task_iter it;
struct task_struct *task;
- css_task_iter_start(&freezer->css, &it);
+ css_task_iter_start(&freezer->css, 0, &it);
while ((task = css_task_iter_next(&it)))
__thaw_task(task);
css_task_iter_end(&it);
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index 2237201d66d5..9829c67ebc0a 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -345,4 +345,5 @@ struct cgroup_subsys pids_cgrp_subsys = {
.free = pids_free,
.legacy_cftypes = pids_files,
.dfl_cftypes = pids_files,
+ .threaded = true,
};
diff --git a/kernel/compat.c b/kernel/compat.c
index ebd8bdc3fd68..6f0a0e723a06 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -120,6 +120,50 @@ static int __compat_put_timespec(const struct timespec *ts, struct compat_timesp
__put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
}
+static int __compat_get_timespec64(struct timespec64 *ts64,
+ const struct compat_timespec __user *cts)
+{
+ struct compat_timespec ts;
+ int ret;
+
+ ret = copy_from_user(&ts, cts, sizeof(ts));
+ if (ret)
+ return -EFAULT;
+
+ ts64->tv_sec = ts.tv_sec;
+ ts64->tv_nsec = ts.tv_nsec;
+
+ return 0;
+}
+
+static int __compat_put_timespec64(const struct timespec64 *ts64,
+ struct compat_timespec __user *cts)
+{
+ struct compat_timespec ts = {
+ .tv_sec = ts64->tv_sec,
+ .tv_nsec = ts64->tv_nsec
+ };
+ return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0;
+}
+
+int compat_get_timespec64(struct timespec64 *ts, const void __user *uts)
+{
+ if (COMPAT_USE_64BIT_TIME)
+ return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0;
+ else
+ return __compat_get_timespec64(ts, uts);
+}
+EXPORT_SYMBOL_GPL(compat_get_timespec64);
+
+int compat_put_timespec64(const struct timespec64 *ts, void __user *uts)
+{
+ if (COMPAT_USE_64BIT_TIME)
+ return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0;
+ else
+ return __compat_put_timespec64(ts, uts);
+}
+EXPORT_SYMBOL_GPL(compat_put_timespec64);
+
int compat_get_timeval(struct timeval *tv, const void __user *utv)
{
if (COMPAT_USE_64BIT_TIME)
@@ -203,53 +247,6 @@ int put_compat_itimerval(struct compat_itimerval __user *o, const struct itimerv
return copy_to_user(o, &v32, sizeof(struct compat_itimerval)) ? -EFAULT : 0;
}
-static compat_clock_t clock_t_to_compat_clock_t(clock_t x)
-{
- return compat_jiffies_to_clock_t(clock_t_to_jiffies(x));
-}
-
-COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf)
-{
- if (tbuf) {
- struct tms tms;
- struct compat_tms tmp;
-
- do_sys_times(&tms);
- /* Convert our struct tms to the compat version. */
- tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime);
- tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime);
- tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime);
- tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime);
- if (copy_to_user(tbuf, &tmp, sizeof(tmp)))
- return -EFAULT;
- }
- force_successful_syscall_return();
- return compat_jiffies_to_clock_t(jiffies);
-}
-
-#ifdef __ARCH_WANT_SYS_SIGPENDING
-
-/*
- * Assumption: old_sigset_t and compat_old_sigset_t are both
- * types that can be passed to put_user()/get_user().
- */
-
-COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set)
-{
- old_sigset_t s;
- long ret;
- mm_segment_t old_fs = get_fs();
-
- set_fs(KERNEL_DS);
- ret = sys_sigpending((old_sigset_t __user *) &s);
- set_fs(old_fs);
- if (ret == 0)
- ret = put_user(s, set);
- return ret;
-}
-
-#endif
-
#ifdef __ARCH_WANT_SYS_SIGPROCMASK
/*
@@ -304,164 +301,33 @@ COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how,
#endif
-COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource,
- struct compat_rlimit __user *, rlim)
-{
- struct rlimit r;
-
- if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) ||
- __get_user(r.rlim_cur, &rlim->rlim_cur) ||
- __get_user(r.rlim_max, &rlim->rlim_max))
- return -EFAULT;
-
- if (r.rlim_cur == COMPAT_RLIM_INFINITY)
- r.rlim_cur = RLIM_INFINITY;
- if (r.rlim_max == COMPAT_RLIM_INFINITY)
- r.rlim_max = RLIM_INFINITY;
- return do_prlimit(current, resource, &r, NULL);
-}
-
-#ifdef COMPAT_RLIM_OLD_INFINITY
-
-COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
- struct compat_rlimit __user *, rlim)
-{
- struct rlimit r;
- int ret;
- mm_segment_t old_fs = get_fs();
-
- set_fs(KERNEL_DS);
- ret = sys_old_getrlimit(resource, (struct rlimit __user *)&r);
- set_fs(old_fs);
-
- if (!ret) {
- if (r.rlim_cur > COMPAT_RLIM_OLD_INFINITY)
- r.rlim_cur = COMPAT_RLIM_INFINITY;
- if (r.rlim_max > COMPAT_RLIM_OLD_INFINITY)
- r.rlim_max = COMPAT_RLIM_INFINITY;
-
- if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) ||
- __put_user(r.rlim_cur, &rlim->rlim_cur) ||
- __put_user(r.rlim_max, &rlim->rlim_max))
- return -EFAULT;
- }
- return ret;
-}
-
-#endif
-
-COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource,
- struct compat_rlimit __user *, rlim)
-{
- struct rlimit r;
- int ret;
-
- ret = do_prlimit(current, resource, NULL, &r);
- if (!ret) {
- if (r.rlim_cur > COMPAT_RLIM_INFINITY)
- r.rlim_cur = COMPAT_RLIM_INFINITY;
- if (r.rlim_max > COMPAT_RLIM_INFINITY)
- r.rlim_max = COMPAT_RLIM_INFINITY;
-
- if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) ||
- __put_user(r.rlim_cur, &rlim->rlim_cur) ||
- __put_user(r.rlim_max, &rlim->rlim_max))
- return -EFAULT;
- }
- return ret;
-}
-
int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru)
{
- if (!access_ok(VERIFY_WRITE, ru, sizeof(*ru)) ||
- __put_user(r->ru_utime.tv_sec, &ru->ru_utime.tv_sec) ||
- __put_user(r->ru_utime.tv_usec, &ru->ru_utime.tv_usec) ||
- __put_user(r->ru_stime.tv_sec, &ru->ru_stime.tv_sec) ||
- __put_user(r->ru_stime.tv_usec, &ru->ru_stime.tv_usec) ||
- __put_user(r->ru_maxrss, &ru->ru_maxrss) ||
- __put_user(r->ru_ixrss, &ru->ru_ixrss) ||
- __put_user(r->ru_idrss, &ru->ru_idrss) ||
- __put_user(r->ru_isrss, &ru->ru_isrss) ||
- __put_user(r->ru_minflt, &ru->ru_minflt) ||
- __put_user(r->ru_majflt, &ru->ru_majflt) ||
- __put_user(r->ru_nswap, &ru->ru_nswap) ||
- __put_user(r->ru_inblock, &ru->ru_inblock) ||
- __put_user(r->ru_oublock, &ru->ru_oublock) ||
- __put_user(r->ru_msgsnd, &ru->ru_msgsnd) ||
- __put_user(r->ru_msgrcv, &ru->ru_msgrcv) ||
- __put_user(r->ru_nsignals, &ru->ru_nsignals) ||
- __put_user(r->ru_nvcsw, &ru->ru_nvcsw) ||
- __put_user(r->ru_nivcsw, &ru->ru_nivcsw))
+ struct compat_rusage r32;
+ memset(&r32, 0, sizeof(r32));
+ r32.ru_utime.tv_sec = r->ru_utime.tv_sec;
+ r32.ru_utime.tv_usec = r->ru_utime.tv_usec;
+ r32.ru_stime.tv_sec = r->ru_stime.tv_sec;
+ r32.ru_stime.tv_usec = r->ru_stime.tv_usec;
+ r32.ru_maxrss = r->ru_maxrss;
+ r32.ru_ixrss = r->ru_ixrss;
+ r32.ru_idrss = r->ru_idrss;
+ r32.ru_isrss = r->ru_isrss;
+ r32.ru_minflt = r->ru_minflt;
+ r32.ru_majflt = r->ru_majflt;
+ r32.ru_nswap = r->ru_nswap;
+ r32.ru_inblock = r->ru_inblock;
+ r32.ru_oublock = r->ru_oublock;
+ r32.ru_msgsnd = r->ru_msgsnd;
+ r32.ru_msgrcv = r->ru_msgrcv;
+ r32.ru_nsignals = r->ru_nsignals;
+ r32.ru_nvcsw = r->ru_nvcsw;
+ r32.ru_nivcsw = r->ru_nivcsw;
+ if (copy_to_user(ru, &r32, sizeof(r32)))
return -EFAULT;
return 0;
}
-COMPAT_SYSCALL_DEFINE4(wait4,
- compat_pid_t, pid,
- compat_uint_t __user *, stat_addr,
- int, options,
- struct compat_rusage __user *, ru)
-{
- if (!ru) {
- return sys_wait4(pid, stat_addr, options, NULL);
- } else {
- struct rusage r;
- int ret;
- unsigned int status;
- mm_segment_t old_fs = get_fs();
-
- set_fs (KERNEL_DS);
- ret = sys_wait4(pid,
- (stat_addr ?
- (unsigned int __user *) &status : NULL),
- options, (struct rusage __user *) &r);
- set_fs (old_fs);
-
- if (ret > 0) {
- if (put_compat_rusage(&r, ru))
- return -EFAULT;
- if (stat_addr && put_user(status, stat_addr))
- return -EFAULT;
- }
- return ret;
- }
-}
-
-COMPAT_SYSCALL_DEFINE5(waitid,
- int, which, compat_pid_t, pid,
- struct compat_siginfo __user *, uinfo, int, options,
- struct compat_rusage __user *, uru)
-{
- siginfo_t info;
- struct rusage ru;
- long ret;
- mm_segment_t old_fs = get_fs();
-
- memset(&info, 0, sizeof(info));
-
- set_fs(KERNEL_DS);
- ret = sys_waitid(which, pid, (siginfo_t __user *)&info, options,
- uru ? (struct rusage __user *)&ru : NULL);
- set_fs(old_fs);
-
- if ((ret < 0) || (info.si_signo == 0))
- return ret;
-
- if (uru) {
- /* sys_waitid() overwrites everything in ru */
- if (COMPAT_USE_64BIT_TIME)
- ret = copy_to_user(uru, &ru, sizeof(ru));
- else
- ret = put_compat_rusage(&ru, uru);
- if (ret)
- return -EFAULT;
- }
-
- BUG_ON(info.si_code & __SI_MASK);
- info.si_code |= __SI_CHLD;
- return copy_siginfo_to_user32(uinfo, &info);
-}
-
static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr,
unsigned len, struct cpumask *new_mask)
{
@@ -542,6 +408,27 @@ int put_compat_itimerspec(struct compat_itimerspec __user *dst,
return 0;
}
+int get_compat_itimerspec64(struct itimerspec64 *its,
+ const struct compat_itimerspec __user *uits)
+{
+
+ if (__compat_get_timespec64(&its->it_interval, &uits->it_interval) ||
+ __compat_get_timespec64(&its->it_value, &uits->it_value))
+ return -EFAULT;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(get_compat_itimerspec64);
+
+int put_compat_itimerspec64(const struct itimerspec64 *its,
+ struct compat_itimerspec __user *uits)
+{
+ if (__compat_put_timespec64(&its->it_interval, &uits->it_interval) ||
+ __compat_put_timespec64(&its->it_value, &uits->it_value))
+ return -EFAULT;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(put_compat_itimerspec64);
+
/*
* We currently only need the following fields from the sigevent
* structure: sigev_value, sigev_signo, sig_notify and (sometimes
@@ -566,84 +453,59 @@ int get_compat_sigevent(struct sigevent *event,
long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask,
unsigned long bitmap_size)
{
- int i, j;
- unsigned long m;
- compat_ulong_t um;
unsigned long nr_compat_longs;
/* align bitmap up to nearest compat_long_t boundary */
bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG);
+ nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
if (!access_ok(VERIFY_READ, umask, bitmap_size / 8))
return -EFAULT;
- nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
-
- for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) {
- m = 0;
-
- for (j = 0; j < sizeof(m)/sizeof(um); j++) {
- /*
- * We dont want to read past the end of the userspace
- * bitmap. We must however ensure the end of the
- * kernel bitmap is zeroed.
- */
- if (nr_compat_longs) {
- nr_compat_longs--;
- if (__get_user(um, umask))
- return -EFAULT;
- } else {
- um = 0;
- }
-
- umask++;
- m |= (long)um << (j * BITS_PER_COMPAT_LONG);
- }
- *mask++ = m;
+ user_access_begin();
+ while (nr_compat_longs > 1) {
+ compat_ulong_t l1, l2;
+ unsafe_get_user(l1, umask++, Efault);
+ unsafe_get_user(l2, umask++, Efault);
+ *mask++ = ((unsigned long)l2 << BITS_PER_COMPAT_LONG) | l1;
+ nr_compat_longs -= 2;
}
-
+ if (nr_compat_longs)
+ unsafe_get_user(*mask, umask++, Efault);
+ user_access_end();
return 0;
+
+Efault:
+ user_access_end();
+ return -EFAULT;
}
long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
unsigned long bitmap_size)
{
- int i, j;
- unsigned long m;
- compat_ulong_t um;
unsigned long nr_compat_longs;
/* align bitmap up to nearest compat_long_t boundary */
bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG);
+ nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
if (!access_ok(VERIFY_WRITE, umask, bitmap_size / 8))
return -EFAULT;
- nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
-
- for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) {
- m = *mask++;
-
- for (j = 0; j < sizeof(m)/sizeof(um); j++) {
- um = m;
-
- /*
- * We dont want to write past the end of the userspace
- * bitmap.
- */
- if (nr_compat_longs) {
- nr_compat_longs--;
- if (__put_user(um, umask))
- return -EFAULT;
- }
-
- umask++;
- m >>= 4*sizeof(um);
- m >>= 4*sizeof(um);
- }
+ user_access_begin();
+ while (nr_compat_longs > 1) {
+ unsigned long m = *mask++;
+ unsafe_put_user((compat_ulong_t)m, umask++, Efault);
+ unsafe_put_user(m >> BITS_PER_COMPAT_LONG, umask++, Efault);
+ nr_compat_longs -= 2;
}
-
+ if (nr_compat_longs)
+ unsafe_put_user((compat_ulong_t)*mask, umask++, Efault);
+ user_access_end();
return 0;
+Efault:
+ user_access_end();
+ return -EFAULT;
}
void
@@ -669,38 +531,6 @@ sigset_to_compat(compat_sigset_t *compat, const sigset_t *set)
}
}
-COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
- struct compat_siginfo __user *, uinfo,
- struct compat_timespec __user *, uts, compat_size_t, sigsetsize)
-{
- compat_sigset_t s32;
- sigset_t s;
- struct timespec t;
- siginfo_t info;
- long ret;
-
- if (sigsetsize != sizeof(sigset_t))
- return -EINVAL;
-
- if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
- return -EFAULT;
- sigset_from_compat(&s, &s32);
-
- if (uts) {
- if (compat_get_timespec(&t, uts))
- return -EFAULT;
- }
-
- ret = do_sigtimedwait(&s, &info, uts ? &t : NULL);
-
- if (ret > 0 && uinfo) {
- if (copy_siginfo_to_user32(uinfo, &info))
- ret = -EFAULT;
- }
-
- return ret;
-}
-
#ifdef CONFIG_NUMA
COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages,
compat_uptr_t __user *, pages32,
diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config
index d70829033bb7..d3fd428f4b92 100644
--- a/kernel/configs/android-base.config
+++ b/kernel/configs/android-base.config
@@ -10,6 +10,7 @@
# CONFIG_USELIB is not set
CONFIG_ANDROID=y
CONFIG_ANDROID_BINDER_IPC=y
+CONFIG_ANDROID_BINDER_DEVICES=binder,hwbinder,vndbinder
CONFIG_ANDROID_LOW_MEMORY_KILLER=y
CONFIG_ARMV8_DEPRECATED=y
CONFIG_ASHMEM=y
diff --git a/kernel/cpu.c b/kernel/cpu.c
index b03a32595cfe..acf5308fad51 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -271,11 +271,26 @@ void cpu_hotplug_enable(void)
EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
#endif /* CONFIG_HOTPLUG_CPU */
+static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st);
+
static int bringup_wait_for_ap(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+ /* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */
wait_for_completion(&st->done);
+ if (WARN_ON_ONCE((!cpu_online(cpu))))
+ return -ECANCELED;
+
+ /* Unpark the stopper thread and the hotplug thread of the target cpu */
+ stop_machine_unpark(cpu);
+ kthread_unpark(st->thread);
+
+ /* Should we go further up ? */
+ if (st->target > CPUHP_AP_ONLINE_IDLE) {
+ __cpuhp_kick_ap_work(st);
+ wait_for_completion(&st->done);
+ }
return st->result;
}
@@ -296,9 +311,7 @@ static int bringup_cpu(unsigned int cpu)
irq_unlock_sparse();
if (ret)
return ret;
- ret = bringup_wait_for_ap(cpu);
- BUG_ON(!cpu_online(cpu));
- return ret;
+ return bringup_wait_for_ap(cpu);
}
/*
@@ -637,6 +650,7 @@ static int takedown_cpu(unsigned int cpu)
__cpu_die(cpu);
tick_cleanup_dead_cpu(cpu);
+ rcutree_migrate_callbacks(cpu);
return 0;
}
@@ -767,31 +781,20 @@ void notify_cpu_starting(unsigned int cpu)
}
/*
- * Called from the idle task. We need to set active here, so we can kick off
- * the stopper thread and unpark the smpboot threads. If the target state is
- * beyond CPUHP_AP_ONLINE_IDLE we kick cpuhp thread and let it bring up the
- * cpu further.
+ * Called from the idle task. Wake up the controlling task which brings the
+ * stopper and the hotplug thread of the upcoming CPU up and then delegates
+ * the rest of the online bringup to the hotplug thread.
*/
void cpuhp_online_idle(enum cpuhp_state state)
{
struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
- unsigned int cpu = smp_processor_id();
/* Happens for the boot cpu */
if (state != CPUHP_AP_ONLINE_IDLE)
return;
st->state = CPUHP_AP_ONLINE_IDLE;
-
- /* Unpark the stopper thread and the hotplug thread of this cpu */
- stop_machine_unpark(cpu);
- kthread_unpark(st->thread);
-
- /* Should we go further up ? */
- if (st->target > CPUHP_AP_ONLINE_IDLE)
- __cpuhp_kick_ap_work(st);
- else
- complete(&st->done);
+ complete(&st->done);
}
/* Requires cpu_add_remove_lock to be held */
@@ -1250,7 +1253,17 @@ static int cpuhp_store_callbacks(enum cpuhp_state state, const char *name,
struct cpuhp_step *sp;
int ret = 0;
- if (state == CPUHP_AP_ONLINE_DYN || state == CPUHP_BP_PREPARE_DYN) {
+ /*
+ * If name is NULL, then the state gets removed.
+ *
+ * CPUHP_AP_ONLINE_DYN and CPUHP_BP_PREPARE_DYN are handed out on
+ * the first allocation from these dynamic ranges, so the removal
+ * would trigger a new allocation and clear the wrong (already
+ * empty) state, leaving the callbacks of the to be cleared state
+ * dangling, which causes wreckage on the next hotplug operation.
+ */
+ if (name && (state == CPUHP_AP_ONLINE_DYN ||
+ state == CPUHP_BP_PREPARE_DYN)) {
ret = cpuhp_reserve_state(state);
if (ret < 0)
return ret;
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index 009cc9a17d95..67b02e138a47 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -22,15 +22,21 @@
#include <linux/spinlock.h>
#include <linux/syscore_ops.h>
-static DEFINE_RWLOCK(cpu_pm_notifier_lock);
-static RAW_NOTIFIER_HEAD(cpu_pm_notifier_chain);
+static ATOMIC_NOTIFIER_HEAD(cpu_pm_notifier_chain);
static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls)
{
int ret;
- ret = __raw_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL,
+ /*
+ * __atomic_notifier_call_chain has a RCU read critical section, which
+ * could be disfunctional in cpu idle. Copy RCU_NONIDLE code to let
+ * RCU know this.
+ */
+ rcu_irq_enter_irqson();
+ ret = __atomic_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL,
nr_to_call, nr_calls);
+ rcu_irq_exit_irqson();
return notifier_to_errno(ret);
}
@@ -47,14 +53,7 @@ static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls)
*/
int cpu_pm_register_notifier(struct notifier_block *nb)
{
- unsigned long flags;
- int ret;
-
- write_lock_irqsave(&cpu_pm_notifier_lock, flags);
- ret = raw_notifier_chain_register(&cpu_pm_notifier_chain, nb);
- write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
-
- return ret;
+ return atomic_notifier_chain_register(&cpu_pm_notifier_chain, nb);
}
EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
@@ -69,14 +68,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
*/
int cpu_pm_unregister_notifier(struct notifier_block *nb)
{
- unsigned long flags;
- int ret;
-
- write_lock_irqsave(&cpu_pm_notifier_lock, flags);
- ret = raw_notifier_chain_unregister(&cpu_pm_notifier_chain, nb);
- write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
-
- return ret;
+ return atomic_notifier_chain_unregister(&cpu_pm_notifier_chain, nb);
}
EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
@@ -100,7 +92,6 @@ int cpu_pm_enter(void)
int nr_calls;
int ret = 0;
- read_lock(&cpu_pm_notifier_lock);
ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls);
if (ret)
/*
@@ -108,7 +99,6 @@ int cpu_pm_enter(void)
* PM entry who are notified earlier to prepare for it.
*/
cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL);
- read_unlock(&cpu_pm_notifier_lock);
return ret;
}
@@ -128,13 +118,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_enter);
*/
int cpu_pm_exit(void)
{
- int ret;
-
- read_lock(&cpu_pm_notifier_lock);
- ret = cpu_pm_notify(CPU_PM_EXIT, -1, NULL);
- read_unlock(&cpu_pm_notifier_lock);
-
- return ret;
+ return cpu_pm_notify(CPU_PM_EXIT, -1, NULL);
}
EXPORT_SYMBOL_GPL(cpu_pm_exit);
@@ -159,7 +143,6 @@ int cpu_cluster_pm_enter(void)
int nr_calls;
int ret = 0;
- read_lock(&cpu_pm_notifier_lock);
ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls);
if (ret)
/*
@@ -167,7 +150,6 @@ int cpu_cluster_pm_enter(void)
* PM entry who are notified earlier to prepare for it.
*/
cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL);
- read_unlock(&cpu_pm_notifier_lock);
return ret;
}
@@ -190,13 +172,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
*/
int cpu_cluster_pm_exit(void)
{
- int ret;
-
- read_lock(&cpu_pm_notifier_lock);
- ret = cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL);
- read_unlock(&cpu_pm_notifier_lock);
-
- return ret;
+ return cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL);
}
EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit);
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index fcbd568f1e95..6db80fc0810b 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -14,10 +14,12 @@
#include <asm/sections.h>
/* vmcoreinfo stuff */
-static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
-u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
-size_t vmcoreinfo_size;
-size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
+static unsigned char *vmcoreinfo_data;
+static size_t vmcoreinfo_size;
+u32 *vmcoreinfo_note;
+
+/* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */
+static unsigned char *vmcoreinfo_data_safecopy;
/*
* parsing the "crashkernel" commandline
@@ -324,8 +326,23 @@ static void update_vmcoreinfo_note(void)
final_note(buf);
}
+void crash_update_vmcoreinfo_safecopy(void *ptr)
+{
+ if (ptr)
+ memcpy(ptr, vmcoreinfo_data, vmcoreinfo_size);
+
+ vmcoreinfo_data_safecopy = ptr;
+}
+
void crash_save_vmcoreinfo(void)
{
+ if (!vmcoreinfo_note)
+ return;
+
+ /* Use the safe copy to generate vmcoreinfo note if have */
+ if (vmcoreinfo_data_safecopy)
+ vmcoreinfo_data = vmcoreinfo_data_safecopy;
+
vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
update_vmcoreinfo_note();
}
@@ -340,7 +357,7 @@ void vmcoreinfo_append_str(const char *fmt, ...)
r = vscnprintf(buf, sizeof(buf), fmt, args);
va_end(args);
- r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
+ r = min(r, (size_t)VMCOREINFO_BYTES - vmcoreinfo_size);
memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
@@ -356,11 +373,26 @@ void __weak arch_crash_save_vmcoreinfo(void)
phys_addr_t __weak paddr_vmcoreinfo_note(void)
{
- return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note);
+ return __pa(vmcoreinfo_note);
}
static int __init crash_save_vmcoreinfo_init(void)
{
+ vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
+ if (!vmcoreinfo_data) {
+ pr_warn("Memory allocation for vmcoreinfo_data failed\n");
+ return -ENOMEM;
+ }
+
+ vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE,
+ GFP_KERNEL | __GFP_ZERO);
+ if (!vmcoreinfo_note) {
+ free_page((unsigned long)vmcoreinfo_data);
+ vmcoreinfo_data = NULL;
+ pr_warn("Memory allocation for vmcoreinfo_note failed\n");
+ return -ENOMEM;
+ }
+
VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
VMCOREINFO_PAGESIZE(PAGE_SIZE);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4d2c32f98482..3e691b75b2db 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1249,26 +1249,31 @@ unclone_ctx(struct perf_event_context *ctx)
return parent_ctx;
}
-static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
+static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
+ enum pid_type type)
{
+ u32 nr;
/*
* only top level events have the pid namespace they were created in
*/
if (event->parent)
event = event->parent;
- return task_tgid_nr_ns(p, event->ns);
+ nr = __task_pid_nr_ns(p, type, event->ns);
+ /* avoid -1 if it is idle thread or runs in another ns */
+ if (!nr && !pid_alive(p))
+ nr = -1;
+ return nr;
}
-static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
+static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
{
- /*
- * only top level events have the pid namespace they were created in
- */
- if (event->parent)
- event = event->parent;
+ return perf_event_pid_type(event, p, __PIDTYPE_TGID);
+}
- return task_pid_nr_ns(p, event->ns);
+static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
+{
+ return perf_event_pid_type(event, p, PIDTYPE_PID);
}
/*
@@ -1452,6 +1457,13 @@ static enum event_type_t get_event_type(struct perf_event *event)
lockdep_assert_held(&ctx->lock);
+ /*
+ * It's 'group type', really, because if our group leader is
+ * pinned, so are we.
+ */
+ if (event->group_leader != event)
+ event = event->group_leader;
+
event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
if (!ctx->task)
event_type |= EVENT_CPU;
@@ -1563,6 +1575,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
if (sample_type & PERF_SAMPLE_TRANSACTION)
size += sizeof(data->txn);
+ if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+ size += sizeof(data->phys_addr);
+
event->header_size = size;
}
@@ -2210,6 +2225,33 @@ static int group_can_go_on(struct perf_event *event,
return can_add_hw;
}
+/*
+ * Complement to update_event_times(). This computes the tstamp_* values to
+ * continue 'enabled' state from @now, and effectively discards the time
+ * between the prior tstamp_stopped and now (as we were in the OFF state, or
+ * just switched (context) time base).
+ *
+ * This further assumes '@event->state == INACTIVE' (we just came from OFF) and
+ * cannot have been scheduled in yet. And going into INACTIVE state means
+ * '@event->tstamp_stopped = @now'.
+ *
+ * Thus given the rules of update_event_times():
+ *
+ * total_time_enabled = tstamp_stopped - tstamp_enabled
+ * total_time_running = tstamp_stopped - tstamp_running
+ *
+ * We can insert 'tstamp_stopped == now' and reverse them to compute new
+ * tstamp_* values.
+ */
+static void __perf_event_enable_time(struct perf_event *event, u64 now)
+{
+ WARN_ON_ONCE(event->state != PERF_EVENT_STATE_INACTIVE);
+
+ event->tstamp_stopped = now;
+ event->tstamp_enabled = now - event->total_time_enabled;
+ event->tstamp_running = now - event->total_time_running;
+}
+
static void add_event_to_ctx(struct perf_event *event,
struct perf_event_context *ctx)
{
@@ -2217,9 +2259,12 @@ static void add_event_to_ctx(struct perf_event *event,
list_add_event(event, ctx);
perf_group_attach(event);
- event->tstamp_enabled = tstamp;
- event->tstamp_running = tstamp;
- event->tstamp_stopped = tstamp;
+ /*
+ * We can be called with event->state == STATE_OFF when we create with
+ * .disabled = 1. In that case the IOC_ENABLE will call this function.
+ */
+ if (event->state == PERF_EVENT_STATE_INACTIVE)
+ __perf_event_enable_time(event, tstamp);
}
static void ctx_sched_out(struct perf_event_context *ctx,
@@ -2464,10 +2509,11 @@ static void __perf_event_mark_enabled(struct perf_event *event)
u64 tstamp = perf_event_time(event);
event->state = PERF_EVENT_STATE_INACTIVE;
- event->tstamp_enabled = tstamp - event->total_time_enabled;
+ __perf_event_enable_time(event, tstamp);
list_for_each_entry(sub, &event->sibling_list, group_entry) {
+ /* XXX should not be > INACTIVE if event isn't */
if (sub->state >= PERF_EVENT_STATE_INACTIVE)
- sub->tstamp_enabled = tstamp - sub->total_time_enabled;
+ __perf_event_enable_time(sub, tstamp);
}
}
@@ -3173,6 +3219,13 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
return;
perf_ctx_lock(cpuctx, ctx);
+ /*
+ * We must check ctx->nr_events while holding ctx->lock, such
+ * that we serialize against perf_install_in_context().
+ */
+ if (!ctx->nr_events)
+ goto unlock;
+
perf_pmu_disable(ctx->pmu);
/*
* We want to keep the following priority order:
@@ -3186,6 +3239,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
perf_event_sched_in(cpuctx, ctx, task);
perf_pmu_enable(ctx->pmu);
+
+unlock:
perf_ctx_unlock(cpuctx, ctx);
}
@@ -3618,10 +3673,7 @@ unlock:
static inline u64 perf_event_count(struct perf_event *event)
{
- if (event->pmu->count)
- return event->pmu->count(event);
-
- return __perf_event_count(event);
+ return local64_read(&event->count) + atomic64_read(&event->child_count);
}
/*
@@ -3632,10 +3684,10 @@ static inline u64 perf_event_count(struct perf_event *event)
* will not be local and we cannot read them atomically
* - must not have a pmu::count method
*/
-u64 perf_event_read_local(struct perf_event *event)
+int perf_event_read_local(struct perf_event *event, u64 *value)
{
unsigned long flags;
- u64 val;
+ int ret = 0;
/*
* Disabling interrupts avoids all counter scheduling (context
@@ -3643,25 +3695,28 @@ u64 perf_event_read_local(struct perf_event *event)
*/
local_irq_save(flags);
- /* If this is a per-task event, it must be for current */
- WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) &&
- event->hw.target != current);
-
- /* If this is a per-CPU event, it must be for this CPU */
- WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) &&
- event->cpu != smp_processor_id());
-
/*
* It must not be an event with inherit set, we cannot read
* all child counters from atomic context.
*/
- WARN_ON_ONCE(event->attr.inherit);
+ if (event->attr.inherit) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
- /*
- * It must not have a pmu::count method, those are not
- * NMI safe.
- */
- WARN_ON_ONCE(event->pmu->count);
+ /* If this is a per-task event, it must be for current */
+ if ((event->attach_state & PERF_ATTACH_TASK) &&
+ event->hw.target != current) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* If this is a per-CPU event, it must be for this CPU */
+ if (!(event->attach_state & PERF_ATTACH_TASK) &&
+ event->cpu != smp_processor_id()) {
+ ret = -EINVAL;
+ goto out;
+ }
/*
* If the event is currently on this CPU, its either a per-task event,
@@ -3671,10 +3726,11 @@ u64 perf_event_read_local(struct perf_event *event)
if (event->oncpu == smp_processor_id())
event->pmu->read(event);
- val = local64_read(&event->count);
+ *value = local64_read(&event->count);
+out:
local_irq_restore(flags);
- return val;
+ return ret;
}
static int perf_event_read(struct perf_event *event, bool group)
@@ -4365,7 +4421,9 @@ EXPORT_SYMBOL_GPL(perf_event_read_value);
static int __perf_read_group_add(struct perf_event *leader,
u64 read_format, u64 *values)
{
+ struct perf_event_context *ctx = leader->ctx;
struct perf_event *sub;
+ unsigned long flags;
int n = 1; /* skip @nr */
int ret;
@@ -4395,12 +4453,15 @@ static int __perf_read_group_add(struct perf_event *leader,
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
+ raw_spin_lock_irqsave(&ctx->lock, flags);
+
list_for_each_entry(sub, &leader->sibling_list, group_entry) {
values[n++] += perf_event_count(sub);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
}
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
return 0;
}
@@ -5065,7 +5126,7 @@ static void perf_mmap_open(struct vm_area_struct *vma)
atomic_inc(&event->rb->aux_mmap_count);
if (event->pmu->event_mapped)
- event->pmu->event_mapped(event);
+ event->pmu->event_mapped(event, vma->vm_mm);
}
static void perf_pmu_output_stop(struct perf_event *event);
@@ -5088,7 +5149,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
unsigned long size = perf_data_size(rb);
if (event->pmu->event_unmapped)
- event->pmu->event_unmapped(event);
+ event->pmu->event_unmapped(event, vma->vm_mm);
/*
* rb->aux_mmap_count will always drop before rb->mmap_count and
@@ -5386,7 +5447,7 @@ aux_unlock:
vma->vm_ops = &perf_mmap_vmops;
if (event->pmu->event_mapped)
- event->pmu->event_mapped(event);
+ event->pmu->event_mapped(event, vma->vm_mm);
return ret;
}
@@ -5947,6 +6008,9 @@ void perf_output_sample(struct perf_output_handle *handle,
}
}
+ if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+ perf_output_put(handle, data->phys_addr);
+
if (!event->attr.watermark) {
int wakeup_events = event->attr.wakeup_events;
@@ -5962,6 +6026,38 @@ void perf_output_sample(struct perf_output_handle *handle,
}
}
+static u64 perf_virt_to_phys(u64 virt)
+{
+ u64 phys_addr = 0;
+ struct page *p = NULL;
+
+ if (!virt)
+ return 0;
+
+ if (virt >= TASK_SIZE) {
+ /* If it's vmalloc()d memory, leave phys_addr as 0 */
+ if (virt_addr_valid((void *)(uintptr_t)virt) &&
+ !(virt >= VMALLOC_START && virt < VMALLOC_END))
+ phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
+ } else {
+ /*
+ * Walking the pages tables for user address.
+ * Interrupts are disabled, so it prevents any tear down
+ * of the page tables.
+ * Try IRQ-safe __get_user_pages_fast first.
+ * If failed, leave phys_addr as 0.
+ */
+ if ((current->mm != NULL) &&
+ (__get_user_pages_fast(virt, 1, 0, &p) == 1))
+ phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
+
+ if (p)
+ put_page(p);
+ }
+
+ return phys_addr;
+}
+
void perf_prepare_sample(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event,
@@ -6080,6 +6176,9 @@ void perf_prepare_sample(struct perf_event_header *header,
header->size += size;
}
+
+ if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+ data->phys_addr = perf_virt_to_phys(data->addr);
}
static void __always_inline
@@ -7231,6 +7330,11 @@ static void perf_log_throttle(struct perf_event *event, int enable)
perf_output_end(&handle);
}
+void perf_event_itrace_started(struct perf_event *event)
+{
+ event->attach_state |= PERF_ATTACH_ITRACE;
+}
+
static void perf_log_itrace_start(struct perf_event *event)
{
struct perf_output_handle handle;
@@ -7246,7 +7350,7 @@ static void perf_log_itrace_start(struct perf_event *event)
event = event->parent;
if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
- event->hw.itrace_started)
+ event->attach_state & PERF_ATTACH_ITRACE)
return;
rec.header.type = PERF_RECORD_ITRACE_START;
@@ -7308,21 +7412,6 @@ int perf_event_account_interrupt(struct perf_event *event)
return __perf_event_account_interrupt(event, 1);
}
-static bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs)
-{
- /*
- * Due to interrupt latency (AKA "skid"), we may enter the
- * kernel before taking an overflow, even if the PMU is only
- * counting user events.
- * To avoid leaking information to userspace, we must always
- * reject kernel samples when exclude_kernel is set.
- */
- if (event->attr.exclude_kernel && !user_mode(regs))
- return false;
-
- return true;
-}
-
/*
* Generic event overflow handling, sampling.
*/
@@ -7344,12 +7433,6 @@ static int __perf_event_overflow(struct perf_event *event,
ret = __perf_event_account_interrupt(event, throttle);
/*
- * For security, drop the skid kernel samples if necessary.
- */
- if (!sample_is_allowed(event, regs))
- return ret;
-
- /*
* XXX event_limit might not quite work as expected on inherited
* events
*/
@@ -7871,16 +7954,15 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
}
}
perf_tp_event(call->event.type, count, raw_data, size, regs, head,
- rctx, task);
+ rctx, task, NULL);
}
EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
struct pt_regs *regs, struct hlist_head *head, int rctx,
- struct task_struct *task)
+ struct task_struct *task, struct perf_event *event)
{
struct perf_sample_data data;
- struct perf_event *event;
struct perf_raw_record raw = {
.frag = {
@@ -7894,9 +7976,15 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
perf_trace_buf_update(record, event_type);
- hlist_for_each_entry_rcu(event, head, hlist_entry) {
+ /* Use the given event instead of the hlist */
+ if (event) {
if (perf_tp_event_match(event, &data, regs))
perf_swevent_event(event, count, &data, regs);
+ } else {
+ hlist_for_each_entry_rcu(event, head, hlist_entry) {
+ if (perf_tp_event_match(event, &data, regs))
+ perf_swevent_event(event, count, &data, regs);
+ }
}
/*
@@ -8046,22 +8134,19 @@ static void perf_event_free_bpf_handler(struct perf_event *event)
static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
{
- bool is_kprobe, is_tracepoint;
+ bool is_kprobe, is_tracepoint, is_syscall_tp;
struct bpf_prog *prog;
- if (event->attr.type == PERF_TYPE_HARDWARE ||
- event->attr.type == PERF_TYPE_SOFTWARE)
- return perf_event_set_bpf_handler(event, prog_fd);
-
if (event->attr.type != PERF_TYPE_TRACEPOINT)
- return -EINVAL;
+ return perf_event_set_bpf_handler(event, prog_fd);
if (event->tp_event->prog)
return -EEXIST;
is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
- if (!is_kprobe && !is_tracepoint)
+ is_syscall_tp = is_syscall_trace_event(event->tp_event);
+ if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
/* bpf programs can only be attached to u/kprobe or tracepoint */
return -EINVAL;
@@ -8070,13 +8155,14 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
return PTR_ERR(prog);
if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
- (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
+ (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
+ (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
/* valid fd, but invalid bpf program type */
bpf_prog_put(prog);
return -EINVAL;
}
- if (is_tracepoint) {
+ if (is_tracepoint || is_syscall_tp) {
int off = trace_event_get_offsets(event->tp_event);
if (prog->aux->max_ctx_offset > off) {
@@ -9580,6 +9666,8 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
if (ret)
return -EFAULT;
+ attr->size = size;
+
if (attr->__reserved_1)
return -EINVAL;
@@ -9852,6 +9940,11 @@ SYSCALL_DEFINE5(perf_event_open,
return -EINVAL;
}
+ /* Only privileged users can get physical addresses */
+ if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
+ perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
if (!attr.sample_max_stack)
attr.sample_max_stack = sysctl_perf_event_max_stack;
@@ -10001,28 +10094,27 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_context;
/*
- * Do not allow to attach to a group in a different
- * task or CPU context:
+ * Make sure we're both events for the same CPU;
+ * grouping events for different CPUs is broken; since
+ * you can never concurrently schedule them anyhow.
*/
- if (move_group) {
- /*
- * Make sure we're both on the same task, or both
- * per-cpu events.
- */
- if (group_leader->ctx->task != ctx->task)
- goto err_context;
+ if (group_leader->cpu != event->cpu)
+ goto err_context;
- /*
- * Make sure we're both events for the same CPU;
- * grouping events for different CPUs is broken; since
- * you can never concurrently schedule them anyhow.
- */
- if (group_leader->cpu != event->cpu)
- goto err_context;
- } else {
- if (group_leader->ctx != ctx)
- goto err_context;
- }
+ /*
+ * Make sure we're both on the same task, or both
+ * per-CPU events.
+ */
+ if (group_leader->ctx->task != ctx->task)
+ goto err_context;
+
+ /*
+ * Do not allow to attach to a group in a different task
+ * or CPU context. If we're moving SW events, we'll fix
+ * this up later, so allow that.
+ */
+ if (!move_group && group_leader->ctx != ctx)
+ goto err_context;
/*
* Only a group leader can be exclusive or pinned
@@ -11201,5 +11293,6 @@ struct cgroup_subsys perf_event_cgrp_subsys = {
* controller is not mounted on a legacy hierarchy.
*/
.implicit_on_dfl = true,
+ .threaded = true,
};
#endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 486fd78eb8d5..843e97047335 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -38,9 +38,9 @@ struct ring_buffer {
struct user_struct *mmap_user;
/* AUX area */
- local_t aux_head;
+ long aux_head;
local_t aux_nest;
- local_t aux_wakeup;
+ long aux_wakeup; /* last aux_watermark boundary crossed by aux_head */
unsigned long aux_pgoff;
int aux_nr_pages;
int aux_overwrite;
@@ -208,7 +208,7 @@ static inline int get_recursion_context(int *recursion)
{
int rctx;
- if (in_nmi())
+ if (unlikely(in_nmi()))
rctx = 3;
else if (in_irq())
rctx = 2;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index ee97196bb151..af71a84e12ee 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -367,7 +367,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1)))
goto err_put;
- aux_head = local_read(&rb->aux_head);
+ aux_head = rb->aux_head;
handle->rb = rb;
handle->event = event;
@@ -382,7 +382,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
*/
if (!rb->aux_overwrite) {
aux_tail = ACCESS_ONCE(rb->user_page->aux_tail);
- handle->wakeup = local_read(&rb->aux_wakeup) + rb->aux_watermark;
+ handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
if (aux_head - aux_tail < perf_aux_size(rb))
handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb));
@@ -433,12 +433,12 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE;
aux_head = handle->head;
- local_set(&rb->aux_head, aux_head);
+ rb->aux_head = aux_head;
} else {
handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE;
- aux_head = local_read(&rb->aux_head);
- local_add(size, &rb->aux_head);
+ aux_head = rb->aux_head;
+ rb->aux_head += size;
}
if (size || handle->aux_flags) {
@@ -450,11 +450,10 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
handle->aux_flags);
}
- aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
-
- if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) {
+ rb->user_page->aux_head = rb->aux_head;
+ if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
wakeup = true;
- local_add(rb->aux_watermark, &rb->aux_wakeup);
+ rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
}
if (wakeup) {
@@ -478,22 +477,20 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
{
struct ring_buffer *rb = handle->rb;
- unsigned long aux_head;
if (size > handle->size)
return -ENOSPC;
- local_add(size, &rb->aux_head);
+ rb->aux_head += size;
- aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
- if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) {
+ rb->user_page->aux_head = rb->aux_head;
+ if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
perf_output_wakeup(handle);
- local_add(rb->aux_watermark, &rb->aux_wakeup);
- handle->wakeup = local_read(&rb->aux_wakeup) +
- rb->aux_watermark;
+ rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
+ handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
}
- handle->head = aux_head;
+ handle->head = rb->aux_head;
handle->size -= size;
return 0;
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 0e137f98a50c..267f6ef91d97 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1262,8 +1262,6 @@ void uprobe_end_dup_mmap(void)
void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
{
- newmm->uprobes_state.xol_area = NULL;
-
if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) {
set_bit(MMF_HAS_UPROBES, &newmm->flags);
/* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */
diff --git a/kernel/exit.c b/kernel/exit.c
index c63226283aef..a35d8a17e01f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -51,7 +51,6 @@
#include <linux/task_io_accounting_ops.h>
#include <linux/tracehook.h>
#include <linux/fs_struct.h>
-#include <linux/userfaultfd_k.h>
#include <linux/init_task.h>
#include <linux/perf_event.h>
#include <trace/events/sched.h>
@@ -62,6 +61,7 @@
#include <linux/kcov.h>
#include <linux/random.h>
#include <linux/rcuwait.h>
+#include <linux/compat.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
@@ -764,7 +764,6 @@ void __noreturn do_exit(long code)
{
struct task_struct *tsk = current;
int group_dead;
- TASKS_RCU(int tasks_rcu_i);
profile_task_exit(tsk);
kcov_task_exit(tsk);
@@ -819,7 +818,8 @@ void __noreturn do_exit(long code)
* Ensure that we must observe the pi_state in exit_mm() ->
* mm_release() -> exit_pi_state_list().
*/
- raw_spin_unlock_wait(&tsk->pi_lock);
+ raw_spin_lock_irq(&tsk->pi_lock);
+ raw_spin_unlock_irq(&tsk->pi_lock);
if (unlikely(in_atomic())) {
pr_info("note: %s[%d] exited with preempt_count %d\n",
@@ -881,9 +881,7 @@ void __noreturn do_exit(long code)
*/
flush_ptrace_hw_breakpoint(tsk);
- TASKS_RCU(preempt_disable());
- TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu));
- TASKS_RCU(preempt_enable());
+ exit_tasks_rcu_start();
exit_notify(tsk, group_dead);
proc_exit_connector(tsk);
mpol_put_task_policy(tsk);
@@ -918,8 +916,9 @@ void __noreturn do_exit(long code)
if (tsk->nr_dirtied)
__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
exit_rcu();
- TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i));
+ exit_tasks_rcu_finish();
+ lockdep_free_task(tsk);
do_task_dead();
}
EXPORT_SYMBOL_GPL(do_exit);
@@ -982,14 +981,21 @@ SYSCALL_DEFINE1(exit_group, int, error_code)
return 0;
}
+struct waitid_info {
+ pid_t pid;
+ uid_t uid;
+ int status;
+ int cause;
+};
+
struct wait_opts {
enum pid_type wo_type;
int wo_flags;
struct pid *wo_pid;
- struct siginfo __user *wo_info;
- int __user *wo_stat;
- struct rusage __user *wo_rusage;
+ struct waitid_info *wo_info;
+ int wo_stat;
+ struct rusage *wo_rusage;
wait_queue_entry_t child_wait;
int notask_error;
@@ -1036,34 +1042,6 @@ eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p)
return 1;
}
-static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
- pid_t pid, uid_t uid, int why, int status)
-{
- struct siginfo __user *infop;
- int retval = wo->wo_rusage
- ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
-
- put_task_struct(p);
- infop = wo->wo_info;
- if (infop) {
- if (!retval)
- retval = put_user(SIGCHLD, &infop->si_signo);
- if (!retval)
- retval = put_user(0, &infop->si_errno);
- if (!retval)
- retval = put_user((short)why, &infop->si_code);
- if (!retval)
- retval = put_user(pid, &infop->si_pid);
- if (!retval)
- retval = put_user(uid, &infop->si_uid);
- if (!retval)
- retval = put_user(status, &infop->si_status);
- }
- if (!retval)
- retval = pid;
- return retval;
-}
-
/*
* Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold
* read_lock(&tasklist_lock) on entry. If we return zero, we still hold
@@ -1072,30 +1050,23 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
*/
static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
{
- int state, retval, status;
+ int state, status;
pid_t pid = task_pid_vnr(p);
uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
- struct siginfo __user *infop;
+ struct waitid_info *infop;
if (!likely(wo->wo_flags & WEXITED))
return 0;
if (unlikely(wo->wo_flags & WNOWAIT)) {
- int exit_code = p->exit_code;
- int why;
-
+ status = p->exit_code;
get_task_struct(p);
read_unlock(&tasklist_lock);
sched_annotate_sleep();
-
- if ((exit_code & 0x7f) == 0) {
- why = CLD_EXITED;
- status = exit_code >> 8;
- } else {
- why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
- status = exit_code & 0x7f;
- }
- return wait_noreap_copyout(wo, p, pid, uid, why, status);
+ if (wo->wo_rusage)
+ getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
+ put_task_struct(p);
+ goto out_info;
}
/*
* Move the task's state to DEAD/TRACE, only one thread can do this.
@@ -1168,38 +1139,11 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
spin_unlock_irq(&current->sighand->siglock);
}
- retval = wo->wo_rusage
- ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
+ if (wo->wo_rusage)
+ getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
status = (p->signal->flags & SIGNAL_GROUP_EXIT)
? p->signal->group_exit_code : p->exit_code;
- if (!retval && wo->wo_stat)
- retval = put_user(status, wo->wo_stat);
-
- infop = wo->wo_info;
- if (!retval && infop)
- retval = put_user(SIGCHLD, &infop->si_signo);
- if (!retval && infop)
- retval = put_user(0, &infop->si_errno);
- if (!retval && infop) {
- int why;
-
- if ((status & 0x7f) == 0) {
- why = CLD_EXITED;
- status >>= 8;
- } else {
- why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
- status &= 0x7f;
- }
- retval = put_user((short)why, &infop->si_code);
- if (!retval)
- retval = put_user(status, &infop->si_status);
- }
- if (!retval && infop)
- retval = put_user(pid, &infop->si_pid);
- if (!retval && infop)
- retval = put_user(uid, &infop->si_uid);
- if (!retval)
- retval = pid;
+ wo->wo_stat = status;
if (state == EXIT_TRACE) {
write_lock_irq(&tasklist_lock);
@@ -1216,7 +1160,21 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
if (state == EXIT_DEAD)
release_task(p);
- return retval;
+out_info:
+ infop = wo->wo_info;
+ if (infop) {
+ if ((status & 0x7f) == 0) {
+ infop->cause = CLD_EXITED;
+ infop->status = status >> 8;
+ } else {
+ infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
+ infop->status = status & 0x7f;
+ }
+ infop->pid = pid;
+ infop->uid = uid;
+ }
+
+ return pid;
}
static int *task_stopped_code(struct task_struct *p, bool ptrace)
@@ -1252,8 +1210,8 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace)
static int wait_task_stopped(struct wait_opts *wo,
int ptrace, struct task_struct *p)
{
- struct siginfo __user *infop;
- int retval, exit_code, *p_code, why;
+ struct waitid_info *infop;
+ int exit_code, *p_code, why;
uid_t uid = 0; /* unneeded, required by compiler */
pid_t pid;
@@ -1298,34 +1256,21 @@ unlock_sig:
why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
read_unlock(&tasklist_lock);
sched_annotate_sleep();
+ if (wo->wo_rusage)
+ getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
+ put_task_struct(p);
- if (unlikely(wo->wo_flags & WNOWAIT))
- return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);
-
- retval = wo->wo_rusage
- ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
- if (!retval && wo->wo_stat)
- retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat);
+ if (likely(!(wo->wo_flags & WNOWAIT)))
+ wo->wo_stat = (exit_code << 8) | 0x7f;
infop = wo->wo_info;
- if (!retval && infop)
- retval = put_user(SIGCHLD, &infop->si_signo);
- if (!retval && infop)
- retval = put_user(0, &infop->si_errno);
- if (!retval && infop)
- retval = put_user((short)why, &infop->si_code);
- if (!retval && infop)
- retval = put_user(exit_code, &infop->si_status);
- if (!retval && infop)
- retval = put_user(pid, &infop->si_pid);
- if (!retval && infop)
- retval = put_user(uid, &infop->si_uid);
- if (!retval)
- retval = pid;
- put_task_struct(p);
-
- BUG_ON(!retval);
- return retval;
+ if (infop) {
+ infop->cause = why;
+ infop->status = exit_code;
+ infop->pid = pid;
+ infop->uid = uid;
+ }
+ return pid;
}
/*
@@ -1336,7 +1281,7 @@ unlock_sig:
*/
static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
{
- int retval;
+ struct waitid_info *infop;
pid_t pid;
uid_t uid;
@@ -1361,22 +1306,20 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
get_task_struct(p);
read_unlock(&tasklist_lock);
sched_annotate_sleep();
+ if (wo->wo_rusage)
+ getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
+ put_task_struct(p);
- if (!wo->wo_info) {
- retval = wo->wo_rusage
- ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
- put_task_struct(p);
- if (!retval && wo->wo_stat)
- retval = put_user(0xffff, wo->wo_stat);
- if (!retval)
- retval = pid;
+ infop = wo->wo_info;
+ if (!infop) {
+ wo->wo_stat = 0xffff;
} else {
- retval = wait_noreap_copyout(wo, p, pid, uid,
- CLD_CONTINUED, SIGCONT);
- BUG_ON(retval == 0);
+ infop->cause = CLD_CONTINUED;
+ infop->pid = pid;
+ infop->uid = uid;
+ infop->status = SIGCONT;
}
-
- return retval;
+ return pid;
}
/*
@@ -1604,8 +1547,8 @@ end:
return retval;
}
-SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
- infop, int, options, struct rusage __user *, ru)
+static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
+ int options, struct rusage *ru)
{
struct wait_opts wo;
struct pid *pid = NULL;
@@ -1643,38 +1586,48 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
wo.wo_pid = pid;
wo.wo_flags = options;
wo.wo_info = infop;
- wo.wo_stat = NULL;
wo.wo_rusage = ru;
ret = do_wait(&wo);
- if (ret > 0) {
- ret = 0;
- } else if (infop) {
- /*
- * For a WNOHANG return, clear out all the fields
- * we would set so the user can easily tell the
- * difference.
- */
- if (!ret)
- ret = put_user(0, &infop->si_signo);
- if (!ret)
- ret = put_user(0, &infop->si_errno);
- if (!ret)
- ret = put_user(0, &infop->si_code);
- if (!ret)
- ret = put_user(0, &infop->si_pid);
- if (!ret)
- ret = put_user(0, &infop->si_uid);
- if (!ret)
- ret = put_user(0, &infop->si_status);
- }
-
put_pid(pid);
return ret;
}
-SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
- int, options, struct rusage __user *, ru)
+SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
+ infop, int, options, struct rusage __user *, ru)
+{
+ struct rusage r;
+ struct waitid_info info = {.status = 0};
+ long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
+ int signo = 0;
+ if (err > 0) {
+ signo = SIGCHLD;
+ err = 0;
+ }
+
+ if (!err) {
+ if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
+ return -EFAULT;
+ }
+ if (!infop)
+ return err;
+
+ user_access_begin();
+ unsafe_put_user(signo, &infop->si_signo, Efault);
+ unsafe_put_user(0, &infop->si_errno, Efault);
+ unsafe_put_user((short)info.cause, &infop->si_code, Efault);
+ unsafe_put_user(info.pid, &infop->si_pid, Efault);
+ unsafe_put_user(info.uid, &infop->si_uid, Efault);
+ unsafe_put_user(info.status, &infop->si_status, Efault);
+ user_access_end();
+ return err;
+Efault:
+ user_access_end();
+ return -EFAULT;
+}
+
+long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
+ struct rusage *ru)
{
struct wait_opts wo;
struct pid *pid = NULL;
@@ -1685,6 +1638,10 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
__WNOTHREAD|__WCLONE|__WALL))
return -EINVAL;
+ /* -INT_MIN is not defined */
+ if (upid == INT_MIN)
+ return -ESRCH;
+
if (upid == -1)
type = PIDTYPE_MAX;
else if (upid < 0) {
@@ -1702,14 +1659,29 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
wo.wo_pid = pid;
wo.wo_flags = options | WEXITED;
wo.wo_info = NULL;
- wo.wo_stat = stat_addr;
+ wo.wo_stat = 0;
wo.wo_rusage = ru;
ret = do_wait(&wo);
put_pid(pid);
+ if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr))
+ ret = -EFAULT;
return ret;
}
+SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
+ int, options, struct rusage __user *, ru)
+{
+ struct rusage r;
+ long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL);
+
+ if (err > 0) {
+ if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
+ return -EFAULT;
+ }
+ return err;
+}
+
#ifdef __ARCH_WANT_SYS_WAITPID
/*
@@ -1722,3 +1694,61 @@ SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
}
#endif
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(wait4,
+ compat_pid_t, pid,
+ compat_uint_t __user *, stat_addr,
+ int, options,
+ struct compat_rusage __user *, ru)
+{
+ struct rusage r;
+ long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL);
+ if (err > 0) {
+ if (ru && put_compat_rusage(&r, ru))
+ return -EFAULT;
+ }
+ return err;
+}
+
+COMPAT_SYSCALL_DEFINE5(waitid,
+ int, which, compat_pid_t, pid,
+ struct compat_siginfo __user *, infop, int, options,
+ struct compat_rusage __user *, uru)
+{
+ struct rusage ru;
+ struct waitid_info info = {.status = 0};
+ long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL);
+ int signo = 0;
+ if (err > 0) {
+ signo = SIGCHLD;
+ err = 0;
+ }
+
+ if (!err && uru) {
+ /* kernel_waitid() overwrites everything in ru */
+ if (COMPAT_USE_64BIT_TIME)
+ err = copy_to_user(uru, &ru, sizeof(ru));
+ else
+ err = put_compat_rusage(&ru, uru);
+ if (err)
+ return -EFAULT;
+ }
+
+ if (!infop)
+ return err;
+
+ user_access_begin();
+ unsafe_put_user(signo, &infop->si_signo, Efault);
+ unsafe_put_user(0, &infop->si_errno, Efault);
+ unsafe_put_user((short)info.cause, &infop->si_code, Efault);
+ unsafe_put_user(info.pid, &infop->si_pid, Efault);
+ unsafe_put_user(info.uid, &infop->si_uid, Efault);
+ unsafe_put_user(info.status, &infop->si_status, Efault);
+ user_access_end();
+ return err;
+Efault:
+ user_access_end();
+ return -EFAULT;
+}
+#endif
diff --git a/kernel/extable.c b/kernel/extable.c
index 0fbdd8582f08..38c2412401a1 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -55,7 +55,8 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
{
const struct exception_table_entry *e;
- e = search_extable(__start___ex_table, __stop___ex_table-1, addr);
+ e = search_extable(__start___ex_table,
+ __stop___ex_table - __start___ex_table, addr);
if (!e)
e = search_module_extables(addr);
return e;
@@ -69,7 +70,7 @@ static inline int init_kernel_text(unsigned long addr)
return 0;
}
-int core_kernel_text(unsigned long addr)
+int notrace core_kernel_text(unsigned long addr)
{
if (addr >= (unsigned long)_stext &&
addr < (unsigned long)_etext)
diff --git a/kernel/fork.c b/kernel/fork.c
index e53770d2bf95..24a4c0be80d5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -88,6 +88,7 @@
#include <linux/sysctl.h>
#include <linux/kcov.h>
#include <linux/livepatch.h>
+#include <linux/thread_info.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -205,21 +206,19 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
void *stack;
int i;
- local_irq_disable();
for (i = 0; i < NR_CACHED_STACKS; i++) {
- struct vm_struct *s = this_cpu_read(cached_stacks[i]);
+ struct vm_struct *s;
+
+ s = this_cpu_xchg(cached_stacks[i], NULL);
if (!s)
continue;
- this_cpu_write(cached_stacks[i], NULL);
tsk->stack_vm_area = s;
- local_irq_enable();
return s->addr;
}
- local_irq_enable();
- stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
+ stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
VMALLOC_START, VMALLOC_END,
THREADINFO_GFP,
PAGE_KERNEL,
@@ -245,19 +244,15 @@ static inline void free_thread_stack(struct task_struct *tsk)
{
#ifdef CONFIG_VMAP_STACK
if (task_stack_vm_area(tsk)) {
- unsigned long flags;
int i;
- local_irq_save(flags);
for (i = 0; i < NR_CACHED_STACKS; i++) {
- if (this_cpu_read(cached_stacks[i]))
+ if (this_cpu_cmpxchg(cached_stacks[i],
+ NULL, tsk->stack_vm_area) != NULL)
continue;
- this_cpu_write(cached_stacks[i], tsk->stack_vm_area);
- local_irq_restore(flags);
return;
}
- local_irq_restore(flags);
vfree_atomic(tsk->stack);
return;
@@ -326,8 +321,8 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
}
/* All stack pages belong to the same memcg. */
- memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB,
- account * (THREAD_SIZE / 1024));
+ mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
+ account * (THREAD_SIZE / 1024));
} else {
/*
* All stack pages are in the same zone and belong to the
@@ -338,8 +333,8 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
THREAD_SIZE / 1024 * account);
- memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB,
- account * (THREAD_SIZE / 1024));
+ mod_memcg_page_state(first_page, MEMCG_KERNEL_STACK_KB,
+ account * (THREAD_SIZE / 1024));
}
}
@@ -490,6 +485,8 @@ void __init fork_init(void)
cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
NULL, free_vm_stack_cache);
#endif
+
+ lockdep_init_task(&init_task);
}
int __weak arch_dup_task_struct(struct task_struct *dst,
@@ -560,7 +557,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
set_task_stack_end_magic(tsk);
#ifdef CONFIG_CC_STACKPROTECTOR
- tsk->stack_canary = get_random_long();
+ tsk->stack_canary = get_random_canary();
#endif
/*
@@ -579,6 +576,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
kcov_task_init(tsk);
+#ifdef CONFIG_FAULT_INJECTION
+ tsk->fail_nth = 0;
+#endif
+
return tsk;
free_stack:
@@ -656,7 +657,12 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
retval = dup_userfaultfd(tmp, &uf);
if (retval)
goto fail_nomem_anon_vma_fork;
- if (anon_vma_fork(tmp, mpnt))
+ if (tmp->vm_flags & VM_WIPEONFORK) {
+ /* VM_WIPEONFORK gets a clean slate in the child. */
+ tmp->anon_vma = NULL;
+ if (anon_vma_prepare(tmp))
+ goto fail_nomem_anon_vma_fork;
+ } else if (anon_vma_fork(tmp, mpnt))
goto fail_nomem_anon_vma_fork;
tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
tmp->vm_next = tmp->vm_prev = NULL;
@@ -700,7 +706,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
rb_parent = &tmp->vm_rb;
mm->map_count++;
- retval = copy_page_range(mm, oldmm, mpnt);
+ if (!(tmp->vm_flags & VM_WIPEONFORK))
+ retval = copy_page_range(mm, oldmm, mpnt);
if (tmp->vm_ops && tmp->vm_ops->open)
tmp->vm_ops->open(tmp);
@@ -787,6 +794,13 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
#endif
}
+static void mm_init_uprobes_state(struct mm_struct *mm)
+{
+#ifdef CONFIG_UPROBES
+ mm->uprobes_state.xol_area = NULL;
+#endif
+}
+
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
struct user_namespace *user_ns)
{
@@ -808,11 +822,13 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm_init_cpumask(mm);
mm_init_aio(mm);
mm_init_owner(mm, p);
+ RCU_INIT_POINTER(mm->exe_file, NULL);
mmu_notifier_mm_init(mm);
- clear_tlb_flush_pending(mm);
+ init_tlb_flush_pending(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
mm->pmd_huge_pte = NULL;
#endif
+ mm_init_uprobes_state(mm);
if (current->mm) {
mm->flags = current->mm->flags & MMF_INIT_MASK;
@@ -912,7 +928,6 @@ static inline void __mmput(struct mm_struct *mm)
}
if (mm->binfmt)
module_put(mm->binfmt->module);
- set_bit(MMF_OOM_SKIP, &mm->flags);
mmdrop(mm);
}
@@ -928,22 +943,6 @@ void mmput(struct mm_struct *mm)
}
EXPORT_SYMBOL_GPL(mmput);
-#ifdef CONFIG_MMU
-static void mmput_async_fn(struct work_struct *work)
-{
- struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
- __mmput(mm);
-}
-
-void mmput_async(struct mm_struct *mm)
-{
- if (atomic_dec_and_test(&mm->mm_users)) {
- INIT_WORK(&mm->async_put_work, mmput_async_fn);
- schedule_work(&mm->async_put_work);
- }
-}
-#endif
-
/**
* set_mm_exe_file - change a reference to the mm's executable file
*
@@ -1637,9 +1636,9 @@ static __latent_entropy struct task_struct *copy_process(
prev_cputime_init(&p->prev_cputime);
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
- seqcount_init(&p->vtime_seqcount);
- p->vtime_snap = 0;
- p->vtime_snap_whence = VTIME_INACTIVE;
+ seqcount_init(&p->vtime.seqcount);
+ p->vtime.starttime = 0;
+ p->vtime.state = VTIME_INACTIVE;
#endif
#if defined(SPLIT_RSS_COUNTING)
@@ -1693,6 +1692,7 @@ static __latent_entropy struct task_struct *copy_process(
p->lockdep_depth = 0; /* no locks held yet */
p->curr_chain_key = 0;
p->lockdep_recursion = 0;
+ lockdep_init_task(p);
#endif
#ifdef CONFIG_DEBUG_MUTEXES
@@ -1951,6 +1951,7 @@ bad_fork_cleanup_audit:
bad_fork_cleanup_perf:
perf_event_free_task(p);
bad_fork_cleanup_policy:
+ lockdep_free_task(p);
#ifdef CONFIG_NUMA
mpol_put(p->mempolicy);
bad_fork_cleanup_threadgroup_lock:
diff --git a/kernel/futex.c b/kernel/futex.c
index c934689043b2..3d38eaf05492 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -212,7 +212,7 @@ struct futex_pi_state {
atomic_t refcount;
union futex_key key;
-};
+} __randomize_layout;
/**
* struct futex_q - The hashed futex queue entry, one per waiting task
@@ -246,7 +246,7 @@ struct futex_q {
struct rt_mutex_waiter *rt_waiter;
union futex_key *requeue_pi_key;
u32 bitset;
-};
+} __randomize_layout;
static const struct futex_q futex_q_init = {
/* list gets initialized in queue_me()*/
@@ -670,13 +670,14 @@ again:
* this reference was taken by ihold under the page lock
* pinning the inode in place so i_lock was unnecessary. The
* only way for this check to fail is if the inode was
- * truncated in parallel so warn for now if this happens.
+ * truncated in parallel which is almost certainly an
+ * application bug. In such a case, just retry.
*
* We are not calling into get_futex_key_refs() in file-backed
* cases, therefore a successful atomic_inc return below will
* guarantee that get_futex_key() will still imply smp_mb(); (B).
*/
- if (WARN_ON_ONCE(!atomic_inc_not_zero(&inode->i_count))) {
+ if (!atomic_inc_not_zero(&inode->i_count)) {
rcu_read_unlock();
put_page(page);
@@ -875,6 +876,8 @@ static struct task_struct *futex_find_get_task(pid_t pid)
return p;
}
+#ifdef CONFIG_FUTEX_PI
+
/*
* This task is holding PI mutexes at exit time => bad.
* Kernel cleans up PI-state, but userspace is likely hosed.
@@ -932,6 +935,8 @@ void exit_pi_state_list(struct task_struct *curr)
raw_spin_unlock_irq(&curr->pi_lock);
}
+#endif
+
/*
* We need to check the following states:
*
@@ -1546,6 +1551,45 @@ out:
return ret;
}
+static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
+{
+ unsigned int op = (encoded_op & 0x70000000) >> 28;
+ unsigned int cmp = (encoded_op & 0x0f000000) >> 24;
+ int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 12);
+ int cmparg = sign_extend32(encoded_op & 0x00000fff, 12);
+ int oldval, ret;
+
+ if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
+ if (oparg < 0 || oparg > 31)
+ return -EINVAL;
+ oparg = 1 << oparg;
+ }
+
+ if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
+ return -EFAULT;
+
+ ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
+ if (ret)
+ return ret;
+
+ switch (cmp) {
+ case FUTEX_OP_CMP_EQ:
+ return oldval == cmparg;
+ case FUTEX_OP_CMP_NE:
+ return oldval != cmparg;
+ case FUTEX_OP_CMP_LT:
+ return oldval < cmparg;
+ case FUTEX_OP_CMP_GE:
+ return oldval >= cmparg;
+ case FUTEX_OP_CMP_LE:
+ return oldval <= cmparg;
+ case FUTEX_OP_CMP_GT:
+ return oldval > cmparg;
+ default:
+ return -ENOSYS;
+ }
+}
+
/*
* Wake up all waiters hashed on the physical page that is mapped
* to this virtual address:
@@ -1799,6 +1843,15 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
struct futex_q *this, *next;
DEFINE_WAKE_Q(wake_q);
+ /*
+ * When PI not supported: return -ENOSYS if requeue_pi is true,
+ * consequently the compiler knows requeue_pi is always false past
+ * this point which will optimize away all the conditional code
+ * further down.
+ */
+ if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi)
+ return -ENOSYS;
+
if (requeue_pi) {
/*
* Requeue PI only works on two distinct uaddrs. This
@@ -2594,6 +2647,9 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
struct futex_q q = futex_q_init;
int res, ret;
+ if (!IS_ENABLED(CONFIG_FUTEX_PI))
+ return -ENOSYS;
+
if (refill_pi_state_cache())
return -ENOMEM;
@@ -2773,6 +2829,9 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
struct futex_q *top_waiter;
int ret;
+ if (!IS_ENABLED(CONFIG_FUTEX_PI))
+ return -ENOSYS;
+
retry:
if (get_user(uval, uaddr))
return -EFAULT;
@@ -2983,6 +3042,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
struct futex_q q = futex_q_init;
int res, ret;
+ if (!IS_ENABLED(CONFIG_FUTEX_PI))
+ return -ENOSYS;
+
if (uaddr == uaddr2)
return -EINVAL;
diff --git a/kernel/groups.c b/kernel/groups.c
index d09727692a2a..434f6665f187 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -5,6 +5,7 @@
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/security.h>
+#include <linux/sort.h>
#include <linux/syscalls.h>
#include <linux/user_namespace.h>
#include <linux/vmalloc.h>
@@ -76,32 +77,18 @@ static int groups_from_user(struct group_info *group_info,
return 0;
}
-/* a simple Shell sort */
+static int gid_cmp(const void *_a, const void *_b)
+{
+ kgid_t a = *(kgid_t *)_a;
+ kgid_t b = *(kgid_t *)_b;
+
+ return gid_gt(a, b) - gid_lt(a, b);
+}
+
static void groups_sort(struct group_info *group_info)
{
- int base, max, stride;
- int gidsetsize = group_info->ngroups;
-
- for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
- ; /* nothing */
- stride /= 3;
-
- while (stride) {
- max = gidsetsize - stride;
- for (base = 0; base < max; base++) {
- int left = base;
- int right = left + stride;
- kgid_t tmp = group_info->gid[right];
-
- while (left >= 0 && gid_gt(group_info->gid[left], tmp)) {
- group_info->gid[right] = group_info->gid[left];
- right = left;
- left -= stride;
- }
- group_info->gid[right] = tmp;
- }
- stride /= 3;
- }
+ sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid),
+ gid_cmp, NULL);
}
/* a simple bsearch */
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 27c4e774071c..a117adf7084b 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -63,11 +63,20 @@ config GENERIC_IRQ_CHIP
config IRQ_DOMAIN
bool
+# Support for simulated interrupts
+config IRQ_SIM
+ bool
+ select IRQ_WORK
+
# Support for hierarchical irq domains
config IRQ_DOMAIN_HIERARCHY
bool
select IRQ_DOMAIN
+# Support for hierarchical fasteoi+edge and fasteoi+level handlers
+config IRQ_FASTEOI_HIERARCHY_HANDLERS
+ bool
+
# Generic IRQ IPI support
config GENERIC_IRQ_IPI
bool
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index e4aef7351f2b..1970cafe8f2a 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -4,6 +4,7 @@ obj-$(CONFIG_IRQ_TIMINGS) += timings.o
obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o
obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o
+obj-$(CONFIG_IRQ_SIM) += irq_sim.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index d2747f9c5707..d69bd77252a7 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -110,6 +110,13 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
struct cpumask *masks;
cpumask_var_t nmsk, *node_to_present_cpumask;
+ /*
+ * If there aren't any vectors left after applying the pre/post
+ * vectors don't bother with assigning affinity.
+ */
+ if (!affv)
+ return NULL;
+
if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
return NULL;
@@ -192,15 +199,19 @@ out:
/**
* irq_calc_affinity_vectors - Calculate the optimal number of vectors
+ * @minvec: The minimum number of vectors available
* @maxvec: The maximum number of vectors available
* @affd: Description of the affinity requirements
*/
-int irq_calc_affinity_vectors(int maxvec, const struct irq_affinity *affd)
+int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd)
{
int resv = affd->pre_vectors + affd->post_vectors;
int vecs = maxvec - resv;
int ret;
+ if (resv > minvec)
+ return 0;
+
get_online_cpus();
ret = min_t(int, cpumask_weight(cpu_present_mask), vecs) + resv;
put_online_cpus();
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index ad43468e89f0..f51b7b6d2451 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -170,21 +170,11 @@ static void irq_state_clr_disabled(struct irq_desc *desc)
irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED);
}
-static void irq_state_set_disabled(struct irq_desc *desc)
-{
- irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
-}
-
static void irq_state_clr_masked(struct irq_desc *desc)
{
irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED);
}
-static void irq_state_set_masked(struct irq_desc *desc)
-{
- irqd_set(&desc->irq_data, IRQD_IRQ_MASKED);
-}
-
static void irq_state_clr_started(struct irq_desc *desc)
{
irqd_clear(&desc->irq_data, IRQD_IRQ_STARTED);
@@ -234,7 +224,7 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
return IRQ_STARTUP_MANAGED;
}
#else
-static int
+static __always_inline int
__irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
{
return IRQ_STARTUP_NORMAL;
@@ -1010,7 +1000,7 @@ EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name);
void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
{
- unsigned long flags;
+ unsigned long flags, trigger, tmp;
struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
if (!desc)
@@ -1024,6 +1014,8 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
irq_settings_clr_and_set(desc, clr, set);
+ trigger = irqd_get_trigger_type(&desc->irq_data);
+
irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU |
IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT);
if (irq_settings_has_no_balance_set(desc))
@@ -1035,7 +1027,11 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
if (irq_settings_is_level(desc))
irqd_set(&desc->irq_data, IRQD_LEVEL);
- irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc));
+ tmp = irq_settings_get_trigger_mask(desc);
+ if (tmp != IRQ_TYPE_NONE)
+ trigger = tmp;
+
+ irqd_set(&desc->irq_data, trigger);
irq_put_desc_unlock(desc, flags);
}
@@ -1102,6 +1098,112 @@ void irq_cpu_offline(void)
}
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+
+#ifdef CONFIG_IRQ_FASTEOI_HIERARCHY_HANDLERS
+/**
+ * handle_fasteoi_ack_irq - irq handler for edge hierarchy
+ * stacked on transparent controllers
+ *
+ * @desc: the interrupt description structure for this irq
+ *
+ * Like handle_fasteoi_irq(), but for use with hierarchy where
+ * the irq_chip also needs to have its ->irq_ack() function
+ * called.
+ */
+void handle_fasteoi_ack_irq(struct irq_desc *desc)
+{
+ struct irq_chip *chip = desc->irq_data.chip;
+
+ raw_spin_lock(&desc->lock);
+
+ if (!irq_may_run(desc))
+ goto out;
+
+ desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+
+ /*
+ * If its disabled or no action available
+ * then mask it and get out of here:
+ */
+ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
+ desc->istate |= IRQS_PENDING;
+ mask_irq(desc);
+ goto out;
+ }
+
+ kstat_incr_irqs_this_cpu(desc);
+ if (desc->istate & IRQS_ONESHOT)
+ mask_irq(desc);
+
+ /* Start handling the irq */
+ desc->irq_data.chip->irq_ack(&desc->irq_data);
+
+ preflow_handler(desc);
+ handle_irq_event(desc);
+
+ cond_unmask_eoi_irq(desc, chip);
+
+ raw_spin_unlock(&desc->lock);
+ return;
+out:
+ if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED))
+ chip->irq_eoi(&desc->irq_data);
+ raw_spin_unlock(&desc->lock);
+}
+EXPORT_SYMBOL_GPL(handle_fasteoi_ack_irq);
+
+/**
+ * handle_fasteoi_mask_irq - irq handler for level hierarchy
+ * stacked on transparent controllers
+ *
+ * @desc: the interrupt description structure for this irq
+ *
+ * Like handle_fasteoi_irq(), but for use with hierarchy where
+ * the irq_chip also needs to have its ->irq_mask_ack() function
+ * called.
+ */
+void handle_fasteoi_mask_irq(struct irq_desc *desc)
+{
+ struct irq_chip *chip = desc->irq_data.chip;
+
+ raw_spin_lock(&desc->lock);
+ mask_ack_irq(desc);
+
+ if (!irq_may_run(desc))
+ goto out;
+
+ desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+
+ /*
+ * If its disabled or no action available
+ * then mask it and get out of here:
+ */
+ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
+ desc->istate |= IRQS_PENDING;
+ mask_irq(desc);
+ goto out;
+ }
+
+ kstat_incr_irqs_this_cpu(desc);
+ if (desc->istate & IRQS_ONESHOT)
+ mask_irq(desc);
+
+ preflow_handler(desc);
+ handle_irq_event(desc);
+
+ cond_unmask_eoi_irq(desc, chip);
+
+ raw_spin_unlock(&desc->lock);
+ return;
+out:
+ if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED))
+ chip->irq_eoi(&desc->irq_data);
+ raw_spin_unlock(&desc->lock);
+}
+EXPORT_SYMBOL_GPL(handle_fasteoi_mask_irq);
+
+#endif /* CONFIG_IRQ_FASTEOI_HIERARCHY_HANDLERS */
+
/**
* irq_chip_enable_parent - Enable the parent interrupt (defaults to unmask if
* NULL)
@@ -1115,6 +1217,7 @@ void irq_chip_enable_parent(struct irq_data *data)
else
data->chip->irq_unmask(data);
}
+EXPORT_SYMBOL_GPL(irq_chip_enable_parent);
/**
* irq_chip_disable_parent - Disable the parent interrupt (defaults to mask if
@@ -1129,6 +1232,7 @@ void irq_chip_disable_parent(struct irq_data *data)
else
data->chip->irq_mask(data);
}
+EXPORT_SYMBOL_GPL(irq_chip_disable_parent);
/**
* irq_chip_ack_parent - Acknowledge the parent interrupt
@@ -1191,6 +1295,7 @@ int irq_chip_set_affinity_parent(struct irq_data *data,
return -ENOSYS;
}
+EXPORT_SYMBOL_GPL(irq_chip_set_affinity_parent);
/**
* irq_chip_set_type_parent - Set IRQ type on the parent interrupt
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index aee8f7ec40af..638eb9c83d9f 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -95,8 +95,13 @@ static bool migrate_one_irq(struct irq_desc *desc)
affinity = cpu_online_mask;
brokeaff = true;
}
-
- err = irq_do_set_affinity(d, affinity, true);
+ /*
+ * Do not set the force argument of irq_do_set_affinity() as this
+ * disables the masking of offline CPUs from the supplied affinity
+ * mask and therefore might keep/reassign the irq to the outgoing
+ * CPU.
+ */
+ err = irq_do_set_affinity(d, affinity, false);
if (err) {
pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n",
d->irq, err);
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index 4d384edc0c64..c3fdb36dec30 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -5,6 +5,7 @@
*/
#include <linux/irqdomain.h>
#include <linux/irq.h>
+#include <linux/uaccess.h>
#include "internals.h"
@@ -171,8 +172,55 @@ static int irq_debug_open(struct inode *inode, struct file *file)
return single_open(file, irq_debug_show, inode->i_private);
}
+static ssize_t irq_debug_write(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct irq_desc *desc = file_inode(file)->i_private;
+ char buf[8] = { 0, };
+ size_t size;
+
+ size = min(sizeof(buf) - 1, count);
+ if (copy_from_user(buf, user_buf, size))
+ return -EFAULT;
+
+ if (!strncmp(buf, "trigger", size)) {
+ unsigned long flags;
+ int err;
+
+ /* Try the HW interface first */
+ err = irq_set_irqchip_state(irq_desc_get_irq(desc),
+ IRQCHIP_STATE_PENDING, true);
+ if (!err)
+ return count;
+
+ /*
+ * Otherwise, try to inject via the resend interface,
+ * which may or may not succeed.
+ */
+ chip_bus_lock(desc);
+ raw_spin_lock_irqsave(&desc->lock, flags);
+
+ if (irq_settings_is_level(desc)) {
+ /* Can't do level, sorry */
+ err = -EINVAL;
+ } else {
+ desc->istate |= IRQS_PENDING;
+ check_irq_resend(desc);
+ err = 0;
+ }
+
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+ chip_bus_sync_unlock(desc);
+
+ return err ? err : count;
+ }
+
+ return count;
+}
+
static const struct file_operations dfs_irq_ops = {
.open = irq_debug_open,
+ .write = irq_debug_write,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
@@ -186,7 +234,7 @@ void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc)
return;
sprintf(name, "%d", irq);
- desc->debugfs_file = debugfs_create_file(name, 0444, irq_dir, desc,
+ desc->debugfs_file = debugfs_create_file(name, 0644, irq_dir, desc,
&dfs_irq_ops);
}
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 9da14d125df4..a4aa39009f0d 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -151,7 +151,7 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc)
#define IRQ_GET_DESC_CHECK_PERCPU (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU)
#define for_each_action_of_desc(desc, act) \
- for (act = desc->act; act; act = act->next)
+ for (act = desc->action; act; act = act->next)
struct irq_desc *
__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
@@ -227,6 +227,16 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask)
return __irqd_to_state(d) & mask;
}
+static inline void irq_state_set_disabled(struct irq_desc *desc)
+{
+ irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
+}
+
+static inline void irq_state_set_masked(struct irq_desc *desc)
+{
+ irqd_set(&desc->irq_data, IRQD_IRQ_MASKED);
+}
+
#undef __irqd_to_state
static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc)
@@ -437,7 +447,9 @@ static inline void irq_remove_debugfs_entry(struct irq_desc *desc)
# ifdef CONFIG_IRQ_DOMAIN
void irq_domain_debugfs_init(struct dentry *root);
# else
-static inline void irq_domain_debugfs_init(struct dentry *root);
+static inline void irq_domain_debugfs_init(struct dentry *root)
+{
+}
# endif
#else /* CONFIG_GENERIC_IRQ_DEBUGFS */
static inline void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *d)
diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
index 1a9abc1c8ea0..259a22aa9934 100644
--- a/kernel/irq/ipi.c
+++ b/kernel/irq/ipi.c
@@ -165,7 +165,7 @@ irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu)
struct irq_data *data = irq_get_irq_data(irq);
struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL;
- if (!data || !ipimask || cpu > nr_cpu_ids)
+ if (!data || !ipimask || cpu >= nr_cpu_ids)
return INVALID_HWIRQ;
if (!cpumask_test_cpu(cpu, ipimask))
@@ -195,7 +195,7 @@ static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data,
if (!chip->ipi_send_single && !chip->ipi_send_mask)
return -EINVAL;
- if (cpu > nr_cpu_ids)
+ if (cpu >= nr_cpu_ids)
return -EINVAL;
if (dest) {
diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c
new file mode 100644
index 000000000000..24caabf1a0f7
--- /dev/null
+++ b/kernel/irq/irq_sim.c
@@ -0,0 +1,164 @@
+/*
+ * Copyright (C) 2017 Bartosz Golaszewski <brgl@bgdev.pl>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ */
+
+#include <linux/irq_sim.h>
+#include <linux/irq.h>
+
+struct irq_sim_devres {
+ struct irq_sim *sim;
+};
+
+static void irq_sim_irqmask(struct irq_data *data)
+{
+ struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data);
+
+ irq_ctx->enabled = false;
+}
+
+static void irq_sim_irqunmask(struct irq_data *data)
+{
+ struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data);
+
+ irq_ctx->enabled = true;
+}
+
+static struct irq_chip irq_sim_irqchip = {
+ .name = "irq_sim",
+ .irq_mask = irq_sim_irqmask,
+ .irq_unmask = irq_sim_irqunmask,
+};
+
+static void irq_sim_handle_irq(struct irq_work *work)
+{
+ struct irq_sim_work_ctx *work_ctx;
+
+ work_ctx = container_of(work, struct irq_sim_work_ctx, work);
+ handle_simple_irq(irq_to_desc(work_ctx->irq));
+}
+
+/**
+ * irq_sim_init - Initialize the interrupt simulator: allocate a range of
+ * dummy interrupts.
+ *
+ * @sim: The interrupt simulator object to initialize.
+ * @num_irqs: Number of interrupts to allocate
+ *
+ * Returns 0 on success and a negative error number on failure.
+ */
+int irq_sim_init(struct irq_sim *sim, unsigned int num_irqs)
+{
+ int i;
+
+ sim->irqs = kmalloc_array(num_irqs, sizeof(*sim->irqs), GFP_KERNEL);
+ if (!sim->irqs)
+ return -ENOMEM;
+
+ sim->irq_base = irq_alloc_descs(-1, 0, num_irqs, 0);
+ if (sim->irq_base < 0) {
+ kfree(sim->irqs);
+ return sim->irq_base;
+ }
+
+ for (i = 0; i < num_irqs; i++) {
+ sim->irqs[i].irqnum = sim->irq_base + i;
+ sim->irqs[i].enabled = false;
+ irq_set_chip(sim->irq_base + i, &irq_sim_irqchip);
+ irq_set_chip_data(sim->irq_base + i, &sim->irqs[i]);
+ irq_set_handler(sim->irq_base + i, &handle_simple_irq);
+ irq_modify_status(sim->irq_base + i,
+ IRQ_NOREQUEST | IRQ_NOAUTOEN, IRQ_NOPROBE);
+ }
+
+ init_irq_work(&sim->work_ctx.work, irq_sim_handle_irq);
+ sim->irq_count = num_irqs;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_sim_init);
+
+/**
+ * irq_sim_fini - Deinitialize the interrupt simulator: free the interrupt
+ * descriptors and allocated memory.
+ *
+ * @sim: The interrupt simulator to tear down.
+ */
+void irq_sim_fini(struct irq_sim *sim)
+{
+ irq_work_sync(&sim->work_ctx.work);
+ irq_free_descs(sim->irq_base, sim->irq_count);
+ kfree(sim->irqs);
+}
+EXPORT_SYMBOL_GPL(irq_sim_fini);
+
+static void devm_irq_sim_release(struct device *dev, void *res)
+{
+ struct irq_sim_devres *this = res;
+
+ irq_sim_fini(this->sim);
+}
+
+/**
+ * irq_sim_init - Initialize the interrupt simulator for a managed device.
+ *
+ * @dev: Device to initialize the simulator object for.
+ * @sim: The interrupt simulator object to initialize.
+ * @num_irqs: Number of interrupts to allocate
+ *
+ * Returns 0 on success and a negative error number on failure.
+ */
+int devm_irq_sim_init(struct device *dev, struct irq_sim *sim,
+ unsigned int num_irqs)
+{
+ struct irq_sim_devres *dr;
+ int rv;
+
+ dr = devres_alloc(devm_irq_sim_release, sizeof(*dr), GFP_KERNEL);
+ if (!dr)
+ return -ENOMEM;
+
+ rv = irq_sim_init(sim, num_irqs);
+ if (rv) {
+ devres_free(dr);
+ return rv;
+ }
+
+ dr->sim = sim;
+ devres_add(dev, dr);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devm_irq_sim_init);
+
+/**
+ * irq_sim_fire - Enqueue an interrupt.
+ *
+ * @sim: The interrupt simulator object.
+ * @offset: Offset of the simulated interrupt which should be fired.
+ */
+void irq_sim_fire(struct irq_sim *sim, unsigned int offset)
+{
+ if (sim->irqs[offset].enabled) {
+ sim->work_ctx.irq = irq_sim_irqnum(sim, offset);
+ irq_work_queue(&sim->work_ctx.work);
+ }
+}
+EXPORT_SYMBOL_GPL(irq_sim_fire);
+
+/**
+ * irq_sim_irqnum - Get the allocated number of a dummy interrupt.
+ *
+ * @sim: The interrupt simulator object.
+ * @offset: Offset of the simulated interrupt for which to retrieve
+ * the number.
+ */
+int irq_sim_irqnum(struct irq_sim *sim, unsigned int offset)
+{
+ return sim->irqs[offset].irqnum;
+}
+EXPORT_SYMBOL_GPL(irq_sim_irqnum);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 8bbd06405e60..73be2b3909bd 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -373,6 +373,7 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags,
raw_spin_lock_init(&desc->lock);
lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+ mutex_init(&desc->request_mutex);
init_rcu_head(&desc->rcu);
desc_set_defaults(irq, desc, node, affinity, owner);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 14fe862aa2e3..e84b7056bb08 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1,5 +1,6 @@
#define pr_fmt(fmt) "irq: " fmt
+#include <linux/acpi.h>
#include <linux/debugfs.h>
#include <linux/hardirq.h>
#include <linux/interrupt.h>
@@ -40,6 +41,9 @@ static inline void debugfs_add_domain_dir(struct irq_domain *d) { }
static inline void debugfs_remove_domain_dir(struct irq_domain *d) { }
#endif
+const struct fwnode_operations irqchip_fwnode_ops;
+EXPORT_SYMBOL_GPL(irqchip_fwnode_ops);
+
/**
* irq_domain_alloc_fwnode - Allocate a fwnode_handle suitable for
* identifying an irq domain
@@ -85,7 +89,7 @@ struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id,
fwid->type = type;
fwid->name = n;
fwid->data = data;
- fwid->fwnode.type = FWNODE_IRQCHIP;
+ fwid->fwnode.ops = &irqchip_fwnode_ops;
return &fwid->fwnode;
}
EXPORT_SYMBOL_GPL(__irq_domain_alloc_fwnode);
@@ -155,6 +159,21 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
domain->name = fwid->name;
break;
}
+#ifdef CONFIG_ACPI
+ } else if (is_acpi_device_node(fwnode)) {
+ struct acpi_buffer buf = {
+ .length = ACPI_ALLOCATE_BUFFER,
+ };
+ acpi_handle handle;
+
+ handle = acpi_device_handle(to_acpi_device_node(fwnode));
+ if (acpi_get_name(handle, ACPI_FULL_PATHNAME, &buf) == AE_OK) {
+ domain->name = buf.pointer;
+ domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
+ }
+
+ domain->fwnode = fwnode;
+#endif
} else if (of_node) {
char *name;
@@ -177,10 +196,8 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
}
if (!domain->name) {
- if (fwnode) {
- pr_err("Invalid fwnode type (%d) for irqdomain\n",
- fwnode->type);
- }
+ if (fwnode)
+ pr_err("Invalid fwnode type for irqdomain\n");
domain->name = kasprintf(GFP_KERNEL, "unknown-%d",
atomic_inc_return(&unknown_domains));
if (!domain->name) {
@@ -439,6 +456,31 @@ void irq_set_default_host(struct irq_domain *domain)
}
EXPORT_SYMBOL_GPL(irq_set_default_host);
+static void irq_domain_clear_mapping(struct irq_domain *domain,
+ irq_hw_number_t hwirq)
+{
+ if (hwirq < domain->revmap_size) {
+ domain->linear_revmap[hwirq] = 0;
+ } else {
+ mutex_lock(&revmap_trees_mutex);
+ radix_tree_delete(&domain->revmap_tree, hwirq);
+ mutex_unlock(&revmap_trees_mutex);
+ }
+}
+
+static void irq_domain_set_mapping(struct irq_domain *domain,
+ irq_hw_number_t hwirq,
+ struct irq_data *irq_data)
+{
+ if (hwirq < domain->revmap_size) {
+ domain->linear_revmap[hwirq] = irq_data->irq;
+ } else {
+ mutex_lock(&revmap_trees_mutex);
+ radix_tree_insert(&domain->revmap_tree, hwirq, irq_data);
+ mutex_unlock(&revmap_trees_mutex);
+ }
+}
+
void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
{
struct irq_data *irq_data = irq_get_irq_data(irq);
@@ -467,13 +509,7 @@ void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
domain->mapcount--;
/* Clear reverse map for this hwirq */
- if (hwirq < domain->revmap_size) {
- domain->linear_revmap[hwirq] = 0;
- } else {
- mutex_lock(&revmap_trees_mutex);
- radix_tree_delete(&domain->revmap_tree, hwirq);
- mutex_unlock(&revmap_trees_mutex);
- }
+ irq_domain_clear_mapping(domain, hwirq);
}
int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
@@ -517,13 +553,7 @@ int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
}
domain->mapcount++;
- if (hwirq < domain->revmap_size) {
- domain->linear_revmap[hwirq] = virq;
- } else {
- mutex_lock(&revmap_trees_mutex);
- radix_tree_insert(&domain->revmap_tree, hwirq, irq_data);
- mutex_unlock(&revmap_trees_mutex);
- }
+ irq_domain_set_mapping(domain, hwirq, irq_data);
mutex_unlock(&irq_domain_mutex);
irq_clear_status_flags(virq, IRQ_NOREQUEST);
@@ -1122,16 +1152,9 @@ static void irq_domain_insert_irq(int virq)
for (data = irq_get_irq_data(virq); data; data = data->parent_data) {
struct irq_domain *domain = data->domain;
- irq_hw_number_t hwirq = data->hwirq;
domain->mapcount++;
- if (hwirq < domain->revmap_size) {
- domain->linear_revmap[hwirq] = virq;
- } else {
- mutex_lock(&revmap_trees_mutex);
- radix_tree_insert(&domain->revmap_tree, hwirq, data);
- mutex_unlock(&revmap_trees_mutex);
- }
+ irq_domain_set_mapping(domain, data->hwirq, data);
/* If not already assigned, give the domain the chip's name */
if (!domain->name && data->chip)
@@ -1155,13 +1178,7 @@ static void irq_domain_remove_irq(int virq)
irq_hw_number_t hwirq = data->hwirq;
domain->mapcount--;
- if (hwirq < domain->revmap_size) {
- domain->linear_revmap[hwirq] = 0;
- } else {
- mutex_lock(&revmap_trees_mutex);
- radix_tree_delete(&domain->revmap_tree, hwirq);
- mutex_unlock(&revmap_trees_mutex);
- }
+ irq_domain_clear_mapping(domain, hwirq);
}
}
@@ -1346,7 +1363,8 @@ static void irq_domain_free_irqs_hierarchy(struct irq_domain *domain,
unsigned int irq_base,
unsigned int nr_irqs)
{
- domain->ops->free(domain, irq_base, nr_irqs);
+ if (domain->ops->free)
+ domain->ops->free(domain, irq_base, nr_irqs);
}
int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain,
@@ -1432,6 +1450,175 @@ out_free_desc:
return ret;
}
+/* The irq_data was moved, fix the revmap to refer to the new location */
+static void irq_domain_fix_revmap(struct irq_data *d)
+{
+ void **slot;
+
+ if (d->hwirq < d->domain->revmap_size)
+ return; /* Not using radix tree. */
+
+ /* Fix up the revmap. */
+ mutex_lock(&revmap_trees_mutex);
+ slot = radix_tree_lookup_slot(&d->domain->revmap_tree, d->hwirq);
+ if (slot)
+ radix_tree_replace_slot(&d->domain->revmap_tree, slot, d);
+ mutex_unlock(&revmap_trees_mutex);
+}
+
+/**
+ * irq_domain_push_irq() - Push a domain in to the top of a hierarchy.
+ * @domain: Domain to push.
+ * @virq: Irq to push the domain in to.
+ * @arg: Passed to the irq_domain_ops alloc() function.
+ *
+ * For an already existing irqdomain hierarchy, as might be obtained
+ * via a call to pci_enable_msix(), add an additional domain to the
+ * head of the processing chain. Must be called before request_irq()
+ * has been called.
+ */
+int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg)
+{
+ struct irq_data *child_irq_data;
+ struct irq_data *root_irq_data = irq_get_irq_data(virq);
+ struct irq_desc *desc;
+ int rv = 0;
+
+ /*
+ * Check that no action has been set, which indicates the virq
+ * is in a state where this function doesn't have to deal with
+ * races between interrupt handling and maintaining the
+ * hierarchy. This will catch gross misuse. Attempting to
+ * make the check race free would require holding locks across
+ * calls to struct irq_domain_ops->alloc(), which could lead
+ * to deadlock, so we just do a simple check before starting.
+ */
+ desc = irq_to_desc(virq);
+ if (!desc)
+ return -EINVAL;
+ if (WARN_ON(desc->action))
+ return -EBUSY;
+
+ if (domain == NULL)
+ return -EINVAL;
+
+ if (WARN_ON(!irq_domain_is_hierarchy(domain)))
+ return -EINVAL;
+
+ if (!root_irq_data)
+ return -EINVAL;
+
+ if (domain->parent != root_irq_data->domain)
+ return -EINVAL;
+
+ child_irq_data = kzalloc_node(sizeof(*child_irq_data), GFP_KERNEL,
+ irq_data_get_node(root_irq_data));
+ if (!child_irq_data)
+ return -ENOMEM;
+
+ mutex_lock(&irq_domain_mutex);
+
+ /* Copy the original irq_data. */
+ *child_irq_data = *root_irq_data;
+
+ /*
+ * Overwrite the root_irq_data, which is embedded in struct
+ * irq_desc, with values for this domain.
+ */
+ root_irq_data->parent_data = child_irq_data;
+ root_irq_data->domain = domain;
+ root_irq_data->mask = 0;
+ root_irq_data->hwirq = 0;
+ root_irq_data->chip = NULL;
+ root_irq_data->chip_data = NULL;
+
+ /* May (probably does) set hwirq, chip, etc. */
+ rv = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg);
+ if (rv) {
+ /* Restore the original irq_data. */
+ *root_irq_data = *child_irq_data;
+ goto error;
+ }
+
+ irq_domain_fix_revmap(child_irq_data);
+ irq_domain_set_mapping(domain, root_irq_data->hwirq, root_irq_data);
+
+error:
+ mutex_unlock(&irq_domain_mutex);
+
+ return rv;
+}
+EXPORT_SYMBOL_GPL(irq_domain_push_irq);
+
+/**
+ * irq_domain_pop_irq() - Remove a domain from the top of a hierarchy.
+ * @domain: Domain to remove.
+ * @virq: Irq to remove the domain from.
+ *
+ * Undo the effects of a call to irq_domain_push_irq(). Must be
+ * called either before request_irq() or after free_irq().
+ */
+int irq_domain_pop_irq(struct irq_domain *domain, int virq)
+{
+ struct irq_data *root_irq_data = irq_get_irq_data(virq);
+ struct irq_data *child_irq_data;
+ struct irq_data *tmp_irq_data;
+ struct irq_desc *desc;
+
+ /*
+ * Check that no action is set, which indicates the virq is in
+ * a state where this function doesn't have to deal with races
+ * between interrupt handling and maintaining the hierarchy.
+ * This will catch gross misuse. Attempting to make the check
+ * race free would require holding locks across calls to
+ * struct irq_domain_ops->free(), which could lead to
+ * deadlock, so we just do a simple check before starting.
+ */
+ desc = irq_to_desc(virq);
+ if (!desc)
+ return -EINVAL;
+ if (WARN_ON(desc->action))
+ return -EBUSY;
+
+ if (domain == NULL)
+ return -EINVAL;
+
+ if (!root_irq_data)
+ return -EINVAL;
+
+ tmp_irq_data = irq_domain_get_irq_data(domain, virq);
+
+ /* We can only "pop" if this domain is at the top of the list */
+ if (WARN_ON(root_irq_data != tmp_irq_data))
+ return -EINVAL;
+
+ if (WARN_ON(root_irq_data->domain != domain))
+ return -EINVAL;
+
+ child_irq_data = root_irq_data->parent_data;
+ if (WARN_ON(!child_irq_data))
+ return -EINVAL;
+
+ mutex_lock(&irq_domain_mutex);
+
+ root_irq_data->parent_data = NULL;
+
+ irq_domain_clear_mapping(domain, root_irq_data->hwirq);
+ irq_domain_free_irqs_hierarchy(domain, virq, 1);
+
+ /* Restore the original irq_data. */
+ *root_irq_data = *child_irq_data;
+
+ irq_domain_fix_revmap(root_irq_data);
+
+ mutex_unlock(&irq_domain_mutex);
+
+ kfree(child_irq_data);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_domain_pop_irq);
+
/**
* irq_domain_free_irqs - Free IRQ number and associated data structures
* @virq: base IRQ number
@@ -1667,8 +1854,7 @@ static void debugfs_add_domain_dir(struct irq_domain *d)
static void debugfs_remove_domain_dir(struct irq_domain *d)
{
- if (d->debugfs_file)
- debugfs_remove(d->debugfs_file);
+ debugfs_remove(d->debugfs_file);
}
void __init irq_domain_debugfs_init(struct dentry *root)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5c11c1730ba5..573dc52b0806 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -400,8 +400,18 @@ int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info)
return -EINVAL;
data = irq_desc_get_irq_data(desc);
- chip = irq_data_get_irq_chip(data);
- if (chip && chip->irq_set_vcpu_affinity)
+ do {
+ chip = irq_data_get_irq_chip(data);
+ if (chip && chip->irq_set_vcpu_affinity)
+ break;
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ data = data->parent_data;
+#else
+ data = NULL;
+#endif
+ } while (data);
+
+ if (data)
ret = chip->irq_set_vcpu_affinity(data, vcpu_info);
irq_put_desc_unlock(desc, flags);
@@ -1090,6 +1100,16 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
/*
* Internal function to register an irqaction - typically used to
* allocate special interrupts that are part of the architecture.
+ *
+ * Locking rules:
+ *
+ * desc->request_mutex Provides serialization against a concurrent free_irq()
+ * chip_bus_lock Provides serialization for slow bus operations
+ * desc->lock Provides serialization against hard interrupts
+ *
+ * chip_bus_lock and desc->lock are sufficient for all other management and
+ * interrupt related functions. desc->request_mutex solely serializes
+ * request/free_irq().
*/
static int
__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
@@ -1168,7 +1188,34 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
new->flags &= ~IRQF_ONESHOT;
/*
+ * Protects against a concurrent __free_irq() call which might wait
+ * for synchronize_irq() to complete without holding the optional
+ * chip bus lock and desc->lock.
+ */
+ mutex_lock(&desc->request_mutex);
+
+ /*
+ * Acquire bus lock as the irq_request_resources() callback below
+ * might rely on the serialization or the magic power management
+ * functions which are abusing the irq_bus_lock() callback,
+ */
+ chip_bus_lock(desc);
+
+ /* First installed action requests resources. */
+ if (!desc->action) {
+ ret = irq_request_resources(desc);
+ if (ret) {
+ pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n",
+ new->name, irq, desc->irq_data.chip->name);
+ goto out_bus_unlock;
+ }
+ }
+
+ /*
* The following block of code has to be executed atomically
+ * protected against a concurrent interrupt and any of the other
+ * management calls which are not serialized via
+ * desc->request_mutex or the optional bus lock.
*/
raw_spin_lock_irqsave(&desc->lock, flags);
old_ptr = &desc->action;
@@ -1267,13 +1314,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
}
if (!shared) {
- ret = irq_request_resources(desc);
- if (ret) {
- pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n",
- new->name, irq, desc->irq_data.chip->name);
- goto out_unlock;
- }
-
init_waitqueue_head(&desc->wait_for_threads);
/* Setup the type (level, edge polarity) if configured: */
@@ -1281,10 +1321,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
ret = __irq_set_trigger(desc,
new->flags & IRQF_TRIGGER_MASK);
- if (ret) {
- irq_release_resources(desc);
+ if (ret)
goto out_unlock;
- }
}
desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \
@@ -1347,6 +1385,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
}
raw_spin_unlock_irqrestore(&desc->lock, flags);
+ chip_bus_sync_unlock(desc);
+ mutex_unlock(&desc->request_mutex);
irq_setup_timings(desc, new);
@@ -1378,6 +1418,12 @@ mismatch:
out_unlock:
raw_spin_unlock_irqrestore(&desc->lock, flags);
+ if (!desc->action)
+ irq_release_resources(desc);
+out_bus_unlock:
+ chip_bus_sync_unlock(desc);
+ mutex_unlock(&desc->request_mutex);
+
out_thread:
if (new->thread) {
struct task_struct *t = new->thread;
@@ -1417,9 +1463,7 @@ int setup_irq(unsigned int irq, struct irqaction *act)
if (retval < 0)
return retval;
- chip_bus_lock(desc);
retval = __setup_irq(irq, desc, act);
- chip_bus_sync_unlock(desc);
if (retval)
irq_chip_pm_put(&desc->irq_data);
@@ -1443,6 +1487,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
if (!desc)
return NULL;
+ mutex_lock(&desc->request_mutex);
chip_bus_lock(desc);
raw_spin_lock_irqsave(&desc->lock, flags);
@@ -1458,6 +1503,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
WARN(1, "Trying to free already-free IRQ %d\n", irq);
raw_spin_unlock_irqrestore(&desc->lock, flags);
chip_bus_sync_unlock(desc);
+ mutex_unlock(&desc->request_mutex);
return NULL;
}
@@ -1475,8 +1521,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
if (!desc->action) {
irq_settings_clr_disable_unlazy(desc);
irq_shutdown(desc);
- irq_release_resources(desc);
- irq_remove_timings(desc);
}
#ifdef CONFIG_SMP
@@ -1486,6 +1530,20 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
#endif
raw_spin_unlock_irqrestore(&desc->lock, flags);
+ /*
+ * Drop bus_lock here so the changes which were done in the chip
+ * callbacks above are synced out to the irq chips which hang
+ * behind a slow bus (I2C, SPI) before calling synchronize_irq().
+ *
+ * Aside of that the bus_lock can also be taken from the threaded
+ * handler in irq_finalize_oneshot() which results in a deadlock
+ * because synchronize_irq() would wait forever for the thread to
+ * complete, which is blocked on the bus lock.
+ *
+ * The still held desc->request_mutex() protects against a
+ * concurrent request_irq() of this irq so the release of resources
+ * and timing data is properly serialized.
+ */
chip_bus_sync_unlock(desc);
unregister_handler_proc(irq, action);
@@ -1518,6 +1576,20 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
}
}
+ /* Last action releases resources */
+ if (!desc->action) {
+ /*
+ * Reaquire bus lock as irq_release_resources() might
+ * require it to deallocate resources over the slow bus.
+ */
+ chip_bus_lock(desc);
+ irq_release_resources(desc);
+ chip_bus_sync_unlock(desc);
+ irq_remove_timings(desc);
+ }
+
+ mutex_unlock(&desc->request_mutex);
+
irq_chip_pm_put(&desc->irq_data);
module_put(desc->owner);
kfree(action->secondary);
@@ -1674,9 +1746,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
return retval;
}
- chip_bus_lock(desc);
retval = __setup_irq(irq, desc, action);
- chip_bus_sync_unlock(desc);
if (retval) {
irq_chip_pm_put(&desc->irq_data);
@@ -1924,9 +1994,7 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
if (retval < 0)
return retval;
- chip_bus_lock(desc);
retval = __setup_irq(irq, desc, act);
- chip_bus_sync_unlock(desc);
if (retval)
irq_chip_pm_put(&desc->irq_data);
@@ -1935,9 +2003,10 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
}
/**
- * request_percpu_irq - allocate a percpu interrupt line
+ * __request_percpu_irq - allocate a percpu interrupt line
* @irq: Interrupt line to allocate
* @handler: Function to be called when the IRQ occurs.
+ * @flags: Interrupt type flags (IRQF_TIMER only)
* @devname: An ascii name for the claiming device
* @dev_id: A percpu cookie passed back to the handler function
*
@@ -1950,8 +2019,9 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
* the handler gets called with the interrupted CPU's instance of
* that variable.
*/
-int request_percpu_irq(unsigned int irq, irq_handler_t handler,
- const char *devname, void __percpu *dev_id)
+int __request_percpu_irq(unsigned int irq, irq_handler_t handler,
+ unsigned long flags, const char *devname,
+ void __percpu *dev_id)
{
struct irqaction *action;
struct irq_desc *desc;
@@ -1965,12 +2035,15 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
!irq_settings_is_per_cpu_devid(desc))
return -EINVAL;
+ if (flags && flags != IRQF_TIMER)
+ return -EINVAL;
+
action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
if (!action)
return -ENOMEM;
action->handler = handler;
- action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND;
+ action->flags = flags | IRQF_PERCPU | IRQF_NO_SUSPEND;
action->name = devname;
action->percpu_dev_id = dev_id;
@@ -1980,9 +2053,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
return retval;
}
- chip_bus_lock(desc);
retval = __setup_irq(irq, desc, action);
- chip_bus_sync_unlock(desc);
if (retval) {
irq_chip_pm_put(&desc->irq_data);
@@ -1991,7 +2062,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
return retval;
}
-EXPORT_SYMBOL_GPL(request_percpu_irq);
+EXPORT_SYMBOL_GPL(__request_percpu_irq);
/**
* irq_get_irqchip_state - returns the irqchip state of a interrupt.
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index cea1de0161f1..6bd9b58429cc 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -149,6 +149,8 @@ static void resume_irq(struct irq_desc *desc)
/* Pretend that it got disabled ! */
desc->depth++;
+ irq_state_set_disabled(desc);
+ irq_state_set_masked(desc);
resume:
desc->istate &= ~IRQS_SUSPENDED;
__enable_irq(desc);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 7f9642a1e267..6376b4a598d3 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -61,12 +61,12 @@ static int show_irq_affinity(int type, struct seq_file *m)
case EFFECTIVE:
case EFFECTIVE_LIST:
#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
- mask = desc->irq_common_data.effective_affinity;
+ mask = irq_data_get_effective_affinity_mask(&desc->irq_data);
break;
-#else
- return -EINVAL;
#endif
- };
+ default:
+ return -EINVAL;
+ }
switch (type) {
case AFFINITY_LIST:
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index d11c506a6ac3..0bf2e8f5244a 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -79,29 +79,7 @@ int static_key_count(struct static_key *key)
}
EXPORT_SYMBOL_GPL(static_key_count);
-void static_key_enable(struct static_key *key)
-{
- int count = static_key_count(key);
-
- WARN_ON_ONCE(count < 0 || count > 1);
-
- if (!count)
- static_key_slow_inc(key);
-}
-EXPORT_SYMBOL_GPL(static_key_enable);
-
-void static_key_disable(struct static_key *key)
-{
- int count = static_key_count(key);
-
- WARN_ON_ONCE(count < 0 || count > 1);
-
- if (count)
- static_key_slow_dec(key);
-}
-EXPORT_SYMBOL_GPL(static_key_disable);
-
-void static_key_slow_inc(struct static_key *key)
+static void static_key_slow_inc_cpuslocked(struct static_key *key)
{
int v, v1;
@@ -125,24 +103,87 @@ void static_key_slow_inc(struct static_key *key)
return;
}
- cpus_read_lock();
jump_label_lock();
if (atomic_read(&key->enabled) == 0) {
atomic_set(&key->enabled, -1);
jump_label_update(key);
- atomic_set(&key->enabled, 1);
+ /*
+ * Ensure that if the above cmpxchg loop observes our positive
+ * value, it must also observe all the text changes.
+ */
+ atomic_set_release(&key->enabled, 1);
} else {
atomic_inc(&key->enabled);
}
jump_label_unlock();
+}
+
+void static_key_slow_inc(struct static_key *key)
+{
+ cpus_read_lock();
+ static_key_slow_inc_cpuslocked(key);
cpus_read_unlock();
}
EXPORT_SYMBOL_GPL(static_key_slow_inc);
-static void __static_key_slow_dec(struct static_key *key,
- unsigned long rate_limit, struct delayed_work *work)
+void static_key_enable_cpuslocked(struct static_key *key)
+{
+ STATIC_KEY_CHECK_USE();
+
+ if (atomic_read(&key->enabled) > 0) {
+ WARN_ON_ONCE(atomic_read(&key->enabled) != 1);
+ return;
+ }
+
+ jump_label_lock();
+ if (atomic_read(&key->enabled) == 0) {
+ atomic_set(&key->enabled, -1);
+ jump_label_update(key);
+ /*
+ * See static_key_slow_inc().
+ */
+ atomic_set_release(&key->enabled, 1);
+ }
+ jump_label_unlock();
+}
+EXPORT_SYMBOL_GPL(static_key_enable_cpuslocked);
+
+void static_key_enable(struct static_key *key)
+{
+ cpus_read_lock();
+ static_key_enable_cpuslocked(key);
+ cpus_read_unlock();
+}
+EXPORT_SYMBOL_GPL(static_key_enable);
+
+void static_key_disable_cpuslocked(struct static_key *key)
+{
+ STATIC_KEY_CHECK_USE();
+
+ if (atomic_read(&key->enabled) != 1) {
+ WARN_ON_ONCE(atomic_read(&key->enabled) != 0);
+ return;
+ }
+
+ jump_label_lock();
+ if (atomic_cmpxchg(&key->enabled, 1, 0))
+ jump_label_update(key);
+ jump_label_unlock();
+}
+EXPORT_SYMBOL_GPL(static_key_disable_cpuslocked);
+
+void static_key_disable(struct static_key *key)
{
cpus_read_lock();
+ static_key_disable_cpuslocked(key);
+ cpus_read_unlock();
+}
+EXPORT_SYMBOL_GPL(static_key_disable);
+
+static void static_key_slow_dec_cpuslocked(struct static_key *key,
+ unsigned long rate_limit,
+ struct delayed_work *work)
+{
/*
* The negative count check is valid even when a negative
* key->enabled is in use by static_key_slow_inc(); a
@@ -153,7 +194,6 @@ static void __static_key_slow_dec(struct static_key *key,
if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) {
WARN(atomic_read(&key->enabled) < 0,
"jump label: negative count!\n");
- cpus_read_unlock();
return;
}
@@ -164,6 +204,14 @@ static void __static_key_slow_dec(struct static_key *key,
jump_label_update(key);
}
jump_label_unlock();
+}
+
+static void __static_key_slow_dec(struct static_key *key,
+ unsigned long rate_limit,
+ struct delayed_work *work)
+{
+ cpus_read_lock();
+ static_key_slow_dec_cpuslocked(key, rate_limit, work);
cpus_read_unlock();
}
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 6a3b249a2ae1..127e7cfafa55 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -28,12 +28,6 @@
#include <asm/sections.h>
-#ifdef CONFIG_KALLSYMS_ALL
-#define all_var 1
-#else
-#define all_var 0
-#endif
-
/*
* These will be re-linked against their real values
* during the second link stage.
@@ -82,7 +76,7 @@ static inline int is_kernel(unsigned long addr)
static int is_ksym_addr(unsigned long addr)
{
- if (all_var)
+ if (IS_ENABLED(CONFIG_KALLSYMS_ALL))
return is_kernel(addr);
return is_kernel_text(addr) || is_kernel_inittext(addr);
@@ -280,7 +274,7 @@ static unsigned long get_symbol_pos(unsigned long addr,
if (!symbol_end) {
if (is_kernel_inittext(addr))
symbol_end = (unsigned long)_einittext;
- else if (all_var)
+ else if (IS_ENABLED(CONFIG_KALLSYMS_ALL))
symbol_end = (unsigned long)_end;
else
symbol_end = (unsigned long)_etext;
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index 3a47fa998fe0..ea34ed8bb952 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -11,6 +11,10 @@
#include <linux/bug.h>
#include <linux/err.h>
#include <linux/kcmp.h>
+#include <linux/capability.h>
+#include <linux/list.h>
+#include <linux/eventpoll.h>
+#include <linux/file.h>
#include <asm/unistd.h>
@@ -94,6 +98,56 @@ static int kcmp_lock(struct mutex *m1, struct mutex *m2)
return err;
}
+#ifdef CONFIG_EPOLL
+static int kcmp_epoll_target(struct task_struct *task1,
+ struct task_struct *task2,
+ unsigned long idx1,
+ struct kcmp_epoll_slot __user *uslot)
+{
+ struct file *filp, *filp_epoll, *filp_tgt;
+ struct kcmp_epoll_slot slot;
+ struct files_struct *files;
+
+ if (copy_from_user(&slot, uslot, sizeof(slot)))
+ return -EFAULT;
+
+ filp = get_file_raw_ptr(task1, idx1);
+ if (!filp)
+ return -EBADF;
+
+ files = get_files_struct(task2);
+ if (!files)
+ return -EBADF;
+
+ spin_lock(&files->file_lock);
+ filp_epoll = fcheck_files(files, slot.efd);
+ if (filp_epoll)
+ get_file(filp_epoll);
+ else
+ filp_tgt = ERR_PTR(-EBADF);
+ spin_unlock(&files->file_lock);
+ put_files_struct(files);
+
+ if (filp_epoll) {
+ filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff);
+ fput(filp_epoll);
+ } else
+
+ if (IS_ERR(filp_tgt))
+ return PTR_ERR(filp_tgt);
+
+ return kcmp_ptr(filp, filp_tgt, KCMP_FILE);
+}
+#else
+static int kcmp_epoll_target(struct task_struct *task1,
+ struct task_struct *task2,
+ unsigned long idx1,
+ struct kcmp_epoll_slot __user *uslot)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
unsigned long, idx1, unsigned long, idx2)
{
@@ -165,6 +219,9 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
ret = -EOPNOTSUPP;
#endif
break;
+ case KCMP_EPOLL_TFD:
+ ret = kcmp_epoll_target(task1, task2, idx1, (void *)idx2);
+ break;
default:
ret = -EINVAL;
break;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 980936a90ee6..e62ec4dc6620 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -144,6 +144,14 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
if (ret)
goto out;
+ /*
+ * Some architecture(like S390) may touch the crash memory before
+ * machine_kexec_prepare(), we must copy vmcoreinfo data after it.
+ */
+ ret = kimage_crash_copy_vmcoreinfo(image);
+ if (ret)
+ goto out;
+
for (i = 0; i < nr_segments; i++) {
ret = kimage_load_segment(image, &image->segment[i]);
if (ret)
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 154ffb489b93..20fef1a38602 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -301,7 +301,7 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
{
struct page *pages;
- pages = alloc_pages(gfp_mask, order);
+ pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order);
if (pages) {
unsigned int count, i;
@@ -310,6 +310,13 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
count = 1 << order;
for (i = 0; i < count; i++)
SetPageReserved(pages + i);
+
+ arch_kexec_post_alloc_pages(page_address(pages), count,
+ gfp_mask);
+
+ if (gfp_mask & __GFP_ZERO)
+ for (i = 0; i < count; i++)
+ clear_highpage(pages + i);
}
return pages;
@@ -321,6 +328,9 @@ static void kimage_free_pages(struct page *page)
order = page_private(page);
count = 1 << order;
+
+ arch_kexec_pre_free_pages(page_address(page), count);
+
for (i = 0; i < count; i++)
ClearPageReserved(page + i);
__free_pages(page, order);
@@ -482,6 +492,40 @@ struct page *kimage_alloc_control_pages(struct kimage *image,
return pages;
}
+int kimage_crash_copy_vmcoreinfo(struct kimage *image)
+{
+ struct page *vmcoreinfo_page;
+ void *safecopy;
+
+ if (image->type != KEXEC_TYPE_CRASH)
+ return 0;
+
+ /*
+ * For kdump, allocate one vmcoreinfo safe copy from the
+ * crash memory. as we have arch_kexec_protect_crashkres()
+ * after kexec syscall, we naturally protect it from write
+ * (even read) access under kernel direct mapping. But on
+ * the other hand, we still need to operate it when crash
+ * happens to generate vmcoreinfo note, hereby we rely on
+ * vmap for this purpose.
+ */
+ vmcoreinfo_page = kimage_alloc_control_pages(image, 0);
+ if (!vmcoreinfo_page) {
+ pr_warn("Could not allocate vmcoreinfo buffer\n");
+ return -ENOMEM;
+ }
+ safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL);
+ if (!safecopy) {
+ pr_warn("Could not vmap vmcoreinfo buffer\n");
+ return -ENOMEM;
+ }
+
+ image->vmcoreinfo_data_copy = safecopy;
+ crash_update_vmcoreinfo_safecopy(safecopy);
+
+ return 0;
+}
+
static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
{
if (*image->entry != 0)
@@ -569,6 +613,11 @@ void kimage_free(struct kimage *image)
if (!image)
return;
+ if (image->vmcoreinfo_data_copy) {
+ crash_update_vmcoreinfo_safecopy(NULL);
+ vunmap(image->vmcoreinfo_data_copy);
+ }
+
kimage_free_extra_pages(image);
for_each_kimage_entry(image, ptr, entry) {
if (entry & IND_INDIRECTION) {
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index b118735fea9d..9f48f4412297 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -26,13 +26,6 @@
#include <linux/vmalloc.h>
#include "kexec_internal.h"
-/*
- * Declare these symbols weak so that if architecture provides a purgatory,
- * these will be overridden.
- */
-char __weak kexec_purgatory[0];
-size_t __weak kexec_purgatory_size = 0;
-
static int kexec_calculate_store_digests(struct kimage *image);
/* Architectures can provide this probe function */
@@ -162,16 +155,10 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
}
if (cmdline_len) {
- image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL);
- if (!image->cmdline_buf) {
- ret = -ENOMEM;
- goto out;
- }
-
- ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
- cmdline_len);
- if (ret) {
- ret = -EFAULT;
+ image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len);
+ if (IS_ERR(image->cmdline_buf)) {
+ ret = PTR_ERR(image->cmdline_buf);
+ image->cmdline_buf = NULL;
goto out;
}
@@ -304,6 +291,14 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
if (ret)
goto out;
+ /*
+ * Some architecture(like S390) may touch the crash memory before
+ * machine_kexec_prepare(), we must copy vmcoreinfo data after it.
+ */
+ ret = kimage_crash_copy_vmcoreinfo(image);
+ if (ret)
+ goto out;
+
ret = kexec_calculate_store_digests(image);
if (ret)
goto out;
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
index 799a8a452187..50dfcb039a41 100644
--- a/kernel/kexec_internal.h
+++ b/kernel/kexec_internal.h
@@ -17,6 +17,8 @@ extern struct mutex kexec_mutex;
#ifdef CONFIG_KEXEC_FILE
#include <linux/purgatory.h>
void kimage_file_post_load_cleanup(struct kimage *image);
+extern char kexec_purgatory[];
+extern size_t kexec_purgatory_size;
#else /* CONFIG_KEXEC_FILE */
static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
#endif /* CONFIG_KEXEC_FILE */
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 563f97e2be36..2f37acde640b 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -45,8 +45,6 @@
#include <trace/events/module.h>
-extern int max_threads;
-
#define CAP_BSET (void *)1
#define CAP_PI (void *)2
@@ -56,6 +54,33 @@ static DEFINE_SPINLOCK(umh_sysctl_lock);
static DECLARE_RWSEM(umhelper_sem);
#ifdef CONFIG_MODULES
+/*
+ * Assuming:
+ *
+ * threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE,
+ * (u64) THREAD_SIZE * 8UL);
+ *
+ * If you need less than 50 threads would mean we're dealing with systems
+ * smaller than 3200 pages. This assuems you are capable of having ~13M memory,
+ * and this would only be an be an upper limit, after which the OOM killer
+ * would take effect. Systems like these are very unlikely if modules are
+ * enabled.
+ */
+#define MAX_KMOD_CONCURRENT 50
+static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT);
+static DECLARE_WAIT_QUEUE_HEAD(kmod_wq);
+
+/*
+ * This is a restriction on having *all* MAX_KMOD_CONCURRENT threads
+ * running at the same time without returning. When this happens we
+ * believe you've somehow ended up with a recursive module dependency
+ * creating a loop.
+ *
+ * We have no option but to fail.
+ *
+ * Userspace should proactively try to detect and prevent these.
+ */
+#define MAX_KMOD_ALL_BUSY_TIMEOUT 5
/*
modprobe_path is set via /proc/sys.
@@ -127,11 +152,7 @@ int __request_module(bool wait, const char *fmt, ...)
{
va_list args;
char module_name[MODULE_NAME_LEN];
- unsigned int max_modprobes;
int ret;
- static atomic_t kmod_concurrent = ATOMIC_INIT(0);
-#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
- static int kmod_loop_msg;
/*
* We don't allow synchronous module loading from async. Module
@@ -154,40 +175,34 @@ int __request_module(bool wait, const char *fmt, ...)
if (ret)
return ret;
- /* If modprobe needs a service that is in a module, we get a recursive
- * loop. Limit the number of running kmod threads to max_threads/2 or
- * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method
- * would be to run the parents of this process, counting how many times
- * kmod was invoked. That would mean accessing the internals of the
- * process tables to get the command line, proc_pid_cmdline is static
- * and it is not worth changing the proc code just to handle this case.
- * KAO.
- *
- * "trace the ppid" is simple, but will fail if someone's
- * parent exits. I think this is as good as it gets. --RR
- */
- max_modprobes = min(max_threads/2, MAX_KMOD_CONCURRENT);
- atomic_inc(&kmod_concurrent);
- if (atomic_read(&kmod_concurrent) > max_modprobes) {
- /* We may be blaming an innocent here, but unlikely */
- if (kmod_loop_msg < 5) {
- printk(KERN_ERR
- "request_module: runaway loop modprobe %s\n",
- module_name);
- kmod_loop_msg++;
+ if (atomic_dec_if_positive(&kmod_concurrent_max) < 0) {
+ pr_warn_ratelimited("request_module: kmod_concurrent_max (%u) close to 0 (max_modprobes: %u), for module %s, throttling...",
+ atomic_read(&kmod_concurrent_max),
+ MAX_KMOD_CONCURRENT, module_name);
+ ret = wait_event_killable_timeout(kmod_wq,
+ atomic_dec_if_positive(&kmod_concurrent_max) >= 0,
+ MAX_KMOD_ALL_BUSY_TIMEOUT * HZ);
+ if (!ret) {
+ pr_warn_ratelimited("request_module: modprobe %s cannot be processed, kmod busy with %d threads for more than %d seconds now",
+ module_name, MAX_KMOD_CONCURRENT, MAX_KMOD_ALL_BUSY_TIMEOUT);
+ return -ETIME;
+ } else if (ret == -ERESTARTSYS) {
+ pr_warn_ratelimited("request_module: sigkill sent for modprobe %s, giving up", module_name);
+ return ret;
}
- atomic_dec(&kmod_concurrent);
- return -ENOMEM;
}
trace_module_request(module_name, wait, _RET_IP_);
ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
- atomic_dec(&kmod_concurrent);
+ atomic_inc(&kmod_concurrent_max);
+ wake_up(&kmod_wq);
+
return ret;
}
EXPORT_SYMBOL(__request_module);
+
#endif /* CONFIG_MODULES */
static void call_usermodehelper_freeinfo(struct subprocess_info *info)
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 6756d750b31b..a1606a4224e1 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1771,24 +1771,13 @@ unsigned long __weak arch_deref_entry_point(void *entry)
int register_jprobes(struct jprobe **jps, int num)
{
- struct jprobe *jp;
int ret = 0, i;
if (num <= 0)
return -EINVAL;
+
for (i = 0; i < num; i++) {
- unsigned long addr, offset;
- jp = jps[i];
- addr = arch_deref_entry_point(jp->entry);
-
- /* Verify probepoint is a function entry point */
- if (kallsyms_lookup_size_offset(addr, NULL, &offset) &&
- offset == 0) {
- jp->kp.pre_handler = setjmp_pre_handler;
- jp->kp.break_handler = longjmp_break_handler;
- ret = register_kprobe(&jp->kp);
- } else
- ret = -EINVAL;
+ ret = register_jprobe(jps[i]);
if (ret < 0) {
if (i > 0)
@@ -1796,13 +1785,30 @@ int register_jprobes(struct jprobe **jps, int num)
break;
}
}
+
return ret;
}
EXPORT_SYMBOL_GPL(register_jprobes);
int register_jprobe(struct jprobe *jp)
{
- return register_jprobes(&jp, 1);
+ unsigned long addr, offset;
+ struct kprobe *kp = &jp->kp;
+
+ /*
+ * Verify probepoint as well as the jprobe handler are
+ * valid function entry points.
+ */
+ addr = arch_deref_entry_point(jp->entry);
+
+ if (kallsyms_lookup_size_offset(addr, NULL, &offset) && offset == 0 &&
+ kprobe_on_func_entry(kp->addr, kp->symbol_name, kp->offset)) {
+ kp->pre_handler = setjmp_pre_handler;
+ kp->break_handler = longjmp_break_handler;
+ return register_kprobe(kp);
+ }
+
+ return -EINVAL;
}
EXPORT_SYMBOL_GPL(register_jprobe);
@@ -1888,12 +1894,12 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
}
NOKPROBE_SYMBOL(pre_handler_kretprobe);
-bool __weak arch_function_offset_within_entry(unsigned long offset)
+bool __weak arch_kprobe_on_func_entry(unsigned long offset)
{
return !offset;
}
-bool function_offset_within_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset)
+bool kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset)
{
kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset);
@@ -1901,7 +1907,7 @@ bool function_offset_within_entry(kprobe_opcode_t *addr, const char *sym, unsign
return false;
if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset) ||
- !arch_function_offset_within_entry(offset))
+ !arch_kprobe_on_func_entry(offset))
return false;
return true;
@@ -1914,7 +1920,7 @@ int register_kretprobe(struct kretprobe *rp)
int i;
void *addr;
- if (!function_offset_within_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset))
+ if (!kprobe_on_func_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset))
return -EINVAL;
if (kretprobe_blacklist_size) {
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 23cd70651238..46ba853656f6 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -134,7 +134,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
{
phys_addr_t vmcore_base = paddr_vmcoreinfo_note();
return sprintf(buf, "%pa %x\n", &vmcore_base,
- (unsigned int)sizeof(vmcoreinfo_note));
+ (unsigned int)VMCOREINFO_NOTE_SIZE);
}
KERNEL_ATTR_RO(vmcoreinfo);
@@ -234,7 +234,7 @@ static struct attribute * kernel_attrs[] = {
NULL
};
-static struct attribute_group kernel_attr_group = {
+static const struct attribute_group kernel_attr_group = {
.attrs = kernel_attrs,
};
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 26db528c1d88..1c19edf82427 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -637,6 +637,7 @@ repeat:
schedule();
try_to_freeze();
+ cond_resched();
goto repeat;
}
EXPORT_SYMBOL_GPL(kthread_worker_fn);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 7d2499bec5fe..44c8d0d17170 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -58,6 +58,10 @@
#define CREATE_TRACE_POINTS
#include <trace/events/lock.h>
+#ifdef CONFIG_LOCKDEP_CROSSRELEASE
+#include <linux/slab.h>
+#endif
+
#ifdef CONFIG_PROVE_LOCKING
int prove_locking = 1;
module_param(prove_locking, int, 0644);
@@ -344,14 +348,12 @@ EXPORT_SYMBOL(lockdep_on);
#if VERBOSE
# define HARDIRQ_VERBOSE 1
# define SOFTIRQ_VERBOSE 1
-# define RECLAIM_VERBOSE 1
#else
# define HARDIRQ_VERBOSE 0
# define SOFTIRQ_VERBOSE 0
-# define RECLAIM_VERBOSE 0
#endif
-#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE || RECLAIM_VERBOSE
+#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE
/*
* Quick filtering for interesting events:
*/
@@ -726,6 +728,18 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL);
}
+#ifdef CONFIG_LOCKDEP_CROSSRELEASE
+static void cross_init(struct lockdep_map *lock, int cross);
+static int cross_lock(struct lockdep_map *lock);
+static int lock_acquire_crosslock(struct held_lock *hlock);
+static int lock_release_crosslock(struct lockdep_map *lock);
+#else
+static inline void cross_init(struct lockdep_map *lock, int cross) {}
+static inline int cross_lock(struct lockdep_map *lock) { return 0; }
+static inline int lock_acquire_crosslock(struct held_lock *hlock) { return 2; }
+static inline int lock_release_crosslock(struct lockdep_map *lock) { return 2; }
+#endif
+
/*
* Register a lock's class in the hash-table, if the class is not present
* yet. Otherwise we look it up. We cache the result in the lock object
@@ -1125,22 +1139,41 @@ print_circular_lock_scenario(struct held_lock *src,
printk(KERN_CONT "\n\n");
}
- printk(" Possible unsafe locking scenario:\n\n");
- printk(" CPU0 CPU1\n");
- printk(" ---- ----\n");
- printk(" lock(");
- __print_lock_name(target);
- printk(KERN_CONT ");\n");
- printk(" lock(");
- __print_lock_name(parent);
- printk(KERN_CONT ");\n");
- printk(" lock(");
- __print_lock_name(target);
- printk(KERN_CONT ");\n");
- printk(" lock(");
- __print_lock_name(source);
- printk(KERN_CONT ");\n");
- printk("\n *** DEADLOCK ***\n\n");
+ if (cross_lock(tgt->instance)) {
+ printk(" Possible unsafe locking scenario by crosslock:\n\n");
+ printk(" CPU0 CPU1\n");
+ printk(" ---- ----\n");
+ printk(" lock(");
+ __print_lock_name(parent);
+ printk(KERN_CONT ");\n");
+ printk(" lock(");
+ __print_lock_name(target);
+ printk(KERN_CONT ");\n");
+ printk(" lock(");
+ __print_lock_name(source);
+ printk(KERN_CONT ");\n");
+ printk(" unlock(");
+ __print_lock_name(target);
+ printk(KERN_CONT ");\n");
+ printk("\n *** DEADLOCK ***\n\n");
+ } else {
+ printk(" Possible unsafe locking scenario:\n\n");
+ printk(" CPU0 CPU1\n");
+ printk(" ---- ----\n");
+ printk(" lock(");
+ __print_lock_name(target);
+ printk(KERN_CONT ");\n");
+ printk(" lock(");
+ __print_lock_name(parent);
+ printk(KERN_CONT ");\n");
+ printk(" lock(");
+ __print_lock_name(target);
+ printk(KERN_CONT ");\n");
+ printk(" lock(");
+ __print_lock_name(source);
+ printk(KERN_CONT ");\n");
+ printk("\n *** DEADLOCK ***\n\n");
+ }
}
/*
@@ -1165,7 +1198,12 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
pr_warn("%s/%d is trying to acquire lock:\n",
curr->comm, task_pid_nr(curr));
print_lock(check_src);
- pr_warn("\nbut task is already holding lock:\n");
+
+ if (cross_lock(check_tgt->instance))
+ pr_warn("\nbut now in release context of a crosslock acquired at the following:\n");
+ else
+ pr_warn("\nbut task is already holding lock:\n");
+
print_lock(check_tgt);
pr_warn("\nwhich lock already depends on the new lock.\n\n");
pr_warn("\nthe existing dependency chain (in reverse order) is:\n");
@@ -1183,7 +1221,8 @@ static inline int class_equal(struct lock_list *entry, void *data)
static noinline int print_circular_bug(struct lock_list *this,
struct lock_list *target,
struct held_lock *check_src,
- struct held_lock *check_tgt)
+ struct held_lock *check_tgt,
+ struct stack_trace *trace)
{
struct task_struct *curr = current;
struct lock_list *parent;
@@ -1193,7 +1232,9 @@ static noinline int print_circular_bug(struct lock_list *this,
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return 0;
- if (!save_trace(&this->trace))
+ if (cross_lock(check_tgt->instance))
+ this->trace = *trace;
+ else if (!save_trace(&this->trace))
return 0;
depth = get_lock_depth(target);
@@ -1309,6 +1350,19 @@ check_noncircular(struct lock_list *root, struct lock_class *target,
return result;
}
+static noinline int
+check_redundant(struct lock_list *root, struct lock_class *target,
+ struct lock_list **target_entry)
+{
+ int result;
+
+ debug_atomic_inc(nr_redundant_checks);
+
+ result = __bfs_forwards(root, target, class_equal, target_entry);
+
+ return result;
+}
+
#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
/*
* Forwards and backwards subgraph searching, for the purposes of
@@ -1784,6 +1838,9 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
if (nest)
return 2;
+ if (cross_lock(prev->instance))
+ continue;
+
return print_deadlock_bug(curr, prev, next);
}
return 1;
@@ -1813,20 +1870,13 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
*/
static int
check_prev_add(struct task_struct *curr, struct held_lock *prev,
- struct held_lock *next, int distance, int *stack_saved)
+ struct held_lock *next, int distance, struct stack_trace *trace,
+ int (*save)(struct stack_trace *trace))
{
struct lock_list *entry;
int ret;
struct lock_list this;
struct lock_list *uninitialized_var(target_entry);
- /*
- * Static variable, serialized by the graph_lock().
- *
- * We use this static variable to save the stack trace in case
- * we call into this function multiple times due to encountering
- * trylocks in the held lock stack.
- */
- static struct stack_trace trace;
/*
* Prove that the new <prev> -> <next> dependency would not
@@ -1841,7 +1891,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
this.parent = NULL;
ret = check_noncircular(&this, hlock_class(prev), &target_entry);
if (unlikely(!ret))
- return print_circular_bug(&this, target_entry, next, prev);
+ return print_circular_bug(&this, target_entry, next, prev, trace);
else if (unlikely(ret < 0))
return print_bfs_bug(ret);
@@ -1870,15 +1920,26 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
if (entry->class == hlock_class(next)) {
if (distance == 1)
entry->distance = 1;
- return 2;
+ return 1;
}
}
- if (!*stack_saved) {
- if (!save_trace(&trace))
- return 0;
- *stack_saved = 1;
+ /*
+ * Is the <prev> -> <next> link redundant?
+ */
+ this.class = hlock_class(prev);
+ this.parent = NULL;
+ ret = check_redundant(&this, hlock_class(next), &target_entry);
+ if (!ret) {
+ debug_atomic_inc(nr_redundant);
+ return 2;
}
+ if (ret < 0)
+ return print_bfs_bug(ret);
+
+
+ if (save && !save(trace))
+ return 0;
/*
* Ok, all validations passed, add the new lock
@@ -1886,14 +1947,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
*/
ret = add_lock_to_list(hlock_class(next),
&hlock_class(prev)->locks_after,
- next->acquire_ip, distance, &trace);
+ next->acquire_ip, distance, trace);
if (!ret)
return 0;
ret = add_lock_to_list(hlock_class(prev),
&hlock_class(next)->locks_before,
- next->acquire_ip, distance, &trace);
+ next->acquire_ip, distance, trace);
if (!ret)
return 0;
@@ -1901,8 +1962,6 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
* Debugging printouts:
*/
if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) {
- /* We drop graph lock, so another thread can overwrite trace. */
- *stack_saved = 0;
graph_unlock();
printk("\n new dependency: ");
print_lock_name(hlock_class(prev));
@@ -1910,9 +1969,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
print_lock_name(hlock_class(next));
printk(KERN_CONT "\n");
dump_stack();
- return graph_lock();
+ if (!graph_lock())
+ return 0;
}
- return 1;
+ return 2;
}
/*
@@ -1925,8 +1985,9 @@ static int
check_prevs_add(struct task_struct *curr, struct held_lock *next)
{
int depth = curr->lockdep_depth;
- int stack_saved = 0;
struct held_lock *hlock;
+ struct stack_trace trace;
+ int (*save)(struct stack_trace *trace) = save_trace;
/*
* Debugging checks.
@@ -1947,21 +2008,36 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
int distance = curr->lockdep_depth - depth + 1;
hlock = curr->held_locks + depth - 1;
/*
- * Only non-recursive-read entries get new dependencies
- * added:
+ * Only non-crosslock entries get new dependencies added.
+ * Crosslock entries will be added by commit later:
*/
- if (hlock->read != 2 && hlock->check) {
- if (!check_prev_add(curr, hlock, next,
- distance, &stack_saved))
- return 0;
+ if (!cross_lock(hlock->instance)) {
/*
- * Stop after the first non-trylock entry,
- * as non-trylock entries have added their
- * own direct dependencies already, so this
- * lock is connected to them indirectly:
+ * Only non-recursive-read entries get new dependencies
+ * added:
*/
- if (!hlock->trylock)
- break;
+ if (hlock->read != 2 && hlock->check) {
+ int ret = check_prev_add(curr, hlock, next,
+ distance, &trace, save);
+ if (!ret)
+ return 0;
+
+ /*
+ * Stop saving stack_trace if save_trace() was
+ * called at least once:
+ */
+ if (save && ret == 2)
+ save = NULL;
+
+ /*
+ * Stop after the first non-trylock entry,
+ * as non-trylock entries have added their
+ * own direct dependencies already, so this
+ * lock is connected to them indirectly:
+ */
+ if (!hlock->trylock)
+ break;
+ }
}
depth--;
/*
@@ -2126,19 +2202,26 @@ static int check_no_collision(struct task_struct *curr,
}
/*
- * Look up a dependency chain. If the key is not present yet then
- * add it and return 1 - in this case the new dependency chain is
- * validated. If the key is already hashed, return 0.
- * (On return with 1 graph_lock is held.)
+ * This is for building a chain between just two different classes,
+ * instead of adding a new hlock upon current, which is done by
+ * add_chain_cache().
+ *
+ * This can be called in any context with two classes, while
+ * add_chain_cache() must be done within the lock owener's context
+ * since it uses hlock which might be racy in another context.
*/
-static inline int lookup_chain_cache(struct task_struct *curr,
- struct held_lock *hlock,
- u64 chain_key)
+static inline int add_chain_cache_classes(unsigned int prev,
+ unsigned int next,
+ unsigned int irq_context,
+ u64 chain_key)
{
- struct lock_class *class = hlock_class(hlock);
struct hlist_head *hash_head = chainhashentry(chain_key);
struct lock_chain *chain;
- int i, j;
+
+ /*
+ * Allocate a new chain entry from the static array, and add
+ * it to the hash:
+ */
/*
* We might need to take the graph lock, ensure we've got IRQs
@@ -2147,43 +2230,76 @@ static inline int lookup_chain_cache(struct task_struct *curr,
*/
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return 0;
+
+ if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) {
+ if (!debug_locks_off_graph_unlock())
+ return 0;
+
+ print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!");
+ dump_stack();
+ return 0;
+ }
+
+ chain = lock_chains + nr_lock_chains++;
+ chain->chain_key = chain_key;
+ chain->irq_context = irq_context;
+ chain->depth = 2;
+ if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
+ chain->base = nr_chain_hlocks;
+ nr_chain_hlocks += chain->depth;
+ chain_hlocks[chain->base] = prev - 1;
+ chain_hlocks[chain->base + 1] = next -1;
+ }
+#ifdef CONFIG_DEBUG_LOCKDEP
/*
- * We can walk it lock-free, because entries only get added
- * to the hash:
+ * Important for check_no_collision().
*/
- hlist_for_each_entry_rcu(chain, hash_head, entry) {
- if (chain->chain_key == chain_key) {
-cache_hit:
- debug_atomic_inc(chain_lookup_hits);
- if (!check_no_collision(curr, hlock, chain))
- return 0;
-
- if (very_verbose(class))
- printk("\nhash chain already cached, key: "
- "%016Lx tail class: [%p] %s\n",
- (unsigned long long)chain_key,
- class->key, class->name);
+ else {
+ if (!debug_locks_off_graph_unlock())
return 0;
- }
+
+ print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!");
+ dump_stack();
+ return 0;
}
- if (very_verbose(class))
- printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n",
- (unsigned long long)chain_key, class->key, class->name);
+#endif
+
+ hlist_add_head_rcu(&chain->entry, hash_head);
+ debug_atomic_inc(chain_lookup_misses);
+ inc_chains();
+
+ return 1;
+}
+
+/*
+ * Adds a dependency chain into chain hashtable. And must be called with
+ * graph_lock held.
+ *
+ * Return 0 if fail, and graph_lock is released.
+ * Return 1 if succeed, with graph_lock held.
+ */
+static inline int add_chain_cache(struct task_struct *curr,
+ struct held_lock *hlock,
+ u64 chain_key)
+{
+ struct lock_class *class = hlock_class(hlock);
+ struct hlist_head *hash_head = chainhashentry(chain_key);
+ struct lock_chain *chain;
+ int i, j;
+
/*
* Allocate a new chain entry from the static array, and add
* it to the hash:
*/
- if (!graph_lock())
- return 0;
+
/*
- * We have to walk the chain again locked - to avoid duplicates:
+ * We might need to take the graph lock, ensure we've got IRQs
+ * disabled to make this an IRQ-safe lock.. for recursion reasons
+ * lockdep won't complain about its own locking errors.
*/
- hlist_for_each_entry(chain, hash_head, entry) {
- if (chain->chain_key == chain_key) {
- graph_unlock();
- goto cache_hit;
- }
- }
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return 0;
+
if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) {
if (!debug_locks_off_graph_unlock())
return 0;
@@ -2235,6 +2351,78 @@ cache_hit:
return 1;
}
+/*
+ * Look up a dependency chain.
+ */
+static inline struct lock_chain *lookup_chain_cache(u64 chain_key)
+{
+ struct hlist_head *hash_head = chainhashentry(chain_key);
+ struct lock_chain *chain;
+
+ /*
+ * We can walk it lock-free, because entries only get added
+ * to the hash:
+ */
+ hlist_for_each_entry_rcu(chain, hash_head, entry) {
+ if (chain->chain_key == chain_key) {
+ debug_atomic_inc(chain_lookup_hits);
+ return chain;
+ }
+ }
+ return NULL;
+}
+
+/*
+ * If the key is not present yet in dependency chain cache then
+ * add it and return 1 - in this case the new dependency chain is
+ * validated. If the key is already hashed, return 0.
+ * (On return with 1 graph_lock is held.)
+ */
+static inline int lookup_chain_cache_add(struct task_struct *curr,
+ struct held_lock *hlock,
+ u64 chain_key)
+{
+ struct lock_class *class = hlock_class(hlock);
+ struct lock_chain *chain = lookup_chain_cache(chain_key);
+
+ if (chain) {
+cache_hit:
+ if (!check_no_collision(curr, hlock, chain))
+ return 0;
+
+ if (very_verbose(class)) {
+ printk("\nhash chain already cached, key: "
+ "%016Lx tail class: [%p] %s\n",
+ (unsigned long long)chain_key,
+ class->key, class->name);
+ }
+
+ return 0;
+ }
+
+ if (very_verbose(class)) {
+ printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n",
+ (unsigned long long)chain_key, class->key, class->name);
+ }
+
+ if (!graph_lock())
+ return 0;
+
+ /*
+ * We have to walk the chain again locked - to avoid duplicates:
+ */
+ chain = lookup_chain_cache(chain_key);
+ if (chain) {
+ graph_unlock();
+ goto cache_hit;
+ }
+
+ if (!add_chain_cache(curr, hlock, chain_key))
+ return 0;
+
+ return 1;
+}
+
static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
struct held_lock *hlock, int chain_head, u64 chain_key)
{
@@ -2245,11 +2433,11 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
*
* We look up the chain_key and do the O(N^2) check and update of
* the dependencies only if this is a new dependency chain.
- * (If lookup_chain_cache() returns with 1 it acquires
+ * (If lookup_chain_cache_add() return with 1 it acquires
* graph_lock for us)
*/
if (!hlock->trylock && hlock->check &&
- lookup_chain_cache(curr, hlock, chain_key)) {
+ lookup_chain_cache_add(curr, hlock, chain_key)) {
/*
* Check whether last held lock:
*
@@ -2277,14 +2465,17 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
* Add dependency only if this lock is not the head
* of the chain, and if it's not a secondary read-lock:
*/
- if (!chain_head && ret != 2)
+ if (!chain_head && ret != 2) {
if (!check_prevs_add(curr, hlock))
return 0;
+ }
+
graph_unlock();
- } else
- /* after lookup_chain_cache(): */
+ } else {
+ /* after lookup_chain_cache_add(): */
if (unlikely(!debug_locks))
return 0;
+ }
return 1;
}
@@ -2567,14 +2758,6 @@ static int SOFTIRQ_verbose(struct lock_class *class)
return 0;
}
-static int RECLAIM_FS_verbose(struct lock_class *class)
-{
-#if RECLAIM_VERBOSE
- return class_filter(class);
-#endif
- return 0;
-}
-
#define STRICT_READ_CHECKS 1
static int (*state_verbose_f[])(struct lock_class *class) = {
@@ -2870,57 +3053,6 @@ void trace_softirqs_off(unsigned long ip)
debug_atomic_inc(redundant_softirqs_off);
}
-static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
-{
- struct task_struct *curr = current;
-
- if (unlikely(!debug_locks))
- return;
-
- gfp_mask = current_gfp_context(gfp_mask);
-
- /* no reclaim without waiting on it */
- if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
- return;
-
- /* this guy won't enter reclaim */
- if ((curr->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
- return;
-
- /* We're only interested __GFP_FS allocations for now */
- if (!(gfp_mask & __GFP_FS) || (curr->flags & PF_MEMALLOC_NOFS))
- return;
-
- /*
- * Oi! Can't be having __GFP_FS allocations with IRQs disabled.
- */
- if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags)))
- return;
-
- /* Disable lockdep if explicitly requested */
- if (gfp_mask & __GFP_NOLOCKDEP)
- return;
-
- mark_held_locks(curr, RECLAIM_FS);
-}
-
-static void check_flags(unsigned long flags);
-
-void lockdep_trace_alloc(gfp_t gfp_mask)
-{
- unsigned long flags;
-
- if (unlikely(current->lockdep_recursion))
- return;
-
- raw_local_irq_save(flags);
- check_flags(flags);
- current->lockdep_recursion = 1;
- __lockdep_trace_alloc(gfp_mask, flags);
- current->lockdep_recursion = 0;
- raw_local_irq_restore(flags);
-}
-
static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
{
/*
@@ -2966,22 +3098,6 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
}
}
- /*
- * We reuse the irq context infrastructure more broadly as a general
- * context checking code. This tests GFP_FS recursion (a lock taken
- * during reclaim for a GFP_FS allocation is held over a GFP_FS
- * allocation).
- */
- if (!hlock->trylock && (curr->lockdep_reclaim_gfp & __GFP_FS)) {
- if (hlock->read) {
- if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS_READ))
- return 0;
- } else {
- if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS))
- return 0;
- }
- }
-
return 1;
}
@@ -3040,10 +3156,6 @@ static inline int separate_irq_context(struct task_struct *curr,
return 0;
}
-void lockdep_trace_alloc(gfp_t gfp_mask)
-{
-}
-
#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
/*
@@ -3116,7 +3228,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
/*
* Initialize a lock instance's lock-class mapping info:
*/
-void lockdep_init_map(struct lockdep_map *lock, const char *name,
+static void __lockdep_init_map(struct lockdep_map *lock, const char *name,
struct lock_class_key *key, int subclass)
{
int i;
@@ -3174,8 +3286,25 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
raw_local_irq_restore(flags);
}
}
+
+void lockdep_init_map(struct lockdep_map *lock, const char *name,
+ struct lock_class_key *key, int subclass)
+{
+ cross_init(lock, 0);
+ __lockdep_init_map(lock, name, key, subclass);
+}
EXPORT_SYMBOL_GPL(lockdep_init_map);
+#ifdef CONFIG_LOCKDEP_CROSSRELEASE
+void lockdep_init_map_crosslock(struct lockdep_map *lock, const char *name,
+ struct lock_class_key *key, int subclass)
+{
+ cross_init(lock, 1);
+ __lockdep_init_map(lock, name, key, subclass);
+}
+EXPORT_SYMBOL_GPL(lockdep_init_map_crosslock);
+#endif
+
struct lock_class_key __lockdep_no_validate__;
EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
@@ -3231,6 +3360,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
int chain_head = 0;
int class_idx;
u64 chain_key;
+ int ret;
if (unlikely(!debug_locks))
return 0;
@@ -3279,7 +3409,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
class_idx = class - lock_classes + 1;
- if (depth) {
+ /* TODO: nest_lock is not implemented for crosslock yet. */
+ if (depth && !cross_lock(lock)) {
hlock = curr->held_locks + depth - 1;
if (hlock->class_idx == class_idx && nest_lock) {
if (hlock->references) {
@@ -3367,6 +3498,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
return 0;
+ ret = lock_acquire_crosslock(hlock);
+ /*
+ * 2 means normal acquire operations are needed. Otherwise, it's
+ * ok just to return with '0:fail, 1:success'.
+ */
+ if (ret != 2)
+ return ret;
+
curr->curr_chain_key = chain_key;
curr->lockdep_depth++;
check_chain_key(curr);
@@ -3604,11 +3743,19 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
struct task_struct *curr = current;
struct held_lock *hlock;
unsigned int depth;
- int i;
+ int ret, i;
if (unlikely(!debug_locks))
return 0;
+ ret = lock_release_crosslock(lock);
+ /*
+ * 2 means normal release operations are needed. Otherwise, it's
+ * ok just to return with '0:fail, 1:success'.
+ */
+ if (ret != 2)
+ return ret;
+
depth = curr->lockdep_depth;
/*
* So we're all set to release this lock.. wait what lock? We don't
@@ -3952,18 +4099,6 @@ void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
}
EXPORT_SYMBOL_GPL(lock_unpin_lock);
-void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
-{
- current->lockdep_reclaim_gfp = current_gfp_context(gfp_mask);
-}
-EXPORT_SYMBOL_GPL(lockdep_set_current_reclaim_state);
-
-void lockdep_clear_current_reclaim_state(void)
-{
- current->lockdep_reclaim_gfp = 0;
-}
-EXPORT_SYMBOL_GPL(lockdep_clear_current_reclaim_state);
-
#ifdef CONFIG_LOCK_STAT
static int
print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
@@ -4484,6 +4619,12 @@ asmlinkage __visible void lockdep_sys_exit(void)
curr->comm, curr->pid);
lockdep_print_held_locks(curr);
}
+
+ /*
+ * The lock history for each syscall should be independent. So wipe the
+ * slate clean on return to userspace.
+ */
+ lockdep_invariant_state(false);
}
void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
@@ -4532,3 +4673,488 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
dump_stack();
}
EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
+
+#ifdef CONFIG_LOCKDEP_CROSSRELEASE
+
+/*
+ * Crossrelease works by recording a lock history for each thread and
+ * connecting those historic locks that were taken after the
+ * wait_for_completion() in the complete() context.
+ *
+ * Task-A Task-B
+ *
+ * mutex_lock(&A);
+ * mutex_unlock(&A);
+ *
+ * wait_for_completion(&C);
+ * lock_acquire_crosslock();
+ * atomic_inc_return(&cross_gen_id);
+ * |
+ * | mutex_lock(&B);
+ * | mutex_unlock(&B);
+ * |
+ * | complete(&C);
+ * `-- lock_commit_crosslock();
+ *
+ * Which will then add a dependency between B and C.
+ */
+
+#define xhlock(i) (current->xhlocks[(i) % MAX_XHLOCKS_NR])
+
+/*
+ * Whenever a crosslock is held, cross_gen_id will be increased.
+ */
+static atomic_t cross_gen_id; /* Can be wrapped */
+
+/*
+ * Make an entry of the ring buffer invalid.
+ */
+static inline void invalidate_xhlock(struct hist_lock *xhlock)
+{
+ /*
+ * Normally, xhlock->hlock.instance must be !NULL.
+ */
+ xhlock->hlock.instance = NULL;
+}
+
+/*
+ * Lock history stacks; we have 2 nested lock history stacks:
+ *
+ * HARD(IRQ)
+ * SOFT(IRQ)
+ *
+ * The thing is that once we complete a HARD/SOFT IRQ the future task locks
+ * should not depend on any of the locks observed while running the IRQ. So
+ * what we do is rewind the history buffer and erase all our knowledge of that
+ * temporal event.
+ */
+
+void crossrelease_hist_start(enum xhlock_context_t c)
+{
+ struct task_struct *cur = current;
+
+ if (!cur->xhlocks)
+ return;
+
+ cur->xhlock_idx_hist[c] = cur->xhlock_idx;
+ cur->hist_id_save[c] = cur->hist_id;
+}
+
+void crossrelease_hist_end(enum xhlock_context_t c)
+{
+ struct task_struct *cur = current;
+
+ if (cur->xhlocks) {
+ unsigned int idx = cur->xhlock_idx_hist[c];
+ struct hist_lock *h = &xhlock(idx);
+
+ cur->xhlock_idx = idx;
+
+ /* Check if the ring was overwritten. */
+ if (h->hist_id != cur->hist_id_save[c])
+ invalidate_xhlock(h);
+ }
+}
+
+/*
+ * lockdep_invariant_state() is used to annotate independence inside a task, to
+ * make one task look like multiple independent 'tasks'.
+ *
+ * Take for instance workqueues; each work is independent of the last. The
+ * completion of a future work does not depend on the completion of a past work
+ * (in general). Therefore we must not carry that (lock) dependency across
+ * works.
+ *
+ * This is true for many things; pretty much all kthreads fall into this
+ * pattern, where they have an invariant state and future completions do not
+ * depend on past completions. Its just that since they all have the 'same'
+ * form -- the kthread does the same over and over -- it doesn't typically
+ * matter.
+ *
+ * The same is true for system-calls, once a system call is completed (we've
+ * returned to userspace) the next system call does not depend on the lock
+ * history of the previous system call.
+ *
+ * They key property for independence, this invariant state, is that it must be
+ * a point where we hold no locks and have no history. Because if we were to
+ * hold locks, the restore at _end() would not necessarily recover it's history
+ * entry. Similarly, independence per-definition means it does not depend on
+ * prior state.
+ */
+void lockdep_invariant_state(bool force)
+{
+ /*
+ * We call this at an invariant point, no current state, no history.
+ * Verify the former, enforce the latter.
+ */
+ WARN_ON_ONCE(!force && current->lockdep_depth);
+ invalidate_xhlock(&xhlock(current->xhlock_idx));
+}
+
+static int cross_lock(struct lockdep_map *lock)
+{
+ return lock ? lock->cross : 0;
+}
+
+/*
+ * This is needed to decide the relationship between wrapable variables.
+ */
+static inline int before(unsigned int a, unsigned int b)
+{
+ return (int)(a - b) < 0;
+}
+
+static inline struct lock_class *xhlock_class(struct hist_lock *xhlock)
+{
+ return hlock_class(&xhlock->hlock);
+}
+
+static inline struct lock_class *xlock_class(struct cross_lock *xlock)
+{
+ return hlock_class(&xlock->hlock);
+}
+
+/*
+ * Should we check a dependency with previous one?
+ */
+static inline int depend_before(struct held_lock *hlock)
+{
+ return hlock->read != 2 && hlock->check && !hlock->trylock;
+}
+
+/*
+ * Should we check a dependency with next one?
+ */
+static inline int depend_after(struct held_lock *hlock)
+{
+ return hlock->read != 2 && hlock->check;
+}
+
+/*
+ * Check if the xhlock is valid, which would be false if,
+ *
+ * 1. Has not used after initializaion yet.
+ * 2. Got invalidated.
+ *
+ * Remind hist_lock is implemented as a ring buffer.
+ */
+static inline int xhlock_valid(struct hist_lock *xhlock)
+{
+ /*
+ * xhlock->hlock.instance must be !NULL.
+ */
+ return !!xhlock->hlock.instance;
+}
+
+/*
+ * Record a hist_lock entry.
+ *
+ * Irq disable is only required.
+ */
+static void add_xhlock(struct held_lock *hlock)
+{
+ unsigned int idx = ++current->xhlock_idx;
+ struct hist_lock *xhlock = &xhlock(idx);
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+ /*
+ * This can be done locklessly because they are all task-local
+ * state, we must however ensure IRQs are disabled.
+ */
+ WARN_ON_ONCE(!irqs_disabled());
+#endif
+
+ /* Initialize hist_lock's members */
+ xhlock->hlock = *hlock;
+ xhlock->hist_id = ++current->hist_id;
+
+ xhlock->trace.nr_entries = 0;
+ xhlock->trace.max_entries = MAX_XHLOCK_TRACE_ENTRIES;
+ xhlock->trace.entries = xhlock->trace_entries;
+ xhlock->trace.skip = 3;
+ save_stack_trace(&xhlock->trace);
+}
+
+static inline int same_context_xhlock(struct hist_lock *xhlock)
+{
+ return xhlock->hlock.irq_context == task_irq_context(current);
+}
+
+/*
+ * This should be lockless as far as possible because this would be
+ * called very frequently.
+ */
+static void check_add_xhlock(struct held_lock *hlock)
+{
+ /*
+ * Record a hist_lock, only in case that acquisitions ahead
+ * could depend on the held_lock. For example, if the held_lock
+ * is trylock then acquisitions ahead never depends on that.
+ * In that case, we don't need to record it. Just return.
+ */
+ if (!current->xhlocks || !depend_before(hlock))
+ return;
+
+ add_xhlock(hlock);
+}
+
+/*
+ * For crosslock.
+ */
+static int add_xlock(struct held_lock *hlock)
+{
+ struct cross_lock *xlock;
+ unsigned int gen_id;
+
+ if (!graph_lock())
+ return 0;
+
+ xlock = &((struct lockdep_map_cross *)hlock->instance)->xlock;
+
+ /*
+ * When acquisitions for a crosslock are overlapped, we use
+ * nr_acquire to perform commit for them, based on cross_gen_id
+ * of the first acquisition, which allows to add additional
+ * dependencies.
+ *
+ * Moreover, when no acquisition of a crosslock is in progress,
+ * we should not perform commit because the lock might not exist
+ * any more, which might cause incorrect memory access. So we
+ * have to track the number of acquisitions of a crosslock.
+ *
+ * depend_after() is necessary to initialize only the first
+ * valid xlock so that the xlock can be used on its commit.
+ */
+ if (xlock->nr_acquire++ && depend_after(&xlock->hlock))
+ goto unlock;
+
+ gen_id = (unsigned int)atomic_inc_return(&cross_gen_id);
+ xlock->hlock = *hlock;
+ xlock->hlock.gen_id = gen_id;
+unlock:
+ graph_unlock();
+ return 1;
+}
+
+/*
+ * Called for both normal and crosslock acquires. Normal locks will be
+ * pushed on the hist_lock queue. Cross locks will record state and
+ * stop regular lock_acquire() to avoid being placed on the held_lock
+ * stack.
+ *
+ * Return: 0 - failure;
+ * 1 - crosslock, done;
+ * 2 - normal lock, continue to held_lock[] ops.
+ */
+static int lock_acquire_crosslock(struct held_lock *hlock)
+{
+ /*
+ * CONTEXT 1 CONTEXT 2
+ * --------- ---------
+ * lock A (cross)
+ * X = atomic_inc_return(&cross_gen_id)
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * Y = atomic_read_acquire(&cross_gen_id)
+ * lock B
+ *
+ * atomic_read_acquire() is for ordering between A and B,
+ * IOW, A happens before B, when CONTEXT 2 see Y >= X.
+ *
+ * Pairs with atomic_inc_return() in add_xlock().
+ */
+ hlock->gen_id = (unsigned int)atomic_read_acquire(&cross_gen_id);
+
+ if (cross_lock(hlock->instance))
+ return add_xlock(hlock);
+
+ check_add_xhlock(hlock);
+ return 2;
+}
+
+static int copy_trace(struct stack_trace *trace)
+{
+ unsigned long *buf = stack_trace + nr_stack_trace_entries;
+ unsigned int max_nr = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries;
+ unsigned int nr = min(max_nr, trace->nr_entries);
+
+ trace->nr_entries = nr;
+ memcpy(buf, trace->entries, nr * sizeof(trace->entries[0]));
+ trace->entries = buf;
+ nr_stack_trace_entries += nr;
+
+ if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) {
+ if (!debug_locks_off_graph_unlock())
+ return 0;
+
+ print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!");
+ dump_stack();
+
+ return 0;
+ }
+
+ return 1;
+}
+
+static int commit_xhlock(struct cross_lock *xlock, struct hist_lock *xhlock)
+{
+ unsigned int xid, pid;
+ u64 chain_key;
+
+ xid = xlock_class(xlock) - lock_classes;
+ chain_key = iterate_chain_key((u64)0, xid);
+ pid = xhlock_class(xhlock) - lock_classes;
+ chain_key = iterate_chain_key(chain_key, pid);
+
+ if (lookup_chain_cache(chain_key))
+ return 1;
+
+ if (!add_chain_cache_classes(xid, pid, xhlock->hlock.irq_context,
+ chain_key))
+ return 0;
+
+ if (!check_prev_add(current, &xlock->hlock, &xhlock->hlock, 1,
+ &xhlock->trace, copy_trace))
+ return 0;
+
+ return 1;
+}
+
+static void commit_xhlocks(struct cross_lock *xlock)
+{
+ unsigned int cur = current->xhlock_idx;
+ unsigned int prev_hist_id = xhlock(cur).hist_id;
+ unsigned int i;
+
+ if (!graph_lock())
+ return;
+
+ if (xlock->nr_acquire) {
+ for (i = 0; i < MAX_XHLOCKS_NR; i++) {
+ struct hist_lock *xhlock = &xhlock(cur - i);
+
+ if (!xhlock_valid(xhlock))
+ break;
+
+ if (before(xhlock->hlock.gen_id, xlock->hlock.gen_id))
+ break;
+
+ if (!same_context_xhlock(xhlock))
+ break;
+
+ /*
+ * Filter out the cases where the ring buffer was
+ * overwritten and the current entry has a bigger
+ * hist_id than the previous one, which is impossible
+ * otherwise:
+ */
+ if (unlikely(before(prev_hist_id, xhlock->hist_id)))
+ break;
+
+ prev_hist_id = xhlock->hist_id;
+
+ /*
+ * commit_xhlock() returns 0 with graph_lock already
+ * released if fail.
+ */
+ if (!commit_xhlock(xlock, xhlock))
+ return;
+ }
+ }
+
+ graph_unlock();
+}
+
+void lock_commit_crosslock(struct lockdep_map *lock)
+{
+ struct cross_lock *xlock;
+ unsigned long flags;
+
+ if (unlikely(!debug_locks || current->lockdep_recursion))
+ return;
+
+ if (!current->xhlocks)
+ return;
+
+ /*
+ * Do commit hist_locks with the cross_lock, only in case that
+ * the cross_lock could depend on acquisitions after that.
+ *
+ * For example, if the cross_lock does not have the 'check' flag
+ * then we don't need to check dependencies and commit for that.
+ * Just skip it. In that case, of course, the cross_lock does
+ * not depend on acquisitions ahead, either.
+ *
+ * WARNING: Don't do that in add_xlock() in advance. When an
+ * acquisition context is different from the commit context,
+ * invalid(skipped) cross_lock might be accessed.
+ */
+ if (!depend_after(&((struct lockdep_map_cross *)lock)->xlock.hlock))
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+ current->lockdep_recursion = 1;
+ xlock = &((struct lockdep_map_cross *)lock)->xlock;
+ commit_xhlocks(xlock);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_commit_crosslock);
+
+/*
+ * Return: 0 - failure;
+ * 1 - crosslock, done;
+ * 2 - normal lock, continue to held_lock[] ops.
+ */
+static int lock_release_crosslock(struct lockdep_map *lock)
+{
+ if (cross_lock(lock)) {
+ if (!graph_lock())
+ return 0;
+ ((struct lockdep_map_cross *)lock)->xlock.nr_acquire--;
+ graph_unlock();
+ return 1;
+ }
+ return 2;
+}
+
+static void cross_init(struct lockdep_map *lock, int cross)
+{
+ if (cross)
+ ((struct lockdep_map_cross *)lock)->xlock.nr_acquire = 0;
+
+ lock->cross = cross;
+
+ /*
+ * Crossrelease assumes that the ring buffer size of xhlocks
+ * is aligned with power of 2. So force it on build.
+ */
+ BUILD_BUG_ON(MAX_XHLOCKS_NR & (MAX_XHLOCKS_NR - 1));
+}
+
+void lockdep_init_task(struct task_struct *task)
+{
+ int i;
+
+ task->xhlock_idx = UINT_MAX;
+ task->hist_id = 0;
+
+ for (i = 0; i < XHLOCK_CTX_NR; i++) {
+ task->xhlock_idx_hist[i] = UINT_MAX;
+ task->hist_id_save[i] = 0;
+ }
+
+ task->xhlocks = kzalloc(sizeof(struct hist_lock) * MAX_XHLOCKS_NR,
+ GFP_KERNEL);
+}
+
+void lockdep_free_task(struct task_struct *task)
+{
+ if (task->xhlocks) {
+ void *tmp = task->xhlocks;
+ /* Diable crossrelease for current */
+ task->xhlocks = NULL;
+ kfree(tmp);
+ }
+}
+#endif
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index c08fbd2f5ba9..1da4669d57a7 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -143,6 +143,8 @@ struct lockdep_stats {
int redundant_softirqs_on;
int redundant_softirqs_off;
int nr_unused_locks;
+ int nr_redundant_checks;
+ int nr_redundant;
int nr_cyclic_checks;
int nr_cyclic_check_recursions;
int nr_find_usage_forwards_checks;
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index 6d1fcc786081..68d9e267ccd4 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -201,6 +201,10 @@ static void lockdep_stats_debug_show(struct seq_file *m)
debug_atomic_read(chain_lookup_hits));
seq_printf(m, " cyclic checks: %11llu\n",
debug_atomic_read(nr_cyclic_checks));
+ seq_printf(m, " redundant checks: %11llu\n",
+ debug_atomic_read(nr_redundant_checks));
+ seq_printf(m, " redundant links: %11llu\n",
+ debug_atomic_read(nr_redundant));
seq_printf(m, " find-mask forwards checks: %11llu\n",
debug_atomic_read(nr_find_usage_forwards_checks));
seq_printf(m, " find-mask backwards checks: %11llu\n",
diff --git a/kernel/locking/lockdep_states.h b/kernel/locking/lockdep_states.h
index 995b0cc2b84c..35ca09f2ed0b 100644
--- a/kernel/locking/lockdep_states.h
+++ b/kernel/locking/lockdep_states.h
@@ -6,4 +6,3 @@
*/
LOCKDEP_STATE(HARDIRQ)
LOCKDEP_STATE(SOFTIRQ)
-LOCKDEP_STATE(RECLAIM_FS)
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index a3167941093b..a74ee6abd039 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -109,6 +109,19 @@ bool osq_lock(struct optimistic_spin_queue *lock)
prev = decode_cpu(old);
node->prev = prev;
+
+ /*
+ * osq_lock() unqueue
+ *
+ * node->prev = prev osq_wait_next()
+ * WMB MB
+ * prev->next = node next->prev = prev // unqueue-C
+ *
+ * Here 'node->prev' and 'next->prev' are the same variable and we need
+ * to ensure these stores happen in-order to avoid corrupting the list.
+ */
+ smp_wmb();
+
WRITE_ONCE(prev->next, node);
/*
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index cc3ed0ccdfa2..2655f26ec882 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -20,6 +20,7 @@
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
+#include <linux/spinlock.h>
#include <asm/qrwlock.h>
/*
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index b2caec7315af..294294c71ba4 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -28,6 +28,7 @@
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/mutex.h>
+#include <linux/prefetch.h>
#include <asm/byteorder.h>
#include <asm/qspinlock.h>
@@ -267,123 +268,6 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
#endif
-/*
- * Various notes on spin_is_locked() and spin_unlock_wait(), which are
- * 'interesting' functions:
- *
- * PROBLEM: some architectures have an interesting issue with atomic ACQUIRE
- * operations in that the ACQUIRE applies to the LOAD _not_ the STORE (ARM64,
- * PPC). Also qspinlock has a similar issue per construction, the setting of
- * the locked byte can be unordered acquiring the lock proper.
- *
- * This gets to be 'interesting' in the following cases, where the /should/s
- * end up false because of this issue.
- *
- *
- * CASE 1:
- *
- * So the spin_is_locked() correctness issue comes from something like:
- *
- * CPU0 CPU1
- *
- * global_lock(); local_lock(i)
- * spin_lock(&G) spin_lock(&L[i])
- * for (i) if (!spin_is_locked(&G)) {
- * spin_unlock_wait(&L[i]); smp_acquire__after_ctrl_dep();
- * return;
- * }
- * // deal with fail
- *
- * Where it is important CPU1 sees G locked or CPU0 sees L[i] locked such
- * that there is exclusion between the two critical sections.
- *
- * The load from spin_is_locked(&G) /should/ be constrained by the ACQUIRE from
- * spin_lock(&L[i]), and similarly the load(s) from spin_unlock_wait(&L[i])
- * /should/ be constrained by the ACQUIRE from spin_lock(&G).
- *
- * Similarly, later stuff is constrained by the ACQUIRE from CTRL+RMB.
- *
- *
- * CASE 2:
- *
- * For spin_unlock_wait() there is a second correctness issue, namely:
- *
- * CPU0 CPU1
- *
- * flag = set;
- * smp_mb(); spin_lock(&l)
- * spin_unlock_wait(&l); if (!flag)
- * // add to lockless list
- * spin_unlock(&l);
- * // iterate lockless list
- *
- * Which wants to ensure that CPU1 will stop adding bits to the list and CPU0
- * will observe the last entry on the list (if spin_unlock_wait() had ACQUIRE
- * semantics etc..)
- *
- * Where flag /should/ be ordered against the locked store of l.
- */
-
-/*
- * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before
- * issuing an _unordered_ store to set _Q_LOCKED_VAL.
- *
- * This means that the store can be delayed, but no later than the
- * store-release from the unlock. This means that simply observing
- * _Q_LOCKED_VAL is not sufficient to determine if the lock is acquired.
- *
- * There are two paths that can issue the unordered store:
- *
- * (1) clear_pending_set_locked(): *,1,0 -> *,0,1
- *
- * (2) set_locked(): t,0,0 -> t,0,1 ; t != 0
- * atomic_cmpxchg_relaxed(): t,0,0 -> 0,0,1
- *
- * However, in both cases we have other !0 state we've set before to queue
- * ourseves:
- *
- * For (1) we have the atomic_cmpxchg_acquire() that set _Q_PENDING_VAL, our
- * load is constrained by that ACQUIRE to not pass before that, and thus must
- * observe the store.
- *
- * For (2) we have a more intersting scenario. We enqueue ourselves using
- * xchg_tail(), which ends up being a RELEASE. This in itself is not
- * sufficient, however that is followed by an smp_cond_acquire() on the same
- * word, giving a RELEASE->ACQUIRE ordering. This again constrains our load and
- * guarantees we must observe that store.
- *
- * Therefore both cases have other !0 state that is observable before the
- * unordered locked byte store comes through. This means we can use that to
- * wait for the lock store, and then wait for an unlock.
- */
-#ifndef queued_spin_unlock_wait
-void queued_spin_unlock_wait(struct qspinlock *lock)
-{
- u32 val;
-
- for (;;) {
- val = atomic_read(&lock->val);
-
- if (!val) /* not locked, we're done */
- goto done;
-
- if (val & _Q_LOCKED_MASK) /* locked, go wait for unlock */
- break;
-
- /* not locked, but pending, wait until we observe the lock */
- cpu_relax();
- }
-
- /* any unlock is good */
- while (atomic_read(&lock->val) & _Q_LOCKED_MASK)
- cpu_relax();
-
-done:
- smp_acquire__after_ctrl_dep();
-}
-EXPORT_SYMBOL(queued_spin_unlock_wait);
-#endif
-
#endif /* _GEN_PV_LOCK_SLOWPATH */
/**
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index e6b2f7ad3e51..43555681c40b 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -72,7 +72,7 @@ static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
struct __qspinlock *l = (void *)lock;
if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
- (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
+ (cmpxchg_acquire(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
qstat_inc(qstat_pv_lock_stealing, true);
return true;
}
@@ -101,16 +101,16 @@ static __always_inline void clear_pending(struct qspinlock *lock)
/*
* The pending bit check in pv_queued_spin_steal_lock() isn't a memory
- * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock
- * just to be sure that it will get it.
+ * barrier. Therefore, an atomic cmpxchg_acquire() is used to acquire the
+ * lock just to be sure that it will get it.
*/
static __always_inline int trylock_clear_pending(struct qspinlock *lock)
{
struct __qspinlock *l = (void *)lock;
return !READ_ONCE(l->locked) &&
- (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL)
- == _Q_PENDING_VAL);
+ (cmpxchg_acquire(&l->locked_pending, _Q_PENDING_VAL,
+ _Q_LOCKED_VAL) == _Q_PENDING_VAL);
}
#else /* _Q_PENDING_BITS == 8 */
static __always_inline void set_pending(struct qspinlock *lock)
@@ -138,7 +138,7 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock)
*/
old = val;
new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
- val = atomic_cmpxchg(&lock->val, old, new);
+ val = atomic_cmpxchg_acquire(&lock->val, old, new);
if (val == old)
return 1;
@@ -193,7 +193,8 @@ void __init __pv_init_lock_hash(void)
*/
pv_lock_hash = alloc_large_system_hash("PV qspinlock",
sizeof(struct pv_hash_entry),
- pv_hash_size, 0, HASH_EARLY,
+ pv_hash_size, 0,
+ HASH_EARLY | HASH_ZERO,
&pv_lock_hash_bits, NULL,
pv_hash_size, pv_hash_size);
}
@@ -361,8 +362,18 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
* observe its next->locked value and advance itself.
*
* Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
+ *
+ * The write to next->locked in arch_mcs_spin_unlock_contended()
+ * must be ordered before the read of pn->state in the cmpxchg()
+ * below for the code to work correctly. To guarantee full ordering
+ * irrespective of the success or failure of the cmpxchg(),
+ * a relaxed version with explicit barrier is used. The control
+ * dependency will order the reading of pn->state before any
+ * subsequent writes.
*/
- if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
+ smp_mb__before_atomic();
+ if (cmpxchg_relaxed(&pn->state, vcpu_halted, vcpu_hashed)
+ != vcpu_halted)
return;
/*
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 78069895032a..649dc9d3951a 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -963,7 +963,6 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
return -EDEADLK;
raw_spin_lock(&task->pi_lock);
- rt_mutex_adjust_prio(task);
waiter->task = task;
waiter->lock = lock;
waiter->prio = task->prio;
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 72ad45a9a794..8d039b928d61 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -40,6 +40,9 @@ struct rt_mutex_waiter {
/*
* Various helpers to access the waiters-tree:
*/
+
+#ifdef CONFIG_RT_MUTEXES
+
static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
{
return !RB_EMPTY_ROOT(&lock->waiters);
@@ -69,6 +72,32 @@ task_top_pi_waiter(struct task_struct *p)
pi_tree_entry);
}
+#else
+
+static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
+{
+ return false;
+}
+
+static inline struct rt_mutex_waiter *
+rt_mutex_top_waiter(struct rt_mutex *lock)
+{
+ return NULL;
+}
+
+static inline int task_has_pi_waiters(struct task_struct *p)
+{
+ return false;
+}
+
+static inline struct rt_mutex_waiter *
+task_top_pi_waiter(struct task_struct *p)
+{
+ return NULL;
+}
+
+#endif
+
/*
* lock->owner state tracking:
*/
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index c65f7989f850..0848634c5512 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -126,7 +126,7 @@ __rwsem_wake_one_writer(struct rw_semaphore *sem)
/*
* get a read lock on the semaphore
*/
-void __sched __down_read(struct rw_semaphore *sem)
+int __sched __down_read_common(struct rw_semaphore *sem, int state)
{
struct rwsem_waiter waiter;
unsigned long flags;
@@ -140,8 +140,6 @@ void __sched __down_read(struct rw_semaphore *sem)
goto out;
}
- set_current_state(TASK_UNINTERRUPTIBLE);
-
/* set up my own style of waitqueue */
waiter.task = current;
waiter.type = RWSEM_WAITING_FOR_READ;
@@ -149,20 +147,41 @@ void __sched __down_read(struct rw_semaphore *sem)
list_add_tail(&waiter.list, &sem->wait_list);
- /* we don't need to touch the semaphore struct anymore */
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-
/* wait to be given the lock */
for (;;) {
if (!waiter.task)
break;
+ if (signal_pending_state(state, current))
+ goto out_nolock;
+ set_current_state(state);
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
schedule();
- set_current_state(TASK_UNINTERRUPTIBLE);
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
}
- __set_current_state(TASK_RUNNING);
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
out:
- ;
+ return 0;
+
+out_nolock:
+ /*
+ * We didn't take the lock, so that there is a writer, which
+ * is owner or the first waiter of the sem. If it's a waiter,
+ * it will be woken by current owner. Not need to wake anybody.
+ */
+ list_del(&waiter.list);
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+ return -EINTR;
+}
+
+void __sched __down_read(struct rw_semaphore *sem)
+{
+ __down_read_common(sem, TASK_UNINTERRUPTIBLE);
+}
+
+int __sched __down_read_killable(struct rw_semaphore *sem)
+{
+ return __down_read_common(sem, TASK_KILLABLE);
}
/*
@@ -231,8 +250,8 @@ int __sched __down_write_common(struct rw_semaphore *sem, int state)
out_nolock:
list_del(&waiter.list);
- if (!list_empty(&sem->wait_list))
- __rwsem_do_wake(sem, 1);
+ if (!list_empty(&sem->wait_list) && sem->count >= 0)
+ __rwsem_do_wake(sem, 0);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
return -EINTR;
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 34e727f18e49..02f660666ab8 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -221,8 +221,8 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
/*
* Wait for the read lock to be granted
*/
-__visible
-struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
+static inline struct rw_semaphore __sched *
+__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state)
{
long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
struct rwsem_waiter waiter;
@@ -255,17 +255,44 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
/* wait to be given the lock */
while (true) {
- set_current_state(TASK_UNINTERRUPTIBLE);
+ set_current_state(state);
if (!waiter.task)
break;
+ if (signal_pending_state(state, current)) {
+ raw_spin_lock_irq(&sem->wait_lock);
+ if (waiter.task)
+ goto out_nolock;
+ raw_spin_unlock_irq(&sem->wait_lock);
+ break;
+ }
schedule();
}
__set_current_state(TASK_RUNNING);
return sem;
+out_nolock:
+ list_del(&waiter.list);
+ if (list_empty(&sem->wait_list))
+ atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count);
+ raw_spin_unlock_irq(&sem->wait_lock);
+ __set_current_state(TASK_RUNNING);
+ return ERR_PTR(-EINTR);
+}
+
+__visible struct rw_semaphore * __sched
+rwsem_down_read_failed(struct rw_semaphore *sem)
+{
+ return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(rwsem_down_read_failed);
+__visible struct rw_semaphore * __sched
+rwsem_down_read_failed_killable(struct rw_semaphore *sem)
+{
+ return __rwsem_down_read_failed_common(sem, TASK_KILLABLE);
+}
+EXPORT_SYMBOL(rwsem_down_read_failed_killable);
+
/*
* This function must be called with the sem->wait_lock held to prevent
* race conditions between checking the rwsem wait list and setting the
diff --git a/kernel/membarrier.c b/kernel/membarrier.c
deleted file mode 100644
index 9f9284f37f8d..000000000000
--- a/kernel/membarrier.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (C) 2010, 2015 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
- *
- * membarrier system call
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- */
-
-#include <linux/syscalls.h>
-#include <linux/membarrier.h>
-#include <linux/tick.h>
-
-/*
- * Bitmask made from a "or" of all commands within enum membarrier_cmd,
- * except MEMBARRIER_CMD_QUERY.
- */
-#define MEMBARRIER_CMD_BITMASK (MEMBARRIER_CMD_SHARED)
-
-/**
- * sys_membarrier - issue memory barriers on a set of threads
- * @cmd: Takes command values defined in enum membarrier_cmd.
- * @flags: Currently needs to be 0. For future extensions.
- *
- * If this system call is not implemented, -ENOSYS is returned. If the
- * command specified does not exist, or if the command argument is invalid,
- * this system call returns -EINVAL. For a given command, with flags argument
- * set to 0, this system call is guaranteed to always return the same value
- * until reboot.
- *
- * All memory accesses performed in program order from each targeted thread
- * is guaranteed to be ordered with respect to sys_membarrier(). If we use
- * the semantic "barrier()" to represent a compiler barrier forcing memory
- * accesses to be performed in program order across the barrier, and
- * smp_mb() to represent explicit memory barriers forcing full memory
- * ordering across the barrier, we have the following ordering table for
- * each pair of barrier(), sys_membarrier() and smp_mb():
- *
- * The pair ordering is detailed as (O: ordered, X: not ordered):
- *
- * barrier() smp_mb() sys_membarrier()
- * barrier() X X O
- * smp_mb() X O O
- * sys_membarrier() O O O
- */
-SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
-{
- /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */
- if (tick_nohz_full_enabled())
- return -ENOSYS;
- if (unlikely(flags))
- return -EINVAL;
- switch (cmd) {
- case MEMBARRIER_CMD_QUERY:
- return MEMBARRIER_CMD_BITMASK;
- case MEMBARRIER_CMD_SHARED:
- if (num_online_cpus() > 1)
- synchronize_sched();
- return 0;
- default:
- return -EINVAL;
- }
-}
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 23a6483c3666..066e73c2fcc9 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -34,13 +34,24 @@ static void *arch_memremap_wb(resource_size_t offset, unsigned long size)
}
#endif
-static void *try_ram_remap(resource_size_t offset, size_t size)
+#ifndef arch_memremap_can_ram_remap
+static bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size,
+ unsigned long flags)
+{
+ return true;
+}
+#endif
+
+static void *try_ram_remap(resource_size_t offset, size_t size,
+ unsigned long flags)
{
unsigned long pfn = PHYS_PFN(offset);
/* In the simple case just return the existing linear address */
- if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)))
+ if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)) &&
+ arch_memremap_can_ram_remap(offset, size, flags))
return __va(offset);
+
return NULL; /* fallback to arch_memremap_wb */
}
@@ -48,7 +59,8 @@ static void *try_ram_remap(resource_size_t offset, size_t size)
* memremap() - remap an iomem_resource as cacheable memory
* @offset: iomem resource start address
* @size: size of remap
- * @flags: any of MEMREMAP_WB, MEMREMAP_WT and MEMREMAP_WC
+ * @flags: any of MEMREMAP_WB, MEMREMAP_WT, MEMREMAP_WC,
+ * MEMREMAP_ENC, MEMREMAP_DEC
*
* memremap() is "ioremap" for cases where it is known that the resource
* being mapped does not have i/o side effects and the __iomem
@@ -95,7 +107,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
* the requested range is potentially in System RAM.
*/
if (is_ram == REGION_INTERSECTS)
- addr = try_ram_remap(offset, size);
+ addr = try_ram_remap(offset, size, flags);
if (!addr)
addr = arch_memremap_wb(offset, size);
}
@@ -182,18 +194,41 @@ struct page_map {
struct vmem_altmap altmap;
};
-static void pgmap_radix_release(struct resource *res)
+static unsigned long order_at(struct resource *res, unsigned long pgoff)
{
- resource_size_t key, align_start, align_size, align_end;
+ unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff;
+ unsigned long nr_pages, mask;
- align_start = res->start & ~(SECTION_SIZE - 1);
- align_size = ALIGN(resource_size(res), SECTION_SIZE);
- align_end = align_start + align_size - 1;
+ nr_pages = PHYS_PFN(resource_size(res));
+ if (nr_pages == pgoff)
+ return ULONG_MAX;
+
+ /*
+ * What is the largest aligned power-of-2 range available from
+ * this resource pgoff to the end of the resource range,
+ * considering the alignment of the current pgoff?
+ */
+ mask = phys_pgoff | rounddown_pow_of_two(nr_pages - pgoff);
+ if (!mask)
+ return ULONG_MAX;
+
+ return find_first_bit(&mask, BITS_PER_LONG);
+}
+
+#define foreach_order_pgoff(res, order, pgoff) \
+ for (pgoff = 0, order = order_at((res), pgoff); order < ULONG_MAX; \
+ pgoff += 1UL << order, order = order_at((res), pgoff))
+
+static void pgmap_radix_release(struct resource *res)
+{
+ unsigned long pgoff, order;
mutex_lock(&pgmap_lock);
- for (key = res->start; key <= res->end; key += SECTION_SIZE)
- radix_tree_delete(&pgmap_radix, key >> PA_SECTION_SHIFT);
+ foreach_order_pgoff(res, order, pgoff)
+ radix_tree_delete(&pgmap_radix, PHYS_PFN(res->start) + pgoff);
mutex_unlock(&pgmap_lock);
+
+ synchronize_rcu();
}
static unsigned long pfn_first(struct page_map *page_map)
@@ -256,7 +291,7 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
WARN_ON_ONCE(!rcu_read_lock_held());
- page_map = radix_tree_lookup(&pgmap_radix, phys >> PA_SECTION_SHIFT);
+ page_map = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys));
return page_map ? &page_map->pgmap : NULL;
}
@@ -281,12 +316,12 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
void *devm_memremap_pages(struct device *dev, struct resource *res,
struct percpu_ref *ref, struct vmem_altmap *altmap)
{
- resource_size_t key, align_start, align_size, align_end;
+ resource_size_t align_start, align_size, align_end;
+ unsigned long pfn, pgoff, order;
pgprot_t pgprot = PAGE_KERNEL;
struct dev_pagemap *pgmap;
struct page_map *page_map;
int error, nid, is_ram;
- unsigned long pfn;
align_start = res->start & ~(SECTION_SIZE - 1);
align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
@@ -325,11 +360,12 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
mutex_lock(&pgmap_lock);
error = 0;
align_end = align_start + align_size - 1;
- for (key = align_start; key <= align_end; key += SECTION_SIZE) {
+
+ foreach_order_pgoff(res, order, pgoff) {
struct dev_pagemap *dup;
rcu_read_lock();
- dup = find_dev_pagemap(key);
+ dup = find_dev_pagemap(res->start + PFN_PHYS(pgoff));
rcu_read_unlock();
if (dup) {
dev_err(dev, "%s: %pr collides with mapping for %s\n",
@@ -337,8 +373,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
error = -EBUSY;
break;
}
- error = radix_tree_insert(&pgmap_radix, key >> PA_SECTION_SHIFT,
- page_map);
+ error = __radix_tree_insert(&pgmap_radix,
+ PHYS_PFN(res->start) + pgoff, order, page_map);
if (error) {
dev_err(dev, "%s: failed: %d\n", __func__, error);
break;
@@ -358,7 +394,11 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
goto err_pfn_remap;
mem_hotplug_begin();
- error = arch_add_memory(nid, align_start, align_size, true);
+ error = arch_add_memory(nid, align_start, align_size, false);
+ if (!error)
+ move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
+ align_start >> PAGE_SHIFT,
+ align_size >> PAGE_SHIFT);
mem_hotplug_done();
if (error)
goto err_add_memory;
diff --git a/kernel/module.c b/kernel/module.c
index d7eb41d772c4..40f983cbea81 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -49,9 +49,7 @@
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
-#ifdef CONFIG_STRICT_MODULE_RWX
-#include <asm/set_memory.h>
-#endif
+#include <linux/set_memory.h>
#include <asm/mmu_context.h>
#include <linux/license.h>
#include <asm/sections.h>
@@ -302,6 +300,7 @@ int unregister_module_notifier(struct notifier_block *nb)
EXPORT_SYMBOL(unregister_module_notifier);
struct load_info {
+ const char *name;
Elf_Ehdr *hdr;
unsigned long len;
Elf_Shdr *sechdrs;
@@ -602,7 +601,7 @@ static struct module *find_module_all(const char *name, size_t len,
module_assert_mutex_or_preempt();
- list_for_each_entry(mod, &modules, list) {
+ list_for_each_entry_rcu(mod, &modules, list) {
if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
continue;
if (strlen(mod->name) == len && !memcmp(mod->name, name, len))
@@ -1275,12 +1274,13 @@ static u32 resolve_rel_crc(const s32 *crc)
return *(u32 *)((void *)crc + *crc);
}
-static int check_version(Elf_Shdr *sechdrs,
- unsigned int versindex,
+static int check_version(const struct load_info *info,
const char *symname,
struct module *mod,
const s32 *crc)
{
+ Elf_Shdr *sechdrs = info->sechdrs;
+ unsigned int versindex = info->index.vers;
unsigned int i, num_versions;
struct modversion_info *versions;
@@ -1314,17 +1314,16 @@ static int check_version(Elf_Shdr *sechdrs,
}
/* Broken toolchain. Warn once, then let it go.. */
- pr_warn_once("%s: no symbol version for %s\n", mod->name, symname);
+ pr_warn_once("%s: no symbol version for %s\n", info->name, symname);
return 1;
bad_version:
pr_warn("%s: disagrees about version of symbol %s\n",
- mod->name, symname);
+ info->name, symname);
return 0;
}
-static inline int check_modstruct_version(Elf_Shdr *sechdrs,
- unsigned int versindex,
+static inline int check_modstruct_version(const struct load_info *info,
struct module *mod)
{
const s32 *crc;
@@ -1340,8 +1339,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
BUG();
}
preempt_enable();
- return check_version(sechdrs, versindex,
- VMLINUX_SYMBOL_STR(module_layout), mod, crc);
+ return check_version(info, VMLINUX_SYMBOL_STR(module_layout),
+ mod, crc);
}
/* First part is kernel version, which we ignore if module has crcs. */
@@ -1355,8 +1354,7 @@ static inline int same_magic(const char *amagic, const char *bmagic,
return strcmp(amagic, bmagic) == 0;
}
#else
-static inline int check_version(Elf_Shdr *sechdrs,
- unsigned int versindex,
+static inline int check_version(const struct load_info *info,
const char *symname,
struct module *mod,
const s32 *crc)
@@ -1364,8 +1362,7 @@ static inline int check_version(Elf_Shdr *sechdrs,
return 1;
}
-static inline int check_modstruct_version(Elf_Shdr *sechdrs,
- unsigned int versindex,
+static inline int check_modstruct_version(const struct load_info *info,
struct module *mod)
{
return 1;
@@ -1401,7 +1398,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,
if (!sym)
goto unlock;
- if (!check_version(info->sechdrs, info->index.vers, name, mod, crc)) {
+ if (!check_version(info, name, mod, crc)) {
sym = ERR_PTR(-EINVAL);
goto getname;
}
@@ -1664,31 +1661,36 @@ static inline void remove_notes_attrs(struct module *mod)
}
#endif /* CONFIG_KALLSYMS */
-static void add_usage_links(struct module *mod)
+static void del_usage_links(struct module *mod)
{
#ifdef CONFIG_MODULE_UNLOAD
struct module_use *use;
- int nowarn;
mutex_lock(&module_mutex);
- list_for_each_entry(use, &mod->target_list, target_list) {
- nowarn = sysfs_create_link(use->target->holders_dir,
- &mod->mkobj.kobj, mod->name);
- }
+ list_for_each_entry(use, &mod->target_list, target_list)
+ sysfs_remove_link(use->target->holders_dir, mod->name);
mutex_unlock(&module_mutex);
#endif
}
-static void del_usage_links(struct module *mod)
+static int add_usage_links(struct module *mod)
{
+ int ret = 0;
#ifdef CONFIG_MODULE_UNLOAD
struct module_use *use;
mutex_lock(&module_mutex);
- list_for_each_entry(use, &mod->target_list, target_list)
- sysfs_remove_link(use->target->holders_dir, mod->name);
+ list_for_each_entry(use, &mod->target_list, target_list) {
+ ret = sysfs_create_link(use->target->holders_dir,
+ &mod->mkobj.kobj, mod->name);
+ if (ret)
+ break;
+ }
mutex_unlock(&module_mutex);
+ if (ret)
+ del_usage_links(mod);
#endif
+ return ret;
}
static int module_add_modinfo_attrs(struct module *mod)
@@ -1799,13 +1801,18 @@ static int mod_sysfs_setup(struct module *mod,
if (err)
goto out_unreg_param;
- add_usage_links(mod);
+ err = add_usage_links(mod);
+ if (err)
+ goto out_unreg_modinfo_attrs;
+
add_sect_attrs(mod, info);
add_notes_attrs(mod, info);
kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
return 0;
+out_unreg_modinfo_attrs:
+ module_remove_modinfo_attrs(mod);
out_unreg_param:
module_param_sysfs_remove(mod);
out_unreg_holders:
@@ -2912,9 +2919,15 @@ static int rewrite_section_headers(struct load_info *info, int flags)
info->index.vers = 0; /* Pretend no __versions section! */
else
info->index.vers = find_sec(info, "__versions");
+ info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
+
info->index.info = find_sec(info, ".modinfo");
+ if (!info->index.info)
+ info->name = "(missing .modinfo section)";
+ else
+ info->name = get_modinfo(info, "name");
info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC;
- info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
+
return 0;
}
@@ -2954,21 +2967,29 @@ static struct module *setup_load_info(struct load_info *info, int flags)
info->index.mod = find_sec(info, ".gnu.linkonce.this_module");
if (!info->index.mod) {
- pr_warn("No module found in object\n");
+ pr_warn("%s: No module found in object\n",
+ info->name ?: "(missing .modinfo name field)");
return ERR_PTR(-ENOEXEC);
}
/* This is temporary: point mod into copy of data. */
mod = (void *)info->sechdrs[info->index.mod].sh_addr;
+ /*
+ * If we didn't load the .modinfo 'name' field, fall back to
+ * on-disk struct mod 'name' field.
+ */
+ if (!info->name)
+ info->name = mod->name;
+
if (info->index.sym == 0) {
- pr_warn("%s: module has no symbols (stripped?)\n", mod->name);
+ pr_warn("%s: module has no symbols (stripped?)\n", info->name);
return ERR_PTR(-ENOEXEC);
}
info->index.pcpu = find_pcpusec(info);
/* Check module struct version now, before we try to use module. */
- if (!check_modstruct_version(info->sechdrs, info->index.vers, mod))
+ if (!check_modstruct_version(info, mod))
return ERR_PTR(-ENOEXEC);
return mod;
@@ -2989,7 +3010,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
return err;
} else if (!same_magic(modmagic, vermagic, info->index.vers)) {
pr_err("%s: version magic '%s' should be '%s'\n",
- mod->name, modmagic, vermagic);
+ info->name, modmagic, vermagic);
return -ENOEXEC;
}
@@ -3074,9 +3095,9 @@ static int find_module_sections(struct module *mod, struct load_info *info)
mod->trace_events = section_objs(info, "_ftrace_events",
sizeof(*mod->trace_events),
&mod->num_trace_events);
- mod->trace_enums = section_objs(info, "_ftrace_enum_map",
- sizeof(*mod->trace_enums),
- &mod->num_trace_enums);
+ mod->trace_evals = section_objs(info, "_ftrace_eval_map",
+ sizeof(*mod->trace_evals),
+ &mod->num_trace_evals);
#endif
#ifdef CONFIG_TRACING
mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt",
@@ -3239,7 +3260,7 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
/* module_blacklist is a comma-separated list of module names */
static char *module_blacklist;
-static bool blacklisted(char *module_name)
+static bool blacklisted(const char *module_name)
{
const char *p;
size_t len;
@@ -3269,7 +3290,7 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
if (IS_ERR(mod))
return mod;
- if (blacklisted(mod->name))
+ if (blacklisted(info->name))
return ERR_PTR(-EPERM);
err = check_modinfo(mod, info, flags);
@@ -4198,7 +4219,7 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
goto out;
e = search_extable(mod->extable,
- mod->extable + mod->num_exentries - 1,
+ mod->num_exentries,
addr);
out:
preempt_enable();
diff --git a/kernel/panic.c b/kernel/panic.c
index a58932b41700..bdd18afa19a4 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -26,6 +26,7 @@
#include <linux/nmi.h>
#include <linux/console.h>
#include <linux/bug.h>
+#include <linux/ratelimit.h>
#define PANIC_TIMER_STEP 100
#define PANIC_BLINK_SPD 18
@@ -601,6 +602,17 @@ EXPORT_SYMBOL(__stack_chk_fail);
#endif
+#ifdef CONFIG_ARCH_HAS_REFCOUNT
+void refcount_error_report(struct pt_regs *regs, const char *err)
+{
+ WARN_RATELIMIT(1, "refcount_t %s at %pB in %s[%d], uid/euid: %u/%u\n",
+ err, (void *)instruction_pointer(regs),
+ current->comm, task_pid_nr(current),
+ from_kuid_munged(&init_user_ns, current_uid()),
+ from_kuid_munged(&init_user_ns, current_euid()));
+}
+#endif
+
core_param(panic, panic_timeout, int, 0644);
core_param(pause_on_oops, pause_on_oops, int, 0644);
core_param(panic_on_warn, panic_on_warn, int, 0644);
diff --git a/kernel/pid.c b/kernel/pid.c
index fd1cde1e4576..020dedbdf066 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -527,8 +527,11 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
if (!ns)
ns = task_active_pid_ns(current);
if (likely(pid_alive(task))) {
- if (type != PIDTYPE_PID)
+ if (type != PIDTYPE_PID) {
+ if (type == __PIDTYPE_TGID)
+ type = PIDTYPE_PID;
task = task->group_leader;
+ }
nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns);
}
rcu_read_unlock();
@@ -537,12 +540,6 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
}
EXPORT_SYMBOL(__task_pid_nr_ns);
-pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
-{
- return pid_nr_ns(task_tgid(tsk), ns);
-}
-EXPORT_SYMBOL(task_tgid_nr_ns);
-
struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
{
return ns_of_pid(task_pid(tsk));
@@ -575,16 +572,10 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
*/
void __init pidhash_init(void)
{
- unsigned int i, pidhash_size;
-
pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
- HASH_EARLY | HASH_SMALL,
+ HASH_EARLY | HASH_SMALL | HASH_ZERO,
&pidhash_shift, NULL,
0, 4096);
- pidhash_size = 1U << pidhash_shift;
-
- for (i = 0; i < pidhash_size; i++)
- INIT_HLIST_HEAD(&pid_hash[i]);
}
void __init pidmap_init(void)
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index e1914c7b85b1..a5c36e9c56a6 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -651,7 +651,7 @@ static int load_image_and_restore(void)
int error;
unsigned int flags;
- pr_debug("Loading hibernation image.\n");
+ pm_pr_dbg("Loading hibernation image.\n");
lock_device_hotplug();
error = create_basic_memory_bitmaps();
@@ -681,7 +681,7 @@ int hibernate(void)
bool snapshot_test = false;
if (!hibernation_available()) {
- pr_debug("Hibernation not available.\n");
+ pm_pr_dbg("Hibernation not available.\n");
return -EPERM;
}
@@ -692,6 +692,7 @@ int hibernate(void)
goto Unlock;
}
+ pr_info("hibernation entry\n");
pm_prepare_console();
error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
if (error) {
@@ -727,7 +728,7 @@ int hibernate(void)
else
flags |= SF_CRC32_MODE;
- pr_debug("Writing image.\n");
+ pm_pr_dbg("Writing image.\n");
error = swsusp_write(flags);
swsusp_free();
if (!error) {
@@ -739,7 +740,7 @@ int hibernate(void)
in_suspend = 0;
pm_restore_gfp_mask();
} else {
- pr_debug("Image restored successfully.\n");
+ pm_pr_dbg("Image restored successfully.\n");
}
Free_bitmaps:
@@ -747,7 +748,7 @@ int hibernate(void)
Thaw:
unlock_device_hotplug();
if (snapshot_test) {
- pr_debug("Checking hibernation image\n");
+ pm_pr_dbg("Checking hibernation image\n");
error = swsusp_check();
if (!error)
error = load_image_and_restore();
@@ -762,6 +763,8 @@ int hibernate(void)
atomic_inc(&snapshot_device_available);
Unlock:
unlock_system_sleep();
+ pr_info("hibernation exit\n");
+
return error;
}
@@ -811,7 +814,7 @@ static int software_resume(void)
goto Unlock;
}
- pr_debug("Checking hibernation image partition %s\n", resume_file);
+ pm_pr_dbg("Checking hibernation image partition %s\n", resume_file);
if (resume_delay) {
pr_info("Waiting %dsec before reading resume device ...\n",
@@ -853,10 +856,10 @@ static int software_resume(void)
}
Check_image:
- pr_debug("Hibernation image partition %d:%d present\n",
+ pm_pr_dbg("Hibernation image partition %d:%d present\n",
MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device));
- pr_debug("Looking for hibernation image.\n");
+ pm_pr_dbg("Looking for hibernation image.\n");
error = swsusp_check();
if (error)
goto Unlock;
@@ -868,6 +871,7 @@ static int software_resume(void)
goto Unlock;
}
+ pr_info("resume from hibernation\n");
pm_prepare_console();
error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls);
if (error) {
@@ -875,7 +879,7 @@ static int software_resume(void)
goto Close_Finish;
}
- pr_debug("Preparing processes for restore.\n");
+ pm_pr_dbg("Preparing processes for restore.\n");
error = freeze_processes();
if (error)
goto Close_Finish;
@@ -884,11 +888,12 @@ static int software_resume(void)
Finish:
__pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);
pm_restore_console();
+ pr_info("resume from hibernation failed (%d)\n", error);
atomic_inc(&snapshot_device_available);
/* For success case, the suspend path will release the lock */
Unlock:
mutex_unlock(&pm_mutex);
- pr_debug("Hibernation image not present or could not be loaded.\n");
+ pm_pr_dbg("Hibernation image not present or could not be loaded.\n");
return error;
Close_Finish:
swsusp_close(FMODE_READ);
@@ -1012,8 +1017,8 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
error = -EINVAL;
if (!error)
- pr_debug("Hibernation mode set to '%s'\n",
- hibernation_modes[mode]);
+ pm_pr_dbg("Hibernation mode set to '%s'\n",
+ hibernation_modes[mode]);
unlock_system_sleep();
return error ? error : n;
}
diff --git a/kernel/power/main.c b/kernel/power/main.c
index d401c21136d1..3a2ca9066583 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -150,7 +150,7 @@ static ssize_t mem_sleep_store(struct kobject *kobj, struct kobj_attribute *attr
power_attr(mem_sleep);
#endif /* CONFIG_SUSPEND */
-#ifdef CONFIG_PM_DEBUG
+#ifdef CONFIG_PM_SLEEP_DEBUG
int pm_test_level = TEST_NONE;
static const char * const pm_tests[__TEST_AFTER_LAST] = {
@@ -211,7 +211,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
}
power_attr(pm_test);
-#endif /* CONFIG_PM_DEBUG */
+#endif /* CONFIG_PM_SLEEP_DEBUG */
#ifdef CONFIG_DEBUG_FS
static char *suspend_step_name(enum suspend_stat_step step)
@@ -361,6 +361,61 @@ static ssize_t pm_wakeup_irq_show(struct kobject *kobj,
power_attr_ro(pm_wakeup_irq);
+bool pm_debug_messages_on __read_mostly;
+
+static ssize_t pm_debug_messages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", pm_debug_messages_on);
+}
+
+static ssize_t pm_debug_messages_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned long val;
+
+ if (kstrtoul(buf, 10, &val))
+ return -EINVAL;
+
+ if (val > 1)
+ return -EINVAL;
+
+ pm_debug_messages_on = !!val;
+ return n;
+}
+
+power_attr(pm_debug_messages);
+
+/**
+ * __pm_pr_dbg - Print a suspend debug message to the kernel log.
+ * @defer: Whether or not to use printk_deferred() to print the message.
+ * @fmt: Message format.
+ *
+ * The message will be emitted if enabled through the pm_debug_messages
+ * sysfs attribute.
+ */
+void __pm_pr_dbg(bool defer, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ if (!pm_debug_messages_on)
+ return;
+
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ if (defer)
+ printk_deferred(KERN_DEBUG "PM: %pV", &vaf);
+ else
+ printk(KERN_DEBUG "PM: %pV", &vaf);
+
+ va_end(args);
+}
+
#else /* !CONFIG_PM_SLEEP_DEBUG */
static inline void pm_print_times_init(void) {}
#endif /* CONFIG_PM_SLEEP_DEBUG */
@@ -691,12 +746,11 @@ static struct attribute * g[] = {
&wake_lock_attr.attr,
&wake_unlock_attr.attr,
#endif
-#ifdef CONFIG_PM_DEBUG
- &pm_test_attr.attr,
-#endif
#ifdef CONFIG_PM_SLEEP_DEBUG
+ &pm_test_attr.attr,
&pm_print_times_attr.attr,
&pm_wakeup_irq_attr.attr,
+ &pm_debug_messages_attr.attr,
#endif
#endif
#ifdef CONFIG_FREEZER
@@ -705,7 +759,7 @@ static struct attribute * g[] = {
NULL,
};
-static struct attribute_group attr_group = {
+static const struct attribute_group attr_group = {
.attrs = g,
};
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 7fdc40d31b7d..1d2d761e3c25 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -192,7 +192,6 @@ extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *);
extern const char * const pm_labels[];
extern const char *pm_states[];
extern const char *mem_sleep_states[];
-extern suspend_state_t mem_sleep_current;
extern int suspend_devices_and_enter(suspend_state_t state);
#else /* !CONFIG_SUSPEND */
@@ -245,7 +244,11 @@ enum {
#define TEST_FIRST TEST_NONE
#define TEST_MAX (__TEST_AFTER_LAST - 1)
+#ifdef CONFIG_PM_SLEEP_DEBUG
extern int pm_test_level;
+#else
+#define pm_test_level (TEST_NONE)
+#endif
#ifdef CONFIG_SUSPEND_FREEZER
static inline int suspend_freeze_processes(void)
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index b7708e319941..0972a8e09d08 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -30,15 +30,13 @@
#include <linux/slab.h>
#include <linux/compiler.h>
#include <linux/ktime.h>
+#include <linux/set_memory.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/io.h>
-#ifdef CONFIG_ARCH_HAS_SET_MEMORY
-#include <asm/set_memory.h>
-#endif
#include "power.h"
@@ -1652,7 +1650,7 @@ static unsigned long minimum_image_size(unsigned long saveable)
{
unsigned long size;
- size = global_page_state(NR_SLAB_RECLAIMABLE)
+ size = global_node_page_state(NR_SLAB_RECLAIMABLE)
+ global_node_page_state(NR_ACTIVE_ANON)
+ global_node_page_state(NR_INACTIVE_ANON)
+ global_node_page_state(NR_ACTIVE_FILE)
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 3ecf275d7e44..3e2b4f519009 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -8,6 +8,8 @@
* This file is released under the GPLv2.
*/
+#define pr_fmt(fmt) "PM: " fmt
+
#include <linux/string.h>
#include <linux/delay.h>
#include <linux/errno.h>
@@ -33,53 +35,55 @@
#include "power.h"
const char * const pm_labels[] = {
- [PM_SUSPEND_FREEZE] = "freeze",
+ [PM_SUSPEND_TO_IDLE] = "freeze",
[PM_SUSPEND_STANDBY] = "standby",
[PM_SUSPEND_MEM] = "mem",
};
const char *pm_states[PM_SUSPEND_MAX];
static const char * const mem_sleep_labels[] = {
- [PM_SUSPEND_FREEZE] = "s2idle",
+ [PM_SUSPEND_TO_IDLE] = "s2idle",
[PM_SUSPEND_STANDBY] = "shallow",
[PM_SUSPEND_MEM] = "deep",
};
const char *mem_sleep_states[PM_SUSPEND_MAX];
-suspend_state_t mem_sleep_current = PM_SUSPEND_FREEZE;
-static suspend_state_t mem_sleep_default = PM_SUSPEND_MEM;
+suspend_state_t mem_sleep_current = PM_SUSPEND_TO_IDLE;
+suspend_state_t mem_sleep_default = PM_SUSPEND_MAX;
+suspend_state_t pm_suspend_target_state;
+EXPORT_SYMBOL_GPL(pm_suspend_target_state);
unsigned int pm_suspend_global_flags;
EXPORT_SYMBOL_GPL(pm_suspend_global_flags);
static const struct platform_suspend_ops *suspend_ops;
-static const struct platform_freeze_ops *freeze_ops;
-static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
+static const struct platform_s2idle_ops *s2idle_ops;
+static DECLARE_WAIT_QUEUE_HEAD(s2idle_wait_head);
-enum freeze_state __read_mostly suspend_freeze_state;
-static DEFINE_SPINLOCK(suspend_freeze_lock);
+enum s2idle_states __read_mostly s2idle_state;
+static DEFINE_SPINLOCK(s2idle_lock);
-void freeze_set_ops(const struct platform_freeze_ops *ops)
+void s2idle_set_ops(const struct platform_s2idle_ops *ops)
{
lock_system_sleep();
- freeze_ops = ops;
+ s2idle_ops = ops;
unlock_system_sleep();
}
-static void freeze_begin(void)
+static void s2idle_begin(void)
{
- suspend_freeze_state = FREEZE_STATE_NONE;
+ s2idle_state = S2IDLE_STATE_NONE;
}
-static void freeze_enter(void)
+static void s2idle_enter(void)
{
- trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, true);
+ trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, true);
- spin_lock_irq(&suspend_freeze_lock);
+ spin_lock_irq(&s2idle_lock);
if (pm_wakeup_pending())
goto out;
- suspend_freeze_state = FREEZE_STATE_ENTER;
- spin_unlock_irq(&suspend_freeze_lock);
+ s2idle_state = S2IDLE_STATE_ENTER;
+ spin_unlock_irq(&s2idle_lock);
get_online_cpus();
cpuidle_resume();
@@ -87,56 +91,75 @@ static void freeze_enter(void)
/* Push all the CPUs into the idle loop. */
wake_up_all_idle_cpus();
/* Make the current CPU wait so it can enter the idle loop too. */
- wait_event(suspend_freeze_wait_head,
- suspend_freeze_state == FREEZE_STATE_WAKE);
+ wait_event(s2idle_wait_head,
+ s2idle_state == S2IDLE_STATE_WAKE);
cpuidle_pause();
put_online_cpus();
- spin_lock_irq(&suspend_freeze_lock);
+ spin_lock_irq(&s2idle_lock);
out:
- suspend_freeze_state = FREEZE_STATE_NONE;
- spin_unlock_irq(&suspend_freeze_lock);
+ s2idle_state = S2IDLE_STATE_NONE;
+ spin_unlock_irq(&s2idle_lock);
- trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, false);
+ trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, false);
}
static void s2idle_loop(void)
{
- pr_debug("PM: suspend-to-idle\n");
+ pm_pr_dbg("suspend-to-idle\n");
+
+ for (;;) {
+ int error;
+
+ dpm_noirq_begin();
+
+ /*
+ * Suspend-to-idle equals
+ * frozen processes + suspended devices + idle processors.
+ * Thus s2idle_enter() should be called right after
+ * all devices have been suspended.
+ */
+ error = dpm_noirq_suspend_devices(PMSG_SUSPEND);
+ if (!error)
+ s2idle_enter();
+
+ dpm_noirq_resume_devices(PMSG_RESUME);
+ if (error && (error != -EBUSY || !pm_wakeup_pending())) {
+ dpm_noirq_end();
+ break;
+ }
- do {
- freeze_enter();
+ if (s2idle_ops && s2idle_ops->wake)
+ s2idle_ops->wake();
- if (freeze_ops && freeze_ops->wake)
- freeze_ops->wake();
+ dpm_noirq_end();
- dpm_resume_noirq(PMSG_RESUME);
- if (freeze_ops && freeze_ops->sync)
- freeze_ops->sync();
+ if (s2idle_ops && s2idle_ops->sync)
+ s2idle_ops->sync();
if (pm_wakeup_pending())
break;
pm_wakeup_clear(false);
- } while (!dpm_suspend_noirq(PMSG_SUSPEND));
+ }
- pr_debug("PM: resume from suspend-to-idle\n");
+ pm_pr_dbg("resume from suspend-to-idle\n");
}
-void freeze_wake(void)
+void s2idle_wake(void)
{
unsigned long flags;
- spin_lock_irqsave(&suspend_freeze_lock, flags);
- if (suspend_freeze_state > FREEZE_STATE_NONE) {
- suspend_freeze_state = FREEZE_STATE_WAKE;
- wake_up(&suspend_freeze_wait_head);
+ spin_lock_irqsave(&s2idle_lock, flags);
+ if (s2idle_state > S2IDLE_STATE_NONE) {
+ s2idle_state = S2IDLE_STATE_WAKE;
+ wake_up(&s2idle_wait_head);
}
- spin_unlock_irqrestore(&suspend_freeze_lock, flags);
+ spin_unlock_irqrestore(&s2idle_lock, flags);
}
-EXPORT_SYMBOL_GPL(freeze_wake);
+EXPORT_SYMBOL_GPL(s2idle_wake);
static bool valid_state(suspend_state_t state)
{
@@ -152,19 +175,19 @@ void __init pm_states_init(void)
{
/* "mem" and "freeze" are always present in /sys/power/state. */
pm_states[PM_SUSPEND_MEM] = pm_labels[PM_SUSPEND_MEM];
- pm_states[PM_SUSPEND_FREEZE] = pm_labels[PM_SUSPEND_FREEZE];
+ pm_states[PM_SUSPEND_TO_IDLE] = pm_labels[PM_SUSPEND_TO_IDLE];
/*
* Suspend-to-idle should be supported even without any suspend_ops,
* initialize mem_sleep_states[] accordingly here.
*/
- mem_sleep_states[PM_SUSPEND_FREEZE] = mem_sleep_labels[PM_SUSPEND_FREEZE];
+ mem_sleep_states[PM_SUSPEND_TO_IDLE] = mem_sleep_labels[PM_SUSPEND_TO_IDLE];
}
static int __init mem_sleep_default_setup(char *str)
{
suspend_state_t state;
- for (state = PM_SUSPEND_FREEZE; state <= PM_SUSPEND_MEM; state++)
+ for (state = PM_SUSPEND_TO_IDLE; state <= PM_SUSPEND_MEM; state++)
if (mem_sleep_labels[state] &&
!strcmp(str, mem_sleep_labels[state])) {
mem_sleep_default = state;
@@ -193,7 +216,7 @@ void suspend_set_ops(const struct platform_suspend_ops *ops)
}
if (valid_state(PM_SUSPEND_MEM)) {
mem_sleep_states[PM_SUSPEND_MEM] = mem_sleep_labels[PM_SUSPEND_MEM];
- if (mem_sleep_default == PM_SUSPEND_MEM)
+ if (mem_sleep_default >= PM_SUSPEND_MEM)
mem_sleep_current = PM_SUSPEND_MEM;
}
@@ -216,49 +239,49 @@ EXPORT_SYMBOL_GPL(suspend_valid_only_mem);
static bool sleep_state_supported(suspend_state_t state)
{
- return state == PM_SUSPEND_FREEZE || (suspend_ops && suspend_ops->enter);
+ return state == PM_SUSPEND_TO_IDLE || (suspend_ops && suspend_ops->enter);
}
static int platform_suspend_prepare(suspend_state_t state)
{
- return state != PM_SUSPEND_FREEZE && suspend_ops->prepare ?
+ return state != PM_SUSPEND_TO_IDLE && suspend_ops->prepare ?
suspend_ops->prepare() : 0;
}
static int platform_suspend_prepare_late(suspend_state_t state)
{
- return state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->prepare ?
- freeze_ops->prepare() : 0;
+ return state == PM_SUSPEND_TO_IDLE && s2idle_ops && s2idle_ops->prepare ?
+ s2idle_ops->prepare() : 0;
}
static int platform_suspend_prepare_noirq(suspend_state_t state)
{
- return state != PM_SUSPEND_FREEZE && suspend_ops->prepare_late ?
+ return state != PM_SUSPEND_TO_IDLE && suspend_ops->prepare_late ?
suspend_ops->prepare_late() : 0;
}
static void platform_resume_noirq(suspend_state_t state)
{
- if (state != PM_SUSPEND_FREEZE && suspend_ops->wake)
+ if (state != PM_SUSPEND_TO_IDLE && suspend_ops->wake)
suspend_ops->wake();
}
static void platform_resume_early(suspend_state_t state)
{
- if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->restore)
- freeze_ops->restore();
+ if (state == PM_SUSPEND_TO_IDLE && s2idle_ops && s2idle_ops->restore)
+ s2idle_ops->restore();
}
static void platform_resume_finish(suspend_state_t state)
{
- if (state != PM_SUSPEND_FREEZE && suspend_ops->finish)
+ if (state != PM_SUSPEND_TO_IDLE && suspend_ops->finish)
suspend_ops->finish();
}
static int platform_suspend_begin(suspend_state_t state)
{
- if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin)
- return freeze_ops->begin();
+ if (state == PM_SUSPEND_TO_IDLE && s2idle_ops && s2idle_ops->begin)
+ return s2idle_ops->begin();
else if (suspend_ops && suspend_ops->begin)
return suspend_ops->begin(state);
else
@@ -267,21 +290,21 @@ static int platform_suspend_begin(suspend_state_t state)
static void platform_resume_end(suspend_state_t state)
{
- if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end)
- freeze_ops->end();
+ if (state == PM_SUSPEND_TO_IDLE && s2idle_ops && s2idle_ops->end)
+ s2idle_ops->end();
else if (suspend_ops && suspend_ops->end)
suspend_ops->end();
}
static void platform_recover(suspend_state_t state)
{
- if (state != PM_SUSPEND_FREEZE && suspend_ops->recover)
+ if (state != PM_SUSPEND_TO_IDLE && suspend_ops->recover)
suspend_ops->recover();
}
static bool platform_suspend_again(suspend_state_t state)
{
- return state != PM_SUSPEND_FREEZE && suspend_ops->suspend_again ?
+ return state != PM_SUSPEND_TO_IDLE && suspend_ops->suspend_again ?
suspend_ops->suspend_again() : false;
}
@@ -370,16 +393,21 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
error = dpm_suspend_late(PMSG_SUSPEND);
if (error) {
- pr_err("PM: late suspend of devices failed\n");
+ pr_err("late suspend of devices failed\n");
goto Platform_finish;
}
error = platform_suspend_prepare_late(state);
if (error)
goto Devices_early_resume;
+ if (state == PM_SUSPEND_TO_IDLE && pm_test_level != TEST_PLATFORM) {
+ s2idle_loop();
+ goto Platform_early_resume;
+ }
+
error = dpm_suspend_noirq(PMSG_SUSPEND);
if (error) {
- pr_err("PM: noirq suspend of devices failed\n");
+ pr_err("noirq suspend of devices failed\n");
goto Platform_early_resume;
}
error = platform_suspend_prepare_noirq(state);
@@ -389,17 +417,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
if (suspend_test(TEST_PLATFORM))
goto Platform_wake;
- /*
- * PM_SUSPEND_FREEZE equals
- * frozen processes + suspended devices + idle processors.
- * Thus we should invoke freeze_enter() soon after
- * all the devices are suspended.
- */
- if (state == PM_SUSPEND_FREEZE) {
- s2idle_loop();
- goto Platform_early_resume;
- }
-
error = disable_nonboot_cpus();
if (error || suspend_test(TEST_CPUS))
goto Enable_cpus;
@@ -456,6 +473,8 @@ int suspend_devices_and_enter(suspend_state_t state)
if (!sleep_state_supported(state))
return -ENOSYS;
+ pm_suspend_target_state = state;
+
error = platform_suspend_begin(state);
if (error)
goto Close;
@@ -464,7 +483,7 @@ int suspend_devices_and_enter(suspend_state_t state)
suspend_test_start();
error = dpm_suspend_start(PMSG_SUSPEND);
if (error) {
- pr_err("PM: Some devices failed to suspend, or early wake event detected\n");
+ pr_err("Some devices failed to suspend, or early wake event detected\n");
goto Recover_platform;
}
suspend_test_finish("suspend devices");
@@ -485,6 +504,7 @@ int suspend_devices_and_enter(suspend_state_t state)
Close:
platform_resume_end(state);
+ pm_suspend_target_state = PM_SUSPEND_ON;
return error;
Recover_platform:
@@ -518,10 +538,10 @@ static int enter_state(suspend_state_t state)
int error;
trace_suspend_resume(TPS("suspend_enter"), state, true);
- if (state == PM_SUSPEND_FREEZE) {
+ if (state == PM_SUSPEND_TO_IDLE) {
#ifdef CONFIG_PM_DEBUG
if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) {
- pr_warn("PM: Unsupported test mode for suspend to idle, please choose none/freezer/devices/platform.\n");
+ pr_warn("Unsupported test mode for suspend to idle, please choose none/freezer/devices/platform.\n");
return -EAGAIN;
}
#endif
@@ -531,18 +551,18 @@ static int enter_state(suspend_state_t state)
if (!mutex_trylock(&pm_mutex))
return -EBUSY;
- if (state == PM_SUSPEND_FREEZE)
- freeze_begin();
+ if (state == PM_SUSPEND_TO_IDLE)
+ s2idle_begin();
#ifndef CONFIG_SUSPEND_SKIP_SYNC
trace_suspend_resume(TPS("sync_filesystems"), 0, true);
- pr_info("PM: Syncing filesystems ... ");
+ pr_info("Syncing filesystems ... ");
sys_sync();
pr_cont("done.\n");
trace_suspend_resume(TPS("sync_filesystems"), 0, false);
#endif
- pr_debug("PM: Preparing system for sleep (%s)\n", pm_states[state]);
+ pm_pr_dbg("Preparing system for sleep (%s)\n", mem_sleep_labels[state]);
pm_suspend_clear_flags();
error = suspend_prepare(state);
if (error)
@@ -552,13 +572,13 @@ static int enter_state(suspend_state_t state)
goto Finish;
trace_suspend_resume(TPS("suspend_enter"), state, false);
- pr_debug("PM: Suspending system (%s)\n", pm_states[state]);
+ pm_pr_dbg("Suspending system (%s)\n", mem_sleep_labels[state]);
pm_restrict_gfp_mask();
error = suspend_devices_and_enter(state);
pm_restore_gfp_mask();
Finish:
- pr_debug("PM: Finishing wakeup.\n");
+ pm_pr_dbg("Finishing wakeup.\n");
suspend_finish();
Unlock:
mutex_unlock(&pm_mutex);
@@ -579,6 +599,7 @@ int pm_suspend(suspend_state_t state)
if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
return -EINVAL;
+ pr_info("suspend entry (%s)\n", mem_sleep_labels[state]);
error = enter_state(state);
if (error) {
suspend_stats.fail++;
@@ -586,6 +607,7 @@ int pm_suspend(suspend_state_t state)
} else {
suspend_stats.success++;
}
+ pr_info("suspend exit\n");
return error;
}
EXPORT_SYMBOL(pm_suspend);
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 5db217051232..6a897e8b2a88 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -104,9 +104,9 @@ repeat:
printk(info_test, pm_states[state]);
status = pm_suspend(state);
if (status < 0)
- state = PM_SUSPEND_FREEZE;
+ state = PM_SUSPEND_TO_IDLE;
}
- if (state == PM_SUSPEND_FREEZE) {
+ if (state == PM_SUSPEND_TO_IDLE) {
printk(info_test, pm_states[state]);
status = pm_suspend(state);
}
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 57d22571f306..d7cdc426ee38 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -242,8 +242,7 @@ static void hib_end_io(struct bio *bio)
if (bio->bi_status) {
printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
- imajor(bio->bi_bdev->bd_inode),
- iminor(bio->bi_bdev->bd_inode),
+ MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
(unsigned long long)bio->bi_iter.bi_sector);
}
@@ -270,7 +269,7 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,
bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1);
bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
- bio->bi_bdev = hib_resume_bdev;
+ bio_set_dev(bio, hib_resume_bdev);
bio_set_op_attrs(bio, op, op_flags);
if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index cfd9ab1b80c5..512f7c2baedd 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -649,7 +649,7 @@ static int syslog_action_restricted(int type)
type != SYSLOG_ACTION_SIZE_BUFFER;
}
-int check_syslog_permissions(int type, int source)
+static int check_syslog_permissions(int type, int source)
{
/*
* If this is from /proc/kmsg and we've already opened it, then we've
@@ -677,7 +677,6 @@ int check_syslog_permissions(int type, int source)
ok:
return security_syslog(type);
}
-EXPORT_SYMBOL_GPL(check_syslog_permissions);
static void append_char(char **pp, char *e, char c)
{
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index be90c945063f..9210379c0353 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -69,8 +69,7 @@ config TREE_SRCU
This option selects the full-fledged version of SRCU.
config TASKS_RCU
- bool
- default n
+ def_bool PREEMPT
select SRCU
help
This option enables a task-based RCU implementation that uses
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 808b8c85f626..e4b43fef89f5 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -356,22 +356,10 @@ do { \
#ifdef CONFIG_TINY_RCU
/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */
-static inline bool rcu_gp_is_normal(void) /* Internal RCU use. */
-{
- return true;
-}
-static inline bool rcu_gp_is_expedited(void) /* Internal RCU use. */
-{
- return false;
-}
-
-static inline void rcu_expedite_gp(void)
-{
-}
-
-static inline void rcu_unexpedite_gp(void)
-{
-}
+static inline bool rcu_gp_is_normal(void) { return true; }
+static inline bool rcu_gp_is_expedited(void) { return false; }
+static inline void rcu_expedite_gp(void) { }
+static inline void rcu_unexpedite_gp(void) { }
#else /* #ifdef CONFIG_TINY_RCU */
bool rcu_gp_is_normal(void); /* Internal RCU use. */
bool rcu_gp_is_expedited(void); /* Internal RCU use. */
@@ -419,12 +407,8 @@ static inline void rcutorture_get_gp_data(enum rcutorture_type test_type,
*gpnum = 0;
*completed = 0;
}
-static inline void rcutorture_record_test_transition(void)
-{
-}
-static inline void rcutorture_record_progress(unsigned long vernum)
-{
-}
+static inline void rcutorture_record_test_transition(void) { }
+static inline void rcutorture_record_progress(unsigned long vernum) { }
#ifdef CONFIG_RCU_TRACE
void do_trace_rcu_torture_read(const char *rcutorturename,
struct rcu_head *rhp,
@@ -460,92 +444,20 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type,
#endif
#ifdef CONFIG_TINY_RCU
-
-/*
- * Return the number of grace periods started.
- */
-static inline unsigned long rcu_batches_started(void)
-{
- return 0;
-}
-
-/*
- * Return the number of bottom-half grace periods started.
- */
-static inline unsigned long rcu_batches_started_bh(void)
-{
- return 0;
-}
-
-/*
- * Return the number of sched grace periods started.
- */
-static inline unsigned long rcu_batches_started_sched(void)
-{
- return 0;
-}
-
-/*
- * Return the number of grace periods completed.
- */
-static inline unsigned long rcu_batches_completed(void)
-{
- return 0;
-}
-
-/*
- * Return the number of bottom-half grace periods completed.
- */
-static inline unsigned long rcu_batches_completed_bh(void)
-{
- return 0;
-}
-
-/*
- * Return the number of sched grace periods completed.
- */
-static inline unsigned long rcu_batches_completed_sched(void)
-{
- return 0;
-}
-
-/*
- * Return the number of expedited grace periods completed.
- */
-static inline unsigned long rcu_exp_batches_completed(void)
-{
- return 0;
-}
-
-/*
- * Return the number of expedited sched grace periods completed.
- */
-static inline unsigned long rcu_exp_batches_completed_sched(void)
-{
- return 0;
-}
-
-static inline unsigned long srcu_batches_completed(struct srcu_struct *sp)
-{
- return 0;
-}
-
-static inline void rcu_force_quiescent_state(void)
-{
-}
-
-static inline void rcu_bh_force_quiescent_state(void)
-{
-}
-
-static inline void rcu_sched_force_quiescent_state(void)
-{
-}
-
-static inline void show_rcu_gp_kthreads(void)
-{
-}
-
+static inline unsigned long rcu_batches_started(void) { return 0; }
+static inline unsigned long rcu_batches_started_bh(void) { return 0; }
+static inline unsigned long rcu_batches_started_sched(void) { return 0; }
+static inline unsigned long rcu_batches_completed(void) { return 0; }
+static inline unsigned long rcu_batches_completed_bh(void) { return 0; }
+static inline unsigned long rcu_batches_completed_sched(void) { return 0; }
+static inline unsigned long rcu_exp_batches_completed(void) { return 0; }
+static inline unsigned long rcu_exp_batches_completed_sched(void) { return 0; }
+static inline unsigned long
+srcu_batches_completed(struct srcu_struct *sp) { return 0; }
+static inline void rcu_force_quiescent_state(void) { }
+static inline void rcu_bh_force_quiescent_state(void) { }
+static inline void rcu_sched_force_quiescent_state(void) { }
+static inline void show_rcu_gp_kthreads(void) { }
#else /* #ifdef CONFIG_TINY_RCU */
extern unsigned long rcutorture_testseq;
extern unsigned long rcutorture_vernum;
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 2b62a38b080f..7649fcd2c4c7 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -36,24 +36,6 @@ void rcu_cblist_init(struct rcu_cblist *rclp)
}
/*
- * Debug function to actually count the number of callbacks.
- * If the number exceeds the limit specified, return -1.
- */
-long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim)
-{
- int cnt = 0;
- struct rcu_head **rhpp = &rclp->head;
-
- for (;;) {
- if (!*rhpp)
- return cnt;
- if (++cnt > lim)
- return -1;
- rhpp = &(*rhpp)->next;
- }
-}
-
-/*
* Dequeue the oldest rcu_head structure from the specified callback
* list. This function assumes that the callback is non-lazy, but
* the caller can later invoke rcu_cblist_dequeued_lazy() if it
@@ -103,17 +85,6 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp)
}
/*
- * Is the specified segment of the specified rcu_segcblist structure
- * empty of callbacks?
- */
-bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg)
-{
- if (seg == RCU_DONE_TAIL)
- return &rsclp->head == rsclp->tails[RCU_DONE_TAIL];
- return rsclp->tails[seg - 1] == rsclp->tails[seg];
-}
-
-/*
* Does the specified rcu_segcblist structure contain callbacks that
* are ready to be invoked?
*/
@@ -134,50 +105,6 @@ bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp)
}
/*
- * Dequeue and return the first ready-to-invoke callback. If there
- * are no ready-to-invoke callbacks, return NULL. Disables interrupts
- * to avoid interference. Does not protect from interference from other
- * CPUs or tasks.
- */
-struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp)
-{
- unsigned long flags;
- int i;
- struct rcu_head *rhp;
-
- local_irq_save(flags);
- if (!rcu_segcblist_ready_cbs(rsclp)) {
- local_irq_restore(flags);
- return NULL;
- }
- rhp = rsclp->head;
- BUG_ON(!rhp);
- rsclp->head = rhp->next;
- for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) {
- if (rsclp->tails[i] != &rhp->next)
- break;
- rsclp->tails[i] = &rsclp->head;
- }
- smp_mb(); /* Dequeue before decrement for rcu_barrier(). */
- WRITE_ONCE(rsclp->len, rsclp->len - 1);
- local_irq_restore(flags);
- return rhp;
-}
-
-/*
- * Account for the fact that a previously dequeued callback turned out
- * to be marked as lazy.
- */
-void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- rsclp->len_lazy--;
- local_irq_restore(flags);
-}
-
-/*
* Return a pointer to the first callback in the specified rcu_segcblist
* structure. This is useful for diagnostics.
*/
@@ -203,17 +130,6 @@ struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp)
}
/*
- * Does the specified rcu_segcblist structure contain callbacks that
- * have not yet been processed beyond having been posted, that is,
- * does it contain callbacks in its last segment?
- */
-bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp)
-{
- return rcu_segcblist_is_enabled(rsclp) &&
- !rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL);
-}
-
-/*
* Enqueue the specified callback onto the specified rcu_segcblist
* structure, updating accounting as needed. Note that the ->len
* field may be accessed locklessly, hence the WRITE_ONCE().
@@ -503,3 +419,27 @@ bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,
return true;
return false;
}
+
+/*
+ * Merge the source rcu_segcblist structure into the destination
+ * rcu_segcblist structure, then initialize the source. Any pending
+ * callbacks from the source get to start over. It is best to
+ * advance and accelerate both the destination and the source
+ * before merging.
+ */
+void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
+ struct rcu_segcblist *src_rsclp)
+{
+ struct rcu_cblist donecbs;
+ struct rcu_cblist pendcbs;
+
+ rcu_cblist_init(&donecbs);
+ rcu_cblist_init(&pendcbs);
+ rcu_segcblist_extract_count(src_rsclp, &donecbs);
+ rcu_segcblist_extract_done_cbs(src_rsclp, &donecbs);
+ rcu_segcblist_extract_pend_cbs(src_rsclp, &pendcbs);
+ rcu_segcblist_insert_count(dst_rsclp, &donecbs);
+ rcu_segcblist_insert_done_cbs(dst_rsclp, &donecbs);
+ rcu_segcblist_insert_pend_cbs(dst_rsclp, &pendcbs);
+ rcu_segcblist_init(src_rsclp);
+}
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 6e36e36478cd..581c12b63544 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -31,29 +31,7 @@ static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp)
rclp->len_lazy--;
}
-/*
- * Interim function to return rcu_cblist head pointer. Longer term, the
- * rcu_cblist will be used more pervasively, removing the need for this
- * function.
- */
-static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp)
-{
- return rclp->head;
-}
-
-/*
- * Interim function to return rcu_cblist head pointer. Longer term, the
- * rcu_cblist will be used more pervasively, removing the need for this
- * function.
- */
-static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp)
-{
- WARN_ON_ONCE(!rclp->head);
- return rclp->tail;
-}
-
void rcu_cblist_init(struct rcu_cblist *rclp);
-long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim);
struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp);
/*
@@ -134,14 +112,10 @@ static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp)
void rcu_segcblist_init(struct rcu_segcblist *rsclp);
void rcu_segcblist_disable(struct rcu_segcblist *rsclp);
-bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg);
bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp);
bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp);
-struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp);
-void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp);
struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp);
struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp);
-bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp);
void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
struct rcu_head *rhp, bool lazy);
bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
@@ -162,3 +136,5 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq);
bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq);
bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,
unsigned long seq);
+void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
+ struct rcu_segcblist *src_rsclp);
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 3cc18110b612..1f87a02c3399 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -317,8 +317,6 @@ static struct rcu_perf_ops sched_ops = {
.name = "sched"
};
-#ifdef CONFIG_TASKS_RCU
-
/*
* Definitions for RCU-tasks perf testing.
*/
@@ -346,24 +344,11 @@ static struct rcu_perf_ops tasks_ops = {
.name = "tasks"
};
-#define RCUPERF_TASKS_OPS &tasks_ops,
-
static bool __maybe_unused torturing_tasks(void)
{
return cur_ops == &tasks_ops;
}
-#else /* #ifdef CONFIG_TASKS_RCU */
-
-#define RCUPERF_TASKS_OPS
-
-static bool __maybe_unused torturing_tasks(void)
-{
- return false;
-}
-
-#endif /* #else #ifdef CONFIG_TASKS_RCU */
-
/*
* If performance tests complete, wait for shutdown to commence.
*/
@@ -658,7 +643,7 @@ rcu_perf_init(void)
int firsterr = 0;
static struct rcu_perf_ops *perf_ops[] = {
&rcu_ops, &rcu_bh_ops, &srcu_ops, &srcud_ops, &sched_ops,
- RCUPERF_TASKS_OPS
+ &tasks_ops,
};
if (!torture_init_begin(perf_type, verbose, &perf_runnable))
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index b8f7f8ce8575..45f2ffbc1e78 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -199,7 +199,8 @@ MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot");
static u64 notrace rcu_trace_clock_local(void)
{
u64 ts = trace_clock_local();
- unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC);
+
+ (void)do_div(ts, NSEC_PER_USEC);
return ts;
}
#else /* #ifdef CONFIG_RCU_TRACE */
@@ -496,7 +497,7 @@ static struct rcu_torture_ops rcu_busted_ops = {
.fqs = NULL,
.stats = NULL,
.irq_capable = 1,
- .name = "rcu_busted"
+ .name = "busted"
};
/*
@@ -522,7 +523,7 @@ static void srcu_read_delay(struct torture_random_state *rrsp)
delay = torture_random(rrsp) %
(nrealreaders * 2 * longdelay * uspertick);
- if (!delay)
+ if (!delay && in_task())
schedule_timeout_interruptible(longdelay);
else
rcu_read_delay(rrsp);
@@ -561,44 +562,7 @@ static void srcu_torture_barrier(void)
static void srcu_torture_stats(void)
{
- int __maybe_unused cpu;
- int idx;
-
-#ifdef CONFIG_TREE_SRCU
- idx = srcu_ctlp->srcu_idx & 0x1;
- pr_alert("%s%s Tree SRCU per-CPU(idx=%d):",
- torture_type, TORTURE_FLAG, idx);
- for_each_possible_cpu(cpu) {
- unsigned long l0, l1;
- unsigned long u0, u1;
- long c0, c1;
- struct srcu_data *counts;
-
- counts = per_cpu_ptr(srcu_ctlp->sda, cpu);
- u0 = counts->srcu_unlock_count[!idx];
- u1 = counts->srcu_unlock_count[idx];
-
- /*
- * Make sure that a lock is always counted if the corresponding
- * unlock is counted.
- */
- smp_rmb();
-
- l0 = counts->srcu_lock_count[!idx];
- l1 = counts->srcu_lock_count[idx];
-
- c0 = l0 - u0;
- c1 = l1 - u1;
- pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
- }
- pr_cont("\n");
-#elif defined(CONFIG_TINY_SRCU)
- idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1;
- pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n",
- torture_type, TORTURE_FLAG, idx,
- READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]),
- READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx]));
-#endif
+ srcu_torture_stats_print(srcu_ctlp, torture_type, TORTURE_FLAG);
}
static void srcu_torture_synchronize_expedited(void)
@@ -620,6 +584,7 @@ static struct rcu_torture_ops srcu_ops = {
.call = srcu_torture_call,
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
+ .irq_capable = 1,
.name = "srcu"
};
@@ -652,6 +617,7 @@ static struct rcu_torture_ops srcud_ops = {
.call = srcu_torture_call,
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
+ .irq_capable = 1,
.name = "srcud"
};
@@ -696,8 +662,6 @@ static struct rcu_torture_ops sched_ops = {
.name = "sched"
};
-#ifdef CONFIG_TASKS_RCU
-
/*
* Definitions for RCU-tasks torture testing.
*/
@@ -735,24 +699,11 @@ static struct rcu_torture_ops tasks_ops = {
.name = "tasks"
};
-#define RCUTORTURE_TASKS_OPS &tasks_ops,
-
static bool __maybe_unused torturing_tasks(void)
{
return cur_ops == &tasks_ops;
}
-#else /* #ifdef CONFIG_TASKS_RCU */
-
-#define RCUTORTURE_TASKS_OPS
-
-static bool __maybe_unused torturing_tasks(void)
-{
- return false;
-}
-
-#endif /* #else #ifdef CONFIG_TASKS_RCU */
-
/*
* RCU torture priority-boost testing. Runs one real-time thread per
* CPU for moderate bursts, repeatedly registering RCU callbacks and
@@ -1114,6 +1065,11 @@ rcu_torture_fakewriter(void *arg)
return 0;
}
+static void rcu_torture_timer_cb(struct rcu_head *rhp)
+{
+ kfree(rhp);
+}
+
/*
* RCU torture reader from timer handler. Dereferences rcu_torture_current,
* incrementing the corresponding element of the pipeline array. The
@@ -1176,6 +1132,14 @@ static void rcu_torture_timer(unsigned long unused)
__this_cpu_inc(rcu_torture_batch[completed]);
preempt_enable();
cur_ops->readunlock(idx);
+
+ /* Test call_rcu() invocation from interrupt handler. */
+ if (cur_ops->call) {
+ struct rcu_head *rhp = kmalloc(sizeof(*rhp), GFP_NOWAIT);
+
+ if (rhp)
+ cur_ops->call(rhp, rcu_torture_timer_cb);
+ }
}
/*
@@ -1354,11 +1318,12 @@ rcu_torture_stats_print(void)
srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp,
&flags, &gpnum, &completed);
wtp = READ_ONCE(writer_task);
- pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n",
+ pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx cpu %d\n",
rcu_torture_writer_state_getname(),
rcu_torture_writer_state,
gpnum, completed, flags,
- wtp == NULL ? ~0UL : wtp->state);
+ wtp == NULL ? ~0UL : wtp->state,
+ wtp == NULL ? -1 : (int)task_cpu(wtp));
show_rcu_gp_kthreads();
rcu_ftrace_dump(DUMP_ALL);
}
@@ -1749,7 +1714,7 @@ rcu_torture_init(void)
int firsterr = 0;
static struct rcu_torture_ops *torture_ops[] = {
&rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops,
- &sched_ops, RCUTORTURE_TASKS_OPS
+ &sched_ops, &tasks_ops,
};
if (!torture_init_begin(torture_type, verbose, &torture_runnable))
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 1a1c1047d2ed..76ac5f50b2c7 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -33,6 +33,8 @@
#include "rcu_segcblist.h"
#include "rcu.h"
+int rcu_scheduler_active __read_mostly;
+
static int init_srcu_struct_fields(struct srcu_struct *sp)
{
sp->srcu_lock_nesting[0] = 0;
@@ -193,3 +195,9 @@ void synchronize_srcu(struct srcu_struct *sp)
destroy_rcu_head_on_stack(&rs.head);
}
EXPORT_SYMBOL_GPL(synchronize_srcu);
+
+/* Lockdep diagnostics. */
+void __init rcu_scheduler_starting(void)
+{
+ rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
+}
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index d0ca524bf042..729a8706751d 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -51,6 +51,7 @@ module_param(counter_wrap_check, ulong, 0444);
static void srcu_invoke_callbacks(struct work_struct *work);
static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay);
+static void process_srcu(struct work_struct *work);
/*
* Initialize SRCU combining tree. Note that statically allocated
@@ -896,6 +897,15 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm)
__call_srcu(sp, &rcu.head, wakeme_after_rcu, do_norm);
wait_for_completion(&rcu.completion);
destroy_rcu_head_on_stack(&rcu.head);
+
+ /*
+ * Make sure that later code is ordered after the SRCU grace
+ * period. This pairs with the raw_spin_lock_irq_rcu_node()
+ * in srcu_invoke_callbacks(). Unlike Tree RCU, this is needed
+ * because the current CPU might have been totally uninvolved with
+ * (and thus unordered against) that grace period.
+ */
+ smp_mb();
}
/**
@@ -1194,7 +1204,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
/*
* This is the work-queue function that handles SRCU grace periods.
*/
-void process_srcu(struct work_struct *work)
+static void process_srcu(struct work_struct *work)
{
struct srcu_struct *sp;
@@ -1203,7 +1213,6 @@ void process_srcu(struct work_struct *work)
srcu_advance_state(sp);
srcu_reschedule(sp, srcu_get_delay(sp));
}
-EXPORT_SYMBOL_GPL(process_srcu);
void srcutorture_get_gp_data(enum rcutorture_type test_type,
struct srcu_struct *sp, int *flags,
@@ -1217,6 +1226,43 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type,
}
EXPORT_SYMBOL_GPL(srcutorture_get_gp_data);
+void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf)
+{
+ int cpu;
+ int idx;
+ unsigned long s0 = 0, s1 = 0;
+
+ idx = sp->srcu_idx & 0x1;
+ pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", tt, tf, idx);
+ for_each_possible_cpu(cpu) {
+ unsigned long l0, l1;
+ unsigned long u0, u1;
+ long c0, c1;
+ struct srcu_data *counts;
+
+ counts = per_cpu_ptr(sp->sda, cpu);
+ u0 = counts->srcu_unlock_count[!idx];
+ u1 = counts->srcu_unlock_count[idx];
+
+ /*
+ * Make sure that a lock is always counted if the corresponding
+ * unlock is counted.
+ */
+ smp_rmb();
+
+ l0 = counts->srcu_lock_count[!idx];
+ l1 = counts->srcu_lock_count[idx];
+
+ c0 = l0 - u0;
+ c1 = l1 - u1;
+ pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
+ s0 += c0;
+ s1 += c1;
+ }
+ pr_cont(" T(%ld,%ld)\n", s0, s1);
+}
+EXPORT_SYMBOL_GPL(srcu_torture_stats_print);
+
static int __init srcu_bootup_announce(void)
{
pr_info("Hierarchical SRCU implementation.\n");
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index f8488965250f..a64eee0db39e 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -56,8 +56,6 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {
.curtail = &rcu_bh_ctrlblk.rcucblist,
};
-#include "tiny_plugin.h"
-
void rcu_barrier_bh(void)
{
wait_rcu_gp(call_rcu_bh);
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
deleted file mode 100644
index f0a01b2a3062..000000000000
--- a/kernel/rcu/tiny_plugin.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition
- * Internal non-public definitions that provide either classic
- * or preemptible semantics.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
- * Copyright (c) 2010 Linaro
- *
- * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
- */
-
-#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU)
-#include <linux/kernel_stat.h>
-
-int rcu_scheduler_active __read_mostly;
-EXPORT_SYMBOL_GPL(rcu_scheduler_active);
-
-/*
- * During boot, we forgive RCU lockdep issues. After this function is
- * invoked, we start taking RCU lockdep issues seriously. Note that unlike
- * Tree RCU, Tiny RCU transitions directly from RCU_SCHEDULER_INACTIVE
- * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage.
- * The reason for this is that Tiny RCU does not need kthreads, so does
- * not have to care about the fact that the scheduler is half-initialized
- * at a certain phase of the boot process. Unless SRCU is in the mix.
- */
-void __init rcu_scheduler_starting(void)
-{
- WARN_ON(nr_context_switches() > 0);
- rcu_scheduler_active = IS_ENABLED(CONFIG_SRCU)
- ? RCU_SCHEDULER_INIT : RCU_SCHEDULER_RUNNING;
-}
-
-#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 51d4c3acf32d..84fe96641b2e 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -97,9 +97,6 @@ struct rcu_state sname##_state = { \
.gp_state = RCU_GP_IDLE, \
.gpnum = 0UL - 300UL, \
.completed = 0UL - 300UL, \
- .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \
- .orphan_pend = RCU_CBLIST_INITIALIZER(sname##_state.orphan_pend), \
- .orphan_done = RCU_CBLIST_INITIALIZER(sname##_state.orphan_done), \
.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
.name = RCU_STATE_NAME(sname), \
.abbr = sabbr, \
@@ -843,13 +840,9 @@ static void rcu_eqs_enter(bool user)
*/
void rcu_idle_enter(void)
{
- unsigned long flags;
-
- local_irq_save(flags);
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_idle_enter() invoked with irqs enabled!!!");
rcu_eqs_enter(false);
- local_irq_restore(flags);
}
-EXPORT_SYMBOL_GPL(rcu_idle_enter);
#ifdef CONFIG_NO_HZ_FULL
/**
@@ -862,7 +855,8 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter);
*/
void rcu_user_enter(void)
{
- rcu_eqs_enter(1);
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_user_enter() invoked with irqs enabled!!!");
+ rcu_eqs_enter(true);
}
#endif /* CONFIG_NO_HZ_FULL */
@@ -955,8 +949,10 @@ static void rcu_eqs_exit(bool user)
if (oldval & DYNTICK_TASK_NEST_MASK) {
rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
} else {
+ __this_cpu_inc(disable_rcu_irq_enter);
rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
rcu_eqs_exit_common(oldval, user);
+ __this_cpu_dec(disable_rcu_irq_enter);
}
}
@@ -979,7 +975,6 @@ void rcu_idle_exit(void)
rcu_eqs_exit(false);
local_irq_restore(flags);
}
-EXPORT_SYMBOL_GPL(rcu_idle_exit);
#ifdef CONFIG_NO_HZ_FULL
/**
@@ -1358,12 +1353,13 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
j = jiffies;
gpa = READ_ONCE(rsp->gp_activity);
if (j - gpa > 2 * HZ) {
- pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx\n",
+ pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx ->cpu=%d\n",
rsp->name, j - gpa,
rsp->gpnum, rsp->completed,
rsp->gp_flags,
gp_state_getname(rsp->gp_state), rsp->gp_state,
- rsp->gp_kthread ? rsp->gp_kthread->state : ~0);
+ rsp->gp_kthread ? rsp->gp_kthread->state : ~0,
+ rsp->gp_kthread ? task_cpu(rsp->gp_kthread) : -1);
if (rsp->gp_kthread) {
sched_show_task(rsp->gp_kthread);
wake_up_process(rsp->gp_kthread);
@@ -2067,8 +2063,8 @@ static bool rcu_gp_init(struct rcu_state *rsp)
}
/*
- * Helper function for wait_event_interruptible_timeout() wakeup
- * at force-quiescent-state time.
+ * Helper function for swait_event_idle() wakeup at force-quiescent-state
+ * time.
*/
static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp)
{
@@ -2206,9 +2202,8 @@ static int __noreturn rcu_gp_kthread(void *arg)
READ_ONCE(rsp->gpnum),
TPS("reqwait"));
rsp->gp_state = RCU_GP_WAIT_GPS;
- swait_event_interruptible(rsp->gp_wq,
- READ_ONCE(rsp->gp_flags) &
- RCU_GP_FLAG_INIT);
+ swait_event_idle(rsp->gp_wq, READ_ONCE(rsp->gp_flags) &
+ RCU_GP_FLAG_INIT);
rsp->gp_state = RCU_GP_DONE_GPS;
/* Locking provides needed memory barrier. */
if (rcu_gp_init(rsp))
@@ -2239,7 +2234,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
READ_ONCE(rsp->gpnum),
TPS("fqswait"));
rsp->gp_state = RCU_GP_WAIT_FQS;
- ret = swait_event_interruptible_timeout(rsp->gp_wq,
+ ret = swait_event_idle_timeout(rsp->gp_wq,
rcu_gp_fqs_check_wake(rsp, &gf), j);
rsp->gp_state = RCU_GP_DOING_FQS;
/* Locking provides needed memory barriers. */
@@ -2409,6 +2404,8 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
return;
}
WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */
+ WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1 &&
+ rcu_preempt_blocked_readers_cgp(rnp));
rnp->qsmask &= ~mask;
trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,
mask, rnp->qsmask, rnp->level,
@@ -2563,85 +2560,6 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
}
/*
- * Send the specified CPU's RCU callbacks to the orphanage. The
- * specified CPU must be offline, and the caller must hold the
- * ->orphan_lock.
- */
-static void
-rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
- struct rcu_node *rnp, struct rcu_data *rdp)
-{
- lockdep_assert_held(&rsp->orphan_lock);
-
- /* No-CBs CPUs do not have orphanable callbacks. */
- if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu))
- return;
-
- /*
- * Orphan the callbacks. First adjust the counts. This is safe
- * because _rcu_barrier() excludes CPU-hotplug operations, so it
- * cannot be running now. Thus no memory barrier is required.
- */
- rdp->n_cbs_orphaned += rcu_segcblist_n_cbs(&rdp->cblist);
- rcu_segcblist_extract_count(&rdp->cblist, &rsp->orphan_done);
-
- /*
- * Next, move those callbacks still needing a grace period to
- * the orphanage, where some other CPU will pick them up.
- * Some of the callbacks might have gone partway through a grace
- * period, but that is too bad. They get to start over because we
- * cannot assume that grace periods are synchronized across CPUs.
- */
- rcu_segcblist_extract_pend_cbs(&rdp->cblist, &rsp->orphan_pend);
-
- /*
- * Then move the ready-to-invoke callbacks to the orphanage,
- * where some other CPU will pick them up. These will not be
- * required to pass though another grace period: They are done.
- */
- rcu_segcblist_extract_done_cbs(&rdp->cblist, &rsp->orphan_done);
-
- /* Finally, disallow further callbacks on this CPU. */
- rcu_segcblist_disable(&rdp->cblist);
-}
-
-/*
- * Adopt the RCU callbacks from the specified rcu_state structure's
- * orphanage. The caller must hold the ->orphan_lock.
- */
-static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
-{
- struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
-
- lockdep_assert_held(&rsp->orphan_lock);
-
- /* No-CBs CPUs are handled specially. */
- if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
- rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
- return;
-
- /* Do the accounting first. */
- rdp->n_cbs_adopted += rsp->orphan_done.len;
- if (rsp->orphan_done.len_lazy != rsp->orphan_done.len)
- rcu_idle_count_callbacks_posted();
- rcu_segcblist_insert_count(&rdp->cblist, &rsp->orphan_done);
-
- /*
- * We do not need a memory barrier here because the only way we
- * can get here if there is an rcu_barrier() in flight is if
- * we are the task doing the rcu_barrier().
- */
-
- /* First adopt the ready-to-invoke callbacks, then the done ones. */
- rcu_segcblist_insert_done_cbs(&rdp->cblist, &rsp->orphan_done);
- WARN_ON_ONCE(rsp->orphan_done.head);
- rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rsp->orphan_pend);
- WARN_ON_ONCE(rsp->orphan_pend.head);
- WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) !=
- !rcu_segcblist_n_cbs(&rdp->cblist));
-}
-
-/*
* Trace the fact that this CPU is going offline.
*/
static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
@@ -2704,14 +2622,12 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
/*
* The CPU has been completely removed, and some other CPU is reporting
- * this fact from process context. Do the remainder of the cleanup,
- * including orphaning the outgoing CPU's RCU callbacks, and also
- * adopting them. There can only be one CPU hotplug operation at a time,
- * so no other CPU can be attempting to update rcu_cpu_kthread_task.
+ * this fact from process context. Do the remainder of the cleanup.
+ * There can only be one CPU hotplug operation at a time, so no need for
+ * explicit locking.
*/
static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
{
- unsigned long flags;
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
@@ -2720,18 +2636,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
/* Adjust any no-longer-needed kthreads. */
rcu_boost_kthread_setaffinity(rnp, -1);
-
- /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
- raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
- rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
- rcu_adopt_orphan_cbs(rsp, flags);
- raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags);
-
- WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
- !rcu_segcblist_empty(&rdp->cblist),
- "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
- cpu, rcu_segcblist_n_cbs(&rdp->cblist),
- rcu_segcblist_first_cb(&rdp->cblist));
}
/*
@@ -3569,10 +3473,11 @@ static void rcu_barrier_callback(struct rcu_head *rhp)
struct rcu_state *rsp = rdp->rsp;
if (atomic_dec_and_test(&rsp->barrier_cpu_count)) {
- _rcu_barrier_trace(rsp, "LastCB", -1, rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, TPS("LastCB"), -1,
+ rsp->barrier_sequence);
complete(&rsp->barrier_completion);
} else {
- _rcu_barrier_trace(rsp, "CB", -1, rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, TPS("CB"), -1, rsp->barrier_sequence);
}
}
@@ -3584,14 +3489,15 @@ static void rcu_barrier_func(void *type)
struct rcu_state *rsp = type;
struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
- _rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, TPS("IRQ"), -1, rsp->barrier_sequence);
rdp->barrier_head.func = rcu_barrier_callback;
debug_rcu_head_queue(&rdp->barrier_head);
if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) {
atomic_inc(&rsp->barrier_cpu_count);
} else {
debug_rcu_head_unqueue(&rdp->barrier_head);
- _rcu_barrier_trace(rsp, "IRQNQ", -1, rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, TPS("IRQNQ"), -1,
+ rsp->barrier_sequence);
}
}
@@ -3605,14 +3511,15 @@ static void _rcu_barrier(struct rcu_state *rsp)
struct rcu_data *rdp;
unsigned long s = rcu_seq_snap(&rsp->barrier_sequence);
- _rcu_barrier_trace(rsp, "Begin", -1, s);
+ _rcu_barrier_trace(rsp, TPS("Begin"), -1, s);
/* Take mutex to serialize concurrent rcu_barrier() requests. */
mutex_lock(&rsp->barrier_mutex);
/* Did someone else do our work for us? */
if (rcu_seq_done(&rsp->barrier_sequence, s)) {
- _rcu_barrier_trace(rsp, "EarlyExit", -1, rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, TPS("EarlyExit"), -1,
+ rsp->barrier_sequence);
smp_mb(); /* caller's subsequent code after above check. */
mutex_unlock(&rsp->barrier_mutex);
return;
@@ -3620,7 +3527,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
/* Mark the start of the barrier operation. */
rcu_seq_start(&rsp->barrier_sequence);
- _rcu_barrier_trace(rsp, "Inc1", -1, rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, TPS("Inc1"), -1, rsp->barrier_sequence);
/*
* Initialize the count to one rather than to zero in order to
@@ -3643,10 +3550,10 @@ static void _rcu_barrier(struct rcu_state *rsp)
rdp = per_cpu_ptr(rsp->rda, cpu);
if (rcu_is_nocb_cpu(cpu)) {
if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) {
- _rcu_barrier_trace(rsp, "OfflineNoCB", cpu,
+ _rcu_barrier_trace(rsp, TPS("OfflineNoCB"), cpu,
rsp->barrier_sequence);
} else {
- _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
+ _rcu_barrier_trace(rsp, TPS("OnlineNoCB"), cpu,
rsp->barrier_sequence);
smp_mb__before_atomic();
atomic_inc(&rsp->barrier_cpu_count);
@@ -3654,11 +3561,11 @@ static void _rcu_barrier(struct rcu_state *rsp)
rcu_barrier_callback, rsp, cpu, 0);
}
} else if (rcu_segcblist_n_cbs(&rdp->cblist)) {
- _rcu_barrier_trace(rsp, "OnlineQ", cpu,
+ _rcu_barrier_trace(rsp, TPS("OnlineQ"), cpu,
rsp->barrier_sequence);
smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
} else {
- _rcu_barrier_trace(rsp, "OnlineNQ", cpu,
+ _rcu_barrier_trace(rsp, TPS("OnlineNQ"), cpu,
rsp->barrier_sequence);
}
}
@@ -3675,7 +3582,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
wait_for_completion(&rsp->barrier_completion);
/* Mark the end of the barrier operation. */
- _rcu_barrier_trace(rsp, "Inc2", -1, rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, TPS("Inc2"), -1, rsp->barrier_sequence);
rcu_seq_end(&rsp->barrier_sequence);
/* Other rcu_barrier() invocations can now safely proceed. */
@@ -3777,8 +3684,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
*/
rnp = rdp->mynode;
raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
- if (!rdp->beenonline)
- WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1);
rdp->beenonline = true; /* We have now been online. */
rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
rdp->completed = rnp->completed;
@@ -3882,6 +3787,8 @@ void rcu_cpu_starting(unsigned int cpu)
{
unsigned long flags;
unsigned long mask;
+ int nbits;
+ unsigned long oldmask;
struct rcu_data *rdp;
struct rcu_node *rnp;
struct rcu_state *rsp;
@@ -3892,9 +3799,15 @@ void rcu_cpu_starting(unsigned int cpu)
mask = rdp->grpmask;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
rnp->qsmaskinitnext |= mask;
+ oldmask = rnp->expmaskinitnext;
rnp->expmaskinitnext |= mask;
+ oldmask ^= rnp->expmaskinitnext;
+ nbits = bitmap_weight(&oldmask, BITS_PER_LONG);
+ /* Allow lockless access for expedited grace periods. */
+ smp_store_release(&rsp->ncpus, rsp->ncpus + nbits); /* ^^^ */
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
+ smp_mb(); /* Ensure RCU read-side usage follows above initialization. */
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -3937,6 +3850,50 @@ void rcu_report_dead(unsigned int cpu)
for_each_rcu_flavor(rsp)
rcu_cleanup_dying_idle_cpu(cpu, rsp);
}
+
+/* Migrate the dead CPU's callbacks to the current CPU. */
+static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp)
+{
+ unsigned long flags;
+ struct rcu_data *my_rdp;
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+ struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
+
+ if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist))
+ return; /* No callbacks to migrate. */
+
+ local_irq_save(flags);
+ my_rdp = this_cpu_ptr(rsp->rda);
+ if (rcu_nocb_adopt_orphan_cbs(my_rdp, rdp, flags)) {
+ local_irq_restore(flags);
+ return;
+ }
+ raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */
+ rcu_advance_cbs(rsp, rnp_root, rdp); /* Leverage recent GPs. */
+ rcu_advance_cbs(rsp, rnp_root, my_rdp); /* Assign GP to pending CBs. */
+ rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist);
+ WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) !=
+ !rcu_segcblist_n_cbs(&my_rdp->cblist));
+ raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags);
+ WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
+ !rcu_segcblist_empty(&rdp->cblist),
+ "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
+ cpu, rcu_segcblist_n_cbs(&rdp->cblist),
+ rcu_segcblist_first_cb(&rdp->cblist));
+}
+
+/*
+ * The outgoing CPU has just passed through the dying-idle state,
+ * and we are being invoked from the CPU that was IPIed to continue the
+ * offline operation. We need to migrate the outgoing CPU's callbacks.
+ */
+void rcutree_migrate_callbacks(int cpu)
+{
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp)
+ rcu_migrate_callbacks(cpu, rsp);
+}
#endif
/*
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 9af0f31d6847..8e1f285f0a70 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -219,8 +219,6 @@ struct rcu_data {
/* qlen at last check for QS forcing */
unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */
- unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
- unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */
unsigned long n_force_qs_snap;
/* did other CPU force QS recently? */
long blimit; /* Upper limit on a processed batch */
@@ -268,7 +266,9 @@ struct rcu_data {
struct rcu_head **nocb_follower_tail;
struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */
struct task_struct *nocb_kthread;
+ raw_spinlock_t nocb_lock; /* Guard following pair of fields. */
int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
+ struct timer_list nocb_timer; /* Enforce finite deferral. */
/* The following fields are used by the leader, hence own cacheline. */
struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp;
@@ -350,15 +350,6 @@ struct rcu_state {
/* End of fields guarded by root rcu_node's lock. */
- raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp;
- /* Protect following fields. */
- struct rcu_cblist orphan_pend; /* Orphaned callbacks that */
- /* need a grace period. */
- struct rcu_cblist orphan_done; /* Orphaned callbacks that */
- /* are ready to invoke. */
- /* (Contains counts.) */
- /* End of fields guarded by orphan_lock. */
-
struct mutex barrier_mutex; /* Guards barrier fields. */
atomic_t barrier_cpu_count; /* # CPUs waiting on. */
struct completion barrier_completion; /* Wake at barrier end. */
@@ -495,7 +486,7 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
static void rcu_init_one_nocb(struct rcu_node *rnp);
static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
bool lazy, unsigned long flags);
-static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
+static bool rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
struct rcu_data *rdp,
unsigned long flags);
static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index dd21ca47e4b4..46d61b597731 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -73,7 +73,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
unsigned long flags;
unsigned long mask;
unsigned long oldmask;
- int ncpus = READ_ONCE(rsp->ncpus);
+ int ncpus = smp_load_acquire(&rsp->ncpus); /* Order against locking. */
struct rcu_node *rnp;
struct rcu_node *rnp_up;
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 908b309d60d7..55bde94b9572 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -180,6 +180,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
struct task_struct *t = current;
lockdep_assert_held(&rnp->lock);
+ WARN_ON_ONCE(rdp->mynode != rnp);
+ WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1);
/*
* Decide where to queue the newly blocked task. In theory,
@@ -261,6 +263,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
rnp->gp_tasks = &t->rcu_node_entry;
if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
rnp->exp_tasks = &t->rcu_node_entry;
+ WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) !=
+ !(rnp->qsmask & rdp->grpmask));
+ WARN_ON_ONCE(!(blkd_state & RCU_EXP_BLKD) !=
+ !(rnp->expmask & rdp->grpmask));
raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */
/*
@@ -482,6 +488,7 @@ void rcu_read_unlock_special(struct task_struct *t)
rnp = t->rcu_blocked_node;
raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
WARN_ON_ONCE(rnp != t->rcu_blocked_node);
+ WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1);
empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
empty_exp = sync_rcu_preempt_exp_done(rnp);
smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
@@ -495,10 +502,10 @@ void rcu_read_unlock_special(struct task_struct *t)
if (&t->rcu_node_entry == rnp->exp_tasks)
rnp->exp_tasks = np;
if (IS_ENABLED(CONFIG_RCU_BOOST)) {
- if (&t->rcu_node_entry == rnp->boost_tasks)
- rnp->boost_tasks = np;
/* Snapshot ->boost_mtx ownership w/rnp->lock held. */
drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
+ if (&t->rcu_node_entry == rnp->boost_tasks)
+ rnp->boost_tasks = np;
}
/*
@@ -636,10 +643,17 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
*/
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
{
+ struct task_struct *t;
+
RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n");
WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
- if (rcu_preempt_has_tasks(rnp))
+ if (rcu_preempt_has_tasks(rnp)) {
rnp->gp_tasks = rnp->blkd_tasks.next;
+ t = container_of(rnp->gp_tasks, struct task_struct,
+ rcu_node_entry);
+ trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"),
+ rnp->gpnum, t->pid);
+ }
WARN_ON_ONCE(rnp->qsmask);
}
@@ -1788,23 +1802,62 @@ bool rcu_is_nocb_cpu(int cpu)
}
/*
- * Kick the leader kthread for this NOCB group.
+ * Kick the leader kthread for this NOCB group. Caller holds ->nocb_lock
+ * and this function releases it.
*/
-static void wake_nocb_leader(struct rcu_data *rdp, bool force)
+static void __wake_nocb_leader(struct rcu_data *rdp, bool force,
+ unsigned long flags)
+ __releases(rdp->nocb_lock)
{
struct rcu_data *rdp_leader = rdp->nocb_leader;
- if (!READ_ONCE(rdp_leader->nocb_kthread))
+ lockdep_assert_held(&rdp->nocb_lock);
+ if (!READ_ONCE(rdp_leader->nocb_kthread)) {
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
return;
- if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
+ }
+ if (rdp_leader->nocb_leader_sleep || force) {
/* Prior smp_mb__after_atomic() orders against prior enqueue. */
WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
+ del_timer(&rdp->nocb_timer);
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
smp_mb(); /* ->nocb_leader_sleep before swake_up(). */
swake_up(&rdp_leader->nocb_wq);
+ } else {
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
}
}
/*
+ * Kick the leader kthread for this NOCB group, but caller has not
+ * acquired locks.
+ */
+static void wake_nocb_leader(struct rcu_data *rdp, bool force)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ __wake_nocb_leader(rdp, force, flags);
+}
+
+/*
+ * Arrange to wake the leader kthread for this NOCB group at some
+ * future time when it is safe to do so.
+ */
+static void wake_nocb_leader_defer(struct rcu_data *rdp, int waketype,
+ const char *reason)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT)
+ mod_timer(&rdp->nocb_timer, jiffies + 1);
+ WRITE_ONCE(rdp->nocb_defer_wakeup, waketype);
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, reason);
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+}
+
+/*
* Does the specified CPU need an RCU callback for the specified flavor
* of rcu_barrier()?
*/
@@ -1891,11 +1944,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeEmpty"));
} else {
- WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE);
- /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */
- smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("WakeEmptyIsDeferred"));
+ wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE,
+ TPS("WakeEmptyIsDeferred"));
}
rdp->qlen_last_fqs_check = 0;
} else if (len > rdp->qlen_last_fqs_check + qhimark) {
@@ -1905,11 +1955,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeOvf"));
} else {
- WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_FORCE);
- /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */
- smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("WakeOvfIsDeferred"));
+ wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE,
+ TPS("WakeOvfIsDeferred"));
}
rdp->qlen_last_fqs_check = LONG_MAX / 2;
} else {
@@ -1961,30 +2008,19 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
* Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is
* not a no-CBs CPU.
*/
-static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
+static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
struct rcu_data *rdp,
unsigned long flags)
{
- long ql = rsp->orphan_done.len;
- long qll = rsp->orphan_done.len_lazy;
-
- /* If this is not a no-CBs CPU, tell the caller to do it the old way. */
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_nocb_adopt_orphan_cbs() invoked with irqs enabled!!!");
if (!rcu_is_nocb_cpu(smp_processor_id()))
- return false;
-
- /* First, enqueue the donelist, if any. This preserves CB ordering. */
- if (rsp->orphan_done.head) {
- __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_done),
- rcu_cblist_tail(&rsp->orphan_done),
- ql, qll, flags);
- }
- if (rsp->orphan_pend.head) {
- __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_pend),
- rcu_cblist_tail(&rsp->orphan_pend),
- ql, qll, flags);
- }
- rcu_cblist_init(&rsp->orphan_done);
- rcu_cblist_init(&rsp->orphan_pend);
+ return false; /* Not NOCBs CPU, caller must migrate CBs. */
+ __call_rcu_nocb_enqueue(my_rdp, rcu_segcblist_head(&rdp->cblist),
+ rcu_segcblist_tail(&rdp->cblist),
+ rcu_segcblist_n_cbs(&rdp->cblist),
+ rcu_segcblist_n_lazy_cbs(&rdp->cblist), flags);
+ rcu_segcblist_init(&rdp->cblist);
+ rcu_segcblist_disable(&rdp->cblist);
return true;
}
@@ -2031,6 +2067,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
static void nocb_leader_wait(struct rcu_data *my_rdp)
{
bool firsttime = true;
+ unsigned long flags;
bool gotcbs;
struct rcu_data *rdp;
struct rcu_head **tail;
@@ -2039,13 +2076,17 @@ wait_again:
/* Wait for callbacks to appear. */
if (!rcu_nocb_poll) {
- trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
+ trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Sleep"));
swait_event_interruptible(my_rdp->nocb_wq,
!READ_ONCE(my_rdp->nocb_leader_sleep));
- /* Memory barrier handled by smp_mb() calls below and repoll. */
+ raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags);
+ my_rdp->nocb_leader_sleep = true;
+ WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
+ del_timer(&my_rdp->nocb_timer);
+ raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags);
} else if (firsttime) {
firsttime = false; /* Don't drown trace log with "Poll"! */
- trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Poll");
+ trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Poll"));
}
/*
@@ -2054,7 +2095,7 @@ wait_again:
* nocb_gp_head, where they await a grace period.
*/
gotcbs = false;
- smp_mb(); /* wakeup before ->nocb_head reads. */
+ smp_mb(); /* wakeup and _sleep before ->nocb_head reads. */
for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head);
if (!rdp->nocb_gp_head)
@@ -2066,56 +2107,41 @@ wait_again:
gotcbs = true;
}
- /*
- * If there were no callbacks, sleep a bit, rescan after a
- * memory barrier, and go retry.
- */
+ /* No callbacks? Sleep a bit if polling, and go retry. */
if (unlikely(!gotcbs)) {
- if (!rcu_nocb_poll)
- trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu,
- "WokeEmpty");
WARN_ON(signal_pending(current));
- schedule_timeout_interruptible(1);
-
- /* Rescan in case we were a victim of memory ordering. */
- my_rdp->nocb_leader_sleep = true;
- smp_mb(); /* Ensure _sleep true before scan. */
- for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower)
- if (READ_ONCE(rdp->nocb_head)) {
- /* Found CB, so short-circuit next wait. */
- my_rdp->nocb_leader_sleep = false;
- break;
- }
+ if (rcu_nocb_poll) {
+ schedule_timeout_interruptible(1);
+ } else {
+ trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu,
+ TPS("WokeEmpty"));
+ }
goto wait_again;
}
/* Wait for one grace period. */
rcu_nocb_wait_gp(my_rdp);
- /*
- * We left ->nocb_leader_sleep unset to reduce cache thrashing.
- * We set it now, but recheck for new callbacks while
- * traversing our follower list.
- */
- my_rdp->nocb_leader_sleep = true;
- smp_mb(); /* Ensure _sleep true before scan of ->nocb_head. */
-
/* Each pass through the following loop wakes a follower, if needed. */
for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
- if (READ_ONCE(rdp->nocb_head))
+ if (!rcu_nocb_poll &&
+ READ_ONCE(rdp->nocb_head) &&
+ READ_ONCE(my_rdp->nocb_leader_sleep)) {
+ raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags);
my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/
+ raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags);
+ }
if (!rdp->nocb_gp_head)
continue; /* No CBs, so no need to wake follower. */
/* Append callbacks to follower's "done" list. */
- tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail);
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ tail = rdp->nocb_follower_tail;
+ rdp->nocb_follower_tail = rdp->nocb_gp_tail;
*tail = rdp->nocb_gp_head;
- smp_mb__after_atomic(); /* Store *tail before wakeup. */
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
- /*
- * List was empty, wake up the follower.
- * Memory barriers supplied by atomic_long_add().
- */
+ /* List was empty, so wake up the follower. */
swake_up(&rdp->nocb_wq);
}
}
@@ -2131,28 +2157,16 @@ wait_again:
*/
static void nocb_follower_wait(struct rcu_data *rdp)
{
- bool firsttime = true;
-
for (;;) {
- if (!rcu_nocb_poll) {
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- "FollowerSleep");
- swait_event_interruptible(rdp->nocb_wq,
- READ_ONCE(rdp->nocb_follower_head));
- } else if (firsttime) {
- /* Don't drown trace log with "Poll"! */
- firsttime = false;
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "Poll");
- }
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("FollowerSleep"));
+ swait_event_interruptible(rdp->nocb_wq,
+ READ_ONCE(rdp->nocb_follower_head));
if (smp_load_acquire(&rdp->nocb_follower_head)) {
/* ^^^ Ensure CB invocation follows _head test. */
return;
}
- if (!rcu_nocb_poll)
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- "WokeEmpty");
WARN_ON(signal_pending(current));
- schedule_timeout_interruptible(1);
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WokeEmpty"));
}
}
@@ -2165,6 +2179,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
static int rcu_nocb_kthread(void *arg)
{
int c, cl;
+ unsigned long flags;
struct rcu_head *list;
struct rcu_head *next;
struct rcu_head **tail;
@@ -2179,11 +2194,14 @@ static int rcu_nocb_kthread(void *arg)
nocb_follower_wait(rdp);
/* Pull the ready-to-invoke callbacks onto local list. */
- list = READ_ONCE(rdp->nocb_follower_head);
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ list = rdp->nocb_follower_head;
+ rdp->nocb_follower_head = NULL;
+ tail = rdp->nocb_follower_tail;
+ rdp->nocb_follower_tail = &rdp->nocb_follower_head;
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
BUG_ON(!list);
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty");
- WRITE_ONCE(rdp->nocb_follower_head, NULL);
- tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head);
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WokeNonEmpty"));
/* Each pass through the following loop invokes a callback. */
trace_rcu_batch_start(rdp->rsp->name,
@@ -2226,18 +2244,39 @@ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
}
/* Do a deferred wakeup of rcu_nocb_kthread(). */
-static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
+static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
{
+ unsigned long flags;
int ndw;
- if (!rcu_nocb_need_deferred_wakeup(rdp))
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ if (!rcu_nocb_need_deferred_wakeup(rdp)) {
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
return;
+ }
ndw = READ_ONCE(rdp->nocb_defer_wakeup);
WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
- wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE);
+ __wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake"));
}
+/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
+static void do_nocb_deferred_wakeup_timer(unsigned long x)
+{
+ do_nocb_deferred_wakeup_common((struct rcu_data *)x);
+}
+
+/*
+ * Do a deferred wakeup of rcu_nocb_kthread() from fastpath.
+ * This means we do an inexact common-case check. Note that if
+ * we miss, ->nocb_timer will eventually clean things up.
+ */
+static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
+{
+ if (rcu_nocb_need_deferred_wakeup(rdp))
+ do_nocb_deferred_wakeup_common(rdp);
+}
+
void __init rcu_init_nohz(void)
{
int cpu;
@@ -2287,6 +2326,9 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
rdp->nocb_tail = &rdp->nocb_head;
init_swait_queue_head(&rdp->nocb_wq);
rdp->nocb_follower_tail = &rdp->nocb_follower_head;
+ raw_spin_lock_init(&rdp->nocb_lock);
+ setup_timer(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer,
+ (unsigned long)rdp);
}
/*
@@ -2459,7 +2501,7 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
return false;
}
-static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
+static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
struct rcu_data *rdp,
unsigned long flags)
{
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 00e77c470017..5033b66d2753 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -568,7 +568,7 @@ static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq);
static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock);
/* Track exiting tasks in order to allow them to be waited for. */
-DEFINE_SRCU(tasks_rcu_exit_srcu);
+DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu);
/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */
#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10)
@@ -875,6 +875,22 @@ static void rcu_spawn_tasks_kthread(void)
mutex_unlock(&rcu_tasks_kthread_mutex);
}
+/* Do the srcu_read_lock() for the above synchronize_srcu(). */
+void exit_tasks_rcu_start(void)
+{
+ preempt_disable();
+ current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu);
+ preempt_enable();
+}
+
+/* Do the srcu_read_unlock() for the above synchronize_srcu(). */
+void exit_tasks_rcu_finish(void)
+{
+ preempt_disable();
+ __srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx);
+ preempt_enable();
+}
+
#endif /* #ifdef CONFIG_TASKS_RCU */
#ifndef CONFIG_TINY_RCU
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 53f0164ed362..78f54932ea1d 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -25,3 +25,4 @@ obj-$(CONFIG_SCHED_DEBUG) += debug.o
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
obj-$(CONFIG_CPU_FREQ) += cpufreq.o
obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
+obj-$(CONFIG_MEMBARRIER) += membarrier.o
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index da39489d2d80..de6d7f4dfcb5 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -71,7 +71,6 @@ static inline struct autogroup *autogroup_create(void)
goto out_fail;
tg = sched_create_group(&root_task_group);
-
if (IS_ERR(tg))
goto out_free;
@@ -101,7 +100,7 @@ out_free:
out_fail:
if (printk_ratelimit()) {
printk(KERN_WARNING "autogroup_create: %s failure.\n",
- ag ? "sched_create_group()" : "kmalloc()");
+ ag ? "sched_create_group()" : "kzalloc()");
}
return autogroup_kref_get(&autogroup_default);
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 13fc5ae9bf2f..cc873075c3bd 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -32,6 +32,12 @@ void complete(struct completion *x)
unsigned long flags;
spin_lock_irqsave(&x->wait.lock, flags);
+
+ /*
+ * Perform commit of crossrelease here.
+ */
+ complete_release_commit(x);
+
if (x->done != UINT_MAX)
x->done++;
__wake_up_locked(&x->wait, TASK_NORMAL, 1);
@@ -47,6 +53,13 @@ EXPORT_SYMBOL(complete);
*
* It may be assumed that this function implies a write memory barrier before
* changing the task state if and only if any tasks are woken up.
+ *
+ * Since complete_all() sets the completion of @x permanently to done
+ * to allow multiple waiters to finish, a call to reinit_completion()
+ * must be used on @x if @x is to be used again. The code must make
+ * sure that all waiters have woken and finished before reinitializing
+ * @x. Also note that the function completion_done() can not be used
+ * to know if there are still waiters after complete_all() has been called.
*/
void complete_all(struct completion *x)
{
@@ -92,9 +105,14 @@ __wait_for_common(struct completion *x,
{
might_sleep();
+ complete_acquire(x);
+
spin_lock_irq(&x->wait.lock);
timeout = do_wait_for_common(x, action, timeout, state);
spin_unlock_irq(&x->wait.lock);
+
+ complete_release(x);
+
return timeout;
}
@@ -297,9 +315,12 @@ EXPORT_SYMBOL(try_wait_for_completion);
* Return: 0 if there are waiters (wait_for_completion() in progress)
* 1 if there are no waiters.
*
+ * Note, this will always return true if complete_all() was called on @X.
*/
bool completion_done(struct completion *x)
{
+ unsigned long flags;
+
if (!READ_ONCE(x->done))
return false;
@@ -307,14 +328,9 @@ bool completion_done(struct completion *x)
* If ->done, we need to wait for complete() to release ->wait.lock
* otherwise we can end up freeing the completion before complete()
* is done referencing it.
- *
- * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders
- * the loads of ->done and ->wait.lock such that we cannot observe
- * the lock before complete() acquires it while observing the ->done
- * after it's acquired the lock.
*/
- smp_rmb();
- spin_unlock_wait(&x->wait.lock);
+ spin_lock_irqsave(&x->wait.lock, flags);
+ spin_unlock_irqrestore(&x->wait.lock, flags);
return true;
}
EXPORT_SYMBOL(completion_done);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 17c667b427b4..6d2c7ff9ba98 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -951,8 +951,13 @@ struct migration_arg {
static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
struct task_struct *p, int dest_cpu)
{
- if (unlikely(!cpu_active(dest_cpu)))
- return rq;
+ if (p->flags & PF_KTHREAD) {
+ if (unlikely(!cpu_online(dest_cpu)))
+ return rq;
+ } else {
+ if (unlikely(!cpu_active(dest_cpu)))
+ return rq;
+ }
/* Affinity changed (again). */
if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
@@ -1967,8 +1972,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* reordered with p->state check below. This pairs with mb() in
* set_current_state() the waiting thread does.
*/
- smp_mb__before_spinlock();
raw_spin_lock_irqsave(&p->pi_lock, flags);
+ smp_mb__after_spinlock();
if (!(p->state & state))
goto out;
@@ -2069,7 +2074,7 @@ out:
/**
* try_to_wake_up_local - try to wake up a local task with rq lock held
* @p: the thread to be awakened
- * @cookie: context's cookie for pinning
+ * @rf: request-queue flags for pinning
*
* Put @p on the run-queue if it's not already there. The caller must
* ensure that this_rq() is locked, @p is bound to this_rq() and not
@@ -2635,6 +2640,16 @@ static struct rq *finish_task_switch(struct task_struct *prev)
prev_state = prev->state;
vtime_task_switch(prev);
perf_event_task_sched_in(prev, current);
+ /*
+ * The membarrier system call requires a full memory barrier
+ * after storing to rq->curr, before going back to user-space.
+ *
+ * TODO: This smp_mb__after_unlock_lock can go away if PPC end
+ * up adding a full barrier to switch_mm(), or we should figure
+ * out if a smp_mb__after_unlock_lock is really the proper API
+ * to use.
+ */
+ smp_mb__after_unlock_lock();
finish_lock_switch(rq, prev);
finish_arch_post_lock_switch();
@@ -3281,8 +3296,8 @@ static void __sched notrace __schedule(bool preempt)
* can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
* done by the caller to avoid the race with signal_wake_up().
*/
- smp_mb__before_spinlock();
rq_lock(rq, &rf);
+ smp_mb__after_spinlock();
/* Promote REQ to ACT */
rq->clock_update_flags <<= 1;
@@ -3324,6 +3339,21 @@ static void __sched notrace __schedule(bool preempt)
if (likely(prev != next)) {
rq->nr_switches++;
rq->curr = next;
+ /*
+ * The membarrier system call requires each architecture
+ * to have a full memory barrier after updating
+ * rq->curr, before returning to user-space. For TSO
+ * (e.g. x86), the architecture must provide its own
+ * barrier in switch_mm(). For weakly ordered machines
+ * for which spin_unlock() acts as a full memory
+ * barrier, finish_lock_switch() in common code takes
+ * care of this barrier. For weakly ordered machines for
+ * which spin_unlock() acts as a RELEASE barrier (only
+ * arm64 and PowerPC), arm64 has a full barrier in
+ * switch_to(), and PowerPC has
+ * smp_mb__after_unlock_lock() before
+ * finish_lock_switch().
+ */
++*switch_count;
trace_sched_switch(preempt, prev, next);
@@ -3352,8 +3382,8 @@ void __noreturn do_task_dead(void)
* To avoid it, we have to wait for releasing tsk->pi_lock which
* is held by try_to_wake_up()
*/
- smp_mb();
- raw_spin_unlock_wait(&current->pi_lock);
+ raw_spin_lock_irq(&current->pi_lock);
+ raw_spin_unlock_irq(&current->pi_lock);
/* Causes final put_task_struct in finish_task_switch(): */
__set_current_state(TASK_DEAD);
@@ -5103,24 +5133,17 @@ out_unlock:
return retval;
}
-static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
-
void sched_show_task(struct task_struct *p)
{
unsigned long free = 0;
int ppid;
- unsigned long state = p->state;
-
- /* Make sure the string lines up properly with the number of task states: */
- BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1);
if (!try_get_task_stack(p))
return;
- if (state)
- state = __ffs(state) + 1;
- printk(KERN_INFO "%-15.15s %c", p->comm,
- state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
- if (state == TASK_RUNNING)
+
+ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p));
+
+ if (p->state == TASK_RUNNING)
printk(KERN_CONT " running task ");
#ifdef CONFIG_DEBUG_STACK_USAGE
free = stack_not_used(p);
@@ -5177,11 +5200,6 @@ void show_state_filter(unsigned long state_filter)
debug_show_all_locks();
}
-void init_idle_bootup_task(struct task_struct *idle)
-{
- idle->sched_class = &idle_sched_class;
-}
-
/**
* init_idle - set up an idle thread for a given CPU
* @idle: task in question
@@ -5438,7 +5456,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
*/
next = pick_next_task(rq, &fake_task, rf);
BUG_ON(!next);
- next->sched_class->put_prev_task(rq, next);
+ put_prev_task(rq, next);
/*
* Rules for changing task_struct::cpus_allowed are holding
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index fba235c7d026..8d9562d890d3 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -119,29 +119,29 @@ static inline int cpudl_maximum(struct cpudl *cp)
* @p: the task
* @later_mask: a mask to fill in with the selected CPUs (or NULL)
*
- * Returns: int - best CPU (heap maximum if suitable)
+ * Returns: int - CPUs were found
*/
int cpudl_find(struct cpudl *cp, struct task_struct *p,
struct cpumask *later_mask)
{
- int best_cpu = -1;
const struct sched_dl_entity *dl_se = &p->dl;
if (later_mask &&
cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
- best_cpu = cpumask_any(later_mask);
- goto out;
- } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
- dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
- best_cpu = cpudl_maximum(cp);
- if (later_mask)
- cpumask_set_cpu(best_cpu, later_mask);
- }
+ return 1;
+ } else {
+ int best_cpu = cpudl_maximum(cp);
+ WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
-out:
- WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
+ if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) &&
+ dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
+ if (later_mask)
+ cpumask_set_cpu(best_cpu, later_mask);
- return best_cpu;
+ return 1;
+ }
+ }
+ return 0;
}
/*
@@ -246,7 +246,6 @@ int cpudl_init(struct cpudl *cp)
{
int i;
- memset(cp, 0, sizeof(*cp));
raw_spin_lock_init(&cp->lock);
cp->size = 0;
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 076a2e31951c..9209d83ecdcf 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -52,9 +52,11 @@ struct sugov_policy {
struct sugov_cpu {
struct update_util_data update_util;
struct sugov_policy *sg_policy;
+ unsigned int cpu;
- unsigned long iowait_boost;
- unsigned long iowait_boost_max;
+ bool iowait_boost_pending;
+ unsigned int iowait_boost;
+ unsigned int iowait_boost_max;
u64 last_update;
/* The fields below are only needed when sharing a policy. */
@@ -76,6 +78,26 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
{
s64 delta_ns;
+ /*
+ * Since cpufreq_update_util() is called with rq->lock held for
+ * the @target_cpu, our per-cpu data is fully serialized.
+ *
+ * However, drivers cannot in general deal with cross-cpu
+ * requests, so while get_next_freq() will work, our
+ * sugov_update_commit() call may not for the fast switching platforms.
+ *
+ * Hence stop here for remote requests if they aren't supported
+ * by the hardware, as calculating the frequency is pointless if
+ * we cannot in fact act on it.
+ *
+ * For the slow switching platforms, the kthread is always scheduled on
+ * the right set of CPUs and any CPU can find the next frequency and
+ * schedule the kthread.
+ */
+ if (sg_policy->policy->fast_switch_enabled &&
+ !cpufreq_can_do_remote_dvfs(sg_policy->policy))
+ return false;
+
if (sg_policy->work_in_progress)
return false;
@@ -106,7 +128,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
if (policy->fast_switch_enabled) {
next_freq = cpufreq_driver_fast_switch(policy, next_freq);
- if (next_freq == CPUFREQ_ENTRY_INVALID)
+ if (!next_freq)
return;
policy->cur = next_freq;
@@ -154,12 +176,12 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
return cpufreq_driver_resolve_freq(policy, freq);
}
-static void sugov_get_util(unsigned long *util, unsigned long *max)
+static void sugov_get_util(unsigned long *util, unsigned long *max, int cpu)
{
- struct rq *rq = this_rq();
+ struct rq *rq = cpu_rq(cpu);
unsigned long cfs_max;
- cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id());
+ cfs_max = arch_scale_cpu_capacity(NULL, cpu);
*util = min(rq->cfs.avg.util_avg, cfs_max);
*max = cfs_max;
@@ -169,30 +191,54 @@ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
unsigned int flags)
{
if (flags & SCHED_CPUFREQ_IOWAIT) {
- sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
+ if (sg_cpu->iowait_boost_pending)
+ return;
+
+ sg_cpu->iowait_boost_pending = true;
+
+ if (sg_cpu->iowait_boost) {
+ sg_cpu->iowait_boost <<= 1;
+ if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max)
+ sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
+ } else {
+ sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min;
+ }
} else if (sg_cpu->iowait_boost) {
s64 delta_ns = time - sg_cpu->last_update;
/* Clear iowait_boost if the CPU apprears to have been idle. */
- if (delta_ns > TICK_NSEC)
+ if (delta_ns > TICK_NSEC) {
sg_cpu->iowait_boost = 0;
+ sg_cpu->iowait_boost_pending = false;
+ }
}
}
static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
unsigned long *max)
{
- unsigned long boost_util = sg_cpu->iowait_boost;
- unsigned long boost_max = sg_cpu->iowait_boost_max;
+ unsigned int boost_util, boost_max;
- if (!boost_util)
+ if (!sg_cpu->iowait_boost)
return;
+ if (sg_cpu->iowait_boost_pending) {
+ sg_cpu->iowait_boost_pending = false;
+ } else {
+ sg_cpu->iowait_boost >>= 1;
+ if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) {
+ sg_cpu->iowait_boost = 0;
+ return;
+ }
+ }
+
+ boost_util = sg_cpu->iowait_boost;
+ boost_max = sg_cpu->iowait_boost_max;
+
if (*util * boost_max < *max * boost_util) {
*util = boost_util;
*max = boost_max;
}
- sg_cpu->iowait_boost >>= 1;
}
#ifdef CONFIG_NO_HZ_COMMON
@@ -229,7 +275,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
if (flags & SCHED_CPUFREQ_RT_DL) {
next_f = policy->cpuinfo.max_freq;
} else {
- sugov_get_util(&util, &max);
+ sugov_get_util(&util, &max, sg_cpu->cpu);
sugov_iowait_boost(sg_cpu, &util, &max);
next_f = get_next_freq(sg_policy, util, max);
/*
@@ -264,6 +310,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
delta_ns = time - j_sg_cpu->last_update;
if (delta_ns > TICK_NSEC) {
j_sg_cpu->iowait_boost = 0;
+ j_sg_cpu->iowait_boost_pending = false;
continue;
}
if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
@@ -290,7 +337,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
unsigned long util, max;
unsigned int next_f;
- sugov_get_util(&util, &max);
+ sugov_get_util(&util, &max, sg_cpu->cpu);
raw_spin_lock(&sg_policy->update_lock);
@@ -445,7 +492,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
}
sg_policy->thread = thread;
- kthread_bind_mask(thread, policy->related_cpus);
+
+ /* Kthread is bound to all CPUs by default */
+ if (!policy->dvfs_possible_from_any_cpu)
+ kthread_bind_mask(thread, policy->related_cpus);
+
init_irq_work(&sg_policy->irq_work, sugov_irq_work);
mutex_init(&sg_policy->work_lock);
@@ -528,16 +579,7 @@ static int sugov_init(struct cpufreq_policy *policy)
goto stop_kthread;
}
- if (policy->transition_delay_us) {
- tunables->rate_limit_us = policy->transition_delay_us;
- } else {
- unsigned int lat;
-
- tunables->rate_limit_us = LATENCY_MULTIPLIER;
- lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
- if (lat)
- tunables->rate_limit_us *= lat;
- }
+ tunables->rate_limit_us = cpufreq_policy_transition_delay_us(policy);
policy->governor_data = sg_policy;
sg_policy->tunables = tunables;
@@ -610,6 +652,11 @@ static int sugov_start(struct cpufreq_policy *policy)
sg_cpu->sg_policy = sg_policy;
sg_cpu->flags = SCHED_CPUFREQ_RT;
sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
+ }
+
+ for_each_cpu(cpu, policy->cpus) {
+ struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
+
cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
policy_is_shared(policy) ?
sugov_update_shared :
@@ -650,6 +697,7 @@ static void sugov_limits(struct cpufreq_policy *policy)
static struct cpufreq_governor schedutil_gov = {
.name = "schedutil",
.owner = THIS_MODULE,
+ .dynamic_switching = true,
.init = sugov_init,
.exit = sugov_exit,
.start = sugov_start,
@@ -666,6 +714,11 @@ struct cpufreq_governor *cpufreq_default_governor(void)
static int __init sugov_register(void)
{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ per_cpu(sugov_cpu, cpu).cpu = cpu;
+
return cpufreq_register_governor(&schedutil_gov);
}
fs_initcall(sugov_register);
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 981fcd7dc394..2511aba36b89 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -209,8 +209,6 @@ int cpupri_init(struct cpupri *cp)
{
int i;
- memset(cp, 0, sizeof(*cp));
-
for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
struct cpupri_vec *vec = &cp->pri_to_cpu[i];
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 67c70e287647..14d2dbf97c53 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -611,17 +611,23 @@ static void cputime_adjust(struct task_cputime *curr,
utime = curr->utime;
/*
- * If either stime or both stime and utime are 0, assume all runtime is
- * userspace. Once a task gets some ticks, the monotonicy code at
- * 'update' will ensure things converge to the observed ratio.
+ * If either stime or utime are 0, assume all runtime is userspace.
+ * Once a task gets some ticks, the monotonicy code at 'update:'
+ * will ensure things converge to the observed ratio.
*/
- if (stime != 0) {
- if (utime == 0)
- stime = rtime;
- else
- stime = scale_stime(stime, rtime, stime + utime);
+ if (stime == 0) {
+ utime = rtime;
+ goto update;
}
+ if (utime == 0) {
+ stime = rtime;
+ goto update;
+ }
+
+ stime = scale_stime(stime, rtime, stime + utime);
+
+update:
/*
* Make sure stime doesn't go backwards; this preserves monotonicity
* for utime because rtime is monotonic.
@@ -673,20 +679,21 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-static u64 vtime_delta(struct task_struct *tsk)
+static u64 vtime_delta(struct vtime *vtime)
{
- unsigned long now = READ_ONCE(jiffies);
+ unsigned long long clock;
- if (time_before(now, (unsigned long)tsk->vtime_snap))
+ clock = sched_clock();
+ if (clock < vtime->starttime)
return 0;
- return jiffies_to_nsecs(now - tsk->vtime_snap);
+ return clock - vtime->starttime;
}
-static u64 get_vtime_delta(struct task_struct *tsk)
+static u64 get_vtime_delta(struct vtime *vtime)
{
- unsigned long now = READ_ONCE(jiffies);
- u64 delta, other;
+ u64 delta = vtime_delta(vtime);
+ u64 other;
/*
* Unlike tick based timing, vtime based timing never has lost
@@ -695,104 +702,138 @@ static u64 get_vtime_delta(struct task_struct *tsk)
* elapsed time. Limit account_other_time to prevent rounding
* errors from causing elapsed vtime to go negative.
*/
- delta = jiffies_to_nsecs(now - tsk->vtime_snap);
other = account_other_time(delta);
- WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
- tsk->vtime_snap = now;
+ WARN_ON_ONCE(vtime->state == VTIME_INACTIVE);
+ vtime->starttime += delta;
return delta - other;
}
-static void __vtime_account_system(struct task_struct *tsk)
+static void __vtime_account_system(struct task_struct *tsk,
+ struct vtime *vtime)
+{
+ vtime->stime += get_vtime_delta(vtime);
+ if (vtime->stime >= TICK_NSEC) {
+ account_system_time(tsk, irq_count(), vtime->stime);
+ vtime->stime = 0;
+ }
+}
+
+static void vtime_account_guest(struct task_struct *tsk,
+ struct vtime *vtime)
{
- account_system_time(tsk, irq_count(), get_vtime_delta(tsk));
+ vtime->gtime += get_vtime_delta(vtime);
+ if (vtime->gtime >= TICK_NSEC) {
+ account_guest_time(tsk, vtime->gtime);
+ vtime->gtime = 0;
+ }
}
void vtime_account_system(struct task_struct *tsk)
{
- if (!vtime_delta(tsk))
+ struct vtime *vtime = &tsk->vtime;
+
+ if (!vtime_delta(vtime))
return;
- write_seqcount_begin(&tsk->vtime_seqcount);
- __vtime_account_system(tsk);
- write_seqcount_end(&tsk->vtime_seqcount);
+ write_seqcount_begin(&vtime->seqcount);
+ /* We might have scheduled out from guest path */
+ if (current->flags & PF_VCPU)
+ vtime_account_guest(tsk, vtime);
+ else
+ __vtime_account_system(tsk, vtime);
+ write_seqcount_end(&vtime->seqcount);
}
-void vtime_account_user(struct task_struct *tsk)
+void vtime_user_enter(struct task_struct *tsk)
{
- write_seqcount_begin(&tsk->vtime_seqcount);
- tsk->vtime_snap_whence = VTIME_SYS;
- if (vtime_delta(tsk))
- account_user_time(tsk, get_vtime_delta(tsk));
- write_seqcount_end(&tsk->vtime_seqcount);
+ struct vtime *vtime = &tsk->vtime;
+
+ write_seqcount_begin(&vtime->seqcount);
+ __vtime_account_system(tsk, vtime);
+ vtime->state = VTIME_USER;
+ write_seqcount_end(&vtime->seqcount);
}
-void vtime_user_enter(struct task_struct *tsk)
+void vtime_user_exit(struct task_struct *tsk)
{
- write_seqcount_begin(&tsk->vtime_seqcount);
- if (vtime_delta(tsk))
- __vtime_account_system(tsk);
- tsk->vtime_snap_whence = VTIME_USER;
- write_seqcount_end(&tsk->vtime_seqcount);
+ struct vtime *vtime = &tsk->vtime;
+
+ write_seqcount_begin(&vtime->seqcount);
+ vtime->utime += get_vtime_delta(vtime);
+ if (vtime->utime >= TICK_NSEC) {
+ account_user_time(tsk, vtime->utime);
+ vtime->utime = 0;
+ }
+ vtime->state = VTIME_SYS;
+ write_seqcount_end(&vtime->seqcount);
}
void vtime_guest_enter(struct task_struct *tsk)
{
+ struct vtime *vtime = &tsk->vtime;
/*
* The flags must be updated under the lock with
- * the vtime_snap flush and update.
+ * the vtime_starttime flush and update.
* That enforces a right ordering and update sequence
* synchronization against the reader (task_gtime())
* that can thus safely catch up with a tickless delta.
*/
- write_seqcount_begin(&tsk->vtime_seqcount);
- if (vtime_delta(tsk))
- __vtime_account_system(tsk);
+ write_seqcount_begin(&vtime->seqcount);
+ __vtime_account_system(tsk, vtime);
current->flags |= PF_VCPU;
- write_seqcount_end(&tsk->vtime_seqcount);
+ write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_enter);
void vtime_guest_exit(struct task_struct *tsk)
{
- write_seqcount_begin(&tsk->vtime_seqcount);
- __vtime_account_system(tsk);
+ struct vtime *vtime = &tsk->vtime;
+
+ write_seqcount_begin(&vtime->seqcount);
+ vtime_account_guest(tsk, vtime);
current->flags &= ~PF_VCPU;
- write_seqcount_end(&tsk->vtime_seqcount);
+ write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_exit);
void vtime_account_idle(struct task_struct *tsk)
{
- account_idle_time(get_vtime_delta(tsk));
+ account_idle_time(get_vtime_delta(&tsk->vtime));
}
void arch_vtime_task_switch(struct task_struct *prev)
{
- write_seqcount_begin(&prev->vtime_seqcount);
- prev->vtime_snap_whence = VTIME_INACTIVE;
- write_seqcount_end(&prev->vtime_seqcount);
+ struct vtime *vtime = &prev->vtime;
+
+ write_seqcount_begin(&vtime->seqcount);
+ vtime->state = VTIME_INACTIVE;
+ write_seqcount_end(&vtime->seqcount);
+
+ vtime = &current->vtime;
- write_seqcount_begin(&current->vtime_seqcount);
- current->vtime_snap_whence = VTIME_SYS;
- current->vtime_snap = jiffies;
- write_seqcount_end(&current->vtime_seqcount);
+ write_seqcount_begin(&vtime->seqcount);
+ vtime->state = VTIME_SYS;
+ vtime->starttime = sched_clock();
+ write_seqcount_end(&vtime->seqcount);
}
void vtime_init_idle(struct task_struct *t, int cpu)
{
+ struct vtime *vtime = &t->vtime;
unsigned long flags;
local_irq_save(flags);
- write_seqcount_begin(&t->vtime_seqcount);
- t->vtime_snap_whence = VTIME_SYS;
- t->vtime_snap = jiffies;
- write_seqcount_end(&t->vtime_seqcount);
+ write_seqcount_begin(&vtime->seqcount);
+ vtime->state = VTIME_SYS;
+ vtime->starttime = sched_clock();
+ write_seqcount_end(&vtime->seqcount);
local_irq_restore(flags);
}
u64 task_gtime(struct task_struct *t)
{
+ struct vtime *vtime = &t->vtime;
unsigned int seq;
u64 gtime;
@@ -800,13 +841,13 @@ u64 task_gtime(struct task_struct *t)
return t->gtime;
do {
- seq = read_seqcount_begin(&t->vtime_seqcount);
+ seq = read_seqcount_begin(&vtime->seqcount);
gtime = t->gtime;
- if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU)
- gtime += vtime_delta(t);
+ if (vtime->state == VTIME_SYS && t->flags & PF_VCPU)
+ gtime += vtime->gtime + vtime_delta(vtime);
- } while (read_seqcount_retry(&t->vtime_seqcount, seq));
+ } while (read_seqcount_retry(&vtime->seqcount, seq));
return gtime;
}
@@ -818,8 +859,9 @@ u64 task_gtime(struct task_struct *t)
*/
void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
{
- u64 delta;
+ struct vtime *vtime = &t->vtime;
unsigned int seq;
+ u64 delta;
if (!vtime_accounting_enabled()) {
*utime = t->utime;
@@ -828,25 +870,25 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
}
do {
- seq = read_seqcount_begin(&t->vtime_seqcount);
+ seq = read_seqcount_begin(&vtime->seqcount);
*utime = t->utime;
*stime = t->stime;
/* Task is sleeping, nothing to add */
- if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t))
+ if (vtime->state == VTIME_INACTIVE || is_idle_task(t))
continue;
- delta = vtime_delta(t);
+ delta = vtime_delta(vtime);
/*
* Task runs either in user or kernel space, add pending nohz time to
* the right place.
*/
- if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU)
- *utime += delta;
- else if (t->vtime_snap_whence == VTIME_SYS)
- *stime += delta;
- } while (read_seqcount_retry(&t->vtime_seqcount, seq));
+ if (vtime->state == VTIME_USER || t->flags & PF_VCPU)
+ *utime += vtime->utime + delta;
+ else if (vtime->state == VTIME_SYS)
+ *stime += vtime->stime + delta;
+ } while (read_seqcount_retry(&vtime->seqcount, seq));
}
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index a84299f44b5d..9e38df7649f4 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1136,7 +1136,7 @@ static void update_curr_dl(struct rq *rq)
}
/* kick cpufreq (see the comment in kernel/sched/sched.h). */
- cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_DL);
+ cpufreq_update_util(rq, SCHED_CPUFREQ_DL);
schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec));
@@ -1392,17 +1392,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
struct sched_dl_entity *pi_se = &p->dl;
/*
- * Use the scheduling parameters of the top pi-waiter
- * task if we have one and its (absolute) deadline is
- * smaller than our one... OTW we keep our runtime and
- * deadline.
+ * Use the scheduling parameters of the top pi-waiter task if:
+ * - we have a top pi-waiter which is a SCHED_DEADLINE task AND
+ * - our dl_boosted is set (i.e. the pi-waiter's (absolute) deadline is
+ * smaller than our deadline OR we are a !SCHED_DEADLINE task getting
+ * boosted due to a SCHED_DEADLINE pi-waiter).
+ * Otherwise we keep our runtime and deadline.
*/
- if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) {
+ if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) {
pi_se = &pi_task->dl;
} else if (!dl_prio(p->normal_prio)) {
/*
* Special case in which we have a !SCHED_DEADLINE task
- * that is going to be deboosted, but exceedes its
+ * that is going to be deboosted, but exceeds its
* runtime while doing so. No point in replenishing
* it, as it's going to return back to its original
* scheduling class after this.
@@ -1592,7 +1594,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
* let's hope p can move out.
*/
if (rq->curr->nr_cpus_allowed == 1 ||
- cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1)
+ !cpudl_find(&rq->rd->cpudl, rq->curr, NULL))
return;
/*
@@ -1600,7 +1602,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
* see if it is pushed or pulled somewhere else.
*/
if (p->nr_cpus_allowed != 1 &&
- cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
+ cpudl_find(&rq->rd->cpudl, p, NULL))
return;
resched_curr(rq);
@@ -1653,7 +1655,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
return rb_entry(left, struct sched_dl_entity, rb_node);
}
-struct task_struct *
+static struct task_struct *
pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct sched_dl_entity *dl_se;
@@ -1796,7 +1798,7 @@ static int find_later_rq(struct task_struct *task)
struct sched_domain *sd;
struct cpumask *later_mask = this_cpu_cpumask_var_ptr(local_cpu_mask_dl);
int this_cpu = smp_processor_id();
- int best_cpu, cpu = task_cpu(task);
+ int cpu = task_cpu(task);
/* Make sure the mask is initialized first */
if (unlikely(!later_mask))
@@ -1809,17 +1811,14 @@ static int find_later_rq(struct task_struct *task)
* We have to consider system topology and task affinity
* first, then we can look for a suitable cpu.
*/
- best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
- task, later_mask);
- if (best_cpu == -1)
+ if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask))
return -1;
/*
- * If we are here, some target has been found,
- * the most suitable of which is cached in best_cpu.
- * This is, among the runqueues where the current tasks
- * have later deadlines than the task's one, the rq
- * with the latest possible one.
+ * If we are here, some targets have been found, including
+ * the most suitable which is, among the runqueues where the
+ * current tasks have later deadlines than the task's one, the
+ * rq with the latest possible one.
*
* Now we check how well this matches with task's
* affinity and system topology.
@@ -1839,6 +1838,7 @@ static int find_later_rq(struct task_struct *task)
rcu_read_lock();
for_each_domain(cpu, sd) {
if (sd->flags & SD_WAKE_AFFINE) {
+ int best_cpu;
/*
* If possible, preempting this_cpu is
@@ -1850,12 +1850,15 @@ static int find_later_rq(struct task_struct *task)
return this_cpu;
}
+ best_cpu = cpumask_first_and(later_mask,
+ sched_domain_span(sd));
/*
- * Last chance: if best_cpu is valid and is
- * in the mask, that becomes our choice.
+ * Last chance: if a cpu being in both later_mask
+ * and current sd span is valid, that becomes our
+ * choice. Of course, the latest possible cpu is
+ * already under consideration through later_mask.
*/
- if (best_cpu < nr_cpu_ids &&
- cpumask_test_cpu(best_cpu, sched_domain_span(sd))) {
+ if (best_cpu < nr_cpu_ids) {
rcu_read_unlock();
return best_cpu;
}
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4fa66de52bd6..4a23bbc3111b 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -327,38 +327,78 @@ static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
return table;
}
+static cpumask_var_t sd_sysctl_cpus;
static struct ctl_table_header *sd_sysctl_header;
+
void register_sched_domain_sysctl(void)
{
- int i, cpu_num = num_possible_cpus();
- struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
+ static struct ctl_table *cpu_entries;
+ static struct ctl_table **cpu_idx;
char buf[32];
+ int i;
- WARN_ON(sd_ctl_dir[0].child);
- sd_ctl_dir[0].child = entry;
+ if (!cpu_entries) {
+ cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1);
+ if (!cpu_entries)
+ return;
- if (entry == NULL)
- return;
+ WARN_ON(sd_ctl_dir[0].child);
+ sd_ctl_dir[0].child = cpu_entries;
+ }
- for_each_possible_cpu(i) {
- snprintf(buf, 32, "cpu%d", i);
- entry->procname = kstrdup(buf, GFP_KERNEL);
- entry->mode = 0555;
- entry->child = sd_alloc_ctl_cpu_table(i);
- entry++;
+ if (!cpu_idx) {
+ struct ctl_table *e = cpu_entries;
+
+ cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL);
+ if (!cpu_idx)
+ return;
+
+ /* deal with sparse possible map */
+ for_each_possible_cpu(i) {
+ cpu_idx[i] = e;
+ e++;
+ }
+ }
+
+ if (!cpumask_available(sd_sysctl_cpus)) {
+ if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL))
+ return;
+
+ /* init to possible to not have holes in @cpu_entries */
+ cpumask_copy(sd_sysctl_cpus, cpu_possible_mask);
+ }
+
+ for_each_cpu(i, sd_sysctl_cpus) {
+ struct ctl_table *e = cpu_idx[i];
+
+ if (e->child)
+ sd_free_ctl_entry(&e->child);
+
+ if (!e->procname) {
+ snprintf(buf, 32, "cpu%d", i);
+ e->procname = kstrdup(buf, GFP_KERNEL);
+ }
+ e->mode = 0555;
+ e->child = sd_alloc_ctl_cpu_table(i);
+
+ __cpumask_clear_cpu(i, sd_sysctl_cpus);
}
WARN_ON(sd_sysctl_header);
sd_sysctl_header = register_sysctl_table(sd_ctl_root);
}
+void dirty_sched_domain_sysctl(int cpu)
+{
+ if (cpumask_available(sd_sysctl_cpus))
+ __cpumask_set_cpu(cpu, sd_sysctl_cpus);
+}
+
/* may be called multiple times per register */
void unregister_sched_domain_sysctl(void)
{
unregister_sysctl_table(sd_sysctl_header);
sd_sysctl_header = NULL;
- if (sd_ctl_dir[0].child)
- sd_free_ctl_entry(&sd_ctl_dir[0].child);
}
#endif /* CONFIG_SYSCTL */
#endif /* CONFIG_SMP */
@@ -421,13 +461,15 @@ static char *task_group_path(struct task_group *tg)
}
#endif
+static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
+
static void
print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
{
if (rq->curr == p)
- SEQ_printf(m, "R");
+ SEQ_printf(m, ">R");
else
- SEQ_printf(m, " ");
+ SEQ_printf(m, " %c", task_state_to_char(p));
SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
p->comm, task_pid_nr(p),
@@ -456,9 +498,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
SEQ_printf(m,
"\nrunnable tasks:\n"
- " task PID tree-key switches prio"
+ " S task PID tree-key switches prio"
" wait-time sum-exec sum-sleep\n"
- "------------------------------------------------------"
+ "-------------------------------------------------------"
"----------------------------------------------------\n");
rcu_read_lock();
@@ -872,11 +914,12 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
#endif
}
-void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
+void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
+ struct seq_file *m)
{
unsigned long nr_switches;
- SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p),
+ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns),
get_nr_threads(p));
SEQ_printf(m,
"---------------------------------------------------------"
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 008c514dc241..8bc0a883d190 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -806,7 +806,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
/*
* For !fair tasks do:
*
- update_cfs_rq_load_avg(now, cfs_rq, false);
+ update_cfs_rq_load_avg(now, cfs_rq);
attach_entity_load_avg(cfs_rq, se);
switched_from_fair(rq, p);
*
@@ -1071,6 +1071,29 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;
+struct numa_group {
+ atomic_t refcount;
+
+ spinlock_t lock; /* nr_tasks, tasks */
+ int nr_tasks;
+ pid_t gid;
+ int active_nodes;
+
+ struct rcu_head rcu;
+ unsigned long total_faults;
+ unsigned long max_faults_cpu;
+ /*
+ * Faults_cpu is used to decide whether memory should move
+ * towards the CPU. As a consequence, these stats are weighted
+ * more by CPU use than by memory faults.
+ */
+ unsigned long *faults_cpu;
+ unsigned long faults[0];
+};
+
+static inline unsigned long group_faults_priv(struct numa_group *ng);
+static inline unsigned long group_faults_shared(struct numa_group *ng);
+
static unsigned int task_nr_scan_windows(struct task_struct *p)
{
unsigned long rss = 0;
@@ -1107,13 +1130,47 @@ static unsigned int task_scan_min(struct task_struct *p)
return max_t(unsigned int, floor, scan);
}
+static unsigned int task_scan_start(struct task_struct *p)
+{
+ unsigned long smin = task_scan_min(p);
+ unsigned long period = smin;
+
+ /* Scale the maximum scan period with the amount of shared memory. */
+ if (p->numa_group) {
+ struct numa_group *ng = p->numa_group;
+ unsigned long shared = group_faults_shared(ng);
+ unsigned long private = group_faults_priv(ng);
+
+ period *= atomic_read(&ng->refcount);
+ period *= shared + 1;
+ period /= private + shared + 1;
+ }
+
+ return max(smin, period);
+}
+
static unsigned int task_scan_max(struct task_struct *p)
{
- unsigned int smin = task_scan_min(p);
- unsigned int smax;
+ unsigned long smin = task_scan_min(p);
+ unsigned long smax;
/* Watch for min being lower than max due to floor calculations */
smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
+
+ /* Scale the maximum scan period with the amount of shared memory. */
+ if (p->numa_group) {
+ struct numa_group *ng = p->numa_group;
+ unsigned long shared = group_faults_shared(ng);
+ unsigned long private = group_faults_priv(ng);
+ unsigned long period = smax;
+
+ period *= atomic_read(&ng->refcount);
+ period *= shared + 1;
+ period /= private + shared + 1;
+
+ smax = max(smax, period);
+ }
+
return max(smin, smax);
}
@@ -1129,26 +1186,6 @@ static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
}
-struct numa_group {
- atomic_t refcount;
-
- spinlock_t lock; /* nr_tasks, tasks */
- int nr_tasks;
- pid_t gid;
- int active_nodes;
-
- struct rcu_head rcu;
- unsigned long total_faults;
- unsigned long max_faults_cpu;
- /*
- * Faults_cpu is used to decide whether memory should move
- * towards the CPU. As a consequence, these stats are weighted
- * more by CPU use than by memory faults.
- */
- unsigned long *faults_cpu;
- unsigned long faults[0];
-};
-
/* Shared or private faults. */
#define NR_NUMA_HINT_FAULT_TYPES 2
@@ -1198,6 +1235,30 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
}
+static inline unsigned long group_faults_priv(struct numa_group *ng)
+{
+ unsigned long faults = 0;
+ int node;
+
+ for_each_online_node(node) {
+ faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
+ }
+
+ return faults;
+}
+
+static inline unsigned long group_faults_shared(struct numa_group *ng)
+{
+ unsigned long faults = 0;
+ int node;
+
+ for_each_online_node(node) {
+ faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
+ }
+
+ return faults;
+}
+
/*
* A node triggering more than 1/3 as many NUMA faults as the maximum is
* considered part of a numa group's pseudo-interleaving set. Migrations
@@ -1378,7 +1439,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}
-static unsigned long weighted_cpuload(const int cpu);
+static unsigned long weighted_cpuload(struct rq *rq);
static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
static unsigned long capacity_of(int cpu);
@@ -1409,7 +1470,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
struct rq *rq = cpu_rq(cpu);
ns->nr_running += rq->nr_running;
- ns->load += weighted_cpuload(cpu);
+ ns->load += weighted_cpuload(rq);
ns->compute_capacity += capacity_of(cpu);
cpus++;
@@ -1808,7 +1869,7 @@ static int task_numa_migrate(struct task_struct *p)
* Reset the scan period if the task is being rescheduled on an
* alternative node to recheck if the tasks is now properly placed.
*/
- p->numa_scan_period = task_scan_min(p);
+ p->numa_scan_period = task_scan_start(p);
if (env.best_task == NULL) {
ret = migrate_task_to(p, env.best_cpu);
@@ -1892,7 +1953,7 @@ static void update_task_scan_period(struct task_struct *p,
unsigned long shared, unsigned long private)
{
unsigned int period_slot;
- int ratio;
+ int lr_ratio, ps_ratio;
int diff;
unsigned long remote = p->numa_faults_locality[0];
@@ -1922,25 +1983,36 @@ static void update_task_scan_period(struct task_struct *p,
* >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
*/
period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
- ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
- if (ratio >= NUMA_PERIOD_THRESHOLD) {
- int slot = ratio - NUMA_PERIOD_THRESHOLD;
+ lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
+ ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
+
+ if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
+ /*
+ * Most memory accesses are local. There is no need to
+ * do fast NUMA scanning, since memory is already local.
+ */
+ int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
+ if (!slot)
+ slot = 1;
+ diff = slot * period_slot;
+ } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
+ /*
+ * Most memory accesses are shared with other tasks.
+ * There is no point in continuing fast NUMA scanning,
+ * since other tasks may just move the memory elsewhere.
+ */
+ int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
if (!slot)
slot = 1;
diff = slot * period_slot;
} else {
- diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
-
/*
- * Scale scan rate increases based on sharing. There is an
- * inverse relationship between the degree of sharing and
- * the adjustment made to the scanning period. Broadly
- * speaking the intent is that there is little point
- * scanning faster if shared accesses dominate as it may
- * simply bounce migrations uselessly
+ * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
+ * yet they are not on the local NUMA node. Speed up
+ * NUMA scanning to get the memory moved over.
*/
- ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
- diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
+ int ratio = max(lr_ratio, ps_ratio);
+ diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
}
p->numa_scan_period = clamp(p->numa_scan_period + diff,
@@ -2448,7 +2520,7 @@ void task_numa_work(struct callback_head *work)
if (p->numa_scan_period == 0) {
p->numa_scan_period_max = task_scan_max(p);
- p->numa_scan_period = task_scan_min(p);
+ p->numa_scan_period = task_scan_start(p);
}
next_scan = now + msecs_to_jiffies(p->numa_scan_period);
@@ -2576,7 +2648,7 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
if (now > curr->node_stamp + period) {
if (!curr->node_stamp)
- curr->numa_scan_period = task_scan_min(curr);
+ curr->numa_scan_period = task_scan_start(curr);
curr->node_stamp += period;
if (!time_before(jiffies, curr->mm->numa_next_scan)) {
@@ -2586,59 +2658,6 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
}
}
-/*
- * Can a task be moved from prev_cpu to this_cpu without causing a load
- * imbalance that would trigger the load balancer?
- */
-static inline bool numa_wake_affine(struct sched_domain *sd,
- struct task_struct *p, int this_cpu,
- int prev_cpu, int sync)
-{
- struct numa_stats prev_load, this_load;
- s64 this_eff_load, prev_eff_load;
-
- update_numa_stats(&prev_load, cpu_to_node(prev_cpu));
- update_numa_stats(&this_load, cpu_to_node(this_cpu));
-
- /*
- * If sync wakeup then subtract the (maximum possible)
- * effect of the currently running task from the load
- * of the current CPU:
- */
- if (sync) {
- unsigned long current_load = task_h_load(current);
-
- if (this_load.load > current_load)
- this_load.load -= current_load;
- else
- this_load.load = 0;
- }
-
- /*
- * In low-load situations, where this_cpu's node is idle due to the
- * sync cause above having dropped this_load.load to 0, move the task.
- * Moving to an idle socket will not create a bad imbalance.
- *
- * Otherwise check if the nodes are near enough in load to allow this
- * task to be woken on this_cpu's node.
- */
- if (this_load.load > 0) {
- unsigned long task_load = task_h_load(p);
-
- this_eff_load = 100;
- this_eff_load *= prev_load.compute_capacity;
-
- prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
- prev_eff_load *= this_load.compute_capacity;
-
- this_eff_load *= this_load.load + task_load;
- prev_eff_load *= prev_load.load - task_load;
-
- return this_eff_load <= prev_eff_load;
- }
-
- return true;
-}
#else
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
@@ -2652,14 +2671,6 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
}
-#ifdef CONFIG_SMP
-static inline bool numa_wake_affine(struct sched_domain *sd,
- struct task_struct *p, int this_cpu,
- int prev_cpu, int sync)
-{
- return true;
-}
-#endif /* !SMP */
#endif /* CONFIG_NUMA_BALANCING */
static void
@@ -2790,6 +2801,31 @@ static inline void update_cfs_shares(struct sched_entity *se)
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
+static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
+{
+ struct rq *rq = rq_of(cfs_rq);
+
+ if (&rq->cfs == cfs_rq) {
+ /*
+ * There are a few boundary cases this might miss but it should
+ * get called often enough that that should (hopefully) not be
+ * a real problem -- added to that it only calls on the local
+ * CPU, so if we enqueue remotely we'll miss an update, but
+ * the next tick/schedule should update.
+ *
+ * It will not get called when we go idle, because the idle
+ * thread is a different class (!fair), nor will the utilization
+ * number include things like RT tasks.
+ *
+ * As is, the util number is not freq-invariant (we'd have to
+ * implement arch_scale_freq_capacity() for that).
+ *
+ * See cpu_util().
+ */
+ cpufreq_update_util(rq, 0);
+ }
+}
+
#ifdef CONFIG_SMP
/*
* Approximate:
@@ -2968,6 +3004,18 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
sa->last_update_time += delta << 10;
/*
+ * running is a subset of runnable (weight) so running can't be set if
+ * runnable is clear. But there are some corner cases where the current
+ * se has been already dequeued but cfs_rq->curr still points to it.
+ * This means that weight will be 0 but not running for a sched_entity
+ * but also for a cfs_rq if the latter becomes idle. As an example,
+ * this happens during idle_balance() which calls
+ * update_blocked_averages()
+ */
+ if (!weight)
+ running = 0;
+
+ /*
* Now we know we crossed measurement unit boundaries. The *_avg
* accrues by two steps:
*
@@ -3276,29 +3324,6 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
#endif /* CONFIG_FAIR_GROUP_SCHED */
-static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
-{
- if (&this_rq()->cfs == cfs_rq) {
- /*
- * There are a few boundary cases this might miss but it should
- * get called often enough that that should (hopefully) not be
- * a real problem -- added to that it only calls on the local
- * CPU, so if we enqueue remotely we'll miss an update, but
- * the next tick/schedule should update.
- *
- * It will not get called when we go idle, because the idle
- * thread is a different class (!fair), nor will the utilization
- * number include things like RT tasks.
- *
- * As is, the util number is not freq-invariant (we'd have to
- * implement arch_scale_freq_capacity() for that).
- *
- * See cpu_util().
- */
- cpufreq_update_util(rq_of(cfs_rq), 0);
- }
-}
-
/*
* Unsigned subtract and clamp on underflow.
*
@@ -3320,7 +3345,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
* update_cfs_rq_load_avg - update the cfs_rq's load/util averages
* @now: current time, as per cfs_rq_clock_task()
* @cfs_rq: cfs_rq to update
- * @update_freq: should we call cfs_rq_util_change() or will the call do so
*
* The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
* avg. The immediate corollary is that all (fair) tasks must be attached, see
@@ -3334,7 +3358,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
* call update_tg_load_avg() when this function returns true.
*/
static inline int
-update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
struct sched_avg *sa = &cfs_rq->avg;
int decayed, removed_load = 0, removed_util = 0;
@@ -3362,7 +3386,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
cfs_rq->load_last_update_time_copy = sa->last_update_time;
#endif
- if (update_freq && (decayed || removed_util))
+ if (decayed || removed_util)
cfs_rq_util_change(cfs_rq);
return decayed || removed_load;
@@ -3390,7 +3414,7 @@ static inline void update_load_avg(struct sched_entity *se, int flags)
if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
__update_load_avg_se(now, cpu, cfs_rq, se);
- decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
+ decayed = update_cfs_rq_load_avg(now, cfs_rq);
decayed |= propagate_entity_load_avg(se);
if (decayed && (flags & UPDATE_TG))
@@ -3534,7 +3558,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
#else /* CONFIG_SMP */
static inline int
-update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
return 0;
}
@@ -3544,7 +3568,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
static inline void update_load_avg(struct sched_entity *se, int not_used1)
{
- cpufreq_update_util(rq_of(cfs_rq_of(se)), 0);
+ cfs_rq_util_change(cfs_rq_of(se));
}
static inline void
@@ -4875,7 +4899,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
* passed.
*/
if (p->in_iowait)
- cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
+ cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
for_each_sched_entity(se) {
if (se->on_rq)
@@ -5125,9 +5149,9 @@ static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
}
/* Used instead of source_load when we know the type == 0 */
-static unsigned long weighted_cpuload(const int cpu)
+static unsigned long weighted_cpuload(struct rq *rq)
{
- return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
+ return cfs_rq_runnable_load_avg(&rq->cfs);
}
#ifdef CONFIG_NO_HZ_COMMON
@@ -5172,7 +5196,7 @@ static void cpu_load_update_idle(struct rq *this_rq)
/*
* bail if there's load or we're actually up-to-date.
*/
- if (weighted_cpuload(cpu_of(this_rq)))
+ if (weighted_cpuload(this_rq))
return;
cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
@@ -5193,7 +5217,7 @@ void cpu_load_update_nohz_start(void)
* concurrently we'll exit nohz. And cpu_load write can race with
* cpu_load_update_idle() but both updater would be writing the same.
*/
- this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq));
+ this_rq->cpu_load[0] = weighted_cpuload(this_rq);
}
/*
@@ -5209,7 +5233,7 @@ void cpu_load_update_nohz_stop(void)
if (curr_jiffies == this_rq->last_load_update_tick)
return;
- load = weighted_cpuload(cpu_of(this_rq));
+ load = weighted_cpuload(this_rq);
rq_lock(this_rq, &rf);
update_rq_clock(this_rq);
cpu_load_update_nohz(this_rq, curr_jiffies, load);
@@ -5235,7 +5259,7 @@ static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
*/
void cpu_load_update_active(struct rq *this_rq)
{
- unsigned long load = weighted_cpuload(cpu_of(this_rq));
+ unsigned long load = weighted_cpuload(this_rq);
if (tick_nohz_tick_stopped())
cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
@@ -5253,7 +5277,7 @@ void cpu_load_update_active(struct rq *this_rq)
static unsigned long source_load(int cpu, int type)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long total = weighted_cpuload(cpu);
+ unsigned long total = weighted_cpuload(rq);
if (type == 0 || !sched_feat(LB_BIAS))
return total;
@@ -5268,7 +5292,7 @@ static unsigned long source_load(int cpu, int type)
static unsigned long target_load(int cpu, int type)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long total = weighted_cpuload(cpu);
+ unsigned long total = weighted_cpuload(rq);
if (type == 0 || !sched_feat(LB_BIAS))
return total;
@@ -5290,7 +5314,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
- unsigned long load_avg = weighted_cpuload(cpu);
+ unsigned long load_avg = weighted_cpuload(rq);
if (nr_running)
return load_avg / nr_running;
@@ -5345,20 +5369,115 @@ static int wake_wide(struct task_struct *p)
return 1;
}
+struct llc_stats {
+ unsigned long nr_running;
+ unsigned long load;
+ unsigned long capacity;
+ int has_capacity;
+};
+
+static bool get_llc_stats(struct llc_stats *stats, int cpu)
+{
+ struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+
+ if (!sds)
+ return false;
+
+ stats->nr_running = READ_ONCE(sds->nr_running);
+ stats->load = READ_ONCE(sds->load);
+ stats->capacity = READ_ONCE(sds->capacity);
+ stats->has_capacity = stats->nr_running < per_cpu(sd_llc_size, cpu);
+
+ return true;
+}
+
+/*
+ * Can a task be moved from prev_cpu to this_cpu without causing a load
+ * imbalance that would trigger the load balancer?
+ *
+ * Since we're running on 'stale' values, we might in fact create an imbalance
+ * but recomputing these values is expensive, as that'd mean iteration 2 cache
+ * domains worth of CPUs.
+ */
+static bool
+wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
+ int this_cpu, int prev_cpu, int sync)
+{
+ struct llc_stats prev_stats, this_stats;
+ s64 this_eff_load, prev_eff_load;
+ unsigned long task_load;
+
+ if (!get_llc_stats(&prev_stats, prev_cpu) ||
+ !get_llc_stats(&this_stats, this_cpu))
+ return false;
+
+ /*
+ * If sync wakeup then subtract the (maximum possible)
+ * effect of the currently running task from the load
+ * of the current LLC.
+ */
+ if (sync) {
+ unsigned long current_load = task_h_load(current);
+
+ /* in this case load hits 0 and this LLC is considered 'idle' */
+ if (current_load > this_stats.load)
+ return true;
+
+ this_stats.load -= current_load;
+ }
+
+ /*
+ * The has_capacity stuff is not SMT aware, but by trying to balance
+ * the nr_running on both ends we try and fill the domain at equal
+ * rates, thereby first consuming cores before siblings.
+ */
+
+ /* if the old cache has capacity, stay there */
+ if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
+ return false;
+
+ /* if this cache has capacity, come here */
+ if (this_stats.has_capacity && this_stats.nr_running < prev_stats.nr_running+1)
+ return true;
+
+ /*
+ * Check to see if we can move the load without causing too much
+ * imbalance.
+ */
+ task_load = task_h_load(p);
+
+ this_eff_load = 100;
+ this_eff_load *= prev_stats.capacity;
+
+ prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+ prev_eff_load *= this_stats.capacity;
+
+ this_eff_load *= this_stats.load + task_load;
+ prev_eff_load *= prev_stats.load - task_load;
+
+ return this_eff_load <= prev_eff_load;
+}
+
static int wake_affine(struct sched_domain *sd, struct task_struct *p,
int prev_cpu, int sync)
{
int this_cpu = smp_processor_id();
- bool affine = false;
+ bool affine;
/*
- * Common case: CPUs are in the same socket, and select_idle_sibling()
- * will do its thing regardless of what we return:
+ * Default to no affine wakeups; wake_affine() should not effect a task
+ * placement the load-balancer feels inclined to undo. The conservative
+ * option is therefore to not move tasks when they wake up.
*/
- if (cpus_share_cache(prev_cpu, this_cpu))
- affine = true;
- else
- affine = numa_wake_affine(sd, p, this_cpu, prev_cpu, sync);
+ affine = false;
+
+ /*
+ * If the wakeup is across cache domains, try to evaluate if movement
+ * makes sense, otherwise rely on select_idle_siblings() to do
+ * placement inside the cache domain.
+ */
+ if (!cpus_share_cache(prev_cpu, this_cpu))
+ affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
if (affine) {
@@ -5550,7 +5669,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
shallowest_idle_cpu = i;
}
} else if (shallowest_idle_cpu == -1) {
- load = weighted_cpuload(i);
+ load = weighted_cpuload(cpu_rq(i));
if (load < min_load || (load == min_load && i == this_cpu)) {
min_load = load;
least_loaded_cpu = i;
@@ -6187,10 +6306,10 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
int new_tasks;
again:
-#ifdef CONFIG_FAIR_GROUP_SCHED
if (!cfs_rq->nr_running)
goto idle;
+#ifdef CONFIG_FAIR_GROUP_SCHED
if (prev->sched_class != &fair_sched_class)
goto simple;
@@ -6220,11 +6339,17 @@ again:
/*
* This call to check_cfs_rq_runtime() will do the
* throttle and dequeue its entity in the parent(s).
- * Therefore the 'simple' nr_running test will indeed
+ * Therefore the nr_running test will indeed
* be correct.
*/
- if (unlikely(check_cfs_rq_runtime(cfs_rq)))
+ if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
+ cfs_rq = &rq->cfs;
+
+ if (!cfs_rq->nr_running)
+ goto idle;
+
goto simple;
+ }
}
se = pick_next_entity(cfs_rq, curr);
@@ -6264,12 +6389,8 @@ again:
return p;
simple:
- cfs_rq = &rq->cfs;
#endif
- if (!cfs_rq->nr_running)
- goto idle;
-
put_prev_task(rq, prev);
do {
@@ -6646,10 +6767,10 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
* our sched_group. We may want to revisit it if we couldn't
* meet load balance goals by pulling other tasks on src_cpu.
*
- * Also avoid computing new_dst_cpu if we have already computed
- * one in current iteration.
+ * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have
+ * already computed one in current iteration.
*/
- if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
+ if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
return 0;
/* Prevent to re-select dst_cpu via env's cpus */
@@ -6917,7 +7038,7 @@ static void update_blocked_averages(int cpu)
if (throttled_hierarchy(cfs_rq))
continue;
- if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
+ if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
update_tg_load_avg(cfs_rq, 0);
/* Propagate pending load changes to the parent, if any: */
@@ -6990,7 +7111,7 @@ static inline void update_blocked_averages(int cpu)
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
- update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
+ update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
rq_unlock_irqrestore(rq, &rf);
}
@@ -7036,6 +7157,7 @@ struct sg_lb_stats {
struct sd_lb_stats {
struct sched_group *busiest; /* Busiest group in this sd */
struct sched_group *local; /* Local group in this sd */
+ unsigned long total_running;
unsigned long total_load; /* Total load of all groups in sd */
unsigned long total_capacity; /* Total capacity of all groups in sd */
unsigned long avg_load; /* Average load across all groups in sd */
@@ -7055,6 +7177,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
*sds = (struct sd_lb_stats){
.busiest = NULL,
.local = NULL,
+ .total_running = 0UL,
.total_load = 0UL,
.total_capacity = 0UL,
.busiest_stat = {
@@ -7363,7 +7486,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->nr_numa_running += rq->nr_numa_running;
sgs->nr_preferred_running += rq->nr_preferred_running;
#endif
- sgs->sum_weighted_load += weighted_cpuload(i);
+ sgs->sum_weighted_load += weighted_cpuload(rq);
/*
* No need to call idle_cpu() if nr_running is not 0
*/
@@ -7490,6 +7613,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
*/
static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
+ struct sched_domain_shared *shared = env->sd->shared;
struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat;
@@ -7546,6 +7670,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
next_group:
/* Now, start updating sd_lb_stats */
+ sds->total_running += sgs->sum_nr_running;
sds->total_load += sgs->group_load;
sds->total_capacity += sgs->group_capacity;
@@ -7561,6 +7686,21 @@ next_group:
env->dst_rq->rd->overload = overload;
}
+ if (!shared)
+ return;
+
+ /*
+ * Since these are sums over groups they can contain some CPUs
+ * multiple times for the NUMA domains.
+ *
+ * Currently only wake_affine_llc() and find_busiest_group()
+ * uses these numbers, only the last is affected by this problem.
+ *
+ * XXX fix that.
+ */
+ WRITE_ONCE(shared->nr_running, sds->total_running);
+ WRITE_ONCE(shared->load, sds->total_load);
+ WRITE_ONCE(shared->capacity, sds->total_capacity);
}
/**
@@ -7790,6 +7930,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
if (!sds.busiest || busiest->sum_nr_running == 0)
goto out_balanced;
+ /* XXX broken for overlapping NUMA groups */
sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
/ sds.total_capacity;
@@ -7892,7 +8033,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
capacity = capacity_of(i);
- wl = weighted_cpuload(i);
+ wl = weighted_cpuload(rq);
/*
* When comparing with imbalance, use weighted_cpuload()
@@ -8022,14 +8163,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
.tasks = LIST_HEAD_INIT(env.tasks),
};
- /*
- * For NEWLY_IDLE load_balancing, we don't need to consider
- * other cpus in our group
- */
- if (idle == CPU_NEWLY_IDLE)
- env.dst_grpmask = NULL;
-
- cpumask_copy(cpus, cpu_active_mask);
+ cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
schedstat_inc(sd->lb_count[idle]);
@@ -8151,7 +8285,15 @@ more_balance:
/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(env.flags & LBF_ALL_PINNED)) {
cpumask_clear_cpu(cpu_of(busiest), cpus);
- if (!cpumask_empty(cpus)) {
+ /*
+ * Attempting to continue load balancing at the current
+ * sched_domain level only makes sense if there are
+ * active CPUs remaining as possible busiest CPUs to
+ * pull load from which are not contained within the
+ * destination group that is receiving any migrated
+ * load.
+ */
+ if (!cpumask_subset(cpus, env.dst_grpmask)) {
env.loop = 0;
env.loop_break = sched_nr_migrate_break;
goto redo;
@@ -8447,6 +8589,13 @@ static int active_load_balance_cpu_stop(void *data)
.src_cpu = busiest_rq->cpu,
.src_rq = busiest_rq,
.idle = CPU_IDLE,
+ /*
+ * can_migrate_task() doesn't need to compute new_dst_cpu
+ * for active balancing. Since we have CPU_IDLE, but no
+ * @dst_grpmask we need to make that test go away with lying
+ * about DST_PINNED.
+ */
+ .flags = LBF_DST_PINNED,
};
schedstat_inc(sd->alb_count);
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 6c23e30c0e5c..257f4f0b4532 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -158,7 +158,7 @@ static void cpuidle_idle_call(void)
}
/*
- * Suspend-to-idle ("freeze") is a system state in which all user space
+ * Suspend-to-idle ("s2idle") is a system state in which all user space
* has been frozen, all I/O devices have been suspended and the only
* activity happens here and in iterrupts (if any). In that case bypass
* the cpuidle governor and go stratight for the deepest idle state
@@ -167,9 +167,9 @@ static void cpuidle_idle_call(void)
* until a proper wakeup interrupt happens.
*/
- if (idle_should_freeze() || dev->use_deepest_state) {
- if (idle_should_freeze()) {
- entered_state = cpuidle_enter_freeze(drv, dev);
+ if (idle_should_enter_s2idle() || dev->use_deepest_state) {
+ if (idle_should_enter_s2idle()) {
+ entered_state = cpuidle_enter_s2idle(drv, dev);
if (entered_state > 0) {
local_irq_enable();
goto exit_idle;
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
new file mode 100644
index 000000000000..a92fddc22747
--- /dev/null
+++ b/kernel/sched/membarrier.c
@@ -0,0 +1,152 @@
+/*
+ * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ *
+ * membarrier system call
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/syscalls.h>
+#include <linux/membarrier.h>
+#include <linux/tick.h>
+#include <linux/cpumask.h>
+
+#include "sched.h" /* for cpu_rq(). */
+
+/*
+ * Bitmask made from a "or" of all commands within enum membarrier_cmd,
+ * except MEMBARRIER_CMD_QUERY.
+ */
+#define MEMBARRIER_CMD_BITMASK \
+ (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED)
+
+static void ipi_mb(void *info)
+{
+ smp_mb(); /* IPIs should be serializing but paranoid. */
+}
+
+static void membarrier_private_expedited(void)
+{
+ int cpu;
+ bool fallback = false;
+ cpumask_var_t tmpmask;
+
+ if (num_online_cpus() == 1)
+ return;
+
+ /*
+ * Matches memory barriers around rq->curr modification in
+ * scheduler.
+ */
+ smp_mb(); /* system call entry is not a mb. */
+
+ /*
+ * Expedited membarrier commands guarantee that they won't
+ * block, hence the GFP_NOWAIT allocation flag and fallback
+ * implementation.
+ */
+ if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
+ /* Fallback for OOM. */
+ fallback = true;
+ }
+
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ struct task_struct *p;
+
+ /*
+ * Skipping the current CPU is OK even through we can be
+ * migrated at any point. The current CPU, at the point
+ * where we read raw_smp_processor_id(), is ensured to
+ * be in program order with respect to the caller
+ * thread. Therefore, we can skip this CPU from the
+ * iteration.
+ */
+ if (cpu == raw_smp_processor_id())
+ continue;
+ rcu_read_lock();
+ p = task_rcu_dereference(&cpu_rq(cpu)->curr);
+ if (p && p->mm == current->mm) {
+ if (!fallback)
+ __cpumask_set_cpu(cpu, tmpmask);
+ else
+ smp_call_function_single(cpu, ipi_mb, NULL, 1);
+ }
+ rcu_read_unlock();
+ }
+ if (!fallback) {
+ smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+ free_cpumask_var(tmpmask);
+ }
+ cpus_read_unlock();
+
+ /*
+ * Memory barrier on the caller thread _after_ we finished
+ * waiting for the last IPI. Matches memory barriers around
+ * rq->curr modification in scheduler.
+ */
+ smp_mb(); /* exit from system call is not a mb */
+}
+
+/**
+ * sys_membarrier - issue memory barriers on a set of threads
+ * @cmd: Takes command values defined in enum membarrier_cmd.
+ * @flags: Currently needs to be 0. For future extensions.
+ *
+ * If this system call is not implemented, -ENOSYS is returned. If the
+ * command specified does not exist, not available on the running
+ * kernel, or if the command argument is invalid, this system call
+ * returns -EINVAL. For a given command, with flags argument set to 0,
+ * this system call is guaranteed to always return the same value until
+ * reboot.
+ *
+ * All memory accesses performed in program order from each targeted thread
+ * is guaranteed to be ordered with respect to sys_membarrier(). If we use
+ * the semantic "barrier()" to represent a compiler barrier forcing memory
+ * accesses to be performed in program order across the barrier, and
+ * smp_mb() to represent explicit memory barriers forcing full memory
+ * ordering across the barrier, we have the following ordering table for
+ * each pair of barrier(), sys_membarrier() and smp_mb():
+ *
+ * The pair ordering is detailed as (O: ordered, X: not ordered):
+ *
+ * barrier() smp_mb() sys_membarrier()
+ * barrier() X X O
+ * smp_mb() X O O
+ * sys_membarrier() O O O
+ */
+SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
+{
+ if (unlikely(flags))
+ return -EINVAL;
+ switch (cmd) {
+ case MEMBARRIER_CMD_QUERY:
+ {
+ int cmd_mask = MEMBARRIER_CMD_BITMASK;
+
+ if (tick_nohz_full_enabled())
+ cmd_mask &= ~MEMBARRIER_CMD_SHARED;
+ return cmd_mask;
+ }
+ case MEMBARRIER_CMD_SHARED:
+ /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */
+ if (tick_nohz_full_enabled())
+ return -EINVAL;
+ if (num_online_cpus() > 1)
+ synchronize_sched();
+ return 0;
+ case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
+ membarrier_private_expedited();
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 45caf937ef90..0af5ca9e3e3f 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -970,7 +970,7 @@ static void update_curr_rt(struct rq *rq)
return;
/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
- cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT);
+ cpufreq_update_util(rq, SCHED_CPUFREQ_RT);
schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec));
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index eeef1a3086d1..6ed7962dc896 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -769,7 +769,7 @@ struct rq {
#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMP
int hrtick_csd_pending;
- struct call_single_data hrtick_csd;
+ call_single_data_t hrtick_csd;
#endif
struct hrtimer hrtick_timer;
#endif
@@ -1120,11 +1120,15 @@ extern int group_balance_cpu(struct sched_group *sg);
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
void register_sched_domain_sysctl(void);
+void dirty_sched_domain_sysctl(int cpu);
void unregister_sched_domain_sysctl(void);
#else
static inline void register_sched_domain_sysctl(void)
{
}
+static inline void dirty_sched_domain_sysctl(int cpu)
+{
+}
static inline void unregister_sched_domain_sysctl(void)
{
}
@@ -2070,19 +2074,13 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
{
struct update_util_data *data;
- data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
+ data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data,
+ cpu_of(rq)));
if (data)
data->func(data, rq_clock(rq), flags);
}
-
-static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags)
-{
- if (cpu_of(rq) == smp_processor_id())
- cpufreq_update_util(rq, flags);
-}
#else
static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
-static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {}
#endif /* CONFIG_CPU_FREQ */
#ifdef arch_scale_freq_capacity
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index 3d5610dcce11..2227e183e202 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -33,9 +33,6 @@ void swake_up(struct swait_queue_head *q)
{
unsigned long flags;
- if (!swait_active(q))
- return;
-
raw_spin_lock_irqsave(&q->lock, flags);
swake_up_locked(q);
raw_spin_unlock_irqrestore(&q->lock, flags);
@@ -51,9 +48,6 @@ void swake_up_all(struct swait_queue_head *q)
struct swait_queue *curr;
LIST_HEAD(tmp);
- if (!swait_active(q))
- return;
-
raw_spin_lock_irq(&q->lock);
list_splice_init(&q->task_list, &tmp);
while (!list_empty(&tmp)) {
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 79895aec281e..6f7b43982f73 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -261,8 +261,6 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
static int init_rootdomain(struct root_domain *rd)
{
- memset(rd, 0, sizeof(*rd));
-
if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
goto out;
if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
@@ -311,7 +309,7 @@ static struct root_domain *alloc_rootdomain(void)
{
struct root_domain *rd;
- rd = kmalloc(sizeof(*rd), GFP_KERNEL);
+ rd = kzalloc(sizeof(*rd), GFP_KERNEL);
if (!rd)
return NULL;
@@ -337,7 +335,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc)
if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
kfree(sg->sgc);
- kfree(sg);
+ if (atomic_dec_and_test(&sg->ref))
+ kfree(sg);
sg = tmp;
} while (sg != first);
}
@@ -345,15 +344,12 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc)
static void destroy_sched_domain(struct sched_domain *sd)
{
/*
- * If its an overlapping domain it has private groups, iterate and
- * nuke them all.
+ * A normal sched domain may have multiple group references, an
+ * overlapping domain, having private groups, only one. Iterate,
+ * dropping group/capacity references, freeing where none remain.
*/
- if (sd->flags & SD_OVERLAP) {
- free_sched_groups(sd->groups, 1);
- } else if (atomic_dec_and_test(&sd->groups->ref)) {
- kfree(sd->groups->sgc);
- kfree(sd->groups);
- }
+ free_sched_groups(sd->groups, 1);
+
if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
kfree(sd->shared);
kfree(sd);
@@ -463,6 +459,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
rq_attach_root(rq, rd);
tmp = rq->sd;
rcu_assign_pointer(rq->sd, sd);
+ dirty_sched_domain_sysctl(cpu);
destroy_sched_domains(tmp);
update_top_cache_domain(cpu);
@@ -670,6 +667,7 @@ build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
else
cpumask_copy(sg_span, sched_domain_span(sd));
+ atomic_inc(&sg->ref);
return sg;
}
@@ -1595,7 +1593,7 @@ static void __sdt_free(const struct cpumask *cpu_map)
}
}
-struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
+static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
struct sched_domain *child, int cpu)
{
@@ -1854,7 +1852,17 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
/* Let the architecture update CPU core mappings: */
new_topology = arch_update_cpu_topology();
- n = doms_new ? ndoms_new : 0;
+ if (!doms_new) {
+ WARN_ON_ONCE(dattr_new);
+ n = 0;
+ doms_new = alloc_sched_domains(1);
+ if (doms_new) {
+ n = 1;
+ cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
+ }
+ } else {
+ n = ndoms_new;
+ }
/* Destroy deleted domains: */
for (i = 0; i < ndoms_cur; i++) {
@@ -1870,11 +1878,10 @@ match1:
}
n = ndoms_cur;
- if (doms_new == NULL) {
+ if (!doms_new) {
n = 0;
doms_new = &fallback_doms;
cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
- WARN_ON_ONCE(dattr_new);
}
/* Build new domains: */
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 17f11c6b0a9f..d6afed6d0752 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -70,9 +70,10 @@ static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
list_for_each_entry_safe(curr, next, &wq_head->head, entry) {
unsigned flags = curr->flags;
-
- if (curr->func(curr, mode, wake_flags, key) &&
- (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
+ int ret = curr->func(curr, mode, wake_flags, key);
+ if (ret < 0)
+ break;
+ if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
break;
}
}
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 65f61077ad50..98b59b5db90b 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -13,7 +13,7 @@
* of Berkeley Packet Filters/Linux Socket Filters.
*/
-#include <linux/atomic.h>
+#include <linux/refcount.h>
#include <linux/audit.h>
#include <linux/compat.h>
#include <linux/coredump.h>
@@ -56,7 +56,7 @@
* to a task_struct (other than @usage).
*/
struct seccomp_filter {
- atomic_t usage;
+ refcount_t usage;
struct seccomp_filter *prev;
struct bpf_prog *prog;
};
@@ -378,7 +378,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
return ERR_PTR(ret);
}
- atomic_set(&sfilter->usage, 1);
+ refcount_set(&sfilter->usage, 1);
return sfilter;
}
@@ -465,7 +465,7 @@ void get_seccomp_filter(struct task_struct *tsk)
if (!orig)
return;
/* Reference count is bounded by the number of total processes. */
- atomic_inc(&orig->usage);
+ refcount_inc(&orig->usage);
}
static inline void seccomp_filter_free(struct seccomp_filter *filter)
@@ -481,7 +481,7 @@ void put_seccomp_filter(struct task_struct *tsk)
{
struct seccomp_filter *orig = tsk->seccomp.filter;
/* Clean up single-reference branches iteratively. */
- while (orig && atomic_dec_and_test(&orig->usage)) {
+ while (orig && refcount_dec_and_test(&orig->usage)) {
struct seccomp_filter *freeme = orig;
orig = orig->prev;
seccomp_filter_free(freeme);
@@ -641,11 +641,12 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
return 0;
case SECCOMP_RET_KILL:
- default: {
- siginfo_t info;
+ default:
audit_seccomp(this_syscall, SIGSYS, action);
/* Dump core only if this is the last remaining thread. */
if (get_nr_threads(current) == 1) {
+ siginfo_t info;
+
/* Show the original registers in the dump. */
syscall_rollback(current, task_pt_regs(current));
/* Trigger a manual coredump since do_exit skips it. */
@@ -654,7 +655,6 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
}
do_exit(SIGSYS);
}
- }
unreachable();
diff --git a/kernel/signal.c b/kernel/signal.c
index 35a570f71f07..ed804a470dcd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1194,7 +1194,11 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
recalc_sigpending_and_wake(t);
}
}
- if (action->sa.sa_handler == SIG_DFL)
+ /*
+ * Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect
+ * debugging to leave init killable.
+ */
+ if (action->sa.sa_handler == SIG_DFL && !t->ptrace)
t->signal->flags &= ~SIGNAL_UNKILLABLE;
ret = specific_send_sig_info(sig, info, t);
spin_unlock_irqrestore(&t->sighand->siglock, flags);
@@ -1402,6 +1406,10 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
return ret;
}
+ /* -INT_MIN is undefined. Exclude this case to avoid a UBSAN warning */
+ if (pid == INT_MIN)
+ return -ESRCH;
+
read_lock(&tasklist_lock);
if (pid != -1) {
ret = __kill_pgrp_info(sig, info,
@@ -2776,7 +2784,7 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
* @info: if non-null, the signal's siginfo is returned here
* @ts: upper bound on process time suspension
*/
-int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
+static int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
const struct timespec *ts)
{
ktime_t *to = NULL, timeout = KTIME_MAX;
@@ -2865,6 +2873,40 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
return ret;
}
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
+ struct compat_siginfo __user *, uinfo,
+ struct compat_timespec __user *, uts, compat_size_t, sigsetsize)
+{
+ compat_sigset_t s32;
+ sigset_t s;
+ struct timespec t;
+ siginfo_t info;
+ long ret;
+
+ if (sigsetsize != sizeof(sigset_t))
+ return -EINVAL;
+
+ if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
+ return -EFAULT;
+ sigset_from_compat(&s, &s32);
+
+ if (uts) {
+ if (compat_get_timespec(&t, uts))
+ return -EFAULT;
+ }
+
+ ret = do_sigtimedwait(&s, &info, uts ? &t : NULL);
+
+ if (ret > 0 && uinfo) {
+ if (copy_siginfo_to_user32(uinfo, &info))
+ ret = -EFAULT;
+ }
+
+ return ret;
+}
+#endif
+
/**
* sys_kill - send a signal to a process
* @pid: the PID of the process
@@ -3121,78 +3163,68 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
}
static int
-do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp)
+do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp)
{
- stack_t oss;
- int error;
+ struct task_struct *t = current;
- oss.ss_sp = (void __user *) current->sas_ss_sp;
- oss.ss_size = current->sas_ss_size;
- oss.ss_flags = sas_ss_flags(sp) |
- (current->sas_ss_flags & SS_FLAG_BITS);
+ if (oss) {
+ memset(oss, 0, sizeof(stack_t));
+ oss->ss_sp = (void __user *) t->sas_ss_sp;
+ oss->ss_size = t->sas_ss_size;
+ oss->ss_flags = sas_ss_flags(sp) |
+ (current->sas_ss_flags & SS_FLAG_BITS);
+ }
- if (uss) {
- void __user *ss_sp;
- size_t ss_size;
- unsigned ss_flags;
+ if (ss) {
+ void __user *ss_sp = ss->ss_sp;
+ size_t ss_size = ss->ss_size;
+ unsigned ss_flags = ss->ss_flags;
int ss_mode;
- error = -EFAULT;
- if (!access_ok(VERIFY_READ, uss, sizeof(*uss)))
- goto out;
- error = __get_user(ss_sp, &uss->ss_sp) |
- __get_user(ss_flags, &uss->ss_flags) |
- __get_user(ss_size, &uss->ss_size);
- if (error)
- goto out;
-
- error = -EPERM;
- if (on_sig_stack(sp))
- goto out;
+ if (unlikely(on_sig_stack(sp)))
+ return -EPERM;
ss_mode = ss_flags & ~SS_FLAG_BITS;
- error = -EINVAL;
- if (ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK &&
- ss_mode != 0)
- goto out;
+ if (unlikely(ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK &&
+ ss_mode != 0))
+ return -EINVAL;
if (ss_mode == SS_DISABLE) {
ss_size = 0;
ss_sp = NULL;
} else {
- error = -ENOMEM;
- if (ss_size < MINSIGSTKSZ)
- goto out;
+ if (unlikely(ss_size < MINSIGSTKSZ))
+ return -ENOMEM;
}
- current->sas_ss_sp = (unsigned long) ss_sp;
- current->sas_ss_size = ss_size;
- current->sas_ss_flags = ss_flags;
+ t->sas_ss_sp = (unsigned long) ss_sp;
+ t->sas_ss_size = ss_size;
+ t->sas_ss_flags = ss_flags;
}
-
- error = 0;
- if (uoss) {
- error = -EFAULT;
- if (!access_ok(VERIFY_WRITE, uoss, sizeof(*uoss)))
- goto out;
- error = __put_user(oss.ss_sp, &uoss->ss_sp) |
- __put_user(oss.ss_size, &uoss->ss_size) |
- __put_user(oss.ss_flags, &uoss->ss_flags);
- }
-
-out:
- return error;
+ return 0;
}
+
SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss)
{
- return do_sigaltstack(uss, uoss, current_user_stack_pointer());
+ stack_t new, old;
+ int err;
+ if (uss && copy_from_user(&new, uss, sizeof(stack_t)))
+ return -EFAULT;
+ err = do_sigaltstack(uss ? &new : NULL, uoss ? &old : NULL,
+ current_user_stack_pointer());
+ if (!err && uoss && copy_to_user(uoss, &old, sizeof(stack_t)))
+ err = -EFAULT;
+ return err;
}
int restore_altstack(const stack_t __user *uss)
{
- int err = do_sigaltstack(uss, NULL, current_user_stack_pointer());
+ stack_t new;
+ if (copy_from_user(&new, uss, sizeof(stack_t)))
+ return -EFAULT;
+ (void)do_sigaltstack(&new, NULL, current_user_stack_pointer());
/* squash all but EFAULT for now */
- return err == -EFAULT ? err : 0;
+ return 0;
}
int __save_altstack(stack_t __user *uss, unsigned long sp)
@@ -3215,29 +3247,24 @@ COMPAT_SYSCALL_DEFINE2(sigaltstack,
{
stack_t uss, uoss;
int ret;
- mm_segment_t seg;
if (uss_ptr) {
compat_stack_t uss32;
-
- memset(&uss, 0, sizeof(stack_t));
if (copy_from_user(&uss32, uss_ptr, sizeof(compat_stack_t)))
return -EFAULT;
uss.ss_sp = compat_ptr(uss32.ss_sp);
uss.ss_flags = uss32.ss_flags;
uss.ss_size = uss32.ss_size;
}
- seg = get_fs();
- set_fs(KERNEL_DS);
- ret = do_sigaltstack((stack_t __force __user *) (uss_ptr ? &uss : NULL),
- (stack_t __force __user *) &uoss,
+ ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss,
compat_user_stack_pointer());
- set_fs(seg);
if (ret >= 0 && uoss_ptr) {
- if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(compat_stack_t)) ||
- __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) ||
- __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) ||
- __put_user(uoss.ss_size, &uoss_ptr->ss_size))
+ compat_stack_t old;
+ memset(&old, 0, sizeof(old));
+ old.ss_sp = ptr_to_compat(uoss.ss_sp);
+ old.ss_flags = uoss.ss_flags;
+ old.ss_size = uoss.ss_size;
+ if (copy_to_user(uoss_ptr, &old, sizeof(compat_stack_t)))
ret = -EFAULT;
}
return ret;
@@ -3277,6 +3304,21 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t));
}
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32)
+{
+#ifdef __BIG_ENDIAN
+ sigset_t set;
+ int err = do_sigpending(&set, sizeof(set.sig[0]));
+ if (!err)
+ err = put_user(set.sig[0], set32);
+ return err;
+#else
+ return sys_rt_sigpending((sigset_t __user *)set32, sizeof(*set32));
+#endif
+}
+#endif
+
#endif
#ifdef __ARCH_WANT_SYS_SIGPROCMASK
diff --git a/kernel/smp.c b/kernel/smp.c
index 3061483cb3ad..81cfca9b4cc3 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -28,7 +28,7 @@ enum {
};
struct call_function_data {
- struct call_single_data __percpu *csd;
+ call_single_data_t __percpu *csd;
cpumask_var_t cpumask;
cpumask_var_t cpumask_ipi;
};
@@ -51,7 +51,7 @@ int smpcfd_prepare_cpu(unsigned int cpu)
free_cpumask_var(cfd->cpumask);
return -ENOMEM;
}
- cfd->csd = alloc_percpu(struct call_single_data);
+ cfd->csd = alloc_percpu(call_single_data_t);
if (!cfd->csd) {
free_cpumask_var(cfd->cpumask);
free_cpumask_var(cfd->cpumask_ipi);
@@ -103,12 +103,12 @@ void __init call_function_init(void)
* previous function call. For multi-cpu calls its even more interesting
* as we'll have to ensure no other cpu is observing our csd.
*/
-static __always_inline void csd_lock_wait(struct call_single_data *csd)
+static __always_inline void csd_lock_wait(call_single_data_t *csd)
{
smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK));
}
-static __always_inline void csd_lock(struct call_single_data *csd)
+static __always_inline void csd_lock(call_single_data_t *csd)
{
csd_lock_wait(csd);
csd->flags |= CSD_FLAG_LOCK;
@@ -116,12 +116,12 @@ static __always_inline void csd_lock(struct call_single_data *csd)
/*
* prevent CPU from reordering the above assignment
* to ->flags with any subsequent assignments to other
- * fields of the specified call_single_data structure:
+ * fields of the specified call_single_data_t structure:
*/
smp_wmb();
}
-static __always_inline void csd_unlock(struct call_single_data *csd)
+static __always_inline void csd_unlock(call_single_data_t *csd)
{
WARN_ON(!(csd->flags & CSD_FLAG_LOCK));
@@ -131,14 +131,14 @@ static __always_inline void csd_unlock(struct call_single_data *csd)
smp_store_release(&csd->flags, 0);
}
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
+static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);
/*
- * Insert a previously allocated call_single_data element
+ * Insert a previously allocated call_single_data_t element
* for execution on the given CPU. data must already have
* ->func, ->info, and ->flags set.
*/
-static int generic_exec_single(int cpu, struct call_single_data *csd,
+static int generic_exec_single(int cpu, call_single_data_t *csd,
smp_call_func_t func, void *info)
{
if (cpu == smp_processor_id()) {
@@ -210,7 +210,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
{
struct llist_head *head;
struct llist_node *entry;
- struct call_single_data *csd, *csd_next;
+ call_single_data_t *csd, *csd_next;
static bool warned;
WARN_ON(!irqs_disabled());
@@ -268,8 +268,10 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
int wait)
{
- struct call_single_data *csd;
- struct call_single_data csd_stack = { .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS };
+ call_single_data_t *csd;
+ call_single_data_t csd_stack = {
+ .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS,
+ };
int this_cpu;
int err;
@@ -321,7 +323,7 @@ EXPORT_SYMBOL(smp_call_function_single);
* NOTE: Be careful, there is unfortunately no current debugging facility to
* validate the correctness of this serialization.
*/
-int smp_call_function_single_async(int cpu, struct call_single_data *csd)
+int smp_call_function_single_async(int cpu, call_single_data_t *csd)
{
int err = 0;
@@ -444,7 +446,7 @@ void smp_call_function_many(const struct cpumask *mask,
cpumask_clear(cfd->cpumask_ipi);
for_each_cpu(cpu, cfd->cpumask) {
- struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu);
+ call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu);
csd_lock(csd);
if (wait)
@@ -460,7 +462,7 @@ void smp_call_function_many(const struct cpumask *mask,
if (wait) {
for_each_cpu(cpu, cfd->cpumask) {
- struct call_single_data *csd;
+ call_single_data_t *csd;
csd = per_cpu_ptr(cfd->csd, cpu);
csd_lock_wait(csd);
diff --git a/kernel/sys.c b/kernel/sys.c
index 8a94b4eabcaa..2855ee73acd0 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -886,7 +886,7 @@ SYSCALL_DEFINE0(getegid)
return from_kgid_munged(current_user_ns(), current_egid());
}
-void do_sys_times(struct tms *tms)
+static void do_sys_times(struct tms *tms)
{
u64 tgutime, tgstime, cutime, cstime;
@@ -912,6 +912,32 @@ SYSCALL_DEFINE1(times, struct tms __user *, tbuf)
return (long) jiffies_64_to_clock_t(get_jiffies_64());
}
+#ifdef CONFIG_COMPAT
+static compat_clock_t clock_t_to_compat_clock_t(clock_t x)
+{
+ return compat_jiffies_to_clock_t(clock_t_to_jiffies(x));
+}
+
+COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf)
+{
+ if (tbuf) {
+ struct tms tms;
+ struct compat_tms tmp;
+
+ do_sys_times(&tms);
+ /* Convert our struct tms to the compat version. */
+ tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime);
+ tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime);
+ tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime);
+ tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime);
+ if (copy_to_user(tbuf, &tmp, sizeof(tmp)))
+ return -EFAULT;
+ }
+ force_successful_syscall_return();
+ return compat_jiffies_to_clock_t(jiffies);
+}
+#endif
+
/*
* This needs some heavy checking ...
* I just haven't the stomach for it. I also don't fully
@@ -1306,6 +1332,54 @@ SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
return ret;
}
+#ifdef CONFIG_COMPAT
+
+COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource,
+ struct compat_rlimit __user *, rlim)
+{
+ struct rlimit r;
+ struct compat_rlimit r32;
+
+ if (copy_from_user(&r32, rlim, sizeof(struct compat_rlimit)))
+ return -EFAULT;
+
+ if (r32.rlim_cur == COMPAT_RLIM_INFINITY)
+ r.rlim_cur = RLIM_INFINITY;
+ else
+ r.rlim_cur = r32.rlim_cur;
+ if (r32.rlim_max == COMPAT_RLIM_INFINITY)
+ r.rlim_max = RLIM_INFINITY;
+ else
+ r.rlim_max = r32.rlim_max;
+ return do_prlimit(current, resource, &r, NULL);
+}
+
+COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource,
+ struct compat_rlimit __user *, rlim)
+{
+ struct rlimit r;
+ int ret;
+
+ ret = do_prlimit(current, resource, NULL, &r);
+ if (!ret) {
+ struct compat_rlimit r32;
+ if (r.rlim_cur > COMPAT_RLIM_INFINITY)
+ r32.rlim_cur = COMPAT_RLIM_INFINITY;
+ else
+ r32.rlim_cur = r.rlim_cur;
+ if (r.rlim_max > COMPAT_RLIM_INFINITY)
+ r32.rlim_max = COMPAT_RLIM_INFINITY;
+ else
+ r32.rlim_max = r.rlim_max;
+
+ if (copy_to_user(rlim, &r32, sizeof(struct compat_rlimit)))
+ return -EFAULT;
+ }
+ return ret;
+}
+
+#endif
+
#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT
/*
@@ -1328,6 +1402,30 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0;
}
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
+ struct compat_rlimit __user *, rlim)
+{
+ struct rlimit r;
+
+ if (resource >= RLIM_NLIMITS)
+ return -EINVAL;
+
+ task_lock(current->group_leader);
+ r = current->signal->rlim[resource];
+ task_unlock(current->group_leader);
+ if (r.rlim_cur > 0x7FFFFFFF)
+ r.rlim_cur = 0x7FFFFFFF;
+ if (r.rlim_max > 0x7FFFFFFF)
+ r.rlim_max = 0x7FFFFFFF;
+
+ if (put_user(r.rlim_cur, &rlim->rlim_cur) ||
+ put_user(r.rlim_max, &rlim->rlim_max))
+ return -EFAULT;
+ return 0;
+}
+#endif
+
#endif
static inline bool rlim64_is_infinity(__u64 rlim64)
@@ -1552,7 +1650,7 @@ static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r)
r->ru_oublock += task_io_get_oublock(t);
}
-static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
+void getrusage(struct task_struct *p, int who, struct rusage *r)
{
struct task_struct *t;
unsigned long flags;
@@ -1626,20 +1724,16 @@ out:
r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */
}
-int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
+SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
{
struct rusage r;
- k_getrusage(p, who, &r);
- return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
-}
-
-SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
-{
if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN &&
who != RUSAGE_THREAD)
return -EINVAL;
- return getrusage(current, who, ru);
+
+ getrusage(current, who, &r);
+ return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
}
#ifdef CONFIG_COMPAT
@@ -1651,7 +1745,7 @@ COMPAT_SYSCALL_DEFINE2(getrusage, int, who, struct compat_rusage __user *, ru)
who != RUSAGE_THREAD)
return -EINVAL;
- k_getrusage(current, who, &r);
+ getrusage(current, who, &r);
return put_compat_rusage(&r, ru);
}
#endif
@@ -2266,7 +2360,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_GET_THP_DISABLE:
if (arg2 || arg3 || arg4 || arg5)
return -EINVAL;
- error = !!(me->mm->def_flags & VM_NOHUGEPAGE);
+ error = !!test_bit(MMF_DISABLE_THP, &me->mm->flags);
break;
case PR_SET_THP_DISABLE:
if (arg3 || arg4 || arg5)
@@ -2274,9 +2368,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
if (down_write_killable(&me->mm->mmap_sem))
return -EINTR;
if (arg2)
- me->mm->def_flags |= VM_NOHUGEPAGE;
+ set_bit(MMF_DISABLE_THP, &me->mm->flags);
else
- me->mm->def_flags &= ~VM_NOHUGEPAGE;
+ clear_bit(MMF_DISABLE_THP, &me->mm->flags);
up_write(&me->mm->mmap_sem);
break;
case PR_MPX_ENABLE_MANAGEMENT:
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4dfba1a76cc3..6648fbbb8157 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -174,11 +174,32 @@ extern int no_unaligned_warning;
#ifdef CONFIG_PROC_SYSCTL
-#define SYSCTL_WRITES_LEGACY -1
-#define SYSCTL_WRITES_WARN 0
-#define SYSCTL_WRITES_STRICT 1
+/**
+ * enum sysctl_writes_mode - supported sysctl write modes
+ *
+ * @SYSCTL_WRITES_LEGACY: each write syscall must fully contain the sysctl value
+ * to be written, and multiple writes on the same sysctl file descriptor
+ * will rewrite the sysctl value, regardless of file position. No warning
+ * is issued when the initial position is not 0.
+ * @SYSCTL_WRITES_WARN: same as above but warn when the initial file position is
+ * not 0.
+ * @SYSCTL_WRITES_STRICT: writes to numeric sysctl entries must always be at
+ * file position 0 and the value must be fully contained in the buffer
+ * sent to the write syscall. If dealing with strings respect the file
+ * position, but restrict this to the max length of the buffer, anything
+ * passed the max lenght will be ignored. Multiple writes will append
+ * to the buffer.
+ *
+ * These write modes control how current file position affects the behavior of
+ * updating sysctl values through the proc interface on each write.
+ */
+enum sysctl_writes_mode {
+ SYSCTL_WRITES_LEGACY = -1,
+ SYSCTL_WRITES_WARN = 0,
+ SYSCTL_WRITES_STRICT = 1,
+};
-static int sysctl_writes_strict = SYSCTL_WRITES_STRICT;
+static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT;
static int proc_do_cad_pid(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -880,6 +901,14 @@ static struct ctl_table kern_table[] = {
#endif
},
{
+ .procname = "watchdog_cpumask",
+ .data = &watchdog_cpumask_bits,
+ .maxlen = NR_CPUS,
+ .mode = 0644,
+ .proc_handler = proc_watchdog_cpumask,
+ },
+#ifdef CONFIG_SOFTLOCKUP_DETECTOR
+ {
.procname = "soft_watchdog",
.data = &soft_watchdog_enabled,
.maxlen = sizeof (int),
@@ -889,13 +918,6 @@ static struct ctl_table kern_table[] = {
.extra2 = &one,
},
{
- .procname = "watchdog_cpumask",
- .data = &watchdog_cpumask_bits,
- .maxlen = NR_CPUS,
- .mode = 0644,
- .proc_handler = proc_watchdog_cpumask,
- },
- {
.procname = "softlockup_panic",
.data = &softlockup_panic,
.maxlen = sizeof(int),
@@ -904,27 +926,29 @@ static struct ctl_table kern_table[] = {
.extra1 = &zero,
.extra2 = &one,
},
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
+#ifdef CONFIG_SMP
{
- .procname = "hardlockup_panic",
- .data = &hardlockup_panic,
+ .procname = "softlockup_all_cpu_backtrace",
+ .data = &sysctl_softlockup_all_cpu_backtrace,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
.extra2 = &one,
},
+#endif /* CONFIG_SMP */
#endif
-#ifdef CONFIG_SMP
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
{
- .procname = "softlockup_all_cpu_backtrace",
- .data = &sysctl_softlockup_all_cpu_backtrace,
+ .procname = "hardlockup_panic",
+ .data = &hardlockup_panic,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
.extra2 = &one,
},
+#ifdef CONFIG_SMP
{
.procname = "hardlockup_all_cpu_backtrace",
.data = &sysctl_hardlockup_all_cpu_backtrace,
@@ -936,6 +960,8 @@ static struct ctl_table kern_table[] = {
},
#endif /* CONFIG_SMP */
#endif
+#endif
+
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
{
.procname = "unknown_nmi_panic",
@@ -1950,6 +1976,32 @@ static void warn_sysctl_write(struct ctl_table *table)
}
/**
+ * proc_first_pos_non_zero_ignore - check if firs position is allowed
+ * @ppos: file position
+ * @table: the sysctl table
+ *
+ * Returns true if the first position is non-zero and the sysctl_writes_strict
+ * mode indicates this is not allowed for numeric input types. String proc
+ * hadlers can ignore the return value.
+ */
+static bool proc_first_pos_non_zero_ignore(loff_t *ppos,
+ struct ctl_table *table)
+{
+ if (!*ppos)
+ return false;
+
+ switch (sysctl_writes_strict) {
+ case SYSCTL_WRITES_STRICT:
+ return true;
+ case SYSCTL_WRITES_WARN:
+ warn_sysctl_write(table);
+ return false;
+ default:
+ return false;
+ }
+}
+
+/**
* proc_dostring - read a string sysctl
* @table: the sysctl table
* @write: %TRUE if this is a write to the sysctl file
@@ -1969,8 +2021,8 @@ static void warn_sysctl_write(struct ctl_table *table)
int proc_dostring(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
- if (write && *ppos && sysctl_writes_strict == SYSCTL_WRITES_WARN)
- warn_sysctl_write(table);
+ if (write)
+ proc_first_pos_non_zero_ignore(ppos, table);
return _proc_do_string((char *)(table->data), table->maxlen, write,
(char __user *)buffer, lenp, ppos);
@@ -2128,19 +2180,18 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
return 0;
}
-static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp,
- int *valp,
- int write, void *data)
+static int do_proc_douintvec_conv(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data)
{
if (write) {
- if (*negp)
+ if (*lvalp > UINT_MAX)
return -EINVAL;
if (*lvalp > UINT_MAX)
return -EINVAL;
*valp = *lvalp;
} else {
unsigned int val = *valp;
- *negp = false;
*lvalp = (unsigned long)val;
}
return 0;
@@ -2172,17 +2223,8 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
conv = do_proc_dointvec_conv;
if (write) {
- if (*ppos) {
- switch (sysctl_writes_strict) {
- case SYSCTL_WRITES_STRICT:
- goto out;
- case SYSCTL_WRITES_WARN:
- warn_sysctl_write(table);
- break;
- default:
- break;
- }
- }
+ if (proc_first_pos_non_zero_ignore(ppos, table))
+ goto out;
if (left > PAGE_SIZE - 1)
left = PAGE_SIZE - 1;
@@ -2249,6 +2291,146 @@ static int do_proc_dointvec(struct ctl_table *table, int write,
buffer, lenp, ppos, conv, data);
}
+static int do_proc_douintvec_w(unsigned int *tbl_data,
+ struct ctl_table *table,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos,
+ int (*conv)(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data),
+ void *data)
+{
+ unsigned long lval;
+ int err = 0;
+ size_t left;
+ bool neg;
+ char *kbuf = NULL, *p;
+
+ left = *lenp;
+
+ if (proc_first_pos_non_zero_ignore(ppos, table))
+ goto bail_early;
+
+ if (left > PAGE_SIZE - 1)
+ left = PAGE_SIZE - 1;
+
+ p = kbuf = memdup_user_nul(buffer, left);
+ if (IS_ERR(kbuf))
+ return -EINVAL;
+
+ left -= proc_skip_spaces(&p);
+ if (!left) {
+ err = -EINVAL;
+ goto out_free;
+ }
+
+ err = proc_get_long(&p, &left, &lval, &neg,
+ proc_wspace_sep,
+ sizeof(proc_wspace_sep), NULL);
+ if (err || neg) {
+ err = -EINVAL;
+ goto out_free;
+ }
+
+ if (conv(&lval, tbl_data, 1, data)) {
+ err = -EINVAL;
+ goto out_free;
+ }
+
+ if (!err && left)
+ left -= proc_skip_spaces(&p);
+
+out_free:
+ kfree(kbuf);
+ if (err)
+ return -EINVAL;
+
+ return 0;
+
+ /* This is in keeping with old __do_proc_dointvec() */
+bail_early:
+ *ppos += *lenp;
+ return err;
+}
+
+static int do_proc_douintvec_r(unsigned int *tbl_data, void __user *buffer,
+ size_t *lenp, loff_t *ppos,
+ int (*conv)(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data),
+ void *data)
+{
+ unsigned long lval;
+ int err = 0;
+ size_t left;
+
+ left = *lenp;
+
+ if (conv(&lval, tbl_data, 0, data)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = proc_put_long(&buffer, &left, lval, false);
+ if (err || !left)
+ goto out;
+
+ err = proc_put_char(&buffer, &left, '\n');
+
+out:
+ *lenp -= left;
+ *ppos += *lenp;
+
+ return err;
+}
+
+static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table,
+ int write, void __user *buffer,
+ size_t *lenp, loff_t *ppos,
+ int (*conv)(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data),
+ void *data)
+{
+ unsigned int *i, vleft;
+
+ if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
+ *lenp = 0;
+ return 0;
+ }
+
+ i = (unsigned int *) tbl_data;
+ vleft = table->maxlen / sizeof(*i);
+
+ /*
+ * Arrays are not supported, keep this simple. *Do not* add
+ * support for them.
+ */
+ if (vleft != 1) {
+ *lenp = 0;
+ return -EINVAL;
+ }
+
+ if (!conv)
+ conv = do_proc_douintvec_conv;
+
+ if (write)
+ return do_proc_douintvec_w(i, table, buffer, lenp, ppos,
+ conv, data);
+ return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data);
+}
+
+static int do_proc_douintvec(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos,
+ int (*conv)(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data),
+ void *data)
+{
+ return __do_proc_douintvec(table->data, table, write,
+ buffer, lenp, ppos, conv, data);
+}
+
/**
* proc_dointvec - read a vector of integers
* @table: the sysctl table
@@ -2284,8 +2466,8 @@ int proc_dointvec(struct ctl_table *table, int write,
int proc_douintvec(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
- return do_proc_dointvec(table, write, buffer, lenp, ppos,
- do_proc_douintvec_conv, NULL);
+ return do_proc_douintvec(table, write, buffer, lenp, ppos,
+ do_proc_douintvec_conv, NULL);
}
/*
@@ -2390,6 +2572,65 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
do_proc_dointvec_minmax_conv, &param);
}
+struct do_proc_douintvec_minmax_conv_param {
+ unsigned int *min;
+ unsigned int *max;
+};
+
+static int do_proc_douintvec_minmax_conv(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data)
+{
+ struct do_proc_douintvec_minmax_conv_param *param = data;
+
+ if (write) {
+ unsigned int val = *lvalp;
+
+ if ((param->min && *param->min > val) ||
+ (param->max && *param->max < val))
+ return -ERANGE;
+
+ if (*lvalp > UINT_MAX)
+ return -EINVAL;
+ *valp = val;
+ } else {
+ unsigned int val = *valp;
+ *lvalp = (unsigned long) val;
+ }
+
+ return 0;
+}
+
+/**
+ * proc_douintvec_minmax - read a vector of unsigned ints with min/max values
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer
+ * values from/to the user buffer, treated as an ASCII string. Negative
+ * strings are not allowed.
+ *
+ * This routine will ensure the values are within the range specified by
+ * table->extra1 (min) and table->extra2 (max). There is a final sanity
+ * check for UINT_MAX to avoid having to support wrap around uses from
+ * userspace.
+ *
+ * Returns 0 on success.
+ */
+int proc_douintvec_minmax(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct do_proc_douintvec_minmax_conv_param param = {
+ .min = (unsigned int *) table->extra1,
+ .max = (unsigned int *) table->extra2,
+ };
+ return do_proc_douintvec(table, write, buffer, lenp, ppos,
+ do_proc_douintvec_minmax_conv, &param);
+}
+
static void validate_coredump_safety(void)
{
#ifdef CONFIG_COREDUMP
@@ -2447,17 +2688,8 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
left = *lenp;
if (write) {
- if (*ppos) {
- switch (sysctl_writes_strict) {
- case SYSCTL_WRITES_STRICT:
- goto out;
- case SYSCTL_WRITES_WARN:
- warn_sysctl_write(table);
- break;
- default:
- break;
- }
- }
+ if (proc_first_pos_non_zero_ignore(ppos, table))
+ goto out;
if (left > PAGE_SIZE - 1)
left = PAGE_SIZE - 1;
@@ -2898,6 +3130,12 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
return -ENOSYS;
}
+int proc_douintvec_minmax(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
int proc_dointvec_jiffies(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
@@ -2940,6 +3178,7 @@ EXPORT_SYMBOL(proc_dointvec);
EXPORT_SYMBOL(proc_douintvec);
EXPORT_SYMBOL(proc_dointvec_jiffies);
EXPORT_SYMBOL(proc_dointvec_minmax);
+EXPORT_SYMBOL_GPL(proc_douintvec_minmax);
EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
EXPORT_SYMBOL(proc_dostring);
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 939a158eab11..02e1859f2ca8 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1346,7 +1346,7 @@ static void deprecated_sysctl_warning(const int *name, int nlen)
* CTL_KERN/KERN_VERSION is used by older glibc and cannot
* ever go away.
*/
- if (name[0] == CTL_KERN && name[1] == KERN_VERSION)
+ if (nlen >= 2 && name[0] == CTL_KERN && name[1] == KERN_VERSION)
return;
if (printk_ratelimit()) {
diff --git a/kernel/task_work.c b/kernel/task_work.c
index d513051fcca2..836a72a66fba 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -96,20 +96,16 @@ void task_work_run(void)
* work->func() can do task_work_add(), do not set
* work_exited unless the list is empty.
*/
+ raw_spin_lock_irq(&task->pi_lock);
do {
work = READ_ONCE(task->task_works);
head = !work && (task->flags & PF_EXITING) ?
&work_exited : NULL;
} while (cmpxchg(&task->task_works, work, head) != work);
+ raw_spin_unlock_irq(&task->pi_lock);
if (!work)
break;
- /*
- * Synchronize with task_work_cancel(). It can't remove
- * the first entry == work, cmpxchg(task_works) should
- * fail, but it can play with *work and other entries.
- */
- raw_spin_unlock_wait(&task->pi_lock);
do {
next = work->next;
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index c991cf212c6d..ec09ce9a6012 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -28,6 +28,7 @@
#include <linux/workqueue.h>
#include <linux/freezer.h>
#include <linux/compat.h>
+#include <linux/module.h>
#include "posix-timers.h"
@@ -56,9 +57,9 @@ static ktime_t freezer_delta;
static DEFINE_SPINLOCK(freezer_delta_lock);
#endif
+#ifdef CONFIG_RTC_CLASS
static struct wakeup_source *ws;
-#ifdef CONFIG_RTC_CLASS
/* rtc timer and device for setting alarm wakeups at suspend */
static struct rtc_timer rtctimer;
static struct rtc_device *rtcdev;
@@ -89,6 +90,7 @@ static int alarmtimer_rtc_add_device(struct device *dev,
{
unsigned long flags;
struct rtc_device *rtc = to_rtc_device(dev);
+ struct wakeup_source *__ws;
if (rtcdev)
return -EBUSY;
@@ -98,13 +100,25 @@ static int alarmtimer_rtc_add_device(struct device *dev,
if (!device_may_wakeup(rtc->dev.parent))
return -1;
+ __ws = wakeup_source_register("alarmtimer");
+
spin_lock_irqsave(&rtcdev_lock, flags);
if (!rtcdev) {
+ if (!try_module_get(rtc->owner)) {
+ spin_unlock_irqrestore(&rtcdev_lock, flags);
+ return -1;
+ }
+
rtcdev = rtc;
/* hold a reference so it doesn't go away */
get_device(dev);
+ ws = __ws;
+ __ws = NULL;
}
spin_unlock_irqrestore(&rtcdev_lock, flags);
+
+ wakeup_source_unregister(__ws);
+
return 0;
}
@@ -712,14 +726,14 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp,
alarmtimer_freezerset(absexp, type);
restart = &current->restart_block;
if (restart->nanosleep.type != TT_NONE) {
- struct timespec rmt;
+ struct timespec64 rmt;
ktime_t rem;
rem = ktime_sub(absexp, alarm_bases[type].gettime());
if (rem <= 0)
return 0;
- rmt = ktime_to_timespec(rem);
+ rmt = ktime_to_timespec64(rem);
return nanosleep_copyout(restart, &rmt);
}
@@ -860,7 +874,6 @@ static int __init alarmtimer_init(void)
error = PTR_ERR(pdev);
goto out_drv;
}
- ws = wakeup_source_register("alarmtimer");
return 0;
out_drv:
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 81da124f1115..88f75f92ef36 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1440,17 +1440,17 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
}
EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
-int nanosleep_copyout(struct restart_block *restart, struct timespec *ts)
+int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts)
{
switch(restart->nanosleep.type) {
#ifdef CONFIG_COMPAT
case TT_COMPAT:
- if (compat_put_timespec(ts, restart->nanosleep.compat_rmtp))
+ if (compat_put_timespec64(ts, restart->nanosleep.compat_rmtp))
return -EFAULT;
break;
#endif
case TT_NATIVE:
- if (copy_to_user(restart->nanosleep.rmtp, ts, sizeof(struct timespec)))
+ if (put_timespec64(ts, restart->nanosleep.rmtp))
return -EFAULT;
break;
default:
@@ -1485,11 +1485,11 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
restart = &current->restart_block;
if (restart->nanosleep.type != TT_NONE) {
ktime_t rem = hrtimer_expires_remaining(&t->timer);
- struct timespec rmt;
+ struct timespec64 rmt;
if (rem <= 0)
return 0;
- rmt = ktime_to_timespec(rem);
+ rmt = ktime_to_timespec64(rem);
return nanosleep_copyout(restart, &rmt);
}
@@ -1546,19 +1546,17 @@ out:
SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
struct timespec __user *, rmtp)
{
- struct timespec64 tu64;
- struct timespec tu;
+ struct timespec64 tu;
- if (copy_from_user(&tu, rqtp, sizeof(tu)))
+ if (get_timespec64(&tu, rqtp))
return -EFAULT;
- tu64 = timespec_to_timespec64(tu);
- if (!timespec64_valid(&tu64))
+ if (!timespec64_valid(&tu))
return -EINVAL;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
- return hrtimer_nanosleep(&tu64, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
+ return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
}
#ifdef CONFIG_COMPAT
@@ -1566,19 +1564,17 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
struct compat_timespec __user *, rmtp)
{
- struct timespec64 tu64;
- struct timespec tu;
+ struct timespec64 tu;
- if (compat_get_timespec(&tu, rqtp))
+ if (compat_get_timespec64(&tu, rqtp))
return -EFAULT;
- tu64 = timespec_to_timespec64(tu);
- if (!timespec64_valid(&tu64))
+ if (!timespec64_valid(&tu))
return -EINVAL;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
- return hrtimer_nanosleep(&tu64, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
+ return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
}
#endif
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 60cb24ac9ebc..8585ad6e472a 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -799,7 +799,6 @@ static void check_thread_timers(struct task_struct *tsk,
struct list_head *firing)
{
struct list_head *timers = tsk->cpu_timers;
- struct signal_struct *const sig = tsk->signal;
struct task_cputime *tsk_expires = &tsk->cputime_expires;
u64 expires;
unsigned long soft;
@@ -823,10 +822,9 @@ static void check_thread_timers(struct task_struct *tsk,
/*
* Check for the special case thread timers.
*/
- soft = READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
+ soft = task_rlimit(tsk, RLIMIT_RTTIME);
if (soft != RLIM_INFINITY) {
- unsigned long hard =
- READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
+ unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME);
if (hard != RLIM_INFINITY &&
tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
@@ -847,7 +845,8 @@ static void check_thread_timers(struct task_struct *tsk,
*/
if (soft < hard) {
soft += USEC_PER_SEC;
- sig->rlim[RLIMIT_RTTIME].rlim_cur = soft;
+ tsk->signal->rlim[RLIMIT_RTTIME].rlim_cur =
+ soft;
}
if (print_fatal_signals) {
pr_info("RT Watchdog Timeout (soft): %s[%d]\n",
@@ -938,11 +937,10 @@ static void check_process_timers(struct task_struct *tsk,
SIGPROF);
check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
SIGVTALRM);
- soft = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
+ soft = task_rlimit(tsk, RLIMIT_CPU);
if (soft != RLIM_INFINITY) {
unsigned long psecs = div_u64(ptime, NSEC_PER_SEC);
- unsigned long hard =
- READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
+ unsigned long hard = task_rlimit_max(tsk, RLIMIT_CPU);
u64 x;
if (psecs >= hard) {
/*
@@ -1318,12 +1316,8 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
*/
restart = &current->restart_block;
restart->nanosleep.expires = expires;
- if (restart->nanosleep.type != TT_NONE) {
- struct timespec ts;
-
- ts = timespec64_to_timespec(it.it_value);
- error = nanosleep_copyout(restart, &ts);
- }
+ if (restart->nanosleep.type != TT_NONE)
+ error = nanosleep_copyout(restart, &it.it_value);
}
return error;
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index 38f3b20efa29..06f34feb635e 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -41,12 +41,6 @@ SYS_NI(setitimer);
#ifdef __ARCH_WANT_SYS_ALARM
SYS_NI(alarm);
#endif
-COMPAT_SYS_NI(timer_create);
-COMPAT_SYS_NI(clock_adjtime);
-COMPAT_SYS_NI(timer_settime);
-COMPAT_SYS_NI(timer_gettime);
-COMPAT_SYS_NI(getitimer);
-COMPAT_SYS_NI(setitimer);
/*
* We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC
@@ -57,40 +51,52 @@ COMPAT_SYS_NI(setitimer);
SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
const struct timespec __user *, tp)
{
- struct timespec64 new_tp64;
- struct timespec new_tp;
+ struct timespec64 new_tp;
if (which_clock != CLOCK_REALTIME)
return -EINVAL;
- if (copy_from_user(&new_tp, tp, sizeof (*tp)))
+ if (get_timespec64(&new_tp, tp))
return -EFAULT;
- new_tp64 = timespec_to_timespec64(new_tp);
- return do_sys_settimeofday64(&new_tp64, NULL);
+ return do_sys_settimeofday64(&new_tp, NULL);
}
-SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
- struct timespec __user *,tp)
+int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp)
{
- struct timespec64 kernel_tp64;
- struct timespec kernel_tp;
-
switch (which_clock) {
- case CLOCK_REALTIME: ktime_get_real_ts64(&kernel_tp64); break;
- case CLOCK_MONOTONIC: ktime_get_ts64(&kernel_tp64); break;
- case CLOCK_BOOTTIME: get_monotonic_boottime64(&kernel_tp64); break;
- default: return -EINVAL;
+ case CLOCK_REALTIME:
+ ktime_get_real_ts64(tp);
+ break;
+ case CLOCK_MONOTONIC:
+ ktime_get_ts64(tp);
+ break;
+ case CLOCK_BOOTTIME:
+ get_monotonic_boottime64(tp);
+ break;
+ default:
+ return -EINVAL;
}
- kernel_tp = timespec64_to_timespec(kernel_tp64);
- if (copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
+ return 0;
+}
+SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
+ struct timespec __user *, tp)
+{
+ int ret;
+ struct timespec64 kernel_tp;
+
+ ret = do_clock_gettime(which_clock, &kernel_tp);
+ if (ret)
+ return ret;
+
+ if (put_timespec64(&kernel_tp, tp))
return -EFAULT;
return 0;
}
SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp)
{
- struct timespec rtn_tp = {
+ struct timespec64 rtn_tp = {
.tv_sec = 0,
.tv_nsec = hrtimer_resolution,
};
@@ -99,7 +105,7 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __us
case CLOCK_REALTIME:
case CLOCK_MONOTONIC:
case CLOCK_BOOTTIME:
- if (copy_to_user(tp, &rtn_tp, sizeof(rtn_tp)))
+ if (put_timespec64(&rtn_tp, tp))
return -EFAULT;
return 0;
default:
@@ -138,44 +144,45 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
}
#ifdef CONFIG_COMPAT
+COMPAT_SYS_NI(timer_create);
+COMPAT_SYS_NI(clock_adjtime);
+COMPAT_SYS_NI(timer_settime);
+COMPAT_SYS_NI(timer_gettime);
+COMPAT_SYS_NI(getitimer);
+COMPAT_SYS_NI(setitimer);
+
COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
struct compat_timespec __user *, tp)
{
- struct timespec64 new_tp64;
- struct timespec new_tp;
+ struct timespec64 new_tp;
if (which_clock != CLOCK_REALTIME)
return -EINVAL;
- if (compat_get_timespec(&new_tp, tp))
+ if (compat_get_timespec64(&new_tp, tp))
return -EFAULT;
- new_tp64 = timespec_to_timespec64(new_tp);
- return do_sys_settimeofday64(&new_tp64, NULL);
+ return do_sys_settimeofday64(&new_tp, NULL);
}
-COMPAT_SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
- struct compat_timespec __user *,tp)
+COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
+ struct compat_timespec __user *, tp)
{
- struct timespec64 kernel_tp64;
- struct timespec kernel_tp;
+ int ret;
+ struct timespec64 kernel_tp;
- switch (which_clock) {
- case CLOCK_REALTIME: ktime_get_real_ts64(&kernel_tp64); break;
- case CLOCK_MONOTONIC: ktime_get_ts64(&kernel_tp64); break;
- case CLOCK_BOOTTIME: get_monotonic_boottime64(&kernel_tp64); break;
- default: return -EINVAL;
- }
+ ret = do_clock_gettime(which_clock, &kernel_tp);
+ if (ret)
+ return ret;
- kernel_tp = timespec64_to_timespec(kernel_tp64);
- if (compat_put_timespec(&kernel_tp, tp))
+ if (compat_put_timespec64(&kernel_tp, tp))
return -EFAULT;
return 0;
}
-COMPAT_SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
+COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
struct compat_timespec __user *, tp)
{
- struct timespec rtn_tp = {
+ struct timespec64 rtn_tp = {
.tv_sec = 0,
.tv_nsec = hrtimer_resolution,
};
@@ -184,13 +191,14 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
case CLOCK_REALTIME:
case CLOCK_MONOTONIC:
case CLOCK_BOOTTIME:
- if (compat_put_timespec(&rtn_tp, tp))
+ if (compat_put_timespec64(&rtn_tp, tp))
return -EFAULT;
return 0;
default:
return -EINVAL;
}
}
+
COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
struct compat_timespec __user *, rqtp,
struct compat_timespec __user *, rmtp)
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 82d67be7d9d1..13d6881f908b 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -739,13 +739,11 @@ static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting)
SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
struct itimerspec __user *, setting)
{
- struct itimerspec64 cur_setting64;
+ struct itimerspec64 cur_setting;
- int ret = do_timer_gettime(timer_id, &cur_setting64);
+ int ret = do_timer_gettime(timer_id, &cur_setting);
if (!ret) {
- struct itimerspec cur_setting;
- cur_setting = itimerspec64_to_itimerspec(&cur_setting64);
- if (copy_to_user(setting, &cur_setting, sizeof (cur_setting)))
+ if (put_itimerspec64(&cur_setting, setting))
ret = -EFAULT;
}
return ret;
@@ -755,13 +753,11 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
struct compat_itimerspec __user *, setting)
{
- struct itimerspec64 cur_setting64;
+ struct itimerspec64 cur_setting;
- int ret = do_timer_gettime(timer_id, &cur_setting64);
+ int ret = do_timer_gettime(timer_id, &cur_setting);
if (!ret) {
- struct itimerspec cur_setting;
- cur_setting = itimerspec64_to_itimerspec(&cur_setting64);
- if (put_compat_itimerspec(setting, &cur_setting))
+ if (put_compat_itimerspec64(&cur_setting, setting))
ret = -EFAULT;
}
return ret;
@@ -907,23 +903,19 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
const struct itimerspec __user *, new_setting,
struct itimerspec __user *, old_setting)
{
- struct itimerspec64 new_spec64, old_spec64;
- struct itimerspec64 *rtn = old_setting ? &old_spec64 : NULL;
- struct itimerspec new_spec;
+ struct itimerspec64 new_spec, old_spec;
+ struct itimerspec64 *rtn = old_setting ? &old_spec : NULL;
int error = 0;
if (!new_setting)
return -EINVAL;
- if (copy_from_user(&new_spec, new_setting, sizeof (new_spec)))
+ if (get_itimerspec64(&new_spec, new_setting))
return -EFAULT;
- new_spec64 = itimerspec_to_itimerspec64(&new_spec);
- error = do_timer_settime(timer_id, flags, &new_spec64, rtn);
+ error = do_timer_settime(timer_id, flags, &new_spec, rtn);
if (!error && old_setting) {
- struct itimerspec old_spec;
- old_spec = itimerspec64_to_itimerspec(&old_spec64);
- if (copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
+ if (put_itimerspec64(&old_spec, old_setting))
error = -EFAULT;
}
return error;
@@ -934,22 +926,18 @@ COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
struct compat_itimerspec __user *, new,
struct compat_itimerspec __user *, old)
{
- struct itimerspec64 new_spec64, old_spec64;
- struct itimerspec64 *rtn = old ? &old_spec64 : NULL;
- struct itimerspec new_spec;
+ struct itimerspec64 new_spec, old_spec;
+ struct itimerspec64 *rtn = old ? &old_spec : NULL;
int error = 0;
if (!new)
return -EINVAL;
- if (get_compat_itimerspec(&new_spec, new))
+ if (get_compat_itimerspec64(&new_spec, new))
return -EFAULT;
- new_spec64 = itimerspec_to_itimerspec64(&new_spec);
- error = do_timer_settime(timer_id, flags, &new_spec64, rtn);
+ error = do_timer_settime(timer_id, flags, &new_spec, rtn);
if (!error && old) {
- struct itimerspec old_spec;
- old_spec = itimerspec64_to_itimerspec(&old_spec64);
- if (put_compat_itimerspec(old, &old_spec))
+ if (put_compat_itimerspec64(&old_spec, old))
error = -EFAULT;
}
return error;
@@ -1049,34 +1037,30 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
const struct timespec __user *, tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
- struct timespec64 new_tp64;
- struct timespec new_tp;
+ struct timespec64 new_tp;
if (!kc || !kc->clock_set)
return -EINVAL;
- if (copy_from_user(&new_tp, tp, sizeof (*tp)))
+ if (get_timespec64(&new_tp, tp))
return -EFAULT;
- new_tp64 = timespec_to_timespec64(new_tp);
- return kc->clock_set(which_clock, &new_tp64);
+ return kc->clock_set(which_clock, &new_tp);
}
SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
struct timespec __user *,tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
- struct timespec64 kernel_tp64;
- struct timespec kernel_tp;
+ struct timespec64 kernel_tp;
int error;
if (!kc)
return -EINVAL;
- error = kc->clock_get(which_clock, &kernel_tp64);
- kernel_tp = timespec64_to_timespec(kernel_tp64);
+ error = kc->clock_get(which_clock, &kernel_tp);
- if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
+ if (!error && put_timespec64(&kernel_tp, tp))
error = -EFAULT;
return error;
@@ -1109,17 +1093,15 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
struct timespec __user *, tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
- struct timespec64 rtn_tp64;
- struct timespec rtn_tp;
+ struct timespec64 rtn_tp;
int error;
if (!kc)
return -EINVAL;
- error = kc->clock_getres(which_clock, &rtn_tp64);
- rtn_tp = timespec64_to_timespec(rtn_tp64);
+ error = kc->clock_getres(which_clock, &rtn_tp);
- if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp)))
+ if (!error && tp && put_timespec64(&rtn_tp, tp))
error = -EFAULT;
return error;
@@ -1131,38 +1113,33 @@ COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock,
struct compat_timespec __user *, tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
- struct timespec64 new_tp64;
- struct timespec new_tp;
+ struct timespec64 ts;
if (!kc || !kc->clock_set)
return -EINVAL;
- if (compat_get_timespec(&new_tp, tp))
+ if (compat_get_timespec64(&ts, tp))
return -EFAULT;
- new_tp64 = timespec_to_timespec64(new_tp);
-
- return kc->clock_set(which_clock, &new_tp64);
+ return kc->clock_set(which_clock, &ts);
}
COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
struct compat_timespec __user *, tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
- struct timespec64 kernel_tp64;
- struct timespec kernel_tp;
- int error;
+ struct timespec64 ts;
+ int err;
if (!kc)
return -EINVAL;
- error = kc->clock_get(which_clock, &kernel_tp64);
- kernel_tp = timespec64_to_timespec(kernel_tp64);
+ err = kc->clock_get(which_clock, &ts);
- if (!error && compat_put_timespec(&kernel_tp, tp))
- error = -EFAULT;
+ if (!err && compat_put_timespec64(&ts, tp))
+ err = -EFAULT;
- return error;
+ return err;
}
COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock,
@@ -1193,21 +1170,19 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
struct compat_timespec __user *, tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
- struct timespec64 rtn_tp64;
- struct timespec rtn_tp;
- int error;
+ struct timespec64 ts;
+ int err;
if (!kc)
return -EINVAL;
- error = kc->clock_getres(which_clock, &rtn_tp64);
- rtn_tp = timespec64_to_timespec(rtn_tp64);
-
- if (!error && tp && compat_put_timespec(&rtn_tp, tp))
- error = -EFAULT;
+ err = kc->clock_getres(which_clock, &ts);
+ if (!err && tp && compat_put_timespec64(&ts, tp))
+ return -EFAULT;
- return error;
+ return err;
}
+
#endif
/*
@@ -1226,26 +1201,24 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
struct timespec __user *, rmtp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
- struct timespec64 t64;
- struct timespec t;
+ struct timespec64 t;
if (!kc)
return -EINVAL;
if (!kc->nsleep)
return -ENANOSLEEP_NOTSUP;
- if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
+ if (get_timespec64(&t, rqtp))
return -EFAULT;
- t64 = timespec_to_timespec64(t);
- if (!timespec64_valid(&t64))
+ if (!timespec64_valid(&t))
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
- return kc->nsleep(which_clock, flags, &t64);
+ return kc->nsleep(which_clock, flags, &t);
}
#ifdef CONFIG_COMPAT
@@ -1254,26 +1227,24 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
struct compat_timespec __user *, rmtp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
- struct timespec64 t64;
- struct timespec t;
+ struct timespec64 t;
if (!kc)
return -EINVAL;
if (!kc->nsleep)
return -ENANOSLEEP_NOTSUP;
- if (compat_get_timespec(&t, rqtp))
+ if (compat_get_timespec64(&t, rqtp))
return -EFAULT;
- t64 = timespec_to_timespec64(t);
- if (!timespec64_valid(&t64))
+ if (!timespec64_valid(&t))
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
- return kc->nsleep(which_clock, flags, &t64);
+ return kc->nsleep(which_clock, flags, &t);
}
#endif
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 7c89e437c4d7..44a8c1402133 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -890,3 +890,61 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
return res;
}
+
+int get_timespec64(struct timespec64 *ts,
+ const struct timespec __user *uts)
+{
+ struct timespec kts;
+ int ret;
+
+ ret = copy_from_user(&kts, uts, sizeof(kts));
+ if (ret)
+ return -EFAULT;
+
+ ts->tv_sec = kts.tv_sec;
+ ts->tv_nsec = kts.tv_nsec;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(get_timespec64);
+
+int put_timespec64(const struct timespec64 *ts,
+ struct timespec __user *uts)
+{
+ struct timespec kts = {
+ .tv_sec = ts->tv_sec,
+ .tv_nsec = ts->tv_nsec
+ };
+ return copy_to_user(uts, &kts, sizeof(kts)) ? -EFAULT : 0;
+}
+EXPORT_SYMBOL_GPL(put_timespec64);
+
+int get_itimerspec64(struct itimerspec64 *it,
+ const struct itimerspec __user *uit)
+{
+ int ret;
+
+ ret = get_timespec64(&it->it_interval, &uit->it_interval);
+ if (ret)
+ return ret;
+
+ ret = get_timespec64(&it->it_value, &uit->it_value);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(get_itimerspec64);
+
+int put_itimerspec64(const struct itimerspec64 *it,
+ struct itimerspec __user *uit)
+{
+ int ret;
+
+ ret = put_timespec64(&it->it_interval, &uit->it_interval);
+ if (ret)
+ return ret;
+
+ ret = put_timespec64(&it->it_value, &uit->it_value);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(put_itimerspec64);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index cedafa008de5..8ea4fb315719 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -637,9 +637,7 @@ static inline void tk_update_ktime_data(struct timekeeper *tk)
tk->ktime_sec = seconds;
/* Update the monotonic raw base */
- seconds = tk->raw_sec;
- nsec = (u32)(tk->tkr_raw.xtime_nsec >> tk->tkr_raw.shift);
- tk->tkr_raw.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);
+ tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC);
}
/* must hold timekeeper_lock */
@@ -2066,7 +2064,7 @@ void update_wall_time(void)
goto out;
/* Do some additional sanity checking */
- timekeeping_check_update(real_tk, offset);
+ timekeeping_check_update(tk, offset);
/*
* With NO_HZ we may have to accumulate many cycle_intervals
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index 38bc4d2208e8..0754cadfa9e6 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -19,6 +19,7 @@
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/seq_file.h>
+#include <linux/suspend.h>
#include <linux/time.h>
#include "timekeeping_internal.h"
@@ -75,7 +76,7 @@ void tk_debug_account_sleep_time(struct timespec64 *t)
int bin = min(fls(t->tv_sec), NUM_BINS-1);
sleep_time_bin[bin]++;
- printk_deferred(KERN_INFO "Suspended for %lld.%03lu seconds\n",
- (s64)t->tv_sec, t->tv_nsec / NSEC_PER_MSEC);
+ pm_deferred_pr_dbg("Timekeeping suspended for %lld.%03lu seconds\n",
+ (s64)t->tv_sec, t->tv_nsec / NSEC_PER_MSEC);
}
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 71ce3f4eead3..f2674a056c26 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -203,6 +203,7 @@ struct timer_base {
bool migration_enabled;
bool nohz_active;
bool is_idle;
+ bool must_forward_clk;
DECLARE_BITMAP(pending_map, WHEEL_SIZE);
struct hlist_head vectors[WHEEL_SIZE];
} ____cacheline_aligned;
@@ -856,13 +857,19 @@ get_target_base(struct timer_base *base, unsigned tflags)
static inline void forward_timer_base(struct timer_base *base)
{
- unsigned long jnow = READ_ONCE(jiffies);
+ unsigned long jnow;
/*
- * We only forward the base when it's idle and we have a delta between
- * base clock and jiffies.
+ * We only forward the base when we are idle or have just come out of
+ * idle (must_forward_clk logic), and have a delta between base clock
+ * and jiffies. In the common case, run_timers will take care of it.
*/
- if (!base->is_idle || (long) (jnow - base->clk) < 2)
+ if (likely(!base->must_forward_clk))
+ return;
+
+ jnow = READ_ONCE(jiffies);
+ base->must_forward_clk = base->is_idle;
+ if ((long)(jnow - base->clk) < 2)
return;
/*
@@ -938,6 +945,11 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
* same array bucket then just return:
*/
if (timer_pending(timer)) {
+ /*
+ * The downside of this optimization is that it can result in
+ * larger granularity than you would get from adding a new
+ * timer with this expiry.
+ */
if (timer->expires == expires)
return 1;
@@ -948,6 +960,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
* dequeue/enqueue dance.
*/
base = lock_timer_base(timer, &flags);
+ forward_timer_base(base);
clk = base->clk;
idx = calc_wheel_index(expires, clk);
@@ -964,6 +977,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
}
} else {
base = lock_timer_base(timer, &flags);
+ forward_timer_base(base);
}
ret = detach_if_pending(timer, base, false);
@@ -991,12 +1005,10 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
raw_spin_lock(&base->lock);
WRITE_ONCE(timer->flags,
(timer->flags & ~TIMER_BASEMASK) | base->cpu);
+ forward_timer_base(base);
}
}
- /* Try to forward a stale timer base clock */
- forward_timer_base(base);
-
timer->expires = expires;
/*
* If 'idx' was calculated above and the base time did not advance
@@ -1112,6 +1124,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
WRITE_ONCE(timer->flags,
(timer->flags & ~TIMER_BASEMASK) | cpu);
}
+ forward_timer_base(base);
debug_activate(timer, timer->expires);
internal_add_timer(base, timer);
@@ -1495,12 +1508,18 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
base->is_idle = false;
} else {
if (!is_max_delta)
- expires = basem + (nextevt - basej) * TICK_NSEC;
+ expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
/*
- * If we expect to sleep more than a tick, mark the base idle:
+ * If we expect to sleep more than a tick, mark the base idle.
+ * Also the tick is stopped so any added timer must forward
+ * the base clk itself to keep granularity small. This idle
+ * logic is only maintained for the BASE_STD base, deferrable
+ * timers may still see large granularity skew (by design).
*/
- if ((expires - basem) > TICK_NSEC)
+ if ((expires - basem) > TICK_NSEC) {
+ base->must_forward_clk = true;
base->is_idle = true;
+ }
}
raw_spin_unlock(&base->lock);
@@ -1611,6 +1630,19 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
{
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+ /*
+ * must_forward_clk must be cleared before running timers so that any
+ * timer functions that call mod_timer will not try to forward the
+ * base. idle trcking / clock forwarding logic is only used with
+ * BASE_STD timers.
+ *
+ * The deferrable base does not do idle tracking at all, so we do
+ * not forward it. This can result in very large variations in
+ * granularity for deferrable timers, but they can be deferred for
+ * long periods due to idle.
+ */
+ base->must_forward_clk = false;
+
__run_timers(base);
if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
__run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
diff --git a/kernel/torture.c b/kernel/torture.c
index 55de96529287..637e172835d8 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -117,7 +117,7 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes,
torture_type, cpu);
(*n_offl_successes)++;
delta = jiffies - starttime;
- sum_offl += delta;
+ *sum_offl += delta;
if (*min_offl < 0) {
*min_offl = delta;
*max_offl = delta;
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 7e06f04e98fe..434c840e2d82 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -667,30 +667,30 @@ config RING_BUFFER_STARTUP_TEST
If unsure, say N
-config TRACE_ENUM_MAP_FILE
- bool "Show enum mappings for trace events"
+config TRACE_EVAL_MAP_FILE
+ bool "Show eval mappings for trace events"
depends on TRACING
help
- The "print fmt" of the trace events will show the enum names instead
- of their values. This can cause problems for user space tools that
- use this string to parse the raw data as user space does not know
+ The "print fmt" of the trace events will show the enum/sizeof names
+ instead of their values. This can cause problems for user space tools
+ that use this string to parse the raw data as user space does not know
how to convert the string to its value.
To fix this, there's a special macro in the kernel that can be used
- to convert the enum into its value. If this macro is used, then the
- print fmt strings will have the enums converted to their values.
+ to convert an enum/sizeof into its value. If this macro is used, then
+ the print fmt strings will be converted to their values.
If something does not get converted properly, this option can be
- used to show what enums the kernel tried to convert.
+ used to show what enums/sizeof the kernel tried to convert.
- This option is for debugging the enum conversions. A file is created
- in the tracing directory called "enum_map" that will show the enum
+ This option is for debugging the conversions. A file is created
+ in the tracing directory called "eval_map" that will show the
names matched with their values and what trace event system they
belong too.
Normally, the mapping of the strings to values will be freed after
boot up or module load. With this option, they will not be freed, as
- they are needed for the "enum_map" file. Enabling this option will
+ they are needed for the "eval_map" file. Enabling this option will
increase the memory footprint of the running kernel.
If unsure, say N
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index bc364f86100a..2a685b45b73b 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -27,6 +27,7 @@
#include <linux/time.h>
#include <linux/uaccess.h>
#include <linux/list.h>
+#include <linux/blk-cgroup.h>
#include "../../block/blk.h"
@@ -46,10 +47,16 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock);
/* Select an alternative, minimalistic output than the original one */
#define TRACE_BLK_OPT_CLASSIC 0x1
+#define TRACE_BLK_OPT_CGROUP 0x2
+#define TRACE_BLK_OPT_CGNAME 0x4
static struct tracer_opt blk_tracer_opts[] = {
/* Default disable the minimalistic output */
{ TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) },
+#ifdef CONFIG_BLK_CGROUP
+ { TRACER_OPT(blk_cgroup, TRACE_BLK_OPT_CGROUP) },
+ { TRACER_OPT(blk_cgname, TRACE_BLK_OPT_CGNAME) },
+#endif
{ }
};
@@ -68,7 +75,8 @@ static void blk_unregister_tracepoints(void);
* Send out a notify message.
*/
static void trace_note(struct blk_trace *bt, pid_t pid, int action,
- const void *data, size_t len)
+ const void *data, size_t len,
+ union kernfs_node_id *cgid)
{
struct blk_io_trace *t;
struct ring_buffer_event *event = NULL;
@@ -76,12 +84,13 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
int pc = 0;
int cpu = smp_processor_id();
bool blk_tracer = blk_tracer_enabled;
+ ssize_t cgid_len = cgid ? sizeof(*cgid) : 0;
if (blk_tracer) {
buffer = blk_tr->trace_buffer.buffer;
pc = preempt_count();
event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
- sizeof(*t) + len,
+ sizeof(*t) + len + cgid_len,
0, pc);
if (!event)
return;
@@ -92,17 +101,19 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
if (!bt->rchan)
return;
- t = relay_reserve(bt->rchan, sizeof(*t) + len);
+ t = relay_reserve(bt->rchan, sizeof(*t) + len + cgid_len);
if (t) {
t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
t->time = ktime_to_ns(ktime_get());
record_it:
t->device = bt->dev;
- t->action = action;
+ t->action = action | (cgid ? __BLK_TN_CGROUP : 0);
t->pid = pid;
t->cpu = cpu;
- t->pdu_len = len;
- memcpy((void *) t + sizeof(*t), data, len);
+ t->pdu_len = len + cgid_len;
+ if (cgid)
+ memcpy((void *)t + sizeof(*t), cgid, cgid_len);
+ memcpy((void *) t + sizeof(*t) + cgid_len, data, len);
if (blk_tracer)
trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc);
@@ -122,7 +133,7 @@ static void trace_note_tsk(struct task_struct *tsk)
spin_lock_irqsave(&running_trace_lock, flags);
list_for_each_entry(bt, &running_trace_list, running_list) {
trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm,
- sizeof(tsk->comm));
+ sizeof(tsk->comm), NULL);
}
spin_unlock_irqrestore(&running_trace_lock, flags);
}
@@ -139,11 +150,12 @@ static void trace_note_time(struct blk_trace *bt)
words[1] = now.tv_nsec;
local_irq_save(flags);
- trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words));
+ trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), NULL);
local_irq_restore(flags);
}
-void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
+void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg,
+ const char *fmt, ...)
{
int n;
va_list args;
@@ -167,7 +179,14 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
va_end(args);
- trace_note(bt, 0, BLK_TN_MESSAGE, buf, n);
+ if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
+ blkcg = NULL;
+#ifdef CONFIG_BLK_CGROUP
+ trace_note(bt, 0, BLK_TN_MESSAGE, buf, n,
+ blkcg ? cgroup_get_kernfs_id(blkcg->css.cgroup) : NULL);
+#else
+ trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, NULL);
+#endif
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(__trace_note_message);
@@ -204,7 +223,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
*/
static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
int op, int op_flags, u32 what, int error, int pdu_len,
- void *pdu_data)
+ void *pdu_data, union kernfs_node_id *cgid)
{
struct task_struct *tsk = current;
struct ring_buffer_event *event = NULL;
@@ -215,6 +234,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
pid_t pid;
int cpu, pc = 0;
bool blk_tracer = blk_tracer_enabled;
+ ssize_t cgid_len = cgid ? sizeof(*cgid) : 0;
if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
return;
@@ -229,6 +249,8 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
what |= BLK_TC_ACT(BLK_TC_DISCARD);
if (op == REQ_OP_FLUSH)
what |= BLK_TC_ACT(BLK_TC_FLUSH);
+ if (cgid)
+ what |= __BLK_TA_CGROUP;
pid = tsk->pid;
if (act_log_check(bt, what, sector, pid))
@@ -241,7 +263,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
buffer = blk_tr->trace_buffer.buffer;
pc = preempt_count();
event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
- sizeof(*t) + pdu_len,
+ sizeof(*t) + pdu_len + cgid_len,
0, pc);
if (!event)
return;
@@ -258,7 +280,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
* from coming in and stepping on our toes.
*/
local_irq_save(flags);
- t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
+ t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len + cgid_len);
if (t) {
sequence = per_cpu_ptr(bt->sequence, cpu);
@@ -280,10 +302,12 @@ record_it:
t->action = what;
t->device = bt->dev;
t->error = error;
- t->pdu_len = pdu_len;
+ t->pdu_len = pdu_len + cgid_len;
+ if (cgid_len)
+ memcpy((void *)t + sizeof(*t), cgid, cgid_len);
if (pdu_len)
- memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
+ memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len);
if (blk_tracer) {
trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc);
@@ -359,7 +383,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
return PTR_ERR(msg);
bt = filp->private_data;
- __trace_note_message(bt, "%s", msg);
+ __trace_note_message(bt, NULL, "%s", msg);
kfree(msg);
return count;
@@ -684,6 +708,36 @@ void blk_trace_shutdown(struct request_queue *q)
}
}
+#ifdef CONFIG_BLK_CGROUP
+static union kernfs_node_id *
+blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
+{
+ struct blk_trace *bt = q->blk_trace;
+
+ if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
+ return NULL;
+
+ if (!bio->bi_css)
+ return NULL;
+ return cgroup_get_kernfs_id(bio->bi_css->cgroup);
+}
+#else
+static union kernfs_node_id *
+blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
+{
+ return NULL;
+}
+#endif
+
+static union kernfs_node_id *
+blk_trace_request_get_cgid(struct request_queue *q, struct request *rq)
+{
+ if (!rq->bio)
+ return NULL;
+ /* Use the first bio */
+ return blk_trace_bio_get_cgid(q, rq->bio);
+}
+
/*
* blktrace probes
*/
@@ -694,13 +748,15 @@ void blk_trace_shutdown(struct request_queue *q)
* @error: return status to log
* @nr_bytes: number of completed bytes
* @what: the action
+ * @cgid: the cgroup info
*
* Description:
* Records an action against a request. Will log the bio offset + size.
*
**/
static void blk_add_trace_rq(struct request *rq, int error,
- unsigned int nr_bytes, u32 what)
+ unsigned int nr_bytes, u32 what,
+ union kernfs_node_id *cgid)
{
struct blk_trace *bt = rq->q->blk_trace;
@@ -713,32 +769,36 @@ static void blk_add_trace_rq(struct request *rq, int error,
what |= BLK_TC_ACT(BLK_TC_FS);
__blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq),
- rq->cmd_flags, what, error, 0, NULL);
+ rq->cmd_flags, what, error, 0, NULL, cgid);
}
static void blk_add_trace_rq_insert(void *ignore,
struct request_queue *q, struct request *rq)
{
- blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT);
+ blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT,
+ blk_trace_request_get_cgid(q, rq));
}
static void blk_add_trace_rq_issue(void *ignore,
struct request_queue *q, struct request *rq)
{
- blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE);
+ blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE,
+ blk_trace_request_get_cgid(q, rq));
}
static void blk_add_trace_rq_requeue(void *ignore,
struct request_queue *q,
struct request *rq)
{
- blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE);
+ blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE,
+ blk_trace_request_get_cgid(q, rq));
}
static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
int error, unsigned int nr_bytes)
{
- blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE);
+ blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE,
+ blk_trace_request_get_cgid(rq->q, rq));
}
/**
@@ -753,7 +813,7 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
*
**/
static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
- u32 what, int error)
+ u32 what, int error, union kernfs_node_id *cgid)
{
struct blk_trace *bt = q->blk_trace;
@@ -761,20 +821,22 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
return;
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
- bio_op(bio), bio->bi_opf, what, error, 0, NULL);
+ bio_op(bio), bio->bi_opf, what, error, 0, NULL, cgid);
}
static void blk_add_trace_bio_bounce(void *ignore,
struct request_queue *q, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
+ blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0,
+ blk_trace_bio_get_cgid(q, bio));
}
static void blk_add_trace_bio_complete(void *ignore,
struct request_queue *q, struct bio *bio,
int error)
{
- blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
+ blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error,
+ blk_trace_bio_get_cgid(q, bio));
}
static void blk_add_trace_bio_backmerge(void *ignore,
@@ -782,7 +844,8 @@ static void blk_add_trace_bio_backmerge(void *ignore,
struct request *rq,
struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
+ blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0,
+ blk_trace_bio_get_cgid(q, bio));
}
static void blk_add_trace_bio_frontmerge(void *ignore,
@@ -790,13 +853,15 @@ static void blk_add_trace_bio_frontmerge(void *ignore,
struct request *rq,
struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
+ blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0,
+ blk_trace_bio_get_cgid(q, bio));
}
static void blk_add_trace_bio_queue(void *ignore,
struct request_queue *q, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0);
+ blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0,
+ blk_trace_bio_get_cgid(q, bio));
}
static void blk_add_trace_getrq(void *ignore,
@@ -804,13 +869,14 @@ static void blk_add_trace_getrq(void *ignore,
struct bio *bio, int rw)
{
if (bio)
- blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
+ blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0,
+ blk_trace_bio_get_cgid(q, bio));
else {
struct blk_trace *bt = q->blk_trace;
if (bt)
__blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0,
- NULL);
+ NULL, NULL);
}
}
@@ -820,13 +886,14 @@ static void blk_add_trace_sleeprq(void *ignore,
struct bio *bio, int rw)
{
if (bio)
- blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
+ blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0,
+ blk_trace_bio_get_cgid(q, bio));
else {
struct blk_trace *bt = q->blk_trace;
if (bt)
__blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ,
- 0, 0, NULL);
+ 0, 0, NULL, NULL);
}
}
@@ -835,7 +902,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q)
struct blk_trace *bt = q->blk_trace;
if (bt)
- __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
+ __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, NULL);
}
static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
@@ -852,7 +919,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
else
what = BLK_TA_UNPLUG_TIMER;
- __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
+ __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, NULL);
}
}
@@ -868,7 +935,7 @@ static void blk_add_trace_split(void *ignore,
__blk_add_trace(bt, bio->bi_iter.bi_sector,
bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf,
BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu),
- &rpdu);
+ &rpdu, blk_trace_bio_get_cgid(q, bio));
}
}
@@ -896,12 +963,12 @@ static void blk_add_trace_bio_remap(void *ignore,
return;
r.device_from = cpu_to_be32(dev);
- r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev);
+ r.device_to = cpu_to_be32(bio_dev(bio));
r.sector_from = cpu_to_be64(from);
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status,
- sizeof(r), &r);
+ sizeof(r), &r, blk_trace_bio_get_cgid(q, bio));
}
/**
@@ -934,7 +1001,7 @@ static void blk_add_trace_rq_remap(void *ignore,
__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
rq_data_dir(rq), 0, BLK_TA_REMAP, 0,
- sizeof(r), &r);
+ sizeof(r), &r, blk_trace_request_get_cgid(q, rq));
}
/**
@@ -958,7 +1025,8 @@ void blk_add_driver_data(struct request_queue *q,
return;
__blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0,
- BLK_TA_DRV_DATA, 0, len, data);
+ BLK_TA_DRV_DATA, 0, len, data,
+ blk_trace_request_get_cgid(q, rq));
}
EXPORT_SYMBOL_GPL(blk_add_driver_data);
@@ -1031,7 +1099,7 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
int i = 0;
int tc = t->action >> BLK_TC_SHIFT;
- if (t->action == BLK_TN_MESSAGE) {
+ if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) {
rwbs[i++] = 'N';
goto out;
}
@@ -1066,9 +1134,21 @@ const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent)
return (const struct blk_io_trace *)ent;
}
-static inline const void *pdu_start(const struct trace_entry *ent)
+static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg)
+{
+ return (void *)(te_blk_io_trace(ent) + 1) +
+ (has_cg ? sizeof(union kernfs_node_id) : 0);
+}
+
+static inline const void *cgid_start(const struct trace_entry *ent)
+{
+ return (void *)(te_blk_io_trace(ent) + 1);
+}
+
+static inline int pdu_real_len(const struct trace_entry *ent, bool has_cg)
{
- return te_blk_io_trace(ent) + 1;
+ return te_blk_io_trace(ent)->pdu_len -
+ (has_cg ? sizeof(union kernfs_node_id) : 0);
}
static inline u32 t_action(const struct trace_entry *ent)
@@ -1096,16 +1176,16 @@ static inline __u16 t_error(const struct trace_entry *ent)
return te_blk_io_trace(ent)->error;
}
-static __u64 get_pdu_int(const struct trace_entry *ent)
+static __u64 get_pdu_int(const struct trace_entry *ent, bool has_cg)
{
- const __u64 *val = pdu_start(ent);
+ const __u64 *val = pdu_start(ent, has_cg);
return be64_to_cpu(*val);
}
static void get_pdu_remap(const struct trace_entry *ent,
- struct blk_io_trace_remap *r)
+ struct blk_io_trace_remap *r, bool has_cg)
{
- const struct blk_io_trace_remap *__r = pdu_start(ent);
+ const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg);
__u64 sector_from = __r->sector_from;
r->device_from = be32_to_cpu(__r->device_from);
@@ -1113,9 +1193,11 @@ static void get_pdu_remap(const struct trace_entry *ent,
r->sector_from = be64_to_cpu(sector_from);
}
-typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act);
+typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act,
+ bool has_cg);
-static void blk_log_action_classic(struct trace_iterator *iter, const char *act)
+static void blk_log_action_classic(struct trace_iterator *iter, const char *act,
+ bool has_cg)
{
char rwbs[RWBS_LEN];
unsigned long long ts = iter->ts;
@@ -1131,24 +1213,43 @@ static void blk_log_action_classic(struct trace_iterator *iter, const char *act)
secs, nsec_rem, iter->ent->pid, act, rwbs);
}
-static void blk_log_action(struct trace_iterator *iter, const char *act)
+static void blk_log_action(struct trace_iterator *iter, const char *act,
+ bool has_cg)
{
char rwbs[RWBS_LEN];
const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
fill_rwbs(rwbs, t);
- trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
- MAJOR(t->device), MINOR(t->device), act, rwbs);
-}
-
-static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
+ if (has_cg) {
+ const union kernfs_node_id *id = cgid_start(iter->ent);
+
+ if (blk_tracer_flags.val & TRACE_BLK_OPT_CGNAME) {
+ char blkcg_name_buf[NAME_MAX + 1] = "<...>";
+
+ cgroup_path_from_kernfs_id(id, blkcg_name_buf,
+ sizeof(blkcg_name_buf));
+ trace_seq_printf(&iter->seq, "%3d,%-3d %s %2s %3s ",
+ MAJOR(t->device), MINOR(t->device),
+ blkcg_name_buf, act, rwbs);
+ } else
+ trace_seq_printf(&iter->seq,
+ "%3d,%-3d %x,%-x %2s %3s ",
+ MAJOR(t->device), MINOR(t->device),
+ id->ino, id->generation, act, rwbs);
+ } else
+ trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
+ MAJOR(t->device), MINOR(t->device), act, rwbs);
+}
+
+static void blk_log_dump_pdu(struct trace_seq *s,
+ const struct trace_entry *ent, bool has_cg)
{
const unsigned char *pdu_buf;
int pdu_len;
int i, end;
- pdu_buf = pdu_start(ent);
- pdu_len = te_blk_io_trace(ent)->pdu_len;
+ pdu_buf = pdu_start(ent, has_cg);
+ pdu_len = pdu_real_len(ent, has_cg);
if (!pdu_len)
return;
@@ -1179,7 +1280,7 @@ static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
trace_seq_puts(s, ") ");
}
-static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
char cmd[TASK_COMM_LEN];
@@ -1187,7 +1288,7 @@ static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
trace_seq_printf(s, "%u ", t_bytes(ent));
- blk_log_dump_pdu(s, ent);
+ blk_log_dump_pdu(s, ent, has_cg);
trace_seq_printf(s, "[%s]\n", cmd);
} else {
if (t_sec(ent))
@@ -1199,10 +1300,10 @@ static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
}
static void blk_log_with_error(struct trace_seq *s,
- const struct trace_entry *ent)
+ const struct trace_entry *ent, bool has_cg)
{
if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
- blk_log_dump_pdu(s, ent);
+ blk_log_dump_pdu(s, ent, has_cg);
trace_seq_printf(s, "[%d]\n", t_error(ent));
} else {
if (t_sec(ent))
@@ -1215,18 +1316,18 @@ static void blk_log_with_error(struct trace_seq *s,
}
}
-static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
struct blk_io_trace_remap r = { .device_from = 0, };
- get_pdu_remap(ent, &r);
+ get_pdu_remap(ent, &r, has_cg);
trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
t_sector(ent), t_sec(ent),
MAJOR(r.device_from), MINOR(r.device_from),
(unsigned long long)r.sector_from);
}
-static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
char cmd[TASK_COMM_LEN];
@@ -1235,30 +1336,31 @@ static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
trace_seq_printf(s, "[%s]\n", cmd);
}
-static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
char cmd[TASK_COMM_LEN];
trace_find_cmdline(ent->pid, cmd);
- trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent));
+ trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent, has_cg));
}
-static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
char cmd[TASK_COMM_LEN];
trace_find_cmdline(ent->pid, cmd);
trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
- get_pdu_int(ent), cmd);
+ get_pdu_int(ent, has_cg), cmd);
}
-static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent,
+ bool has_cg)
{
- const struct blk_io_trace *t = te_blk_io_trace(ent);
- trace_seq_putmem(s, t + 1, t->pdu_len);
+ trace_seq_putmem(s, pdu_start(ent, has_cg),
+ pdu_real_len(ent, has_cg));
trace_seq_putc(s, '\n');
}
@@ -1298,7 +1400,8 @@ static void blk_tracer_reset(struct trace_array *tr)
static const struct {
const char *act[2];
- void (*print)(struct trace_seq *s, const struct trace_entry *ent);
+ void (*print)(struct trace_seq *s, const struct trace_entry *ent,
+ bool has_cg);
} what2act[] = {
[__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic },
[__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic },
@@ -1326,23 +1429,25 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
u16 what;
bool long_act;
blk_log_action_t *log_action;
+ bool has_cg;
t = te_blk_io_trace(iter->ent);
- what = t->action & ((1 << BLK_TC_SHIFT) - 1);
+ what = (t->action & ((1 << BLK_TC_SHIFT) - 1)) & ~__BLK_TA_CGROUP;
long_act = !!(tr->trace_flags & TRACE_ITER_VERBOSE);
log_action = classic ? &blk_log_action_classic : &blk_log_action;
+ has_cg = t->action & __BLK_TA_CGROUP;
- if (t->action == BLK_TN_MESSAGE) {
- log_action(iter, long_act ? "message" : "m");
- blk_log_msg(s, iter->ent);
+ if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) {
+ log_action(iter, long_act ? "message" : "m", has_cg);
+ blk_log_msg(s, iter->ent, has_cg);
return trace_handle_return(s);
}
if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
trace_seq_printf(s, "Unknown action %x\n", what);
else {
- log_action(iter, what2act[what].act[long_act]);
- what2act[what].print(s, iter->ent);
+ log_action(iter, what2act[what].act[long_act], has_cg);
+ what2act[what].print(s, iter->ent, has_cg);
}
return trace_handle_return(s);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 460a031c77e5..dc498b605d5d 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -122,8 +122,8 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
}
/*
- * limited trace_printk()
- * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed
+ * Only limited trace_printk() conversion specifiers allowed:
+ * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %s
*/
BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
u64, arg2, u64, arg3)
@@ -198,15 +198,42 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
i++;
}
- if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x')
+ if (fmt[i] != 'i' && fmt[i] != 'd' &&
+ fmt[i] != 'u' && fmt[i] != 'x')
return -EINVAL;
fmt_cnt++;
}
- return __trace_printk(1/* fake ip will not be printed */, fmt,
- mod[0] == 2 ? arg1 : mod[0] == 1 ? (long) arg1 : (u32) arg1,
- mod[1] == 2 ? arg2 : mod[1] == 1 ? (long) arg2 : (u32) arg2,
- mod[2] == 2 ? arg3 : mod[2] == 1 ? (long) arg3 : (u32) arg3);
+/* Horrid workaround for getting va_list handling working with different
+ * argument type combinations generically for 32 and 64 bit archs.
+ */
+#define __BPF_TP_EMIT() __BPF_ARG3_TP()
+#define __BPF_TP(...) \
+ __trace_printk(1 /* Fake ip will not be printed. */, \
+ fmt, ##__VA_ARGS__)
+
+#define __BPF_ARG1_TP(...) \
+ ((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64)) \
+ ? __BPF_TP(arg1, ##__VA_ARGS__) \
+ : ((mod[0] == 1 || (mod[0] == 0 && __BITS_PER_LONG == 32)) \
+ ? __BPF_TP((long)arg1, ##__VA_ARGS__) \
+ : __BPF_TP((u32)arg1, ##__VA_ARGS__)))
+
+#define __BPF_ARG2_TP(...) \
+ ((mod[1] == 2 || (mod[1] == 1 && __BITS_PER_LONG == 64)) \
+ ? __BPF_ARG1_TP(arg2, ##__VA_ARGS__) \
+ : ((mod[1] == 1 || (mod[1] == 0 && __BITS_PER_LONG == 32)) \
+ ? __BPF_ARG1_TP((long)arg2, ##__VA_ARGS__) \
+ : __BPF_ARG1_TP((u32)arg2, ##__VA_ARGS__)))
+
+#define __BPF_ARG3_TP(...) \
+ ((mod[2] == 2 || (mod[2] == 1 && __BITS_PER_LONG == 64)) \
+ ? __BPF_ARG2_TP(arg3, ##__VA_ARGS__) \
+ : ((mod[2] == 1 || (mod[2] == 0 && __BITS_PER_LONG == 32)) \
+ ? __BPF_ARG2_TP((long)arg3, ##__VA_ARGS__) \
+ : __BPF_ARG2_TP((u32)arg3, ##__VA_ARGS__)))
+
+ return __BPF_TP_EMIT();
}
static const struct bpf_func_proto bpf_trace_printk_proto = {
@@ -234,7 +261,8 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
unsigned int cpu = smp_processor_id();
u64 index = flags & BPF_F_INDEX_MASK;
struct bpf_event_entry *ee;
- struct perf_event *event;
+ u64 value = 0;
+ int err;
if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
return -EINVAL;
@@ -247,21 +275,14 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
if (!ee)
return -ENOENT;
- event = ee->event;
- if (unlikely(event->attr.type != PERF_TYPE_HARDWARE &&
- event->attr.type != PERF_TYPE_RAW))
- return -EINVAL;
-
- /* make sure event is local and doesn't have pmu::count */
- if (unlikely(event->oncpu != cpu || event->pmu->count))
- return -EINVAL;
-
+ err = perf_event_read_local(ee->event, &value);
/*
- * we don't know if the function is run successfully by the
- * return value. It can be judged in other places, such as
- * eBPF programs.
+ * this api is ugly since we miss [-22..-2] range of valid
+ * counter values, but that's uapi
*/
- return perf_event_read_local(event);
+ if (err)
+ return err;
+ return value;
}
static const struct bpf_func_proto bpf_perf_event_read_proto = {
@@ -272,14 +293,16 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = {
.arg2_type = ARG_ANYTHING,
};
+static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd);
+
static __always_inline u64
__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
u64 flags, struct perf_raw_record *raw)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct perf_sample_data *sd = this_cpu_ptr(&bpf_sd);
unsigned int cpu = smp_processor_id();
u64 index = flags & BPF_F_INDEX_MASK;
- struct perf_sample_data sample_data;
struct bpf_event_entry *ee;
struct perf_event *event;
@@ -300,9 +323,9 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
if (unlikely(event->oncpu != cpu))
return -EOPNOTSUPP;
- perf_sample_data_init(&sample_data, 0, 0);
- sample_data.raw = raw;
- perf_event_output(event, &sample_data, regs);
+ perf_sample_data_init(sd, 0, 0);
+ sd->raw = raw;
+ perf_event_output(event, sd, regs);
return 0;
}
@@ -483,7 +506,7 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
/* bpf+kprobe programs can access fields of 'struct pt_regs' */
static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
- enum bpf_reg_type *reg_type)
+ struct bpf_insn_access_aux *info)
{
if (off < 0 || off >= sizeof(struct pt_regs))
return false;
@@ -566,7 +589,7 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
}
static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type,
- enum bpf_reg_type *reg_type)
+ struct bpf_insn_access_aux *info)
{
if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE)
return false;
@@ -585,40 +608,47 @@ const struct bpf_verifier_ops tracepoint_prog_ops = {
};
static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
- enum bpf_reg_type *reg_type)
+ struct bpf_insn_access_aux *info)
{
+ const int size_sp = FIELD_SIZEOF(struct bpf_perf_event_data,
+ sample_period);
+
if (off < 0 || off >= sizeof(struct bpf_perf_event_data))
return false;
if (type != BPF_READ)
return false;
if (off % size != 0)
return false;
- if (off == offsetof(struct bpf_perf_event_data, sample_period)) {
- if (size != sizeof(u64))
+
+ switch (off) {
+ case bpf_ctx_range(struct bpf_perf_event_data, sample_period):
+ bpf_ctx_record_field_size(info, size_sp);
+ if (!bpf_ctx_narrow_access_ok(off, size, size_sp))
return false;
- } else {
+ break;
+ default:
if (size != sizeof(long))
return false;
}
+
return true;
}
static u32 pe_prog_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
- struct bpf_prog *prog)
+ struct bpf_prog *prog, u32 *target_size)
{
struct bpf_insn *insn = insn_buf;
switch (si->off) {
case offsetof(struct bpf_perf_event_data, sample_period):
- BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64));
-
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
data), si->dst_reg, si->src_reg,
offsetof(struct bpf_perf_event_data_kern, data));
*insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg,
- offsetof(struct perf_sample_data, period));
+ bpf_target_off(struct perf_sample_data, period, 8,
+ target_size));
break;
default:
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b308be30dfb9..96cea88fa00f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -113,7 +113,7 @@ static int ftrace_disabled __read_mostly;
static DEFINE_MUTEX(ftrace_lock);
-static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
+static struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end;
ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
static struct ftrace_ops global_ops;
@@ -169,8 +169,11 @@ int ftrace_nr_registered_ops(void)
mutex_lock(&ftrace_lock);
- for (ops = ftrace_ops_list;
- ops != &ftrace_list_end; ops = ops->next)
+ for (ops = rcu_dereference_protected(ftrace_ops_list,
+ lockdep_is_held(&ftrace_lock));
+ ops != &ftrace_list_end;
+ ops = rcu_dereference_protected(ops->next,
+ lockdep_is_held(&ftrace_lock)))
cnt++;
mutex_unlock(&ftrace_lock);
@@ -275,10 +278,11 @@ static void update_ftrace_function(void)
* If there's only one ftrace_ops registered, the ftrace_ops_list
* will point to the ops we want.
*/
- set_function_trace_op = ftrace_ops_list;
+ set_function_trace_op = rcu_dereference_protected(ftrace_ops_list,
+ lockdep_is_held(&ftrace_lock));
/* If there's no ftrace_ops registered, just call the stub function */
- if (ftrace_ops_list == &ftrace_list_end) {
+ if (set_function_trace_op == &ftrace_list_end) {
func = ftrace_stub;
/*
@@ -286,7 +290,8 @@ static void update_ftrace_function(void)
* recursion safe and not dynamic and the arch supports passing ops,
* then have the mcount trampoline call the function directly.
*/
- } else if (ftrace_ops_list->next == &ftrace_list_end) {
+ } else if (rcu_dereference_protected(ftrace_ops_list->next,
+ lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) {
func = ftrace_ops_get_list_func(ftrace_ops_list);
} else {
@@ -348,9 +353,11 @@ int using_ftrace_ops_list_func(void)
return ftrace_trace_function == ftrace_ops_list_func;
}
-static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
+static void add_ftrace_ops(struct ftrace_ops __rcu **list,
+ struct ftrace_ops *ops)
{
- ops->next = *list;
+ rcu_assign_pointer(ops->next, *list);
+
/*
* We are entering ops into the list but another
* CPU might be walking that list. We need to make sure
@@ -360,7 +367,8 @@ static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
rcu_assign_pointer(*list, ops);
}
-static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
+static int remove_ftrace_ops(struct ftrace_ops __rcu **list,
+ struct ftrace_ops *ops)
{
struct ftrace_ops **p;
@@ -368,7 +376,10 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
* If we are removing the last function, then simply point
* to the ftrace_stub.
*/
- if (*list == ops && ops->next == &ftrace_list_end) {
+ if (rcu_dereference_protected(*list,
+ lockdep_is_held(&ftrace_lock)) == ops &&
+ rcu_dereference_protected(ops->next,
+ lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) {
*list = &ftrace_list_end;
return 0;
}
@@ -878,6 +889,10 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace)
function_profile_call(trace->func, 0, NULL, NULL);
+ /* If function graph is shutting down, ret_stack can be NULL */
+ if (!current->ret_stack)
+ return 0;
+
if (index >= 0 && index < FTRACE_RETFUNC_DEPTH)
current->ret_stack[index].subtime = 0;
@@ -1293,6 +1308,28 @@ static void ftrace_hash_clear(struct ftrace_hash *hash)
FTRACE_WARN_ON(hash->count);
}
+static void free_ftrace_mod(struct ftrace_mod_load *ftrace_mod)
+{
+ list_del(&ftrace_mod->list);
+ kfree(ftrace_mod->module);
+ kfree(ftrace_mod->func);
+ kfree(ftrace_mod);
+}
+
+static void clear_ftrace_mod_list(struct list_head *head)
+{
+ struct ftrace_mod_load *p, *n;
+
+ /* stack tracer isn't supported yet */
+ if (!head)
+ return;
+
+ mutex_lock(&ftrace_lock);
+ list_for_each_entry_safe(p, n, head, list)
+ free_ftrace_mod(p);
+ mutex_unlock(&ftrace_lock);
+}
+
static void free_ftrace_hash(struct ftrace_hash *hash)
{
if (!hash || hash == EMPTY_HASH)
@@ -1346,6 +1383,35 @@ static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
return hash;
}
+
+static int ftrace_add_mod(struct trace_array *tr,
+ const char *func, const char *module,
+ int enable)
+{
+ struct ftrace_mod_load *ftrace_mod;
+ struct list_head *mod_head = enable ? &tr->mod_trace : &tr->mod_notrace;
+
+ ftrace_mod = kzalloc(sizeof(*ftrace_mod), GFP_KERNEL);
+ if (!ftrace_mod)
+ return -ENOMEM;
+
+ ftrace_mod->func = kstrdup(func, GFP_KERNEL);
+ ftrace_mod->module = kstrdup(module, GFP_KERNEL);
+ ftrace_mod->enable = enable;
+
+ if (!ftrace_mod->func || !ftrace_mod->module)
+ goto out_free;
+
+ list_add(&ftrace_mod->list, mod_head);
+
+ return 0;
+
+ out_free:
+ free_ftrace_mod(ftrace_mod);
+
+ return -ENOMEM;
+}
+
static struct ftrace_hash *
alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
{
@@ -1359,6 +1425,9 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
if (!new_hash)
return NULL;
+ if (hash)
+ new_hash->flags = hash->flags;
+
/* Empty hash? */
if (ftrace_hash_empty(hash))
return new_hash;
@@ -1403,7 +1472,7 @@ __ftrace_hash_move(struct ftrace_hash *src)
/*
* If the new source is empty, just return the empty_hash.
*/
- if (!src->count)
+ if (ftrace_hash_empty(src))
return EMPTY_HASH;
/*
@@ -1420,6 +1489,8 @@ __ftrace_hash_move(struct ftrace_hash *src)
if (!new_hash)
return NULL;
+ new_hash->flags = src->flags;
+
size = 1 << src->size_bits;
for (i = 0; i < size; i++) {
hhd = &src->buckets[i];
@@ -1513,8 +1584,8 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
return 0;
#endif
- hash.filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash);
- hash.notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash);
+ rcu_assign_pointer(hash.filter_hash, ops->func_hash->filter_hash);
+ rcu_assign_pointer(hash.notrace_hash, ops->func_hash->notrace_hash);
if (hash_contains_ip(ip, &hash))
ret = 1;
@@ -1650,7 +1721,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
struct dyn_ftrace *rec;
bool update = false;
int count = 0;
- int all = 0;
+ int all = false;
/* Only update if the ops has been registered */
if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
@@ -1671,7 +1742,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
hash = ops->func_hash->filter_hash;
other_hash = ops->func_hash->notrace_hash;
if (ftrace_hash_empty(hash))
- all = 1;
+ all = true;
} else {
inc = !inc;
hash = ops->func_hash->notrace_hash;
@@ -2784,7 +2855,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
* If there's no more ops registered with ftrace, run a
* sanity check to make sure all rec flags are cleared.
*/
- if (ftrace_ops_list == &ftrace_list_end) {
+ if (rcu_dereference_protected(ftrace_ops_list,
+ lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) {
struct ftrace_page *pg;
struct dyn_ftrace *rec;
@@ -3061,6 +3133,7 @@ ftrace_allocate_pages(unsigned long num_to_init)
struct ftrace_iterator {
loff_t pos;
loff_t func_pos;
+ loff_t mod_pos;
struct ftrace_page *pg;
struct dyn_ftrace *func;
struct ftrace_func_probe *probe;
@@ -3068,6 +3141,8 @@ struct ftrace_iterator {
struct trace_parser parser;
struct ftrace_hash *hash;
struct ftrace_ops *ops;
+ struct trace_array *tr;
+ struct list_head *mod_list;
int pidx;
int idx;
unsigned flags;
@@ -3152,13 +3227,13 @@ static void *t_probe_start(struct seq_file *m, loff_t *pos)
if (!(iter->flags & FTRACE_ITER_DO_PROBES))
return NULL;
- if (iter->func_pos > *pos)
+ if (iter->mod_pos > *pos)
return NULL;
iter->probe = NULL;
iter->probe_entry = NULL;
iter->pidx = 0;
- for (l = 0; l <= (*pos - iter->func_pos); ) {
+ for (l = 0; l <= (*pos - iter->mod_pos); ) {
p = t_probe_next(m, &l);
if (!p)
break;
@@ -3197,6 +3272,82 @@ t_probe_show(struct seq_file *m, struct ftrace_iterator *iter)
}
static void *
+t_mod_next(struct seq_file *m, loff_t *pos)
+{
+ struct ftrace_iterator *iter = m->private;
+ struct trace_array *tr = iter->tr;
+
+ (*pos)++;
+ iter->pos = *pos;
+
+ iter->mod_list = iter->mod_list->next;
+
+ if (iter->mod_list == &tr->mod_trace ||
+ iter->mod_list == &tr->mod_notrace) {
+ iter->flags &= ~FTRACE_ITER_MOD;
+ return NULL;
+ }
+
+ iter->mod_pos = *pos;
+
+ return iter;
+}
+
+static void *t_mod_start(struct seq_file *m, loff_t *pos)
+{
+ struct ftrace_iterator *iter = m->private;
+ void *p = NULL;
+ loff_t l;
+
+ if (iter->func_pos > *pos)
+ return NULL;
+
+ iter->mod_pos = iter->func_pos;
+
+ /* probes are only available if tr is set */
+ if (!iter->tr)
+ return NULL;
+
+ for (l = 0; l <= (*pos - iter->func_pos); ) {
+ p = t_mod_next(m, &l);
+ if (!p)
+ break;
+ }
+ if (!p) {
+ iter->flags &= ~FTRACE_ITER_MOD;
+ return t_probe_start(m, pos);
+ }
+
+ /* Only set this if we have an item */
+ iter->flags |= FTRACE_ITER_MOD;
+
+ return iter;
+}
+
+static int
+t_mod_show(struct seq_file *m, struct ftrace_iterator *iter)
+{
+ struct ftrace_mod_load *ftrace_mod;
+ struct trace_array *tr = iter->tr;
+
+ if (WARN_ON_ONCE(!iter->mod_list) ||
+ iter->mod_list == &tr->mod_trace ||
+ iter->mod_list == &tr->mod_notrace)
+ return -EIO;
+
+ ftrace_mod = list_entry(iter->mod_list, struct ftrace_mod_load, list);
+
+ if (ftrace_mod->func)
+ seq_printf(m, "%s", ftrace_mod->func);
+ else
+ seq_putc(m, '*');
+
+ seq_printf(m, ":mod:%s\n", ftrace_mod->module);
+
+ return 0;
+}
+
+static void *
t_func_next(struct seq_file *m, loff_t *pos)
{
struct ftrace_iterator *iter = m->private;
@@ -3237,7 +3388,7 @@ static void *
t_next(struct seq_file *m, void *v, loff_t *pos)
{
struct ftrace_iterator *iter = m->private;
- loff_t l = *pos; /* t_hash_start() must use original pos */
+ loff_t l = *pos; /* t_probe_start() must use original pos */
void *ret;
if (unlikely(ftrace_disabled))
@@ -3246,16 +3397,19 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
if (iter->flags & FTRACE_ITER_PROBE)
return t_probe_next(m, pos);
+ if (iter->flags & FTRACE_ITER_MOD)
+ return t_mod_next(m, pos);
+
if (iter->flags & FTRACE_ITER_PRINTALL) {
/* next must increment pos, and t_probe_start does not */
(*pos)++;
- return t_probe_start(m, &l);
+ return t_mod_start(m, &l);
}
ret = t_func_next(m, pos);
if (!ret)
- return t_probe_start(m, &l);
+ return t_mod_start(m, &l);
return ret;
}
@@ -3264,7 +3418,7 @@ static void reset_iter_read(struct ftrace_iterator *iter)
{
iter->pos = 0;
iter->func_pos = 0;
- iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_PROBE);
+ iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_PROBE | FTRACE_ITER_MOD);
}
static void *t_start(struct seq_file *m, loff_t *pos)
@@ -3293,15 +3447,15 @@ static void *t_start(struct seq_file *m, loff_t *pos)
ftrace_hash_empty(iter->hash)) {
iter->func_pos = 1; /* Account for the message */
if (*pos > 0)
- return t_probe_start(m, pos);
+ return t_mod_start(m, pos);
iter->flags |= FTRACE_ITER_PRINTALL;
/* reset in case of seek/pread */
iter->flags &= ~FTRACE_ITER_PROBE;
return iter;
}
- if (iter->flags & FTRACE_ITER_PROBE)
- return t_probe_start(m, pos);
+ if (iter->flags & FTRACE_ITER_MOD)
+ return t_mod_start(m, pos);
/*
* Unfortunately, we need to restart at ftrace_pages_start
@@ -3317,7 +3471,7 @@ static void *t_start(struct seq_file *m, loff_t *pos)
}
if (!p)
- return t_probe_start(m, pos);
+ return t_mod_start(m, pos);
return iter;
}
@@ -3351,6 +3505,9 @@ static int t_show(struct seq_file *m, void *v)
if (iter->flags & FTRACE_ITER_PROBE)
return t_probe_show(m, iter);
+ if (iter->flags & FTRACE_ITER_MOD)
+ return t_mod_show(m, iter);
+
if (iter->flags & FTRACE_ITER_PRINTALL) {
if (iter->flags & FTRACE_ITER_NOTRACE)
seq_puts(m, "#### no functions disabled ####\n");
@@ -3457,6 +3614,8 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
{
struct ftrace_iterator *iter;
struct ftrace_hash *hash;
+ struct list_head *mod_head;
+ struct trace_array *tr = ops->private;
int ret = 0;
ftrace_ops_init(ops);
@@ -3475,21 +3634,29 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
iter->ops = ops;
iter->flags = flag;
+ iter->tr = tr;
mutex_lock(&ops->func_hash->regex_lock);
- if (flag & FTRACE_ITER_NOTRACE)
+ if (flag & FTRACE_ITER_NOTRACE) {
hash = ops->func_hash->notrace_hash;
- else
+ mod_head = tr ? &tr->mod_notrace : NULL;
+ } else {
hash = ops->func_hash->filter_hash;
+ mod_head = tr ? &tr->mod_trace : NULL;
+ }
+
+ iter->mod_list = mod_head;
if (file->f_mode & FMODE_WRITE) {
const int size_bits = FTRACE_HASH_DEFAULT_BITS;
- if (file->f_flags & O_TRUNC)
+ if (file->f_flags & O_TRUNC) {
iter->hash = alloc_ftrace_hash(size_bits);
- else
+ clear_ftrace_mod_list(mod_head);
+ } else {
iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash);
+ }
if (!iter->hash) {
trace_parser_put(&iter->parser);
@@ -3665,7 +3832,7 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod)
int exclude_mod = 0;
int found = 0;
int ret;
- int clear_filter;
+ int clear_filter = 0;
if (func) {
func_g.type = filter_parse_regex(func, len, &func_g.search,
@@ -3761,6 +3928,165 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops,
return ret;
}
+static bool module_exists(const char *module)
+{
+ /* All modules have the symbol __this_module */
+ const char this_mod[] = "__this_module";
+ const int modname_size = MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 1;
+ char modname[modname_size + 1];
+ unsigned long val;
+ int n;
+
+ n = snprintf(modname, modname_size + 1, "%s:%s", module, this_mod);
+
+ if (n > modname_size)
+ return false;
+
+ val = module_kallsyms_lookup_name(modname);
+ return val != 0;
+}
+
+static int cache_mod(struct trace_array *tr,
+ const char *func, char *module, int enable)
+{
+ struct ftrace_mod_load *ftrace_mod, *n;
+ struct list_head *head = enable ? &tr->mod_trace : &tr->mod_notrace;
+ int ret;
+
+ mutex_lock(&ftrace_lock);
+
+ /* We do not cache inverse filters */
+ if (func[0] == '!') {
+ func++;
+ ret = -EINVAL;
+
+ /* Look to remove this hash */
+ list_for_each_entry_safe(ftrace_mod, n, head, list) {
+ if (strcmp(ftrace_mod->module, module) != 0)
+ continue;
+
+ /* no func matches all */
+ if (strcmp(func, "*") == 0 ||
+ (ftrace_mod->func &&
+ strcmp(ftrace_mod->func, func) == 0)) {
+ ret = 0;
+ free_ftrace_mod(ftrace_mod);
+ continue;
+ }
+ }
+ goto out;
+ }
+
+ ret = -EINVAL;
+ /* We only care about modules that have not been loaded yet */
+ if (module_exists(module))
+ goto out;
+
+ /* Save this string off, and execute it when the module is loaded */
+ ret = ftrace_add_mod(tr, func, module, enable);
+ out:
+ mutex_unlock(&ftrace_lock);
+
+ return ret;
+}
+
+static int
+ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
+ int reset, int enable);
+
+#ifdef CONFIG_MODULES
+static void process_mod_list(struct list_head *head, struct ftrace_ops *ops,
+ char *mod, bool enable)
+{
+ struct ftrace_mod_load *ftrace_mod, *n;
+ struct ftrace_hash **orig_hash, *new_hash;
+ LIST_HEAD(process_mods);
+ char *func;
+ int ret;
+
+ mutex_lock(&ops->func_hash->regex_lock);
+
+ if (enable)
+ orig_hash = &ops->func_hash->filter_hash;
+ else
+ orig_hash = &ops->func_hash->notrace_hash;
+
+ new_hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS,
+ *orig_hash);
+ if (!new_hash)
+ goto out; /* warn? */
+
+ mutex_lock(&ftrace_lock);
+
+ list_for_each_entry_safe(ftrace_mod, n, head, list) {
+
+ if (strcmp(ftrace_mod->module, mod) != 0)
+ continue;
+
+ if (ftrace_mod->func)
+ func = kstrdup(ftrace_mod->func, GFP_KERNEL);
+ else
+ func = kstrdup("*", GFP_KERNEL);
+
+ if (!func) /* warn? */
+ continue;
+
+ list_del(&ftrace_mod->list);
+ list_add(&ftrace_mod->list, &process_mods);
+
+ /* Use the newly allocated func, as it may be "*" */
+ kfree(ftrace_mod->func);
+ ftrace_mod->func = func;
+ }
+
+ mutex_unlock(&ftrace_lock);
+
+ list_for_each_entry_safe(ftrace_mod, n, &process_mods, list) {
+
+ func = ftrace_mod->func;
+
+ /* Grabs ftrace_lock, which is why we have this extra step */
+ match_records(new_hash, func, strlen(func), mod);
+ free_ftrace_mod(ftrace_mod);
+ }
+
+ if (enable && list_empty(head))
+ new_hash->flags &= ~FTRACE_HASH_FL_MOD;
+
+ mutex_lock(&ftrace_lock);
+
+ ret = ftrace_hash_move_and_update_ops(ops, orig_hash,
+ new_hash, enable);
+ mutex_unlock(&ftrace_lock);
+
+ out:
+ mutex_unlock(&ops->func_hash->regex_lock);
+
+ free_ftrace_hash(new_hash);
+}
+
+static void process_cached_mods(const char *mod_name)
+{
+ struct trace_array *tr;
+ char *mod;
+
+ mod = kstrdup(mod_name, GFP_KERNEL);
+ if (!mod)
+ return;
+
+ mutex_lock(&trace_types_lock);
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ if (!list_empty(&tr->mod_trace))
+ process_mod_list(&tr->mod_trace, tr->ops, mod, true);
+ if (!list_empty(&tr->mod_notrace))
+ process_mod_list(&tr->mod_notrace, tr->ops, mod, false);
+ }
+ mutex_unlock(&trace_types_lock);
+
+ kfree(mod);
+}
+#endif
+
/*
* We register the module command as a template to show others how
* to register the a command as well.
@@ -3768,10 +4094,16 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops,
static int
ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash,
- char *func, char *cmd, char *module, int enable)
+ char *func_orig, char *cmd, char *module, int enable)
{
+ char *func;
int ret;
+ /* match_records() modifies func, and we need the original */
+ func = kstrdup(func_orig, GFP_KERNEL);
+ if (!func)
+ return -ENOMEM;
+
/*
* cmd == 'mod' because we only registered this func
* for the 'mod' ftrace_func_command.
@@ -3780,8 +4112,10 @@ ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash,
* parameter.
*/
ret = match_records(hash, func, strlen(func), module);
+ kfree(func);
+
if (!ret)
- return -EINVAL;
+ return cache_mod(tr, func_orig, module, enable);
if (ret < 0)
return ret;
return 0;
@@ -4725,9 +5059,11 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
if (file->f_mode & FMODE_WRITE) {
filter_hash = !!(iter->flags & FTRACE_ITER_FILTER);
- if (filter_hash)
+ if (filter_hash) {
orig_hash = &iter->ops->func_hash->filter_hash;
- else
+ if (iter->tr && !list_empty(&iter->tr->mod_trace))
+ iter->hash->flags |= FTRACE_HASH_FL_MOD;
+ } else
orig_hash = &iter->ops->func_hash->notrace_hash;
mutex_lock(&ftrace_lock);
@@ -5385,6 +5721,7 @@ void ftrace_release_mod(struct module *mod)
if (pg == ftrace_pages)
ftrace_pages = next_to_ftrace_page(last_pg);
+ ftrace_update_tot_cnt -= pg->index;
*last_pg = pg->next;
order = get_count_order(pg->size / ENTRIES_PER_PAGE);
free_pages((unsigned long)pg->records, order);
@@ -5463,6 +5800,8 @@ void ftrace_module_enable(struct module *mod)
out_unlock:
mutex_unlock(&ftrace_lock);
+
+ process_cached_mods(mod->name);
}
void ftrace_module_init(struct module *mod)
@@ -5501,6 +5840,7 @@ void __init ftrace_free_init_mem(void)
if (!rec)
continue;
pg->index--;
+ ftrace_update_tot_cnt--;
if (!pg->index) {
*last_pg = pg->next;
order = get_count_order(pg->size / ENTRIES_PER_PAGE);
@@ -5567,6 +5907,8 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops)
void ftrace_init_trace_array(struct trace_array *tr)
{
INIT_LIST_HEAD(&tr->func_probes);
+ INIT_LIST_HEAD(&tr->mod_trace);
+ INIT_LIST_HEAD(&tr->mod_notrace);
}
#else
@@ -6127,7 +6469,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
if (ftrace_enabled) {
/* we are starting ftrace again */
- if (ftrace_ops_list != &ftrace_list_end)
+ if (rcu_dereference_protected(ftrace_ops_list,
+ lockdep_is_held(&ftrace_lock)) != &ftrace_list_end)
update_ftrace_function();
ftrace_startup_sysctl();
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 4ae268e687fe..81279c6602ff 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1136,12 +1136,12 @@ static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu)
for (i = 0; i < nr_pages; i++) {
struct page *page;
/*
- * __GFP_NORETRY flag makes sure that the allocation fails
- * gracefully without invoking oom-killer and the system is
- * not destabilized.
+ * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails
+ * gracefully without invoking oom-killer and the system is not
+ * destabilized.
*/
bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
- GFP_KERNEL | __GFP_NORETRY,
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL,
cpu_to_node(cpu));
if (!bpage)
goto free_pages;
@@ -1149,7 +1149,7 @@ static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu)
list_add(&bpage->list, pages);
page = alloc_pages_node(cpu_to_node(cpu),
- GFP_KERNEL | __GFP_NORETRY, 0);
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL, 0);
if (!page)
goto free_pages;
bpage->page = page_address(page);
@@ -4386,15 +4386,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
* the page that was allocated, with the read page of the buffer.
*
* Returns:
- * The page allocated, or NULL on error.
+ * The page allocated, or ERR_PTR
*/
void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu)
{
- struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+ struct ring_buffer_per_cpu *cpu_buffer;
struct buffer_data_page *bpage = NULL;
unsigned long flags;
struct page *page;
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return ERR_PTR(-ENODEV);
+
+ cpu_buffer = buffer->buffers[cpu];
local_irq_save(flags);
arch_spin_lock(&cpu_buffer->lock);
@@ -4412,7 +4416,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu)
page = alloc_pages_node(cpu_to_node(cpu),
GFP_KERNEL | __GFP_NORETRY, 0);
if (!page)
- return NULL;
+ return ERR_PTR(-ENOMEM);
bpage = page_address(page);
@@ -4467,8 +4471,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
*
* for example:
* rpage = ring_buffer_alloc_read_page(buffer, cpu);
- * if (!rpage)
- * return error;
+ * if (IS_ERR(rpage))
+ * return PTR_ERR(rpage);
* ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0);
* if (ret >= 0)
* process_page(rpage, ret);
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 9fbcaf567886..68ee79afe31c 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -113,7 +113,7 @@ static enum event_status read_page(int cpu)
int i;
bpage = ring_buffer_alloc_read_page(buffer, cpu);
- if (!bpage)
+ if (IS_ERR(bpage))
return EVENT_DROPPED;
ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 091e801145c9..44004d8aa3b3 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -87,7 +87,7 @@ dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
* tracing is active, only save the comm when a trace event
* occurred.
*/
-static DEFINE_PER_CPU(bool, trace_cmdline_save);
+static DEFINE_PER_CPU(bool, trace_taskinfo_save);
/*
* Kill all tracing for good (never come back).
@@ -120,41 +120,41 @@ enum ftrace_dump_mode ftrace_dump_on_oops;
/* When set, tracing will stop when a WARN*() is hit */
int __disable_trace_on_warning;
-#ifdef CONFIG_TRACE_ENUM_MAP_FILE
-/* Map of enums to their values, for "enum_map" file */
-struct trace_enum_map_head {
+#ifdef CONFIG_TRACE_EVAL_MAP_FILE
+/* Map of enums to their values, for "eval_map" file */
+struct trace_eval_map_head {
struct module *mod;
unsigned long length;
};
-union trace_enum_map_item;
+union trace_eval_map_item;
-struct trace_enum_map_tail {
+struct trace_eval_map_tail {
/*
* "end" is first and points to NULL as it must be different
- * than "mod" or "enum_string"
+ * than "mod" or "eval_string"
*/
- union trace_enum_map_item *next;
+ union trace_eval_map_item *next;
const char *end; /* points to NULL */
};
-static DEFINE_MUTEX(trace_enum_mutex);
+static DEFINE_MUTEX(trace_eval_mutex);
/*
- * The trace_enum_maps are saved in an array with two extra elements,
+ * The trace_eval_maps are saved in an array with two extra elements,
* one at the beginning, and one at the end. The beginning item contains
* the count of the saved maps (head.length), and the module they
* belong to if not built in (head.mod). The ending item contains a
- * pointer to the next array of saved enum_map items.
+ * pointer to the next array of saved eval_map items.
*/
-union trace_enum_map_item {
- struct trace_enum_map map;
- struct trace_enum_map_head head;
- struct trace_enum_map_tail tail;
+union trace_eval_map_item {
+ struct trace_eval_map map;
+ struct trace_eval_map_head head;
+ struct trace_eval_map_tail tail;
};
-static union trace_enum_map_item *trace_enum_maps;
-#endif /* CONFIG_TRACE_ENUM_MAP_FILE */
+static union trace_eval_map_item *trace_eval_maps;
+#endif /* CONFIG_TRACE_EVAL_MAP_FILE */
static int tracing_set_tracer(struct trace_array *tr, const char *buf);
@@ -790,7 +790,7 @@ EXPORT_SYMBOL_GPL(tracing_on);
static __always_inline void
__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)
{
- __this_cpu_write(trace_cmdline_save, true);
+ __this_cpu_write(trace_taskinfo_save, true);
/* If this is the temp buffer, we need to commit fully */
if (this_cpu_read(trace_buffered_event) == event) {
@@ -1141,9 +1141,9 @@ unsigned long nsecs_to_usecs(unsigned long nsecs)
/*
* TRACE_FLAGS is defined as a tuple matching bit masks with strings.
- * It uses C(a, b) where 'a' is the enum name and 'b' is the string that
+ * It uses C(a, b) where 'a' is the eval (enum) name and 'b' is the string that
* matches it. By defining "C(a, b) b", TRACE_FLAGS becomes a list
- * of strings in the order that the enums were defined.
+ * of strings in the order that the evals (enum) were defined.
*/
#undef C
#define C(a, b) b
@@ -1709,6 +1709,8 @@ void tracing_reset_all_online_cpus(void)
}
}
+static int *tgid_map;
+
#define SAVED_CMDLINES_DEFAULT 128
#define NO_CMDLINE_MAP UINT_MAX
static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
@@ -1722,7 +1724,7 @@ struct saved_cmdlines_buffer {
static struct saved_cmdlines_buffer *savedcmd;
/* temporary disable recording */
-static atomic_t trace_record_cmdline_disabled __read_mostly;
+static atomic_t trace_record_taskinfo_disabled __read_mostly;
static inline char *get_saved_cmdlines(int idx)
{
@@ -1910,13 +1912,15 @@ static void tracing_stop_tr(struct trace_array *tr)
raw_spin_unlock_irqrestore(&tr->start_lock, flags);
}
-void trace_stop_cmdline_recording(void);
-
static int trace_save_cmdline(struct task_struct *tsk)
{
unsigned pid, idx;
- if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT))
+ /* treat recording of idle task as a success */
+ if (!tsk->pid)
+ return 1;
+
+ if (unlikely(tsk->pid > PID_MAX_DEFAULT))
return 0;
/*
@@ -1992,16 +1996,107 @@ void trace_find_cmdline(int pid, char comm[])
preempt_enable();
}
-void tracing_record_cmdline(struct task_struct *tsk)
+int trace_find_tgid(int pid)
+{
+ if (unlikely(!tgid_map || !pid || pid > PID_MAX_DEFAULT))
+ return 0;
+
+ return tgid_map[pid];
+}
+
+static int trace_save_tgid(struct task_struct *tsk)
{
- if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on())
+ /* treat recording of idle task as a success */
+ if (!tsk->pid)
+ return 1;
+
+ if (unlikely(!tgid_map || tsk->pid > PID_MAX_DEFAULT))
+ return 0;
+
+ tgid_map[tsk->pid] = tsk->tgid;
+ return 1;
+}
+
+static bool tracing_record_taskinfo_skip(int flags)
+{
+ if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID))))
+ return true;
+ if (atomic_read(&trace_record_taskinfo_disabled) || !tracing_is_on())
+ return true;
+ if (!__this_cpu_read(trace_taskinfo_save))
+ return true;
+ return false;
+}
+
+/**
+ * tracing_record_taskinfo - record the task info of a task
+ *
+ * @task - task to record
+ * @flags - TRACE_RECORD_CMDLINE for recording comm
+ * - TRACE_RECORD_TGID for recording tgid
+ */
+void tracing_record_taskinfo(struct task_struct *task, int flags)
+{
+ bool done;
+
+ if (tracing_record_taskinfo_skip(flags))
return;
- if (!__this_cpu_read(trace_cmdline_save))
+ /*
+ * Record as much task information as possible. If some fail, continue
+ * to try to record the others.
+ */
+ done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(task);
+ done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(task);
+
+ /* If recording any information failed, retry again soon. */
+ if (!done)
return;
- if (trace_save_cmdline(tsk))
- __this_cpu_write(trace_cmdline_save, false);
+ __this_cpu_write(trace_taskinfo_save, false);
+}
+
+/**
+ * tracing_record_taskinfo_sched_switch - record task info for sched_switch
+ *
+ * @prev - previous task during sched_switch
+ * @next - next task during sched_switch
+ * @flags - TRACE_RECORD_CMDLINE for recording comm
+ * TRACE_RECORD_TGID for recording tgid
+ */
+void tracing_record_taskinfo_sched_switch(struct task_struct *prev,
+ struct task_struct *next, int flags)
+{
+ bool done;
+
+ if (tracing_record_taskinfo_skip(flags))
+ return;
+
+ /*
+ * Record as much task information as possible. If some fail, continue
+ * to try to record the others.
+ */
+ done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(prev);
+ done &= !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(next);
+ done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(prev);
+ done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(next);
+
+ /* If recording any information failed, retry again soon. */
+ if (!done)
+ return;
+
+ __this_cpu_write(trace_taskinfo_save, false);
+}
+
+/* Helpers to record a specific task information */
+void tracing_record_cmdline(struct task_struct *task)
+{
+ tracing_record_taskinfo(task, TRACE_RECORD_CMDLINE);
+}
+
+void tracing_record_tgid(struct task_struct *task)
+{
+ tracing_record_taskinfo(task, TRACE_RECORD_TGID);
}
/*
@@ -3146,7 +3241,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
#endif
if (!iter->snapshot)
- atomic_inc(&trace_record_cmdline_disabled);
+ atomic_inc(&trace_record_taskinfo_disabled);
if (*pos != iter->pos) {
iter->ent = NULL;
@@ -3191,7 +3286,7 @@ static void s_stop(struct seq_file *m, void *p)
#endif
if (!iter->snapshot)
- atomic_dec(&trace_record_cmdline_disabled);
+ atomic_dec(&trace_record_taskinfo_disabled);
trace_access_unlock(iter->cpu_file);
trace_event_read_unlock();
@@ -3248,23 +3343,38 @@ static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
seq_puts(m, "#\n");
}
-static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m)
+static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m,
+ unsigned int flags)
{
+ bool tgid = flags & TRACE_ITER_RECORD_TGID;
+
print_event_info(buf, m);
- seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"
- "# | | | | |\n");
+
+ seq_printf(m, "# TASK-PID CPU# %s TIMESTAMP FUNCTION\n", tgid ? "TGID " : "");
+ seq_printf(m, "# | | | %s | |\n", tgid ? " | " : "");
}
-static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m)
+static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m,
+ unsigned int flags)
{
- print_event_info(buf, m);
- seq_puts(m, "# _-----=> irqs-off\n"
- "# / _----=> need-resched\n"
- "# | / _---=> hardirq/softirq\n"
- "# || / _--=> preempt-depth\n"
- "# ||| / delay\n"
- "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"
- "# | | | |||| | |\n");
+ bool tgid = flags & TRACE_ITER_RECORD_TGID;
+ const char tgid_space[] = " ";
+ const char space[] = " ";
+
+ seq_printf(m, "# %s _-----=> irqs-off\n",
+ tgid ? tgid_space : space);
+ seq_printf(m, "# %s / _----=> need-resched\n",
+ tgid ? tgid_space : space);
+ seq_printf(m, "# %s| / _---=> hardirq/softirq\n",
+ tgid ? tgid_space : space);
+ seq_printf(m, "# %s|| / _--=> preempt-depth\n",
+ tgid ? tgid_space : space);
+ seq_printf(m, "# %s||| / delay\n",
+ tgid ? tgid_space : space);
+ seq_printf(m, "# TASK-PID CPU#%s|||| TIMESTAMP FUNCTION\n",
+ tgid ? " TGID " : space);
+ seq_printf(m, "# | | | %s|||| | |\n",
+ tgid ? " | " : space);
}
void
@@ -3580,9 +3690,11 @@ void trace_default_header(struct seq_file *m)
} else {
if (!(trace_flags & TRACE_ITER_VERBOSE)) {
if (trace_flags & TRACE_ITER_IRQ_INFO)
- print_func_help_header_irq(iter->trace_buffer, m);
+ print_func_help_header_irq(iter->trace_buffer,
+ m, trace_flags);
else
- print_func_help_header(iter->trace_buffer, m);
+ print_func_help_header(iter->trace_buffer, m,
+ trace_flags);
}
}
}
@@ -4238,6 +4350,18 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
if (mask == TRACE_ITER_RECORD_CMD)
trace_event_enable_cmd_record(enabled);
+ if (mask == TRACE_ITER_RECORD_TGID) {
+ if (!tgid_map)
+ tgid_map = kzalloc((PID_MAX_DEFAULT + 1) * sizeof(*tgid_map),
+ GFP_KERNEL);
+ if (!tgid_map) {
+ tr->trace_flags &= ~TRACE_ITER_RECORD_TGID;
+ return -ENOMEM;
+ }
+
+ trace_event_enable_tgid_record(enabled);
+ }
+
if (mask == TRACE_ITER_EVENT_FORK)
trace_event_follow_fork(tr, enabled);
@@ -4473,7 +4597,8 @@ static const char readme_msg[] =
#endif
#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
"\t accepts: event-definitions (one definition per line)\n"
- "\t Format: p|r[:[<group>/]<event>] <place> [<args>]\n"
+ "\t Format: p[:[<group>/]<event>] <place> [<args>]\n"
+ "\t r[maxactive][:[<group>/]<event>] <place> [<args>]\n"
"\t -:[<group>/]<event>\n"
#ifdef CONFIG_KPROBE_EVENTS
"\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n"
@@ -4597,6 +4722,76 @@ static const struct file_operations tracing_readme_fops = {
.llseek = generic_file_llseek,
};
+static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ int *ptr = v;
+
+ if (*pos || m->count)
+ ptr++;
+
+ (*pos)++;
+
+ for (; ptr <= &tgid_map[PID_MAX_DEFAULT]; ptr++) {
+ if (trace_find_tgid(*ptr))
+ return ptr;
+ }
+
+ return NULL;
+}
+
+static void *saved_tgids_start(struct seq_file *m, loff_t *pos)
+{
+ void *v;
+ loff_t l = 0;
+
+ if (!tgid_map)
+ return NULL;
+
+ v = &tgid_map[0];
+ while (l <= *pos) {
+ v = saved_tgids_next(m, v, &l);
+ if (!v)
+ return NULL;
+ }
+
+ return v;
+}
+
+static void saved_tgids_stop(struct seq_file *m, void *v)
+{
+}
+
+static int saved_tgids_show(struct seq_file *m, void *v)
+{
+ int pid = (int *)v - tgid_map;
+
+ seq_printf(m, "%d %d\n", pid, trace_find_tgid(pid));
+ return 0;
+}
+
+static const struct seq_operations tracing_saved_tgids_seq_ops = {
+ .start = saved_tgids_start,
+ .stop = saved_tgids_stop,
+ .next = saved_tgids_next,
+ .show = saved_tgids_show,
+};
+
+static int tracing_saved_tgids_open(struct inode *inode, struct file *filp)
+{
+ if (tracing_disabled)
+ return -ENODEV;
+
+ return seq_open(filp, &tracing_saved_tgids_seq_ops);
+}
+
+
+static const struct file_operations tracing_saved_tgids_fops = {
+ .open = tracing_saved_tgids_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos)
{
unsigned int *ptr = v;
@@ -4746,11 +4941,11 @@ static const struct file_operations tracing_saved_cmdlines_size_fops = {
.write = tracing_saved_cmdlines_size_write,
};
-#ifdef CONFIG_TRACE_ENUM_MAP_FILE
-static union trace_enum_map_item *
-update_enum_map(union trace_enum_map_item *ptr)
+#ifdef CONFIG_TRACE_EVAL_MAP_FILE
+static union trace_eval_map_item *
+update_eval_map(union trace_eval_map_item *ptr)
{
- if (!ptr->map.enum_string) {
+ if (!ptr->map.eval_string) {
if (ptr->tail.next) {
ptr = ptr->tail.next;
/* Set ptr to the next real item (skip head) */
@@ -4761,15 +4956,15 @@ update_enum_map(union trace_enum_map_item *ptr)
return ptr;
}
-static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos)
+static void *eval_map_next(struct seq_file *m, void *v, loff_t *pos)
{
- union trace_enum_map_item *ptr = v;
+ union trace_eval_map_item *ptr = v;
/*
* Paranoid! If ptr points to end, we don't want to increment past it.
* This really should never happen.
*/
- ptr = update_enum_map(ptr);
+ ptr = update_eval_map(ptr);
if (WARN_ON_ONCE(!ptr))
return NULL;
@@ -4777,104 +4972,104 @@ static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos)
(*pos)++;
- ptr = update_enum_map(ptr);
+ ptr = update_eval_map(ptr);
return ptr;
}
-static void *enum_map_start(struct seq_file *m, loff_t *pos)
+static void *eval_map_start(struct seq_file *m, loff_t *pos)
{
- union trace_enum_map_item *v;
+ union trace_eval_map_item *v;
loff_t l = 0;
- mutex_lock(&trace_enum_mutex);
+ mutex_lock(&trace_eval_mutex);
- v = trace_enum_maps;
+ v = trace_eval_maps;
if (v)
v++;
while (v && l < *pos) {
- v = enum_map_next(m, v, &l);
+ v = eval_map_next(m, v, &l);
}
return v;
}
-static void enum_map_stop(struct seq_file *m, void *v)
+static void eval_map_stop(struct seq_file *m, void *v)
{
- mutex_unlock(&trace_enum_mutex);
+ mutex_unlock(&trace_eval_mutex);
}
-static int enum_map_show(struct seq_file *m, void *v)
+static int eval_map_show(struct seq_file *m, void *v)
{
- union trace_enum_map_item *ptr = v;
+ union trace_eval_map_item *ptr = v;
seq_printf(m, "%s %ld (%s)\n",
- ptr->map.enum_string, ptr->map.enum_value,
+ ptr->map.eval_string, ptr->map.eval_value,
ptr->map.system);
return 0;
}
-static const struct seq_operations tracing_enum_map_seq_ops = {
- .start = enum_map_start,
- .next = enum_map_next,
- .stop = enum_map_stop,
- .show = enum_map_show,
+static const struct seq_operations tracing_eval_map_seq_ops = {
+ .start = eval_map_start,
+ .next = eval_map_next,
+ .stop = eval_map_stop,
+ .show = eval_map_show,
};
-static int tracing_enum_map_open(struct inode *inode, struct file *filp)
+static int tracing_eval_map_open(struct inode *inode, struct file *filp)
{
if (tracing_disabled)
return -ENODEV;
- return seq_open(filp, &tracing_enum_map_seq_ops);
+ return seq_open(filp, &tracing_eval_map_seq_ops);
}
-static const struct file_operations tracing_enum_map_fops = {
- .open = tracing_enum_map_open,
+static const struct file_operations tracing_eval_map_fops = {
+ .open = tracing_eval_map_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
};
-static inline union trace_enum_map_item *
-trace_enum_jmp_to_tail(union trace_enum_map_item *ptr)
+static inline union trace_eval_map_item *
+trace_eval_jmp_to_tail(union trace_eval_map_item *ptr)
{
/* Return tail of array given the head */
return ptr + ptr->head.length + 1;
}
static void
-trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start,
+trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start,
int len)
{
- struct trace_enum_map **stop;
- struct trace_enum_map **map;
- union trace_enum_map_item *map_array;
- union trace_enum_map_item *ptr;
+ struct trace_eval_map **stop;
+ struct trace_eval_map **map;
+ union trace_eval_map_item *map_array;
+ union trace_eval_map_item *ptr;
stop = start + len;
/*
- * The trace_enum_maps contains the map plus a head and tail item,
+ * The trace_eval_maps contains the map plus a head and tail item,
* where the head holds the module and length of array, and the
* tail holds a pointer to the next list.
*/
map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL);
if (!map_array) {
- pr_warn("Unable to allocate trace enum mapping\n");
+ pr_warn("Unable to allocate trace eval mapping\n");
return;
}
- mutex_lock(&trace_enum_mutex);
+ mutex_lock(&trace_eval_mutex);
- if (!trace_enum_maps)
- trace_enum_maps = map_array;
+ if (!trace_eval_maps)
+ trace_eval_maps = map_array;
else {
- ptr = trace_enum_maps;
+ ptr = trace_eval_maps;
for (;;) {
- ptr = trace_enum_jmp_to_tail(ptr);
+ ptr = trace_eval_jmp_to_tail(ptr);
if (!ptr->tail.next)
break;
ptr = ptr->tail.next;
@@ -4892,34 +5087,34 @@ trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start,
}
memset(map_array, 0, sizeof(*map_array));
- mutex_unlock(&trace_enum_mutex);
+ mutex_unlock(&trace_eval_mutex);
}
-static void trace_create_enum_file(struct dentry *d_tracer)
+static void trace_create_eval_file(struct dentry *d_tracer)
{
- trace_create_file("enum_map", 0444, d_tracer,
- NULL, &tracing_enum_map_fops);
+ trace_create_file("eval_map", 0444, d_tracer,
+ NULL, &tracing_eval_map_fops);
}
-#else /* CONFIG_TRACE_ENUM_MAP_FILE */
-static inline void trace_create_enum_file(struct dentry *d_tracer) { }
-static inline void trace_insert_enum_map_file(struct module *mod,
- struct trace_enum_map **start, int len) { }
-#endif /* !CONFIG_TRACE_ENUM_MAP_FILE */
+#else /* CONFIG_TRACE_EVAL_MAP_FILE */
+static inline void trace_create_eval_file(struct dentry *d_tracer) { }
+static inline void trace_insert_eval_map_file(struct module *mod,
+ struct trace_eval_map **start, int len) { }
+#endif /* !CONFIG_TRACE_EVAL_MAP_FILE */
-static void trace_insert_enum_map(struct module *mod,
- struct trace_enum_map **start, int len)
+static void trace_insert_eval_map(struct module *mod,
+ struct trace_eval_map **start, int len)
{
- struct trace_enum_map **map;
+ struct trace_eval_map **map;
if (len <= 0)
return;
map = start;
- trace_event_enum_update(map, len);
+ trace_event_eval_update(map, len);
- trace_insert_enum_map_file(mod, start, len);
+ trace_insert_eval_map_file(mod, start, len);
}
static ssize_t
@@ -6403,7 +6598,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
{
struct ftrace_buffer_info *info = filp->private_data;
struct trace_iterator *iter = &info->iter;
- ssize_t ret;
+ ssize_t ret = 0;
ssize_t size;
if (!count)
@@ -6417,10 +6612,15 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
if (!info->spare) {
info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer,
iter->cpu_file);
- info->spare_cpu = iter->cpu_file;
+ if (IS_ERR(info->spare)) {
+ ret = PTR_ERR(info->spare);
+ info->spare = NULL;
+ } else {
+ info->spare_cpu = iter->cpu_file;
+ }
}
if (!info->spare)
- return -ENOMEM;
+ return ret;
/* Do we have previous read data to read? */
if (info->read < PAGE_SIZE)
@@ -6595,8 +6795,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
ref->ref = 1;
ref->buffer = iter->trace_buffer->buffer;
ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);
- if (!ref->page) {
- ret = -ENOMEM;
+ if (IS_ERR(ref->page)) {
+ ret = PTR_ERR(ref->page);
+ ref->page = NULL;
kfree(ref);
break;
}
@@ -6739,33 +6940,18 @@ static const struct file_operations tracing_stats_fops = {
#ifdef CONFIG_DYNAMIC_FTRACE
-int __weak ftrace_arch_read_dyn_info(char *buf, int size)
-{
- return 0;
-}
-
static ssize_t
tracing_read_dyn_info(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- static char ftrace_dyn_info_buffer[1024];
- static DEFINE_MUTEX(dyn_info_mutex);
unsigned long *p = filp->private_data;
- char *buf = ftrace_dyn_info_buffer;
- int size = ARRAY_SIZE(ftrace_dyn_info_buffer);
+ char buf[64]; /* Not too big for a shallow stack */
int r;
- mutex_lock(&dyn_info_mutex);
- r = sprintf(buf, "%ld ", *p);
-
- r += ftrace_arch_read_dyn_info(buf+r, (size-1)-r);
+ r = scnprintf(buf, 63, "%ld", *p);
buf[r++] = '\n';
- r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-
- mutex_unlock(&dyn_info_mutex);
-
- return r;
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}
static const struct file_operations tracing_dyn_info_fops = {
@@ -7594,6 +7780,7 @@ static int instance_rmdir(const char *name)
}
kfree(tr->topts);
+ free_cpumask_var(tr->tracing_cpumask);
kfree(tr->name);
kfree(tr);
@@ -7737,21 +7924,21 @@ struct dentry *tracing_init_dentry(void)
return NULL;
}
-extern struct trace_enum_map *__start_ftrace_enum_maps[];
-extern struct trace_enum_map *__stop_ftrace_enum_maps[];
+extern struct trace_eval_map *__start_ftrace_eval_maps[];
+extern struct trace_eval_map *__stop_ftrace_eval_maps[];
-static void __init trace_enum_init(void)
+static void __init trace_eval_init(void)
{
int len;
- len = __stop_ftrace_enum_maps - __start_ftrace_enum_maps;
- trace_insert_enum_map(NULL, __start_ftrace_enum_maps, len);
+ len = __stop_ftrace_eval_maps - __start_ftrace_eval_maps;
+ trace_insert_eval_map(NULL, __start_ftrace_eval_maps, len);
}
#ifdef CONFIG_MODULES
-static void trace_module_add_enums(struct module *mod)
+static void trace_module_add_evals(struct module *mod)
{
- if (!mod->num_trace_enums)
+ if (!mod->num_trace_evals)
return;
/*
@@ -7761,40 +7948,40 @@ static void trace_module_add_enums(struct module *mod)
if (trace_module_has_bad_taint(mod))
return;
- trace_insert_enum_map(mod, mod->trace_enums, mod->num_trace_enums);
+ trace_insert_eval_map(mod, mod->trace_evals, mod->num_trace_evals);
}
-#ifdef CONFIG_TRACE_ENUM_MAP_FILE
-static void trace_module_remove_enums(struct module *mod)
+#ifdef CONFIG_TRACE_EVAL_MAP_FILE
+static void trace_module_remove_evals(struct module *mod)
{
- union trace_enum_map_item *map;
- union trace_enum_map_item **last = &trace_enum_maps;
+ union trace_eval_map_item *map;
+ union trace_eval_map_item **last = &trace_eval_maps;
- if (!mod->num_trace_enums)
+ if (!mod->num_trace_evals)
return;
- mutex_lock(&trace_enum_mutex);
+ mutex_lock(&trace_eval_mutex);
- map = trace_enum_maps;
+ map = trace_eval_maps;
while (map) {
if (map->head.mod == mod)
break;
- map = trace_enum_jmp_to_tail(map);
+ map = trace_eval_jmp_to_tail(map);
last = &map->tail.next;
map = map->tail.next;
}
if (!map)
goto out;
- *last = trace_enum_jmp_to_tail(map)->tail.next;
+ *last = trace_eval_jmp_to_tail(map)->tail.next;
kfree(map);
out:
- mutex_unlock(&trace_enum_mutex);
+ mutex_unlock(&trace_eval_mutex);
}
#else
-static inline void trace_module_remove_enums(struct module *mod) { }
-#endif /* CONFIG_TRACE_ENUM_MAP_FILE */
+static inline void trace_module_remove_evals(struct module *mod) { }
+#endif /* CONFIG_TRACE_EVAL_MAP_FILE */
static int trace_module_notify(struct notifier_block *self,
unsigned long val, void *data)
@@ -7803,10 +7990,10 @@ static int trace_module_notify(struct notifier_block *self,
switch (val) {
case MODULE_STATE_COMING:
- trace_module_add_enums(mod);
+ trace_module_add_evals(mod);
break;
case MODULE_STATE_GOING:
- trace_module_remove_enums(mod);
+ trace_module_remove_evals(mod);
break;
}
@@ -7844,9 +8031,12 @@ static __init int tracer_init_tracefs(void)
trace_create_file("saved_cmdlines_size", 0644, d_tracer,
NULL, &tracing_saved_cmdlines_size_fops);
- trace_enum_init();
+ trace_create_file("saved_tgids", 0444, d_tracer,
+ NULL, &tracing_saved_tgids_fops);
+
+ trace_eval_init();
- trace_create_enum_file(d_tracer);
+ trace_create_eval_file(d_tracer);
#ifdef CONFIG_MODULES
register_module_notifier(&trace_module_nb);
@@ -8109,6 +8299,7 @@ __init static int tracer_alloc_buffers(void)
if (ret < 0)
goto out_free_cpumask;
/* Used for event triggers */
+ ret = -ENOMEM;
temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE);
if (!temp_buffer)
goto out_rm_hp_state;
@@ -8223,4 +8414,4 @@ __init static int clear_boot_tracer(void)
}
fs_initcall(tracer_init_tracefs);
-late_initcall(clear_boot_tracer);
+late_initcall_sync(clear_boot_tracer);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 39fd77330aab..490ba229931d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -263,7 +263,10 @@ struct trace_array {
struct ftrace_ops *ops;
struct trace_pid_list __rcu *function_pids;
#ifdef CONFIG_DYNAMIC_FTRACE
+ /* All of these are protected by the ftrace_lock */
struct list_head func_probes;
+ struct list_head mod_trace;
+ struct list_head mod_notrace;
#endif
/* function tracing enabled */
int function_enabled;
@@ -637,6 +640,9 @@ void set_graph_array(struct trace_array *tr);
void tracing_start_cmdline_record(void);
void tracing_stop_cmdline_record(void);
+void tracing_start_tgid_record(void);
+void tracing_stop_tgid_record(void);
+
int register_tracer(struct tracer *type);
int is_tracing_stopped(void);
@@ -697,6 +703,7 @@ static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
extern u64 ftrace_now(int cpu);
extern void trace_find_cmdline(int pid, char comm[]);
+extern int trace_find_tgid(int pid);
extern void trace_event_follow_fork(struct trace_array *tr, bool enable);
#ifdef CONFIG_DYNAMIC_FTRACE
@@ -761,10 +768,24 @@ enum print_line_t print_trace_line(struct trace_iterator *iter);
extern char trace_find_mark(unsigned long long duration);
+struct ftrace_hash;
+
+struct ftrace_mod_load {
+ struct list_head list;
+ char *func;
+ char *module;
+ int enable;
+};
+
+enum {
+ FTRACE_HASH_FL_MOD = (1 << 0),
+};
+
struct ftrace_hash {
unsigned long size_bits;
struct hlist_head *buckets;
unsigned long count;
+ unsigned long flags;
struct rcu_head rcu;
};
@@ -773,7 +794,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip);
static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash)
{
- return !hash || !hash->count;
+ return !hash || !(hash->count || (hash->flags & FTRACE_HASH_FL_MOD));
}
/* Standard output formatting function used for function return traces */
@@ -1107,6 +1128,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
C(CONTEXT_INFO, "context-info"), /* Print pid/cpu/time */ \
C(LATENCY_FMT, "latency-format"), \
C(RECORD_CMD, "record-cmd"), \
+ C(RECORD_TGID, "record-tgid"), \
C(OVERWRITE, "overwrite"), \
C(STOP_ON_FREE, "disable_on_free"), \
C(IRQ_INFO, "irq-info"), \
@@ -1188,9 +1210,9 @@ struct ftrace_event_field {
struct event_filter {
int n_preds; /* Number assigned */
int a_preds; /* allocated */
- struct filter_pred *preds;
- struct filter_pred *root;
- char *filter_string;
+ struct filter_pred __rcu *preds;
+ struct filter_pred __rcu *root;
+ char *filter_string;
};
struct event_subsystem {
@@ -1423,6 +1445,8 @@ struct ftrace_event_field *
trace_find_event_field(struct trace_event_call *call, char *name);
extern void trace_event_enable_cmd_record(bool enable);
+extern void trace_event_enable_tgid_record(bool enable);
+
extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
extern int event_trace_del_tracer(struct trace_array *tr);
@@ -1773,10 +1797,10 @@ static inline const char *get_syscall_name(int syscall)
#ifdef CONFIG_EVENT_TRACING
void trace_event_init(void);
-void trace_event_enum_update(struct trace_enum_map **map, int len);
+void trace_event_eval_update(struct trace_eval_map **map, int len);
#else
static inline void __init trace_event_init(void) { }
-static inline void trace_event_enum_update(struct trace_enum_map **map, int len) { }
+static inline void trace_event_eval_update(struct trace_eval_map **map, int len) { }
#endif
extern struct trace_iterator *tracepoint_print_iter;
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 562fa69df5d3..13ba2d3f6a91 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -306,6 +306,7 @@ static void
perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ops, struct pt_regs *pt_regs)
{
+ struct perf_event *event;
struct ftrace_entry *entry;
struct hlist_head *head;
struct pt_regs regs;
@@ -329,8 +330,9 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
entry->ip = ip;
entry->parent_ip = parent_ip;
+ event = container_of(ops, struct perf_event, ftrace_ops);
perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
- 1, &regs, head, NULL);
+ 1, &regs, head, NULL, event);
#undef ENTRY_SIZE
}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index e7973e10398c..36132f9280e6 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -343,6 +343,28 @@ void trace_event_enable_cmd_record(bool enable)
mutex_unlock(&event_mutex);
}
+void trace_event_enable_tgid_record(bool enable)
+{
+ struct trace_event_file *file;
+ struct trace_array *tr;
+
+ mutex_lock(&event_mutex);
+ do_for_each_event_file(tr, file) {
+ if (!(file->flags & EVENT_FILE_FL_ENABLED))
+ continue;
+
+ if (enable) {
+ tracing_start_tgid_record();
+ set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags);
+ } else {
+ tracing_stop_tgid_record();
+ clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT,
+ &file->flags);
+ }
+ } while_for_each_event_file();
+ mutex_unlock(&event_mutex);
+}
+
static int __ftrace_event_enable_disable(struct trace_event_file *file,
int enable, int soft_disable)
{
@@ -381,6 +403,12 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
tracing_stop_cmdline_record();
clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
}
+
+ if (file->flags & EVENT_FILE_FL_RECORDED_TGID) {
+ tracing_stop_tgid_record();
+ clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
+ }
+
call->class->reg(call, TRACE_REG_UNREGISTER, file);
}
/* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */
@@ -407,18 +435,30 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
}
if (!(file->flags & EVENT_FILE_FL_ENABLED)) {
+ bool cmd = false, tgid = false;
/* Keep the event disabled, when going to SOFT_MODE. */
if (soft_disable)
set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
if (tr->trace_flags & TRACE_ITER_RECORD_CMD) {
+ cmd = true;
tracing_start_cmdline_record();
set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
}
+
+ if (tr->trace_flags & TRACE_ITER_RECORD_TGID) {
+ tgid = true;
+ tracing_start_tgid_record();
+ set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags);
+ }
+
ret = call->class->reg(call, TRACE_REG_REGISTER, file);
if (ret) {
- tracing_stop_cmdline_record();
+ if (cmd)
+ tracing_stop_cmdline_record();
+ if (tgid)
+ tracing_stop_tgid_record();
pr_info("event trace: Could not enable event "
"%s\n", trace_event_name(call));
break;
@@ -2067,18 +2107,18 @@ __register_event(struct trace_event_call *call, struct module *mod)
return 0;
}
-static char *enum_replace(char *ptr, struct trace_enum_map *map, int len)
+static char *eval_replace(char *ptr, struct trace_eval_map *map, int len)
{
int rlen;
int elen;
- /* Find the length of the enum value as a string */
- elen = snprintf(ptr, 0, "%ld", map->enum_value);
+ /* Find the length of the eval value as a string */
+ elen = snprintf(ptr, 0, "%ld", map->eval_value);
/* Make sure there's enough room to replace the string with the value */
if (len < elen)
return NULL;
- snprintf(ptr, elen + 1, "%ld", map->enum_value);
+ snprintf(ptr, elen + 1, "%ld", map->eval_value);
/* Get the rest of the string of ptr */
rlen = strlen(ptr + len);
@@ -2090,11 +2130,11 @@ static char *enum_replace(char *ptr, struct trace_enum_map *map, int len)
}
static void update_event_printk(struct trace_event_call *call,
- struct trace_enum_map *map)
+ struct trace_eval_map *map)
{
char *ptr;
int quote = 0;
- int len = strlen(map->enum_string);
+ int len = strlen(map->eval_string);
for (ptr = call->print_fmt; *ptr; ptr++) {
if (*ptr == '\\') {
@@ -2125,16 +2165,16 @@ static void update_event_printk(struct trace_event_call *call,
continue;
}
if (isalpha(*ptr) || *ptr == '_') {
- if (strncmp(map->enum_string, ptr, len) == 0 &&
+ if (strncmp(map->eval_string, ptr, len) == 0 &&
!isalnum(ptr[len]) && ptr[len] != '_') {
- ptr = enum_replace(ptr, map, len);
- /* Hmm, enum string smaller than value */
+ ptr = eval_replace(ptr, map, len);
+ /* enum/sizeof string smaller than value */
if (WARN_ON_ONCE(!ptr))
return;
/*
- * No need to decrement here, as enum_replace()
+ * No need to decrement here, as eval_replace()
* returns the pointer to the character passed
- * the enum, and two enums can not be placed
+ * the eval, and two evals can not be placed
* back to back without something in between.
* We can skip that something in between.
*/
@@ -2165,7 +2205,7 @@ static void update_event_printk(struct trace_event_call *call,
}
}
-void trace_event_enum_update(struct trace_enum_map **map, int len)
+void trace_event_eval_update(struct trace_eval_map **map, int len)
{
struct trace_event_call *call, *p;
const char *last_system = NULL;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 59a411ff60c7..181e139a8057 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1959,6 +1959,10 @@ static int create_filter(struct trace_event_call *call,
if (err && set_str)
append_filter_err(ps, filter);
}
+ if (err && !set_str) {
+ free_event_filter(filter);
+ filter = NULL;
+ }
create_filter_finish(ps);
*filterp = filter;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index b53c8d369163..8a907e12b6b9 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -598,6 +598,14 @@ static struct notifier_block trace_kprobe_module_nb = {
.priority = 1 /* Invoked after kprobe module callback */
};
+/* Convert certain expected symbols into '_' when generating event names */
+static inline void sanitize_event_name(char *name)
+{
+ while (*name++ != '\0')
+ if (*name == ':' || *name == '.')
+ *name = '_';
+}
+
static int create_trace_kprobe(int argc, char **argv)
{
/*
@@ -720,7 +728,7 @@ static int create_trace_kprobe(int argc, char **argv)
return ret;
}
if (offset && is_return &&
- !function_offset_within_entry(NULL, symbol, offset)) {
+ !kprobe_on_func_entry(NULL, symbol, offset)) {
pr_info("Given offset is not valid for return probe.\n");
return -EINVAL;
}
@@ -736,6 +744,7 @@ static int create_trace_kprobe(int argc, char **argv)
else
snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p",
is_return ? 'r' : 'p', addr);
+ sanitize_event_name(buf);
event = buf;
}
tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive,
@@ -1191,7 +1200,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
memset(&entry[1], 0, dsize);
store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
- head, NULL);
+ head, NULL, NULL);
}
NOKPROBE_SYMBOL(kprobe_perf_func);
@@ -1227,7 +1236,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
entry->ret_ip = (unsigned long)ri->ret_addr;
store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
- head, NULL);
+ head, NULL, NULL);
}
NOKPROBE_SYMBOL(kretprobe_perf_func);
#endif /* CONFIG_PERF_EVENTS */
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 08f9bab8089e..bac629af2285 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -340,31 +340,41 @@ static inline const char *kretprobed(const char *name)
static void
seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
{
-#ifdef CONFIG_KALLSYMS
char str[KSYM_SYMBOL_LEN];
+#ifdef CONFIG_KALLSYMS
const char *name;
kallsyms_lookup(address, NULL, NULL, NULL, str);
name = kretprobed(str);
- trace_seq_printf(s, fmt, name);
+ if (name && strlen(name)) {
+ trace_seq_printf(s, fmt, name);
+ return;
+ }
#endif
+ snprintf(str, KSYM_SYMBOL_LEN, "0x%08lx", address);
+ trace_seq_printf(s, fmt, str);
}
static void
seq_print_sym_offset(struct trace_seq *s, const char *fmt,
unsigned long address)
{
-#ifdef CONFIG_KALLSYMS
char str[KSYM_SYMBOL_LEN];
+#ifdef CONFIG_KALLSYMS
const char *name;
sprint_symbol(str, address);
name = kretprobed(str);
- trace_seq_printf(s, fmt, name);
+ if (name && strlen(name)) {
+ trace_seq_printf(s, fmt, name);
+ return;
+ }
#endif
+ snprintf(str, KSYM_SYMBOL_LEN, "0x%08lx", address);
+ trace_seq_printf(s, fmt, str);
}
#ifndef CONFIG_64BIT
@@ -587,6 +597,15 @@ int trace_print_context(struct trace_iterator *iter)
trace_seq_printf(s, "%16s-%-5d [%03d] ",
comm, entry->pid, iter->cpu);
+ if (tr->trace_flags & TRACE_ITER_RECORD_TGID) {
+ unsigned int tgid = trace_find_tgid(entry->pid);
+
+ if (!tgid)
+ trace_seq_printf(s, "(-----) ");
+ else
+ trace_seq_printf(s, "(%5d) ", tgid);
+ }
+
if (tr->trace_flags & TRACE_ITER_IRQ_INFO)
trace_print_lat_fmt(s, entry);
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 4c896a0101bd..b341c02730be 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -12,27 +12,38 @@
#include "trace.h"
-static int sched_ref;
+#define RECORD_CMDLINE 1
+#define RECORD_TGID 2
+
+static int sched_cmdline_ref;
+static int sched_tgid_ref;
static DEFINE_MUTEX(sched_register_mutex);
static void
probe_sched_switch(void *ignore, bool preempt,
struct task_struct *prev, struct task_struct *next)
{
- if (unlikely(!sched_ref))
- return;
+ int flags;
+
+ flags = (RECORD_TGID * !!sched_tgid_ref) +
+ (RECORD_CMDLINE * !!sched_cmdline_ref);
- tracing_record_cmdline(prev);
- tracing_record_cmdline(next);
+ if (!flags)
+ return;
+ tracing_record_taskinfo_sched_switch(prev, next, flags);
}
static void
probe_sched_wakeup(void *ignore, struct task_struct *wakee)
{
- if (unlikely(!sched_ref))
- return;
+ int flags;
+
+ flags = (RECORD_TGID * !!sched_tgid_ref) +
+ (RECORD_CMDLINE * !!sched_cmdline_ref);
- tracing_record_cmdline(current);
+ if (!flags)
+ return;
+ tracing_record_taskinfo(current, flags);
}
static int tracing_sched_register(void)
@@ -75,28 +86,61 @@ static void tracing_sched_unregister(void)
unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
}
-static void tracing_start_sched_switch(void)
+static void tracing_start_sched_switch(int ops)
{
+ bool sched_register = (!sched_cmdline_ref && !sched_tgid_ref);
mutex_lock(&sched_register_mutex);
- if (!(sched_ref++))
+
+ switch (ops) {
+ case RECORD_CMDLINE:
+ sched_cmdline_ref++;
+ break;
+
+ case RECORD_TGID:
+ sched_tgid_ref++;
+ break;
+ }
+
+ if (sched_register && (sched_cmdline_ref || sched_tgid_ref))
tracing_sched_register();
mutex_unlock(&sched_register_mutex);
}
-static void tracing_stop_sched_switch(void)
+static void tracing_stop_sched_switch(int ops)
{
mutex_lock(&sched_register_mutex);
- if (!(--sched_ref))
+
+ switch (ops) {
+ case RECORD_CMDLINE:
+ sched_cmdline_ref--;
+ break;
+
+ case RECORD_TGID:
+ sched_tgid_ref--;
+ break;
+ }
+
+ if (!sched_cmdline_ref && !sched_tgid_ref)
tracing_sched_unregister();
mutex_unlock(&sched_register_mutex);
}
void tracing_start_cmdline_record(void)
{
- tracing_start_sched_switch();
+ tracing_start_sched_switch(RECORD_CMDLINE);
}
void tracing_stop_cmdline_record(void)
{
- tracing_stop_sched_switch();
+ tracing_stop_sched_switch(RECORD_CMDLINE);
+}
+
+void tracing_start_tgid_record(void)
+{
+ tracing_start_sched_switch(RECORD_TGID);
+}
+
+void tracing_stop_tgid_record(void)
+{
+ tracing_stop_sched_switch(RECORD_TGID);
}
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index b4a751e8f9d6..a4df67cbc711 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -406,6 +406,8 @@ static const struct file_operations stack_trace_fops = {
.release = seq_release,
};
+#ifdef CONFIG_DYNAMIC_FTRACE
+
static int
stack_trace_filter_open(struct inode *inode, struct file *file)
{
@@ -423,6 +425,8 @@ static const struct file_operations stack_trace_filter_fops = {
.release = ftrace_regex_release,
};
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
int
stack_trace_sysctl(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
@@ -477,8 +481,10 @@ static __init int stack_trace_init(void)
trace_create_file("stack_trace", 0444, d_tracer,
NULL, &stack_trace_fops);
+#ifdef CONFIG_DYNAMIC_FTRACE
trace_create_file("stack_trace_filter", 0444, d_tracer,
&trace_ops, &stack_trace_filter_fops);
+#endif
if (stack_trace_filter_buf[0])
ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 5e10395da88e..9c4eef20301c 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -559,11 +559,29 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit;
+static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs,
+ struct syscall_metadata *sys_data,
+ struct syscall_trace_enter *rec) {
+ struct syscall_tp_t {
+ unsigned long long regs;
+ unsigned long syscall_nr;
+ unsigned long args[sys_data->nb_args];
+ } param;
+ int i;
+
+ *(struct pt_regs **)&param = regs;
+ param.syscall_nr = rec->nr;
+ for (i = 0; i < sys_data->nb_args; i++)
+ param.args[i] = rec->args[i];
+ return trace_call_bpf(prog, &param);
+}
+
static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
struct syscall_metadata *sys_data;
struct syscall_trace_enter *rec;
struct hlist_head *head;
+ struct bpf_prog *prog;
int syscall_nr;
int rctx;
int size;
@@ -578,8 +596,9 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
if (!sys_data)
return;
+ prog = READ_ONCE(sys_data->enter_event->prog);
head = this_cpu_ptr(sys_data->enter_event->perf_events);
- if (hlist_empty(head))
+ if (!prog && hlist_empty(head))
return;
/* get the size after alignment with the u32 buffer size field */
@@ -594,9 +613,16 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
rec->nr = syscall_nr;
syscall_get_arguments(current, regs, 0, sys_data->nb_args,
(unsigned long *)&rec->args);
+
+ if ((prog && !perf_call_bpf_enter(prog, regs, sys_data, rec)) ||
+ hlist_empty(head)) {
+ perf_swevent_put_recursion_context(rctx);
+ return;
+ }
+
perf_trace_buf_submit(rec, size, rctx,
sys_data->enter_event->event.type, 1, regs,
- head, NULL);
+ head, NULL, NULL);
}
static int perf_sysenter_enable(struct trace_event_call *call)
@@ -633,11 +659,26 @@ static void perf_sysenter_disable(struct trace_event_call *call)
mutex_unlock(&syscall_trace_lock);
}
+static int perf_call_bpf_exit(struct bpf_prog *prog, struct pt_regs *regs,
+ struct syscall_trace_exit *rec) {
+ struct syscall_tp_t {
+ unsigned long long regs;
+ unsigned long syscall_nr;
+ unsigned long ret;
+ } param;
+
+ *(struct pt_regs **)&param = regs;
+ param.syscall_nr = rec->nr;
+ param.ret = rec->ret;
+ return trace_call_bpf(prog, &param);
+}
+
static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
struct syscall_metadata *sys_data;
struct syscall_trace_exit *rec;
struct hlist_head *head;
+ struct bpf_prog *prog;
int syscall_nr;
int rctx;
int size;
@@ -652,8 +693,9 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
if (!sys_data)
return;
+ prog = READ_ONCE(sys_data->exit_event->prog);
head = this_cpu_ptr(sys_data->exit_event->perf_events);
- if (hlist_empty(head))
+ if (!prog && hlist_empty(head))
return;
/* We can probably do that at build time */
@@ -666,8 +708,15 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
rec->nr = syscall_nr;
rec->ret = syscall_get_return_value(current, regs);
+
+ if ((prog && !perf_call_bpf_exit(prog, regs, rec)) ||
+ hlist_empty(head)) {
+ perf_swevent_put_recursion_context(rctx);
+ return;
+ }
+
perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
- 1, regs, head, NULL);
+ 1, regs, head, NULL, NULL);
}
static int perf_sysexit_enable(struct trace_event_call *call)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index a7581fec9681..4525e0271a53 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1156,7 +1156,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
}
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
- head, NULL);
+ head, NULL, NULL);
out:
preempt_enable();
}
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index 0a689bbb78ef..305039b122fa 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -221,16 +221,19 @@ void tracing_map_array_free(struct tracing_map_array *a)
if (!a)
return;
- if (!a->pages) {
- kfree(a);
- return;
- }
+ if (!a->pages)
+ goto free;
for (i = 0; i < a->n_pages; i++) {
if (!a->pages[i])
break;
free_page((unsigned long)a->pages[i]);
}
+
+ kfree(a->pages);
+
+ free:
+ kfree(a);
}
struct tracing_map_array *tracing_map_array_alloc(unsigned int n_elts,
diff --git a/kernel/up.c b/kernel/up.c
index ee81ac9af4ca..42c46bf3e0a5 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -23,7 +23,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
}
EXPORT_SYMBOL(smp_call_function_single);
-int smp_call_function_single_async(int cpu, struct call_single_data *csd)
+int smp_call_function_single_async(int cpu, call_single_data_t *csd)
{
unsigned long flags;
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 03e0b69bb5bf..f5d52024f6b7 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -9,7 +9,7 @@
* to those contributors as well.
*/
-#define pr_fmt(fmt) "NMI watchdog: " fmt
+#define pr_fmt(fmt) "watchdog: " fmt
#include <linux/mm.h>
#include <linux/cpu.h>
@@ -29,15 +29,58 @@
#include <linux/kvm_para.h>
#include <linux/kthread.h>
+/* Watchdog configuration */
static DEFINE_MUTEX(watchdog_proc_mutex);
-#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
-unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED;
+int __read_mostly nmi_watchdog_enabled;
+
+#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG)
+unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED |
+ NMI_WATCHDOG_ENABLED;
#else
unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED;
#endif
-int __read_mostly nmi_watchdog_enabled;
+
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+/* boot commands */
+/*
+ * Should we panic when a soft-lockup or hard-lockup occurs:
+ */
+unsigned int __read_mostly hardlockup_panic =
+ CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
+/*
+ * We may not want to enable hard lockup detection by default in all cases,
+ * for example when running the kernel as a guest on a hypervisor. In these
+ * cases this function can be called to disable hard lockup detection. This
+ * function should only be executed once by the boot processor before the
+ * kernel command line parameters are parsed, because otherwise it is not
+ * possible to override this in hardlockup_panic_setup().
+ */
+void hardlockup_detector_disable(void)
+{
+ watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
+}
+
+static int __init hardlockup_panic_setup(char *str)
+{
+ if (!strncmp(str, "panic", 5))
+ hardlockup_panic = 1;
+ else if (!strncmp(str, "nopanic", 7))
+ hardlockup_panic = 0;
+ else if (!strncmp(str, "0", 1))
+ watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
+ else if (!strncmp(str, "1", 1))
+ watchdog_enabled |= NMI_WATCHDOG_ENABLED;
+ return 1;
+}
+__setup("nmi_watchdog=", hardlockup_panic_setup);
+
+#endif
+
+#ifdef CONFIG_SOFTLOCKUP_DETECTOR
int __read_mostly soft_watchdog_enabled;
+#endif
+
int __read_mostly watchdog_user_enabled;
int __read_mostly watchdog_thresh = 10;
@@ -45,15 +88,9 @@ int __read_mostly watchdog_thresh = 10;
int __read_mostly sysctl_softlockup_all_cpu_backtrace;
int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
#endif
-static struct cpumask watchdog_cpumask __read_mostly;
+struct cpumask watchdog_cpumask __read_mostly;
unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
-/* Helper for online, unparked cpus. */
-#define for_each_watchdog_cpu(cpu) \
- for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
-
-atomic_t watchdog_park_in_progress = ATOMIC_INIT(0);
-
/*
* The 'watchdog_running' variable is set to 1 when the watchdog threads
* are registered/started and is set to 0 when the watchdog threads are
@@ -72,7 +109,47 @@ static int __read_mostly watchdog_running;
* of 'watchdog_running' cannot change while the watchdog is deactivated
* temporarily (see related code in 'proc' handlers).
*/
-static int __read_mostly watchdog_suspended;
+int __read_mostly watchdog_suspended;
+
+/*
+ * These functions can be overridden if an architecture implements its
+ * own hardlockup detector.
+ *
+ * watchdog_nmi_enable/disable can be implemented to start and stop when
+ * softlockup watchdog threads start and stop. The arch must select the
+ * SOFTLOCKUP_DETECTOR Kconfig.
+ */
+int __weak watchdog_nmi_enable(unsigned int cpu)
+{
+ return 0;
+}
+void __weak watchdog_nmi_disable(unsigned int cpu)
+{
+}
+
+/*
+ * watchdog_nmi_reconfigure can be implemented to be notified after any
+ * watchdog configuration change. The arch hardlockup watchdog should
+ * respond to the following variables:
+ * - nmi_watchdog_enabled
+ * - watchdog_thresh
+ * - watchdog_cpumask
+ * - sysctl_hardlockup_all_cpu_backtrace
+ * - hardlockup_panic
+ * - watchdog_suspended
+ */
+void __weak watchdog_nmi_reconfigure(void)
+{
+}
+
+
+#ifdef CONFIG_SOFTLOCKUP_DETECTOR
+
+/* Helper for online, unparked cpus. */
+#define for_each_watchdog_cpu(cpu) \
+ for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
+
+atomic_t watchdog_park_in_progress = ATOMIC_INIT(0);
static u64 __read_mostly sample_period;
@@ -120,6 +197,7 @@ static int __init softlockup_all_cpu_backtrace_setup(char *str)
return 1;
}
__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
static int __init hardlockup_all_cpu_backtrace_setup(char *str)
{
sysctl_hardlockup_all_cpu_backtrace =
@@ -128,6 +206,7 @@ static int __init hardlockup_all_cpu_backtrace_setup(char *str)
}
__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
#endif
+#endif
/*
* Hard-lockup warnings should be triggered after just a few seconds. Soft-
@@ -161,6 +240,7 @@ static void set_sample_period(void)
* hardlockup detector generates a warning
*/
sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5);
+ watchdog_update_hrtimer_threshold(sample_period);
}
/* Commands for resetting the watchdog */
@@ -213,18 +293,6 @@ void touch_softlockup_watchdog_sync(void)
__this_cpu_write(watchdog_touch_ts, 0);
}
-/* watchdog detector functions */
-bool is_hardlockup(void)
-{
- unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
-
- if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
- return true;
-
- __this_cpu_write(hrtimer_interrupts_saved, hrint);
- return false;
-}
-
static int is_softlockup(unsigned long touch_ts)
{
unsigned long now = get_timestamp();
@@ -237,21 +305,21 @@ static int is_softlockup(unsigned long touch_ts)
return 0;
}
-static void watchdog_interrupt_count(void)
+/* watchdog detector functions */
+bool is_hardlockup(void)
{
- __this_cpu_inc(hrtimer_interrupts);
-}
+ unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
-/*
- * These two functions are mostly architecture specific
- * defining them as weak here.
- */
-int __weak watchdog_nmi_enable(unsigned int cpu)
-{
- return 0;
+ if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
+ return true;
+
+ __this_cpu_write(hrtimer_interrupts_saved, hrint);
+ return false;
}
-void __weak watchdog_nmi_disable(unsigned int cpu)
+
+static void watchdog_interrupt_count(void)
{
+ __this_cpu_inc(hrtimer_interrupts);
}
static int watchdog_enable_all_cpus(void);
@@ -502,57 +570,6 @@ static void watchdog_unpark_threads(void)
kthread_unpark(per_cpu(softlockup_watchdog, cpu));
}
-/*
- * Suspend the hard and soft lockup detector by parking the watchdog threads.
- */
-int lockup_detector_suspend(void)
-{
- int ret = 0;
-
- get_online_cpus();
- mutex_lock(&watchdog_proc_mutex);
- /*
- * Multiple suspend requests can be active in parallel (counted by
- * the 'watchdog_suspended' variable). If the watchdog threads are
- * running, the first caller takes care that they will be parked.
- * The state of 'watchdog_running' cannot change while a suspend
- * request is active (see related code in 'proc' handlers).
- */
- if (watchdog_running && !watchdog_suspended)
- ret = watchdog_park_threads();
-
- if (ret == 0)
- watchdog_suspended++;
- else {
- watchdog_disable_all_cpus();
- pr_err("Failed to suspend lockup detectors, disabled\n");
- watchdog_enabled = 0;
- }
-
- mutex_unlock(&watchdog_proc_mutex);
-
- return ret;
-}
-
-/*
- * Resume the hard and soft lockup detector by unparking the watchdog threads.
- */
-void lockup_detector_resume(void)
-{
- mutex_lock(&watchdog_proc_mutex);
-
- watchdog_suspended--;
- /*
- * The watchdog threads are unparked if they were previously running
- * and if there is no more active suspend request.
- */
- if (watchdog_running && !watchdog_suspended)
- watchdog_unpark_threads();
-
- mutex_unlock(&watchdog_proc_mutex);
- put_online_cpus();
-}
-
static int update_watchdog_all_cpus(void)
{
int ret;
@@ -605,6 +622,100 @@ static void watchdog_disable_all_cpus(void)
}
#ifdef CONFIG_SYSCTL
+static int watchdog_update_cpus(void)
+{
+ return smpboot_update_cpumask_percpu_thread(
+ &watchdog_threads, &watchdog_cpumask);
+}
+#endif
+
+#else /* SOFTLOCKUP */
+static int watchdog_park_threads(void)
+{
+ return 0;
+}
+
+static void watchdog_unpark_threads(void)
+{
+}
+
+static int watchdog_enable_all_cpus(void)
+{
+ return 0;
+}
+
+static void watchdog_disable_all_cpus(void)
+{
+}
+
+#ifdef CONFIG_SYSCTL
+static int watchdog_update_cpus(void)
+{
+ return 0;
+}
+#endif
+
+static void set_sample_period(void)
+{
+}
+#endif /* SOFTLOCKUP */
+
+/*
+ * Suspend the hard and soft lockup detector by parking the watchdog threads.
+ */
+int lockup_detector_suspend(void)
+{
+ int ret = 0;
+
+ get_online_cpus();
+ mutex_lock(&watchdog_proc_mutex);
+ /*
+ * Multiple suspend requests can be active in parallel (counted by
+ * the 'watchdog_suspended' variable). If the watchdog threads are
+ * running, the first caller takes care that they will be parked.
+ * The state of 'watchdog_running' cannot change while a suspend
+ * request is active (see related code in 'proc' handlers).
+ */
+ if (watchdog_running && !watchdog_suspended)
+ ret = watchdog_park_threads();
+
+ if (ret == 0)
+ watchdog_suspended++;
+ else {
+ watchdog_disable_all_cpus();
+ pr_err("Failed to suspend lockup detectors, disabled\n");
+ watchdog_enabled = 0;
+ }
+
+ watchdog_nmi_reconfigure();
+
+ mutex_unlock(&watchdog_proc_mutex);
+
+ return ret;
+}
+
+/*
+ * Resume the hard and soft lockup detector by unparking the watchdog threads.
+ */
+void lockup_detector_resume(void)
+{
+ mutex_lock(&watchdog_proc_mutex);
+
+ watchdog_suspended--;
+ /*
+ * The watchdog threads are unparked if they were previously running
+ * and if there is no more active suspend request.
+ */
+ if (watchdog_running && !watchdog_suspended)
+ watchdog_unpark_threads();
+
+ watchdog_nmi_reconfigure();
+
+ mutex_unlock(&watchdog_proc_mutex);
+ put_online_cpus();
+}
+
+#ifdef CONFIG_SYSCTL
/*
* Update the run state of the lockup detectors.
@@ -625,6 +736,8 @@ static int proc_watchdog_update(void)
else
watchdog_disable_all_cpus();
+ watchdog_nmi_reconfigure();
+
return err;
}
@@ -810,10 +923,11 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
* a temporary cpumask, so we are likely not in a
* position to do much else to make things better.
*/
- if (smpboot_update_cpumask_percpu_thread(
- &watchdog_threads, &watchdog_cpumask) != 0)
+ if (watchdog_update_cpus() != 0)
pr_err("cpumask update failed\n");
}
+
+ watchdog_nmi_reconfigure();
}
out:
mutex_unlock(&watchdog_proc_mutex);
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 54a427d1f344..3a09ea1b1d3d 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -22,41 +22,9 @@ static DEFINE_PER_CPU(bool, hard_watchdog_warn);
static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
-/* boot commands */
-/*
- * Should we panic when a soft-lockup or hard-lockup occurs:
- */
-unsigned int __read_mostly hardlockup_panic =
- CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
static unsigned long hardlockup_allcpu_dumped;
-/*
- * We may not want to enable hard lockup detection by default in all cases,
- * for example when running the kernel as a guest on a hypervisor. In these
- * cases this function can be called to disable hard lockup detection. This
- * function should only be executed once by the boot processor before the
- * kernel command line parameters are parsed, because otherwise it is not
- * possible to override this in hardlockup_panic_setup().
- */
-void hardlockup_detector_disable(void)
-{
- watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
-}
-static int __init hardlockup_panic_setup(char *str)
-{
- if (!strncmp(str, "panic", 5))
- hardlockup_panic = 1;
- else if (!strncmp(str, "nopanic", 7))
- hardlockup_panic = 0;
- else if (!strncmp(str, "0", 1))
- watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
- else if (!strncmp(str, "1", 1))
- watchdog_enabled |= NMI_WATCHDOG_ENABLED;
- return 1;
-}
-__setup("nmi_watchdog=", hardlockup_panic_setup);
-
-void touch_nmi_watchdog(void)
+void arch_touch_nmi_watchdog(void)
{
/*
* Using __raw here because some code paths have
@@ -66,9 +34,64 @@ void touch_nmi_watchdog(void)
* going off.
*/
raw_cpu_write(watchdog_nmi_touch, true);
- touch_softlockup_watchdog();
}
-EXPORT_SYMBOL(touch_nmi_watchdog);
+EXPORT_SYMBOL(arch_touch_nmi_watchdog);
+
+#ifdef CONFIG_HARDLOCKUP_CHECK_TIMESTAMP
+static DEFINE_PER_CPU(ktime_t, last_timestamp);
+static DEFINE_PER_CPU(unsigned int, nmi_rearmed);
+static ktime_t watchdog_hrtimer_sample_threshold __read_mostly;
+
+void watchdog_update_hrtimer_threshold(u64 period)
+{
+ /*
+ * The hrtimer runs with a period of (watchdog_threshold * 2) / 5
+ *
+ * So it runs effectively with 2.5 times the rate of the NMI
+ * watchdog. That means the hrtimer should fire 2-3 times before
+ * the NMI watchdog expires. The NMI watchdog on x86 is based on
+ * unhalted CPU cycles, so if Turbo-Mode is enabled the CPU cycles
+ * might run way faster than expected and the NMI fires in a
+ * smaller period than the one deduced from the nominal CPU
+ * frequency. Depending on the Turbo-Mode factor this might be fast
+ * enough to get the NMI period smaller than the hrtimer watchdog
+ * period and trigger false positives.
+ *
+ * The sample threshold is used to check in the NMI handler whether
+ * the minimum time between two NMI samples has elapsed. That
+ * prevents false positives.
+ *
+ * Set this to 4/5 of the actual watchdog threshold period so the
+ * hrtimer is guaranteed to fire at least once within the real
+ * watchdog threshold.
+ */
+ watchdog_hrtimer_sample_threshold = period * 2;
+}
+
+static bool watchdog_check_timestamp(void)
+{
+ ktime_t delta, now = ktime_get_mono_fast_ns();
+
+ delta = now - __this_cpu_read(last_timestamp);
+ if (delta < watchdog_hrtimer_sample_threshold) {
+ /*
+ * If ktime is jiffies based, a stalled timer would prevent
+ * jiffies from being incremented and the filter would look
+ * at a stale timestamp and never trigger.
+ */
+ if (__this_cpu_inc_return(nmi_rearmed) < 10)
+ return false;
+ }
+ __this_cpu_write(nmi_rearmed, 0);
+ __this_cpu_write(last_timestamp, now);
+ return true;
+}
+#else
+static inline bool watchdog_check_timestamp(void)
+{
+ return true;
+}
+#endif
static struct perf_event_attr wd_hw_attr = {
.type = PERF_TYPE_HARDWARE,
@@ -94,6 +117,9 @@ static void watchdog_overflow_callback(struct perf_event *event,
return;
}
+ if (!watchdog_check_timestamp())
+ return;
+
/* check for a hardlockup
* This is done by making sure our timer interrupt
* is incrementing. The timer interrupt should have
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index a86688fabc55..64d0edf428f8 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -21,7 +21,7 @@
* pools for workqueues which are not bound to any specific CPU - the
* number of these backing pools is dynamic.
*
- * Please read Documentation/workqueue.txt for details.
+ * Please read Documentation/core-api/workqueue.rst for details.
*/
#include <linux/export.h>
@@ -2091,8 +2091,30 @@ __acquires(&pool->lock)
spin_unlock_irq(&pool->lock);
- lock_map_acquire_read(&pwq->wq->lockdep_map);
+ lock_map_acquire(&pwq->wq->lockdep_map);
lock_map_acquire(&lockdep_map);
+ /*
+ * Strictly speaking we should mark the invariant state without holding
+ * any locks, that is, before these two lock_map_acquire()'s.
+ *
+ * However, that would result in:
+ *
+ * A(W1)
+ * WFC(C)
+ * A(W1)
+ * C(C)
+ *
+ * Which would create W1->C->W1 dependencies, even though there is no
+ * actual deadlock possible. There are two solutions, using a
+ * read-recursive acquire on the work(queue) 'locks', but this will then
+ * hit the lockdep limitation on recursive locks, or simply discard
+ * these locks.
+ *
+ * AFAICT there is no possible deadlock scenario between the
+ * flush_work() and complete() primitives (except for single-threaded
+ * workqueues), so hiding them isn't a problem.
+ */
+ lockdep_invariant_state(true);
trace_workqueue_execute_start(work);
worker->current_func(work);
/*
@@ -2247,7 +2269,7 @@ sleep:
* event.
*/
worker_enter_idle(worker);
- __set_current_state(TASK_INTERRUPTIBLE);
+ __set_current_state(TASK_IDLE);
spin_unlock_irq(&pool->lock);
schedule();
goto woke_up;
@@ -2289,7 +2311,7 @@ static int rescuer_thread(void *__rescuer)
*/
rescuer->task->flags |= PF_WQ_WORKER;
repeat:
- set_current_state(TASK_INTERRUPTIBLE);
+ set_current_state(TASK_IDLE);
/*
* By the time the rescuer is requested to stop, the workqueue
@@ -2474,7 +2496,16 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
*/
INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
__set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
- init_completion(&barr->done);
+
+ /*
+ * Explicitly init the crosslock for wq_barrier::done, make its lock
+ * key a subkey of the corresponding work. As a result we won't
+ * build a dependency between wq_barrier::done and unrelated work.
+ */
+ lockdep_init_map_crosslock((struct lockdep_map *)&barr->done.map,
+ "(complete)wq_barr::done",
+ target->lockdep_map.key, 1);
+ __init_completion(&barr->done);
barr->task = current;
/*
@@ -2815,16 +2846,18 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
spin_unlock_irq(&pool->lock);
/*
- * If @max_active is 1 or rescuer is in use, flushing another work
- * item on the same workqueue may lead to deadlock. Make sure the
- * flusher is not running on the same workqueue by verifying write
- * access.
+ * Force a lock recursion deadlock when using flush_work() inside a
+ * single-threaded or rescuer equipped workqueue.
+ *
+ * For single threaded workqueues the deadlock happens when the work
+ * is after the work issuing the flush_work(). For rescuer equipped
+ * workqueues the deadlock happens when the rescuer stalls, blocking
+ * forward progress.
*/
- if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)
+ if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer) {
lock_map_acquire(&pwq->wq->lockdep_map);
- else
- lock_map_acquire_read(&pwq->wq->lockdep_map);
- lock_map_release(&pwq->wq->lockdep_map);
+ lock_map_release(&pwq->wq->lockdep_map);
+ }
return true;
already_gone:
@@ -3577,6 +3610,13 @@ static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
/* yeap, return possible CPUs in @node that @attrs wants */
cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]);
+
+ if (cpumask_empty(cpumask)) {
+ pr_warn_once("WARNING: workqueue cpumask: online intersect > "
+ "possible intersect\n");
+ return false;
+ }
+
return !cpumask_equal(cpumask, attrs->cpumask);
use_dfl:
@@ -3744,8 +3784,12 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
return -EINVAL;
/* creating multiple pwqs breaks ordering guarantee */
- if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
- return -EINVAL;
+ if (!list_empty(&wq->pwqs)) {
+ if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
+ return -EINVAL;
+
+ wq->flags &= ~__WQ_ORDERED;
+ }
ctx = apply_wqattrs_prepare(wq, attrs);
if (!ctx)
@@ -3929,6 +3973,16 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
struct workqueue_struct *wq;
struct pool_workqueue *pwq;
+ /*
+ * Unbound && max_active == 1 used to imply ordered, which is no
+ * longer the case on NUMA machines due to per-node pools. While
+ * alloc_ordered_workqueue() is the right way to create an ordered
+ * workqueue, keep the previous behavior to avoid subtle breakages
+ * on NUMA.
+ */
+ if ((flags & WQ_UNBOUND) && max_active == 1)
+ flags |= __WQ_ORDERED;
+
/* see the comment above the definition of WQ_POWER_EFFICIENT */
if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
flags |= WQ_UNBOUND;
@@ -4119,13 +4173,14 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
struct pool_workqueue *pwq;
/* disallow meddling with max_active for ordered workqueues */
- if (WARN_ON(wq->flags & __WQ_ORDERED))
+ if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
return;
max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);
mutex_lock(&wq->mutex);
+ wq->flags &= ~__WQ_ORDERED;
wq->saved_max_active = max_active;
for_each_pwq(pwq, wq)
@@ -5253,7 +5308,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq)
* attributes breaks ordering guarantee. Disallow exposing ordered
* workqueues.
*/
- if (WARN_ON(wq->flags & __WQ_ORDERED))
+ if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
return -EINVAL;
wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);