summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/audit.c374
-rw-r--r--kernel/audit.h15
-rw-r--r--kernel/audit_fsnotify.c12
-rw-r--r--kernel/audit_tree.c87
-rw-r--r--kernel/audit_watch.c21
-rw-r--r--kernel/auditfilter.c18
-rw-r--r--kernel/auditsc.c36
-rw-r--r--kernel/bpf/Makefile2
-rw-r--r--kernel/bpf/arraymap.c138
-rw-r--r--kernel/bpf/bpf_lru_list.c2
-rw-r--r--kernel/bpf/cgroup.c5
-rw-r--r--kernel/bpf/core.c33
-rw-r--r--kernel/bpf/hashtab.c185
-rw-r--r--kernel/bpf/inode.c2
-rw-r--r--kernel/bpf/lpm_trie.c15
-rw-r--r--kernel/bpf/map_in_map.c97
-rw-r--r--kernel/bpf/map_in_map.h23
-rw-r--r--kernel/bpf/stackmap.c15
-rw-r--r--kernel/bpf/syscall.c186
-rw-r--r--kernel/bpf/verifier.c438
-rw-r--r--kernel/cgroup/cgroup-internal.h7
-rw-r--r--kernel/cgroup/cgroup-v1.c20
-rw-r--r--kernel/cgroup/cgroup.c54
-rw-r--r--kernel/cgroup/cpuset.c15
-rw-r--r--kernel/cgroup/namespace.c2
-rw-r--r--kernel/compat.c10
-rw-r--r--kernel/cpu.c10
-rw-r--r--kernel/crash_core.c439
-rw-r--r--kernel/events/callchain.c6
-rw-r--r--kernel/events/core.c160
-rw-r--r--kernel/events/ring_buffer.c34
-rw-r--r--kernel/fork.c69
-rw-r--r--kernel/futex.c518
-rw-r--r--kernel/gcov/base.c6
-rw-r--r--kernel/gcov/gcc_4_7.c4
-rw-r--r--kernel/groups.c2
-rw-r--r--kernel/hung_task.c8
-rw-r--r--kernel/irq/affinity.c20
-rw-r--r--kernel/irq/chip.c7
-rw-r--r--kernel/irq/manage.c21
-rw-r--r--kernel/kcov.c9
-rw-r--r--kernel/kexec_core.c431
-rw-r--r--kernel/kprobes.c86
-rw-r--r--kernel/ksysfs.c8
-rw-r--r--kernel/kthread.c3
-rw-r--r--kernel/livepatch/Kconfig1
-rw-r--r--kernel/livepatch/Makefile2
-rw-r--r--kernel/livepatch/core.c450
-rw-r--r--kernel/livepatch/core.h6
-rw-r--r--kernel/livepatch/patch.c272
-rw-r--r--kernel/livepatch/patch.h33
-rw-r--r--kernel/livepatch/transition.c553
-rw-r--r--kernel/livepatch/transition.h14
-rw-r--r--kernel/locking/lockdep.c341
-rw-r--r--kernel/locking/lockdep_internals.h6
-rw-r--r--kernel/locking/rtmutex-debug.c18
-rw-r--r--kernel/locking/rtmutex-debug.h3
-rw-r--r--kernel/locking/rtmutex.c410
-rw-r--r--kernel/locking/rtmutex.h2
-rw-r--r--kernel/locking/rtmutex_common.h25
-rw-r--r--kernel/locking/rwsem.c6
-rw-r--r--kernel/locking/test-ww_mutex.c29
-rw-r--r--kernel/memremap.c22
-rw-r--r--kernel/module.c51
-rw-r--r--kernel/nsproxy.c3
-rw-r--r--kernel/padata.c20
-rw-r--r--kernel/params.c52
-rw-r--r--kernel/pid.c4
-rw-r--r--kernel/pid_namespace.c36
-rw-r--r--kernel/power/snapshot.c5
-rw-r--r--kernel/printk/braille.c15
-rw-r--r--kernel/printk/braille.h13
-rw-r--r--kernel/printk/printk.c67
-rw-r--r--kernel/ptrace.c34
-rw-r--r--kernel/rcu/Makefile5
-rw-r--r--kernel/rcu/rcu.h153
-rw-r--r--kernel/rcu/rcu_segcblist.c505
-rw-r--r--kernel/rcu/rcu_segcblist.h164
-rw-r--r--kernel/rcu/rcutorture.c43
-rw-r--r--kernel/rcu/srcu.c17
-rw-r--r--kernel/rcu/srcutiny.c217
-rw-r--r--kernel/rcu/srcutree.c1154
-rw-r--r--kernel/rcu/tiny.c20
-rw-r--r--kernel/rcu/tiny_plugin.h13
-rw-r--r--kernel/rcu/tree.c772
-rw-r--r--kernel/rcu/tree.h163
-rw-r--r--kernel/rcu/tree_exp.h25
-rw-r--r--kernel/rcu/tree_plugin.h64
-rw-r--r--kernel/rcu/tree_trace.c26
-rw-r--r--kernel/rcu/update.c53
-rw-r--r--kernel/relay.c1
-rw-r--r--kernel/sched/clock.c46
-rw-r--r--kernel/sched/core.c294
-rw-r--r--kernel/sched/cpufreq_schedutil.c82
-rw-r--r--kernel/sched/cputime.c27
-rw-r--r--kernel/sched/fair.c420
-rw-r--r--kernel/sched/features.h7
-rw-r--r--kernel/sched/idle.c6
-rw-r--r--kernel/sched/rt.c81
-rw-r--r--kernel/sched/sched-pelt.h13
-rw-r--r--kernel/sched/sched.h76
-rw-r--r--kernel/signal.c4
-rw-r--r--kernel/softirq.c2
-rw-r--r--kernel/stacktrace.c12
-rw-r--r--kernel/sys.c33
-rw-r--r--kernel/sysctl.c7
-rw-r--r--kernel/taskstats.c14
-rw-r--r--kernel/time/alarmtimer.c27
-rw-r--r--kernel/time/clockevents.c2
-rw-r--r--kernel/time/hrtimer.c17
-rw-r--r--kernel/time/posix-clock.c10
-rw-r--r--kernel/time/posix-cpu-timers.c87
-rw-r--r--kernel/time/posix-stubs.c20
-rw-r--r--kernel/time/posix-timers.c97
-rw-r--r--kernel/time/sched_clock.c5
-rw-r--r--kernel/time/tick-sched.c12
-rw-r--r--kernel/time/time.c18
-rw-r--r--kernel/time/timekeeping.c3
-rw-r--r--kernel/time/timer.c4
-rw-r--r--kernel/time/timer_list.c6
-rw-r--r--kernel/trace/Kconfig5
-rw-r--r--kernel/trace/blktrace.c39
-rw-r--r--kernel/trace/bpf_trace.c32
-rw-r--r--kernel/trace/ftrace.c1037
-rw-r--r--kernel/trace/ring_buffer.c64
-rw-r--r--kernel/trace/ring_buffer_benchmark.c2
-rw-r--r--kernel/trace/trace.c289
-rw-r--r--kernel/trace/trace.h84
-rw-r--r--kernel/trace/trace_benchmark.c14
-rw-r--r--kernel/trace/trace_entries.h6
-rw-r--r--kernel/trace/trace_events.c151
-rw-r--r--kernel/trace/trace_functions.c227
-rw-r--r--kernel/trace/trace_hwlat.c14
-rw-r--r--kernel/trace/trace_kprobe.c53
-rw-r--r--kernel/trace/trace_output.c9
-rw-r--r--kernel/trace/trace_stack.c35
-rw-r--r--kernel/workqueue.c28
138 files changed, 9136 insertions, 3927 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index b302b4731d16..72aa080f91f0 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -59,6 +59,7 @@ obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_MODULE_SIG) += module_signing.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
+obj-$(CONFIG_CRASH_CORE) += crash_core.o
obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
diff --git a/kernel/audit.c b/kernel/audit.c
index 2f4964cfde0b..4b7d49868ce1 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -58,6 +58,8 @@
#include <linux/rcupdate.h>
#include <linux/mutex.h>
#include <linux/gfp.h>
+#include <linux/pid.h>
+#include <linux/slab.h>
#include <linux/audit.h>
@@ -110,18 +112,19 @@ struct audit_net {
* @pid: auditd PID
* @portid: netlink portid
* @net: the associated network namespace
- * @lock: spinlock to protect write access
+ * @rcu: RCU head
*
* Description:
* This struct is RCU protected; you must either hold the RCU lock for reading
- * or the included spinlock for writing.
+ * or the associated spinlock for writing.
*/
static struct auditd_connection {
- int pid;
+ struct pid *pid;
u32 portid;
struct net *net;
- spinlock_t lock;
-} auditd_conn;
+ struct rcu_head rcu;
+} *auditd_conn = NULL;
+static DEFINE_SPINLOCK(auditd_conn_lock);
/* If audit_rate_limit is non-zero, limit the rate of sending audit records
* to that number per second. This prevents DoS attacks, but results in
@@ -151,16 +154,10 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
/* Hash for inode-based rules */
struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
-/* The audit_freelist is a list of pre-allocated audit buffers (if more
- * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of
- * being placed on the freelist). */
-static DEFINE_SPINLOCK(audit_freelist_lock);
-static int audit_freelist_count;
-static LIST_HEAD(audit_freelist);
+static struct kmem_cache *audit_buffer_cache;
/* queue msgs to send via kauditd_task */
static struct sk_buff_head audit_queue;
-static void kauditd_hold_skb(struct sk_buff *skb);
/* queue msgs due to temporary unicast send problems */
static struct sk_buff_head audit_retry_queue;
/* queue msgs waiting for new auditd connection */
@@ -192,17 +189,12 @@ DEFINE_MUTEX(audit_cmd_mutex);
* should be at least that large. */
#define AUDIT_BUFSIZ 1024
-/* AUDIT_MAXFREE is the number of empty audit_buffers we keep on the
- * audit_freelist. Doing so eliminates many kmalloc/kfree calls. */
-#define AUDIT_MAXFREE (2*NR_CPUS)
-
/* The audit_buffer is used when formatting an audit record. The caller
* locks briefly to get the record off the freelist or to allocate the
* buffer, and locks briefly to send the buffer to the netlink layer or
* to place it on a transmit queue. Multiple audit_buffers can be in
* use simultaneously. */
struct audit_buffer {
- struct list_head list;
struct sk_buff *skb; /* formatted skb ready to send */
struct audit_context *ctx; /* NULL or associated context */
gfp_t gfp_mask;
@@ -221,18 +213,42 @@ struct audit_reply {
* Description:
* Return 1 if the task is a registered audit daemon, 0 otherwise.
*/
-int auditd_test_task(const struct task_struct *task)
+int auditd_test_task(struct task_struct *task)
{
int rc;
+ struct auditd_connection *ac;
rcu_read_lock();
- rc = (auditd_conn.pid && task->tgid == auditd_conn.pid ? 1 : 0);
+ ac = rcu_dereference(auditd_conn);
+ rc = (ac && ac->pid == task_tgid(task) ? 1 : 0);
rcu_read_unlock();
return rc;
}
/**
+ * auditd_pid_vnr - Return the auditd PID relative to the namespace
+ *
+ * Description:
+ * Returns the PID in relation to the namespace, 0 on failure.
+ */
+static pid_t auditd_pid_vnr(void)
+{
+ pid_t pid;
+ const struct auditd_connection *ac;
+
+ rcu_read_lock();
+ ac = rcu_dereference(auditd_conn);
+ if (!ac || !ac->pid)
+ pid = 0;
+ else
+ pid = pid_vnr(ac->pid);
+ rcu_read_unlock();
+
+ return pid;
+}
+
+/**
* audit_get_sk - Return the audit socket for the given network namespace
* @net: the destination network namespace
*
@@ -251,14 +267,6 @@ static struct sock *audit_get_sk(const struct net *net)
return aunet->sk;
}
-static void audit_set_portid(struct audit_buffer *ab, __u32 portid)
-{
- if (ab) {
- struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
- nlh->nlmsg_pid = portid;
- }
-}
-
void audit_panic(const char *message)
{
switch (audit_failure) {
@@ -428,6 +436,24 @@ static int audit_set_failure(u32 state)
}
/**
+ * auditd_conn_free - RCU helper to release an auditd connection struct
+ * @rcu: RCU head
+ *
+ * Description:
+ * Drop any references inside the auditd connection tracking struct and free
+ * the memory.
+ */
+ static void auditd_conn_free(struct rcu_head *rcu)
+ {
+ struct auditd_connection *ac;
+
+ ac = container_of(rcu, struct auditd_connection, rcu);
+ put_pid(ac->pid);
+ put_net(ac->net);
+ kfree(ac);
+ }
+
+/**
* auditd_set - Set/Reset the auditd connection state
* @pid: auditd PID
* @portid: auditd netlink portid
@@ -435,46 +461,33 @@ static int audit_set_failure(u32 state)
*
* Description:
* This function will obtain and drop network namespace references as
- * necessary.
+ * necessary. Returns zero on success, negative values on failure.
*/
-static void auditd_set(int pid, u32 portid, struct net *net)
+static int auditd_set(struct pid *pid, u32 portid, struct net *net)
{
unsigned long flags;
+ struct auditd_connection *ac_old, *ac_new;
- spin_lock_irqsave(&auditd_conn.lock, flags);
- auditd_conn.pid = pid;
- auditd_conn.portid = portid;
- if (auditd_conn.net)
- put_net(auditd_conn.net);
- if (net)
- auditd_conn.net = get_net(net);
- else
- auditd_conn.net = NULL;
- spin_unlock_irqrestore(&auditd_conn.lock, flags);
-}
+ if (!pid || !net)
+ return -EINVAL;
-/**
- * auditd_reset - Disconnect the auditd connection
- *
- * Description:
- * Break the auditd/kauditd connection and move all the queued records into the
- * hold queue in case auditd reconnects.
- */
-static void auditd_reset(void)
-{
- struct sk_buff *skb;
+ ac_new = kzalloc(sizeof(*ac_new), GFP_KERNEL);
+ if (!ac_new)
+ return -ENOMEM;
+ ac_new->pid = get_pid(pid);
+ ac_new->portid = portid;
+ ac_new->net = get_net(net);
- /* if it isn't already broken, break the connection */
- rcu_read_lock();
- if (auditd_conn.pid)
- auditd_set(0, 0, NULL);
- rcu_read_unlock();
+ spin_lock_irqsave(&auditd_conn_lock, flags);
+ ac_old = rcu_dereference_protected(auditd_conn,
+ lockdep_is_held(&auditd_conn_lock));
+ rcu_assign_pointer(auditd_conn, ac_new);
+ spin_unlock_irqrestore(&auditd_conn_lock, flags);
- /* flush all of the main and retry queues to the hold queue */
- while ((skb = skb_dequeue(&audit_retry_queue)))
- kauditd_hold_skb(skb);
- while ((skb = skb_dequeue(&audit_queue)))
- kauditd_hold_skb(skb);
+ if (ac_old)
+ call_rcu(&ac_old->rcu, auditd_conn_free);
+
+ return 0;
}
/**
@@ -505,9 +518,6 @@ static void kauditd_rehold_skb(struct sk_buff *skb)
{
/* put the record back in the queue at the same place */
skb_queue_head(&audit_hold_queue, skb);
-
- /* fail the auditd connection */
- auditd_reset();
}
/**
@@ -544,9 +554,6 @@ static void kauditd_hold_skb(struct sk_buff *skb)
/* we have no other options - drop the message */
audit_log_lost("kauditd hold queue overflow");
kfree_skb(skb);
-
- /* fail the auditd connection */
- auditd_reset();
}
/**
@@ -567,6 +574,36 @@ static void kauditd_retry_skb(struct sk_buff *skb)
}
/**
+ * auditd_reset - Disconnect the auditd connection
+ *
+ * Description:
+ * Break the auditd/kauditd connection and move all the queued records into the
+ * hold queue in case auditd reconnects.
+ */
+static void auditd_reset(void)
+{
+ unsigned long flags;
+ struct sk_buff *skb;
+ struct auditd_connection *ac_old;
+
+ /* if it isn't already broken, break the connection */
+ spin_lock_irqsave(&auditd_conn_lock, flags);
+ ac_old = rcu_dereference_protected(auditd_conn,
+ lockdep_is_held(&auditd_conn_lock));
+ rcu_assign_pointer(auditd_conn, NULL);
+ spin_unlock_irqrestore(&auditd_conn_lock, flags);
+
+ if (ac_old)
+ call_rcu(&ac_old->rcu, auditd_conn_free);
+
+ /* flush all of the main and retry queues to the hold queue */
+ while ((skb = skb_dequeue(&audit_retry_queue)))
+ kauditd_hold_skb(skb);
+ while ((skb = skb_dequeue(&audit_queue)))
+ kauditd_hold_skb(skb);
+}
+
+/**
* auditd_send_unicast_skb - Send a record via unicast to auditd
* @skb: audit record
*
@@ -583,6 +620,7 @@ static int auditd_send_unicast_skb(struct sk_buff *skb)
u32 portid;
struct net *net;
struct sock *sk;
+ struct auditd_connection *ac;
/* NOTE: we can't call netlink_unicast while in the RCU section so
* take a reference to the network namespace and grab local
@@ -592,15 +630,15 @@ static int auditd_send_unicast_skb(struct sk_buff *skb)
* section netlink_unicast() should safely return an error */
rcu_read_lock();
- if (!auditd_conn.pid) {
+ ac = rcu_dereference(auditd_conn);
+ if (!ac) {
rcu_read_unlock();
rc = -ECONNREFUSED;
goto err;
}
- net = auditd_conn.net;
- get_net(net);
+ net = get_net(ac->net);
sk = audit_get_sk(net);
- portid = auditd_conn.portid;
+ portid = ac->portid;
rcu_read_unlock();
rc = netlink_unicast(sk, skb, portid, 0);
@@ -735,6 +773,7 @@ static int kauditd_thread(void *dummy)
u32 portid = 0;
struct net *net = NULL;
struct sock *sk = NULL;
+ struct auditd_connection *ac;
#define UNICAST_RETRIES 5
@@ -742,14 +781,14 @@ static int kauditd_thread(void *dummy)
while (!kthread_should_stop()) {
/* NOTE: see the lock comments in auditd_send_unicast_skb() */
rcu_read_lock();
- if (!auditd_conn.pid) {
+ ac = rcu_dereference(auditd_conn);
+ if (!ac) {
rcu_read_unlock();
goto main_queue;
}
- net = auditd_conn.net;
- get_net(net);
+ net = get_net(ac->net);
sk = audit_get_sk(net);
- portid = auditd_conn.portid;
+ portid = ac->portid;
rcu_read_unlock();
/* attempt to flush the hold queue */
@@ -758,6 +797,7 @@ static int kauditd_thread(void *dummy)
NULL, kauditd_rehold_skb);
if (rc < 0) {
sk = NULL;
+ auditd_reset();
goto main_queue;
}
@@ -767,6 +807,7 @@ static int kauditd_thread(void *dummy)
NULL, kauditd_hold_skb);
if (rc < 0) {
sk = NULL;
+ auditd_reset();
goto main_queue;
}
@@ -775,16 +816,18 @@ main_queue:
* unicast, dump failed record sends to the retry queue; if
* sk == NULL due to previous failures we will just do the
* multicast send and move the record to the retry queue */
- kauditd_send_queue(sk, portid, &audit_queue, 1,
- kauditd_send_multicast_skb,
- kauditd_retry_skb);
+ rc = kauditd_send_queue(sk, portid, &audit_queue, 1,
+ kauditd_send_multicast_skb,
+ kauditd_retry_skb);
+ if (sk == NULL || rc < 0)
+ auditd_reset();
+ sk = NULL;
/* drop our netns reference, no auditd sends past this line */
if (net) {
put_net(net);
net = NULL;
}
- sk = NULL;
/* we have processed all the queues so wake everyone */
wake_up(&audit_backlog_wait);
@@ -819,7 +862,7 @@ int audit_send_list(void *_dest)
return 0;
}
-struct sk_buff *audit_make_reply(__u32 portid, int seq, int type, int done,
+struct sk_buff *audit_make_reply(int seq, int type, int done,
int multi, const void *payload, int size)
{
struct sk_buff *skb;
@@ -832,7 +875,7 @@ struct sk_buff *audit_make_reply(__u32 portid, int seq, int type, int done,
if (!skb)
return NULL;
- nlh = nlmsg_put(skb, portid, seq, t, size, flags);
+ nlh = nlmsg_put(skb, 0, seq, t, size, flags);
if (!nlh)
goto out_kfree_skb;
data = nlmsg_data(nlh);
@@ -876,7 +919,6 @@ static int audit_send_reply_thread(void *arg)
static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done,
int multi, const void *payload, int size)
{
- u32 portid = NETLINK_CB(request_skb).portid;
struct net *net = sock_net(NETLINK_CB(request_skb).sk);
struct sk_buff *skb;
struct task_struct *tsk;
@@ -886,12 +928,12 @@ static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int
if (!reply)
return;
- skb = audit_make_reply(portid, seq, type, done, multi, payload, size);
+ skb = audit_make_reply(seq, type, done, multi, payload, size);
if (!skb)
goto out;
reply->net = get_net(net);
- reply->portid = portid;
+ reply->portid = NETLINK_CB(request_skb).portid;
reply->skb = skb;
tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply");
@@ -1071,11 +1113,13 @@ static int audit_set_feature(struct sk_buff *skb)
return 0;
}
-static int audit_replace(pid_t pid)
+static int audit_replace(struct pid *pid)
{
+ pid_t pvnr;
struct sk_buff *skb;
- skb = audit_make_reply(0, 0, AUDIT_REPLACE, 0, 0, &pid, sizeof(pid));
+ pvnr = pid_vnr(pid);
+ skb = audit_make_reply(0, AUDIT_REPLACE, 0, 0, &pvnr, sizeof(pvnr));
if (!skb)
return -ENOMEM;
return auditd_send_unicast_skb(skb);
@@ -1105,9 +1149,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
memset(&s, 0, sizeof(s));
s.enabled = audit_enabled;
s.failure = audit_failure;
- rcu_read_lock();
- s.pid = auditd_conn.pid;
- rcu_read_unlock();
+ /* NOTE: use pid_vnr() so the PID is relative to the current
+ * namespace */
+ s.pid = auditd_pid_vnr();
s.rate_limit = audit_rate_limit;
s.backlog_limit = audit_backlog_limit;
s.lost = atomic_read(&audit_lost);
@@ -1133,51 +1177,61 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
return err;
}
if (s.mask & AUDIT_STATUS_PID) {
- /* NOTE: we are using task_tgid_vnr() below because
- * the s.pid value is relative to the namespace
- * of the caller; at present this doesn't matter
- * much since you can really only run auditd
- * from the initial pid namespace, but something
- * to keep in mind if this changes */
- int new_pid = s.pid;
+ /* NOTE: we are using the vnr PID functions below
+ * because the s.pid value is relative to the
+ * namespace of the caller; at present this
+ * doesn't matter much since you can really only
+ * run auditd from the initial pid namespace, but
+ * something to keep in mind if this changes */
+ pid_t new_pid = s.pid;
pid_t auditd_pid;
- pid_t requesting_pid = task_tgid_vnr(current);
+ struct pid *req_pid = task_tgid(current);
+
+ /* sanity check - PID values must match */
+ if (new_pid != pid_vnr(req_pid))
+ return -EINVAL;
/* test the auditd connection */
- audit_replace(requesting_pid);
+ audit_replace(req_pid);
- rcu_read_lock();
- auditd_pid = auditd_conn.pid;
+ auditd_pid = auditd_pid_vnr();
/* only the current auditd can unregister itself */
- if ((!new_pid) && (requesting_pid != auditd_pid)) {
- rcu_read_unlock();
+ if ((!new_pid) && (new_pid != auditd_pid)) {
audit_log_config_change("audit_pid", new_pid,
auditd_pid, 0);
return -EACCES;
}
/* replacing a healthy auditd is not allowed */
if (auditd_pid && new_pid) {
- rcu_read_unlock();
audit_log_config_change("audit_pid", new_pid,
auditd_pid, 0);
return -EEXIST;
}
- rcu_read_unlock();
-
- if (audit_enabled != AUDIT_OFF)
- audit_log_config_change("audit_pid", new_pid,
- auditd_pid, 1);
if (new_pid) {
/* register a new auditd connection */
- auditd_set(new_pid,
- NETLINK_CB(skb).portid,
- sock_net(NETLINK_CB(skb).sk));
+ err = auditd_set(req_pid,
+ NETLINK_CB(skb).portid,
+ sock_net(NETLINK_CB(skb).sk));
+ if (audit_enabled != AUDIT_OFF)
+ audit_log_config_change("audit_pid",
+ new_pid,
+ auditd_pid,
+ err ? 0 : 1);
+ if (err)
+ return err;
+
/* try to process any backlog */
wake_up_interruptible(&kauditd_wait);
- } else
+ } else {
+ if (audit_enabled != AUDIT_OFF)
+ audit_log_config_change("audit_pid",
+ new_pid,
+ auditd_pid, 1);
+
/* unregister the auditd connection */
auditd_reset();
+ }
}
if (s.mask & AUDIT_STATUS_RATE_LIMIT) {
err = audit_set_rate_limit(s.rate_limit);
@@ -1245,7 +1299,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
size--;
audit_log_n_untrustedstring(ab, data, size);
}
- audit_set_portid(ab, NETLINK_CB(skb).portid);
audit_log_end(ab);
}
break;
@@ -1259,8 +1312,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
audit_log_end(ab);
return -EPERM;
}
- err = audit_rule_change(msg_type, NETLINK_CB(skb).portid,
- seq, data, nlmsg_len(nlh));
+ err = audit_rule_change(msg_type, seq, data, nlmsg_len(nlh));
break;
case AUDIT_LIST_RULES:
err = audit_list_rules_send(skb, seq);
@@ -1381,11 +1433,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
return err < 0 ? err : 0;
}
-/*
- * Get message from skb. Each message is processed by audit_receive_msg.
- * Malformed skbs with wrong length are discarded silently.
+/**
+ * audit_receive - receive messages from a netlink control socket
+ * @skb: the message buffer
+ *
+ * Parse the provided skb and deal with any messages that may be present,
+ * malformed skbs are discarded.
*/
-static void audit_receive_skb(struct sk_buff *skb)
+static void audit_receive(struct sk_buff *skb)
{
struct nlmsghdr *nlh;
/*
@@ -1398,21 +1453,15 @@ static void audit_receive_skb(struct sk_buff *skb)
nlh = nlmsg_hdr(skb);
len = skb->len;
+ mutex_lock(&audit_cmd_mutex);
while (nlmsg_ok(nlh, len)) {
err = audit_receive_msg(skb, nlh);
/* if err or if this message says it wants a response */
if (err || (nlh->nlmsg_flags & NLM_F_ACK))
- netlink_ack(skb, nlh, err);
+ netlink_ack(skb, nlh, err, NULL);
nlh = nlmsg_next(nlh, &len);
}
-}
-
-/* Receive messages from netlink socket. */
-static void audit_receive(struct sk_buff *skb)
-{
- mutex_lock(&audit_cmd_mutex);
- audit_receive_skb(skb);
mutex_unlock(&audit_cmd_mutex);
}
@@ -1450,10 +1499,11 @@ static void __net_exit audit_net_exit(struct net *net)
{
struct audit_net *aunet = net_generic(net, audit_net_id);
- rcu_read_lock();
- if (net == auditd_conn.net)
- auditd_reset();
- rcu_read_unlock();
+ /* NOTE: you would think that we would want to check the auditd
+ * connection and potentially reset it here if it lives in this
+ * namespace, but since the auditd connection tracking struct holds a
+ * reference to this namespace (see auditd_set()) we are only ever
+ * going to get here after that connection has been released */
netlink_kernel_release(aunet->sk);
}
@@ -1473,8 +1523,9 @@ static int __init audit_init(void)
if (audit_initialized == AUDIT_DISABLED)
return 0;
- memset(&auditd_conn, 0, sizeof(auditd_conn));
- spin_lock_init(&auditd_conn.lock);
+ audit_buffer_cache = kmem_cache_create("audit_buffer",
+ sizeof(struct audit_buffer),
+ 0, SLAB_PANIC, NULL);
skb_queue_head_init(&audit_queue);
skb_queue_head_init(&audit_retry_queue);
@@ -1541,60 +1592,33 @@ __setup("audit_backlog_limit=", audit_backlog_limit_set);
static void audit_buffer_free(struct audit_buffer *ab)
{
- unsigned long flags;
-
if (!ab)
return;
kfree_skb(ab->skb);
- spin_lock_irqsave(&audit_freelist_lock, flags);
- if (audit_freelist_count > AUDIT_MAXFREE)
- kfree(ab);
- else {
- audit_freelist_count++;
- list_add(&ab->list, &audit_freelist);
- }
- spin_unlock_irqrestore(&audit_freelist_lock, flags);
+ kmem_cache_free(audit_buffer_cache, ab);
}
-static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
- gfp_t gfp_mask, int type)
+static struct audit_buffer *audit_buffer_alloc(struct audit_context *ctx,
+ gfp_t gfp_mask, int type)
{
- unsigned long flags;
- struct audit_buffer *ab = NULL;
- struct nlmsghdr *nlh;
-
- spin_lock_irqsave(&audit_freelist_lock, flags);
- if (!list_empty(&audit_freelist)) {
- ab = list_entry(audit_freelist.next,
- struct audit_buffer, list);
- list_del(&ab->list);
- --audit_freelist_count;
- }
- spin_unlock_irqrestore(&audit_freelist_lock, flags);
-
- if (!ab) {
- ab = kmalloc(sizeof(*ab), gfp_mask);
- if (!ab)
- goto err;
- }
+ struct audit_buffer *ab;
- ab->ctx = ctx;
- ab->gfp_mask = gfp_mask;
+ ab = kmem_cache_alloc(audit_buffer_cache, gfp_mask);
+ if (!ab)
+ return NULL;
ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
if (!ab->skb)
goto err;
+ if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0))
+ goto err;
- nlh = nlmsg_put(ab->skb, 0, 0, type, 0, 0);
- if (!nlh)
- goto out_kfree_skb;
+ ab->ctx = ctx;
+ ab->gfp_mask = gfp_mask;
return ab;
-out_kfree_skb:
- kfree_skb(ab->skb);
- ab->skb = NULL;
err:
audit_buffer_free(ab);
return NULL;
@@ -1625,10 +1649,10 @@ unsigned int audit_serial(void)
}
static inline void audit_get_stamp(struct audit_context *ctx,
- struct timespec *t, unsigned int *serial)
+ struct timespec64 *t, unsigned int *serial)
{
if (!ctx || !auditsc_get_stamp(ctx, t, serial)) {
- *t = CURRENT_TIME;
+ ktime_get_real_ts64(t);
*serial = audit_serial();
}
}
@@ -1652,7 +1676,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
int type)
{
struct audit_buffer *ab;
- struct timespec t;
+ struct timespec64 t;
unsigned int uninitialized_var(serial);
if (audit_initialized != AUDIT_INITIALIZED)
@@ -1705,8 +1729,8 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
}
audit_get_stamp(ab->ctx, &t, &serial);
- audit_log_format(ab, "audit(%lu.%03lu:%u): ",
- t.tv_sec, t.tv_nsec/1000000, serial);
+ audit_log_format(ab, "audit(%llu.%03lu:%u): ",
+ (unsigned long long)t.tv_sec, t.tv_nsec/1000000, serial);
return ab;
}
diff --git a/kernel/audit.h b/kernel/audit.h
index 0f1cf6d1878a..ddfce2ea4891 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -112,7 +112,7 @@ struct audit_context {
enum audit_state state, current_state;
unsigned int serial; /* serial number for record */
int major; /* syscall number */
- struct timespec ctime; /* time of syscall entry */
+ struct timespec64 ctime; /* time of syscall entry */
unsigned long argv[4]; /* syscall arguments */
long return_code;/* syscall return code */
u64 prio;
@@ -218,7 +218,7 @@ extern void audit_log_name(struct audit_context *context,
struct audit_names *n, const struct path *path,
int record_num, int *call_panic);
-extern int auditd_test_task(const struct task_struct *task);
+extern int auditd_test_task(struct task_struct *task);
#define AUDIT_INODE_BUCKETS 32
extern struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
@@ -237,8 +237,7 @@ extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right);
extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right);
extern int parent_len(const char *path);
extern int audit_compare_dname_path(const char *dname, const char *path, int plen);
-extern struct sk_buff *audit_make_reply(__u32 portid, int seq, int type,
- int done, int multi,
+extern struct sk_buff *audit_make_reply(int seq, int type, int done, int multi,
const void *payload, int size);
extern void audit_panic(const char *message);
@@ -333,13 +332,7 @@ extern u32 audit_sig_sid;
extern int audit_filter(int msgtype, unsigned int listtype);
#ifdef CONFIG_AUDITSYSCALL
-extern int __audit_signal_info(int sig, struct task_struct *t);
-static inline int audit_signal_info(int sig, struct task_struct *t)
-{
- if (auditd_test_task(t) || (audit_signals && !audit_dummy_context()))
- return __audit_signal_info(sig, t);
- return 0;
-}
+extern int audit_signal_info(int sig, struct task_struct *t);
extern void audit_filter_inodes(struct task_struct *, struct audit_context *);
extern struct list_head *audit_killed_trees(void);
#else
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index 7ea57e516029..52f368b6561e 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -103,15 +103,15 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa
goto out;
}
- fsnotify_init_mark(&audit_mark->mark, audit_fsnotify_free_mark);
+ fsnotify_init_mark(&audit_mark->mark, audit_fsnotify_group);
audit_mark->mark.mask = AUDIT_FS_EVENTS;
audit_mark->path = pathname;
audit_update_mark(audit_mark, dentry->d_inode);
audit_mark->rule = krule;
- ret = fsnotify_add_mark(&audit_mark->mark, audit_fsnotify_group, inode, NULL, true);
+ ret = fsnotify_add_mark(&audit_mark->mark, inode, NULL, true);
if (ret < 0) {
- audit_fsnotify_mark_free(audit_mark);
+ fsnotify_put_mark(&audit_mark->mark);
audit_mark = ERR_PTR(ret);
}
out:
@@ -168,7 +168,8 @@ static int audit_mark_handle_event(struct fsnotify_group *group,
struct fsnotify_mark *inode_mark,
struct fsnotify_mark *vfsmount_mark,
u32 mask, const void *data, int data_type,
- const unsigned char *dname, u32 cookie)
+ const unsigned char *dname, u32 cookie,
+ struct fsnotify_iter_info *iter_info)
{
struct audit_fsnotify_mark *audit_mark;
const struct inode *inode = NULL;
@@ -187,7 +188,7 @@ static int audit_mark_handle_event(struct fsnotify_group *group,
default:
BUG();
return 0;
- };
+ }
if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) {
if (audit_compare_dname_path(dname, audit_mark->path, AUDIT_NAME_FULL))
@@ -201,6 +202,7 @@ static int audit_mark_handle_event(struct fsnotify_group *group,
static const struct fsnotify_ops audit_mark_fsnotify_ops = {
.handle_event = audit_mark_handle_event,
+ .free_mark = audit_fsnotify_free_mark,
};
static int __init audit_fsnotify_init(void)
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 7b44195da81b..011d46e5f73f 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -3,13 +3,14 @@
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/kthread.h>
+#include <linux/refcount.h>
#include <linux/slab.h>
struct audit_tree;
struct audit_chunk;
struct audit_tree {
- atomic_t count;
+ refcount_t count;
int goner;
struct audit_chunk *root;
struct list_head chunks;
@@ -77,7 +78,7 @@ static struct audit_tree *alloc_tree(const char *s)
tree = kmalloc(sizeof(struct audit_tree) + strlen(s) + 1, GFP_KERNEL);
if (tree) {
- atomic_set(&tree->count, 1);
+ refcount_set(&tree->count, 1);
tree->goner = 0;
INIT_LIST_HEAD(&tree->chunks);
INIT_LIST_HEAD(&tree->rules);
@@ -91,12 +92,12 @@ static struct audit_tree *alloc_tree(const char *s)
static inline void get_tree(struct audit_tree *tree)
{
- atomic_inc(&tree->count);
+ refcount_inc(&tree->count);
}
static inline void put_tree(struct audit_tree *tree)
{
- if (atomic_dec_and_test(&tree->count))
+ if (refcount_dec_and_test(&tree->count))
kfree_rcu(tree, head);
}
@@ -154,7 +155,7 @@ static struct audit_chunk *alloc_chunk(int count)
INIT_LIST_HEAD(&chunk->owners[i].list);
chunk->owners[i].index = i;
}
- fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch);
+ fsnotify_init_mark(&chunk->mark, audit_tree_group);
chunk->mark.mask = FS_IN_IGNORED;
return chunk;
}
@@ -163,33 +164,54 @@ enum {HASH_SIZE = 128};
static struct list_head chunk_hash_heads[HASH_SIZE];
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock);
-static inline struct list_head *chunk_hash(const struct inode *inode)
+/* Function to return search key in our hash from inode. */
+static unsigned long inode_to_key(const struct inode *inode)
{
- unsigned long n = (unsigned long)inode / L1_CACHE_BYTES;
+ return (unsigned long)inode;
+}
+
+/*
+ * Function to return search key in our hash from chunk. Key 0 is special and
+ * should never be present in the hash.
+ */
+static unsigned long chunk_to_key(struct audit_chunk *chunk)
+{
+ /*
+ * We have a reference to the mark so it should be attached to a
+ * connector.
+ */
+ if (WARN_ON_ONCE(!chunk->mark.connector))
+ return 0;
+ return (unsigned long)chunk->mark.connector->inode;
+}
+
+static inline struct list_head *chunk_hash(unsigned long key)
+{
+ unsigned long n = key / L1_CACHE_BYTES;
return chunk_hash_heads + n % HASH_SIZE;
}
/* hash_lock & entry->lock is held by caller */
static void insert_hash(struct audit_chunk *chunk)
{
- struct fsnotify_mark *entry = &chunk->mark;
+ unsigned long key = chunk_to_key(chunk);
struct list_head *list;
- if (!entry->inode)
+ if (!(chunk->mark.flags & FSNOTIFY_MARK_FLAG_ATTACHED))
return;
- list = chunk_hash(entry->inode);
+ list = chunk_hash(key);
list_add_rcu(&chunk->hash, list);
}
/* called under rcu_read_lock */
struct audit_chunk *audit_tree_lookup(const struct inode *inode)
{
- struct list_head *list = chunk_hash(inode);
+ unsigned long key = inode_to_key(inode);
+ struct list_head *list = chunk_hash(key);
struct audit_chunk *p;
list_for_each_entry_rcu(p, list, hash) {
- /* mark.inode may have gone NULL, but who cares? */
- if (p->mark.inode == inode) {
+ if (chunk_to_key(p) == key) {
atomic_long_inc(&p->refs);
return p;
}
@@ -233,11 +255,15 @@ static void untag_chunk(struct node *p)
mutex_lock(&entry->group->mark_mutex);
spin_lock(&entry->lock);
- if (chunk->dead || !entry->inode) {
+ /*
+ * mark_mutex protects mark from getting detached and thus also from
+ * mark->connector->inode getting NULL.
+ */
+ if (chunk->dead || !(entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
spin_unlock(&entry->lock);
mutex_unlock(&entry->group->mark_mutex);
if (new)
- free_chunk(new);
+ fsnotify_put_mark(&new->mark);
goto out;
}
@@ -261,7 +287,7 @@ static void untag_chunk(struct node *p)
if (!new)
goto Fallback;
- if (fsnotify_add_mark_locked(&new->mark, entry->group, entry->inode,
+ if (fsnotify_add_mark_locked(&new->mark, entry->connector->inode,
NULL, 1)) {
fsnotify_put_mark(&new->mark);
goto Fallback;
@@ -327,7 +353,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
return -ENOMEM;
entry = &chunk->mark;
- if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) {
+ if (fsnotify_add_mark(entry, inode, NULL, 0)) {
fsnotify_put_mark(entry);
return -ENOSPC;
}
@@ -366,7 +392,8 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
struct node *p;
int n;
- old_entry = fsnotify_find_inode_mark(audit_tree_group, inode);
+ old_entry = fsnotify_find_mark(&inode->i_fsnotify_marks,
+ audit_tree_group);
if (!old_entry)
return create_chunk(inode, tree);
@@ -393,17 +420,21 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
mutex_lock(&old_entry->group->mark_mutex);
spin_lock(&old_entry->lock);
- if (!old_entry->inode) {
+ /*
+ * mark_mutex protects mark from getting detached and thus also from
+ * mark->connector->inode getting NULL.
+ */
+ if (!(old_entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
/* old_entry is being shot, lets just lie */
spin_unlock(&old_entry->lock);
mutex_unlock(&old_entry->group->mark_mutex);
fsnotify_put_mark(old_entry);
- free_chunk(chunk);
+ fsnotify_put_mark(&chunk->mark);
return -ENOENT;
}
- if (fsnotify_add_mark_locked(chunk_entry, old_entry->group,
- old_entry->inode, NULL, 1)) {
+ if (fsnotify_add_mark_locked(chunk_entry,
+ old_entry->connector->inode, NULL, 1)) {
spin_unlock(&old_entry->lock);
mutex_unlock(&old_entry->group->mark_mutex);
fsnotify_put_mark(chunk_entry);
@@ -588,7 +619,8 @@ int audit_remove_tree_rule(struct audit_krule *rule)
static int compare_root(struct vfsmount *mnt, void *arg)
{
- return d_backing_inode(mnt->mnt_root) == arg;
+ return inode_to_key(d_backing_inode(mnt->mnt_root)) ==
+ (unsigned long)arg;
}
void audit_trim_trees(void)
@@ -623,9 +655,10 @@ void audit_trim_trees(void)
list_for_each_entry(node, &tree->chunks, list) {
struct audit_chunk *chunk = find_chunk(node);
/* this could be NULL if the watch is dying else where... */
- struct inode *inode = chunk->mark.inode;
node->index |= 1U<<31;
- if (iterate_mounts(compare_root, inode, root_mnt))
+ if (iterate_mounts(compare_root,
+ (void *)chunk_to_key(chunk),
+ root_mnt))
node->index &= ~(1U<<31);
}
spin_unlock(&hash_lock);
@@ -958,7 +991,8 @@ static int audit_tree_handle_event(struct fsnotify_group *group,
struct fsnotify_mark *inode_mark,
struct fsnotify_mark *vfsmount_mark,
u32 mask, const void *data, int data_type,
- const unsigned char *file_name, u32 cookie)
+ const unsigned char *file_name, u32 cookie,
+ struct fsnotify_iter_info *iter_info)
{
return 0;
}
@@ -979,6 +1013,7 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify
static const struct fsnotify_ops audit_tree_ops = {
.handle_event = audit_tree_handle_event,
.freeing_mark = audit_tree_freeing_mark,
+ .free_mark = audit_tree_destroy_watch,
};
static int __init audit_tree_init(void)
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index f79e4658433d..62d686d96581 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -28,6 +28,7 @@
#include <linux/fsnotify_backend.h>
#include <linux/namei.h>
#include <linux/netlink.h>
+#include <linux/refcount.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/security.h>
@@ -46,7 +47,7 @@
*/
struct audit_watch {
- atomic_t count; /* reference count */
+ refcount_t count; /* reference count */
dev_t dev; /* associated superblock device */
char *path; /* insertion path */
unsigned long ino; /* associated inode number */
@@ -102,7 +103,7 @@ static inline struct audit_parent *audit_find_parent(struct inode *inode)
struct audit_parent *parent = NULL;
struct fsnotify_mark *entry;
- entry = fsnotify_find_inode_mark(audit_watch_group, inode);
+ entry = fsnotify_find_mark(&inode->i_fsnotify_marks, audit_watch_group);
if (entry)
parent = container_of(entry, struct audit_parent, mark);
@@ -111,12 +112,12 @@ static inline struct audit_parent *audit_find_parent(struct inode *inode)
void audit_get_watch(struct audit_watch *watch)
{
- atomic_inc(&watch->count);
+ refcount_inc(&watch->count);
}
void audit_put_watch(struct audit_watch *watch)
{
- if (atomic_dec_and_test(&watch->count)) {
+ if (refcount_dec_and_test(&watch->count)) {
WARN_ON(watch->parent);
WARN_ON(!list_empty(&watch->rules));
kfree(watch->path);
@@ -157,9 +158,9 @@ static struct audit_parent *audit_init_parent(struct path *path)
INIT_LIST_HEAD(&parent->watches);
- fsnotify_init_mark(&parent->mark, audit_watch_free_mark);
+ fsnotify_init_mark(&parent->mark, audit_watch_group);
parent->mark.mask = AUDIT_FS_WATCH;
- ret = fsnotify_add_mark(&parent->mark, audit_watch_group, inode, NULL, 0);
+ ret = fsnotify_add_mark(&parent->mark, inode, NULL, 0);
if (ret < 0) {
audit_free_parent(parent);
return ERR_PTR(ret);
@@ -178,7 +179,7 @@ static struct audit_watch *audit_init_watch(char *path)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&watch->rules);
- atomic_set(&watch->count, 1);
+ refcount_set(&watch->count, 1);
watch->path = path;
watch->dev = AUDIT_DEV_UNSET;
watch->ino = AUDIT_INO_UNSET;
@@ -472,7 +473,8 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
struct fsnotify_mark *inode_mark,
struct fsnotify_mark *vfsmount_mark,
u32 mask, const void *data, int data_type,
- const unsigned char *dname, u32 cookie)
+ const unsigned char *dname, u32 cookie,
+ struct fsnotify_iter_info *iter_info)
{
const struct inode *inode;
struct audit_parent *parent;
@@ -492,7 +494,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
BUG();
inode = NULL;
break;
- };
+ }
if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0);
@@ -506,6 +508,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
static const struct fsnotify_ops audit_watch_fsnotify_ops = {
.handle_event = audit_watch_handle_event,
+ .free_mark = audit_watch_free_mark,
};
static int __init audit_watch_init(void)
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 880519d6cf2a..0b0aa5854dac 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -338,7 +338,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
entry->rule.listnr != AUDIT_FILTER_USER)
return -EINVAL;
break;
- };
+ }
switch(f->type) {
default:
@@ -412,7 +412,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
if (entry->rule.listnr != AUDIT_FILTER_EXIT)
return -EINVAL;
break;
- };
+ }
return 0;
}
@@ -1033,7 +1033,7 @@ out:
}
/* List rules using struct audit_rule_data. */
-static void audit_list_rules(__u32 portid, int seq, struct sk_buff_head *q)
+static void audit_list_rules(int seq, struct sk_buff_head *q)
{
struct sk_buff *skb;
struct audit_krule *r;
@@ -1048,15 +1048,15 @@ static void audit_list_rules(__u32 portid, int seq, struct sk_buff_head *q)
data = audit_krule_to_data(r);
if (unlikely(!data))
break;
- skb = audit_make_reply(portid, seq, AUDIT_LIST_RULES,
- 0, 1, data,
+ skb = audit_make_reply(seq, AUDIT_LIST_RULES, 0, 1,
+ data,
sizeof(*data) + data->buflen);
if (skb)
skb_queue_tail(q, skb);
kfree(data);
}
}
- skb = audit_make_reply(portid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
+ skb = audit_make_reply(seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
if (skb)
skb_queue_tail(q, skb);
}
@@ -1085,13 +1085,11 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re
/**
* audit_rule_change - apply all rules to the specified message type
* @type: audit message type
- * @portid: target port id for netlink audit messages
* @seq: netlink audit message sequence (serial) number
* @data: payload data
* @datasz: size of payload data
*/
-int audit_rule_change(int type, __u32 portid, int seq, void *data,
- size_t datasz)
+int audit_rule_change(int type, int seq, void *data, size_t datasz)
{
int err = 0;
struct audit_entry *entry;
@@ -1150,7 +1148,7 @@ int audit_list_rules_send(struct sk_buff *request_skb, int seq)
skb_queue_head_init(&dest->q);
mutex_lock(&audit_filter_mutex);
- audit_list_rules(portid, seq, &dest->q);
+ audit_list_rules(seq, &dest->q);
mutex_unlock(&audit_filter_mutex);
tsk = kthread_run(audit_send_list, dest, "audit_send_list");
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e59ffc7fc522..bb724baa7ac9 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -73,6 +73,7 @@
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/uaccess.h>
+#include <linux/fsnotify_backend.h>
#include <uapi/linux/limits.h>
#include "audit.h"
@@ -1532,7 +1533,7 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
return;
context->serial = 0;
- context->ctime = CURRENT_TIME;
+ ktime_get_real_ts64(&context->ctime);
context->in_syscall = 1;
context->current_state = state;
context->ppid = 0;
@@ -1596,7 +1597,7 @@ static inline void handle_one(const struct inode *inode)
struct audit_tree_refs *p;
struct audit_chunk *chunk;
int count;
- if (likely(hlist_empty(&inode->i_fsnotify_marks)))
+ if (likely(!inode->i_fsnotify_marks))
return;
context = current->audit_context;
p = context->trees;
@@ -1639,7 +1640,7 @@ retry:
seq = read_seqbegin(&rename_lock);
for(;;) {
struct inode *inode = d_backing_inode(d);
- if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) {
+ if (inode && unlikely(inode->i_fsnotify_marks)) {
struct audit_chunk *chunk;
chunk = audit_tree_lookup(inode);
if (chunk) {
@@ -1941,13 +1942,13 @@ EXPORT_SYMBOL_GPL(__audit_inode_child);
/**
* auditsc_get_stamp - get local copies of audit_context values
* @ctx: audit_context for the task
- * @t: timespec to store time recorded in the audit_context
+ * @t: timespec64 to store time recorded in the audit_context
* @serial: serial value that is recorded in the audit_context
*
* Also sets the context as auditable.
*/
int auditsc_get_stamp(struct audit_context *ctx,
- struct timespec *t, unsigned int *serial)
+ struct timespec64 *t, unsigned int *serial)
{
if (!ctx->in_syscall)
return 0;
@@ -2249,26 +2250,27 @@ void __audit_ptrace(struct task_struct *t)
* If the audit subsystem is being terminated, record the task (pid)
* and uid that is doing that.
*/
-int __audit_signal_info(int sig, struct task_struct *t)
+int audit_signal_info(int sig, struct task_struct *t)
{
struct audit_aux_data_pids *axp;
struct task_struct *tsk = current;
struct audit_context *ctx = tsk->audit_context;
kuid_t uid = current_uid(), t_uid = task_uid(t);
- if (auditd_test_task(t)) {
- if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
- audit_sig_pid = task_tgid_nr(tsk);
- if (uid_valid(tsk->loginuid))
- audit_sig_uid = tsk->loginuid;
- else
- audit_sig_uid = uid;
- security_task_getsecid(tsk, &audit_sig_sid);
- }
- if (!audit_signals || audit_dummy_context())
- return 0;
+ if (auditd_test_task(t) &&
+ (sig == SIGTERM || sig == SIGHUP ||
+ sig == SIGUSR1 || sig == SIGUSR2)) {
+ audit_sig_pid = task_tgid_nr(tsk);
+ if (uid_valid(tsk->loginuid))
+ audit_sig_uid = tsk->loginuid;
+ else
+ audit_sig_uid = uid;
+ security_task_getsecid(tsk, &audit_sig_sid);
}
+ if (!audit_signals || audit_dummy_context())
+ return 0;
+
/* optimize the common case by putting first signal recipient directly
* in audit_context */
if (!ctx->target_pid) {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index e1ce4f4fd7fd..e1e5e658f2db 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,7 +1,7 @@
obj-y := core.o
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
-obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o
+obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 6b6f41f0b211..172dc8ee0e3b 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -1,4 +1,5 @@
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016,2017 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -16,6 +17,8 @@
#include <linux/filter.h>
#include <linux/perf_event.h>
+#include "map_in_map.h"
+
static void bpf_array_free_percpu(struct bpf_array *array)
{
int i;
@@ -83,6 +86,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
array->map.key_size = attr->key_size;
array->map.value_size = attr->value_size;
array->map.max_entries = attr->max_entries;
+ array->map.map_flags = attr->map_flags;
array->elem_size = elem_size;
if (!percpu)
@@ -113,6 +117,30 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
return array->value + array->elem_size * index;
}
+/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
+static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
+{
+ struct bpf_insn *insn = insn_buf;
+ u32 elem_size = round_up(map->value_size, 8);
+ const int ret = BPF_REG_0;
+ const int map_ptr = BPF_REG_1;
+ const int index = BPF_REG_2;
+
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
+ *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
+ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);
+
+ if (is_power_of_2(elem_size)) {
+ *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
+ } else {
+ *insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size);
+ }
+ *insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr);
+ *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *insn++ = BPF_MOV64_IMM(ret, 0);
+ return insn - insn_buf;
+}
+
/* Called from eBPF program */
static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
{
@@ -155,7 +183,7 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
- u32 index = *(u32 *)key;
+ u32 index = key ? *(u32 *)key : U32_MAX;
u32 *next = (u32 *)next_key;
if (index >= array->map.max_entries) {
@@ -260,21 +288,17 @@ static void array_map_free(struct bpf_map *map)
bpf_map_area_free(array);
}
-static const struct bpf_map_ops array_ops = {
+const struct bpf_map_ops array_map_ops = {
.map_alloc = array_map_alloc,
.map_free = array_map_free,
.map_get_next_key = array_map_get_next_key,
.map_lookup_elem = array_map_lookup_elem,
.map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem,
+ .map_gen_lookup = array_map_gen_lookup,
};
-static struct bpf_map_type_list array_type __ro_after_init = {
- .ops = &array_ops,
- .type = BPF_MAP_TYPE_ARRAY,
-};
-
-static const struct bpf_map_ops percpu_array_ops = {
+const struct bpf_map_ops percpu_array_map_ops = {
.map_alloc = array_map_alloc,
.map_free = array_map_free,
.map_get_next_key = array_map_get_next_key,
@@ -283,19 +307,6 @@ static const struct bpf_map_ops percpu_array_ops = {
.map_delete_elem = array_map_delete_elem,
};
-static struct bpf_map_type_list percpu_array_type __ro_after_init = {
- .ops = &percpu_array_ops,
- .type = BPF_MAP_TYPE_PERCPU_ARRAY,
-};
-
-static int __init register_array_map(void)
-{
- bpf_register_map_type(&array_type);
- bpf_register_map_type(&percpu_array_type);
- return 0;
-}
-late_initcall(register_array_map);
-
static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr)
{
/* only file descriptors can be stored in this type of map */
@@ -399,7 +410,7 @@ void bpf_fd_array_map_clear(struct bpf_map *map)
fd_array_map_delete_elem(map, &i);
}
-static const struct bpf_map_ops prog_array_ops = {
+const struct bpf_map_ops prog_array_map_ops = {
.map_alloc = fd_array_map_alloc,
.map_free = fd_array_map_free,
.map_get_next_key = array_map_get_next_key,
@@ -409,18 +420,6 @@ static const struct bpf_map_ops prog_array_ops = {
.map_fd_put_ptr = prog_fd_array_put_ptr,
};
-static struct bpf_map_type_list prog_array_type __ro_after_init = {
- .ops = &prog_array_ops,
- .type = BPF_MAP_TYPE_PROG_ARRAY,
-};
-
-static int __init register_prog_array_map(void)
-{
- bpf_register_map_type(&prog_array_type);
- return 0;
-}
-late_initcall(register_prog_array_map);
-
static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
struct file *map_file)
{
@@ -511,7 +510,7 @@ static void perf_event_fd_array_release(struct bpf_map *map,
rcu_read_unlock();
}
-static const struct bpf_map_ops perf_event_array_ops = {
+const struct bpf_map_ops perf_event_array_map_ops = {
.map_alloc = fd_array_map_alloc,
.map_free = fd_array_map_free,
.map_get_next_key = array_map_get_next_key,
@@ -522,18 +521,6 @@ static const struct bpf_map_ops perf_event_array_ops = {
.map_release = perf_event_fd_array_release,
};
-static struct bpf_map_type_list perf_event_array_type __ro_after_init = {
- .ops = &perf_event_array_ops,
- .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
-};
-
-static int __init register_perf_event_array_map(void)
-{
- bpf_register_map_type(&perf_event_array_type);
- return 0;
-}
-late_initcall(register_perf_event_array_map);
-
#ifdef CONFIG_CGROUPS
static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
struct file *map_file /* not used */,
@@ -554,7 +541,7 @@ static void cgroup_fd_array_free(struct bpf_map *map)
fd_array_map_free(map);
}
-static const struct bpf_map_ops cgroup_array_ops = {
+const struct bpf_map_ops cgroup_array_map_ops = {
.map_alloc = fd_array_map_alloc,
.map_free = cgroup_fd_array_free,
.map_get_next_key = array_map_get_next_key,
@@ -563,16 +550,53 @@ static const struct bpf_map_ops cgroup_array_ops = {
.map_fd_get_ptr = cgroup_fd_array_get_ptr,
.map_fd_put_ptr = cgroup_fd_array_put_ptr,
};
+#endif
-static struct bpf_map_type_list cgroup_array_type __ro_after_init = {
- .ops = &cgroup_array_ops,
- .type = BPF_MAP_TYPE_CGROUP_ARRAY,
-};
+static struct bpf_map *array_of_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_map *map, *inner_map_meta;
+
+ inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
+ if (IS_ERR(inner_map_meta))
+ return inner_map_meta;
-static int __init register_cgroup_array_map(void)
+ map = fd_array_map_alloc(attr);
+ if (IS_ERR(map)) {
+ bpf_map_meta_free(inner_map_meta);
+ return map;
+ }
+
+ map->inner_map_meta = inner_map_meta;
+
+ return map;
+}
+
+static void array_of_map_free(struct bpf_map *map)
{
- bpf_register_map_type(&cgroup_array_type);
- return 0;
+ /* map->inner_map_meta is only accessed by syscall which
+ * is protected by fdget/fdput.
+ */
+ bpf_map_meta_free(map->inner_map_meta);
+ bpf_fd_array_map_clear(map);
+ fd_array_map_free(map);
}
-late_initcall(register_cgroup_array_map);
-#endif
+
+static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_map **inner_map = array_map_lookup_elem(map, key);
+
+ if (!inner_map)
+ return NULL;
+
+ return READ_ONCE(*inner_map);
+}
+
+const struct bpf_map_ops array_of_maps_map_ops = {
+ .map_alloc = array_of_map_alloc,
+ .map_free = array_of_map_free,
+ .map_get_next_key = array_map_get_next_key,
+ .map_lookup_elem = array_of_map_lookup_elem,
+ .map_delete_elem = fd_array_map_delete_elem,
+ .map_fd_get_ptr = bpf_map_fd_get_ptr,
+ .map_fd_put_ptr = bpf_map_fd_put_ptr,
+};
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index f62d1d56f41d..e6ef4401a138 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -13,7 +13,7 @@
#define LOCAL_FREE_TARGET (128)
#define LOCAL_NR_SCANS LOCAL_FREE_TARGET
-#define PERCPU_FREE_TARGET (16)
+#define PERCPU_FREE_TARGET (4)
#define PERCPU_NR_SCANS PERCPU_FREE_TARGET
/* Helpers to get the local list index */
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index da0f53690295..ea6033cba947 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -154,7 +154,7 @@ int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent,
/**
* __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
- * @sk: The socken sending or receiving traffic
+ * @sk: The socket sending or receiving traffic
* @skb: The skb that is being sent or received
* @type: The type of program to be exectuted
*
@@ -189,10 +189,13 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
prog = rcu_dereference(cgrp->bpf.effective[type]);
if (prog) {
unsigned int offset = skb->data - skb_network_header(skb);
+ struct sock *save_sk = skb->sk;
+ skb->sk = sk;
__skb_push(skb, offset);
ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM;
__skb_pull(skb, offset);
+ skb->sk = save_sk;
}
rcu_read_unlock();
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index f45827e205d3..dedf367f59bb 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -76,8 +76,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
{
- gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
- gfp_extra_flags;
+ gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
struct bpf_prog_aux *aux;
struct bpf_prog *fp;
@@ -107,8 +106,7 @@ EXPORT_SYMBOL_GPL(bpf_prog_alloc);
struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
gfp_t gfp_extra_flags)
{
- gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
- gfp_extra_flags;
+ gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
struct bpf_prog *fp;
u32 pages, delta;
int ret;
@@ -394,27 +392,23 @@ static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
void bpf_prog_kallsyms_add(struct bpf_prog *fp)
{
- unsigned long flags;
-
if (!bpf_prog_kallsyms_candidate(fp) ||
!capable(CAP_SYS_ADMIN))
return;
- spin_lock_irqsave(&bpf_lock, flags);
+ spin_lock_bh(&bpf_lock);
bpf_prog_ksym_node_add(fp->aux);
- spin_unlock_irqrestore(&bpf_lock, flags);
+ spin_unlock_bh(&bpf_lock);
}
void bpf_prog_kallsyms_del(struct bpf_prog *fp)
{
- unsigned long flags;
-
if (!bpf_prog_kallsyms_candidate(fp))
return;
- spin_lock_irqsave(&bpf_lock, flags);
+ spin_lock_bh(&bpf_lock);
bpf_prog_ksym_node_del(fp->aux);
- spin_unlock_irqrestore(&bpf_lock, flags);
+ spin_unlock_bh(&bpf_lock);
}
static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr)
@@ -659,8 +653,7 @@ out:
static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other,
gfp_t gfp_extra_flags)
{
- gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
- gfp_extra_flags;
+ gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
struct bpf_prog *fp;
fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL);
@@ -1162,12 +1155,12 @@ out:
LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */
off = IMM;
load_word:
- /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are
- * only appearing in the programs where ctx ==
- * skb. All programs keep 'ctx' in regs[BPF_REG_CTX]
- * == BPF_R6, bpf_convert_filter() saves it in BPF_R6,
- * internal BPF verifier will check that BPF_R6 ==
- * ctx.
+ /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are only
+ * appearing in the programs where ctx == skb
+ * (see may_access_skb() in the verifier). All programs
+ * keep 'ctx' in regs[BPF_REG_CTX] == BPF_R6,
+ * bpf_convert_filter() saves it in BPF_R6, internal BPF
+ * verifier will check that BPF_R6 == ctx.
*
* BPF_ABS and BPF_IND are wrappers of function calls,
* so they scratch BPF_R1-BPF_R5 registers, preserve
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 361a69dfe543..004334ea13ba 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -16,6 +16,7 @@
#include <linux/rculist_nulls.h>
#include "percpu_freelist.h"
#include "bpf_lru_list.h"
+#include "map_in_map.h"
struct bucket {
struct hlist_nulls_head head;
@@ -86,6 +87,11 @@ static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size
return *(void __percpu **)(l->key + key_size);
}
+static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l)
+{
+ return *(void **)(l->key + roundup(map->key_size, 8));
+}
+
static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
{
return (struct htab_elem *) (htab->elems + i * htab->elem_size);
@@ -426,7 +432,11 @@ again:
return NULL;
}
-/* Called from syscall or from eBPF program */
+/* Called from syscall or from eBPF program directly, so
+ * arguments have to match bpf_map_lookup_elem() exactly.
+ * The return value is adjusted by BPF instructions
+ * in htab_map_gen_lookup().
+ */
static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
@@ -458,6 +468,30 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
return NULL;
}
+/* inline bpf_map_lookup_elem() call.
+ * Instead of:
+ * bpf_prog
+ * bpf_map_lookup_elem
+ * map->ops->map_lookup_elem
+ * htab_map_lookup_elem
+ * __htab_map_lookup_elem
+ * do:
+ * bpf_prog
+ * __htab_map_lookup_elem
+ */
+static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
+{
+ struct bpf_insn *insn = insn_buf;
+ const int ret = BPF_REG_0;
+
+ *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem);
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
+ offsetof(struct htab_elem, key) +
+ round_up(map->key_size, 8));
+ return insn - insn_buf;
+}
+
static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
{
struct htab_elem *l = __htab_map_lookup_elem(map, key);
@@ -506,12 +540,15 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
struct hlist_nulls_head *head;
struct htab_elem *l, *next_l;
u32 hash, key_size;
- int i;
+ int i = 0;
WARN_ON_ONCE(!rcu_read_lock_held());
key_size = map->key_size;
+ if (!key)
+ goto find_first_elem;
+
hash = htab_map_hash(key, key_size);
head = select_bucket(htab, hash);
@@ -519,10 +556,8 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
/* lookup the key */
l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
- if (!l) {
- i = 0;
+ if (!l)
goto find_first_elem;
- }
/* key was found, get next key in the same bucket */
next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)),
@@ -582,6 +617,14 @@ static void htab_elem_free_rcu(struct rcu_head *head)
static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
{
+ struct bpf_map *map = &htab->map;
+
+ if (map->ops->map_fd_put_ptr) {
+ void *ptr = fd_htab_map_get_ptr(map, l);
+
+ map->ops->map_fd_put_ptr(ptr);
+ }
+
if (htab_is_prealloc(htab)) {
pcpu_freelist_push(&htab->freelist, &l->fnode);
} else {
@@ -1027,6 +1070,7 @@ static void delete_all_elements(struct bpf_htab *htab)
}
}
}
+
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
{
@@ -1053,21 +1097,17 @@ static void htab_map_free(struct bpf_map *map)
kfree(htab);
}
-static const struct bpf_map_ops htab_ops = {
+const struct bpf_map_ops htab_map_ops = {
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
.map_lookup_elem = htab_map_lookup_elem,
.map_update_elem = htab_map_update_elem,
.map_delete_elem = htab_map_delete_elem,
+ .map_gen_lookup = htab_map_gen_lookup,
};
-static struct bpf_map_type_list htab_type __ro_after_init = {
- .ops = &htab_ops,
- .type = BPF_MAP_TYPE_HASH,
-};
-
-static const struct bpf_map_ops htab_lru_ops = {
+const struct bpf_map_ops htab_lru_map_ops = {
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
@@ -1076,11 +1116,6 @@ static const struct bpf_map_ops htab_lru_ops = {
.map_delete_elem = htab_lru_map_delete_elem,
};
-static struct bpf_map_type_list htab_lru_type __ro_after_init = {
- .ops = &htab_lru_ops,
- .type = BPF_MAP_TYPE_LRU_HASH,
-};
-
/* Called from eBPF program */
static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
@@ -1154,7 +1189,7 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
return ret;
}
-static const struct bpf_map_ops htab_percpu_ops = {
+const struct bpf_map_ops htab_percpu_map_ops = {
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
@@ -1163,12 +1198,7 @@ static const struct bpf_map_ops htab_percpu_ops = {
.map_delete_elem = htab_map_delete_elem,
};
-static struct bpf_map_type_list htab_percpu_type __ro_after_init = {
- .ops = &htab_percpu_ops,
- .type = BPF_MAP_TYPE_PERCPU_HASH,
-};
-
-static const struct bpf_map_ops htab_lru_percpu_ops = {
+const struct bpf_map_ops htab_lru_percpu_map_ops = {
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
@@ -1177,17 +1207,102 @@ static const struct bpf_map_ops htab_lru_percpu_ops = {
.map_delete_elem = htab_lru_map_delete_elem,
};
-static struct bpf_map_type_list htab_lru_percpu_type __ro_after_init = {
- .ops = &htab_lru_percpu_ops,
- .type = BPF_MAP_TYPE_LRU_PERCPU_HASH,
-};
+static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_map *map;
+
+ if (attr->value_size != sizeof(u32))
+ return ERR_PTR(-EINVAL);
+
+ /* pointer is stored internally */
+ attr->value_size = sizeof(void *);
+ map = htab_map_alloc(attr);
+ attr->value_size = sizeof(u32);
-static int __init register_htab_map(void)
+ return map;
+}
+
+static void fd_htab_map_free(struct bpf_map *map)
{
- bpf_register_map_type(&htab_type);
- bpf_register_map_type(&htab_percpu_type);
- bpf_register_map_type(&htab_lru_type);
- bpf_register_map_type(&htab_lru_percpu_type);
- return 0;
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_nulls_node *n;
+ struct hlist_nulls_head *head;
+ struct htab_elem *l;
+ int i;
+
+ for (i = 0; i < htab->n_buckets; i++) {
+ head = select_bucket(htab, i);
+
+ hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
+ void *ptr = fd_htab_map_get_ptr(map, l);
+
+ map->ops->map_fd_put_ptr(ptr);
+ }
+ }
+
+ htab_map_free(map);
+}
+
+/* only called from syscall */
+int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
+ void *key, void *value, u64 map_flags)
+{
+ void *ptr;
+ int ret;
+ u32 ufd = *(u32 *)value;
+
+ ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
+
+ ret = htab_map_update_elem(map, key, &ptr, map_flags);
+ if (ret)
+ map->ops->map_fd_put_ptr(ptr);
+
+ return ret;
+}
+
+static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_map *map, *inner_map_meta;
+
+ inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
+ if (IS_ERR(inner_map_meta))
+ return inner_map_meta;
+
+ map = fd_htab_map_alloc(attr);
+ if (IS_ERR(map)) {
+ bpf_map_meta_free(inner_map_meta);
+ return map;
+ }
+
+ map->inner_map_meta = inner_map_meta;
+
+ return map;
}
-late_initcall(register_htab_map);
+
+static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_map **inner_map = htab_map_lookup_elem(map, key);
+
+ if (!inner_map)
+ return NULL;
+
+ return READ_ONCE(*inner_map);
+}
+
+static void htab_of_map_free(struct bpf_map *map)
+{
+ bpf_map_meta_free(map->inner_map_meta);
+ fd_htab_map_free(map);
+}
+
+const struct bpf_map_ops htab_of_maps_map_ops = {
+ .map_alloc = htab_of_map_alloc,
+ .map_free = htab_of_map_free,
+ .map_get_next_key = htab_map_get_next_key,
+ .map_lookup_elem = htab_of_map_lookup_elem,
+ .map_delete_elem = htab_map_delete_elem,
+ .map_fd_get_ptr = bpf_map_fd_get_ptr,
+ .map_fd_put_ptr = bpf_map_fd_put_ptr,
+};
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index fddcae801724..9bbd33497d3d 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -429,7 +429,7 @@ static int bpf_parse_options(char *data, struct bpf_mount_opts *opts)
static int bpf_fill_super(struct super_block *sb, void *data, int silent)
{
- static struct tree_descr bpf_rfiles[] = { { "" } };
+ static const struct tree_descr bpf_rfiles[] = { { "" } };
struct bpf_mount_opts opts;
struct inode *inode;
int ret;
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index b37bd9ab7f57..b09185f0f17d 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -432,6 +432,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
trie->map.key_size = attr->key_size;
trie->map.value_size = attr->value_size;
trie->map.max_entries = attr->max_entries;
+ trie->map.map_flags = attr->map_flags;
trie->data_size = attr->key_size -
offsetof(struct bpf_lpm_trie_key, data);
trie->max_prefixlen = trie->data_size * 8;
@@ -505,7 +506,7 @@ static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key)
return -ENOTSUPP;
}
-static const struct bpf_map_ops trie_ops = {
+const struct bpf_map_ops trie_map_ops = {
.map_alloc = trie_alloc,
.map_free = trie_free,
.map_get_next_key = trie_get_next_key,
@@ -513,15 +514,3 @@ static const struct bpf_map_ops trie_ops = {
.map_update_elem = trie_update_elem,
.map_delete_elem = trie_delete_elem,
};
-
-static struct bpf_map_type_list trie_type __ro_after_init = {
- .ops = &trie_ops,
- .type = BPF_MAP_TYPE_LPM_TRIE,
-};
-
-static int __init register_trie_map(void)
-{
- bpf_register_map_type(&trie_type);
- return 0;
-}
-late_initcall(register_trie_map);
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
new file mode 100644
index 000000000000..59bcdf821ae4
--- /dev/null
+++ b/kernel/bpf/map_in_map.c
@@ -0,0 +1,97 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/slab.h>
+#include <linux/bpf.h>
+
+#include "map_in_map.h"
+
+struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
+{
+ struct bpf_map *inner_map, *inner_map_meta;
+ struct fd f;
+
+ f = fdget(inner_map_ufd);
+ inner_map = __bpf_map_get(f);
+ if (IS_ERR(inner_map))
+ return inner_map;
+
+ /* prog_array->owner_prog_type and owner_jited
+ * is a runtime binding. Doing static check alone
+ * in the verifier is not enough.
+ */
+ if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
+ fdput(f);
+ return ERR_PTR(-ENOTSUPP);
+ }
+
+ /* Does not support >1 level map-in-map */
+ if (inner_map->inner_map_meta) {
+ fdput(f);
+ return ERR_PTR(-EINVAL);
+ }
+
+ inner_map_meta = kzalloc(sizeof(*inner_map_meta), GFP_USER);
+ if (!inner_map_meta) {
+ fdput(f);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ inner_map_meta->map_type = inner_map->map_type;
+ inner_map_meta->key_size = inner_map->key_size;
+ inner_map_meta->value_size = inner_map->value_size;
+ inner_map_meta->map_flags = inner_map->map_flags;
+ inner_map_meta->ops = inner_map->ops;
+ inner_map_meta->max_entries = inner_map->max_entries;
+
+ fdput(f);
+ return inner_map_meta;
+}
+
+void bpf_map_meta_free(struct bpf_map *map_meta)
+{
+ kfree(map_meta);
+}
+
+bool bpf_map_meta_equal(const struct bpf_map *meta0,
+ const struct bpf_map *meta1)
+{
+ /* No need to compare ops because it is covered by map_type */
+ return meta0->map_type == meta1->map_type &&
+ meta0->key_size == meta1->key_size &&
+ meta0->value_size == meta1->value_size &&
+ meta0->map_flags == meta1->map_flags &&
+ meta0->max_entries == meta1->max_entries;
+}
+
+void *bpf_map_fd_get_ptr(struct bpf_map *map,
+ struct file *map_file /* not used */,
+ int ufd)
+{
+ struct bpf_map *inner_map;
+ struct fd f;
+
+ f = fdget(ufd);
+ inner_map = __bpf_map_get(f);
+ if (IS_ERR(inner_map))
+ return inner_map;
+
+ if (bpf_map_meta_equal(map->inner_map_meta, inner_map))
+ inner_map = bpf_map_inc(inner_map, false);
+ else
+ inner_map = ERR_PTR(-EINVAL);
+
+ fdput(f);
+ return inner_map;
+}
+
+void bpf_map_fd_put_ptr(void *ptr)
+{
+ /* ptr->ops->map_free() has to go through one
+ * rcu grace period by itself.
+ */
+ bpf_map_put(ptr);
+}
diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h
new file mode 100644
index 000000000000..177fadb689dc
--- /dev/null
+++ b/kernel/bpf/map_in_map.h
@@ -0,0 +1,23 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#ifndef __MAP_IN_MAP_H__
+#define __MAP_IN_MAP_H__
+
+#include <linux/types.h>
+
+struct file;
+struct bpf_map;
+
+struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd);
+void bpf_map_meta_free(struct bpf_map *map_meta);
+bool bpf_map_meta_equal(const struct bpf_map *meta0,
+ const struct bpf_map *meta1);
+void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file,
+ int ufd);
+void bpf_map_fd_put_ptr(void *ptr);
+
+#endif
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 22aa45cd0324..31147d730abf 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -88,6 +88,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
smap->map.key_size = attr->key_size;
smap->map.value_size = value_size;
smap->map.max_entries = attr->max_entries;
+ smap->map.map_flags = attr->map_flags;
smap->n_buckets = n_buckets;
smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
@@ -264,7 +265,7 @@ static void stack_map_free(struct bpf_map *map)
put_callchain_buffers();
}
-static const struct bpf_map_ops stack_map_ops = {
+const struct bpf_map_ops stack_map_ops = {
.map_alloc = stack_map_alloc,
.map_free = stack_map_free,
.map_get_next_key = stack_map_get_next_key,
@@ -272,15 +273,3 @@ static const struct bpf_map_ops stack_map_ops = {
.map_update_elem = stack_map_update_elem,
.map_delete_elem = stack_map_delete_elem,
};
-
-static struct bpf_map_type_list stack_map_type __ro_after_init = {
- .ops = &stack_map_ops,
- .type = BPF_MAP_TYPE_STACK_TRACE,
-};
-
-static int __init register_stack_map(void)
-{
- bpf_register_map_type(&stack_map_type);
- return 0;
-}
-late_initcall(register_stack_map);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 7af0dcc5d755..265a0d854e33 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -27,30 +27,29 @@ DEFINE_PER_CPU(int, bpf_prog_active);
int sysctl_unprivileged_bpf_disabled __read_mostly;
-static LIST_HEAD(bpf_map_types);
+static const struct bpf_map_ops * const bpf_map_types[] = {
+#define BPF_PROG_TYPE(_id, _ops)
+#define BPF_MAP_TYPE(_id, _ops) \
+ [_id] = &_ops,
+#include <linux/bpf_types.h>
+#undef BPF_PROG_TYPE
+#undef BPF_MAP_TYPE
+};
static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
{
- struct bpf_map_type_list *tl;
struct bpf_map *map;
- list_for_each_entry(tl, &bpf_map_types, list_node) {
- if (tl->type == attr->map_type) {
- map = tl->ops->map_alloc(attr);
- if (IS_ERR(map))
- return map;
- map->ops = tl->ops;
- map->map_type = attr->map_type;
- return map;
- }
- }
- return ERR_PTR(-EINVAL);
-}
+ if (attr->map_type >= ARRAY_SIZE(bpf_map_types) ||
+ !bpf_map_types[attr->map_type])
+ return ERR_PTR(-EINVAL);
-/* boot time registration of different map implementations */
-void bpf_register_map_type(struct bpf_map_type_list *tl)
-{
- list_add(&tl->list_node, &bpf_map_types);
+ map = bpf_map_types[attr->map_type]->map_alloc(attr);
+ if (IS_ERR(map))
+ return map;
+ map->ops = bpf_map_types[attr->map_type];
+ map->map_type = attr->map_type;
+ return map;
}
void *bpf_map_area_alloc(size_t size)
@@ -68,8 +67,7 @@ void *bpf_map_area_alloc(size_t size)
return area;
}
- return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | flags,
- PAGE_KERNEL);
+ return __vmalloc(size, GFP_KERNEL | flags, PAGE_KERNEL);
}
void bpf_map_area_free(void *area)
@@ -215,7 +213,7 @@ int bpf_map_new_fd(struct bpf_map *map)
offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
sizeof(attr->CMD##_LAST_FIELD)) != NULL
-#define BPF_MAP_CREATE_LAST_FIELD map_flags
+#define BPF_MAP_CREATE_LAST_FIELD inner_map_fd
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
@@ -352,6 +350,9 @@ static int map_lookup_elem(union bpf_attr *attr)
err = bpf_percpu_array_copy(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
err = bpf_stackmap_copy(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS ||
+ map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
+ err = -ENOTSUPP;
} else {
rcu_read_lock();
ptr = map->ops->map_lookup_elem(map, key);
@@ -438,11 +439,17 @@ static int map_update_elem(union bpf_attr *attr)
err = bpf_percpu_array_update(map, key, value, attr->flags);
} else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY ||
map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
- map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY) {
+ map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY ||
+ map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) {
rcu_read_lock();
err = bpf_fd_array_map_update_elem(map, f.file, key, value,
attr->flags);
rcu_read_unlock();
+ } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
+ rcu_read_lock();
+ err = bpf_fd_htab_map_update_elem(map, f.file, key, value,
+ attr->flags);
+ rcu_read_unlock();
} else {
rcu_read_lock();
err = map->ops->map_update_elem(map, key, value, attr->flags);
@@ -528,14 +535,18 @@ static int map_get_next_key(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
- err = -ENOMEM;
- key = kmalloc(map->key_size, GFP_USER);
- if (!key)
- goto err_put;
+ if (ukey) {
+ err = -ENOMEM;
+ key = kmalloc(map->key_size, GFP_USER);
+ if (!key)
+ goto err_put;
- err = -EFAULT;
- if (copy_from_user(key, ukey, map->key_size) != 0)
- goto free_key;
+ err = -EFAULT;
+ if (copy_from_user(key, ukey, map->key_size) != 0)
+ goto free_key;
+ } else {
+ key = NULL;
+ }
err = -ENOMEM;
next_key = kmalloc(map->key_size, GFP_USER);
@@ -564,79 +575,23 @@ err_put:
return err;
}
-static LIST_HEAD(bpf_prog_types);
+static const struct bpf_verifier_ops * const bpf_prog_types[] = {
+#define BPF_PROG_TYPE(_id, _ops) \
+ [_id] = &_ops,
+#define BPF_MAP_TYPE(_id, _ops)
+#include <linux/bpf_types.h>
+#undef BPF_PROG_TYPE
+#undef BPF_MAP_TYPE
+};
static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
{
- struct bpf_prog_type_list *tl;
-
- list_for_each_entry(tl, &bpf_prog_types, list_node) {
- if (tl->type == type) {
- prog->aux->ops = tl->ops;
- prog->type = type;
- return 0;
- }
- }
-
- return -EINVAL;
-}
-
-void bpf_register_prog_type(struct bpf_prog_type_list *tl)
-{
- list_add(&tl->list_node, &bpf_prog_types);
-}
-
-/* fixup insn->imm field of bpf_call instructions:
- * if (insn->imm == BPF_FUNC_map_lookup_elem)
- * insn->imm = bpf_map_lookup_elem - __bpf_call_base;
- * else if (insn->imm == BPF_FUNC_map_update_elem)
- * insn->imm = bpf_map_update_elem - __bpf_call_base;
- * else ...
- *
- * this function is called after eBPF program passed verification
- */
-static void fixup_bpf_calls(struct bpf_prog *prog)
-{
- const struct bpf_func_proto *fn;
- int i;
+ if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type])
+ return -EINVAL;
- for (i = 0; i < prog->len; i++) {
- struct bpf_insn *insn = &prog->insnsi[i];
-
- if (insn->code == (BPF_JMP | BPF_CALL)) {
- /* we reach here when program has bpf_call instructions
- * and it passed bpf_check(), means that
- * ops->get_func_proto must have been supplied, check it
- */
- BUG_ON(!prog->aux->ops->get_func_proto);
-
- if (insn->imm == BPF_FUNC_get_route_realm)
- prog->dst_needed = 1;
- if (insn->imm == BPF_FUNC_get_prandom_u32)
- bpf_user_rnd_init_once();
- if (insn->imm == BPF_FUNC_xdp_adjust_head)
- prog->xdp_adjust_head = 1;
- if (insn->imm == BPF_FUNC_tail_call) {
- /* mark bpf_tail_call as different opcode
- * to avoid conditional branch in
- * interpeter for every normal call
- * and to prevent accidental JITing by
- * JIT compiler that doesn't support
- * bpf_tail_call yet
- */
- insn->imm = 0;
- insn->code |= BPF_X;
- continue;
- }
-
- fn = prog->aux->ops->get_func_proto(insn->imm);
- /* all functions that have prototype and verifier allowed
- * programs to call them, must be real in-kernel functions
- */
- BUG_ON(!fn->func);
- insn->imm = fn->func - __bpf_call_base;
- }
- }
+ prog->aux->ops = bpf_prog_types[type];
+ prog->type = type;
+ return 0;
}
/* drop refcnt on maps used by eBPF program and free auxilary data */
@@ -828,7 +783,7 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
EXPORT_SYMBOL_GPL(bpf_prog_get_type);
/* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD kern_version
+#define BPF_PROG_LOAD_LAST_FIELD prog_flags
static int bpf_prog_load(union bpf_attr *attr)
{
@@ -841,6 +796,9 @@ static int bpf_prog_load(union bpf_attr *attr)
if (CHECK_ATTR(BPF_PROG_LOAD))
return -EINVAL;
+ if (attr->prog_flags & ~BPF_F_STRICT_ALIGNMENT)
+ return -EINVAL;
+
/* copy eBPF program license from user space */
if (strncpy_from_user(license, u64_to_user_ptr(attr->license),
sizeof(license) - 1) < 0)
@@ -892,9 +850,6 @@ static int bpf_prog_load(union bpf_attr *attr)
if (err < 0)
goto free_used_maps;
- /* fixup BPF_CALL->imm field */
- fixup_bpf_calls(prog);
-
/* eBPF program is ready to be JITed */
prog = bpf_prog_select_runtime(prog, &err);
if (err < 0)
@@ -1020,6 +975,28 @@ static int bpf_prog_detach(const union bpf_attr *attr)
}
#endif /* CONFIG_CGROUP_BPF */
+#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration
+
+static int bpf_prog_test_run(const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct bpf_prog *prog;
+ int ret = -ENOTSUPP;
+
+ if (CHECK_ATTR(BPF_PROG_TEST_RUN))
+ return -EINVAL;
+
+ prog = bpf_prog_get(attr->test.prog_fd);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ if (prog->aux->ops->test_run)
+ ret = prog->aux->ops->test_run(prog, attr, uattr);
+
+ bpf_prog_put(prog);
+ return ret;
+}
+
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
union bpf_attr attr = {};
@@ -1086,7 +1063,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_OBJ_GET:
err = bpf_obj_get(&attr);
break;
-
#ifdef CONFIG_CGROUP_BPF
case BPF_PROG_ATTACH:
err = bpf_prog_attach(&attr);
@@ -1095,7 +1071,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
err = bpf_prog_detach(&attr);
break;
#endif
-
+ case BPF_PROG_TEST_RUN:
+ err = bpf_prog_test_run(&attr, uattr);
+ break;
default:
err = -EINVAL;
break;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 796b68d00119..339c8a1371de 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -140,9 +140,11 @@ struct bpf_verifier_stack_elem {
struct bpf_verifier_stack_elem *next;
};
-#define BPF_COMPLEXITY_LIMIT_INSNS 65536
+#define BPF_COMPLEXITY_LIMIT_INSNS 98304
#define BPF_COMPLEXITY_LIMIT_STACK 1024
+#define BPF_MAP_PTR_POISON ((void *)0xeB9F + POISON_POINTER_DELTA)
+
struct bpf_call_arg_meta {
struct bpf_map *map_ptr;
bool raw_mode;
@@ -239,6 +241,12 @@ static void print_verifier_state(struct bpf_verifier_state *state)
if (reg->max_value != BPF_REGISTER_MAX_RANGE)
verbose(",max_value=%llu",
(unsigned long long)reg->max_value);
+ if (reg->min_align)
+ verbose(",min_align=%u", reg->min_align);
+ if (reg->aux_off)
+ verbose(",aux_off=%u", reg->aux_off);
+ if (reg->aux_off_align)
+ verbose(",aux_off_align=%u", reg->aux_off_align);
}
for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
if (state->stack_slot_type[i] == STACK_SPILL)
@@ -296,7 +304,8 @@ static const char *const bpf_jmp_string[16] = {
[BPF_EXIT >> 4] = "exit",
};
-static void print_bpf_insn(struct bpf_insn *insn)
+static void print_bpf_insn(const struct bpf_verifier_env *env,
+ const struct bpf_insn *insn)
{
u8 class = BPF_CLASS(insn->code);
@@ -360,9 +369,19 @@ static void print_bpf_insn(struct bpf_insn *insn)
insn->code,
bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
insn->src_reg, insn->imm);
- } else if (BPF_MODE(insn->code) == BPF_IMM) {
- verbose("(%02x) r%d = 0x%x\n",
- insn->code, insn->dst_reg, insn->imm);
+ } else if (BPF_MODE(insn->code) == BPF_IMM &&
+ BPF_SIZE(insn->code) == BPF_DW) {
+ /* At this point, we already made sure that the second
+ * part of the ldimm64 insn is accessible.
+ */
+ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
+ bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD;
+
+ if (map_ptr && !env->allow_ptr_leaks)
+ imm = 0;
+
+ verbose("(%02x) r%d = 0x%llx\n", insn->code,
+ insn->dst_reg, (unsigned long long)imm);
} else {
verbose("BUG_ld_%02x\n", insn->code);
return;
@@ -444,16 +463,22 @@ static const int caller_saved[CALLER_SAVED_REGS] = {
BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
};
+static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno)
+{
+ BUG_ON(regno >= MAX_BPF_REG);
+
+ memset(&regs[regno], 0, sizeof(regs[regno]));
+ regs[regno].type = NOT_INIT;
+ regs[regno].min_value = BPF_REGISTER_MIN_RANGE;
+ regs[regno].max_value = BPF_REGISTER_MAX_RANGE;
+}
+
static void init_reg_state(struct bpf_reg_state *regs)
{
int i;
- for (i = 0; i < MAX_BPF_REG; i++) {
- regs[i].type = NOT_INIT;
- regs[i].imm = 0;
- regs[i].min_value = BPF_REGISTER_MIN_RANGE;
- regs[i].max_value = BPF_REGISTER_MAX_RANGE;
- }
+ for (i = 0; i < MAX_BPF_REG; i++)
+ mark_reg_not_init(regs, i);
/* frame pointer */
regs[BPF_REG_FP].type = FRAME_PTR;
@@ -479,6 +504,7 @@ static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno)
{
regs[regno].min_value = BPF_REGISTER_MIN_RANGE;
regs[regno].max_value = BPF_REGISTER_MAX_RANGE;
+ regs[regno].min_align = 0;
}
static void mark_reg_unknown_value_and_range(struct bpf_reg_state *regs,
@@ -765,38 +791,77 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
}
}
-static int check_ptr_alignment(struct bpf_verifier_env *env,
- struct bpf_reg_state *reg, int off, int size)
+static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
+ int off, int size, bool strict)
{
- if (reg->type != PTR_TO_PACKET && reg->type != PTR_TO_MAP_VALUE_ADJ) {
- if (off % size != 0) {
- verbose("misaligned access off %d size %d\n",
- off, size);
+ int ip_align;
+ int reg_off;
+
+ /* Byte size accesses are always allowed. */
+ if (!strict || size == 1)
+ return 0;
+
+ reg_off = reg->off;
+ if (reg->id) {
+ if (reg->aux_off_align % size) {
+ verbose("Packet access is only %u byte aligned, %d byte access not allowed\n",
+ reg->aux_off_align, size);
return -EACCES;
- } else {
- return 0;
}
+ reg_off += reg->aux_off;
}
- if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
- /* misaligned access to packet is ok on x86,arm,arm64 */
- return 0;
-
- if (reg->id && size != 1) {
- verbose("Unknown packet alignment. Only byte-sized access allowed\n");
+ /* For platforms that do not have a Kconfig enabling
+ * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS the value of
+ * NET_IP_ALIGN is universally set to '2'. And on platforms
+ * that do set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS, we get
+ * to this code only in strict mode where we want to emulate
+ * the NET_IP_ALIGN==2 checking. Therefore use an
+ * unconditional IP align value of '2'.
+ */
+ ip_align = 2;
+ if ((ip_align + reg_off + off) % size != 0) {
+ verbose("misaligned packet access off %d+%d+%d size %d\n",
+ ip_align, reg_off, off, size);
return -EACCES;
}
- /* skb->data is NET_IP_ALIGN-ed */
- if (reg->type == PTR_TO_PACKET &&
- (NET_IP_ALIGN + reg->off + off) % size != 0) {
- verbose("misaligned packet access off %d+%d+%d size %d\n",
- NET_IP_ALIGN, reg->off, off, size);
+ return 0;
+}
+
+static int check_val_ptr_alignment(const struct bpf_reg_state *reg,
+ int size, bool strict)
+{
+ if (strict && size != 1) {
+ verbose("Unknown alignment. Only byte-sized access allowed in value access.\n");
return -EACCES;
}
+
return 0;
}
+static int check_ptr_alignment(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
+ int off, int size)
+{
+ bool strict = env->strict_alignment;
+
+ switch (reg->type) {
+ case PTR_TO_PACKET:
+ return check_pkt_ptr_alignment(reg, off, size, strict);
+ case PTR_TO_MAP_VALUE_ADJ:
+ return check_val_ptr_alignment(reg, size, strict);
+ default:
+ if (off % size != 0) {
+ verbose("misaligned access off %d size %d\n",
+ off, size);
+ return -EACCES;
+ }
+
+ return 0;
+ }
+}
+
/* check whether memory at (regno + off) is accessible for t = (read | write)
* if t==write, value_regno is a register which value is stored into memory
* if t==read, value_regno is a register which will receive the value from memory
@@ -852,6 +917,8 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
value_regno);
/* note that reg.[id|off|range] == 0 */
state->regs[value_regno].type = reg_type;
+ state->regs[value_regno].aux_off = 0;
+ state->regs[value_regno].aux_off_align = 0;
}
} else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) {
@@ -1197,6 +1264,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
func_id != BPF_FUNC_current_task_under_cgroup)
goto error;
break;
+ case BPF_MAP_TYPE_ARRAY_OF_MAPS:
+ case BPF_MAP_TYPE_HASH_OF_MAPS:
+ if (func_id != BPF_FUNC_map_lookup_elem)
+ goto error;
default:
break;
}
@@ -1273,12 +1344,11 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
}
}
-static int check_call(struct bpf_verifier_env *env, int func_id)
+static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
{
struct bpf_verifier_state *state = &env->cur_state;
const struct bpf_func_proto *fn = NULL;
struct bpf_reg_state *regs = state->regs;
- struct bpf_reg_state *reg;
struct bpf_call_arg_meta meta;
bool changes_data;
int i, err;
@@ -1345,11 +1415,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id)
}
/* reset caller saved regs */
- for (i = 0; i < CALLER_SAVED_REGS; i++) {
- reg = regs + caller_saved[i];
- reg->type = NOT_INIT;
- reg->imm = 0;
- }
+ for (i = 0; i < CALLER_SAVED_REGS; i++)
+ mark_reg_not_init(regs, caller_saved[i]);
/* update return register */
if (fn->ret_type == RET_INTEGER) {
@@ -1357,6 +1424,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id)
} else if (fn->ret_type == RET_VOID) {
regs[BPF_REG_0].type = NOT_INIT;
} else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
+ struct bpf_insn_aux_data *insn_aux;
+
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
regs[BPF_REG_0].max_value = regs[BPF_REG_0].min_value = 0;
/* remember map_ptr, so that check_map_access()
@@ -1369,6 +1438,11 @@ static int check_call(struct bpf_verifier_env *env, int func_id)
}
regs[BPF_REG_0].map_ptr = meta.map_ptr;
regs[BPF_REG_0].id = ++env->id_gen;
+ insn_aux = &env->insn_aux_data[insn_idx];
+ if (!insn_aux->map_ptr)
+ insn_aux->map_ptr = meta.map_ptr;
+ else if (insn_aux->map_ptr != meta.map_ptr)
+ insn_aux->map_ptr = BPF_MAP_PTR_POISON;
} else {
verbose("unknown return type %d of func %s#%d\n",
fn->ret_type, func_id_name(func_id), func_id);
@@ -1413,6 +1487,8 @@ add_imm:
*/
dst_reg->off += imm;
} else {
+ bool had_id;
+
if (src_reg->type == PTR_TO_PACKET) {
/* R6=pkt(id=0,off=0,r=62) R7=imm22; r7 += r6 */
tmp_reg = *dst_reg; /* save r7 state */
@@ -1446,14 +1522,23 @@ add_imm:
src_reg->imm);
return -EACCES;
}
+
+ had_id = (dst_reg->id != 0);
+
/* dst_reg stays as pkt_ptr type and since some positive
* integer value was added to the pointer, increment its 'id'
*/
dst_reg->id = ++env->id_gen;
- /* something was added to pkt_ptr, set range and off to zero */
+ /* something was added to pkt_ptr, set range to zero */
+ dst_reg->aux_off += dst_reg->off;
dst_reg->off = 0;
dst_reg->range = 0;
+ if (had_id)
+ dst_reg->aux_off_align = min(dst_reg->aux_off_align,
+ src_reg->min_align);
+ else
+ dst_reg->aux_off_align = src_reg->min_align;
}
return 0;
}
@@ -1627,6 +1712,13 @@ static void check_reg_overflow(struct bpf_reg_state *reg)
reg->min_value = BPF_REGISTER_MIN_RANGE;
}
+static u32 calc_align(u32 imm)
+{
+ if (!imm)
+ return 1U << 31;
+ return imm - ((imm - 1) & imm);
+}
+
static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
struct bpf_insn *insn)
{
@@ -1634,8 +1726,10 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
s64 min_val = BPF_REGISTER_MIN_RANGE;
u64 max_val = BPF_REGISTER_MAX_RANGE;
u8 opcode = BPF_OP(insn->code);
+ u32 dst_align, src_align;
dst_reg = &regs[insn->dst_reg];
+ src_align = 0;
if (BPF_SRC(insn->code) == BPF_X) {
check_reg_overflow(&regs[insn->src_reg]);
min_val = regs[insn->src_reg].min_value;
@@ -1651,12 +1745,18 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
regs[insn->src_reg].type != UNKNOWN_VALUE) {
min_val = BPF_REGISTER_MIN_RANGE;
max_val = BPF_REGISTER_MAX_RANGE;
+ src_align = 0;
+ } else {
+ src_align = regs[insn->src_reg].min_align;
}
} else if (insn->imm < BPF_REGISTER_MAX_RANGE &&
(s64)insn->imm > BPF_REGISTER_MIN_RANGE) {
min_val = max_val = insn->imm;
+ src_align = calc_align(insn->imm);
}
+ dst_align = dst_reg->min_align;
+
/* We don't know anything about what was done to this register, mark it
* as unknown.
*/
@@ -1681,18 +1781,21 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
dst_reg->min_value += min_val;
if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
dst_reg->max_value += max_val;
+ dst_reg->min_align = min(src_align, dst_align);
break;
case BPF_SUB:
if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
dst_reg->min_value -= min_val;
if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
dst_reg->max_value -= max_val;
+ dst_reg->min_align = min(src_align, dst_align);
break;
case BPF_MUL:
if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
dst_reg->min_value *= min_val;
if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
dst_reg->max_value *= max_val;
+ dst_reg->min_align = max(src_align, dst_align);
break;
case BPF_AND:
/* Disallow AND'ing of negative numbers, ain't nobody got time
@@ -1704,17 +1807,23 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
else
dst_reg->min_value = 0;
dst_reg->max_value = max_val;
+ dst_reg->min_align = max(src_align, dst_align);
break;
case BPF_LSH:
/* Gotta have special overflow logic here, if we're shifting
* more than MAX_RANGE then just assume we have an invalid
* range.
*/
- if (min_val > ilog2(BPF_REGISTER_MAX_RANGE))
+ if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) {
dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
- else if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
- dst_reg->min_value <<= min_val;
-
+ dst_reg->min_align = 1;
+ } else {
+ if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
+ dst_reg->min_value <<= min_val;
+ if (!dst_reg->min_align)
+ dst_reg->min_align = 1;
+ dst_reg->min_align <<= min_val;
+ }
if (max_val > ilog2(BPF_REGISTER_MAX_RANGE))
dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
else if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
@@ -1724,11 +1833,19 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
/* RSH by a negative number is undefined, and the BPF_RSH is an
* unsigned shift, so make the appropriate casts.
*/
- if (min_val < 0 || dst_reg->min_value < 0)
+ if (min_val < 0 || dst_reg->min_value < 0) {
dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
- else
+ } else {
dst_reg->min_value =
(u64)(dst_reg->min_value) >> min_val;
+ }
+ if (min_val < 0) {
+ dst_reg->min_align = 1;
+ } else {
+ dst_reg->min_align >>= (u64) min_val;
+ if (!dst_reg->min_align)
+ dst_reg->min_align = 1;
+ }
if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
dst_reg->max_value >>= max_val;
break;
@@ -1830,6 +1947,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
regs[insn->dst_reg].imm = insn->imm;
regs[insn->dst_reg].max_value = insn->imm;
regs[insn->dst_reg].min_value = insn->imm;
+ regs[insn->dst_reg].min_align = calc_align(insn->imm);
}
} else if (opcode > BPF_END) {
@@ -1893,6 +2011,17 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return 0;
} else if (opcode == BPF_ADD &&
BPF_CLASS(insn->code) == BPF_ALU64 &&
+ dst_reg->type == PTR_TO_STACK &&
+ ((BPF_SRC(insn->code) == BPF_X &&
+ regs[insn->src_reg].type == CONST_IMM) ||
+ BPF_SRC(insn->code) == BPF_K)) {
+ if (BPF_SRC(insn->code) == BPF_X)
+ dst_reg->imm += regs[insn->src_reg].imm;
+ else
+ dst_reg->imm += insn->imm;
+ return 0;
+ } else if (opcode == BPF_ADD &&
+ BPF_CLASS(insn->code) == BPF_ALU64 &&
(dst_reg->type == PTR_TO_PACKET ||
(BPF_SRC(insn->code) == BPF_X &&
regs[insn->src_reg].type == PTR_TO_PACKET))) {
@@ -1925,6 +2054,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
* register as unknown.
*/
if (env->allow_ptr_leaks &&
+ BPF_CLASS(insn->code) == BPF_ALU64 && opcode == BPF_ADD &&
(dst_reg->type == PTR_TO_MAP_VALUE ||
dst_reg->type == PTR_TO_MAP_VALUE_ADJ))
dst_reg->type = PTR_TO_MAP_VALUE_ADJ;
@@ -1973,14 +2103,15 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
for (i = 0; i < MAX_BPF_REG; i++)
if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id)
- regs[i].range = dst_reg->off;
+ /* keep the maximum range already checked */
+ regs[i].range = max(regs[i].range, dst_reg->off);
for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
if (state->stack_slot_type[i] != STACK_SPILL)
continue;
reg = &state->spilled_regs[i / BPF_REG_SIZE];
if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id)
- reg->range = dst_reg->off;
+ reg->range = max(reg->range, dst_reg->off);
}
}
@@ -2092,14 +2223,19 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
struct bpf_reg_state *reg = &regs[regno];
if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) {
- reg->type = type;
+ if (type == UNKNOWN_VALUE) {
+ __mark_reg_unknown_value(regs, regno);
+ } else if (reg->map_ptr->inner_map_meta) {
+ reg->type = CONST_PTR_TO_MAP;
+ reg->map_ptr = reg->map_ptr->inner_map_meta;
+ } else {
+ reg->type = type;
+ }
/* We don't need id from this point onwards anymore, thus we
* should better reset it, so that state pruning has chances
* to take effect.
*/
reg->id = 0;
- if (type == UNKNOWN_VALUE)
- __mark_reg_unknown_value(regs, regno);
}
}
@@ -2308,7 +2444,6 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
struct bpf_reg_state *regs = env->cur_state.regs;
u8 mode = BPF_MODE(insn->code);
- struct bpf_reg_state *reg;
int i, err;
if (!may_access_skb(env->prog->type)) {
@@ -2341,11 +2476,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* reset caller saved regs to unreadable */
- for (i = 0; i < CALLER_SAVED_REGS; i++) {
- reg = regs + caller_saved[i];
- reg->type = NOT_INIT;
- reg->imm = 0;
- }
+ for (i = 0; i < CALLER_SAVED_REGS; i++)
+ mark_reg_not_init(regs, caller_saved[i]);
/* mark destination R0 register as readable, since it contains
* the value fetched from the packet
@@ -2504,6 +2636,7 @@ peek_stack:
env->explored_states[t + 1] = STATE_LIST_MARK;
} else {
/* conditional jump with two edges */
+ env->explored_states[t] = STATE_LIST_MARK;
ret = push_insn(t, t + 1, FALLTHROUGH, env);
if (ret == 1)
goto peek_stack;
@@ -2555,7 +2688,8 @@ err_free:
/* the following conditions reduce the number of explored insns
* from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet
*/
-static bool compare_ptrs_to_packet(struct bpf_reg_state *old,
+static bool compare_ptrs_to_packet(struct bpf_verifier_env *env,
+ struct bpf_reg_state *old,
struct bpf_reg_state *cur)
{
if (old->id != cur->id)
@@ -2598,7 +2732,7 @@ static bool compare_ptrs_to_packet(struct bpf_reg_state *old,
* 'if (R4 > data_end)' and all further insn were already good with r=20,
* so they will be good with r=30 and we can prune the search.
*/
- if (old->off <= cur->off &&
+ if (!env->strict_alignment && old->off <= cur->off &&
old->off >= old->range && cur->off >= cur->range)
return true;
@@ -2662,8 +2796,14 @@ static bool states_equal(struct bpf_verifier_env *env,
rcur->type != NOT_INIT))
continue;
+ /* Don't care about the reg->id in this case. */
+ if (rold->type == PTR_TO_MAP_VALUE_OR_NULL &&
+ rcur->type == PTR_TO_MAP_VALUE_OR_NULL &&
+ rold->map_ptr == rcur->map_ptr)
+ continue;
+
if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET &&
- compare_ptrs_to_packet(rold, rcur))
+ compare_ptrs_to_packet(env, rold, rcur))
continue;
return false;
@@ -2796,15 +2936,22 @@ static int do_check(struct bpf_verifier_env *env)
goto process_bpf_exit;
}
- if (log_level && do_print_state) {
- verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx);
+ if (need_resched())
+ cond_resched();
+
+ if (log_level > 1 || (log_level && do_print_state)) {
+ if (log_level > 1)
+ verbose("%d:", insn_idx);
+ else
+ verbose("\nfrom %d to %d:",
+ prev_insn_idx, insn_idx);
print_verifier_state(&env->cur_state);
do_print_state = false;
}
if (log_level) {
verbose("%d: ", insn_idx);
- print_bpf_insn(insn);
+ print_bpf_insn(env, insn);
}
err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx);
@@ -2940,7 +3087,7 @@ static int do_check(struct bpf_verifier_env *env)
return -EINVAL;
}
- err = check_call(env, insn->imm);
+ err = check_call(env, insn->imm, insn_idx);
if (err)
return err;
@@ -3024,16 +3171,33 @@ process_bpf_exit:
return 0;
}
+static int check_map_prealloc(struct bpf_map *map)
+{
+ return (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
+ map->map_type != BPF_MAP_TYPE_HASH_OF_MAPS) ||
+ !(map->map_flags & BPF_F_NO_PREALLOC);
+}
+
static int check_map_prog_compatibility(struct bpf_map *map,
struct bpf_prog *prog)
{
- if (prog->type == BPF_PROG_TYPE_PERF_EVENT &&
- (map->map_type == BPF_MAP_TYPE_HASH ||
- map->map_type == BPF_MAP_TYPE_PERCPU_HASH) &&
- (map->map_flags & BPF_F_NO_PREALLOC)) {
- verbose("perf_event programs can only use preallocated hash map\n");
- return -EINVAL;
+ /* Make sure that BPF_PROG_TYPE_PERF_EVENT programs only use
+ * preallocated hash maps, since doing memory allocation
+ * in overflow_handler can crash depending on where nmi got
+ * triggered.
+ */
+ if (prog->type == BPF_PROG_TYPE_PERF_EVENT) {
+ if (!check_map_prealloc(map)) {
+ verbose("perf_event programs can only use preallocated hash map\n");
+ return -EINVAL;
+ }
+ if (map->inner_map_meta &&
+ !check_map_prealloc(map->inner_map_meta)) {
+ verbose("perf_event programs can only use preallocated inner hash map\n");
+ return -EINVAL;
+ }
}
return 0;
}
@@ -3162,6 +3326,41 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)
insn->src_reg = 0;
}
+/* single env->prog->insni[off] instruction was replaced with the range
+ * insni[off, off + cnt). Adjust corresponding insn_aux_data by copying
+ * [0, off) and [off, end) to new locations, so the patched range stays zero
+ */
+static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len,
+ u32 off, u32 cnt)
+{
+ struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data;
+
+ if (cnt == 1)
+ return 0;
+ new_data = vzalloc(sizeof(struct bpf_insn_aux_data) * prog_len);
+ if (!new_data)
+ return -ENOMEM;
+ memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off);
+ memcpy(new_data + off + cnt - 1, old_data + off,
+ sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
+ env->insn_aux_data = new_data;
+ vfree(old_data);
+ return 0;
+}
+
+static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off,
+ const struct bpf_insn *patch, u32 len)
+{
+ struct bpf_prog *new_prog;
+
+ new_prog = bpf_patch_insn_single(env->prog, off, patch, len);
+ if (!new_prog)
+ return NULL;
+ if (adjust_insn_aux_data(env, new_prog->len, off, len))
+ return NULL;
+ return new_prog;
+}
+
/* convert load instructions that access fields of 'struct __sk_buff'
* into sequence of instructions that access fields of 'struct sk_buff'
*/
@@ -3181,10 +3380,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
verbose("bpf verifier is misconfigured\n");
return -EINVAL;
} else if (cnt) {
- new_prog = bpf_patch_insn_single(env->prog, 0,
- insn_buf, cnt);
+ new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
if (!new_prog)
return -ENOMEM;
+
env->prog = new_prog;
delta += cnt - 1;
}
@@ -3209,7 +3408,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
else
continue;
- if (env->insn_aux_data[i].ptr_type != PTR_TO_CTX)
+ if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX)
continue;
cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog);
@@ -3218,8 +3417,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
return -EINVAL;
}
- new_prog = bpf_patch_insn_single(env->prog, i + delta, insn_buf,
- cnt);
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
if (!new_prog)
return -ENOMEM;
@@ -3233,6 +3431,89 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
return 0;
}
+/* fixup insn->imm field of bpf_call instructions
+ * and inline eligible helpers as explicit sequence of BPF instructions
+ *
+ * this function is called after eBPF program passed verification
+ */
+static int fixup_bpf_calls(struct bpf_verifier_env *env)
+{
+ struct bpf_prog *prog = env->prog;
+ struct bpf_insn *insn = prog->insnsi;
+ const struct bpf_func_proto *fn;
+ const int insn_cnt = prog->len;
+ struct bpf_insn insn_buf[16];
+ struct bpf_prog *new_prog;
+ struct bpf_map *map_ptr;
+ int i, cnt, delta = 0;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (insn->code != (BPF_JMP | BPF_CALL))
+ continue;
+
+ if (insn->imm == BPF_FUNC_get_route_realm)
+ prog->dst_needed = 1;
+ if (insn->imm == BPF_FUNC_get_prandom_u32)
+ bpf_user_rnd_init_once();
+ if (insn->imm == BPF_FUNC_tail_call) {
+ /* If we tail call into other programs, we
+ * cannot make any assumptions since they can
+ * be replaced dynamically during runtime in
+ * the program array.
+ */
+ prog->cb_access = 1;
+
+ /* mark bpf_tail_call as different opcode to avoid
+ * conditional branch in the interpeter for every normal
+ * call and to prevent accidental JITing by JIT compiler
+ * that doesn't support bpf_tail_call yet
+ */
+ insn->imm = 0;
+ insn->code |= BPF_X;
+ continue;
+ }
+
+ if (ebpf_jit_enabled() && insn->imm == BPF_FUNC_map_lookup_elem) {
+ map_ptr = env->insn_aux_data[i + delta].map_ptr;
+ if (map_ptr == BPF_MAP_PTR_POISON ||
+ !map_ptr->ops->map_gen_lookup)
+ goto patch_call_imm;
+
+ cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf);
+ if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
+ verbose("bpf verifier is misconfigured\n");
+ return -EINVAL;
+ }
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf,
+ cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+
+ /* keep walking new program and skip insns we just inserted */
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
+patch_call_imm:
+ fn = prog->aux->ops->get_func_proto(insn->imm);
+ /* all functions that have prototype and verifier allowed
+ * programs to call them, must be real in-kernel functions
+ */
+ if (!fn->func) {
+ verbose("kernel subsystem misconfigured func %s#%d\n",
+ func_id_name(insn->imm), insn->imm);
+ return -EFAULT;
+ }
+ insn->imm = fn->func - __bpf_call_base;
+ }
+
+ return 0;
+}
+
static void free_states(struct bpf_verifier_env *env)
{
struct bpf_verifier_state_list *sl, *sln;
@@ -3301,6 +3582,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
log_level = 0;
}
+ env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
+ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
+ env->strict_alignment = true;
+
ret = replace_map_fd_with_map_ptr(env);
if (ret < 0)
goto skip_full_check;
@@ -3328,6 +3613,9 @@ skip_full_check:
/* program is valid, convert *(u32*)(ctx + off) accesses */
ret = convert_ctx_accesses(env);
+ if (ret == 0)
+ ret = fixup_bpf_calls(env);
+
if (log_level && log_len >= log_size - 1) {
BUG_ON(log_len >= log_size);
/* verifier log exceeded user supplied buffer */
@@ -3403,6 +3691,10 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
log_level = 0;
+ env->strict_alignment = false;
+ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
+ env->strict_alignment = true;
+
env->explored_states = kcalloc(env->prog->len,
sizeof(struct bpf_verifier_state_list *),
GFP_KERNEL);
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 9203bfb05603..00f4d6bf048f 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -5,6 +5,7 @@
#include <linux/kernfs.h>
#include <linux/workqueue.h>
#include <linux/list.h>
+#include <linux/refcount.h>
/*
* A cgroup can be associated with multiple css_sets as different tasks may
@@ -134,7 +135,7 @@ static inline void put_css_set(struct css_set *cset)
* can see it. Similar to atomic_dec_and_lock(), but for an
* rwlock
*/
- if (atomic_add_unless(&cset->refcount, -1, 1))
+ if (refcount_dec_not_one(&cset->refcount))
return;
spin_lock_irqsave(&css_set_lock, flags);
@@ -147,7 +148,7 @@ static inline void put_css_set(struct css_set *cset)
*/
static inline void get_css_set(struct css_set *cset)
{
- atomic_inc(&cset->refcount);
+ refcount_inc(&cset->refcount);
}
bool cgroup_ssid_enabled(int ssid);
@@ -163,7 +164,7 @@ int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
void cgroup_free_root(struct cgroup_root *root);
void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts);
-int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags);
int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
struct cgroup_root *root, unsigned long magic,
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 1dc22f6b49f5..85d75152402d 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -346,7 +346,7 @@ static int cgroup_task_count(const struct cgroup *cgrp)
spin_lock_irq(&css_set_lock);
list_for_each_entry(link, &cgrp->cset_links, cset_link)
- count += atomic_read(&link->cset->refcount);
+ count += refcount_read(&link->cset->refcount);
spin_unlock_irq(&css_set_lock);
return count;
}
@@ -1072,6 +1072,7 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
struct cgroup_subsys *ss;
struct dentry *dentry;
int i, ret;
+ bool new_root = false;
cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
@@ -1181,10 +1182,11 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
ret = -ENOMEM;
goto out_unlock;
}
+ new_root = true;
init_cgroup_root(root, &opts);
- ret = cgroup_setup_root(root, opts.subsys_mask);
+ ret = cgroup_setup_root(root, opts.subsys_mask, PERCPU_REF_INIT_DEAD);
if (ret)
cgroup_free_root(root);
@@ -1201,6 +1203,18 @@ out_free:
CGROUP_SUPER_MAGIC, ns);
/*
+ * There's a race window after we release cgroup_mutex and before
+ * allocating a superblock. Make sure a concurrent process won't
+ * be able to re-use the root during this window by delaying the
+ * initialization of root refcnt.
+ */
+ if (new_root) {
+ mutex_lock(&cgroup_mutex);
+ percpu_ref_reinit(&root->cgrp.self.refcnt);
+ mutex_unlock(&cgroup_mutex);
+ }
+
+ /*
* If @pinned_sb, we're reusing an existing root and holding an
* extra ref on its sb. Mount is complete. Put the extra ref.
*/
@@ -1286,7 +1300,7 @@ static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
u64 count;
rcu_read_lock();
- count = atomic_read(&task_css_set(current)->refcount);
+ count = refcount_read(&task_css_set(current)->refcount);
rcu_read_unlock();
return count;
}
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 48851327a15e..8d4e85eae42c 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -189,7 +189,7 @@ static u16 have_canfork_callback __read_mostly;
/* cgroup namespace for init task */
struct cgroup_namespace init_cgroup_ns = {
- .count = { .counter = 2, },
+ .count = REFCOUNT_INIT(2),
.user_ns = &init_user_ns,
.ns.ops = &cgroupns_operations,
.ns.inum = PROC_CGROUP_INIT_INO,
@@ -436,7 +436,12 @@ out_unlock:
return css;
}
-static void cgroup_get(struct cgroup *cgrp)
+static void __maybe_unused cgroup_get(struct cgroup *cgrp)
+{
+ css_get(&cgrp->self);
+}
+
+static void cgroup_get_live(struct cgroup *cgrp)
{
WARN_ON_ONCE(cgroup_is_dead(cgrp));
css_get(&cgrp->self);
@@ -554,7 +559,7 @@ EXPORT_SYMBOL_GPL(of_css);
* haven't been created.
*/
struct css_set init_css_set = {
- .refcount = ATOMIC_INIT(1),
+ .refcount = REFCOUNT_INIT(1),
.tasks = LIST_HEAD_INIT(init_css_set.tasks),
.mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
.task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
@@ -724,7 +729,7 @@ void put_css_set_locked(struct css_set *cset)
lockdep_assert_held(&css_set_lock);
- if (!atomic_dec_and_test(&cset->refcount))
+ if (!refcount_dec_and_test(&cset->refcount))
return;
/* This css_set is dead. unlink it and release cgroup and css refs */
@@ -932,7 +937,7 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
list_add_tail(&link->cgrp_link, &cset->cgrp_links);
if (cgroup_parent(cgrp))
- cgroup_get(cgrp);
+ cgroup_get_live(cgrp);
}
/**
@@ -977,7 +982,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
return NULL;
}
- atomic_set(&cset->refcount, 1);
+ refcount_set(&cset->refcount, 1);
INIT_LIST_HEAD(&cset->tasks);
INIT_LIST_HEAD(&cset->mg_tasks);
INIT_LIST_HEAD(&cset->task_iters);
@@ -1640,7 +1645,7 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
-int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
{
LIST_HEAD(tmp_links);
struct cgroup *root_cgrp = &root->cgrp;
@@ -1656,8 +1661,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
root_cgrp->id = ret;
root_cgrp->ancestor_ids[0] = ret;
- ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0,
- GFP_KERNEL);
+ ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
+ ref_flags, GFP_KERNEL);
if (ret)
goto out;
@@ -1802,7 +1807,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
return ERR_PTR(-EINVAL);
}
cgrp_dfl_visible = true;
- cgroup_get(&cgrp_dfl_root.cgrp);
+ cgroup_get_live(&cgrp_dfl_root.cgrp);
dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
CGROUP2_SUPER_MAGIC, ns);
@@ -2425,11 +2430,12 @@ ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
tsk = tsk->group_leader;
/*
- * Workqueue threads may acquire PF_NO_SETAFFINITY and become
- * trapped in a cpuset, or RT worker may be born in a cgroup
- * with no rt_runtime allocated. Just say no.
+ * kthreads may acquire PF_NO_SETAFFINITY during initialization.
+ * If userland migrates such a kthread to a non-root cgroup, it can
+ * become trapped in a cpuset, or RT kthread may be born in a
+ * cgroup with no rt_runtime allocated. Just say no.
*/
- if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
+ if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
ret = -EINVAL;
goto out_unlock_rcu;
}
@@ -2575,7 +2581,7 @@ restart:
if (!css || !percpu_ref_is_dying(&css->refcnt))
continue;
- cgroup_get(dsct);
+ cgroup_get_live(dsct);
prepare_to_wait(&dsct->offline_waitq, &wait,
TASK_UNINTERRUPTIBLE);
@@ -3946,7 +3952,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
{
lockdep_assert_held(&cgroup_mutex);
- cgroup_get(cgrp);
+ cgroup_get_live(cgrp);
memset(css, 0, sizeof(*css));
css->cgroup = cgrp;
@@ -4122,7 +4128,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
/* allocation complete, commit to creation */
list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
atomic_inc(&root->nr_cgrps);
- cgroup_get(parent);
+ cgroup_get_live(parent);
/*
* @cgrp is now fully operational. If something fails after this
@@ -4259,6 +4265,11 @@ static void kill_css(struct cgroup_subsys_state *css)
{
lockdep_assert_held(&cgroup_mutex);
+ if (css->flags & CSS_DYING)
+ return;
+
+ css->flags |= CSS_DYING;
+
/*
* This must happen before css is disassociated with its cgroup.
* See seq_css() for details.
@@ -4512,7 +4523,7 @@ int __init cgroup_init(void)
hash_add(css_set_table, &init_css_set.hlist,
css_set_hash(init_css_set.subsys));
- BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
+ BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0, 0));
mutex_unlock(&cgroup_mutex);
@@ -4946,7 +4957,7 @@ struct cgroup *cgroup_get_from_path(const char *path)
if (kn) {
if (kernfs_type(kn) == KERNFS_DIR) {
cgrp = kn->priv;
- cgroup_get(cgrp);
+ cgroup_get_live(cgrp);
} else {
cgrp = ERR_PTR(-ENOTDIR);
}
@@ -5026,6 +5037,11 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
/* Socket clone path */
if (skcd->val) {
+ /*
+ * We might be cloning a socket which is left in an empty
+ * cgroup and the cgroup might have already been rmdir'd.
+ * Don't use cgroup_get_live().
+ */
cgroup_get(sock_cgroup_ptr(skcd));
return;
}
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 0f41292be0fb..ae643412948a 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -176,9 +176,9 @@ typedef enum {
} cpuset_flagbits_t;
/* convenient tests for these bits */
-static inline bool is_cpuset_online(const struct cpuset *cs)
+static inline bool is_cpuset_online(struct cpuset *cs)
{
- return test_bit(CS_ONLINE, &cs->flags);
+ return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
}
static inline int is_cpu_exclusive(const struct cpuset *cs)
@@ -2121,10 +2121,8 @@ int __init cpuset_init(void)
{
int err = 0;
- if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
- BUG();
- if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL))
- BUG();
+ BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
+ BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
cpumask_setall(top_cpuset.cpus_allowed);
nodes_setall(top_cpuset.mems_allowed);
@@ -2139,8 +2137,7 @@ int __init cpuset_init(void)
if (err < 0)
return err;
- if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
- BUG();
+ BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
return 0;
}
@@ -2354,7 +2351,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
rebuild_sched_domains();
}
-void cpuset_update_active_cpus(bool cpu_online)
+void cpuset_update_active_cpus(void)
{
/*
* We're inside cpu hotplug critical region which usually nests
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
index 96d38dab6fb2..66129eb4371d 100644
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -31,7 +31,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void)
kfree(new_ns);
return ERR_PTR(ret);
}
- atomic_set(&new_ns->count, 1);
+ refcount_set(&new_ns->count, 1);
new_ns->ns.ops = &cgroupns_operations;
return new_ns;
}
diff --git a/kernel/compat.c b/kernel/compat.c
index 19aec5d98108..933bcb31ae10 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -108,8 +108,8 @@ COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv,
COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv,
struct timezone __user *, tz)
{
+ struct timespec64 new_ts;
struct timeval user_tv;
- struct timespec new_ts;
struct timezone new_tz;
if (tv) {
@@ -123,7 +123,7 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv,
return -EFAULT;
}
- return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
+ return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
}
static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv)
@@ -240,18 +240,20 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
struct compat_timespec __user *, rmtp)
{
struct timespec tu, rmt;
+ struct timespec64 tu64;
mm_segment_t oldfs;
long ret;
if (compat_get_timespec(&tu, rqtp))
return -EFAULT;
- if (!timespec_valid(&tu))
+ tu64 = timespec_to_timespec64(tu);
+ if (!timespec64_valid(&tu64))
return -EINVAL;
oldfs = get_fs();
set_fs(KERNEL_DS);
- ret = hrtimer_nanosleep(&tu,
+ ret = hrtimer_nanosleep(&tu64,
rmtp ? (struct timespec __user *)&rmt : NULL,
HRTIMER_MODE_REL, CLOCK_MONOTONIC);
set_fs(oldfs);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 37b223e4fc05..cb5103413bd8 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1125,6 +1125,8 @@ core_initcall(cpu_hotplug_pm_sync_init);
#endif /* CONFIG_PM_SLEEP_SMP */
+int __boot_cpu_id;
+
#endif /* CONFIG_SMP */
/* Boot processor state steps */
@@ -1656,13 +1658,13 @@ static ssize_t write_cpuhp_target(struct device *dev,
ret = !sp->name || sp->cant_stop ? -EINVAL : 0;
mutex_unlock(&cpuhp_state_mutex);
if (ret)
- return ret;
+ goto out;
if (st->state < target)
ret = do_cpu_up(dev->id, target);
else
ret = do_cpu_down(dev->id, target);
-
+out:
unlock_device_hotplug();
return ret ? ret : count;
}
@@ -1815,6 +1817,10 @@ void __init boot_cpu_init(void)
set_cpu_active(cpu, true);
set_cpu_present(cpu, true);
set_cpu_possible(cpu, true);
+
+#ifdef CONFIG_SMP
+ __boot_cpu_id = cpu;
+#endif
}
/*
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
new file mode 100644
index 000000000000..fcbd568f1e95
--- /dev/null
+++ b/kernel/crash_core.c
@@ -0,0 +1,439 @@
+/*
+ * crash.c - kernel crash support code.
+ * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include <linux/crash_core.h>
+#include <linux/utsname.h>
+#include <linux/vmalloc.h>
+
+#include <asm/page.h>
+#include <asm/sections.h>
+
+/* vmcoreinfo stuff */
+static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
+u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
+size_t vmcoreinfo_size;
+size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
+
+/*
+ * parsing the "crashkernel" commandline
+ *
+ * this code is intended to be called from architecture specific code
+ */
+
+
+/*
+ * This function parses command lines in the format
+ *
+ * crashkernel=ramsize-range:size[,...][@offset]
+ *
+ * The function returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_mem(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ char *cur = cmdline, *tmp;
+
+ /* for each entry of the comma-separated list */
+ do {
+ unsigned long long start, end = ULLONG_MAX, size;
+
+ /* get the start of the range */
+ start = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("crashkernel: Memory value expected\n");
+ return -EINVAL;
+ }
+ cur = tmp;
+ if (*cur != '-') {
+ pr_warn("crashkernel: '-' expected\n");
+ return -EINVAL;
+ }
+ cur++;
+
+ /* if no ':' is here, than we read the end */
+ if (*cur != ':') {
+ end = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("crashkernel: Memory value expected\n");
+ return -EINVAL;
+ }
+ cur = tmp;
+ if (end <= start) {
+ pr_warn("crashkernel: end <= start\n");
+ return -EINVAL;
+ }
+ }
+
+ if (*cur != ':') {
+ pr_warn("crashkernel: ':' expected\n");
+ return -EINVAL;
+ }
+ cur++;
+
+ size = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("Memory value expected\n");
+ return -EINVAL;
+ }
+ cur = tmp;
+ if (size >= system_ram) {
+ pr_warn("crashkernel: invalid size\n");
+ return -EINVAL;
+ }
+
+ /* match ? */
+ if (system_ram >= start && system_ram < end) {
+ *crash_size = size;
+ break;
+ }
+ } while (*cur++ == ',');
+
+ if (*crash_size > 0) {
+ while (*cur && *cur != ' ' && *cur != '@')
+ cur++;
+ if (*cur == '@') {
+ cur++;
+ *crash_base = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("Memory value expected after '@'\n");
+ return -EINVAL;
+ }
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * That function parses "simple" (old) crashkernel command lines like
+ *
+ * crashkernel=size[@offset]
+ *
+ * It returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_simple(char *cmdline,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ char *cur = cmdline;
+
+ *crash_size = memparse(cmdline, &cur);
+ if (cmdline == cur) {
+ pr_warn("crashkernel: memory value expected\n");
+ return -EINVAL;
+ }
+
+ if (*cur == '@')
+ *crash_base = memparse(cur+1, &cur);
+ else if (*cur != ' ' && *cur != '\0') {
+ pr_warn("crashkernel: unrecognized char: %c\n", *cur);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+#define SUFFIX_HIGH 0
+#define SUFFIX_LOW 1
+#define SUFFIX_NULL 2
+static __initdata char *suffix_tbl[] = {
+ [SUFFIX_HIGH] = ",high",
+ [SUFFIX_LOW] = ",low",
+ [SUFFIX_NULL] = NULL,
+};
+
+/*
+ * That function parses "suffix" crashkernel command lines like
+ *
+ * crashkernel=size,[high|low]
+ *
+ * It returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_suffix(char *cmdline,
+ unsigned long long *crash_size,
+ const char *suffix)
+{
+ char *cur = cmdline;
+
+ *crash_size = memparse(cmdline, &cur);
+ if (cmdline == cur) {
+ pr_warn("crashkernel: memory value expected\n");
+ return -EINVAL;
+ }
+
+ /* check with suffix */
+ if (strncmp(cur, suffix, strlen(suffix))) {
+ pr_warn("crashkernel: unrecognized char: %c\n", *cur);
+ return -EINVAL;
+ }
+ cur += strlen(suffix);
+ if (*cur != ' ' && *cur != '\0') {
+ pr_warn("crashkernel: unrecognized char: %c\n", *cur);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static __init char *get_last_crashkernel(char *cmdline,
+ const char *name,
+ const char *suffix)
+{
+ char *p = cmdline, *ck_cmdline = NULL;
+
+ /* find crashkernel and use the last one if there are more */
+ p = strstr(p, name);
+ while (p) {
+ char *end_p = strchr(p, ' ');
+ char *q;
+
+ if (!end_p)
+ end_p = p + strlen(p);
+
+ if (!suffix) {
+ int i;
+
+ /* skip the one with any known suffix */
+ for (i = 0; suffix_tbl[i]; i++) {
+ q = end_p - strlen(suffix_tbl[i]);
+ if (!strncmp(q, suffix_tbl[i],
+ strlen(suffix_tbl[i])))
+ goto next;
+ }
+ ck_cmdline = p;
+ } else {
+ q = end_p - strlen(suffix);
+ if (!strncmp(q, suffix, strlen(suffix)))
+ ck_cmdline = p;
+ }
+next:
+ p = strstr(p+1, name);
+ }
+
+ if (!ck_cmdline)
+ return NULL;
+
+ return ck_cmdline;
+}
+
+static int __init __parse_crashkernel(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base,
+ const char *name,
+ const char *suffix)
+{
+ char *first_colon, *first_space;
+ char *ck_cmdline;
+
+ BUG_ON(!crash_size || !crash_base);
+ *crash_size = 0;
+ *crash_base = 0;
+
+ ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
+
+ if (!ck_cmdline)
+ return -EINVAL;
+
+ ck_cmdline += strlen(name);
+
+ if (suffix)
+ return parse_crashkernel_suffix(ck_cmdline, crash_size,
+ suffix);
+ /*
+ * if the commandline contains a ':', then that's the extended
+ * syntax -- if not, it must be the classic syntax
+ */
+ first_colon = strchr(ck_cmdline, ':');
+ first_space = strchr(ck_cmdline, ' ');
+ if (first_colon && (!first_space || first_colon < first_space))
+ return parse_crashkernel_mem(ck_cmdline, system_ram,
+ crash_size, crash_base);
+
+ return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
+}
+
+/*
+ * That function is the entry point for command line parsing and should be
+ * called from the arch-specific code.
+ */
+int __init parse_crashkernel(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+ "crashkernel=", NULL);
+}
+
+int __init parse_crashkernel_high(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+ "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
+}
+
+int __init parse_crashkernel_low(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+ "crashkernel=", suffix_tbl[SUFFIX_LOW]);
+}
+
+Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
+ void *data, size_t data_len)
+{
+ struct elf_note *note = (struct elf_note *)buf;
+
+ note->n_namesz = strlen(name) + 1;
+ note->n_descsz = data_len;
+ note->n_type = type;
+ buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf_Word));
+ memcpy(buf, name, note->n_namesz);
+ buf += DIV_ROUND_UP(note->n_namesz, sizeof(Elf_Word));
+ memcpy(buf, data, data_len);
+ buf += DIV_ROUND_UP(data_len, sizeof(Elf_Word));
+
+ return buf;
+}
+
+void final_note(Elf_Word *buf)
+{
+ memset(buf, 0, sizeof(struct elf_note));
+}
+
+static void update_vmcoreinfo_note(void)
+{
+ u32 *buf = vmcoreinfo_note;
+
+ if (!vmcoreinfo_size)
+ return;
+ buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
+ vmcoreinfo_size);
+ final_note(buf);
+}
+
+void crash_save_vmcoreinfo(void)
+{
+ vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
+ update_vmcoreinfo_note();
+}
+
+void vmcoreinfo_append_str(const char *fmt, ...)
+{
+ va_list args;
+ char buf[0x50];
+ size_t r;
+
+ va_start(args, fmt);
+ r = vscnprintf(buf, sizeof(buf), fmt, args);
+ va_end(args);
+
+ r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
+
+ memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
+
+ vmcoreinfo_size += r;
+}
+
+/*
+ * provide an empty default implementation here -- architecture
+ * code may override this
+ */
+void __weak arch_crash_save_vmcoreinfo(void)
+{}
+
+phys_addr_t __weak paddr_vmcoreinfo_note(void)
+{
+ return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note);
+}
+
+static int __init crash_save_vmcoreinfo_init(void)
+{
+ VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
+ VMCOREINFO_PAGESIZE(PAGE_SIZE);
+
+ VMCOREINFO_SYMBOL(init_uts_ns);
+ VMCOREINFO_SYMBOL(node_online_map);
+#ifdef CONFIG_MMU
+ VMCOREINFO_SYMBOL(swapper_pg_dir);
+#endif
+ VMCOREINFO_SYMBOL(_stext);
+ VMCOREINFO_SYMBOL(vmap_area_list);
+
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+ VMCOREINFO_SYMBOL(mem_map);
+ VMCOREINFO_SYMBOL(contig_page_data);
+#endif
+#ifdef CONFIG_SPARSEMEM
+ VMCOREINFO_SYMBOL(mem_section);
+ VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
+ VMCOREINFO_STRUCT_SIZE(mem_section);
+ VMCOREINFO_OFFSET(mem_section, section_mem_map);
+#endif
+ VMCOREINFO_STRUCT_SIZE(page);
+ VMCOREINFO_STRUCT_SIZE(pglist_data);
+ VMCOREINFO_STRUCT_SIZE(zone);
+ VMCOREINFO_STRUCT_SIZE(free_area);
+ VMCOREINFO_STRUCT_SIZE(list_head);
+ VMCOREINFO_SIZE(nodemask_t);
+ VMCOREINFO_OFFSET(page, flags);
+ VMCOREINFO_OFFSET(page, _refcount);
+ VMCOREINFO_OFFSET(page, mapping);
+ VMCOREINFO_OFFSET(page, lru);
+ VMCOREINFO_OFFSET(page, _mapcount);
+ VMCOREINFO_OFFSET(page, private);
+ VMCOREINFO_OFFSET(page, compound_dtor);
+ VMCOREINFO_OFFSET(page, compound_order);
+ VMCOREINFO_OFFSET(page, compound_head);
+ VMCOREINFO_OFFSET(pglist_data, node_zones);
+ VMCOREINFO_OFFSET(pglist_data, nr_zones);
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
+ VMCOREINFO_OFFSET(pglist_data, node_mem_map);
+#endif
+ VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
+ VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
+ VMCOREINFO_OFFSET(pglist_data, node_id);
+ VMCOREINFO_OFFSET(zone, free_area);
+ VMCOREINFO_OFFSET(zone, vm_stat);
+ VMCOREINFO_OFFSET(zone, spanned_pages);
+ VMCOREINFO_OFFSET(free_area, free_list);
+ VMCOREINFO_OFFSET(list_head, next);
+ VMCOREINFO_OFFSET(list_head, prev);
+ VMCOREINFO_OFFSET(vmap_area, va_start);
+ VMCOREINFO_OFFSET(vmap_area, list);
+ VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
+ log_buf_vmcoreinfo_setup();
+ VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
+ VMCOREINFO_NUMBER(NR_FREE_PAGES);
+ VMCOREINFO_NUMBER(PG_lru);
+ VMCOREINFO_NUMBER(PG_private);
+ VMCOREINFO_NUMBER(PG_swapcache);
+ VMCOREINFO_NUMBER(PG_slab);
+#ifdef CONFIG_MEMORY_FAILURE
+ VMCOREINFO_NUMBER(PG_hwpoison);
+#endif
+ VMCOREINFO_NUMBER(PG_head_mask);
+ VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
+#ifdef CONFIG_HUGETLB_PAGE
+ VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
+#endif
+
+ arch_crash_save_vmcoreinfo();
+ update_vmcoreinfo_note();
+
+ return 0;
+}
+
+subsys_initcall(crash_save_vmcoreinfo_init);
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index c04917cad1bf..1b2be63c8528 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -229,12 +229,18 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
}
if (regs) {
+ mm_segment_t fs;
+
if (crosstask)
goto exit_put;
if (add_mark)
perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
+
+ fs = get_fs();
+ set_fs(USER_DS);
perf_callchain_user(&ctx, regs);
+ set_fs(fs);
}
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ff01cba86f43..6c4e523dc1e2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -48,6 +48,8 @@
#include <linux/parser.h>
#include <linux/sched/clock.h>
#include <linux/sched/mm.h>
+#include <linux/proc_ns.h>
+#include <linux/mount.h>
#include "internal.h"
@@ -379,6 +381,7 @@ static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
+static atomic_t nr_namespaces_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
static atomic_t nr_freq_events __read_mostly;
static atomic_t nr_switch_events __read_mostly;
@@ -3991,6 +3994,8 @@ static void unaccount_event(struct perf_event *event)
atomic_dec(&nr_mmap_events);
if (event->attr.comm)
atomic_dec(&nr_comm_events);
+ if (event->attr.namespaces)
+ atomic_dec(&nr_namespaces_events);
if (event->attr.task)
atomic_dec(&nr_task_events);
if (event->attr.freq)
@@ -6491,6 +6496,7 @@ static void perf_event_task(struct task_struct *task,
void perf_event_fork(struct task_struct *task)
{
perf_event_task(task, NULL, 1);
+ perf_event_namespaces(task);
}
/*
@@ -6593,6 +6599,132 @@ void perf_event_comm(struct task_struct *task, bool exec)
}
/*
+ * namespaces tracking
+ */
+
+struct perf_namespaces_event {
+ struct task_struct *task;
+
+ struct {
+ struct perf_event_header header;
+
+ u32 pid;
+ u32 tid;
+ u64 nr_namespaces;
+ struct perf_ns_link_info link_info[NR_NAMESPACES];
+ } event_id;
+};
+
+static int perf_event_namespaces_match(struct perf_event *event)
+{
+ return event->attr.namespaces;
+}
+
+static void perf_event_namespaces_output(struct perf_event *event,
+ void *data)
+{
+ struct perf_namespaces_event *namespaces_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ int ret;
+
+ if (!perf_event_namespaces_match(event))
+ return;
+
+ perf_event_header__init_id(&namespaces_event->event_id.header,
+ &sample, event);
+ ret = perf_output_begin(&handle, event,
+ namespaces_event->event_id.header.size);
+ if (ret)
+ return;
+
+ namespaces_event->event_id.pid = perf_event_pid(event,
+ namespaces_event->task);
+ namespaces_event->event_id.tid = perf_event_tid(event,
+ namespaces_event->task);
+
+ perf_output_put(&handle, namespaces_event->event_id);
+
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+}
+
+static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
+ struct task_struct *task,
+ const struct proc_ns_operations *ns_ops)
+{
+ struct path ns_path;
+ struct inode *ns_inode;
+ void *error;
+
+ error = ns_get_path(&ns_path, task, ns_ops);
+ if (!error) {
+ ns_inode = ns_path.dentry->d_inode;
+ ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
+ ns_link_info->ino = ns_inode->i_ino;
+ }
+}
+
+void perf_event_namespaces(struct task_struct *task)
+{
+ struct perf_namespaces_event namespaces_event;
+ struct perf_ns_link_info *ns_link_info;
+
+ if (!atomic_read(&nr_namespaces_events))
+ return;
+
+ namespaces_event = (struct perf_namespaces_event){
+ .task = task,
+ .event_id = {
+ .header = {
+ .type = PERF_RECORD_NAMESPACES,
+ .misc = 0,
+ .size = sizeof(namespaces_event.event_id),
+ },
+ /* .pid */
+ /* .tid */
+ .nr_namespaces = NR_NAMESPACES,
+ /* .link_info[NR_NAMESPACES] */
+ },
+ };
+
+ ns_link_info = namespaces_event.event_id.link_info;
+
+ perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX],
+ task, &mntns_operations);
+
+#ifdef CONFIG_USER_NS
+ perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX],
+ task, &userns_operations);
+#endif
+#ifdef CONFIG_NET_NS
+ perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX],
+ task, &netns_operations);
+#endif
+#ifdef CONFIG_UTS_NS
+ perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX],
+ task, &utsns_operations);
+#endif
+#ifdef CONFIG_IPC_NS
+ perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX],
+ task, &ipcns_operations);
+#endif
+#ifdef CONFIG_PID_NS
+ perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX],
+ task, &pidns_operations);
+#endif
+#ifdef CONFIG_CGROUPS
+ perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX],
+ task, &cgroupns_operations);
+#endif
+
+ perf_iterate_sb(perf_event_namespaces_output,
+ &namespaces_event,
+ NULL);
+}
+
+/*
* mmap tracking
*/
@@ -7184,6 +7316,21 @@ int perf_event_account_interrupt(struct perf_event *event)
return __perf_event_account_interrupt(event, 1);
}
+static bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs)
+{
+ /*
+ * Due to interrupt latency (AKA "skid"), we may enter the
+ * kernel before taking an overflow, even if the PMU is only
+ * counting user events.
+ * To avoid leaking information to userspace, we must always
+ * reject kernel samples when exclude_kernel is set.
+ */
+ if (event->attr.exclude_kernel && !user_mode(regs))
+ return false;
+
+ return true;
+}
+
/*
* Generic event overflow handling, sampling.
*/
@@ -7205,6 +7352,12 @@ static int __perf_event_overflow(struct perf_event *event,
ret = __perf_event_account_interrupt(event, throttle);
/*
+ * For security, drop the skid kernel samples if necessary.
+ */
+ if (!sample_is_allowed(event, regs))
+ return ret;
+
+ /*
* XXX event_limit might not quite work as expected on inherited
* events
*/
@@ -9146,6 +9299,8 @@ static void account_event(struct perf_event *event)
atomic_inc(&nr_mmap_events);
if (event->attr.comm)
atomic_inc(&nr_comm_events);
+ if (event->attr.namespaces)
+ atomic_inc(&nr_namespaces_events);
if (event->attr.task)
atomic_inc(&nr_task_events);
if (event->attr.freq)
@@ -9691,6 +9846,11 @@ SYSCALL_DEFINE5(perf_event_open,
return -EACCES;
}
+ if (attr.namespaces) {
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+ }
+
if (attr.freq) {
if (attr.sample_freq > sysctl_perf_event_sample_rate)
return -EINVAL;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 257fa460b846..2831480c63a2 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -297,6 +297,19 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
rb->paused = 1;
}
+void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags)
+{
+ /*
+ * OVERWRITE is determined by perf_aux_output_end() and can't
+ * be passed in directly.
+ */
+ if (WARN_ON_ONCE(flags & PERF_AUX_FLAG_OVERWRITE))
+ return;
+
+ handle->aux_flags |= flags;
+}
+EXPORT_SYMBOL_GPL(perf_aux_output_flag);
+
/*
* This is called before hardware starts writing to the AUX area to
* obtain an output handle and make sure there's room in the buffer.
@@ -360,6 +373,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
handle->event = event;
handle->head = aux_head;
handle->size = 0;
+ handle->aux_flags = 0;
/*
* In overwrite mode, AUX data stores do not depend on aux_tail,
@@ -408,34 +422,32 @@ err:
* of the AUX buffer management code is that after pmu::stop(), the AUX
* transaction must be stopped and therefore drop the AUX reference count.
*/
-void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
- bool truncated)
+void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
{
+ bool wakeup = !!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED);
struct ring_buffer *rb = handle->rb;
- bool wakeup = truncated;
unsigned long aux_head;
- u64 flags = 0;
-
- if (truncated)
- flags |= PERF_AUX_FLAG_TRUNCATED;
/* in overwrite mode, driver provides aux_head via handle */
if (rb->aux_overwrite) {
- flags |= PERF_AUX_FLAG_OVERWRITE;
+ handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE;
aux_head = handle->head;
local_set(&rb->aux_head, aux_head);
} else {
+ handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE;
+
aux_head = local_read(&rb->aux_head);
local_add(size, &rb->aux_head);
}
- if (size || flags) {
+ if (size || handle->aux_flags) {
/*
* Only send RECORD_AUX if we have something useful to communicate
*/
- perf_event_aux_event(handle->event, aux_head, size, flags);
+ perf_event_aux_event(handle->event, aux_head, size,
+ handle->aux_flags);
}
aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
@@ -446,7 +458,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
}
if (wakeup) {
- if (truncated)
+ if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)
handle->event->pending_disable = 1;
perf_output_wakeup(handle);
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 6c463c80e93d..e53770d2bf95 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -87,6 +87,7 @@
#include <linux/compiler.h>
#include <linux/sysctl.h>
#include <linux/kcov.h>
+#include <linux/livepatch.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -178,6 +179,24 @@ void __weak arch_release_thread_stack(unsigned long *stack)
*/
#define NR_CACHED_STACKS 2
static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
+
+static int free_vm_stack_cache(unsigned int cpu)
+{
+ struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu);
+ int i;
+
+ for (i = 0; i < NR_CACHED_STACKS; i++) {
+ struct vm_struct *vm_stack = cached_vm_stacks[i];
+
+ if (!vm_stack)
+ continue;
+
+ vfree(vm_stack->addr);
+ cached_vm_stacks[i] = NULL;
+ }
+
+ return 0;
+}
#endif
static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
@@ -202,7 +221,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
VMALLOC_START, VMALLOC_END,
- THREADINFO_GFP | __GFP_HIGHMEM,
+ THREADINFO_GFP,
PAGE_KERNEL,
0, node, __builtin_return_address(0));
@@ -466,6 +485,11 @@ void __init fork_init(void)
for (i = 0; i < UCOUNT_COUNTS; i++) {
init_user_ns.ucount_max[i] = max_threads/2;
}
+
+#ifdef CONFIG_VMAP_STACK
+ cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
+ NULL, free_vm_stack_cache);
+#endif
}
int __weak arch_dup_task_struct(struct task_struct *dst,
@@ -536,7 +560,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
set_task_stack_end_magic(tsk);
#ifdef CONFIG_CC_STACKPROTECTOR
- tsk->stack_canary = get_random_int();
+ tsk->stack_canary = get_random_long();
#endif
/*
@@ -1313,7 +1337,7 @@ void __cleanup_sighand(struct sighand_struct *sighand)
if (atomic_dec_and_test(&sighand->count)) {
signalfd_cleanup(sighand);
/*
- * sighand_cachep is SLAB_DESTROY_BY_RCU so we can free it
+ * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it
* without an RCU grace period, see __lock_task_sighand().
*/
kmem_cache_free(sighand_cachep, sighand);
@@ -1438,6 +1462,7 @@ static void rt_mutex_init_task(struct task_struct *p)
#ifdef CONFIG_RT_MUTEXES
p->pi_waiters = RB_ROOT;
p->pi_waiters_leftmost = NULL;
+ p->pi_top_task = NULL;
p->pi_blocked_on = NULL;
#endif
}
@@ -1552,6 +1577,18 @@ static __latent_entropy struct task_struct *copy_process(
if (!p)
goto fork_out;
+ /*
+ * This _must_ happen before we call free_task(), i.e. before we jump
+ * to any of the bad_fork_* labels. This is to avoid freeing
+ * p->set_child_tid which is (ab)used as a kthread's data pointer for
+ * kernel threads (PF_KTHREAD).
+ */
+ p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
+ /*
+ * Clear TID on mm_release()?
+ */
+ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
+
ftrace_graph_init_task(p);
rt_mutex_init_task(p);
@@ -1679,9 +1716,12 @@ static __latent_entropy struct task_struct *copy_process(
goto bad_fork_cleanup_perf;
/* copy all the process information */
shm_init_task(p);
- retval = copy_semundo(clone_flags, p);
+ retval = security_task_alloc(p, clone_flags);
if (retval)
goto bad_fork_cleanup_audit;
+ retval = copy_semundo(clone_flags, p);
+ if (retval)
+ goto bad_fork_cleanup_security;
retval = copy_files(clone_flags, p);
if (retval)
goto bad_fork_cleanup_semundo;
@@ -1715,11 +1755,6 @@ static __latent_entropy struct task_struct *copy_process(
}
}
- p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
- /*
- * Clear TID on mm_release()?
- */
- p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
#ifdef CONFIG_BLOCK
p->plug = NULL;
#endif
@@ -1797,6 +1832,8 @@ static __latent_entropy struct task_struct *copy_process(
p->parent_exec_id = current->self_exec_id;
}
+ klp_copy_process(p);
+
spin_lock(&current->sighand->siglock);
/*
@@ -1815,11 +1852,13 @@ static __latent_entropy struct task_struct *copy_process(
*/
recalc_sigpending();
if (signal_pending(current)) {
- spin_unlock(&current->sighand->siglock);
- write_unlock_irq(&tasklist_lock);
retval = -ERESTARTNOINTR;
goto bad_fork_cancel_cgroup;
}
+ if (unlikely(!(ns_of_pid(pid)->nr_hashed & PIDNS_HASH_ADDING))) {
+ retval = -ENOMEM;
+ goto bad_fork_cancel_cgroup;
+ }
if (likely(p->pid)) {
ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
@@ -1877,6 +1916,8 @@ static __latent_entropy struct task_struct *copy_process(
return p;
bad_fork_cancel_cgroup:
+ spin_unlock(&current->sighand->siglock);
+ write_unlock_irq(&tasklist_lock);
cgroup_cancel_fork(p);
bad_fork_free_pid:
cgroup_threadgroup_change_end(current);
@@ -1903,6 +1944,8 @@ bad_fork_cleanup_files:
exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
exit_sem(p);
+bad_fork_cleanup_security:
+ security_task_free(p);
bad_fork_cleanup_audit:
audit_free(p);
bad_fork_cleanup_perf:
@@ -2144,7 +2187,7 @@ void __init proc_caches_init(void)
{
sighand_cachep = kmem_cache_create("sighand_cache",
sizeof(struct sighand_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU|
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor);
signal_cachep = kmem_cache_create("signal_cache",
sizeof(struct signal_struct), 0,
@@ -2352,6 +2395,8 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
}
}
+ perf_event_namespaces(current);
+
bad_unshare_cleanup_cred:
if (new_cred)
put_cred(new_cred);
diff --git a/kernel/futex.c b/kernel/futex.c
index 45858ec73941..357348a6cf6b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -802,7 +802,7 @@ static int refill_pi_state_cache(void)
return 0;
}
-static struct futex_pi_state * alloc_pi_state(void)
+static struct futex_pi_state *alloc_pi_state(void)
{
struct futex_pi_state *pi_state = current->pi_state_cache;
@@ -812,6 +812,11 @@ static struct futex_pi_state * alloc_pi_state(void)
return pi_state;
}
+static void get_pi_state(struct futex_pi_state *pi_state)
+{
+ WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount));
+}
+
/*
* Drops a reference to the pi_state object and frees or caches it
* when the last reference is gone.
@@ -856,7 +861,7 @@ static void put_pi_state(struct futex_pi_state *pi_state)
* Look up the task based on what TID userspace gave us.
* We dont trust it.
*/
-static struct task_struct * futex_find_get_task(pid_t pid)
+static struct task_struct *futex_find_get_task(pid_t pid)
{
struct task_struct *p;
@@ -916,10 +921,12 @@ void exit_pi_state_list(struct task_struct *curr)
pi_state->owner = NULL;
raw_spin_unlock_irq(&curr->pi_lock);
- rt_mutex_unlock(&pi_state->pi_mutex);
-
+ get_pi_state(pi_state);
spin_unlock(&hb->lock);
+ rt_mutex_futex_unlock(&pi_state->pi_mutex);
+ put_pi_state(pi_state);
+
raw_spin_lock_irq(&curr->pi_lock);
}
raw_spin_unlock_irq(&curr->pi_lock);
@@ -973,6 +980,39 @@ void exit_pi_state_list(struct task_struct *curr)
*
* [10] There is no transient state which leaves owner and user space
* TID out of sync.
+ *
+ *
+ * Serialization and lifetime rules:
+ *
+ * hb->lock:
+ *
+ * hb -> futex_q, relation
+ * futex_q -> pi_state, relation
+ *
+ * (cannot be raw because hb can contain arbitrary amount
+ * of futex_q's)
+ *
+ * pi_mutex->wait_lock:
+ *
+ * {uval, pi_state}
+ *
+ * (and pi_mutex 'obviously')
+ *
+ * p->pi_lock:
+ *
+ * p->pi_state_list -> pi_state->list, relation
+ *
+ * pi_state->refcount:
+ *
+ * pi_state lifetime
+ *
+ *
+ * Lock order:
+ *
+ * hb->lock
+ * pi_mutex->wait_lock
+ * p->pi_lock
+ *
*/
/*
@@ -980,10 +1020,13 @@ void exit_pi_state_list(struct task_struct *curr)
* the pi_state against the user space value. If correct, attach to
* it.
*/
-static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
+static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
+ struct futex_pi_state *pi_state,
struct futex_pi_state **ps)
{
pid_t pid = uval & FUTEX_TID_MASK;
+ u32 uval2;
+ int ret;
/*
* Userspace might have messed up non-PI and PI futexes [3]
@@ -991,9 +1034,39 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
if (unlikely(!pi_state))
return -EINVAL;
+ /*
+ * We get here with hb->lock held, and having found a
+ * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
+ * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
+ * which in turn means that futex_lock_pi() still has a reference on
+ * our pi_state.
+ *
+ * The waiter holding a reference on @pi_state also protects against
+ * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
+ * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
+ * free pi_state before we can take a reference ourselves.
+ */
WARN_ON(!atomic_read(&pi_state->refcount));
/*
+ * Now that we have a pi_state, we can acquire wait_lock
+ * and do the state validation.
+ */
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+ /*
+ * Since {uval, pi_state} is serialized by wait_lock, and our current
+ * uval was read without holding it, it can have changed. Verify it
+ * still is what we expect it to be, otherwise retry the entire
+ * operation.
+ */
+ if (get_futex_value_locked(&uval2, uaddr))
+ goto out_efault;
+
+ if (uval != uval2)
+ goto out_eagain;
+
+ /*
* Handle the owner died case:
*/
if (uval & FUTEX_OWNER_DIED) {
@@ -1008,11 +1081,11 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
* is not 0. Inconsistent state. [5]
*/
if (pid)
- return -EINVAL;
+ goto out_einval;
/*
* Take a ref on the state and return success. [4]
*/
- goto out_state;
+ goto out_attach;
}
/*
@@ -1024,14 +1097,14 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
* Take a ref on the state and return success. [6]
*/
if (!pid)
- goto out_state;
+ goto out_attach;
} else {
/*
* If the owner died bit is not set, then the pi_state
* must have an owner. [7]
*/
if (!pi_state->owner)
- return -EINVAL;
+ goto out_einval;
}
/*
@@ -1040,11 +1113,29 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
* user space TID. [9/10]
*/
if (pid != task_pid_vnr(pi_state->owner))
- return -EINVAL;
-out_state:
- atomic_inc(&pi_state->refcount);
+ goto out_einval;
+
+out_attach:
+ get_pi_state(pi_state);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
*ps = pi_state;
return 0;
+
+out_einval:
+ ret = -EINVAL;
+ goto out_error;
+
+out_eagain:
+ ret = -EAGAIN;
+ goto out_error;
+
+out_efault:
+ ret = -EFAULT;
+ goto out_error;
+
+out_error:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ return ret;
}
/*
@@ -1095,6 +1186,9 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
/*
* No existing pi state. First waiter. [2]
+ *
+ * This creates pi_state, we have hb->lock held, this means nothing can
+ * observe this state, wait_lock is irrelevant.
*/
pi_state = alloc_pi_state();
@@ -1119,17 +1213,18 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
return 0;
}
-static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
+static int lookup_pi_state(u32 __user *uaddr, u32 uval,
+ struct futex_hash_bucket *hb,
union futex_key *key, struct futex_pi_state **ps)
{
- struct futex_q *match = futex_top_waiter(hb, key);
+ struct futex_q *top_waiter = futex_top_waiter(hb, key);
/*
* If there is a waiter on that futex, validate it and
* attach to the pi_state when the validation succeeds.
*/
- if (match)
- return attach_to_pi_state(uval, match->pi_state, ps);
+ if (top_waiter)
+ return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
/*
* We are the first waiter - try to look up the owner based on
@@ -1148,7 +1243,7 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
return -EFAULT;
- /*If user space value changed, let the caller retry */
+ /* If user space value changed, let the caller retry */
return curval != uval ? -EAGAIN : 0;
}
@@ -1176,7 +1271,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
struct task_struct *task, int set_waiters)
{
u32 uval, newval, vpid = task_pid_vnr(task);
- struct futex_q *match;
+ struct futex_q *top_waiter;
int ret;
/*
@@ -1202,9 +1297,9 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
* Lookup existing state first. If it exists, try to attach to
* its pi_state.
*/
- match = futex_top_waiter(hb, key);
- if (match)
- return attach_to_pi_state(uval, match->pi_state, ps);
+ top_waiter = futex_top_waiter(hb, key);
+ if (top_waiter)
+ return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
/*
* No waiter and user TID is 0. We are here because the
@@ -1285,50 +1380,44 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
wake_q_add(wake_q, p);
__unqueue_futex(q);
/*
- * The waiting task can free the futex_q as soon as
- * q->lock_ptr = NULL is written, without taking any locks. A
- * memory barrier is required here to prevent the following
- * store to lock_ptr from getting ahead of the plist_del.
+ * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
+ * is written, without taking any locks. This is possible in the event
+ * of a spurious wakeup, for example. A memory barrier is required here
+ * to prevent the following store to lock_ptr from getting ahead of the
+ * plist_del in __unqueue_futex().
*/
- smp_wmb();
- q->lock_ptr = NULL;
+ smp_store_release(&q->lock_ptr, NULL);
}
-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
- struct futex_hash_bucket *hb)
+/*
+ * Caller must hold a reference on @pi_state.
+ */
+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
{
- struct task_struct *new_owner;
- struct futex_pi_state *pi_state = this->pi_state;
u32 uninitialized_var(curval), newval;
+ struct task_struct *new_owner;
+ bool postunlock = false;
DEFINE_WAKE_Q(wake_q);
- bool deboost;
int ret = 0;
- if (!pi_state)
- return -EINVAL;
-
- /*
- * If current does not own the pi_state then the futex is
- * inconsistent and user space fiddled with the futex value.
- */
- if (pi_state->owner != current)
- return -EINVAL;
-
- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
+ if (WARN_ON_ONCE(!new_owner)) {
+ /*
+ * As per the comment in futex_unlock_pi() this should not happen.
+ *
+ * When this happens, give up our locks and try again, giving
+ * the futex_lock_pi() instance time to complete, either by
+ * waiting on the rtmutex or removing itself from the futex
+ * queue.
+ */
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
/*
- * It is possible that the next waiter (the one that brought
- * this owner to the kernel) timed out and is no longer
- * waiting on the lock.
- */
- if (!new_owner)
- new_owner = this->task;
-
- /*
- * We pass it to the next owner. The WAITERS bit is always
- * kept enabled while there is PI state around. We cleanup the
- * owner died bit, because we are the owner.
+ * We pass it to the next owner. The WAITERS bit is always kept
+ * enabled while there is PI state around. We cleanup the owner
+ * died bit, because we are the owner.
*/
newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
@@ -1337,6 +1426,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
ret = -EFAULT;
+
} else if (curval != uval) {
/*
* If a unconditional UNLOCK_PI operation (user space did not
@@ -1349,10 +1439,14 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
else
ret = -EINVAL;
}
- if (ret) {
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- return ret;
- }
+
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * This is a point of no return; once we modify the uval there is no
+ * going back and subsequent operations must not fail.
+ */
raw_spin_lock(&pi_state->owner->pi_lock);
WARN_ON(list_empty(&pi_state->list));
@@ -1365,22 +1459,15 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
pi_state->owner = new_owner;
raw_spin_unlock(&new_owner->pi_lock);
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
- deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
+out_unlock:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- /*
- * First unlock HB so the waiter does not spin on it once he got woken
- * up. Second wake up the waiter before the priority is adjusted. If we
- * deboost first (and lose our higher priority), then the task might get
- * scheduled away before the wake up can take place.
- */
- spin_unlock(&hb->lock);
- wake_up_q(&wake_q);
- if (deboost)
- rt_mutex_adjust_prio(current);
+ if (postunlock)
+ rt_mutex_postunlock(&wake_q);
- return 0;
+ return ret;
}
/*
@@ -1826,7 +1913,7 @@ retry_private:
* If that call succeeds then we have pi_state and an
* initial refcount on it.
*/
- ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
+ ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state);
}
switch (ret) {
@@ -1909,7 +1996,7 @@ retry_private:
* refcount on the pi_state and store the pointer in
* the futex_q object of the waiter.
*/
- atomic_inc(&pi_state->refcount);
+ get_pi_state(pi_state);
this->pi_state = pi_state;
ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
this->rt_waiter,
@@ -2009,20 +2096,7 @@ queue_unlock(struct futex_hash_bucket *hb)
hb_waiters_dec(hb);
}
-/**
- * queue_me() - Enqueue the futex_q on the futex_hash_bucket
- * @q: The futex_q to enqueue
- * @hb: The destination hash bucket
- *
- * The hb->lock must be held by the caller, and is released here. A call to
- * queue_me() is typically paired with exactly one call to unqueue_me(). The
- * exceptions involve the PI related operations, which may use unqueue_me_pi()
- * or nothing if the unqueue is done as part of the wake process and the unqueue
- * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
- * an example).
- */
-static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
- __releases(&hb->lock)
+static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
{
int prio;
@@ -2039,6 +2113,24 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
plist_node_init(&q->list, prio);
plist_add(&q->list, &hb->chain);
q->task = current;
+}
+
+/**
+ * queue_me() - Enqueue the futex_q on the futex_hash_bucket
+ * @q: The futex_q to enqueue
+ * @hb: The destination hash bucket
+ *
+ * The hb->lock must be held by the caller, and is released here. A call to
+ * queue_me() is typically paired with exactly one call to unqueue_me(). The
+ * exceptions involve the PI related operations, which may use unqueue_me_pi()
+ * or nothing if the unqueue is done as part of the wake process and the unqueue
+ * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
+ * an example).
+ */
+static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+ __releases(&hb->lock)
+{
+ __queue_me(q, hb);
spin_unlock(&hb->lock);
}
@@ -2125,10 +2217,13 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
{
u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
struct futex_pi_state *pi_state = q->pi_state;
- struct task_struct *oldowner = pi_state->owner;
u32 uval, uninitialized_var(curval), newval;
+ struct task_struct *oldowner;
int ret;
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+ oldowner = pi_state->owner;
/* Owner died? */
if (!pi_state->owner)
newtid |= FUTEX_OWNER_DIED;
@@ -2136,7 +2231,8 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
/*
* We are here either because we stole the rtmutex from the
* previous highest priority waiter or we are the highest priority
- * waiter but failed to get the rtmutex the first time.
+ * waiter but have failed to get the rtmutex the first time.
+ *
* We have to replace the newowner TID in the user space variable.
* This must be atomic as we have to preserve the owner died bit here.
*
@@ -2144,17 +2240,16 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
* because we can fault here. Imagine swapped out pages or a fork
* that marked all the anonymous memory readonly for cow.
*
- * Modifying pi_state _before_ the user space value would
- * leave the pi_state in an inconsistent state when we fault
- * here, because we need to drop the hash bucket lock to
- * handle the fault. This might be observed in the PID check
- * in lookup_pi_state.
+ * Modifying pi_state _before_ the user space value would leave the
+ * pi_state in an inconsistent state when we fault here, because we
+ * need to drop the locks to handle the fault. This might be observed
+ * in the PID check in lookup_pi_state.
*/
retry:
if (get_futex_value_locked(&uval, uaddr))
goto handle_fault;
- while (1) {
+ for (;;) {
newval = (uval & FUTEX_OWNER_DIED) | newtid;
if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
@@ -2169,47 +2264,60 @@ retry:
* itself.
*/
if (pi_state->owner != NULL) {
- raw_spin_lock_irq(&pi_state->owner->pi_lock);
+ raw_spin_lock(&pi_state->owner->pi_lock);
WARN_ON(list_empty(&pi_state->list));
list_del_init(&pi_state->list);
- raw_spin_unlock_irq(&pi_state->owner->pi_lock);
+ raw_spin_unlock(&pi_state->owner->pi_lock);
}
pi_state->owner = newowner;
- raw_spin_lock_irq(&newowner->pi_lock);
+ raw_spin_lock(&newowner->pi_lock);
WARN_ON(!list_empty(&pi_state->list));
list_add(&pi_state->list, &newowner->pi_state_list);
- raw_spin_unlock_irq(&newowner->pi_lock);
+ raw_spin_unlock(&newowner->pi_lock);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+
return 0;
/*
- * To handle the page fault we need to drop the hash bucket
- * lock here. That gives the other task (either the highest priority
- * waiter itself or the task which stole the rtmutex) the
- * chance to try the fixup of the pi_state. So once we are
- * back from handling the fault we need to check the pi_state
- * after reacquiring the hash bucket lock and before trying to
- * do another fixup. When the fixup has been done already we
- * simply return.
+ * To handle the page fault we need to drop the locks here. That gives
+ * the other task (either the highest priority waiter itself or the
+ * task which stole the rtmutex) the chance to try the fixup of the
+ * pi_state. So once we are back from handling the fault we need to
+ * check the pi_state after reacquiring the locks and before trying to
+ * do another fixup. When the fixup has been done already we simply
+ * return.
+ *
+ * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
+ * drop hb->lock since the caller owns the hb -> futex_q relation.
+ * Dropping the pi_mutex->wait_lock requires the state revalidate.
*/
handle_fault:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(q->lock_ptr);
ret = fault_in_user_writeable(uaddr);
spin_lock(q->lock_ptr);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
/*
* Check if someone else fixed it for us:
*/
- if (pi_state->owner != oldowner)
- return 0;
+ if (pi_state->owner != oldowner) {
+ ret = 0;
+ goto out_unlock;
+ }
if (ret)
- return ret;
+ goto out_unlock;
goto retry;
+
+out_unlock:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ return ret;
}
static long futex_wait_restart(struct restart_block *restart);
@@ -2231,13 +2339,16 @@ static long futex_wait_restart(struct restart_block *restart);
*/
static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
{
- struct task_struct *owner;
int ret = 0;
if (locked) {
/*
* Got the lock. We might not be the anticipated owner if we
* did a lock-steal - fix up the PI-state in that case:
+ *
+ * We can safely read pi_state->owner without holding wait_lock
+ * because we now own the rt_mutex, only the owner will attempt
+ * to change it.
*/
if (q->pi_state->owner != current)
ret = fixup_pi_state_owner(uaddr, q, current);
@@ -2245,43 +2356,15 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
}
/*
- * Catch the rare case, where the lock was released when we were on the
- * way back before we locked the hash bucket.
- */
- if (q->pi_state->owner == current) {
- /*
- * Try to get the rt_mutex now. This might fail as some other
- * task acquired the rt_mutex after we removed ourself from the
- * rt_mutex waiters list.
- */
- if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
- locked = 1;
- goto out;
- }
-
- /*
- * pi_state is incorrect, some other task did a lock steal and
- * we returned due to timeout or signal without taking the
- * rt_mutex. Too late.
- */
- raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock);
- owner = rt_mutex_owner(&q->pi_state->pi_mutex);
- if (!owner)
- owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
- raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock);
- ret = fixup_pi_state_owner(uaddr, q, owner);
- goto out;
- }
-
- /*
* Paranoia check. If we did not take the lock, then we should not be
* the owner of the rt_mutex.
*/
- if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
+ if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) {
printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
"pi-state %p\n", ret,
q->pi_state->pi_mutex.owner,
q->pi_state->owner);
+ }
out:
return ret ? ret : locked;
@@ -2505,6 +2588,8 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
ktime_t *time, int trylock)
{
struct hrtimer_sleeper timeout, *to = NULL;
+ struct futex_pi_state *pi_state = NULL;
+ struct rt_mutex_waiter rt_waiter;
struct futex_hash_bucket *hb;
struct futex_q q = futex_q_init;
int res, ret;
@@ -2557,25 +2642,68 @@ retry_private:
}
}
+ WARN_ON(!q.pi_state);
+
/*
* Only actually queue now that the atomic ops are done:
*/
- queue_me(&q, hb);
+ __queue_me(&q, hb);
- WARN_ON(!q.pi_state);
- /*
- * Block on the PI mutex:
- */
- if (!trylock) {
- ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
- } else {
- ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
+ if (trylock) {
+ ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
/* Fixup the trylock return value: */
ret = ret ? 0 : -EWOULDBLOCK;
+ goto no_block;
}
+ rt_mutex_init_waiter(&rt_waiter);
+
+ /*
+ * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
+ * hold it while doing rt_mutex_start_proxy(), because then it will
+ * include hb->lock in the blocking chain, even through we'll not in
+ * fact hold it while blocking. This will lead it to report -EDEADLK
+ * and BUG when futex_unlock_pi() interleaves with this.
+ *
+ * Therefore acquire wait_lock while holding hb->lock, but drop the
+ * latter before calling rt_mutex_start_proxy_lock(). This still fully
+ * serializes against futex_unlock_pi() as that does the exact same
+ * lock handoff sequence.
+ */
+ raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
+ spin_unlock(q.lock_ptr);
+ ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
+ raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
+
+ if (ret) {
+ if (ret == 1)
+ ret = 0;
+
+ spin_lock(q.lock_ptr);
+ goto no_block;
+ }
+
+
+ if (unlikely(to))
+ hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
+
+ ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
+
spin_lock(q.lock_ptr);
/*
+ * If we failed to acquire the lock (signal/timeout), we must
+ * first acquire the hb->lock before removing the lock from the
+ * rt_mutex waitqueue, such that we can keep the hb and rt_mutex
+ * wait lists consistent.
+ *
+ * In particular; it is important that futex_unlock_pi() can not
+ * observe this inconsistency.
+ */
+ if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
+ ret = 0;
+
+no_block:
+ /*
* Fixup the pi_state owner and possibly acquire the lock if we
* haven't already.
*/
@@ -2591,12 +2719,19 @@ retry_private:
* If fixup_owner() faulted and was unable to handle the fault, unlock
* it and return the fault to userspace.
*/
- if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
- rt_mutex_unlock(&q.pi_state->pi_mutex);
+ if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) {
+ pi_state = q.pi_state;
+ get_pi_state(pi_state);
+ }
/* Unqueue and drop the lock */
unqueue_me_pi(&q);
+ if (pi_state) {
+ rt_mutex_futex_unlock(&pi_state->pi_mutex);
+ put_pi_state(pi_state);
+ }
+
goto out_put_key;
out_unlock_put_key:
@@ -2605,8 +2740,10 @@ out_unlock_put_key:
out_put_key:
put_futex_key(&q.key);
out:
- if (to)
+ if (to) {
+ hrtimer_cancel(&to->timer);
destroy_hrtimer_on_stack(&to->timer);
+ }
return ret != -EINTR ? ret : -ERESTARTNOINTR;
uaddr_faulted:
@@ -2633,7 +2770,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
union futex_key key = FUTEX_KEY_INIT;
struct futex_hash_bucket *hb;
- struct futex_q *match;
+ struct futex_q *top_waiter;
int ret;
retry:
@@ -2657,12 +2794,37 @@ retry:
* all and we at least want to know if user space fiddled
* with the futex value instead of blindly unlocking.
*/
- match = futex_top_waiter(hb, &key);
- if (match) {
- ret = wake_futex_pi(uaddr, uval, match, hb);
+ top_waiter = futex_top_waiter(hb, &key);
+ if (top_waiter) {
+ struct futex_pi_state *pi_state = top_waiter->pi_state;
+
+ ret = -EINVAL;
+ if (!pi_state)
+ goto out_unlock;
+
/*
- * In case of success wake_futex_pi dropped the hash
- * bucket lock.
+ * If current does not own the pi_state then the futex is
+ * inconsistent and user space fiddled with the futex value.
+ */
+ if (pi_state->owner != current)
+ goto out_unlock;
+
+ get_pi_state(pi_state);
+ /*
+ * By taking wait_lock while still holding hb->lock, we ensure
+ * there is no point where we hold neither; and therefore
+ * wake_futex_pi() must observe a state consistent with what we
+ * observed.
+ */
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ spin_unlock(&hb->lock);
+
+ ret = wake_futex_pi(uaddr, uval, pi_state);
+
+ put_pi_state(pi_state);
+
+ /*
+ * Success, we're done! No tricky corner cases.
*/
if (!ret)
goto out_putkey;
@@ -2677,7 +2839,6 @@ retry:
* setting the FUTEX_WAITERS bit. Try again.
*/
if (ret == -EAGAIN) {
- spin_unlock(&hb->lock);
put_futex_key(&key);
goto retry;
}
@@ -2685,7 +2846,7 @@ retry:
* wake_futex_pi has detected invalid state. Tell user
* space.
*/
- goto out_unlock;
+ goto out_putkey;
}
/*
@@ -2695,8 +2856,10 @@ retry:
* preserve the WAITERS bit not the OWNER_DIED one. We are the
* owner.
*/
- if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) {
+ spin_unlock(&hb->lock);
goto pi_faulted;
+ }
/*
* If uval has changed, let user space handle it.
@@ -2710,7 +2873,6 @@ out_putkey:
return ret;
pi_faulted:
- spin_unlock(&hb->lock);
put_futex_key(&key);
ret = fault_in_user_writeable(uaddr);
@@ -2814,6 +2976,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
u32 __user *uaddr2)
{
struct hrtimer_sleeper timeout, *to = NULL;
+ struct futex_pi_state *pi_state = NULL;
struct rt_mutex_waiter rt_waiter;
struct futex_hash_bucket *hb;
union futex_key key2 = FUTEX_KEY_INIT;
@@ -2840,10 +3003,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* The waiter is allocated on our stack, manipulated by the requeue
* code while we sleep on uaddr.
*/
- debug_rt_mutex_init_waiter(&rt_waiter);
- RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
- RB_CLEAR_NODE(&rt_waiter.tree_entry);
- rt_waiter.task = NULL;
+ rt_mutex_init_waiter(&rt_waiter);
ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
if (unlikely(ret != 0))
@@ -2898,8 +3058,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
if (q.pi_state && (q.pi_state->owner != current)) {
spin_lock(q.lock_ptr);
ret = fixup_pi_state_owner(uaddr2, &q, current);
- if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current)
- rt_mutex_unlock(&q.pi_state->pi_mutex);
+ if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
+ pi_state = q.pi_state;
+ get_pi_state(pi_state);
+ }
/*
* Drop the reference to the pi state which
* the requeue_pi() code acquired for us.
@@ -2917,10 +3079,13 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
*/
WARN_ON(!q.pi_state);
pi_mutex = &q.pi_state->pi_mutex;
- ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
- debug_rt_mutex_free_waiter(&rt_waiter);
+ ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
spin_lock(q.lock_ptr);
+ if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
+ ret = 0;
+
+ debug_rt_mutex_free_waiter(&rt_waiter);
/*
* Fixup the pi_state owner and possibly acquire the lock if we
* haven't already.
@@ -2938,13 +3103,20 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* the fault, unlock the rt_mutex and return the fault to
* userspace.
*/
- if (ret && rt_mutex_owner(pi_mutex) == current)
- rt_mutex_unlock(pi_mutex);
+ if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
+ pi_state = q.pi_state;
+ get_pi_state(pi_state);
+ }
/* Unqueue and drop the lock. */
unqueue_me_pi(&q);
}
+ if (pi_state) {
+ rt_mutex_futex_unlock(&pi_state->pi_mutex);
+ put_pi_state(pi_state);
+ }
+
if (ret == -EINTR) {
/*
* We've already been requeued, but cannot restart by calling
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index 2f9df37940a0..c51a49c9be70 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -98,6 +98,12 @@ void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters)
}
EXPORT_SYMBOL(__gcov_merge_icall_topn);
+void __gcov_exit(void)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_exit);
+
/**
* gcov_enable_events - enable event reporting through gcov_event()
*
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index 6a5c239c7669..46a18e72bce6 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -18,7 +18,9 @@
#include <linux/vmalloc.h>
#include "gcov.h"
-#if (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1)
+#if (__GNUC__ >= 7)
+#define GCOV_COUNTERS 9
+#elif (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1)
#define GCOV_COUNTERS 10
#elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9
#define GCOV_COUNTERS 9
diff --git a/kernel/groups.c b/kernel/groups.c
index 8dd7a61b7115..d09727692a2a 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -18,7 +18,7 @@ struct group_info *groups_alloc(int gidsetsize)
len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize;
gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY);
if (!gi)
- gi = __vmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_HIGHMEM, PAGE_KERNEL);
+ gi = __vmalloc(len, GFP_KERNEL_ACCOUNT, PAGE_KERNEL);
if (!gi)
return NULL;
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index f0f8e2a8496f..751593ed7c0b 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -43,6 +43,7 @@ unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_
int __read_mostly sysctl_hung_task_warnings = 10;
static int __read_mostly did_panic;
+static bool hung_task_show_lock;
static struct task_struct *watchdog_task;
@@ -120,12 +121,14 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
" disables this message.\n");
sched_show_task(t);
- debug_show_all_locks();
+ hung_task_show_lock = true;
}
touch_nmi_watchdog();
if (sysctl_hung_task_panic) {
+ if (hung_task_show_lock)
+ debug_show_all_locks();
trigger_all_cpu_backtrace();
panic("hung_task: blocked tasks");
}
@@ -172,6 +175,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
if (test_taint(TAINT_DIE) || did_panic)
return;
+ hung_task_show_lock = false;
rcu_read_lock();
for_each_process_thread(g, t) {
if (!max_count--)
@@ -187,6 +191,8 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
}
unlock:
rcu_read_unlock();
+ if (hung_task_show_lock)
+ debug_show_all_locks();
}
static long hung_timeout_jiffies(unsigned long last_checked,
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 4544b115f5eb..e2d356dd7581 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -59,7 +59,7 @@ static int get_nodes_in_cpumask(const struct cpumask *mask, nodemask_t *nodemsk)
struct cpumask *
irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
{
- int n, nodes, vecs_per_node, cpus_per_vec, extra_vecs, curvec;
+ int n, nodes, cpus_per_vec, extra_vecs, curvec;
int affv = nvecs - affd->pre_vectors - affd->post_vectors;
int last_affv = affv + affd->pre_vectors;
nodemask_t nodemsk = NODE_MASK_NONE;
@@ -94,19 +94,21 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
goto done;
}
- /* Spread the vectors per node */
- vecs_per_node = affv / nodes;
- /* Account for rounding errors */
- extra_vecs = affv - (nodes * vecs_per_node);
-
for_each_node_mask(n, nodemsk) {
- int ncpus, v, vecs_to_assign = vecs_per_node;
+ int ncpus, v, vecs_to_assign, vecs_per_node;
+
+ /* Spread the vectors per node */
+ vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes;
/* Get the cpus on this node which are in the mask */
cpumask_and(nmsk, cpu_online_mask, cpumask_of_node(n));
/* Calculate the number of cpus per vector */
ncpus = cpumask_weight(nmsk);
+ vecs_to_assign = min(vecs_per_node, ncpus);
+
+ /* Account for rounding errors */
+ extra_vecs = ncpus - vecs_to_assign * (ncpus / vecs_to_assign);
for (v = 0; curvec < last_affv && v < vecs_to_assign;
curvec++, v++) {
@@ -115,14 +117,14 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
/* Account for extra vectors to compensate rounding errors */
if (extra_vecs) {
cpus_per_vec++;
- if (!--extra_vecs)
- vecs_per_node++;
+ --extra_vecs;
}
irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec);
}
if (curvec >= last_affv)
break;
+ --nodes;
}
done:
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index be3c34e4f2ac..c94da688ee9b 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -348,7 +348,10 @@ void handle_nested_irq(unsigned int irq)
irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
raw_spin_unlock_irq(&desc->lock);
- action_ret = action->thread_fn(action->irq, action->dev_id);
+ action_ret = IRQ_NONE;
+ for_each_action_of_desc(desc, action)
+ action_ret |= action->thread_fn(action->irq, action->dev_id);
+
if (!noirqdebug)
note_interrupt(desc, action_ret);
@@ -877,8 +880,8 @@ irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle,
if (!desc)
return;
- __irq_do_set_handler(desc, handle, 1, NULL);
desc->irq_common_data.handler_data = data;
+ __irq_do_set_handler(desc, handle, 1, NULL);
irq_put_desc_busunlock(desc, flags);
}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index a4afe5cc5af1..070be980c37a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -852,7 +852,7 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
* This code is triggered unconditionally. Check the affinity
* mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out.
*/
- if (desc->irq_common_data.affinity)
+ if (cpumask_available(desc->irq_common_data.affinity))
cpumask_copy(mask, desc->irq_common_data.affinity);
else
valid = false;
@@ -1212,8 +1212,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
* set the trigger type must match. Also all must
* agree on ONESHOT.
*/
+ unsigned int oldtype = irqd_get_trigger_type(&desc->irq_data);
+
if (!((old->flags & new->flags) & IRQF_SHARED) ||
- ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) ||
+ (oldtype != (new->flags & IRQF_TRIGGER_MASK)) ||
((old->flags ^ new->flags) & IRQF_ONESHOT))
goto mismatch;
@@ -1557,7 +1559,7 @@ void remove_irq(unsigned int irq, struct irqaction *act)
struct irq_desc *desc = irq_to_desc(irq);
if (desc && !WARN_ON(irq_settings_is_per_cpu_devid(desc)))
- __free_irq(irq, act->dev_id);
+ __free_irq(irq, act->dev_id);
}
EXPORT_SYMBOL_GPL(remove_irq);
@@ -1574,20 +1576,27 @@ EXPORT_SYMBOL_GPL(remove_irq);
* have completed.
*
* This function must not be called from interrupt context.
+ *
+ * Returns the devname argument passed to request_irq.
*/
-void free_irq(unsigned int irq, void *dev_id)
+const void *free_irq(unsigned int irq, void *dev_id)
{
struct irq_desc *desc = irq_to_desc(irq);
+ struct irqaction *action;
+ const char *devname;
if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc)))
- return;
+ return NULL;
#ifdef CONFIG_SMP
if (WARN_ON(desc->affinity_notify))
desc->affinity_notify = NULL;
#endif
- kfree(__free_irq(irq, dev_id));
+ action = __free_irq(irq, dev_id);
+ devname = action->name;
+ kfree(action);
+ return devname;
}
EXPORT_SYMBOL(free_irq);
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 85e5546cd791..cd771993f96f 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -60,15 +60,8 @@ void notrace __sanitizer_cov_trace_pc(void)
/*
* We are interested in code coverage as a function of a syscall inputs,
* so we ignore code executed in interrupts.
- * The checks for whether we are in an interrupt are open-coded, because
- * 1. We can't use in_interrupt() here, since it also returns true
- * when we are inside local_bh_disable() section.
- * 2. We don't want to use (in_irq() | in_serving_softirq() | in_nmi()),
- * since that leads to slower generated code (three separate tests,
- * one for each of the flags).
*/
- if (!t || (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET
- | NMI_MASK)))
+ if (!t || !in_task())
return;
mode = READ_ONCE(t->kcov_mode);
if (mode == KCOV_MODE_TRACE) {
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index bfe62d5b3872..ae1a3ba24df5 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -51,12 +51,6 @@ DEFINE_MUTEX(kexec_mutex);
/* Per cpu memory for storing cpu states in case of system crash. */
note_buf_t __percpu *crash_notes;
-/* vmcoreinfo stuff */
-static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
-u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
-size_t vmcoreinfo_size;
-size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
-
/* Flag to indicate we are going to kexec a new kernel */
bool kexec_in_progress = false;
@@ -996,34 +990,6 @@ unlock:
return ret;
}
-static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
- size_t data_len)
-{
- struct elf_note note;
-
- note.n_namesz = strlen(name) + 1;
- note.n_descsz = data_len;
- note.n_type = type;
- memcpy(buf, &note, sizeof(note));
- buf += (sizeof(note) + 3)/4;
- memcpy(buf, name, note.n_namesz);
- buf += (note.n_namesz + 3)/4;
- memcpy(buf, data, note.n_descsz);
- buf += (note.n_descsz + 3)/4;
-
- return buf;
-}
-
-static void final_note(u32 *buf)
-{
- struct elf_note note;
-
- note.n_namesz = 0;
- note.n_descsz = 0;
- note.n_type = 0;
- memcpy(buf, &note, sizeof(note));
-}
-
void crash_save_cpu(struct pt_regs *regs, int cpu)
{
struct elf_prstatus prstatus;
@@ -1085,403 +1051,6 @@ subsys_initcall(crash_notes_memory_init);
/*
- * parsing the "crashkernel" commandline
- *
- * this code is intended to be called from architecture specific code
- */
-
-
-/*
- * This function parses command lines in the format
- *
- * crashkernel=ramsize-range:size[,...][@offset]
- *
- * The function returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_mem(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- char *cur = cmdline, *tmp;
-
- /* for each entry of the comma-separated list */
- do {
- unsigned long long start, end = ULLONG_MAX, size;
-
- /* get the start of the range */
- start = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("crashkernel: Memory value expected\n");
- return -EINVAL;
- }
- cur = tmp;
- if (*cur != '-') {
- pr_warn("crashkernel: '-' expected\n");
- return -EINVAL;
- }
- cur++;
-
- /* if no ':' is here, than we read the end */
- if (*cur != ':') {
- end = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("crashkernel: Memory value expected\n");
- return -EINVAL;
- }
- cur = tmp;
- if (end <= start) {
- pr_warn("crashkernel: end <= start\n");
- return -EINVAL;
- }
- }
-
- if (*cur != ':') {
- pr_warn("crashkernel: ':' expected\n");
- return -EINVAL;
- }
- cur++;
-
- size = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("Memory value expected\n");
- return -EINVAL;
- }
- cur = tmp;
- if (size >= system_ram) {
- pr_warn("crashkernel: invalid size\n");
- return -EINVAL;
- }
-
- /* match ? */
- if (system_ram >= start && system_ram < end) {
- *crash_size = size;
- break;
- }
- } while (*cur++ == ',');
-
- if (*crash_size > 0) {
- while (*cur && *cur != ' ' && *cur != '@')
- cur++;
- if (*cur == '@') {
- cur++;
- *crash_base = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("Memory value expected after '@'\n");
- return -EINVAL;
- }
- }
- }
-
- return 0;
-}
-
-/*
- * That function parses "simple" (old) crashkernel command lines like
- *
- * crashkernel=size[@offset]
- *
- * It returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_simple(char *cmdline,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- char *cur = cmdline;
-
- *crash_size = memparse(cmdline, &cur);
- if (cmdline == cur) {
- pr_warn("crashkernel: memory value expected\n");
- return -EINVAL;
- }
-
- if (*cur == '@')
- *crash_base = memparse(cur+1, &cur);
- else if (*cur != ' ' && *cur != '\0') {
- pr_warn("crashkernel: unrecognized char: %c\n", *cur);
- return -EINVAL;
- }
-
- return 0;
-}
-
-#define SUFFIX_HIGH 0
-#define SUFFIX_LOW 1
-#define SUFFIX_NULL 2
-static __initdata char *suffix_tbl[] = {
- [SUFFIX_HIGH] = ",high",
- [SUFFIX_LOW] = ",low",
- [SUFFIX_NULL] = NULL,
-};
-
-/*
- * That function parses "suffix" crashkernel command lines like
- *
- * crashkernel=size,[high|low]
- *
- * It returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_suffix(char *cmdline,
- unsigned long long *crash_size,
- const char *suffix)
-{
- char *cur = cmdline;
-
- *crash_size = memparse(cmdline, &cur);
- if (cmdline == cur) {
- pr_warn("crashkernel: memory value expected\n");
- return -EINVAL;
- }
-
- /* check with suffix */
- if (strncmp(cur, suffix, strlen(suffix))) {
- pr_warn("crashkernel: unrecognized char: %c\n", *cur);
- return -EINVAL;
- }
- cur += strlen(suffix);
- if (*cur != ' ' && *cur != '\0') {
- pr_warn("crashkernel: unrecognized char: %c\n", *cur);
- return -EINVAL;
- }
-
- return 0;
-}
-
-static __init char *get_last_crashkernel(char *cmdline,
- const char *name,
- const char *suffix)
-{
- char *p = cmdline, *ck_cmdline = NULL;
-
- /* find crashkernel and use the last one if there are more */
- p = strstr(p, name);
- while (p) {
- char *end_p = strchr(p, ' ');
- char *q;
-
- if (!end_p)
- end_p = p + strlen(p);
-
- if (!suffix) {
- int i;
-
- /* skip the one with any known suffix */
- for (i = 0; suffix_tbl[i]; i++) {
- q = end_p - strlen(suffix_tbl[i]);
- if (!strncmp(q, suffix_tbl[i],
- strlen(suffix_tbl[i])))
- goto next;
- }
- ck_cmdline = p;
- } else {
- q = end_p - strlen(suffix);
- if (!strncmp(q, suffix, strlen(suffix)))
- ck_cmdline = p;
- }
-next:
- p = strstr(p+1, name);
- }
-
- if (!ck_cmdline)
- return NULL;
-
- return ck_cmdline;
-}
-
-static int __init __parse_crashkernel(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base,
- const char *name,
- const char *suffix)
-{
- char *first_colon, *first_space;
- char *ck_cmdline;
-
- BUG_ON(!crash_size || !crash_base);
- *crash_size = 0;
- *crash_base = 0;
-
- ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
-
- if (!ck_cmdline)
- return -EINVAL;
-
- ck_cmdline += strlen(name);
-
- if (suffix)
- return parse_crashkernel_suffix(ck_cmdline, crash_size,
- suffix);
- /*
- * if the commandline contains a ':', then that's the extended
- * syntax -- if not, it must be the classic syntax
- */
- first_colon = strchr(ck_cmdline, ':');
- first_space = strchr(ck_cmdline, ' ');
- if (first_colon && (!first_space || first_colon < first_space))
- return parse_crashkernel_mem(ck_cmdline, system_ram,
- crash_size, crash_base);
-
- return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
-}
-
-/*
- * That function is the entry point for command line parsing and should be
- * called from the arch-specific code.
- */
-int __init parse_crashkernel(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
- "crashkernel=", NULL);
-}
-
-int __init parse_crashkernel_high(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
- "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
-}
-
-int __init parse_crashkernel_low(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
- "crashkernel=", suffix_tbl[SUFFIX_LOW]);
-}
-
-static void update_vmcoreinfo_note(void)
-{
- u32 *buf = vmcoreinfo_note;
-
- if (!vmcoreinfo_size)
- return;
- buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
- vmcoreinfo_size);
- final_note(buf);
-}
-
-void crash_save_vmcoreinfo(void)
-{
- vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
- update_vmcoreinfo_note();
-}
-
-void vmcoreinfo_append_str(const char *fmt, ...)
-{
- va_list args;
- char buf[0x50];
- size_t r;
-
- va_start(args, fmt);
- r = vscnprintf(buf, sizeof(buf), fmt, args);
- va_end(args);
-
- r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
-
- memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
-
- vmcoreinfo_size += r;
-}
-
-/*
- * provide an empty default implementation here -- architecture
- * code may override this
- */
-void __weak arch_crash_save_vmcoreinfo(void)
-{}
-
-phys_addr_t __weak paddr_vmcoreinfo_note(void)
-{
- return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note);
-}
-
-static int __init crash_save_vmcoreinfo_init(void)
-{
- VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
- VMCOREINFO_PAGESIZE(PAGE_SIZE);
-
- VMCOREINFO_SYMBOL(init_uts_ns);
- VMCOREINFO_SYMBOL(node_online_map);
-#ifdef CONFIG_MMU
- VMCOREINFO_SYMBOL(swapper_pg_dir);
-#endif
- VMCOREINFO_SYMBOL(_stext);
- VMCOREINFO_SYMBOL(vmap_area_list);
-
-#ifndef CONFIG_NEED_MULTIPLE_NODES
- VMCOREINFO_SYMBOL(mem_map);
- VMCOREINFO_SYMBOL(contig_page_data);
-#endif
-#ifdef CONFIG_SPARSEMEM
- VMCOREINFO_SYMBOL(mem_section);
- VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
- VMCOREINFO_STRUCT_SIZE(mem_section);
- VMCOREINFO_OFFSET(mem_section, section_mem_map);
-#endif
- VMCOREINFO_STRUCT_SIZE(page);
- VMCOREINFO_STRUCT_SIZE(pglist_data);
- VMCOREINFO_STRUCT_SIZE(zone);
- VMCOREINFO_STRUCT_SIZE(free_area);
- VMCOREINFO_STRUCT_SIZE(list_head);
- VMCOREINFO_SIZE(nodemask_t);
- VMCOREINFO_OFFSET(page, flags);
- VMCOREINFO_OFFSET(page, _refcount);
- VMCOREINFO_OFFSET(page, mapping);
- VMCOREINFO_OFFSET(page, lru);
- VMCOREINFO_OFFSET(page, _mapcount);
- VMCOREINFO_OFFSET(page, private);
- VMCOREINFO_OFFSET(page, compound_dtor);
- VMCOREINFO_OFFSET(page, compound_order);
- VMCOREINFO_OFFSET(page, compound_head);
- VMCOREINFO_OFFSET(pglist_data, node_zones);
- VMCOREINFO_OFFSET(pglist_data, nr_zones);
-#ifdef CONFIG_FLAT_NODE_MEM_MAP
- VMCOREINFO_OFFSET(pglist_data, node_mem_map);
-#endif
- VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
- VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
- VMCOREINFO_OFFSET(pglist_data, node_id);
- VMCOREINFO_OFFSET(zone, free_area);
- VMCOREINFO_OFFSET(zone, vm_stat);
- VMCOREINFO_OFFSET(zone, spanned_pages);
- VMCOREINFO_OFFSET(free_area, free_list);
- VMCOREINFO_OFFSET(list_head, next);
- VMCOREINFO_OFFSET(list_head, prev);
- VMCOREINFO_OFFSET(vmap_area, va_start);
- VMCOREINFO_OFFSET(vmap_area, list);
- VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
- log_buf_kexec_setup();
- VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
- VMCOREINFO_NUMBER(NR_FREE_PAGES);
- VMCOREINFO_NUMBER(PG_lru);
- VMCOREINFO_NUMBER(PG_private);
- VMCOREINFO_NUMBER(PG_swapcache);
- VMCOREINFO_NUMBER(PG_slab);
-#ifdef CONFIG_MEMORY_FAILURE
- VMCOREINFO_NUMBER(PG_hwpoison);
-#endif
- VMCOREINFO_NUMBER(PG_head_mask);
- VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
-#ifdef CONFIG_HUGETLB_PAGE
- VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
-#endif
-
- arch_crash_save_vmcoreinfo();
- update_vmcoreinfo_note();
-
- return 0;
-}
-
-subsys_initcall(crash_save_vmcoreinfo_init);
-
-/*
* Move into place and start executing a preloaded standalone
* executable. If nothing was preloaded return an error.
*/
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 699c5bc51a92..adfe3b4cfe05 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -58,15 +58,6 @@
#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
-/*
- * Some oddball architectures like 64bit powerpc have function descriptors
- * so this must be overridable.
- */
-#ifndef kprobe_lookup_name
-#define kprobe_lookup_name(name, addr) \
- addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name)))
-#endif
-
static int kprobes_initialized;
static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
@@ -81,6 +72,12 @@ static struct {
raw_spinlock_t lock ____cacheline_aligned_in_smp;
} kretprobe_table_locks[KPROBE_TABLE_SIZE];
+kprobe_opcode_t * __weak kprobe_lookup_name(const char *name,
+ unsigned int __unused)
+{
+ return ((kprobe_opcode_t *)(kallsyms_lookup_name(name)));
+}
+
static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
{
return &(kretprobe_table_locks[hash].lock);
@@ -125,7 +122,7 @@ static void *alloc_insn_page(void)
return module_alloc(PAGE_SIZE);
}
-static void free_insn_page(void *page)
+void __weak free_insn_page(void *page)
{
module_memfree(page);
}
@@ -598,7 +595,7 @@ static void kprobe_optimizer(struct work_struct *work)
}
/* Wait for completing optimization and unoptimization */
-static void wait_for_kprobe_optimizer(void)
+void wait_for_kprobe_optimizer(void)
{
mutex_lock(&kprobe_mutex);
@@ -746,13 +743,20 @@ static void kill_optimized_kprobe(struct kprobe *p)
arch_remove_optimized_kprobe(op);
}
+static inline
+void __prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p)
+{
+ if (!kprobe_ftrace(p))
+ arch_prepare_optimized_kprobe(op, p);
+}
+
/* Try to prepare optimized instructions */
static void prepare_optimized_kprobe(struct kprobe *p)
{
struct optimized_kprobe *op;
op = container_of(p, struct optimized_kprobe, kp);
- arch_prepare_optimized_kprobe(op, p);
+ __prepare_optimized_kprobe(op, p);
}
/* Allocate new optimized_kprobe and try to prepare optimized instructions */
@@ -766,7 +770,7 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
INIT_LIST_HEAD(&op->list);
op->kp.addr = p->addr;
- arch_prepare_optimized_kprobe(op, p);
+ __prepare_optimized_kprobe(op, p);
return &op->kp;
}
@@ -1391,21 +1395,19 @@ bool within_kprobe_blacklist(unsigned long addr)
* This returns encoded errors if it fails to look up symbol or invalid
* combination of parameters.
*/
-static kprobe_opcode_t *kprobe_addr(struct kprobe *p)
+static kprobe_opcode_t *_kprobe_addr(kprobe_opcode_t *addr,
+ const char *symbol_name, unsigned int offset)
{
- kprobe_opcode_t *addr = p->addr;
-
- if ((p->symbol_name && p->addr) ||
- (!p->symbol_name && !p->addr))
+ if ((symbol_name && addr) || (!symbol_name && !addr))
goto invalid;
- if (p->symbol_name) {
- kprobe_lookup_name(p->symbol_name, addr);
+ if (symbol_name) {
+ addr = kprobe_lookup_name(symbol_name, offset);
if (!addr)
return ERR_PTR(-ENOENT);
}
- addr = (kprobe_opcode_t *)(((char *)addr) + p->offset);
+ addr = (kprobe_opcode_t *)(((char *)addr) + offset);
if (addr)
return addr;
@@ -1413,6 +1415,11 @@ invalid:
return ERR_PTR(-EINVAL);
}
+static kprobe_opcode_t *kprobe_addr(struct kprobe *p)
+{
+ return _kprobe_addr(p->addr, p->symbol_name, p->offset);
+}
+
/* Check passed kprobe is valid and return kprobe in kprobe_table. */
static struct kprobe *__get_valid_kprobe(struct kprobe *p)
{
@@ -1740,11 +1747,12 @@ void unregister_kprobes(struct kprobe **kps, int num)
}
EXPORT_SYMBOL_GPL(unregister_kprobes);
-int __weak __kprobes kprobe_exceptions_notify(struct notifier_block *self,
- unsigned long val, void *data)
+int __weak kprobe_exceptions_notify(struct notifier_block *self,
+ unsigned long val, void *data)
{
return NOTIFY_DONE;
}
+NOKPROBE_SYMBOL(kprobe_exceptions_notify);
static struct notifier_block kprobe_exceptions_nb = {
.notifier_call = kprobe_exceptions_notify,
@@ -1875,6 +1883,25 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
}
NOKPROBE_SYMBOL(pre_handler_kretprobe);
+bool __weak arch_function_offset_within_entry(unsigned long offset)
+{
+ return !offset;
+}
+
+bool function_offset_within_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset)
+{
+ kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset);
+
+ if (IS_ERR(kp_addr))
+ return false;
+
+ if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset) ||
+ !arch_function_offset_within_entry(offset))
+ return false;
+
+ return true;
+}
+
int register_kretprobe(struct kretprobe *rp)
{
int ret = 0;
@@ -1882,6 +1909,9 @@ int register_kretprobe(struct kretprobe *rp)
int i;
void *addr;
+ if (!function_offset_within_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset))
+ return -EINVAL;
+
if (kretprobe_blacklist_size) {
addr = kprobe_addr(&rp->kp);
if (IS_ERR(addr))
@@ -2153,6 +2183,12 @@ static int kprobes_module_callback(struct notifier_block *nb,
* The vaddr this probe is installed will soon
* be vfreed buy not synced to disk. Hence,
* disarming the breakpoint isn't needed.
+ *
+ * Note, this will also move any optimized probes
+ * that are pending to be removed from their
+ * corresponding lists to the freeing_list and
+ * will not be touched by the delayed
+ * kprobe_optimizer work handler.
*/
kill_kprobe(p);
}
@@ -2192,8 +2228,8 @@ static int __init init_kprobes(void)
if (kretprobe_blacklist_size) {
/* lookup the function address from its name */
for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
- kprobe_lookup_name(kretprobe_blacklist[i].name,
- kretprobe_blacklist[i].addr);
+ kretprobe_blacklist[i].addr =
+ kprobe_lookup_name(kretprobe_blacklist[i].name, 0);
if (!kretprobe_blacklist[i].addr)
printk("kretprobe: lookup failed: %s\n",
kretprobe_blacklist[i].name);
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 0999679d6f26..23cd70651238 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -125,6 +125,10 @@ static ssize_t kexec_crash_size_store(struct kobject *kobj,
}
KERNEL_ATTR_RW(kexec_crash_size);
+#endif /* CONFIG_KEXEC_CORE */
+
+#ifdef CONFIG_CRASH_CORE
+
static ssize_t vmcoreinfo_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -134,7 +138,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
}
KERNEL_ATTR_RO(vmcoreinfo);
-#endif /* CONFIG_KEXEC_CORE */
+#endif /* CONFIG_CRASH_CORE */
/* whether file capabilities are enabled */
static ssize_t fscaps_show(struct kobject *kobj,
@@ -219,6 +223,8 @@ static struct attribute * kernel_attrs[] = {
&kexec_loaded_attr.attr,
&kexec_crash_loaded_attr.attr,
&kexec_crash_size_attr.attr,
+#endif
+#ifdef CONFIG_CRASH_CORE
&vmcoreinfo_attr.attr,
#endif
#ifndef CONFIG_TINY_RCU
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 2f26adea0f84..26db528c1d88 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -20,6 +20,7 @@
#include <linux/freezer.h>
#include <linux/ptrace.h>
#include <linux/uaccess.h>
+#include <linux/cgroup.h>
#include <trace/events/sched.h>
static DEFINE_SPINLOCK(kthread_create_lock);
@@ -225,6 +226,7 @@ static int kthread(void *_create)
ret = -EINTR;
if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) {
+ cgroup_kthread_ready();
__kthread_parkme(self);
ret = threadfn(data);
}
@@ -538,6 +540,7 @@ int kthreadd(void *unused)
set_mems_allowed(node_states[N_MEMORY]);
current->flags |= PF_NOFREEZE;
+ cgroup_init_kthreadd();
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig
index 045022557936..ec4565122e65 100644
--- a/kernel/livepatch/Kconfig
+++ b/kernel/livepatch/Kconfig
@@ -10,6 +10,7 @@ config LIVEPATCH
depends on SYSFS
depends on KALLSYMS_ALL
depends on HAVE_LIVEPATCH
+ depends on !TRIM_UNUSED_KSYMS
help
Say Y here if you want to support kernel live patching.
This option has no runtime impact until a kernel "patch"
diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile
index e8780c0901d9..2b8bdb1925da 100644
--- a/kernel/livepatch/Makefile
+++ b/kernel/livepatch/Makefile
@@ -1,3 +1,3 @@
obj-$(CONFIG_LIVEPATCH) += livepatch.o
-livepatch-objs := core.o
+livepatch-objs := core.o patch.o transition.o
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index af4643873e71..b9628e43c78f 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -24,61 +24,31 @@
#include <linux/kernel.h>
#include <linux/mutex.h>
#include <linux/slab.h>
-#include <linux/ftrace.h>
#include <linux/list.h>
#include <linux/kallsyms.h>
#include <linux/livepatch.h>
#include <linux/elf.h>
#include <linux/moduleloader.h>
+#include <linux/completion.h>
#include <asm/cacheflush.h>
-
-/**
- * struct klp_ops - structure for tracking registered ftrace ops structs
- *
- * A single ftrace_ops is shared between all enabled replacement functions
- * (klp_func structs) which have the same old_addr. This allows the switch
- * between function versions to happen instantaneously by updating the klp_ops
- * struct's func_stack list. The winner is the klp_func at the top of the
- * func_stack (front of the list).
- *
- * @node: node for the global klp_ops list
- * @func_stack: list head for the stack of klp_func's (active func is on top)
- * @fops: registered ftrace ops struct
- */
-struct klp_ops {
- struct list_head node;
- struct list_head func_stack;
- struct ftrace_ops fops;
-};
+#include "core.h"
+#include "patch.h"
+#include "transition.h"
/*
- * The klp_mutex protects the global lists and state transitions of any
- * structure reachable from them. References to any structure must be obtained
- * under mutex protection (except in klp_ftrace_handler(), which uses RCU to
- * ensure it gets consistent data).
+ * klp_mutex is a coarse lock which serializes access to klp data. All
+ * accesses to klp-related variables and structures must have mutex protection,
+ * except within the following functions which carefully avoid the need for it:
+ *
+ * - klp_ftrace_handler()
+ * - klp_update_patch_state()
*/
-static DEFINE_MUTEX(klp_mutex);
+DEFINE_MUTEX(klp_mutex);
static LIST_HEAD(klp_patches);
-static LIST_HEAD(klp_ops);
static struct kobject *klp_root_kobj;
-static struct klp_ops *klp_find_ops(unsigned long old_addr)
-{
- struct klp_ops *ops;
- struct klp_func *func;
-
- list_for_each_entry(ops, &klp_ops, node) {
- func = list_first_entry(&ops->func_stack, struct klp_func,
- stack_node);
- if (func->old_addr == old_addr)
- return ops;
- }
-
- return NULL;
-}
-
static bool klp_is_module(struct klp_object *obj)
{
return obj->name;
@@ -117,7 +87,6 @@ static void klp_find_object_module(struct klp_object *obj)
mutex_unlock(&module_mutex);
}
-/* klp_mutex must be held by caller */
static bool klp_is_patch_registered(struct klp_patch *patch)
{
struct klp_patch *mypatch;
@@ -182,7 +151,10 @@ static int klp_find_object_symbol(const char *objname, const char *name,
};
mutex_lock(&module_mutex);
- kallsyms_on_each_symbol(klp_find_callback, &args);
+ if (objname)
+ module_kallsyms_on_each_symbol(klp_find_callback, &args);
+ else
+ kallsyms_on_each_symbol(klp_find_callback, &args);
mutex_unlock(&module_mutex);
/*
@@ -233,7 +205,7 @@ static int klp_resolve_symbols(Elf_Shdr *relasec, struct module *pmod)
for (i = 0; i < relasec->sh_size / sizeof(Elf_Rela); i++) {
sym = pmod->core_kallsyms.symtab + ELF_R_SYM(relas[i].r_info);
if (sym->st_shndx != SHN_LIVEPATCH) {
- pr_err("symbol %s is not marked as a livepatch symbol",
+ pr_err("symbol %s is not marked as a livepatch symbol\n",
strtab + sym->st_name);
return -EINVAL;
}
@@ -243,7 +215,7 @@ static int klp_resolve_symbols(Elf_Shdr *relasec, struct module *pmod)
".klp.sym.%55[^.].%127[^,],%lu",
objname, symname, &sympos);
if (cnt != 3) {
- pr_err("symbol %s has an incorrectly formatted name",
+ pr_err("symbol %s has an incorrectly formatted name\n",
strtab + sym->st_name);
return -EINVAL;
}
@@ -288,7 +260,7 @@ static int klp_write_object_relocations(struct module *pmod,
*/
cnt = sscanf(secname, ".klp.rela.%55[^.]", sec_objname);
if (cnt != 1) {
- pr_err("section %s has an incorrectly formatted name",
+ pr_err("section %s has an incorrectly formatted name\n",
secname);
ret = -EINVAL;
break;
@@ -311,191 +283,30 @@ static int klp_write_object_relocations(struct module *pmod,
return ret;
}
-static void notrace klp_ftrace_handler(unsigned long ip,
- unsigned long parent_ip,
- struct ftrace_ops *fops,
- struct pt_regs *regs)
-{
- struct klp_ops *ops;
- struct klp_func *func;
-
- ops = container_of(fops, struct klp_ops, fops);
-
- rcu_read_lock();
- func = list_first_or_null_rcu(&ops->func_stack, struct klp_func,
- stack_node);
- if (WARN_ON_ONCE(!func))
- goto unlock;
-
- klp_arch_set_pc(regs, (unsigned long)func->new_func);
-unlock:
- rcu_read_unlock();
-}
-
-/*
- * Convert a function address into the appropriate ftrace location.
- *
- * Usually this is just the address of the function, but on some architectures
- * it's more complicated so allow them to provide a custom behaviour.
- */
-#ifndef klp_get_ftrace_location
-static unsigned long klp_get_ftrace_location(unsigned long faddr)
-{
- return faddr;
-}
-#endif
-
-static void klp_disable_func(struct klp_func *func)
-{
- struct klp_ops *ops;
-
- if (WARN_ON(func->state != KLP_ENABLED))
- return;
- if (WARN_ON(!func->old_addr))
- return;
-
- ops = klp_find_ops(func->old_addr);
- if (WARN_ON(!ops))
- return;
-
- if (list_is_singular(&ops->func_stack)) {
- unsigned long ftrace_loc;
-
- ftrace_loc = klp_get_ftrace_location(func->old_addr);
- if (WARN_ON(!ftrace_loc))
- return;
-
- WARN_ON(unregister_ftrace_function(&ops->fops));
- WARN_ON(ftrace_set_filter_ip(&ops->fops, ftrace_loc, 1, 0));
-
- list_del_rcu(&func->stack_node);
- list_del(&ops->node);
- kfree(ops);
- } else {
- list_del_rcu(&func->stack_node);
- }
-
- func->state = KLP_DISABLED;
-}
-
-static int klp_enable_func(struct klp_func *func)
-{
- struct klp_ops *ops;
- int ret;
-
- if (WARN_ON(!func->old_addr))
- return -EINVAL;
-
- if (WARN_ON(func->state != KLP_DISABLED))
- return -EINVAL;
-
- ops = klp_find_ops(func->old_addr);
- if (!ops) {
- unsigned long ftrace_loc;
-
- ftrace_loc = klp_get_ftrace_location(func->old_addr);
- if (!ftrace_loc) {
- pr_err("failed to find location for function '%s'\n",
- func->old_name);
- return -EINVAL;
- }
-
- ops = kzalloc(sizeof(*ops), GFP_KERNEL);
- if (!ops)
- return -ENOMEM;
-
- ops->fops.func = klp_ftrace_handler;
- ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS |
- FTRACE_OPS_FL_DYNAMIC |
- FTRACE_OPS_FL_IPMODIFY;
-
- list_add(&ops->node, &klp_ops);
-
- INIT_LIST_HEAD(&ops->func_stack);
- list_add_rcu(&func->stack_node, &ops->func_stack);
-
- ret = ftrace_set_filter_ip(&ops->fops, ftrace_loc, 0, 0);
- if (ret) {
- pr_err("failed to set ftrace filter for function '%s' (%d)\n",
- func->old_name, ret);
- goto err;
- }
-
- ret = register_ftrace_function(&ops->fops);
- if (ret) {
- pr_err("failed to register ftrace handler for function '%s' (%d)\n",
- func->old_name, ret);
- ftrace_set_filter_ip(&ops->fops, ftrace_loc, 1, 0);
- goto err;
- }
-
-
- } else {
- list_add_rcu(&func->stack_node, &ops->func_stack);
- }
-
- func->state = KLP_ENABLED;
-
- return 0;
-
-err:
- list_del_rcu(&func->stack_node);
- list_del(&ops->node);
- kfree(ops);
- return ret;
-}
-
-static void klp_disable_object(struct klp_object *obj)
-{
- struct klp_func *func;
-
- klp_for_each_func(obj, func)
- if (func->state == KLP_ENABLED)
- klp_disable_func(func);
-
- obj->state = KLP_DISABLED;
-}
-
-static int klp_enable_object(struct klp_object *obj)
-{
- struct klp_func *func;
- int ret;
-
- if (WARN_ON(obj->state != KLP_DISABLED))
- return -EINVAL;
-
- if (WARN_ON(!klp_is_object_loaded(obj)))
- return -EINVAL;
-
- klp_for_each_func(obj, func) {
- ret = klp_enable_func(func);
- if (ret) {
- klp_disable_object(obj);
- return ret;
- }
- }
- obj->state = KLP_ENABLED;
-
- return 0;
-}
-
static int __klp_disable_patch(struct klp_patch *patch)
{
- struct klp_object *obj;
+ if (klp_transition_patch)
+ return -EBUSY;
/* enforce stacking: only the last enabled patch can be disabled */
if (!list_is_last(&patch->list, &klp_patches) &&
- list_next_entry(patch, list)->state == KLP_ENABLED)
+ list_next_entry(patch, list)->enabled)
return -EBUSY;
- pr_notice("disabling patch '%s'\n", patch->mod->name);
+ klp_init_transition(patch, KLP_UNPATCHED);
- klp_for_each_object(patch, obj) {
- if (obj->state == KLP_ENABLED)
- klp_disable_object(obj);
- }
+ /*
+ * Enforce the order of the func->transition writes in
+ * klp_init_transition() and the TIF_PATCH_PENDING writes in
+ * klp_start_transition(). In the rare case where klp_ftrace_handler()
+ * is called shortly after klp_update_patch_state() switches the task,
+ * this ensures the handler sees that func->transition is set.
+ */
+ smp_wmb();
- patch->state = KLP_DISABLED;
+ klp_start_transition();
+ klp_try_complete_transition();
+ patch->enabled = false;
return 0;
}
@@ -519,7 +330,7 @@ int klp_disable_patch(struct klp_patch *patch)
goto err;
}
- if (patch->state == KLP_DISABLED) {
+ if (!patch->enabled) {
ret = -EINVAL;
goto err;
}
@@ -537,32 +348,61 @@ static int __klp_enable_patch(struct klp_patch *patch)
struct klp_object *obj;
int ret;
- if (WARN_ON(patch->state != KLP_DISABLED))
+ if (klp_transition_patch)
+ return -EBUSY;
+
+ if (WARN_ON(patch->enabled))
return -EINVAL;
/* enforce stacking: only the first disabled patch can be enabled */
if (patch->list.prev != &klp_patches &&
- list_prev_entry(patch, list)->state == KLP_DISABLED)
+ !list_prev_entry(patch, list)->enabled)
return -EBUSY;
+ /*
+ * A reference is taken on the patch module to prevent it from being
+ * unloaded.
+ *
+ * Note: For immediate (no consistency model) patches we don't allow
+ * patch modules to unload since there is no safe/sane method to
+ * determine if a thread is still running in the patched code contained
+ * in the patch module once the ftrace registration is successful.
+ */
+ if (!try_module_get(patch->mod))
+ return -ENODEV;
+
pr_notice("enabling patch '%s'\n", patch->mod->name);
+ klp_init_transition(patch, KLP_PATCHED);
+
+ /*
+ * Enforce the order of the func->transition writes in
+ * klp_init_transition() and the ops->func_stack writes in
+ * klp_patch_object(), so that klp_ftrace_handler() will see the
+ * func->transition updates before the handler is registered and the
+ * new funcs become visible to the handler.
+ */
+ smp_wmb();
+
klp_for_each_object(patch, obj) {
if (!klp_is_object_loaded(obj))
continue;
- ret = klp_enable_object(obj);
- if (ret)
- goto unregister;
+ ret = klp_patch_object(obj);
+ if (ret) {
+ pr_warn("failed to enable patch '%s'\n",
+ patch->mod->name);
+
+ klp_cancel_transition();
+ return ret;
+ }
}
- patch->state = KLP_ENABLED;
+ klp_start_transition();
+ klp_try_complete_transition();
+ patch->enabled = true;
return 0;
-
-unregister:
- WARN_ON(__klp_disable_patch(patch));
- return ret;
}
/**
@@ -599,6 +439,7 @@ EXPORT_SYMBOL_GPL(klp_enable_patch);
* /sys/kernel/livepatch
* /sys/kernel/livepatch/<patch>
* /sys/kernel/livepatch/<patch>/enabled
+ * /sys/kernel/livepatch/<patch>/transition
* /sys/kernel/livepatch/<patch>/<object>
* /sys/kernel/livepatch/<patch>/<object>/<function,sympos>
*/
@@ -608,26 +449,34 @@ static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr,
{
struct klp_patch *patch;
int ret;
- unsigned long val;
+ bool enabled;
- ret = kstrtoul(buf, 10, &val);
+ ret = kstrtobool(buf, &enabled);
if (ret)
- return -EINVAL;
-
- if (val != KLP_DISABLED && val != KLP_ENABLED)
- return -EINVAL;
+ return ret;
patch = container_of(kobj, struct klp_patch, kobj);
mutex_lock(&klp_mutex);
- if (val == patch->state) {
+ if (!klp_is_patch_registered(patch)) {
+ /*
+ * Module with the patch could either disappear meanwhile or is
+ * not properly initialized yet.
+ */
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (patch->enabled == enabled) {
/* already in requested state */
ret = -EINVAL;
goto err;
}
- if (val == KLP_ENABLED) {
+ if (patch == klp_transition_patch) {
+ klp_reverse_transition();
+ } else if (enabled) {
ret = __klp_enable_patch(patch);
if (ret)
goto err;
@@ -652,21 +501,33 @@ static ssize_t enabled_show(struct kobject *kobj,
struct klp_patch *patch;
patch = container_of(kobj, struct klp_patch, kobj);
- return snprintf(buf, PAGE_SIZE-1, "%d\n", patch->state);
+ return snprintf(buf, PAGE_SIZE-1, "%d\n", patch->enabled);
+}
+
+static ssize_t transition_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct klp_patch *patch;
+
+ patch = container_of(kobj, struct klp_patch, kobj);
+ return snprintf(buf, PAGE_SIZE-1, "%d\n",
+ patch == klp_transition_patch);
}
static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled);
+static struct kobj_attribute transition_kobj_attr = __ATTR_RO(transition);
static struct attribute *klp_patch_attrs[] = {
&enabled_kobj_attr.attr,
+ &transition_kobj_attr.attr,
NULL
};
static void klp_kobj_release_patch(struct kobject *kobj)
{
- /*
- * Once we have a consistency model we'll need to module_put() the
- * patch module here. See klp_register_patch() for more details.
- */
+ struct klp_patch *patch;
+
+ patch = container_of(kobj, struct klp_patch, kobj);
+ complete(&patch->finish);
}
static struct kobj_type klp_ktype_patch = {
@@ -737,7 +598,6 @@ static void klp_free_patch(struct klp_patch *patch)
klp_free_objects_limited(patch, NULL);
if (!list_empty(&patch->list))
list_del(&patch->list);
- kobject_put(&patch->kobj);
}
static int klp_init_func(struct klp_object *obj, struct klp_func *func)
@@ -746,7 +606,8 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func)
return -EINVAL;
INIT_LIST_HEAD(&func->stack_node);
- func->state = KLP_DISABLED;
+ func->patched = false;
+ func->transition = false;
/* The format for the sysfs directory is <function,sympos> where sympos
* is the nth occurrence of this symbol in kallsyms for the patched
@@ -787,6 +648,22 @@ static int klp_init_object_loaded(struct klp_patch *patch,
&func->old_addr);
if (ret)
return ret;
+
+ ret = kallsyms_lookup_size_offset(func->old_addr,
+ &func->old_size, NULL);
+ if (!ret) {
+ pr_err("kallsyms size lookup failed for '%s'\n",
+ func->old_name);
+ return -ENOENT;
+ }
+
+ ret = kallsyms_lookup_size_offset((unsigned long)func->new_func,
+ &func->new_size, NULL);
+ if (!ret) {
+ pr_err("kallsyms size lookup failed for '%s' replacement\n",
+ func->old_name);
+ return -ENOENT;
+ }
}
return 0;
@@ -801,7 +678,7 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)
if (!obj->funcs)
return -EINVAL;
- obj->state = KLP_DISABLED;
+ obj->patched = false;
obj->mod = NULL;
klp_find_object_module(obj);
@@ -842,12 +719,15 @@ static int klp_init_patch(struct klp_patch *patch)
mutex_lock(&klp_mutex);
- patch->state = KLP_DISABLED;
+ patch->enabled = false;
+ init_completion(&patch->finish);
ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch,
klp_root_kobj, "%s", patch->mod->name);
- if (ret)
- goto unlock;
+ if (ret) {
+ mutex_unlock(&klp_mutex);
+ return ret;
+ }
klp_for_each_object(patch, obj) {
ret = klp_init_object(patch, obj);
@@ -863,9 +743,12 @@ static int klp_init_patch(struct klp_patch *patch)
free:
klp_free_objects_limited(patch, obj);
- kobject_put(&patch->kobj);
-unlock:
+
mutex_unlock(&klp_mutex);
+
+ kobject_put(&patch->kobj);
+ wait_for_completion(&patch->finish);
+
return ret;
}
@@ -879,23 +762,29 @@ unlock:
*/
int klp_unregister_patch(struct klp_patch *patch)
{
- int ret = 0;
+ int ret;
mutex_lock(&klp_mutex);
if (!klp_is_patch_registered(patch)) {
ret = -EINVAL;
- goto out;
+ goto err;
}
- if (patch->state == KLP_ENABLED) {
+ if (patch->enabled) {
ret = -EBUSY;
- goto out;
+ goto err;
}
klp_free_patch(patch);
-out:
+ mutex_unlock(&klp_mutex);
+
+ kobject_put(&patch->kobj);
+ wait_for_completion(&patch->finish);
+
+ return 0;
+err:
mutex_unlock(&klp_mutex);
return ret;
}
@@ -908,17 +797,18 @@ EXPORT_SYMBOL_GPL(klp_unregister_patch);
* Initializes the data structure associated with the patch and
* creates the sysfs interface.
*
+ * There is no need to take the reference on the patch module here. It is done
+ * later when the patch is enabled.
+ *
* Return: 0 on success, otherwise error
*/
int klp_register_patch(struct klp_patch *patch)
{
- int ret;
-
if (!patch || !patch->mod)
return -EINVAL;
if (!is_livepatch_module(patch->mod)) {
- pr_err("module %s is not marked as a livepatch module",
+ pr_err("module %s is not marked as a livepatch module\n",
patch->mod->name);
return -EINVAL;
}
@@ -927,20 +817,16 @@ int klp_register_patch(struct klp_patch *patch)
return -ENODEV;
/*
- * A reference is taken on the patch module to prevent it from being
- * unloaded. Right now, we don't allow patch modules to unload since
- * there is currently no method to determine if a thread is still
- * running in the patched code contained in the patch module once
- * the ftrace registration is successful.
+ * Architectures without reliable stack traces have to set
+ * patch->immediate because there's currently no way to patch kthreads
+ * with the consistency model.
*/
- if (!try_module_get(patch->mod))
- return -ENODEV;
-
- ret = klp_init_patch(patch);
- if (ret)
- module_put(patch->mod);
+ if (!klp_have_reliable_stack() && !patch->immediate) {
+ pr_err("This architecture doesn't have support for the livepatch consistency model.\n");
+ return -ENOSYS;
+ }
- return ret;
+ return klp_init_patch(patch);
}
EXPORT_SYMBOL_GPL(klp_register_patch);
@@ -975,13 +861,17 @@ int klp_module_coming(struct module *mod)
goto err;
}
- if (patch->state == KLP_DISABLED)
+ /*
+ * Only patch the module if the patch is enabled or is
+ * in transition.
+ */
+ if (!patch->enabled && patch != klp_transition_patch)
break;
pr_notice("applying patch '%s' to loading module '%s'\n",
patch->mod->name, obj->mod->name);
- ret = klp_enable_object(obj);
+ ret = klp_patch_object(obj);
if (ret) {
pr_warn("failed to apply patch '%s' to module '%s' (%d)\n",
patch->mod->name, obj->mod->name, ret);
@@ -1032,10 +922,14 @@ void klp_module_going(struct module *mod)
if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
continue;
- if (patch->state != KLP_DISABLED) {
+ /*
+ * Only unpatch the module if the patch is enabled or
+ * is in transition.
+ */
+ if (patch->enabled || patch == klp_transition_patch) {
pr_notice("reverting patch '%s' on unloading module '%s'\n",
patch->mod->name, obj->mod->name);
- klp_disable_object(obj);
+ klp_unpatch_object(obj);
}
klp_free_object_loaded(obj);
diff --git a/kernel/livepatch/core.h b/kernel/livepatch/core.h
new file mode 100644
index 000000000000..c74f24c47837
--- /dev/null
+++ b/kernel/livepatch/core.h
@@ -0,0 +1,6 @@
+#ifndef _LIVEPATCH_CORE_H
+#define _LIVEPATCH_CORE_H
+
+extern struct mutex klp_mutex;
+
+#endif /* _LIVEPATCH_CORE_H */
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
new file mode 100644
index 000000000000..f8269036bf0b
--- /dev/null
+++ b/kernel/livepatch/patch.c
@@ -0,0 +1,272 @@
+/*
+ * patch.c - livepatch patching functions
+ *
+ * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
+ * Copyright (C) 2014 SUSE
+ * Copyright (C) 2015 Josh Poimboeuf <jpoimboe@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/livepatch.h>
+#include <linux/list.h>
+#include <linux/ftrace.h>
+#include <linux/rculist.h>
+#include <linux/slab.h>
+#include <linux/bug.h>
+#include <linux/printk.h>
+#include "patch.h"
+#include "transition.h"
+
+static LIST_HEAD(klp_ops);
+
+struct klp_ops *klp_find_ops(unsigned long old_addr)
+{
+ struct klp_ops *ops;
+ struct klp_func *func;
+
+ list_for_each_entry(ops, &klp_ops, node) {
+ func = list_first_entry(&ops->func_stack, struct klp_func,
+ stack_node);
+ if (func->old_addr == old_addr)
+ return ops;
+ }
+
+ return NULL;
+}
+
+static void notrace klp_ftrace_handler(unsigned long ip,
+ unsigned long parent_ip,
+ struct ftrace_ops *fops,
+ struct pt_regs *regs)
+{
+ struct klp_ops *ops;
+ struct klp_func *func;
+ int patch_state;
+
+ ops = container_of(fops, struct klp_ops, fops);
+
+ rcu_read_lock();
+
+ func = list_first_or_null_rcu(&ops->func_stack, struct klp_func,
+ stack_node);
+
+ /*
+ * func should never be NULL because preemption should be disabled here
+ * and unregister_ftrace_function() does the equivalent of a
+ * synchronize_sched() before the func_stack removal.
+ */
+ if (WARN_ON_ONCE(!func))
+ goto unlock;
+
+ /*
+ * In the enable path, enforce the order of the ops->func_stack and
+ * func->transition reads. The corresponding write barrier is in
+ * __klp_enable_patch().
+ *
+ * (Note that this barrier technically isn't needed in the disable
+ * path. In the rare case where klp_update_patch_state() runs before
+ * this handler, its TIF_PATCH_PENDING read and this func->transition
+ * read need to be ordered. But klp_update_patch_state() already
+ * enforces that.)
+ */
+ smp_rmb();
+
+ if (unlikely(func->transition)) {
+
+ /*
+ * Enforce the order of the func->transition and
+ * current->patch_state reads. Otherwise we could read an
+ * out-of-date task state and pick the wrong function. The
+ * corresponding write barrier is in klp_init_transition().
+ */
+ smp_rmb();
+
+ patch_state = current->patch_state;
+
+ WARN_ON_ONCE(patch_state == KLP_UNDEFINED);
+
+ if (patch_state == KLP_UNPATCHED) {
+ /*
+ * Use the previously patched version of the function.
+ * If no previous patches exist, continue with the
+ * original function.
+ */
+ func = list_entry_rcu(func->stack_node.next,
+ struct klp_func, stack_node);
+
+ if (&func->stack_node == &ops->func_stack)
+ goto unlock;
+ }
+ }
+
+ klp_arch_set_pc(regs, (unsigned long)func->new_func);
+unlock:
+ rcu_read_unlock();
+}
+
+/*
+ * Convert a function address into the appropriate ftrace location.
+ *
+ * Usually this is just the address of the function, but on some architectures
+ * it's more complicated so allow them to provide a custom behaviour.
+ */
+#ifndef klp_get_ftrace_location
+static unsigned long klp_get_ftrace_location(unsigned long faddr)
+{
+ return faddr;
+}
+#endif
+
+static void klp_unpatch_func(struct klp_func *func)
+{
+ struct klp_ops *ops;
+
+ if (WARN_ON(!func->patched))
+ return;
+ if (WARN_ON(!func->old_addr))
+ return;
+
+ ops = klp_find_ops(func->old_addr);
+ if (WARN_ON(!ops))
+ return;
+
+ if (list_is_singular(&ops->func_stack)) {
+ unsigned long ftrace_loc;
+
+ ftrace_loc = klp_get_ftrace_location(func->old_addr);
+ if (WARN_ON(!ftrace_loc))
+ return;
+
+ WARN_ON(unregister_ftrace_function(&ops->fops));
+ WARN_ON(ftrace_set_filter_ip(&ops->fops, ftrace_loc, 1, 0));
+
+ list_del_rcu(&func->stack_node);
+ list_del(&ops->node);
+ kfree(ops);
+ } else {
+ list_del_rcu(&func->stack_node);
+ }
+
+ func->patched = false;
+}
+
+static int klp_patch_func(struct klp_func *func)
+{
+ struct klp_ops *ops;
+ int ret;
+
+ if (WARN_ON(!func->old_addr))
+ return -EINVAL;
+
+ if (WARN_ON(func->patched))
+ return -EINVAL;
+
+ ops = klp_find_ops(func->old_addr);
+ if (!ops) {
+ unsigned long ftrace_loc;
+
+ ftrace_loc = klp_get_ftrace_location(func->old_addr);
+ if (!ftrace_loc) {
+ pr_err("failed to find location for function '%s'\n",
+ func->old_name);
+ return -EINVAL;
+ }
+
+ ops = kzalloc(sizeof(*ops), GFP_KERNEL);
+ if (!ops)
+ return -ENOMEM;
+
+ ops->fops.func = klp_ftrace_handler;
+ ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS |
+ FTRACE_OPS_FL_DYNAMIC |
+ FTRACE_OPS_FL_IPMODIFY;
+
+ list_add(&ops->node, &klp_ops);
+
+ INIT_LIST_HEAD(&ops->func_stack);
+ list_add_rcu(&func->stack_node, &ops->func_stack);
+
+ ret = ftrace_set_filter_ip(&ops->fops, ftrace_loc, 0, 0);
+ if (ret) {
+ pr_err("failed to set ftrace filter for function '%s' (%d)\n",
+ func->old_name, ret);
+ goto err;
+ }
+
+ ret = register_ftrace_function(&ops->fops);
+ if (ret) {
+ pr_err("failed to register ftrace handler for function '%s' (%d)\n",
+ func->old_name, ret);
+ ftrace_set_filter_ip(&ops->fops, ftrace_loc, 1, 0);
+ goto err;
+ }
+
+
+ } else {
+ list_add_rcu(&func->stack_node, &ops->func_stack);
+ }
+
+ func->patched = true;
+
+ return 0;
+
+err:
+ list_del_rcu(&func->stack_node);
+ list_del(&ops->node);
+ kfree(ops);
+ return ret;
+}
+
+void klp_unpatch_object(struct klp_object *obj)
+{
+ struct klp_func *func;
+
+ klp_for_each_func(obj, func)
+ if (func->patched)
+ klp_unpatch_func(func);
+
+ obj->patched = false;
+}
+
+int klp_patch_object(struct klp_object *obj)
+{
+ struct klp_func *func;
+ int ret;
+
+ if (WARN_ON(obj->patched))
+ return -EINVAL;
+
+ klp_for_each_func(obj, func) {
+ ret = klp_patch_func(func);
+ if (ret) {
+ klp_unpatch_object(obj);
+ return ret;
+ }
+ }
+ obj->patched = true;
+
+ return 0;
+}
+
+void klp_unpatch_objects(struct klp_patch *patch)
+{
+ struct klp_object *obj;
+
+ klp_for_each_object(patch, obj)
+ if (obj->patched)
+ klp_unpatch_object(obj);
+}
diff --git a/kernel/livepatch/patch.h b/kernel/livepatch/patch.h
new file mode 100644
index 000000000000..0db227170c36
--- /dev/null
+++ b/kernel/livepatch/patch.h
@@ -0,0 +1,33 @@
+#ifndef _LIVEPATCH_PATCH_H
+#define _LIVEPATCH_PATCH_H
+
+#include <linux/livepatch.h>
+#include <linux/list.h>
+#include <linux/ftrace.h>
+
+/**
+ * struct klp_ops - structure for tracking registered ftrace ops structs
+ *
+ * A single ftrace_ops is shared between all enabled replacement functions
+ * (klp_func structs) which have the same old_addr. This allows the switch
+ * between function versions to happen instantaneously by updating the klp_ops
+ * struct's func_stack list. The winner is the klp_func at the top of the
+ * func_stack (front of the list).
+ *
+ * @node: node for the global klp_ops list
+ * @func_stack: list head for the stack of klp_func's (active func is on top)
+ * @fops: registered ftrace ops struct
+ */
+struct klp_ops {
+ struct list_head node;
+ struct list_head func_stack;
+ struct ftrace_ops fops;
+};
+
+struct klp_ops *klp_find_ops(unsigned long old_addr);
+
+int klp_patch_object(struct klp_object *obj);
+void klp_unpatch_object(struct klp_object *obj);
+void klp_unpatch_objects(struct klp_patch *patch);
+
+#endif /* _LIVEPATCH_PATCH_H */
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
new file mode 100644
index 000000000000..adc0cc64aa4b
--- /dev/null
+++ b/kernel/livepatch/transition.c
@@ -0,0 +1,553 @@
+/*
+ * transition.c - Kernel Live Patching transition functions
+ *
+ * Copyright (C) 2015-2016 Josh Poimboeuf <jpoimboe@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cpu.h>
+#include <linux/stacktrace.h>
+#include "core.h"
+#include "patch.h"
+#include "transition.h"
+#include "../sched/sched.h"
+
+#define MAX_STACK_ENTRIES 100
+#define STACK_ERR_BUF_SIZE 128
+
+struct klp_patch *klp_transition_patch;
+
+static int klp_target_state = KLP_UNDEFINED;
+
+/*
+ * This work can be performed periodically to finish patching or unpatching any
+ * "straggler" tasks which failed to transition in the first attempt.
+ */
+static void klp_transition_work_fn(struct work_struct *work)
+{
+ mutex_lock(&klp_mutex);
+
+ if (klp_transition_patch)
+ klp_try_complete_transition();
+
+ mutex_unlock(&klp_mutex);
+}
+static DECLARE_DELAYED_WORK(klp_transition_work, klp_transition_work_fn);
+
+/*
+ * The transition to the target patch state is complete. Clean up the data
+ * structures.
+ */
+static void klp_complete_transition(void)
+{
+ struct klp_object *obj;
+ struct klp_func *func;
+ struct task_struct *g, *task;
+ unsigned int cpu;
+ bool immediate_func = false;
+
+ if (klp_target_state == KLP_UNPATCHED) {
+ /*
+ * All tasks have transitioned to KLP_UNPATCHED so we can now
+ * remove the new functions from the func_stack.
+ */
+ klp_unpatch_objects(klp_transition_patch);
+
+ /*
+ * Make sure klp_ftrace_handler() can no longer see functions
+ * from this patch on the ops->func_stack. Otherwise, after
+ * func->transition gets cleared, the handler may choose a
+ * removed function.
+ */
+ synchronize_rcu();
+ }
+
+ if (klp_transition_patch->immediate)
+ goto done;
+
+ klp_for_each_object(klp_transition_patch, obj) {
+ klp_for_each_func(obj, func) {
+ func->transition = false;
+ if (func->immediate)
+ immediate_func = true;
+ }
+ }
+
+ if (klp_target_state == KLP_UNPATCHED && !immediate_func)
+ module_put(klp_transition_patch->mod);
+
+ /* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */
+ if (klp_target_state == KLP_PATCHED)
+ synchronize_rcu();
+
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, task) {
+ WARN_ON_ONCE(test_tsk_thread_flag(task, TIF_PATCH_PENDING));
+ task->patch_state = KLP_UNDEFINED;
+ }
+ read_unlock(&tasklist_lock);
+
+ for_each_possible_cpu(cpu) {
+ task = idle_task(cpu);
+ WARN_ON_ONCE(test_tsk_thread_flag(task, TIF_PATCH_PENDING));
+ task->patch_state = KLP_UNDEFINED;
+ }
+
+done:
+ klp_target_state = KLP_UNDEFINED;
+ klp_transition_patch = NULL;
+}
+
+/*
+ * This is called in the error path, to cancel a transition before it has
+ * started, i.e. klp_init_transition() has been called but
+ * klp_start_transition() hasn't. If the transition *has* been started,
+ * klp_reverse_transition() should be used instead.
+ */
+void klp_cancel_transition(void)
+{
+ if (WARN_ON_ONCE(klp_target_state != KLP_PATCHED))
+ return;
+
+ klp_target_state = KLP_UNPATCHED;
+ klp_complete_transition();
+}
+
+/*
+ * Switch the patched state of the task to the set of functions in the target
+ * patch state.
+ *
+ * NOTE: If task is not 'current', the caller must ensure the task is inactive.
+ * Otherwise klp_ftrace_handler() might read the wrong 'patch_state' value.
+ */
+void klp_update_patch_state(struct task_struct *task)
+{
+ rcu_read_lock();
+
+ /*
+ * This test_and_clear_tsk_thread_flag() call also serves as a read
+ * barrier (smp_rmb) for two cases:
+ *
+ * 1) Enforce the order of the TIF_PATCH_PENDING read and the
+ * klp_target_state read. The corresponding write barrier is in
+ * klp_init_transition().
+ *
+ * 2) Enforce the order of the TIF_PATCH_PENDING read and a future read
+ * of func->transition, if klp_ftrace_handler() is called later on
+ * the same CPU. See __klp_disable_patch().
+ */
+ if (test_and_clear_tsk_thread_flag(task, TIF_PATCH_PENDING))
+ task->patch_state = READ_ONCE(klp_target_state);
+
+ rcu_read_unlock();
+}
+
+/*
+ * Determine whether the given stack trace includes any references to a
+ * to-be-patched or to-be-unpatched function.
+ */
+static int klp_check_stack_func(struct klp_func *func,
+ struct stack_trace *trace)
+{
+ unsigned long func_addr, func_size, address;
+ struct klp_ops *ops;
+ int i;
+
+ if (func->immediate)
+ return 0;
+
+ for (i = 0; i < trace->nr_entries; i++) {
+ address = trace->entries[i];
+
+ if (klp_target_state == KLP_UNPATCHED) {
+ /*
+ * Check for the to-be-unpatched function
+ * (the func itself).
+ */
+ func_addr = (unsigned long)func->new_func;
+ func_size = func->new_size;
+ } else {
+ /*
+ * Check for the to-be-patched function
+ * (the previous func).
+ */
+ ops = klp_find_ops(func->old_addr);
+
+ if (list_is_singular(&ops->func_stack)) {
+ /* original function */
+ func_addr = func->old_addr;
+ func_size = func->old_size;
+ } else {
+ /* previously patched function */
+ struct klp_func *prev;
+
+ prev = list_next_entry(func, stack_node);
+ func_addr = (unsigned long)prev->new_func;
+ func_size = prev->new_size;
+ }
+ }
+
+ if (address >= func_addr && address < func_addr + func_size)
+ return -EAGAIN;
+ }
+
+ return 0;
+}
+
+/*
+ * Determine whether it's safe to transition the task to the target patch state
+ * by looking for any to-be-patched or to-be-unpatched functions on its stack.
+ */
+static int klp_check_stack(struct task_struct *task, char *err_buf)
+{
+ static unsigned long entries[MAX_STACK_ENTRIES];
+ struct stack_trace trace;
+ struct klp_object *obj;
+ struct klp_func *func;
+ int ret;
+
+ trace.skip = 0;
+ trace.nr_entries = 0;
+ trace.max_entries = MAX_STACK_ENTRIES;
+ trace.entries = entries;
+ ret = save_stack_trace_tsk_reliable(task, &trace);
+ WARN_ON_ONCE(ret == -ENOSYS);
+ if (ret) {
+ snprintf(err_buf, STACK_ERR_BUF_SIZE,
+ "%s: %s:%d has an unreliable stack\n",
+ __func__, task->comm, task->pid);
+ return ret;
+ }
+
+ klp_for_each_object(klp_transition_patch, obj) {
+ if (!obj->patched)
+ continue;
+ klp_for_each_func(obj, func) {
+ ret = klp_check_stack_func(func, &trace);
+ if (ret) {
+ snprintf(err_buf, STACK_ERR_BUF_SIZE,
+ "%s: %s:%d is sleeping on function %s\n",
+ __func__, task->comm, task->pid,
+ func->old_name);
+ return ret;
+ }
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Try to safely switch a task to the target patch state. If it's currently
+ * running, or it's sleeping on a to-be-patched or to-be-unpatched function, or
+ * if the stack is unreliable, return false.
+ */
+static bool klp_try_switch_task(struct task_struct *task)
+{
+ struct rq *rq;
+ struct rq_flags flags;
+ int ret;
+ bool success = false;
+ char err_buf[STACK_ERR_BUF_SIZE];
+
+ err_buf[0] = '\0';
+
+ /* check if this task has already switched over */
+ if (task->patch_state == klp_target_state)
+ return true;
+
+ /*
+ * For arches which don't have reliable stack traces, we have to rely
+ * on other methods (e.g., switching tasks at kernel exit).
+ */
+ if (!klp_have_reliable_stack())
+ return false;
+
+ /*
+ * Now try to check the stack for any to-be-patched or to-be-unpatched
+ * functions. If all goes well, switch the task to the target patch
+ * state.
+ */
+ rq = task_rq_lock(task, &flags);
+
+ if (task_running(rq, task) && task != current) {
+ snprintf(err_buf, STACK_ERR_BUF_SIZE,
+ "%s: %s:%d is running\n", __func__, task->comm,
+ task->pid);
+ goto done;
+ }
+
+ ret = klp_check_stack(task, err_buf);
+ if (ret)
+ goto done;
+
+ success = true;
+
+ clear_tsk_thread_flag(task, TIF_PATCH_PENDING);
+ task->patch_state = klp_target_state;
+
+done:
+ task_rq_unlock(rq, task, &flags);
+
+ /*
+ * Due to console deadlock issues, pr_debug() can't be used while
+ * holding the task rq lock. Instead we have to use a temporary buffer
+ * and print the debug message after releasing the lock.
+ */
+ if (err_buf[0] != '\0')
+ pr_debug("%s", err_buf);
+
+ return success;
+
+}
+
+/*
+ * Try to switch all remaining tasks to the target patch state by walking the
+ * stacks of sleeping tasks and looking for any to-be-patched or
+ * to-be-unpatched functions. If such functions are found, the task can't be
+ * switched yet.
+ *
+ * If any tasks are still stuck in the initial patch state, schedule a retry.
+ */
+void klp_try_complete_transition(void)
+{
+ unsigned int cpu;
+ struct task_struct *g, *task;
+ bool complete = true;
+
+ WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED);
+
+ /*
+ * If the patch can be applied or reverted immediately, skip the
+ * per-task transitions.
+ */
+ if (klp_transition_patch->immediate)
+ goto success;
+
+ /*
+ * Try to switch the tasks to the target patch state by walking their
+ * stacks and looking for any to-be-patched or to-be-unpatched
+ * functions. If such functions are found on a stack, or if the stack
+ * is deemed unreliable, the task can't be switched yet.
+ *
+ * Usually this will transition most (or all) of the tasks on a system
+ * unless the patch includes changes to a very common function.
+ */
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, task)
+ if (!klp_try_switch_task(task))
+ complete = false;
+ read_unlock(&tasklist_lock);
+
+ /*
+ * Ditto for the idle "swapper" tasks.
+ */
+ get_online_cpus();
+ for_each_possible_cpu(cpu) {
+ task = idle_task(cpu);
+ if (cpu_online(cpu)) {
+ if (!klp_try_switch_task(task))
+ complete = false;
+ } else if (task->patch_state != klp_target_state) {
+ /* offline idle tasks can be switched immediately */
+ clear_tsk_thread_flag(task, TIF_PATCH_PENDING);
+ task->patch_state = klp_target_state;
+ }
+ }
+ put_online_cpus();
+
+ if (!complete) {
+ /*
+ * Some tasks weren't able to be switched over. Try again
+ * later and/or wait for other methods like kernel exit
+ * switching.
+ */
+ schedule_delayed_work(&klp_transition_work,
+ round_jiffies_relative(HZ));
+ return;
+ }
+
+success:
+ pr_notice("'%s': %s complete\n", klp_transition_patch->mod->name,
+ klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
+
+ /* we're done, now cleanup the data structures */
+ klp_complete_transition();
+}
+
+/*
+ * Start the transition to the specified target patch state so tasks can begin
+ * switching to it.
+ */
+void klp_start_transition(void)
+{
+ struct task_struct *g, *task;
+ unsigned int cpu;
+
+ WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED);
+
+ pr_notice("'%s': %s...\n", klp_transition_patch->mod->name,
+ klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
+
+ /*
+ * If the patch can be applied or reverted immediately, skip the
+ * per-task transitions.
+ */
+ if (klp_transition_patch->immediate)
+ return;
+
+ /*
+ * Mark all normal tasks as needing a patch state update. They'll
+ * switch either in klp_try_complete_transition() or as they exit the
+ * kernel.
+ */
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, task)
+ if (task->patch_state != klp_target_state)
+ set_tsk_thread_flag(task, TIF_PATCH_PENDING);
+ read_unlock(&tasklist_lock);
+
+ /*
+ * Mark all idle tasks as needing a patch state update. They'll switch
+ * either in klp_try_complete_transition() or at the idle loop switch
+ * point.
+ */
+ for_each_possible_cpu(cpu) {
+ task = idle_task(cpu);
+ if (task->patch_state != klp_target_state)
+ set_tsk_thread_flag(task, TIF_PATCH_PENDING);
+ }
+}
+
+/*
+ * Initialize the global target patch state and all tasks to the initial patch
+ * state, and initialize all function transition states to true in preparation
+ * for patching or unpatching.
+ */
+void klp_init_transition(struct klp_patch *patch, int state)
+{
+ struct task_struct *g, *task;
+ unsigned int cpu;
+ struct klp_object *obj;
+ struct klp_func *func;
+ int initial_state = !state;
+
+ WARN_ON_ONCE(klp_target_state != KLP_UNDEFINED);
+
+ klp_transition_patch = patch;
+
+ /*
+ * Set the global target patch state which tasks will switch to. This
+ * has no effect until the TIF_PATCH_PENDING flags get set later.
+ */
+ klp_target_state = state;
+
+ /*
+ * If the patch can be applied or reverted immediately, skip the
+ * per-task transitions.
+ */
+ if (patch->immediate)
+ return;
+
+ /*
+ * Initialize all tasks to the initial patch state to prepare them for
+ * switching to the target state.
+ */
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, task) {
+ WARN_ON_ONCE(task->patch_state != KLP_UNDEFINED);
+ task->patch_state = initial_state;
+ }
+ read_unlock(&tasklist_lock);
+
+ /*
+ * Ditto for the idle "swapper" tasks.
+ */
+ for_each_possible_cpu(cpu) {
+ task = idle_task(cpu);
+ WARN_ON_ONCE(task->patch_state != KLP_UNDEFINED);
+ task->patch_state = initial_state;
+ }
+
+ /*
+ * Enforce the order of the task->patch_state initializations and the
+ * func->transition updates to ensure that klp_ftrace_handler() doesn't
+ * see a func in transition with a task->patch_state of KLP_UNDEFINED.
+ *
+ * Also enforce the order of the klp_target_state write and future
+ * TIF_PATCH_PENDING writes to ensure klp_update_patch_state() doesn't
+ * set a task->patch_state to KLP_UNDEFINED.
+ */
+ smp_wmb();
+
+ /*
+ * Set the func transition states so klp_ftrace_handler() will know to
+ * switch to the transition logic.
+ *
+ * When patching, the funcs aren't yet in the func_stack and will be
+ * made visible to the ftrace handler shortly by the calls to
+ * klp_patch_object().
+ *
+ * When unpatching, the funcs are already in the func_stack and so are
+ * already visible to the ftrace handler.
+ */
+ klp_for_each_object(patch, obj)
+ klp_for_each_func(obj, func)
+ func->transition = true;
+}
+
+/*
+ * This function can be called in the middle of an existing transition to
+ * reverse the direction of the target patch state. This can be done to
+ * effectively cancel an existing enable or disable operation if there are any
+ * tasks which are stuck in the initial patch state.
+ */
+void klp_reverse_transition(void)
+{
+ unsigned int cpu;
+ struct task_struct *g, *task;
+
+ klp_transition_patch->enabled = !klp_transition_patch->enabled;
+
+ klp_target_state = !klp_target_state;
+
+ /*
+ * Clear all TIF_PATCH_PENDING flags to prevent races caused by
+ * klp_update_patch_state() running in parallel with
+ * klp_start_transition().
+ */
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, task)
+ clear_tsk_thread_flag(task, TIF_PATCH_PENDING);
+ read_unlock(&tasklist_lock);
+
+ for_each_possible_cpu(cpu)
+ clear_tsk_thread_flag(idle_task(cpu), TIF_PATCH_PENDING);
+
+ /* Let any remaining calls to klp_update_patch_state() complete */
+ synchronize_rcu();
+
+ klp_start_transition();
+}
+
+/* Called from copy_process() during fork */
+void klp_copy_process(struct task_struct *child)
+{
+ child->patch_state = current->patch_state;
+
+ /* TIF_PATCH_PENDING gets copied in setup_thread_stack() */
+}
diff --git a/kernel/livepatch/transition.h b/kernel/livepatch/transition.h
new file mode 100644
index 000000000000..ce09b326546c
--- /dev/null
+++ b/kernel/livepatch/transition.h
@@ -0,0 +1,14 @@
+#ifndef _LIVEPATCH_TRANSITION_H
+#define _LIVEPATCH_TRANSITION_H
+
+#include <linux/livepatch.h>
+
+extern struct klp_patch *klp_transition_patch;
+
+void klp_init_transition(struct klp_patch *patch, int state);
+void klp_cancel_transition(void);
+void klp_start_transition(void);
+void klp_try_complete_transition(void);
+void klp_reverse_transition(void);
+
+#endif /* _LIVEPATCH_TRANSITION_H */
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index a95e5d1f4a9c..c0e31bfee25c 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -30,6 +30,7 @@
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <linux/sched/task.h>
+#include <linux/sched/mm.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
@@ -660,6 +661,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
struct lockdep_subclass_key *key;
struct hlist_head *hash_head;
struct lock_class *class;
+ bool is_static = false;
if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
debug_locks_off();
@@ -673,10 +675,23 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
/*
* Static locks do not have their class-keys yet - for them the key
- * is the lock object itself:
+ * is the lock object itself. If the lock is in the per cpu area,
+ * the canonical address of the lock (per cpu offset removed) is
+ * used.
*/
- if (unlikely(!lock->key))
- lock->key = (void *)lock;
+ if (unlikely(!lock->key)) {
+ unsigned long can_addr, addr = (unsigned long)lock;
+
+ if (__is_kernel_percpu_address(addr, &can_addr))
+ lock->key = (void *)can_addr;
+ else if (__is_module_percpu_address(addr, &can_addr))
+ lock->key = (void *)can_addr;
+ else if (static_obj(lock))
+ lock->key = (void *)lock;
+ else
+ return ERR_PTR(-EINVAL);
+ is_static = true;
+ }
/*
* NOTE: the class-key must be unique. For dynamic locks, a static
@@ -708,7 +723,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
}
}
- return NULL;
+ return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL);
}
/*
@@ -726,19 +741,18 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
DEBUG_LOCKS_WARN_ON(!irqs_disabled());
class = look_up_lock_class(lock, subclass);
- if (likely(class))
+ if (likely(!IS_ERR_OR_NULL(class)))
goto out_set_class_cache;
/*
* Debug-check: all keys must be persistent!
- */
- if (!static_obj(lock->key)) {
+ */
+ if (IS_ERR(class)) {
debug_locks_off();
printk("INFO: trying to register non-static key.\n");
printk("the code is fine but needs lockdep annotation.\n");
printk("turning off the locking correctness validator.\n");
dump_stack();
-
return NULL;
}
@@ -1144,10 +1158,10 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
return 0;
printk("\n");
- printk("======================================================\n");
- printk("[ INFO: possible circular locking dependency detected ]\n");
+ pr_warn("======================================================\n");
+ pr_warn("WARNING: possible circular locking dependency detected\n");
print_kernel_ident();
- printk("-------------------------------------------------------\n");
+ pr_warn("------------------------------------------------------\n");
printk("%s/%d is trying to acquire lock:\n",
curr->comm, task_pid_nr(curr));
print_lock(check_src);
@@ -1482,11 +1496,11 @@ print_bad_irq_dependency(struct task_struct *curr,
return 0;
printk("\n");
- printk("======================================================\n");
- printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n",
+ pr_warn("=====================================================\n");
+ pr_warn("WARNING: %s-safe -> %s-unsafe lock order detected\n",
irqclass, irqclass);
print_kernel_ident();
- printk("------------------------------------------------------\n");
+ pr_warn("-----------------------------------------------------\n");
printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
curr->comm, task_pid_nr(curr),
curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT,
@@ -1711,10 +1725,10 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
return 0;
printk("\n");
- printk("=============================================\n");
- printk("[ INFO: possible recursive locking detected ]\n");
+ pr_warn("============================================\n");
+ pr_warn("WARNING: possible recursive locking detected\n");
print_kernel_ident();
- printk("---------------------------------------------\n");
+ pr_warn("--------------------------------------------\n");
printk("%s/%d is trying to acquire lock:\n",
curr->comm, task_pid_nr(curr));
print_lock(next);
@@ -2061,10 +2075,10 @@ static void print_collision(struct task_struct *curr,
struct lock_chain *chain)
{
printk("\n");
- printk("======================\n");
- printk("[chain_key collision ]\n");
+ pr_warn("============================\n");
+ pr_warn("WARNING: chain_key collision\n");
print_kernel_ident();
- printk("----------------------\n");
+ pr_warn("----------------------------\n");
printk("%s/%d: ", current->comm, task_pid_nr(current));
printk("Hash chain already cached but the contents don't match!\n");
@@ -2360,10 +2374,10 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
return 0;
printk("\n");
- printk("=================================\n");
- printk("[ INFO: inconsistent lock state ]\n");
+ pr_warn("================================\n");
+ pr_warn("WARNING: inconsistent lock state\n");
print_kernel_ident();
- printk("---------------------------------\n");
+ pr_warn("--------------------------------\n");
printk("inconsistent {%s} -> {%s} usage.\n",
usage_str[prev_bit], usage_str[new_bit]);
@@ -2425,10 +2439,10 @@ print_irq_inversion_bug(struct task_struct *curr,
return 0;
printk("\n");
- printk("=========================================================\n");
- printk("[ INFO: possible irq lock inversion dependency detected ]\n");
+ pr_warn("========================================================\n");
+ pr_warn("WARNING: possible irq lock inversion dependency detected\n");
print_kernel_ident();
- printk("---------------------------------------------------------\n");
+ pr_warn("--------------------------------------------------------\n");
printk("%s/%d just changed the state of lock:\n",
curr->comm, task_pid_nr(curr));
print_lock(this);
@@ -2863,6 +2877,8 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
if (unlikely(!debug_locks))
return;
+ gfp_mask = current_gfp_context(gfp_mask);
+
/* no reclaim without waiting on it */
if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
return;
@@ -2872,7 +2888,7 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
return;
/* We're only interested __GFP_FS allocations for now */
- if (!(gfp_mask & __GFP_FS))
+ if (!(gfp_mask & __GFP_FS) || (curr->flags & PF_MEMALLOC_NOFS))
return;
/*
@@ -2881,6 +2897,10 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags)))
return;
+ /* Disable lockdep if explicitly requested */
+ if (gfp_mask & __GFP_NOLOCKDEP)
+ return;
+
mark_held_locks(curr, RECLAIM_FS);
}
@@ -3170,10 +3190,10 @@ print_lock_nested_lock_not_held(struct task_struct *curr,
return 0;
printk("\n");
- printk("==================================\n");
- printk("[ BUG: Nested lock was not taken ]\n");
+ pr_warn("==================================\n");
+ pr_warn("WARNING: Nested lock was not taken\n");
print_kernel_ident();
- printk("----------------------------------\n");
+ pr_warn("----------------------------------\n");
printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr));
print_lock(hlock);
@@ -3383,10 +3403,10 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
return 0;
printk("\n");
- printk("=====================================\n");
- printk("[ BUG: bad unlock balance detected! ]\n");
+ pr_warn("=====================================\n");
+ pr_warn("WARNING: bad unlock balance detected!\n");
print_kernel_ident();
- printk("-------------------------------------\n");
+ pr_warn("-------------------------------------\n");
printk("%s/%d is trying to release lock (",
curr->comm, task_pid_nr(curr));
print_lockdep_cache(lock);
@@ -3419,7 +3439,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
* Clearly if the lock hasn't been acquired _ever_, we're not
* holding it either, so report failure.
*/
- if (!class)
+ if (IS_ERR_OR_NULL(class))
return 0;
/*
@@ -3437,13 +3457,67 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
return 0;
}
+/* @depth must not be zero */
+static struct held_lock *find_held_lock(struct task_struct *curr,
+ struct lockdep_map *lock,
+ unsigned int depth, int *idx)
+{
+ struct held_lock *ret, *hlock, *prev_hlock;
+ int i;
+
+ i = depth - 1;
+ hlock = curr->held_locks + i;
+ ret = hlock;
+ if (match_held_lock(hlock, lock))
+ goto out;
+
+ ret = NULL;
+ for (i--, prev_hlock = hlock--;
+ i >= 0;
+ i--, prev_hlock = hlock--) {
+ /*
+ * We must not cross into another context:
+ */
+ if (prev_hlock->irq_context != hlock->irq_context) {
+ ret = NULL;
+ break;
+ }
+ if (match_held_lock(hlock, lock)) {
+ ret = hlock;
+ break;
+ }
+ }
+
+out:
+ *idx = i;
+ return ret;
+}
+
+static int reacquire_held_locks(struct task_struct *curr, unsigned int depth,
+ int idx)
+{
+ struct held_lock *hlock;
+
+ for (hlock = curr->held_locks + idx; idx < depth; idx++, hlock++) {
+ if (!__lock_acquire(hlock->instance,
+ hlock_class(hlock)->subclass,
+ hlock->trylock,
+ hlock->read, hlock->check,
+ hlock->hardirqs_off,
+ hlock->nest_lock, hlock->acquire_ip,
+ hlock->references, hlock->pin_count))
+ return 1;
+ }
+ return 0;
+}
+
static int
__lock_set_class(struct lockdep_map *lock, const char *name,
struct lock_class_key *key, unsigned int subclass,
unsigned long ip)
{
struct task_struct *curr = current;
- struct held_lock *hlock, *prev_hlock;
+ struct held_lock *hlock;
struct lock_class *class;
unsigned int depth;
int i;
@@ -3456,21 +3530,10 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
if (DEBUG_LOCKS_WARN_ON(!depth))
return 0;
- prev_hlock = NULL;
- for (i = depth-1; i >= 0; i--) {
- hlock = curr->held_locks + i;
- /*
- * We must not cross into another context:
- */
- if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
- break;
- if (match_held_lock(hlock, lock))
- goto found_it;
- prev_hlock = hlock;
- }
- return print_unlock_imbalance_bug(curr, lock, ip);
+ hlock = find_held_lock(curr, lock, depth, &i);
+ if (!hlock)
+ return print_unlock_imbalance_bug(curr, lock, ip);
-found_it:
lockdep_init_map(lock, name, key, 0);
class = register_lock_class(lock, subclass, 0);
hlock->class_idx = class - lock_classes + 1;
@@ -3478,15 +3541,46 @@ found_it:
curr->lockdep_depth = i;
curr->curr_chain_key = hlock->prev_chain_key;
- for (; i < depth; i++) {
- hlock = curr->held_locks + i;
- if (!__lock_acquire(hlock->instance,
- hlock_class(hlock)->subclass, hlock->trylock,
- hlock->read, hlock->check, hlock->hardirqs_off,
- hlock->nest_lock, hlock->acquire_ip,
- hlock->references, hlock->pin_count))
- return 0;
- }
+ if (reacquire_held_locks(curr, depth, i))
+ return 0;
+
+ /*
+ * I took it apart and put it back together again, except now I have
+ * these 'spare' parts.. where shall I put them.
+ */
+ if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
+ return 0;
+ return 1;
+}
+
+static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
+{
+ struct task_struct *curr = current;
+ struct held_lock *hlock;
+ unsigned int depth;
+ int i;
+
+ depth = curr->lockdep_depth;
+ /*
+ * This function is about (re)setting the class of a held lock,
+ * yet we're not actually holding any locks. Naughty user!
+ */
+ if (DEBUG_LOCKS_WARN_ON(!depth))
+ return 0;
+
+ hlock = find_held_lock(curr, lock, depth, &i);
+ if (!hlock)
+ return print_unlock_imbalance_bug(curr, lock, ip);
+
+ curr->lockdep_depth = i;
+ curr->curr_chain_key = hlock->prev_chain_key;
+
+ WARN(hlock->read, "downgrading a read lock");
+ hlock->read = 1;
+ hlock->acquire_ip = ip;
+
+ if (reacquire_held_locks(curr, depth, i))
+ return 0;
/*
* I took it apart and put it back together again, except now I have
@@ -3508,7 +3602,7 @@ static int
__lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
{
struct task_struct *curr = current;
- struct held_lock *hlock, *prev_hlock;
+ struct held_lock *hlock;
unsigned int depth;
int i;
@@ -3527,21 +3621,10 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
* Check whether the lock exists in the current stack
* of held locks:
*/
- prev_hlock = NULL;
- for (i = depth-1; i >= 0; i--) {
- hlock = curr->held_locks + i;
- /*
- * We must not cross into another context:
- */
- if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
- break;
- if (match_held_lock(hlock, lock))
- goto found_it;
- prev_hlock = hlock;
- }
- return print_unlock_imbalance_bug(curr, lock, ip);
+ hlock = find_held_lock(curr, lock, depth, &i);
+ if (!hlock)
+ return print_unlock_imbalance_bug(curr, lock, ip);
-found_it:
if (hlock->instance == lock)
lock_release_holdtime(hlock);
@@ -3568,15 +3651,8 @@ found_it:
curr->lockdep_depth = i;
curr->curr_chain_key = hlock->prev_chain_key;
- for (i++; i < depth; i++) {
- hlock = curr->held_locks + i;
- if (!__lock_acquire(hlock->instance,
- hlock_class(hlock)->subclass, hlock->trylock,
- hlock->read, hlock->check, hlock->hardirqs_off,
- hlock->nest_lock, hlock->acquire_ip,
- hlock->references, hlock->pin_count))
- return 0;
- }
+ if (reacquire_held_locks(curr, depth, i + 1))
+ return 0;
/*
* We had N bottles of beer on the wall, we drank one, but now
@@ -3741,6 +3817,23 @@ void lock_set_class(struct lockdep_map *lock, const char *name,
}
EXPORT_SYMBOL_GPL(lock_set_class);
+void lock_downgrade(struct lockdep_map *lock, unsigned long ip)
+{
+ unsigned long flags;
+
+ if (unlikely(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ current->lockdep_recursion = 1;
+ check_flags(flags);
+ if (__lock_downgrade(lock, ip))
+ check_chain_key(current);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_downgrade);
+
/*
* We are not always called with irqs disabled - do that here,
* and also avoid lockdep recursion:
@@ -3861,13 +3954,15 @@ EXPORT_SYMBOL_GPL(lock_unpin_lock);
void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
{
- current->lockdep_reclaim_gfp = gfp_mask;
+ current->lockdep_reclaim_gfp = current_gfp_context(gfp_mask);
}
+EXPORT_SYMBOL_GPL(lockdep_set_current_reclaim_state);
void lockdep_clear_current_reclaim_state(void)
{
current->lockdep_reclaim_gfp = 0;
}
+EXPORT_SYMBOL_GPL(lockdep_clear_current_reclaim_state);
#ifdef CONFIG_LOCK_STAT
static int
@@ -3880,10 +3975,10 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
return 0;
printk("\n");
- printk("=================================\n");
- printk("[ BUG: bad contention detected! ]\n");
+ pr_warn("=================================\n");
+ pr_warn("WARNING: bad contention detected!\n");
print_kernel_ident();
- printk("---------------------------------\n");
+ pr_warn("---------------------------------\n");
printk("%s/%d is trying to contend lock (",
curr->comm, task_pid_nr(curr));
print_lockdep_cache(lock);
@@ -3903,7 +3998,7 @@ static void
__lock_contended(struct lockdep_map *lock, unsigned long ip)
{
struct task_struct *curr = current;
- struct held_lock *hlock, *prev_hlock;
+ struct held_lock *hlock;
struct lock_class_stats *stats;
unsigned int depth;
int i, contention_point, contending_point;
@@ -3916,22 +4011,12 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
if (DEBUG_LOCKS_WARN_ON(!depth))
return;
- prev_hlock = NULL;
- for (i = depth-1; i >= 0; i--) {
- hlock = curr->held_locks + i;
- /*
- * We must not cross into another context:
- */
- if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
- break;
- if (match_held_lock(hlock, lock))
- goto found_it;
- prev_hlock = hlock;
+ hlock = find_held_lock(curr, lock, depth, &i);
+ if (!hlock) {
+ print_lock_contention_bug(curr, lock, ip);
+ return;
}
- print_lock_contention_bug(curr, lock, ip);
- return;
-found_it:
if (hlock->instance != lock)
return;
@@ -3955,7 +4040,7 @@ static void
__lock_acquired(struct lockdep_map *lock, unsigned long ip)
{
struct task_struct *curr = current;
- struct held_lock *hlock, *prev_hlock;
+ struct held_lock *hlock;
struct lock_class_stats *stats;
unsigned int depth;
u64 now, waittime = 0;
@@ -3969,22 +4054,12 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
if (DEBUG_LOCKS_WARN_ON(!depth))
return;
- prev_hlock = NULL;
- for (i = depth-1; i >= 0; i--) {
- hlock = curr->held_locks + i;
- /*
- * We must not cross into another context:
- */
- if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
- break;
- if (match_held_lock(hlock, lock))
- goto found_it;
- prev_hlock = hlock;
+ hlock = find_held_lock(curr, lock, depth, &i);
+ if (!hlock) {
+ print_lock_contention_bug(curr, lock, _RET_IP_);
+ return;
}
- print_lock_contention_bug(curr, lock, _RET_IP_);
- return;
-found_it:
if (hlock->instance != lock)
return;
@@ -4172,7 +4247,7 @@ void lockdep_reset_lock(struct lockdep_map *lock)
* If the class exists we look it up and zap it:
*/
class = look_up_lock_class(lock, j);
- if (class)
+ if (!IS_ERR_OR_NULL(class))
zap_class(class);
}
/*
@@ -4244,10 +4319,10 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
return;
printk("\n");
- printk("=========================\n");
- printk("[ BUG: held lock freed! ]\n");
+ pr_warn("=========================\n");
+ pr_warn("WARNING: held lock freed!\n");
print_kernel_ident();
- printk("-------------------------\n");
+ pr_warn("-------------------------\n");
printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
print_lock(hlock);
@@ -4302,11 +4377,11 @@ static void print_held_locks_bug(void)
return;
printk("\n");
- printk("=====================================\n");
- printk("[ BUG: %s/%d still has locks held! ]\n",
+ pr_warn("====================================\n");
+ pr_warn("WARNING: %s/%d still has locks held!\n",
current->comm, task_pid_nr(current));
print_kernel_ident();
- printk("-------------------------------------\n");
+ pr_warn("------------------------------------\n");
lockdep_print_held_locks(current);
printk("\nstack backtrace:\n");
dump_stack();
@@ -4371,7 +4446,7 @@ retry:
} while_each_thread(g, p);
printk("\n");
- printk("=============================================\n\n");
+ pr_warn("=============================================\n\n");
if (unlock)
read_unlock(&tasklist_lock);
@@ -4401,10 +4476,10 @@ asmlinkage __visible void lockdep_sys_exit(void)
if (!debug_locks_off())
return;
printk("\n");
- printk("================================================\n");
- printk("[ BUG: lock held when returning to user space! ]\n");
+ pr_warn("================================================\n");
+ pr_warn("WARNING: lock held when returning to user space!\n");
print_kernel_ident();
- printk("------------------------------------------------\n");
+ pr_warn("------------------------------------------------\n");
printk("%s/%d is leaving the kernel with locks still held!\n",
curr->comm, curr->pid);
lockdep_print_held_locks(curr);
@@ -4421,13 +4496,13 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */
/* Note: the following can be executed concurrently, so be careful. */
printk("\n");
- pr_err("===============================\n");
- pr_err("[ ERR: suspicious RCU usage. ]\n");
+ pr_warn("=============================\n");
+ pr_warn("WARNING: suspicious RCU usage\n");
print_kernel_ident();
- pr_err("-------------------------------\n");
- pr_err("%s:%d %s!\n", file, line, s);
- pr_err("\nother info that might help us debug this:\n\n");
- pr_err("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
+ pr_warn("-----------------------------\n");
+ printk("%s:%d %s!\n", file, line, s);
+ printk("\nother info that might help us debug this:\n\n");
+ printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
!rcu_lockdep_current_cpu_online()
? "RCU used illegally from offline CPU!\n"
: !rcu_is_watching()
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index c2b88490d857..c08fbd2f5ba9 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -46,13 +46,13 @@ enum {
(LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ)
/*
- * CONFIG_PROVE_LOCKING_SMALL is defined for sparc. Sparc requires .text,
+ * CONFIG_LOCKDEP_SMALL is defined for sparc. Sparc requires .text,
* .data and .bss to fit in required 32MB limit for the kernel. With
- * PROVE_LOCKING we could go over this limit and cause system boot-up problems.
+ * CONFIG_LOCKDEP we could go over this limit and cause system boot-up problems.
* So, reduce the static allocations for lockdeps related structures so that
* everything fits in current required size limit.
*/
-#ifdef CONFIG_PROVE_LOCKING_SMALL
+#ifdef CONFIG_LOCKDEP_SMALL
/*
* MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies
* we track.
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index 97ee9df32e0f..58e366ad36f4 100644
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -102,10 +102,11 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
return;
}
- printk("\n============================================\n");
- printk( "[ BUG: circular locking deadlock detected! ]\n");
- printk("%s\n", print_tainted());
- printk( "--------------------------------------------\n");
+ pr_warn("\n");
+ pr_warn("============================================\n");
+ pr_warn("WARNING: circular locking deadlock detected!\n");
+ pr_warn("%s\n", print_tainted());
+ pr_warn("--------------------------------------------\n");
printk("%s/%d is deadlocking current task %s/%d\n\n",
task->comm, task_pid_nr(task),
current->comm, task_pid_nr(current));
@@ -174,12 +175,3 @@ void debug_rt_mutex_init(struct rt_mutex *lock, const char *name)
lock->name = name;
}
-void
-rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task)
-{
-}
-
-void rt_mutex_deadlock_account_unlock(struct task_struct *task)
-{
-}
-
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
index d0519c3432b6..b585af9a1b50 100644
--- a/kernel/locking/rtmutex-debug.h
+++ b/kernel/locking/rtmutex-debug.h
@@ -9,9 +9,6 @@
* This file contains macros used solely by rtmutex.c. Debug version.
*/
-extern void
-rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task);
-extern void rt_mutex_deadlock_account_unlock(struct task_struct *task);
extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 6edc32ecd9c5..28cd09e635ed 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -224,6 +224,12 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
}
#endif
+/*
+ * Only use with rt_mutex_waiter_{less,equal}()
+ */
+#define task_to_waiter(p) \
+ &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline }
+
static inline int
rt_mutex_waiter_less(struct rt_mutex_waiter *left,
struct rt_mutex_waiter *right)
@@ -238,12 +244,30 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left,
* then right waiter has a dl_prio() too.
*/
if (dl_prio(left->prio))
- return dl_time_before(left->task->dl.deadline,
- right->task->dl.deadline);
+ return dl_time_before(left->deadline, right->deadline);
return 0;
}
+static inline int
+rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
+ struct rt_mutex_waiter *right)
+{
+ if (left->prio != right->prio)
+ return 0;
+
+ /*
+ * If both waiters have dl_prio(), we check the deadlines of the
+ * associated tasks.
+ * If left waiter has a dl_prio(), and we didn't return 0 above,
+ * then right waiter has a dl_prio() too.
+ */
+ if (dl_prio(left->prio))
+ return left->deadline == right->deadline;
+
+ return 1;
+}
+
static void
rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
{
@@ -322,72 +346,16 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
RB_CLEAR_NODE(&waiter->pi_tree_entry);
}
-/*
- * Calculate task priority from the waiter tree priority
- *
- * Return task->normal_prio when the waiter tree is empty or when
- * the waiter is not allowed to do priority boosting
- */
-int rt_mutex_getprio(struct task_struct *task)
+static void rt_mutex_adjust_prio(struct task_struct *p)
{
- if (likely(!task_has_pi_waiters(task)))
- return task->normal_prio;
+ struct task_struct *pi_task = NULL;
- return min(task_top_pi_waiter(task)->prio,
- task->normal_prio);
-}
+ lockdep_assert_held(&p->pi_lock);
-struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
-{
- if (likely(!task_has_pi_waiters(task)))
- return NULL;
+ if (task_has_pi_waiters(p))
+ pi_task = task_top_pi_waiter(p)->task;
- return task_top_pi_waiter(task)->task;
-}
-
-/*
- * Called by sched_setscheduler() to get the priority which will be
- * effective after the change.
- */
-int rt_mutex_get_effective_prio(struct task_struct *task, int newprio)
-{
- if (!task_has_pi_waiters(task))
- return newprio;
-
- if (task_top_pi_waiter(task)->task->prio <= newprio)
- return task_top_pi_waiter(task)->task->prio;
- return newprio;
-}
-
-/*
- * Adjust the priority of a task, after its pi_waiters got modified.
- *
- * This can be both boosting and unboosting. task->pi_lock must be held.
- */
-static void __rt_mutex_adjust_prio(struct task_struct *task)
-{
- int prio = rt_mutex_getprio(task);
-
- if (task->prio != prio || dl_prio(prio))
- rt_mutex_setprio(task, prio);
-}
-
-/*
- * Adjust task priority (undo boosting). Called from the exit path of
- * rt_mutex_slowunlock() and rt_mutex_slowlock().
- *
- * (Note: We do this outside of the protection of lock->wait_lock to
- * allow the lock to be taken while or before we readjust the priority
- * of task. We do not use the spin_xx_mutex() variants here as we are
- * outside of the debug path.)
- */
-void rt_mutex_adjust_prio(struct task_struct *task)
-{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&task->pi_lock, flags);
- __rt_mutex_adjust_prio(task);
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ rt_mutex_setprio(p, pi_task);
}
/*
@@ -610,7 +578,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* enabled we continue, but stop the requeueing in the chain
* walk.
*/
- if (waiter->prio == task->prio) {
+ if (rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
if (!detect_deadlock)
goto out_unlock_pi;
else
@@ -706,7 +674,26 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
/* [7] Requeue the waiter in the lock waiter tree. */
rt_mutex_dequeue(lock, waiter);
+
+ /*
+ * Update the waiter prio fields now that we're dequeued.
+ *
+ * These values can have changed through either:
+ *
+ * sys_sched_set_scheduler() / sys_sched_setattr()
+ *
+ * or
+ *
+ * DL CBS enforcement advancing the effective deadline.
+ *
+ * Even though pi_waiters also uses these fields, and that tree is only
+ * updated in [11], we can do this here, since we hold [L], which
+ * serializes all pi_waiters access and rb_erase() does not care about
+ * the values of the node being removed.
+ */
waiter->prio = task->prio;
+ waiter->deadline = task->dl.deadline;
+
rt_mutex_enqueue(lock, waiter);
/* [8] Release the task */
@@ -747,7 +734,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
*/
rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
rt_mutex_enqueue_pi(task, waiter);
- __rt_mutex_adjust_prio(task);
+ rt_mutex_adjust_prio(task);
} else if (prerequeue_top_waiter == waiter) {
/*
@@ -763,7 +750,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
rt_mutex_dequeue_pi(task, waiter);
waiter = rt_mutex_top_waiter(lock);
rt_mutex_enqueue_pi(task, waiter);
- __rt_mutex_adjust_prio(task);
+ rt_mutex_adjust_prio(task);
} else {
/*
* Nothing changed. No need to do any priority
@@ -833,6 +820,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
struct rt_mutex_waiter *waiter)
{
+ lockdep_assert_held(&lock->wait_lock);
+
/*
* Before testing whether we can acquire @lock, we set the
* RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all
@@ -892,7 +881,8 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
* the top waiter priority (kernel view),
* @task lost.
*/
- if (task->prio >= rt_mutex_top_waiter(lock)->prio)
+ if (!rt_mutex_waiter_less(task_to_waiter(task),
+ rt_mutex_top_waiter(lock)))
return 0;
/*
@@ -938,8 +928,6 @@ takeit:
*/
rt_mutex_set_owner(lock, task);
- rt_mutex_deadlock_account_lock(lock, task);
-
return 1;
}
@@ -960,6 +948,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
struct rt_mutex *next_lock;
int chain_walk = 0, res;
+ lockdep_assert_held(&lock->wait_lock);
+
/*
* Early deadlock detection. We really don't want the task to
* enqueue on itself just to untangle the mess later. It's not
@@ -973,10 +963,11 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
return -EDEADLK;
raw_spin_lock(&task->pi_lock);
- __rt_mutex_adjust_prio(task);
+ rt_mutex_adjust_prio(task);
waiter->task = task;
waiter->lock = lock;
waiter->prio = task->prio;
+ waiter->deadline = task->dl.deadline;
/* Get the top priority waiter on the lock */
if (rt_mutex_has_waiters(lock))
@@ -995,7 +986,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
rt_mutex_dequeue_pi(owner, top_waiter);
rt_mutex_enqueue_pi(owner, waiter);
- __rt_mutex_adjust_prio(owner);
+ rt_mutex_adjust_prio(owner);
if (owner->pi_blocked_on)
chain_walk = 1;
} else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
@@ -1047,12 +1038,14 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
waiter = rt_mutex_top_waiter(lock);
/*
- * Remove it from current->pi_waiters. We do not adjust a
- * possible priority boost right now. We execute wakeup in the
- * boosted mode and go back to normal after releasing
- * lock->wait_lock.
+ * Remove it from current->pi_waiters and deboost.
+ *
+ * We must in fact deboost here in order to ensure we call
+ * rt_mutex_setprio() to update p->pi_top_task before the
+ * task unblocks.
*/
rt_mutex_dequeue_pi(current, waiter);
+ rt_mutex_adjust_prio(current);
/*
* As we are waking up the top waiter, and the waiter stays
@@ -1064,9 +1057,19 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
*/
lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
- raw_spin_unlock(&current->pi_lock);
-
+ /*
+ * We deboosted before waking the top waiter task such that we don't
+ * run two tasks with the 'same' priority (and ensure the
+ * p->pi_top_task pointer points to a blocked task). This however can
+ * lead to priority inversion if we would get preempted after the
+ * deboost but before waking our donor task, hence the preempt_disable()
+ * before unlock.
+ *
+ * Pairs with preempt_enable() in rt_mutex_postunlock();
+ */
+ preempt_disable();
wake_q_add(wake_q, waiter->task);
+ raw_spin_unlock(&current->pi_lock);
}
/*
@@ -1082,6 +1085,8 @@ static void remove_waiter(struct rt_mutex *lock,
struct task_struct *owner = rt_mutex_owner(lock);
struct rt_mutex *next_lock;
+ lockdep_assert_held(&lock->wait_lock);
+
raw_spin_lock(&current->pi_lock);
rt_mutex_dequeue(lock, waiter);
current->pi_blocked_on = NULL;
@@ -1101,7 +1106,7 @@ static void remove_waiter(struct rt_mutex *lock,
if (rt_mutex_has_waiters(lock))
rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
- __rt_mutex_adjust_prio(owner);
+ rt_mutex_adjust_prio(owner);
/* Store the lock on which owner is blocked or NULL */
next_lock = task_blocked_on_lock(owner);
@@ -1140,8 +1145,7 @@ void rt_mutex_adjust_pi(struct task_struct *task)
raw_spin_lock_irqsave(&task->pi_lock, flags);
waiter = task->pi_blocked_on;
- if (!waiter || (waiter->prio == task->prio &&
- !dl_prio(task->prio))) {
+ if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
return;
}
@@ -1155,6 +1159,14 @@ void rt_mutex_adjust_pi(struct task_struct *task)
next_lock, NULL, task);
}
+void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
+{
+ debug_rt_mutex_init_waiter(waiter);
+ RB_CLEAR_NODE(&waiter->pi_tree_entry);
+ RB_CLEAR_NODE(&waiter->tree_entry);
+ waiter->task = NULL;
+}
+
/**
* __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
* @lock: the rt_mutex to take
@@ -1237,9 +1249,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
unsigned long flags;
int ret = 0;
- debug_rt_mutex_init_waiter(&waiter);
- RB_CLEAR_NODE(&waiter.pi_tree_entry);
- RB_CLEAR_NODE(&waiter.tree_entry);
+ rt_mutex_init_waiter(&waiter);
/*
* Technically we could use raw_spin_[un]lock_irq() here, but this can
@@ -1330,7 +1340,8 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
/*
* Slow path to release a rt-mutex.
- * Return whether the current task needs to undo a potential priority boosting.
+ *
+ * Return whether the current task needs to call rt_mutex_postunlock().
*/
static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
struct wake_q_head *wake_q)
@@ -1342,8 +1353,6 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
debug_rt_mutex_unlock(lock);
- rt_mutex_deadlock_account_unlock(current);
-
/*
* We must be careful here if the fast path is enabled. If we
* have no waiters queued we cannot set owner to NULL here
@@ -1390,11 +1399,9 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
* Queue the next waiter for wakeup once we release the wait_lock.
*/
mark_wakeup_next_waiter(wake_q, lock);
-
raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
- /* check PI boosting */
- return true;
+ return true; /* call rt_mutex_postunlock() */
}
/*
@@ -1409,11 +1416,10 @@ rt_mutex_fastlock(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout,
enum rtmutex_chainwalk chwalk))
{
- if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
- rt_mutex_deadlock_account_lock(lock, current);
+ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
return 0;
- } else
- return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
+
+ return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
}
static inline int
@@ -1425,24 +1431,33 @@ rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
enum rtmutex_chainwalk chwalk))
{
if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
- likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
- rt_mutex_deadlock_account_lock(lock, current);
+ likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
return 0;
- } else
- return slowfn(lock, state, timeout, chwalk);
+
+ return slowfn(lock, state, timeout, chwalk);
}
static inline int
rt_mutex_fasttrylock(struct rt_mutex *lock,
int (*slowfn)(struct rt_mutex *lock))
{
- if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
- rt_mutex_deadlock_account_lock(lock, current);
+ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
return 1;
- }
+
return slowfn(lock);
}
+/*
+ * Performs the wakeup of the the top-waiter and re-enables preemption.
+ */
+void rt_mutex_postunlock(struct wake_q_head *wake_q)
+{
+ wake_up_q(wake_q);
+
+ /* Pairs with preempt_disable() in rt_mutex_slowunlock() */
+ preempt_enable();
+}
+
static inline void
rt_mutex_fastunlock(struct rt_mutex *lock,
bool (*slowfn)(struct rt_mutex *lock,
@@ -1450,18 +1465,11 @@ rt_mutex_fastunlock(struct rt_mutex *lock,
{
DEFINE_WAKE_Q(wake_q);
- if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
- rt_mutex_deadlock_account_unlock(current);
-
- } else {
- bool deboost = slowfn(lock, &wake_q);
-
- wake_up_q(&wake_q);
+ if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
+ return;
- /* Undo pi boosting if necessary: */
- if (deboost)
- rt_mutex_adjust_prio(current);
- }
+ if (slowfn(lock, &wake_q))
+ rt_mutex_postunlock(&wake_q);
}
/**
@@ -1495,16 +1503,11 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
/*
- * Futex variant with full deadlock detection.
+ * Futex variant, must not use fastpath.
*/
-int rt_mutex_timed_futex_lock(struct rt_mutex *lock,
- struct hrtimer_sleeper *timeout)
+int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
{
- might_sleep();
-
- return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
- RT_MUTEX_FULL_CHAINWALK,
- rt_mutex_slowlock);
+ return rt_mutex_slowtrylock(lock);
}
/**
@@ -1563,20 +1566,43 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)
EXPORT_SYMBOL_GPL(rt_mutex_unlock);
/**
- * rt_mutex_futex_unlock - Futex variant of rt_mutex_unlock
- * @lock: the rt_mutex to be unlocked
- *
- * Returns: true/false indicating whether priority adjustment is
- * required or not.
+ * Futex variant, that since futex variants do not use the fast-path, can be
+ * simple and will not need to retry.
*/
-bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
- struct wake_q_head *wqh)
+bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
+ struct wake_q_head *wake_q)
{
- if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
- rt_mutex_deadlock_account_unlock(current);
- return false;
+ lockdep_assert_held(&lock->wait_lock);
+
+ debug_rt_mutex_unlock(lock);
+
+ if (!rt_mutex_has_waiters(lock)) {
+ lock->owner = NULL;
+ return false; /* done */
}
- return rt_mutex_slowunlock(lock, wqh);
+
+ /*
+ * We've already deboosted, mark_wakeup_next_waiter() will
+ * retain preempt_disabled when we drop the wait_lock, to
+ * avoid inversion prior to the wakeup. preempt_disable()
+ * therein pairs with rt_mutex_postunlock().
+ */
+ mark_wakeup_next_waiter(wake_q, lock);
+
+ return true; /* call postunlock() */
+}
+
+void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
+{
+ DEFINE_WAKE_Q(wake_q);
+ bool postunlock;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ postunlock = __rt_mutex_futex_unlock(lock, &wake_q);
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ if (postunlock)
+ rt_mutex_postunlock(&wake_q);
}
/**
@@ -1637,7 +1663,6 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
__rt_mutex_init(lock, NULL);
debug_rt_mutex_proxy_lock(lock, proxy_owner);
rt_mutex_set_owner(lock, proxy_owner);
- rt_mutex_deadlock_account_lock(lock, proxy_owner);
}
/**
@@ -1657,34 +1682,16 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
{
debug_rt_mutex_proxy_unlock(lock);
rt_mutex_set_owner(lock, NULL);
- rt_mutex_deadlock_account_unlock(proxy_owner);
}
-/**
- * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
- * @lock: the rt_mutex to take
- * @waiter: the pre-initialized rt_mutex_waiter
- * @task: the task to prepare
- *
- * Returns:
- * 0 - task blocked on lock
- * 1 - acquired the lock for task, caller should wake it up
- * <0 - error
- *
- * Special API call for FUTEX_REQUEUE_PI support.
- */
-int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
struct task_struct *task)
{
int ret;
- raw_spin_lock_irq(&lock->wait_lock);
-
- if (try_to_take_rt_mutex(lock, task, NULL)) {
- raw_spin_unlock_irq(&lock->wait_lock);
+ if (try_to_take_rt_mutex(lock, task, NULL))
return 1;
- }
/* We enforce deadlock detection for futexes */
ret = task_blocks_on_rt_mutex(lock, waiter, task,
@@ -1703,14 +1710,38 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
if (unlikely(ret))
remove_waiter(lock, waiter);
- raw_spin_unlock_irq(&lock->wait_lock);
-
debug_rt_mutex_print_deadlock(waiter);
return ret;
}
/**
+ * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+ * @lock: the rt_mutex to take
+ * @waiter: the pre-initialized rt_mutex_waiter
+ * @task: the task to prepare
+ *
+ * Returns:
+ * 0 - task blocked on lock
+ * 1 - acquired the lock for task, caller should wake it up
+ * <0 - error
+ *
+ * Special API call for FUTEX_REQUEUE_PI support.
+ */
+int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task)
+{
+ int ret;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return ret;
+}
+
+/**
* rt_mutex_next_owner - return the next owner of the lock
*
* @lock: the rt lock query
@@ -1731,36 +1762,87 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
}
/**
- * rt_mutex_finish_proxy_lock() - Complete lock acquisition
+ * rt_mutex_wait_proxy_lock() - Wait for lock acquisition
* @lock: the rt_mutex we were woken on
* @to: the timeout, null if none. hrtimer should already have
* been started.
* @waiter: the pre-initialized rt_mutex_waiter
*
- * Complete the lock acquisition started our behalf by another thread.
+ * Wait for the the lock acquisition started on our behalf by
+ * rt_mutex_start_proxy_lock(). Upon failure, the caller must call
+ * rt_mutex_cleanup_proxy_lock().
*
* Returns:
* 0 - success
* <0 - error, one of -EINTR, -ETIMEDOUT
*
- * Special API call for PI-futex requeue support
+ * Special API call for PI-futex support
*/
-int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
+int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
struct hrtimer_sleeper *to,
struct rt_mutex_waiter *waiter)
{
int ret;
raw_spin_lock_irq(&lock->wait_lock);
-
- set_current_state(TASK_INTERRUPTIBLE);
-
/* sleep on the mutex */
+ set_current_state(TASK_INTERRUPTIBLE);
ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+ * have to fix that up.
+ */
+ fixup_rt_mutex_waiters(lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
- if (unlikely(ret))
- remove_waiter(lock, waiter);
+ return ret;
+}
+
+/**
+ * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition
+ * @lock: the rt_mutex we were woken on
+ * @waiter: the pre-initialized rt_mutex_waiter
+ *
+ * Attempt to clean up after a failed rt_mutex_wait_proxy_lock().
+ *
+ * Unless we acquired the lock; we're still enqueued on the wait-list and can
+ * in fact still be granted ownership until we're removed. Therefore we can
+ * find we are in fact the owner and must disregard the
+ * rt_mutex_wait_proxy_lock() failure.
+ *
+ * Returns:
+ * true - did the cleanup, we done.
+ * false - we acquired the lock after rt_mutex_wait_proxy_lock() returned,
+ * caller should disregards its return value.
+ *
+ * Special API call for PI-futex support
+ */
+bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter)
+{
+ bool cleanup = false;
+ raw_spin_lock_irq(&lock->wait_lock);
+ /*
+ * Do an unconditional try-lock, this deals with the lock stealing
+ * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter()
+ * sets a NULL owner.
+ *
+ * We're not interested in the return value, because the subsequent
+ * test on rt_mutex_owner() will infer that. If the trylock succeeded,
+ * we will own the lock and it will have removed the waiter. If we
+ * failed the trylock, we're still not owner and we need to remove
+ * ourselves.
+ */
+ try_to_take_rt_mutex(lock, current, waiter);
+ /*
+ * Unless we're the owner; we're still enqueued on the wait_list.
+ * So check if we became owner, if not, take us off the wait_list.
+ */
+ if (rt_mutex_owner(lock) != current) {
+ remove_waiter(lock, waiter);
+ cleanup = true;
+ }
/*
* try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
* have to fix that up.
@@ -1769,5 +1851,5 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
raw_spin_unlock_irq(&lock->wait_lock);
- return ret;
+ return cleanup;
}
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
index c4060584c407..6607802efa8b 100644
--- a/kernel/locking/rtmutex.h
+++ b/kernel/locking/rtmutex.h
@@ -11,8 +11,6 @@
*/
#define rt_mutex_deadlock_check(l) (0)
-#define rt_mutex_deadlock_account_lock(m, t) do { } while (0)
-#define rt_mutex_deadlock_account_unlock(l) do { } while (0)
#define debug_rt_mutex_init_waiter(w) do { } while (0)
#define debug_rt_mutex_free_waiter(w) do { } while (0)
#define debug_rt_mutex_lock(l) do { } while (0)
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 856dfff5c33a..72ad45a9a794 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -34,6 +34,7 @@ struct rt_mutex_waiter {
struct rt_mutex *deadlock_lock;
#endif
int prio;
+ u64 deadline;
};
/*
@@ -103,16 +104,26 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
struct task_struct *proxy_owner);
extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
struct task_struct *proxy_owner);
+extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
+extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task);
extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
struct task_struct *task);
-extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
- struct hrtimer_sleeper *to,
- struct rt_mutex_waiter *waiter);
-extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
-extern bool rt_mutex_futex_unlock(struct rt_mutex *lock,
- struct wake_q_head *wqh);
-extern void rt_mutex_adjust_prio(struct task_struct *task);
+extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
+ struct hrtimer_sleeper *to,
+ struct rt_mutex_waiter *waiter);
+extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter);
+
+extern int rt_mutex_futex_trylock(struct rt_mutex *l);
+
+extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
+extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
+ struct wake_q_head *wqh);
+
+extern void rt_mutex_postunlock(struct wake_q_head *wake_q);
#ifdef CONFIG_DEBUG_RT_MUTEXES
# include "rtmutex-debug.h"
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 90a74ccd85a4..4d48b1c4870d 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -124,10 +124,8 @@ EXPORT_SYMBOL(up_write);
*/
void downgrade_write(struct rw_semaphore *sem)
{
- /*
- * lockdep: a downgraded write will live on as a write
- * dependency.
- */
+ lock_downgrade(&sem->dep_map, _RET_IP_);
+
rwsem_set_reader_owned(sem);
__downgrade_write(sem);
}
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 6b7abb334ca6..39f56c870051 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -353,8 +353,8 @@ static int test_cycle(unsigned int ncpus)
struct stress {
struct work_struct work;
struct ww_mutex *locks;
+ unsigned long timeout;
int nlocks;
- int nloops;
};
static int *get_random_order(int count)
@@ -398,12 +398,11 @@ static void stress_inorder_work(struct work_struct *work)
if (!order)
return;
- ww_acquire_init(&ctx, &ww_class);
-
do {
int contended = -1;
int n, err;
+ ww_acquire_init(&ctx, &ww_class);
retry:
err = 0;
for (n = 0; n < nlocks; n++) {
@@ -433,9 +432,9 @@ retry:
__func__, err);
break;
}
- } while (--stress->nloops);
- ww_acquire_fini(&ctx);
+ ww_acquire_fini(&ctx);
+ } while (!time_after(jiffies, stress->timeout));
kfree(order);
kfree(stress);
@@ -470,9 +469,9 @@ static void stress_reorder_work(struct work_struct *work)
kfree(order);
order = NULL;
- ww_acquire_init(&ctx, &ww_class);
-
do {
+ ww_acquire_init(&ctx, &ww_class);
+
list_for_each_entry(ll, &locks, link) {
err = ww_mutex_lock(ll->lock, &ctx);
if (!err)
@@ -495,9 +494,9 @@ static void stress_reorder_work(struct work_struct *work)
dummy_load(stress);
list_for_each_entry(ll, &locks, link)
ww_mutex_unlock(ll->lock);
- } while (--stress->nloops);
- ww_acquire_fini(&ctx);
+ ww_acquire_fini(&ctx);
+ } while (!time_after(jiffies, stress->timeout));
out:
list_for_each_entry_safe(ll, ln, &locks, link)
@@ -523,7 +522,7 @@ static void stress_one_work(struct work_struct *work)
__func__, err);
break;
}
- } while (--stress->nloops);
+ } while (!time_after(jiffies, stress->timeout));
kfree(stress);
}
@@ -533,7 +532,7 @@ static void stress_one_work(struct work_struct *work)
#define STRESS_ONE BIT(2)
#define STRESS_ALL (STRESS_INORDER | STRESS_REORDER | STRESS_ONE)
-static int stress(int nlocks, int nthreads, int nloops, unsigned int flags)
+static int stress(int nlocks, int nthreads, unsigned int flags)
{
struct ww_mutex *locks;
int n;
@@ -575,7 +574,7 @@ static int stress(int nlocks, int nthreads, int nloops, unsigned int flags)
INIT_WORK(&stress->work, fn);
stress->locks = locks;
stress->nlocks = nlocks;
- stress->nloops = nloops;
+ stress->timeout = jiffies + 2*HZ;
queue_work(wq, &stress->work);
nthreads--;
@@ -619,15 +618,15 @@ static int __init test_ww_mutex_init(void)
if (ret)
return ret;
- ret = stress(16, 2*ncpus, 1<<10, STRESS_INORDER);
+ ret = stress(16, 2*ncpus, STRESS_INORDER);
if (ret)
return ret;
- ret = stress(16, 2*ncpus, 1<<10, STRESS_REORDER);
+ ret = stress(16, 2*ncpus, STRESS_REORDER);
if (ret)
return ret;
- ret = stress(4095, hweight32(STRESS_ALL)*ncpus, 1<<12, STRESS_ALL);
+ ret = stress(4095, hweight32(STRESS_ALL)*ncpus, STRESS_ALL);
if (ret)
return ret;
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 07e85e5229da..23a6483c3666 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -182,18 +182,6 @@ struct page_map {
struct vmem_altmap altmap;
};
-void get_zone_device_page(struct page *page)
-{
- percpu_ref_get(page->pgmap->ref);
-}
-EXPORT_SYMBOL(get_zone_device_page);
-
-void put_zone_device_page(struct page *page)
-{
- put_dev_pagemap(page->pgmap);
-}
-EXPORT_SYMBOL(put_zone_device_page);
-
static void pgmap_radix_release(struct resource *res)
{
resource_size_t key, align_start, align_size, align_end;
@@ -237,6 +225,10 @@ static void devm_memremap_pages_release(struct device *dev, void *data)
struct resource *res = &page_map->res;
resource_size_t align_start, align_size;
struct dev_pagemap *pgmap = &page_map->pgmap;
+ unsigned long pfn;
+
+ for_each_device_pfn(pfn, page_map)
+ put_page(pfn_to_page(pfn));
if (percpu_ref_tryget_live(pgmap->ref)) {
dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
@@ -277,7 +269,10 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
*
* Notes:
* 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time
- * (or devm release event).
+ * (or devm release event). The expected order of events is that @ref has
+ * been through percpu_ref_kill() before devm_memremap_pages_release(). The
+ * wait for the completion of all references being dropped and
+ * percpu_ref_exit() must occur after devm_memremap_pages_release().
*
* 2/ @res is expected to be a host memory range that could feasibly be
* treated as a "System RAM" range, i.e. not a device mmio range, but
@@ -379,6 +374,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
*/
list_del(&page->lru);
page->pgmap = pgmap;
+ percpu_ref_get(ref);
}
devres_add(dev, page_map);
return __va(res->start);
diff --git a/kernel/module.c b/kernel/module.c
index 7eba6dea4f41..4a3665f8f837 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -49,6 +49,9 @@
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
+#ifdef CONFIG_STRICT_MODULE_RWX
+#include <asm/set_memory.h>
+#endif
#include <asm/mmu_context.h>
#include <linux/license.h>
#include <asm/sections.h>
@@ -665,16 +668,7 @@ static void percpu_modcopy(struct module *mod,
memcpy(per_cpu_ptr(mod->percpu, cpu), from, size);
}
-/**
- * is_module_percpu_address - test whether address is from module static percpu
- * @addr: address to test
- *
- * Test whether @addr belongs to module static percpu area.
- *
- * RETURNS:
- * %true if @addr is from module static percpu area
- */
-bool is_module_percpu_address(unsigned long addr)
+bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
{
struct module *mod;
unsigned int cpu;
@@ -688,9 +682,15 @@ bool is_module_percpu_address(unsigned long addr)
continue;
for_each_possible_cpu(cpu) {
void *start = per_cpu_ptr(mod->percpu, cpu);
-
- if ((void *)addr >= start &&
- (void *)addr < start + mod->percpu_size) {
+ void *va = (void *)addr;
+
+ if (va >= start && va < start + mod->percpu_size) {
+ if (can_addr) {
+ *can_addr = (unsigned long) (va - start);
+ *can_addr += (unsigned long)
+ per_cpu_ptr(mod->percpu,
+ get_boot_cpu_id());
+ }
preempt_enable();
return true;
}
@@ -701,6 +701,20 @@ bool is_module_percpu_address(unsigned long addr)
return false;
}
+/**
+ * is_module_percpu_address - test whether address is from module static percpu
+ * @addr: address to test
+ *
+ * Test whether @addr belongs to module static percpu area.
+ *
+ * RETURNS:
+ * %true if @addr is from module static percpu area
+ */
+bool is_module_percpu_address(unsigned long addr)
+{
+ return __is_module_percpu_address(addr, NULL);
+}
+
#else /* ... !CONFIG_SMP */
static inline void __percpu *mod_percpu(struct module *mod)
@@ -732,6 +746,11 @@ bool is_module_percpu_address(unsigned long addr)
return false;
}
+bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
+{
+ return false;
+}
+
#endif /* CONFIG_SMP */
#define MODINFO_ATTR(field) \
@@ -947,6 +966,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
return -EFAULT;
name[MODULE_NAME_LEN-1] = '\0';
+ audit_log_kern_module(name);
+
if (mutex_lock_interruptible(&module_mutex) != 0)
return -EINTR;
@@ -2846,7 +2867,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len,
/* Suck in entire file: we'll want most of it. */
info->hdr = __vmalloc(info->len,
- GFP_KERNEL | __GFP_HIGHMEM | __GFP_NOWARN, PAGE_KERNEL);
+ GFP_KERNEL | __GFP_NOWARN, PAGE_KERNEL);
if (!info->hdr)
return -ENOMEM;
@@ -4017,7 +4038,7 @@ unsigned long module_kallsyms_lookup_name(const char *name)
/* Don't lock: we're in enough trouble already. */
preempt_disable();
- if ((colon = strchr(name, ':')) != NULL) {
+ if ((colon = strnchr(name, MODULE_NAME_LEN, ':')) != NULL) {
if ((mod = find_module_all(name, colon - name, false)) != NULL)
ret = mod_find_symname(mod, colon+1);
} else {
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 782102e59eed..f6c5d330059a 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -26,6 +26,7 @@
#include <linux/file.h>
#include <linux/syscalls.h>
#include <linux/cgroup.h>
+#include <linux/perf_event.h>
static struct kmem_cache *nsproxy_cachep;
@@ -262,6 +263,8 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
goto out;
}
switch_task_namespaces(tsk, new_nsproxy);
+
+ perf_event_namespaces(tsk);
out:
fput(file);
return err;
diff --git a/kernel/padata.c b/kernel/padata.c
index 05316c9f32da..ac8f1e524836 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -154,8 +154,6 @@ EXPORT_SYMBOL(padata_do_parallel);
* A pointer to the control struct of the next object that needs
* serialization, if present in one of the percpu reorder queues.
*
- * NULL, if all percpu reorder queues are empty.
- *
* -EINPROGRESS, if the next object that needs serialization will
* be parallel processed by another cpu and is not yet present in
* the cpu's reorder queue.
@@ -182,23 +180,22 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
cpu = padata_index_to_cpu(pd, next_index);
next_queue = per_cpu_ptr(pd->pqueue, cpu);
- padata = NULL;
-
reorder = &next_queue->reorder;
+ spin_lock(&reorder->lock);
if (!list_empty(&reorder->list)) {
padata = list_entry(reorder->list.next,
struct padata_priv, list);
- spin_lock(&reorder->lock);
list_del_init(&padata->list);
atomic_dec(&pd->reorder_objects);
- spin_unlock(&reorder->lock);
pd->processed++;
+ spin_unlock(&reorder->lock);
goto out;
}
+ spin_unlock(&reorder->lock);
if (__this_cpu_read(pd->pqueue->cpu_index) == next_queue->cpu_index) {
padata = ERR_PTR(-ENODATA);
@@ -234,12 +231,11 @@ static void padata_reorder(struct parallel_data *pd)
padata = padata_get_next(pd);
/*
- * All reorder queues are empty, or the next object that needs
- * serialization is parallel processed by another cpu and is
- * still on it's way to the cpu's reorder queue, nothing to
- * do for now.
+ * If the next object that needs serialization is parallel
+ * processed by another cpu and is still on it's way to the
+ * cpu's reorder queue, nothing to do for now.
*/
- if (!padata || PTR_ERR(padata) == -EINPROGRESS)
+ if (PTR_ERR(padata) == -EINPROGRESS)
break;
/*
@@ -353,7 +349,7 @@ static int padata_setup_cpumasks(struct parallel_data *pd,
cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_online_mask);
if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) {
- free_cpumask_var(pd->cpumask.cbcpu);
+ free_cpumask_var(pd->cpumask.pcpu);
return -ENOMEM;
}
diff --git a/kernel/params.c b/kernel/params.c
index a6d6149c0fe6..60b2d8101355 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -160,58 +160,6 @@ static int parse_one(char *param,
return -ENOENT;
}
-/* You can use " around spaces, but can't escape ". */
-/* Hyphens and underscores equivalent in parameter names. */
-static char *next_arg(char *args, char **param, char **val)
-{
- unsigned int i, equals = 0;
- int in_quote = 0, quoted = 0;
- char *next;
-
- if (*args == '"') {
- args++;
- in_quote = 1;
- quoted = 1;
- }
-
- for (i = 0; args[i]; i++) {
- if (isspace(args[i]) && !in_quote)
- break;
- if (equals == 0) {
- if (args[i] == '=')
- equals = i;
- }
- if (args[i] == '"')
- in_quote = !in_quote;
- }
-
- *param = args;
- if (!equals)
- *val = NULL;
- else {
- args[equals] = '\0';
- *val = args + equals + 1;
-
- /* Don't include quotes in value. */
- if (**val == '"') {
- (*val)++;
- if (args[i-1] == '"')
- args[i-1] = '\0';
- }
- }
- if (quoted && args[i-1] == '"')
- args[i-1] = '\0';
-
- if (args[i]) {
- args[i] = '\0';
- next = args + i + 1;
- } else
- next = args + i;
-
- /* Chew up trailing spaces. */
- return skip_spaces(next);
-}
-
/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
char *parse_args(const char *doing,
char *args,
diff --git a/kernel/pid.c b/kernel/pid.c
index 0143ac0ddceb..fd1cde1e4576 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -321,8 +321,10 @@ struct pid *alloc_pid(struct pid_namespace *ns)
}
if (unlikely(is_child_reaper(pid))) {
- if (pid_ns_prepare_proc(ns))
+ if (pid_ns_prepare_proc(ns)) {
+ disable_pid_allocation(ns);
goto out_free;
+ }
}
get_pid_ns(ns);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index de461aa0bf9a..74a5a7255b4d 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -277,7 +277,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
* if reparented.
*/
for (;;) {
- set_current_state(TASK_UNINTERRUPTIBLE);
+ set_current_state(TASK_INTERRUPTIBLE);
if (pid_ns->nr_hashed == init_pids)
break;
schedule();
@@ -374,6 +374,29 @@ static struct ns_common *pidns_get(struct task_struct *task)
return ns ? &ns->ns : NULL;
}
+static struct ns_common *pidns_for_children_get(struct task_struct *task)
+{
+ struct pid_namespace *ns = NULL;
+
+ task_lock(task);
+ if (task->nsproxy) {
+ ns = task->nsproxy->pid_ns_for_children;
+ get_pid_ns(ns);
+ }
+ task_unlock(task);
+
+ if (ns) {
+ read_lock(&tasklist_lock);
+ if (!ns->child_reaper) {
+ put_pid_ns(ns);
+ ns = NULL;
+ }
+ read_unlock(&tasklist_lock);
+ }
+
+ return ns ? &ns->ns : NULL;
+}
+
static void pidns_put(struct ns_common *ns)
{
put_pid_ns(to_pid_ns(ns));
@@ -443,6 +466,17 @@ const struct proc_ns_operations pidns_operations = {
.get_parent = pidns_get_parent,
};
+const struct proc_ns_operations pidns_for_children_operations = {
+ .name = "pid_for_children",
+ .real_ns_name = "pid",
+ .type = CLONE_NEWPID,
+ .get = pidns_for_children_get,
+ .put = pidns_put,
+ .install = pidns_install,
+ .owner = pidns_owner,
+ .get_parent = pidns_get_parent,
+};
+
static __init int pid_namespaces_init(void)
{
pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index d79a38de425a..fa46606f3356 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -36,6 +36,9 @@
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/io.h>
+#ifdef CONFIG_STRICT_KERNEL_RWX
+#include <asm/set_memory.h>
+#endif
#include "power.h"
@@ -1422,7 +1425,7 @@ static unsigned int nr_meta_pages;
* Numbers of normal and highmem page frames allocated for hibernation image
* before suspending devices.
*/
-unsigned int alloc_normal, alloc_highmem;
+static unsigned int alloc_normal, alloc_highmem;
/*
* Memory bitmap used for marking saveable pages (during hibernation) or
* hibernation image pages (during restore)
diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c
index d5760c42f042..61d41ca41844 100644
--- a/kernel/printk/braille.c
+++ b/kernel/printk/braille.c
@@ -2,12 +2,13 @@
#include <linux/kernel.h>
#include <linux/console.h>
+#include <linux/errno.h>
#include <linux/string.h>
#include "console_cmdline.h"
#include "braille.h"
-char *_braille_console_setup(char **str, char **brl_options)
+int _braille_console_setup(char **str, char **brl_options)
{
if (!strncmp(*str, "brl,", 4)) {
*brl_options = "";
@@ -15,14 +16,14 @@ char *_braille_console_setup(char **str, char **brl_options)
} else if (!strncmp(*str, "brl=", 4)) {
*brl_options = *str + 4;
*str = strchr(*brl_options, ',');
- if (!*str)
+ if (!*str) {
pr_err("need port name after brl=\n");
- else
- *((*str)++) = 0;
- } else
- return NULL;
+ return -EINVAL;
+ }
+ *((*str)++) = 0;
+ }
- return *str;
+ return 0;
}
int
diff --git a/kernel/printk/braille.h b/kernel/printk/braille.h
index 769d771145c8..749a6756843a 100644
--- a/kernel/printk/braille.h
+++ b/kernel/printk/braille.h
@@ -9,7 +9,14 @@ braille_set_options(struct console_cmdline *c, char *brl_options)
c->brl_options = brl_options;
}
-char *
+/*
+ * Setup console according to braille options.
+ * Return -EINVAL on syntax error, 0 on success (or no braille option was
+ * actually given).
+ * Modifies str to point to the serial options
+ * Sets brl_options to the parsed braille options.
+ */
+int
_braille_console_setup(char **str, char **brl_options);
int
@@ -25,10 +32,10 @@ braille_set_options(struct console_cmdline *c, char *brl_options)
{
}
-static inline char *
+static inline int
_braille_console_setup(char **str, char **brl_options)
{
- return NULL;
+ return 0;
}
static inline int
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 2984fb0f0257..a1db38abac5b 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -32,7 +32,7 @@
#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/syscalls.h>
-#include <linux/kexec.h>
+#include <linux/crash_core.h>
#include <linux/kdb.h>
#include <linux/ratelimit.h>
#include <linux/kmsg_dump.h>
@@ -270,7 +270,6 @@ static struct console *exclusive_console;
static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES];
-static int selected_console = -1;
static int preferred_console = -1;
int console_set_on_cmdline;
EXPORT_SYMBOL(console_set_on_cmdline);
@@ -1002,7 +1001,7 @@ const struct file_operations kmsg_fops = {
.release = devkmsg_release,
};
-#ifdef CONFIG_KEXEC_CORE
+#ifdef CONFIG_CRASH_CORE
/*
* This appends the listed symbols to /proc/vmcore
*
@@ -1011,7 +1010,7 @@ const struct file_operations kmsg_fops = {
* symbols are specifically used so that utilities can access and extract the
* dmesg log from a vmcore file after a crash.
*/
-void log_buf_kexec_setup(void)
+void log_buf_vmcoreinfo_setup(void)
{
VMCOREINFO_SYMBOL(log_buf);
VMCOREINFO_SYMBOL(log_buf_len);
@@ -1911,14 +1910,14 @@ static int __add_preferred_console(char *name, int idx, char *options,
i++, c++) {
if (strcmp(c->name, name) == 0 && c->index == idx) {
if (!brl_options)
- selected_console = i;
+ preferred_console = i;
return 0;
}
}
if (i == MAX_CMDLINECONSOLES)
return -E2BIG;
if (!brl_options)
- selected_console = i;
+ preferred_console = i;
strlcpy(c->name, name, sizeof(c->name));
c->options = options;
braille_set_options(c, brl_options);
@@ -2031,15 +2030,16 @@ void resume_console(void)
* @cpu: unused
*
* If printk() is called from a CPU that is not online yet, the messages
- * will be spooled but will not show up on the console. This function is
- * called when a new CPU comes online (or fails to come up), and ensures
- * that any such output gets printed.
+ * will be printed on the console only if there are CON_ANYTIME consoles.
+ * This function is called when a new CPU comes online (or fails to come
+ * up) or goes offline.
*/
static int console_cpu_notify(unsigned int cpu)
{
if (!cpuhp_tasks_frozen) {
- console_lock();
- console_unlock();
+ /* If trylock fails, someone else is doing the printing */
+ if (console_trylock())
+ console_unlock();
}
return 0;
}
@@ -2161,7 +2161,7 @@ void console_unlock(void)
}
/*
- * Console drivers are called under logbuf_lock, so
+ * Console drivers are called with interrupts disabled, so
* @console_may_schedule should be cleared before; however, we may
* end up dumping a lot of lines, for example, if called from
* console registration path, and should invoke cond_resched()
@@ -2169,11 +2169,15 @@ void console_unlock(void)
* scheduling stall on a slow console leading to RCU stall and
* softlockup warnings which exacerbate the issue with more
* messages practically incapacitating the system.
+ *
+ * console_trylock() is not able to detect the preemptive
+ * context reliably. Therefore the value must be stored before
+ * and cleared after the the "again" goto label.
*/
do_cond_resched = console_may_schedule;
+again:
console_may_schedule = 0;
-again:
/*
* We released the console_sem lock, so we need to recheck if
* cpu is online and (if not) is there at least one CON_ANYTIME
@@ -2409,6 +2413,7 @@ void register_console(struct console *newcon)
unsigned long flags;
struct console *bcon = NULL;
struct console_cmdline *c;
+ static bool has_preferred;
if (console_drivers)
for_each_console(bcon)
@@ -2435,15 +2440,15 @@ void register_console(struct console *newcon)
if (console_drivers && console_drivers->flags & CON_BOOT)
bcon = console_drivers;
- if (preferred_console < 0 || bcon || !console_drivers)
- preferred_console = selected_console;
+ if (!has_preferred || bcon || !console_drivers)
+ has_preferred = preferred_console >= 0;
/*
* See if we want to use this console driver. If we
* didn't select a console we take the first one
* that registers here.
*/
- if (preferred_console < 0) {
+ if (!has_preferred) {
if (newcon->index < 0)
newcon->index = 0;
if (newcon->setup == NULL ||
@@ -2451,7 +2456,7 @@ void register_console(struct console *newcon)
newcon->flags |= CON_ENABLED;
if (newcon->device) {
newcon->flags |= CON_CONSDEV;
- preferred_console = 0;
+ has_preferred = true;
}
}
}
@@ -2484,9 +2489,9 @@ void register_console(struct console *newcon)
}
newcon->flags |= CON_ENABLED;
- if (i == selected_console) {
+ if (i == preferred_console) {
newcon->flags |= CON_CONSDEV;
- preferred_console = selected_console;
+ has_preferred = true;
}
break;
}
@@ -2611,6 +2616,30 @@ int unregister_console(struct console *console)
EXPORT_SYMBOL(unregister_console);
/*
+ * Initialize the console device. This is called *early*, so
+ * we can't necessarily depend on lots of kernel help here.
+ * Just do some early initializations, and do the complex setup
+ * later.
+ */
+void __init console_init(void)
+{
+ initcall_t *call;
+
+ /* Setup the default TTY line discipline. */
+ n_tty_init();
+
+ /*
+ * set up the console device so that later boot sequences can
+ * inform about problems etc..
+ */
+ call = __con_initcall_start;
+ while (call < __con_initcall_end) {
+ (*call)();
+ call++;
+ }
+}
+
+/*
* Some boot consoles access data that is in the init section and which will
* be discarded after the initcalls have been run. To make sure that no code
* will access this data, unregister the boot consoles in a late initcall.
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 0af928712174..60f356d91060 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -60,19 +60,25 @@ int ptrace_access_vm(struct task_struct *tsk, unsigned long addr,
}
+void __ptrace_link(struct task_struct *child, struct task_struct *new_parent,
+ const struct cred *ptracer_cred)
+{
+ BUG_ON(!list_empty(&child->ptrace_entry));
+ list_add(&child->ptrace_entry, &new_parent->ptraced);
+ child->parent = new_parent;
+ child->ptracer_cred = get_cred(ptracer_cred);
+}
+
/*
* ptrace a task: make the debugger its new parent and
* move it to the ptrace list.
*
* Must be called with the tasklist lock write-held.
*/
-void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
+static void ptrace_link(struct task_struct *child, struct task_struct *new_parent)
{
- BUG_ON(!list_empty(&child->ptrace_entry));
- list_add(&child->ptrace_entry, &new_parent->ptraced);
- child->parent = new_parent;
rcu_read_lock();
- child->ptracer_cred = get_cred(__task_cred(new_parent));
+ __ptrace_link(child, new_parent, __task_cred(new_parent));
rcu_read_unlock();
}
@@ -184,11 +190,17 @@ static void ptrace_unfreeze_traced(struct task_struct *task)
WARN_ON(!task->ptrace || task->parent != current);
+ /*
+ * PTRACE_LISTEN can allow ptrace_trap_notify to wake us up remotely.
+ * Recheck state under the lock to close this race.
+ */
spin_lock_irq(&task->sighand->siglock);
- if (__fatal_signal_pending(task))
- wake_up_state(task, __TASK_TRACED);
- else
- task->state = TASK_TRACED;
+ if (task->state == __TASK_TRACED) {
+ if (__fatal_signal_pending(task))
+ wake_up_state(task, __TASK_TRACED);
+ else
+ task->state = TASK_TRACED;
+ }
spin_unlock_irq(&task->sighand->siglock);
}
@@ -380,7 +392,7 @@ static int ptrace_attach(struct task_struct *task, long request,
flags |= PT_SEIZED;
task->ptrace = flags;
- __ptrace_link(task, current);
+ ptrace_link(task, current);
/* SEIZE doesn't trap tracee on attach */
if (!seize)
@@ -453,7 +465,7 @@ static int ptrace_traceme(void)
*/
if (!ret && !(current->real_parent->flags & PF_EXITING)) {
current->ptrace = PT_PTRACED;
- __ptrace_link(current, current->real_parent);
+ ptrace_link(current, current->real_parent);
}
}
write_unlock_irq(&tasklist_lock);
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 18dfc485225c..23803c7d5180 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -3,10 +3,13 @@
KCOV_INSTRUMENT := n
obj-y += update.o sync.o
-obj-$(CONFIG_SRCU) += srcu.o
+obj-$(CONFIG_CLASSIC_SRCU) += srcu.o
+obj-$(CONFIG_TREE_SRCU) += srcutree.o
+obj-$(CONFIG_TINY_SRCU) += srcutiny.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o
obj-$(CONFIG_TREE_RCU) += tree.o
obj-$(CONFIG_PREEMPT_RCU) += tree.o
obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o
obj-$(CONFIG_TINY_RCU) += tiny.o
+obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 0d6ff3e471be..73e16ec4054b 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -56,6 +56,83 @@
#define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \
DYNTICK_TASK_FLAG)
+
+/*
+ * Grace-period counter management.
+ */
+
+#define RCU_SEQ_CTR_SHIFT 2
+#define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1)
+
+/*
+ * Return the counter portion of a sequence number previously returned
+ * by rcu_seq_snap() or rcu_seq_current().
+ */
+static inline unsigned long rcu_seq_ctr(unsigned long s)
+{
+ return s >> RCU_SEQ_CTR_SHIFT;
+}
+
+/*
+ * Return the state portion of a sequence number previously returned
+ * by rcu_seq_snap() or rcu_seq_current().
+ */
+static inline int rcu_seq_state(unsigned long s)
+{
+ return s & RCU_SEQ_STATE_MASK;
+}
+
+/*
+ * Set the state portion of the pointed-to sequence number.
+ * The caller is responsible for preventing conflicting updates.
+ */
+static inline void rcu_seq_set_state(unsigned long *sp, int newstate)
+{
+ WARN_ON_ONCE(newstate & ~RCU_SEQ_STATE_MASK);
+ WRITE_ONCE(*sp, (*sp & ~RCU_SEQ_STATE_MASK) + newstate);
+}
+
+/* Adjust sequence number for start of update-side operation. */
+static inline void rcu_seq_start(unsigned long *sp)
+{
+ WRITE_ONCE(*sp, *sp + 1);
+ smp_mb(); /* Ensure update-side operation after counter increment. */
+ WARN_ON_ONCE(rcu_seq_state(*sp) != 1);
+}
+
+/* Adjust sequence number for end of update-side operation. */
+static inline void rcu_seq_end(unsigned long *sp)
+{
+ smp_mb(); /* Ensure update-side operation before counter increment. */
+ WARN_ON_ONCE(!rcu_seq_state(*sp));
+ WRITE_ONCE(*sp, (*sp | RCU_SEQ_STATE_MASK) + 1);
+}
+
+/* Take a snapshot of the update side's sequence number. */
+static inline unsigned long rcu_seq_snap(unsigned long *sp)
+{
+ unsigned long s;
+
+ s = (READ_ONCE(*sp) + 2 * RCU_SEQ_STATE_MASK + 1) & ~RCU_SEQ_STATE_MASK;
+ smp_mb(); /* Above access must not bleed into critical section. */
+ return s;
+}
+
+/* Return the current value the update side's sequence number, no ordering. */
+static inline unsigned long rcu_seq_current(unsigned long *sp)
+{
+ return READ_ONCE(*sp);
+}
+
+/*
+ * Given a snapshot from rcu_seq_snap(), determine whether or not a
+ * full update-side operation has occurred.
+ */
+static inline bool rcu_seq_done(unsigned long *sp, unsigned long s)
+{
+ return ULONG_CMP_GE(READ_ONCE(*sp), s);
+}
+
/*
* debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally
* by call_rcu() and rcu callback execution, and are therefore not part of the
@@ -109,12 +186,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
rcu_lock_acquire(&rcu_callback_map);
if (__is_kfree_rcu_offset(offset)) {
- RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset));
+ RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset);)
kfree((void *)head - offset);
rcu_lock_release(&rcu_callback_map);
return true;
} else {
- RCU_TRACE(trace_rcu_invoke_callback(rn, head));
+ RCU_TRACE(trace_rcu_invoke_callback(rn, head);)
head->func(head);
rcu_lock_release(&rcu_callback_map);
return false;
@@ -144,4 +221,76 @@ void rcu_test_sync_prims(void);
*/
extern void resched_cpu(int cpu);
+#if defined(SRCU) || !defined(TINY_RCU)
+
+#include <linux/rcu_node_tree.h>
+
+extern int rcu_num_lvls;
+extern int num_rcu_lvl[];
+extern int rcu_num_nodes;
+static bool rcu_fanout_exact;
+static int rcu_fanout_leaf;
+
+/*
+ * Compute the per-level fanout, either using the exact fanout specified
+ * or balancing the tree, depending on the rcu_fanout_exact boot parameter.
+ */
+static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
+{
+ int i;
+
+ if (rcu_fanout_exact) {
+ levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
+ for (i = rcu_num_lvls - 2; i >= 0; i--)
+ levelspread[i] = RCU_FANOUT;
+ } else {
+ int ccur;
+ int cprv;
+
+ cprv = nr_cpu_ids;
+ for (i = rcu_num_lvls - 1; i >= 0; i--) {
+ ccur = levelcnt[i];
+ levelspread[i] = (cprv + ccur - 1) / ccur;
+ cprv = ccur;
+ }
+ }
+}
+
+/*
+ * Do a full breadth-first scan of the rcu_node structures for the
+ * specified rcu_state structure.
+ */
+#define rcu_for_each_node_breadth_first(rsp, rnp) \
+ for ((rnp) = &(rsp)->node[0]; \
+ (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
+
+/*
+ * Do a breadth-first scan of the non-leaf rcu_node structures for the
+ * specified rcu_state structure. Note that if there is a singleton
+ * rcu_node tree with but one rcu_node structure, this loop is a no-op.
+ */
+#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
+ for ((rnp) = &(rsp)->node[0]; \
+ (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++)
+
+/*
+ * Scan the leaves of the rcu_node hierarchy for the specified rcu_state
+ * structure. Note that if there is a singleton rcu_node tree with but
+ * one rcu_node structure, this loop -will- visit the rcu_node structure.
+ * It is still a leaf node, even if it is also the root node.
+ */
+#define rcu_for_each_leaf_node(rsp, rnp) \
+ for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \
+ (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
+
+/*
+ * Iterate over all possible CPUs in a leaf RCU node.
+ */
+#define for_each_leaf_node_possible_cpu(rnp, cpu) \
+ for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \
+ cpu <= rnp->grphi; \
+ cpu = cpumask_next((cpu), cpu_possible_mask))
+
+#endif /* #if defined(SRCU) || !defined(TINY_RCU) */
+
#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
new file mode 100644
index 000000000000..2b62a38b080f
--- /dev/null
+++ b/kernel/rcu/rcu_segcblist.c
@@ -0,0 +1,505 @@
+/*
+ * RCU segmented callback lists, function definitions
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright IBM Corporation, 2017
+ *
+ * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+
+#include "rcu_segcblist.h"
+
+/* Initialize simple callback list. */
+void rcu_cblist_init(struct rcu_cblist *rclp)
+{
+ rclp->head = NULL;
+ rclp->tail = &rclp->head;
+ rclp->len = 0;
+ rclp->len_lazy = 0;
+}
+
+/*
+ * Debug function to actually count the number of callbacks.
+ * If the number exceeds the limit specified, return -1.
+ */
+long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim)
+{
+ int cnt = 0;
+ struct rcu_head **rhpp = &rclp->head;
+
+ for (;;) {
+ if (!*rhpp)
+ return cnt;
+ if (++cnt > lim)
+ return -1;
+ rhpp = &(*rhpp)->next;
+ }
+}
+
+/*
+ * Dequeue the oldest rcu_head structure from the specified callback
+ * list. This function assumes that the callback is non-lazy, but
+ * the caller can later invoke rcu_cblist_dequeued_lazy() if it
+ * finds otherwise (and if it cares about laziness). This allows
+ * different users to have different ways of determining laziness.
+ */
+struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp)
+{
+ struct rcu_head *rhp;
+
+ rhp = rclp->head;
+ if (!rhp)
+ return NULL;
+ rclp->len--;
+ rclp->head = rhp->next;
+ if (!rclp->head)
+ rclp->tail = &rclp->head;
+ return rhp;
+}
+
+/*
+ * Initialize an rcu_segcblist structure.
+ */
+void rcu_segcblist_init(struct rcu_segcblist *rsclp)
+{
+ int i;
+
+ BUILD_BUG_ON(RCU_NEXT_TAIL + 1 != ARRAY_SIZE(rsclp->gp_seq));
+ BUILD_BUG_ON(ARRAY_SIZE(rsclp->tails) != ARRAY_SIZE(rsclp->gp_seq));
+ rsclp->head = NULL;
+ for (i = 0; i < RCU_CBLIST_NSEGS; i++)
+ rsclp->tails[i] = &rsclp->head;
+ rsclp->len = 0;
+ rsclp->len_lazy = 0;
+}
+
+/*
+ * Disable the specified rcu_segcblist structure, so that callbacks can
+ * no longer be posted to it. This structure must be empty.
+ */
+void rcu_segcblist_disable(struct rcu_segcblist *rsclp)
+{
+ WARN_ON_ONCE(!rcu_segcblist_empty(rsclp));
+ WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp));
+ WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp));
+ rsclp->tails[RCU_NEXT_TAIL] = NULL;
+}
+
+/*
+ * Is the specified segment of the specified rcu_segcblist structure
+ * empty of callbacks?
+ */
+bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg)
+{
+ if (seg == RCU_DONE_TAIL)
+ return &rsclp->head == rsclp->tails[RCU_DONE_TAIL];
+ return rsclp->tails[seg - 1] == rsclp->tails[seg];
+}
+
+/*
+ * Does the specified rcu_segcblist structure contain callbacks that
+ * are ready to be invoked?
+ */
+bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp)
+{
+ return rcu_segcblist_is_enabled(rsclp) &&
+ &rsclp->head != rsclp->tails[RCU_DONE_TAIL];
+}
+
+/*
+ * Does the specified rcu_segcblist structure contain callbacks that
+ * are still pending, that is, not yet ready to be invoked?
+ */
+bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp)
+{
+ return rcu_segcblist_is_enabled(rsclp) &&
+ !rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL);
+}
+
+/*
+ * Dequeue and return the first ready-to-invoke callback. If there
+ * are no ready-to-invoke callbacks, return NULL. Disables interrupts
+ * to avoid interference. Does not protect from interference from other
+ * CPUs or tasks.
+ */
+struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp)
+{
+ unsigned long flags;
+ int i;
+ struct rcu_head *rhp;
+
+ local_irq_save(flags);
+ if (!rcu_segcblist_ready_cbs(rsclp)) {
+ local_irq_restore(flags);
+ return NULL;
+ }
+ rhp = rsclp->head;
+ BUG_ON(!rhp);
+ rsclp->head = rhp->next;
+ for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) {
+ if (rsclp->tails[i] != &rhp->next)
+ break;
+ rsclp->tails[i] = &rsclp->head;
+ }
+ smp_mb(); /* Dequeue before decrement for rcu_barrier(). */
+ WRITE_ONCE(rsclp->len, rsclp->len - 1);
+ local_irq_restore(flags);
+ return rhp;
+}
+
+/*
+ * Account for the fact that a previously dequeued callback turned out
+ * to be marked as lazy.
+ */
+void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ rsclp->len_lazy--;
+ local_irq_restore(flags);
+}
+
+/*
+ * Return a pointer to the first callback in the specified rcu_segcblist
+ * structure. This is useful for diagnostics.
+ */
+struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp)
+{
+ if (rcu_segcblist_is_enabled(rsclp))
+ return rsclp->head;
+ return NULL;
+}
+
+/*
+ * Return a pointer to the first pending callback in the specified
+ * rcu_segcblist structure. This is useful just after posting a given
+ * callback -- if that callback is the first pending callback, then
+ * you cannot rely on someone else having already started up the required
+ * grace period.
+ */
+struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp)
+{
+ if (rcu_segcblist_is_enabled(rsclp))
+ return *rsclp->tails[RCU_DONE_TAIL];
+ return NULL;
+}
+
+/*
+ * Does the specified rcu_segcblist structure contain callbacks that
+ * have not yet been processed beyond having been posted, that is,
+ * does it contain callbacks in its last segment?
+ */
+bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp)
+{
+ return rcu_segcblist_is_enabled(rsclp) &&
+ !rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL);
+}
+
+/*
+ * Enqueue the specified callback onto the specified rcu_segcblist
+ * structure, updating accounting as needed. Note that the ->len
+ * field may be accessed locklessly, hence the WRITE_ONCE().
+ * The ->len field is used by rcu_barrier() and friends to determine
+ * if it must post a callback on this structure, and it is OK
+ * for rcu_barrier() to sometimes post callbacks needlessly, but
+ * absolutely not OK for it to ever miss posting a callback.
+ */
+void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
+ struct rcu_head *rhp, bool lazy)
+{
+ WRITE_ONCE(rsclp->len, rsclp->len + 1); /* ->len sampled locklessly. */
+ if (lazy)
+ rsclp->len_lazy++;
+ smp_mb(); /* Ensure counts are updated before callback is enqueued. */
+ rhp->next = NULL;
+ *rsclp->tails[RCU_NEXT_TAIL] = rhp;
+ rsclp->tails[RCU_NEXT_TAIL] = &rhp->next;
+}
+
+/*
+ * Entrain the specified callback onto the specified rcu_segcblist at
+ * the end of the last non-empty segment. If the entire rcu_segcblist
+ * is empty, make no change, but return false.
+ *
+ * This is intended for use by rcu_barrier()-like primitives, -not-
+ * for normal grace-period use. IMPORTANT: The callback you enqueue
+ * will wait for all prior callbacks, NOT necessarily for a grace
+ * period. You have been warned.
+ */
+bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
+ struct rcu_head *rhp, bool lazy)
+{
+ int i;
+
+ if (rcu_segcblist_n_cbs(rsclp) == 0)
+ return false;
+ WRITE_ONCE(rsclp->len, rsclp->len + 1);
+ if (lazy)
+ rsclp->len_lazy++;
+ smp_mb(); /* Ensure counts are updated before callback is entrained. */
+ rhp->next = NULL;
+ for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--)
+ if (rsclp->tails[i] != rsclp->tails[i - 1])
+ break;
+ *rsclp->tails[i] = rhp;
+ for (; i <= RCU_NEXT_TAIL; i++)
+ rsclp->tails[i] = &rhp->next;
+ return true;
+}
+
+/*
+ * Extract only the counts from the specified rcu_segcblist structure,
+ * and place them in the specified rcu_cblist structure. This function
+ * supports both callback orphaning and invocation, hence the separation
+ * of counts and callbacks. (Callbacks ready for invocation must be
+ * orphaned and adopted separately from pending callbacks, but counts
+ * apply to all callbacks. Locking must be used to make sure that
+ * both orphaned-callbacks lists are consistent.)
+ */
+void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp,
+ struct rcu_cblist *rclp)
+{
+ rclp->len_lazy += rsclp->len_lazy;
+ rclp->len += rsclp->len;
+ rsclp->len_lazy = 0;
+ WRITE_ONCE(rsclp->len, 0); /* ->len sampled locklessly. */
+}
+
+/*
+ * Extract only those callbacks ready to be invoked from the specified
+ * rcu_segcblist structure and place them in the specified rcu_cblist
+ * structure.
+ */
+void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
+ struct rcu_cblist *rclp)
+{
+ int i;
+
+ if (!rcu_segcblist_ready_cbs(rsclp))
+ return; /* Nothing to do. */
+ *rclp->tail = rsclp->head;
+ rsclp->head = *rsclp->tails[RCU_DONE_TAIL];
+ *rsclp->tails[RCU_DONE_TAIL] = NULL;
+ rclp->tail = rsclp->tails[RCU_DONE_TAIL];
+ for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--)
+ if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL])
+ rsclp->tails[i] = &rsclp->head;
+}
+
+/*
+ * Extract only those callbacks still pending (not yet ready to be
+ * invoked) from the specified rcu_segcblist structure and place them in
+ * the specified rcu_cblist structure. Note that this loses information
+ * about any callbacks that might have been partway done waiting for
+ * their grace period. Too bad! They will have to start over.
+ */
+void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp,
+ struct rcu_cblist *rclp)
+{
+ int i;
+
+ if (!rcu_segcblist_pend_cbs(rsclp))
+ return; /* Nothing to do. */
+ *rclp->tail = *rsclp->tails[RCU_DONE_TAIL];
+ rclp->tail = rsclp->tails[RCU_NEXT_TAIL];
+ *rsclp->tails[RCU_DONE_TAIL] = NULL;
+ for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++)
+ rsclp->tails[i] = rsclp->tails[RCU_DONE_TAIL];
+}
+
+/*
+ * Insert counts from the specified rcu_cblist structure in the
+ * specified rcu_segcblist structure.
+ */
+void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp,
+ struct rcu_cblist *rclp)
+{
+ rsclp->len_lazy += rclp->len_lazy;
+ /* ->len sampled locklessly. */
+ WRITE_ONCE(rsclp->len, rsclp->len + rclp->len);
+ rclp->len_lazy = 0;
+ rclp->len = 0;
+}
+
+/*
+ * Move callbacks from the specified rcu_cblist to the beginning of the
+ * done-callbacks segment of the specified rcu_segcblist.
+ */
+void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp,
+ struct rcu_cblist *rclp)
+{
+ int i;
+
+ if (!rclp->head)
+ return; /* No callbacks to move. */
+ *rclp->tail = rsclp->head;
+ rsclp->head = rclp->head;
+ for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++)
+ if (&rsclp->head == rsclp->tails[i])
+ rsclp->tails[i] = rclp->tail;
+ else
+ break;
+ rclp->head = NULL;
+ rclp->tail = &rclp->head;
+}
+
+/*
+ * Move callbacks from the specified rcu_cblist to the end of the
+ * new-callbacks segment of the specified rcu_segcblist.
+ */
+void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp,
+ struct rcu_cblist *rclp)
+{
+ if (!rclp->head)
+ return; /* Nothing to do. */
+ *rsclp->tails[RCU_NEXT_TAIL] = rclp->head;
+ rsclp->tails[RCU_NEXT_TAIL] = rclp->tail;
+ rclp->head = NULL;
+ rclp->tail = &rclp->head;
+}
+
+/*
+ * Advance the callbacks in the specified rcu_segcblist structure based
+ * on the current value passed in for the grace-period counter.
+ */
+void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
+{
+ int i, j;
+
+ WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp));
+ if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL))
+ return;
+
+ /*
+ * Find all callbacks whose ->gp_seq numbers indicate that they
+ * are ready to invoke, and put them into the RCU_DONE_TAIL segment.
+ */
+ for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
+ if (ULONG_CMP_LT(seq, rsclp->gp_seq[i]))
+ break;
+ rsclp->tails[RCU_DONE_TAIL] = rsclp->tails[i];
+ }
+
+ /* If no callbacks moved, nothing more need be done. */
+ if (i == RCU_WAIT_TAIL)
+ return;
+
+ /* Clean up tail pointers that might have been misordered above. */
+ for (j = RCU_WAIT_TAIL; j < i; j++)
+ rsclp->tails[j] = rsclp->tails[RCU_DONE_TAIL];
+
+ /*
+ * Callbacks moved, so clean up the misordered ->tails[] pointers
+ * that now point into the middle of the list of ready-to-invoke
+ * callbacks. The overall effect is to copy down the later pointers
+ * into the gap that was created by the now-ready segments.
+ */
+ for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
+ if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL])
+ break; /* No more callbacks. */
+ rsclp->tails[j] = rsclp->tails[i];
+ rsclp->gp_seq[j] = rsclp->gp_seq[i];
+ }
+}
+
+/*
+ * "Accelerate" callbacks based on more-accurate grace-period information.
+ * The reason for this is that RCU does not synchronize the beginnings and
+ * ends of grace periods, and that callbacks are posted locally. This in
+ * turn means that the callbacks must be labelled conservatively early
+ * on, as getting exact information would degrade both performance and
+ * scalability. When more accurate grace-period information becomes
+ * available, previously posted callbacks can be "accelerated", marking
+ * them to complete at the end of the earlier grace period.
+ *
+ * This function operates on an rcu_segcblist structure, and also the
+ * grace-period sequence number seq at which new callbacks would become
+ * ready to invoke. Returns true if there are callbacks that won't be
+ * ready to invoke until seq, false otherwise.
+ */
+bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
+{
+ int i;
+
+ WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp));
+ if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL))
+ return false;
+
+ /*
+ * Find the segment preceding the oldest segment of callbacks
+ * whose ->gp_seq[] completion is at or after that passed in via
+ * "seq", skipping any empty segments. This oldest segment, along
+ * with any later segments, can be merged in with any newly arrived
+ * callbacks in the RCU_NEXT_TAIL segment, and assigned "seq"
+ * as their ->gp_seq[] grace-period completion sequence number.
+ */
+ for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--)
+ if (rsclp->tails[i] != rsclp->tails[i - 1] &&
+ ULONG_CMP_LT(rsclp->gp_seq[i], seq))
+ break;
+
+ /*
+ * If all the segments contain callbacks that correspond to
+ * earlier grace-period sequence numbers than "seq", leave.
+ * Assuming that the rcu_segcblist structure has enough
+ * segments in its arrays, this can only happen if some of
+ * the non-done segments contain callbacks that really are
+ * ready to invoke. This situation will get straightened
+ * out by the next call to rcu_segcblist_advance().
+ *
+ * Also advance to the oldest segment of callbacks whose
+ * ->gp_seq[] completion is at or after that passed in via "seq",
+ * skipping any empty segments.
+ */
+ if (++i >= RCU_NEXT_TAIL)
+ return false;
+
+ /*
+ * Merge all later callbacks, including newly arrived callbacks,
+ * into the segment located by the for-loop above. Assign "seq"
+ * as the ->gp_seq[] value in order to correctly handle the case
+ * where there were no pending callbacks in the rcu_segcblist
+ * structure other than in the RCU_NEXT_TAIL segment.
+ */
+ for (; i < RCU_NEXT_TAIL; i++) {
+ rsclp->tails[i] = rsclp->tails[RCU_NEXT_TAIL];
+ rsclp->gp_seq[i] = seq;
+ }
+ return true;
+}
+
+/*
+ * Scan the specified rcu_segcblist structure for callbacks that need
+ * a grace period later than the one specified by "seq". We don't look
+ * at the RCU_DONE_TAIL or RCU_NEXT_TAIL segments because they don't
+ * have a grace-period sequence number.
+ */
+bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,
+ unsigned long seq)
+{
+ int i;
+
+ for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
+ if (rsclp->tails[i - 1] != rsclp->tails[i] &&
+ ULONG_CMP_LT(seq, rsclp->gp_seq[i]))
+ return true;
+ return false;
+}
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
new file mode 100644
index 000000000000..6e36e36478cd
--- /dev/null
+++ b/kernel/rcu/rcu_segcblist.h
@@ -0,0 +1,164 @@
+/*
+ * RCU segmented callback lists, internal-to-rcu header file
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright IBM Corporation, 2017
+ *
+ * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ */
+
+#include <linux/rcu_segcblist.h>
+
+/*
+ * Account for the fact that a previously dequeued callback turned out
+ * to be marked as lazy.
+ */
+static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp)
+{
+ rclp->len_lazy--;
+}
+
+/*
+ * Interim function to return rcu_cblist head pointer. Longer term, the
+ * rcu_cblist will be used more pervasively, removing the need for this
+ * function.
+ */
+static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp)
+{
+ return rclp->head;
+}
+
+/*
+ * Interim function to return rcu_cblist head pointer. Longer term, the
+ * rcu_cblist will be used more pervasively, removing the need for this
+ * function.
+ */
+static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp)
+{
+ WARN_ON_ONCE(!rclp->head);
+ return rclp->tail;
+}
+
+void rcu_cblist_init(struct rcu_cblist *rclp);
+long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim);
+struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp);
+
+/*
+ * Is the specified rcu_segcblist structure empty?
+ *
+ * But careful! The fact that the ->head field is NULL does not
+ * necessarily imply that there are no callbacks associated with
+ * this structure. When callbacks are being invoked, they are
+ * removed as a group. If callback invocation must be preempted,
+ * the remaining callbacks will be added back to the list. Either
+ * way, the counts are updated later.
+ *
+ * So it is often the case that rcu_segcblist_n_cbs() should be used
+ * instead.
+ */
+static inline bool rcu_segcblist_empty(struct rcu_segcblist *rsclp)
+{
+ return !rsclp->head;
+}
+
+/* Return number of callbacks in segmented callback list. */
+static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp)
+{
+ return READ_ONCE(rsclp->len);
+}
+
+/* Return number of lazy callbacks in segmented callback list. */
+static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp)
+{
+ return rsclp->len_lazy;
+}
+
+/* Return number of lazy callbacks in segmented callback list. */
+static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp)
+{
+ return rsclp->len - rsclp->len_lazy;
+}
+
+/*
+ * Is the specified rcu_segcblist enabled, for example, not corresponding
+ * to an offline or callback-offloaded CPU?
+ */
+static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp)
+{
+ return !!rsclp->tails[RCU_NEXT_TAIL];
+}
+
+/*
+ * Are all segments following the specified segment of the specified
+ * rcu_segcblist structure empty of callbacks? (The specified
+ * segment might well contain callbacks.)
+ */
+static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg)
+{
+ return !*rsclp->tails[seg];
+}
+
+/*
+ * Interim function to return rcu_segcblist head pointer. Longer term, the
+ * rcu_segcblist will be used more pervasively, removing the need for this
+ * function.
+ */
+static inline struct rcu_head *rcu_segcblist_head(struct rcu_segcblist *rsclp)
+{
+ return rsclp->head;
+}
+
+/*
+ * Interim function to return rcu_segcblist head pointer. Longer term, the
+ * rcu_segcblist will be used more pervasively, removing the need for this
+ * function.
+ */
+static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp)
+{
+ WARN_ON_ONCE(rcu_segcblist_empty(rsclp));
+ return rsclp->tails[RCU_NEXT_TAIL];
+}
+
+void rcu_segcblist_init(struct rcu_segcblist *rsclp);
+void rcu_segcblist_disable(struct rcu_segcblist *rsclp);
+bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg);
+bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp);
+bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp);
+struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp);
+void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp);
+struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp);
+struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp);
+bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp);
+void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
+ struct rcu_head *rhp, bool lazy);
+bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
+ struct rcu_head *rhp, bool lazy);
+void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp,
+ struct rcu_cblist *rclp);
+void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
+ struct rcu_cblist *rclp);
+void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp,
+ struct rcu_cblist *rclp);
+void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp,
+ struct rcu_cblist *rclp);
+void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp,
+ struct rcu_cblist *rclp);
+void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp,
+ struct rcu_cblist *rclp);
+void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq);
+bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq);
+bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,
+ unsigned long seq);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index cccc417a8135..ae6e574d4cf5 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -559,19 +559,34 @@ static void srcu_torture_barrier(void)
static void srcu_torture_stats(void)
{
- int cpu;
- int idx = srcu_ctlp->completed & 0x1;
+ int __maybe_unused cpu;
+ int idx;
- pr_alert("%s%s per-CPU(idx=%d):",
+#if defined(CONFIG_TREE_SRCU) || defined(CONFIG_CLASSIC_SRCU)
+#ifdef CONFIG_TREE_SRCU
+ idx = srcu_ctlp->srcu_idx & 0x1;
+#else /* #ifdef CONFIG_TREE_SRCU */
+ idx = srcu_ctlp->completed & 0x1;
+#endif /* #else #ifdef CONFIG_TREE_SRCU */
+ pr_alert("%s%s Tree SRCU per-CPU(idx=%d):",
torture_type, TORTURE_FLAG, idx);
for_each_possible_cpu(cpu) {
unsigned long l0, l1;
unsigned long u0, u1;
long c0, c1;
- struct srcu_array *counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu);
+#ifdef CONFIG_TREE_SRCU
+ struct srcu_data *counts;
+ counts = per_cpu_ptr(srcu_ctlp->sda, cpu);
+ u0 = counts->srcu_unlock_count[!idx];
+ u1 = counts->srcu_unlock_count[idx];
+#else /* #ifdef CONFIG_TREE_SRCU */
+ struct srcu_array *counts;
+
+ counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu);
u0 = counts->unlock_count[!idx];
u1 = counts->unlock_count[idx];
+#endif /* #else #ifdef CONFIG_TREE_SRCU */
/*
* Make sure that a lock is always counted if the corresponding
@@ -579,14 +594,26 @@ static void srcu_torture_stats(void)
*/
smp_rmb();
+#ifdef CONFIG_TREE_SRCU
+ l0 = counts->srcu_lock_count[!idx];
+ l1 = counts->srcu_lock_count[idx];
+#else /* #ifdef CONFIG_TREE_SRCU */
l0 = counts->lock_count[!idx];
l1 = counts->lock_count[idx];
+#endif /* #else #ifdef CONFIG_TREE_SRCU */
c0 = l0 - u0;
c1 = l1 - u1;
pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
}
pr_cont("\n");
+#elif defined(CONFIG_TINY_SRCU)
+ idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1;
+ pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%d,%d)\n",
+ torture_type, TORTURE_FLAG, idx,
+ READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]),
+ READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx]));
+#endif
}
static void srcu_torture_synchronize_expedited(void)
@@ -1333,12 +1360,14 @@ rcu_torture_stats_print(void)
cur_ops->stats();
if (rtcv_snap == rcu_torture_current_version &&
rcu_torture_current != NULL) {
- int __maybe_unused flags;
- unsigned long __maybe_unused gpnum;
- unsigned long __maybe_unused completed;
+ int __maybe_unused flags = 0;
+ unsigned long __maybe_unused gpnum = 0;
+ unsigned long __maybe_unused completed = 0;
rcutorture_get_gp_data(cur_ops->ttype,
&flags, &gpnum, &completed);
+ srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp,
+ &flags, &gpnum, &completed);
wtp = READ_ONCE(writer_task);
pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n",
rcu_torture_writer_state_getname(),
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index ef3bcfb15b39..dea03614263f 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -22,7 +22,7 @@
* Lai Jiangshan <laijs@cn.fujitsu.com>
*
* For detailed explanation of Read-Copy Update mechanism see -
- * Documentation/RCU/ *.txt
+ * Documentation/RCU/ *.txt
*
*/
@@ -243,8 +243,14 @@ static bool srcu_readers_active(struct srcu_struct *sp)
* cleanup_srcu_struct - deconstruct a sleep-RCU structure
* @sp: structure to clean up.
*
- * Must invoke this after you are finished using a given srcu_struct that
- * was initialized via init_srcu_struct(), else you leak memory.
+ * Must invoke this only after you are finished using a given srcu_struct
+ * that was initialized via init_srcu_struct(). This code does some
+ * probabalistic checking, spotting late uses of srcu_read_lock(),
+ * synchronize_srcu(), synchronize_srcu_expedited(), and call_srcu().
+ * If any such late uses are detected, the per-CPU memory associated with
+ * the srcu_struct is simply leaked and WARN_ON() is invoked. If the
+ * caller frees the srcu_struct itself, a use-after-free crash will likely
+ * ensue, but at least there will be a warning printed.
*/
void cleanup_srcu_struct(struct srcu_struct *sp)
{
@@ -257,7 +263,7 @@ EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
/*
* Counts the new reader in the appropriate per-CPU element of the
- * srcu_struct. Must be called from process context.
+ * srcu_struct.
* Returns an index that must be passed to the matching srcu_read_unlock().
*/
int __srcu_read_lock(struct srcu_struct *sp)
@@ -265,7 +271,7 @@ int __srcu_read_lock(struct srcu_struct *sp)
int idx;
idx = READ_ONCE(sp->completed) & 0x1;
- __this_cpu_inc(sp->per_cpu_ref->lock_count[idx]);
+ this_cpu_inc(sp->per_cpu_ref->lock_count[idx]);
smp_mb(); /* B */ /* Avoid leaking the critical section. */
return idx;
}
@@ -275,7 +281,6 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
* Removes the count for the old reader from the appropriate per-CPU
* element of the srcu_struct. Note that this may well be a different
* CPU than that which was incremented by the corresponding srcu_read_lock().
- * Must be called from process context.
*/
void __srcu_read_unlock(struct srcu_struct *sp, int idx)
{
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
new file mode 100644
index 000000000000..32798eb14853
--- /dev/null
+++ b/kernel/rcu/srcutiny.c
@@ -0,0 +1,217 @@
+/*
+ * Sleepable Read-Copy Update mechanism for mutual exclusion,
+ * tiny version for non-preemptible single-CPU use.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright (C) IBM Corporation, 2017
+ *
+ * Author: Paul McKenney <paulmck@us.ibm.com>
+ */
+
+#include <linux/export.h>
+#include <linux/mutex.h>
+#include <linux/preempt.h>
+#include <linux/rcupdate_wait.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/srcu.h>
+
+#include <linux/rcu_node_tree.h>
+#include "rcu_segcblist.h"
+#include "rcu.h"
+
+static int init_srcu_struct_fields(struct srcu_struct *sp)
+{
+ sp->srcu_lock_nesting[0] = 0;
+ sp->srcu_lock_nesting[1] = 0;
+ init_swait_queue_head(&sp->srcu_wq);
+ sp->srcu_gp_seq = 0;
+ rcu_segcblist_init(&sp->srcu_cblist);
+ sp->srcu_gp_running = false;
+ sp->srcu_gp_waiting = false;
+ sp->srcu_idx = 0;
+ INIT_WORK(&sp->srcu_work, srcu_drive_gp);
+ return 0;
+}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+
+int __init_srcu_struct(struct srcu_struct *sp, const char *name,
+ struct lock_class_key *key)
+{
+ /* Don't re-initialize a lock while it is held. */
+ debug_check_no_locks_freed((void *)sp, sizeof(*sp));
+ lockdep_init_map(&sp->dep_map, name, key, 0);
+ return init_srcu_struct_fields(sp);
+}
+EXPORT_SYMBOL_GPL(__init_srcu_struct);
+
+#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+/*
+ * init_srcu_struct - initialize a sleep-RCU structure
+ * @sp: structure to initialize.
+ *
+ * Must invoke this on a given srcu_struct before passing that srcu_struct
+ * to any other function. Each srcu_struct represents a separate domain
+ * of SRCU protection.
+ */
+int init_srcu_struct(struct srcu_struct *sp)
+{
+ return init_srcu_struct_fields(sp);
+}
+EXPORT_SYMBOL_GPL(init_srcu_struct);
+
+#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+/*
+ * cleanup_srcu_struct - deconstruct a sleep-RCU structure
+ * @sp: structure to clean up.
+ *
+ * Must invoke this after you are finished using a given srcu_struct that
+ * was initialized via init_srcu_struct(), else you leak memory.
+ */
+void cleanup_srcu_struct(struct srcu_struct *sp)
+{
+ WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]);
+ flush_work(&sp->srcu_work);
+ WARN_ON(rcu_seq_state(sp->srcu_gp_seq));
+ WARN_ON(sp->srcu_gp_running);
+ WARN_ON(sp->srcu_gp_waiting);
+ WARN_ON(!rcu_segcblist_empty(&sp->srcu_cblist));
+}
+EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
+
+/*
+ * Counts the new reader in the appropriate per-CPU element of the
+ * srcu_struct. Can be invoked from irq/bh handlers, but the matching
+ * __srcu_read_unlock() must be in the same handler instance. Returns an
+ * index that must be passed to the matching srcu_read_unlock().
+ */
+int __srcu_read_lock(struct srcu_struct *sp)
+{
+ int idx;
+
+ idx = READ_ONCE(sp->srcu_idx);
+ WRITE_ONCE(sp->srcu_lock_nesting[idx], sp->srcu_lock_nesting[idx] + 1);
+ return idx;
+}
+EXPORT_SYMBOL_GPL(__srcu_read_lock);
+
+/*
+ * Removes the count for the old reader from the appropriate element of
+ * the srcu_struct.
+ */
+void __srcu_read_unlock(struct srcu_struct *sp, int idx)
+{
+ int newval = sp->srcu_lock_nesting[idx] - 1;
+
+ WRITE_ONCE(sp->srcu_lock_nesting[idx], newval);
+ if (!newval && READ_ONCE(sp->srcu_gp_waiting))
+ swake_up(&sp->srcu_wq);
+}
+EXPORT_SYMBOL_GPL(__srcu_read_unlock);
+
+/*
+ * Workqueue handler to drive one grace period and invoke any callbacks
+ * that become ready as a result. Single-CPU and !PREEMPT operation
+ * means that we get away with murder on synchronization. ;-)
+ */
+void srcu_drive_gp(struct work_struct *wp)
+{
+ int idx;
+ struct rcu_cblist ready_cbs;
+ struct srcu_struct *sp;
+ struct rcu_head *rhp;
+
+ sp = container_of(wp, struct srcu_struct, srcu_work);
+ if (sp->srcu_gp_running || rcu_segcblist_empty(&sp->srcu_cblist))
+ return; /* Already running or nothing to do. */
+
+ /* Tag recently arrived callbacks and wait for readers. */
+ WRITE_ONCE(sp->srcu_gp_running, true);
+ rcu_segcblist_accelerate(&sp->srcu_cblist,
+ rcu_seq_snap(&sp->srcu_gp_seq));
+ rcu_seq_start(&sp->srcu_gp_seq);
+ idx = sp->srcu_idx;
+ WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx);
+ WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */
+ swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx]));
+ WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
+ rcu_seq_end(&sp->srcu_gp_seq);
+
+ /* Update callback list based on GP, and invoke ready callbacks. */
+ rcu_segcblist_advance(&sp->srcu_cblist,
+ rcu_seq_current(&sp->srcu_gp_seq));
+ if (rcu_segcblist_ready_cbs(&sp->srcu_cblist)) {
+ rcu_cblist_init(&ready_cbs);
+ local_irq_disable();
+ rcu_segcblist_extract_done_cbs(&sp->srcu_cblist, &ready_cbs);
+ local_irq_enable();
+ rhp = rcu_cblist_dequeue(&ready_cbs);
+ for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
+ local_bh_disable();
+ rhp->func(rhp);
+ local_bh_enable();
+ }
+ local_irq_disable();
+ rcu_segcblist_insert_count(&sp->srcu_cblist, &ready_cbs);
+ local_irq_enable();
+ }
+ WRITE_ONCE(sp->srcu_gp_running, false);
+
+ /*
+ * If more callbacks, reschedule ourselves. This can race with
+ * a call_srcu() at interrupt level, but the ->srcu_gp_running
+ * checks will straighten that out.
+ */
+ if (!rcu_segcblist_empty(&sp->srcu_cblist))
+ schedule_work(&sp->srcu_work);
+}
+EXPORT_SYMBOL_GPL(srcu_drive_gp);
+
+/*
+ * Enqueue an SRCU callback on the specified srcu_struct structure,
+ * initiating grace-period processing if it is not already running.
+ */
+void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
+ rcu_callback_t func)
+{
+ unsigned long flags;
+
+ head->func = func;
+ local_irq_save(flags);
+ rcu_segcblist_enqueue(&sp->srcu_cblist, head, false);
+ local_irq_restore(flags);
+ if (!READ_ONCE(sp->srcu_gp_running))
+ schedule_work(&sp->srcu_work);
+}
+EXPORT_SYMBOL_GPL(call_srcu);
+
+/*
+ * synchronize_srcu - wait for prior SRCU read-side critical-section completion
+ */
+void synchronize_srcu(struct srcu_struct *sp)
+{
+ struct rcu_synchronize rs;
+
+ init_rcu_head_on_stack(&rs.head);
+ init_completion(&rs.completion);
+ call_srcu(sp, &rs.head, wakeme_after_rcu);
+ wait_for_completion(&rs.completion);
+ destroy_rcu_head_on_stack(&rs.head);
+}
+EXPORT_SYMBOL_GPL(synchronize_srcu);
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
new file mode 100644
index 000000000000..157654fa436a
--- /dev/null
+++ b/kernel/rcu/srcutree.c
@@ -0,0 +1,1154 @@
+/*
+ * Sleepable Read-Copy Update mechanism for mutual exclusion.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright (C) IBM Corporation, 2006
+ * Copyright (C) Fujitsu, 2012
+ *
+ * Author: Paul McKenney <paulmck@us.ibm.com>
+ * Lai Jiangshan <laijs@cn.fujitsu.com>
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * Documentation/RCU/ *.txt
+ *
+ */
+
+#include <linux/export.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/rcupdate_wait.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/srcu.h>
+
+#include "rcu.h"
+#include "rcu_segcblist.h"
+
+ulong exp_holdoff = 25 * 1000; /* Holdoff (ns) for auto-expediting. */
+module_param(exp_holdoff, ulong, 0444);
+
+static void srcu_invoke_callbacks(struct work_struct *work);
+static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay);
+
+/*
+ * Initialize SRCU combining tree. Note that statically allocated
+ * srcu_struct structures might already have srcu_read_lock() and
+ * srcu_read_unlock() running against them. So if the is_static parameter
+ * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[].
+ */
+static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static)
+{
+ int cpu;
+ int i;
+ int level = 0;
+ int levelspread[RCU_NUM_LVLS];
+ struct srcu_data *sdp;
+ struct srcu_node *snp;
+ struct srcu_node *snp_first;
+
+ /* Work out the overall tree geometry. */
+ sp->level[0] = &sp->node[0];
+ for (i = 1; i < rcu_num_lvls; i++)
+ sp->level[i] = sp->level[i - 1] + num_rcu_lvl[i - 1];
+ rcu_init_levelspread(levelspread, num_rcu_lvl);
+
+ /* Each pass through this loop initializes one srcu_node structure. */
+ rcu_for_each_node_breadth_first(sp, snp) {
+ spin_lock_init(&snp->lock);
+ WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) !=
+ ARRAY_SIZE(snp->srcu_data_have_cbs));
+ for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) {
+ snp->srcu_have_cbs[i] = 0;
+ snp->srcu_data_have_cbs[i] = 0;
+ }
+ snp->srcu_gp_seq_needed_exp = 0;
+ snp->grplo = -1;
+ snp->grphi = -1;
+ if (snp == &sp->node[0]) {
+ /* Root node, special case. */
+ snp->srcu_parent = NULL;
+ continue;
+ }
+
+ /* Non-root node. */
+ if (snp == sp->level[level + 1])
+ level++;
+ snp->srcu_parent = sp->level[level - 1] +
+ (snp - sp->level[level]) /
+ levelspread[level - 1];
+ }
+
+ /*
+ * Initialize the per-CPU srcu_data array, which feeds into the
+ * leaves of the srcu_node tree.
+ */
+ WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) !=
+ ARRAY_SIZE(sdp->srcu_unlock_count));
+ level = rcu_num_lvls - 1;
+ snp_first = sp->level[level];
+ for_each_possible_cpu(cpu) {
+ sdp = per_cpu_ptr(sp->sda, cpu);
+ spin_lock_init(&sdp->lock);
+ rcu_segcblist_init(&sdp->srcu_cblist);
+ sdp->srcu_cblist_invoking = false;
+ sdp->srcu_gp_seq_needed = sp->srcu_gp_seq;
+ sdp->srcu_gp_seq_needed_exp = sp->srcu_gp_seq;
+ sdp->mynode = &snp_first[cpu / levelspread[level]];
+ for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) {
+ if (snp->grplo < 0)
+ snp->grplo = cpu;
+ snp->grphi = cpu;
+ }
+ sdp->cpu = cpu;
+ INIT_DELAYED_WORK(&sdp->work, srcu_invoke_callbacks);
+ sdp->sp = sp;
+ sdp->grpmask = 1 << (cpu - sdp->mynode->grplo);
+ if (is_static)
+ continue;
+
+ /* Dynamically allocated, better be no srcu_read_locks()! */
+ for (i = 0; i < ARRAY_SIZE(sdp->srcu_lock_count); i++) {
+ sdp->srcu_lock_count[i] = 0;
+ sdp->srcu_unlock_count[i] = 0;
+ }
+ }
+}
+
+/*
+ * Initialize non-compile-time initialized fields, including the
+ * associated srcu_node and srcu_data structures. The is_static
+ * parameter is passed through to init_srcu_struct_nodes(), and
+ * also tells us that ->sda has already been wired up to srcu_data.
+ */
+static int init_srcu_struct_fields(struct srcu_struct *sp, bool is_static)
+{
+ mutex_init(&sp->srcu_cb_mutex);
+ mutex_init(&sp->srcu_gp_mutex);
+ sp->srcu_idx = 0;
+ sp->srcu_gp_seq = 0;
+ sp->srcu_barrier_seq = 0;
+ mutex_init(&sp->srcu_barrier_mutex);
+ atomic_set(&sp->srcu_barrier_cpu_cnt, 0);
+ INIT_DELAYED_WORK(&sp->work, process_srcu);
+ if (!is_static)
+ sp->sda = alloc_percpu(struct srcu_data);
+ init_srcu_struct_nodes(sp, is_static);
+ sp->srcu_gp_seq_needed_exp = 0;
+ sp->srcu_last_gp_end = ktime_get_mono_fast_ns();
+ smp_store_release(&sp->srcu_gp_seq_needed, 0); /* Init done. */
+ return sp->sda ? 0 : -ENOMEM;
+}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+
+int __init_srcu_struct(struct srcu_struct *sp, const char *name,
+ struct lock_class_key *key)
+{
+ /* Don't re-initialize a lock while it is held. */
+ debug_check_no_locks_freed((void *)sp, sizeof(*sp));
+ lockdep_init_map(&sp->dep_map, name, key, 0);
+ spin_lock_init(&sp->gp_lock);
+ return init_srcu_struct_fields(sp, false);
+}
+EXPORT_SYMBOL_GPL(__init_srcu_struct);
+
+#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+/**
+ * init_srcu_struct - initialize a sleep-RCU structure
+ * @sp: structure to initialize.
+ *
+ * Must invoke this on a given srcu_struct before passing that srcu_struct
+ * to any other function. Each srcu_struct represents a separate domain
+ * of SRCU protection.
+ */
+int init_srcu_struct(struct srcu_struct *sp)
+{
+ spin_lock_init(&sp->gp_lock);
+ return init_srcu_struct_fields(sp, false);
+}
+EXPORT_SYMBOL_GPL(init_srcu_struct);
+
+#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+/*
+ * First-use initialization of statically allocated srcu_struct
+ * structure. Wiring up the combining tree is more than can be
+ * done with compile-time initialization, so this check is added
+ * to each update-side SRCU primitive. Use ->gp_lock, which -is-
+ * compile-time initialized, to resolve races involving multiple
+ * CPUs trying to garner first-use privileges.
+ */
+static void check_init_srcu_struct(struct srcu_struct *sp)
+{
+ unsigned long flags;
+
+ WARN_ON_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INIT);
+ /* The smp_load_acquire() pairs with the smp_store_release(). */
+ if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/
+ return; /* Already initialized. */
+ spin_lock_irqsave(&sp->gp_lock, flags);
+ if (!rcu_seq_state(sp->srcu_gp_seq_needed)) {
+ spin_unlock_irqrestore(&sp->gp_lock, flags);
+ return;
+ }
+ init_srcu_struct_fields(sp, true);
+ spin_unlock_irqrestore(&sp->gp_lock, flags);
+}
+
+/*
+ * Returns approximate total of the readers' ->srcu_lock_count[] values
+ * for the rank of per-CPU counters specified by idx.
+ */
+static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx)
+{
+ int cpu;
+ unsigned long sum = 0;
+
+ for_each_possible_cpu(cpu) {
+ struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu);
+
+ sum += READ_ONCE(cpuc->srcu_lock_count[idx]);
+ }
+ return sum;
+}
+
+/*
+ * Returns approximate total of the readers' ->srcu_unlock_count[] values
+ * for the rank of per-CPU counters specified by idx.
+ */
+static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx)
+{
+ int cpu;
+ unsigned long sum = 0;
+
+ for_each_possible_cpu(cpu) {
+ struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu);
+
+ sum += READ_ONCE(cpuc->srcu_unlock_count[idx]);
+ }
+ return sum;
+}
+
+/*
+ * Return true if the number of pre-existing readers is determined to
+ * be zero.
+ */
+static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
+{
+ unsigned long unlocks;
+
+ unlocks = srcu_readers_unlock_idx(sp, idx);
+
+ /*
+ * Make sure that a lock is always counted if the corresponding
+ * unlock is counted. Needs to be a smp_mb() as the read side may
+ * contain a read from a variable that is written to before the
+ * synchronize_srcu() in the write side. In this case smp_mb()s
+ * A and B act like the store buffering pattern.
+ *
+ * This smp_mb() also pairs with smp_mb() C to prevent accesses
+ * after the synchronize_srcu() from being executed before the
+ * grace period ends.
+ */
+ smp_mb(); /* A */
+
+ /*
+ * If the locks are the same as the unlocks, then there must have
+ * been no readers on this index at some time in between. This does
+ * not mean that there are no more readers, as one could have read
+ * the current index but not have incremented the lock counter yet.
+ *
+ * Possible bug: There is no guarantee that there haven't been
+ * ULONG_MAX increments of ->srcu_lock_count[] since the unlocks were
+ * counted, meaning that this could return true even if there are
+ * still active readers. Since there are no memory barriers around
+ * srcu_flip(), the CPU is not required to increment ->srcu_idx
+ * before running srcu_readers_unlock_idx(), which means that there
+ * could be an arbitrarily large number of critical sections that
+ * execute after srcu_readers_unlock_idx() but use the old value
+ * of ->srcu_idx.
+ */
+ return srcu_readers_lock_idx(sp, idx) == unlocks;
+}
+
+/**
+ * srcu_readers_active - returns true if there are readers. and false
+ * otherwise
+ * @sp: which srcu_struct to count active readers (holding srcu_read_lock).
+ *
+ * Note that this is not an atomic primitive, and can therefore suffer
+ * severe errors when invoked on an active srcu_struct. That said, it
+ * can be useful as an error check at cleanup time.
+ */
+static bool srcu_readers_active(struct srcu_struct *sp)
+{
+ int cpu;
+ unsigned long sum = 0;
+
+ for_each_possible_cpu(cpu) {
+ struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu);
+
+ sum += READ_ONCE(cpuc->srcu_lock_count[0]);
+ sum += READ_ONCE(cpuc->srcu_lock_count[1]);
+ sum -= READ_ONCE(cpuc->srcu_unlock_count[0]);
+ sum -= READ_ONCE(cpuc->srcu_unlock_count[1]);
+ }
+ return sum;
+}
+
+#define SRCU_INTERVAL 1
+
+/*
+ * Return grace-period delay, zero if there are expedited grace
+ * periods pending, SRCU_INTERVAL otherwise.
+ */
+static unsigned long srcu_get_delay(struct srcu_struct *sp)
+{
+ if (ULONG_CMP_LT(READ_ONCE(sp->srcu_gp_seq),
+ READ_ONCE(sp->srcu_gp_seq_needed_exp)))
+ return 0;
+ return SRCU_INTERVAL;
+}
+
+/**
+ * cleanup_srcu_struct - deconstruct a sleep-RCU structure
+ * @sp: structure to clean up.
+ *
+ * Must invoke this after you are finished using a given srcu_struct that
+ * was initialized via init_srcu_struct(), else you leak memory.
+ */
+void cleanup_srcu_struct(struct srcu_struct *sp)
+{
+ int cpu;
+
+ if (WARN_ON(!srcu_get_delay(sp)))
+ return; /* Leakage unless caller handles error. */
+ if (WARN_ON(srcu_readers_active(sp)))
+ return; /* Leakage unless caller handles error. */
+ flush_delayed_work(&sp->work);
+ for_each_possible_cpu(cpu)
+ flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work);
+ if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
+ WARN_ON(srcu_readers_active(sp))) {
+ pr_info("cleanup_srcu_struct: Active srcu_struct %p state: %d\n", sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)));
+ return; /* Caller forgot to stop doing call_srcu()? */
+ }
+ free_percpu(sp->sda);
+ sp->sda = NULL;
+}
+EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
+
+/*
+ * Counts the new reader in the appropriate per-CPU element of the
+ * srcu_struct.
+ * Returns an index that must be passed to the matching srcu_read_unlock().
+ */
+int __srcu_read_lock(struct srcu_struct *sp)
+{
+ int idx;
+
+ idx = READ_ONCE(sp->srcu_idx) & 0x1;
+ this_cpu_inc(sp->sda->srcu_lock_count[idx]);
+ smp_mb(); /* B */ /* Avoid leaking the critical section. */
+ return idx;
+}
+EXPORT_SYMBOL_GPL(__srcu_read_lock);
+
+/*
+ * Removes the count for the old reader from the appropriate per-CPU
+ * element of the srcu_struct. Note that this may well be a different
+ * CPU than that which was incremented by the corresponding srcu_read_lock().
+ */
+void __srcu_read_unlock(struct srcu_struct *sp, int idx)
+{
+ smp_mb(); /* C */ /* Avoid leaking the critical section. */
+ this_cpu_inc(sp->sda->srcu_unlock_count[idx]);
+}
+EXPORT_SYMBOL_GPL(__srcu_read_unlock);
+
+/*
+ * We use an adaptive strategy for synchronize_srcu() and especially for
+ * synchronize_srcu_expedited(). We spin for a fixed time period
+ * (defined below) to allow SRCU readers to exit their read-side critical
+ * sections. If there are still some readers after a few microseconds,
+ * we repeatedly block for 1-millisecond time periods.
+ */
+#define SRCU_RETRY_CHECK_DELAY 5
+
+/*
+ * Start an SRCU grace period.
+ */
+static void srcu_gp_start(struct srcu_struct *sp)
+{
+ struct srcu_data *sdp = this_cpu_ptr(sp->sda);
+ int state;
+
+ RCU_LOCKDEP_WARN(!lockdep_is_held(&sp->gp_lock),
+ "Invoked srcu_gp_start() without ->gp_lock!");
+ WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed));
+ rcu_segcblist_advance(&sdp->srcu_cblist,
+ rcu_seq_current(&sp->srcu_gp_seq));
+ (void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
+ rcu_seq_snap(&sp->srcu_gp_seq));
+ smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */
+ rcu_seq_start(&sp->srcu_gp_seq);
+ state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq));
+ WARN_ON_ONCE(state != SRCU_STATE_SCAN1);
+}
+
+/*
+ * Track online CPUs to guide callback workqueue placement.
+ */
+DEFINE_PER_CPU(bool, srcu_online);
+
+void srcu_online_cpu(unsigned int cpu)
+{
+ WRITE_ONCE(per_cpu(srcu_online, cpu), true);
+}
+
+void srcu_offline_cpu(unsigned int cpu)
+{
+ WRITE_ONCE(per_cpu(srcu_online, cpu), false);
+}
+
+/*
+ * Place the workqueue handler on the specified CPU if online, otherwise
+ * just run it whereever. This is useful for placing workqueue handlers
+ * that are to invoke the specified CPU's callbacks.
+ */
+static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
+ struct delayed_work *dwork,
+ unsigned long delay)
+{
+ bool ret;
+
+ preempt_disable();
+ if (READ_ONCE(per_cpu(srcu_online, cpu)))
+ ret = queue_delayed_work_on(cpu, wq, dwork, delay);
+ else
+ ret = queue_delayed_work(wq, dwork, delay);
+ preempt_enable();
+ return ret;
+}
+
+/*
+ * Schedule callback invocation for the specified srcu_data structure,
+ * if possible, on the corresponding CPU.
+ */
+static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay)
+{
+ srcu_queue_delayed_work_on(sdp->cpu, system_power_efficient_wq,
+ &sdp->work, delay);
+}
+
+/*
+ * Schedule callback invocation for all srcu_data structures associated
+ * with the specified srcu_node structure that have callbacks for the
+ * just-completed grace period, the one corresponding to idx. If possible,
+ * schedule this invocation on the corresponding CPUs.
+ */
+static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp,
+ unsigned long mask, unsigned long delay)
+{
+ int cpu;
+
+ for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
+ if (!(mask & (1 << (cpu - snp->grplo))))
+ continue;
+ srcu_schedule_cbs_sdp(per_cpu_ptr(sp->sda, cpu), delay);
+ }
+}
+
+/*
+ * Note the end of an SRCU grace period. Initiates callback invocation
+ * and starts a new grace period if needed.
+ *
+ * The ->srcu_cb_mutex acquisition does not protect any data, but
+ * instead prevents more than one grace period from starting while we
+ * are initiating callback invocation. This allows the ->srcu_have_cbs[]
+ * array to have a finite number of elements.
+ */
+static void srcu_gp_end(struct srcu_struct *sp)
+{
+ unsigned long cbdelay;
+ bool cbs;
+ unsigned long gpseq;
+ int idx;
+ int idxnext;
+ unsigned long mask;
+ struct srcu_node *snp;
+
+ /* Prevent more than one additional grace period. */
+ mutex_lock(&sp->srcu_cb_mutex);
+
+ /* End the current grace period. */
+ spin_lock_irq(&sp->gp_lock);
+ idx = rcu_seq_state(sp->srcu_gp_seq);
+ WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
+ cbdelay = srcu_get_delay(sp);
+ sp->srcu_last_gp_end = ktime_get_mono_fast_ns();
+ rcu_seq_end(&sp->srcu_gp_seq);
+ gpseq = rcu_seq_current(&sp->srcu_gp_seq);
+ if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq))
+ sp->srcu_gp_seq_needed_exp = gpseq;
+ spin_unlock_irq(&sp->gp_lock);
+ mutex_unlock(&sp->srcu_gp_mutex);
+ /* A new grace period can start at this point. But only one. */
+
+ /* Initiate callback invocation as needed. */
+ idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
+ idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs);
+ rcu_for_each_node_breadth_first(sp, snp) {
+ spin_lock_irq(&snp->lock);
+ cbs = false;
+ if (snp >= sp->level[rcu_num_lvls - 1])
+ cbs = snp->srcu_have_cbs[idx] == gpseq;
+ snp->srcu_have_cbs[idx] = gpseq;
+ rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1);
+ if (ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, gpseq))
+ snp->srcu_gp_seq_needed_exp = gpseq;
+ mask = snp->srcu_data_have_cbs[idx];
+ snp->srcu_data_have_cbs[idx] = 0;
+ spin_unlock_irq(&snp->lock);
+ if (cbs) {
+ smp_mb(); /* GP end before CB invocation. */
+ srcu_schedule_cbs_snp(sp, snp, mask, cbdelay);
+ }
+ }
+
+ /* Callback initiation done, allow grace periods after next. */
+ mutex_unlock(&sp->srcu_cb_mutex);
+
+ /* Start a new grace period if needed. */
+ spin_lock_irq(&sp->gp_lock);
+ gpseq = rcu_seq_current(&sp->srcu_gp_seq);
+ if (!rcu_seq_state(gpseq) &&
+ ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) {
+ srcu_gp_start(sp);
+ spin_unlock_irq(&sp->gp_lock);
+ /* Throttle expedited grace periods: Should be rare! */
+ srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff
+ ? 0 : SRCU_INTERVAL);
+ } else {
+ spin_unlock_irq(&sp->gp_lock);
+ }
+}
+
+/*
+ * Funnel-locking scheme to scalably mediate many concurrent expedited
+ * grace-period requests. This function is invoked for the first known
+ * expedited request for a grace period that has already been requested,
+ * but without expediting. To start a completely new grace period,
+ * whether expedited or not, use srcu_funnel_gp_start() instead.
+ */
+static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp,
+ unsigned long s)
+{
+ unsigned long flags;
+
+ for (; snp != NULL; snp = snp->srcu_parent) {
+ if (rcu_seq_done(&sp->srcu_gp_seq, s) ||
+ ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s))
+ return;
+ spin_lock_irqsave(&snp->lock, flags);
+ if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) {
+ spin_unlock_irqrestore(&snp->lock, flags);
+ return;
+ }
+ WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
+ spin_unlock_irqrestore(&snp->lock, flags);
+ }
+ spin_lock_irqsave(&sp->gp_lock, flags);
+ if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s))
+ sp->srcu_gp_seq_needed_exp = s;
+ spin_unlock_irqrestore(&sp->gp_lock, flags);
+}
+
+/*
+ * Funnel-locking scheme to scalably mediate many concurrent grace-period
+ * requests. The winner has to do the work of actually starting grace
+ * period s. Losers must either ensure that their desired grace-period
+ * number is recorded on at least their leaf srcu_node structure, or they
+ * must take steps to invoke their own callbacks.
+ */
+static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
+ unsigned long s, bool do_norm)
+{
+ unsigned long flags;
+ int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs);
+ struct srcu_node *snp = sdp->mynode;
+ unsigned long snp_seq;
+
+ /* Each pass through the loop does one level of the srcu_node tree. */
+ for (; snp != NULL; snp = snp->srcu_parent) {
+ if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode)
+ return; /* GP already done and CBs recorded. */
+ spin_lock_irqsave(&snp->lock, flags);
+ if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) {
+ snp_seq = snp->srcu_have_cbs[idx];
+ if (snp == sdp->mynode && snp_seq == s)
+ snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
+ spin_unlock_irqrestore(&snp->lock, flags);
+ if (snp == sdp->mynode && snp_seq != s) {
+ smp_mb(); /* CBs after GP! */
+ srcu_schedule_cbs_sdp(sdp, do_norm
+ ? SRCU_INTERVAL
+ : 0);
+ return;
+ }
+ if (!do_norm)
+ srcu_funnel_exp_start(sp, snp, s);
+ return;
+ }
+ snp->srcu_have_cbs[idx] = s;
+ if (snp == sdp->mynode)
+ snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
+ if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s))
+ snp->srcu_gp_seq_needed_exp = s;
+ spin_unlock_irqrestore(&snp->lock, flags);
+ }
+
+ /* Top of tree, must ensure the grace period will be started. */
+ spin_lock_irqsave(&sp->gp_lock, flags);
+ if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) {
+ /*
+ * Record need for grace period s. Pair with load
+ * acquire setting up for initialization.
+ */
+ smp_store_release(&sp->srcu_gp_seq_needed, s); /*^^^*/
+ }
+ if (!do_norm && ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s))
+ sp->srcu_gp_seq_needed_exp = s;
+
+ /* If grace period not already done and none in progress, start it. */
+ if (!rcu_seq_done(&sp->srcu_gp_seq, s) &&
+ rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) {
+ WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed));
+ srcu_gp_start(sp);
+ queue_delayed_work(system_power_efficient_wq, &sp->work,
+ srcu_get_delay(sp));
+ }
+ spin_unlock_irqrestore(&sp->gp_lock, flags);
+}
+
+/*
+ * Wait until all readers counted by array index idx complete, but
+ * loop an additional time if there is an expedited grace period pending.
+ * The caller must ensure that ->srcu_idx is not changed while checking.
+ */
+static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
+{
+ for (;;) {
+ if (srcu_readers_active_idx_check(sp, idx))
+ return true;
+ if (--trycount + !srcu_get_delay(sp) <= 0)
+ return false;
+ udelay(SRCU_RETRY_CHECK_DELAY);
+ }
+}
+
+/*
+ * Increment the ->srcu_idx counter so that future SRCU readers will
+ * use the other rank of the ->srcu_(un)lock_count[] arrays. This allows
+ * us to wait for pre-existing readers in a starvation-free manner.
+ */
+static void srcu_flip(struct srcu_struct *sp)
+{
+ WRITE_ONCE(sp->srcu_idx, sp->srcu_idx + 1);
+
+ /*
+ * Ensure that if the updater misses an __srcu_read_unlock()
+ * increment, that task's next __srcu_read_lock() will see the
+ * above counter update. Note that both this memory barrier
+ * and the one in srcu_readers_active_idx_check() provide the
+ * guarantee for __srcu_read_lock().
+ */
+ smp_mb(); /* D */ /* Pairs with C. */
+}
+
+/*
+ * If SRCU is likely idle, return true, otherwise return false.
+ *
+ * Note that it is OK for several current from-idle requests for a new
+ * grace period from idle to specify expediting because they will all end
+ * up requesting the same grace period anyhow. So no loss.
+ *
+ * Note also that if any CPU (including the current one) is still invoking
+ * callbacks, this function will nevertheless say "idle". This is not
+ * ideal, but the overhead of checking all CPUs' callback lists is even
+ * less ideal, especially on large systems. Furthermore, the wakeup
+ * can happen before the callback is fully removed, so we have no choice
+ * but to accept this type of error.
+ *
+ * This function is also subject to counter-wrap errors, but let's face
+ * it, if this function was preempted for enough time for the counters
+ * to wrap, it really doesn't matter whether or not we expedite the grace
+ * period. The extra overhead of a needlessly expedited grace period is
+ * negligible when amoritized over that time period, and the extra latency
+ * of a needlessly non-expedited grace period is similarly negligible.
+ */
+static bool srcu_might_be_idle(struct srcu_struct *sp)
+{
+ unsigned long curseq;
+ unsigned long flags;
+ struct srcu_data *sdp;
+ unsigned long t;
+
+ /* If the local srcu_data structure has callbacks, not idle. */
+ local_irq_save(flags);
+ sdp = this_cpu_ptr(sp->sda);
+ if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) {
+ local_irq_restore(flags);
+ return false; /* Callbacks already present, so not idle. */
+ }
+ local_irq_restore(flags);
+
+ /*
+ * No local callbacks, so probabalistically probe global state.
+ * Exact information would require acquiring locks, which would
+ * kill scalability, hence the probabalistic nature of the probe.
+ */
+
+ /* First, see if enough time has passed since the last GP. */
+ t = ktime_get_mono_fast_ns();
+ if (exp_holdoff == 0 ||
+ time_in_range_open(t, sp->srcu_last_gp_end,
+ sp->srcu_last_gp_end + exp_holdoff))
+ return false; /* Too soon after last GP. */
+
+ /* Next, check for probable idleness. */
+ curseq = rcu_seq_current(&sp->srcu_gp_seq);
+ smp_mb(); /* Order ->srcu_gp_seq with ->srcu_gp_seq_needed. */
+ if (ULONG_CMP_LT(curseq, READ_ONCE(sp->srcu_gp_seq_needed)))
+ return false; /* Grace period in progress, so not idle. */
+ smp_mb(); /* Order ->srcu_gp_seq with prior access. */
+ if (curseq != rcu_seq_current(&sp->srcu_gp_seq))
+ return false; /* GP # changed, so not idle. */
+ return true; /* With reasonable probability, idle! */
+}
+
+/*
+ * Enqueue an SRCU callback on the srcu_data structure associated with
+ * the current CPU and the specified srcu_struct structure, initiating
+ * grace-period processing if it is not already running.
+ *
+ * Note that all CPUs must agree that the grace period extended beyond
+ * all pre-existing SRCU read-side critical section. On systems with
+ * more than one CPU, this means that when "func()" is invoked, each CPU
+ * is guaranteed to have executed a full memory barrier since the end of
+ * its last corresponding SRCU read-side critical section whose beginning
+ * preceded the call to call_rcu(). It also means that each CPU executing
+ * an SRCU read-side critical section that continues beyond the start of
+ * "func()" must have executed a memory barrier after the call_rcu()
+ * but before the beginning of that SRCU read-side critical section.
+ * Note that these guarantees include CPUs that are offline, idle, or
+ * executing in user mode, as well as CPUs that are executing in the kernel.
+ *
+ * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
+ * resulting SRCU callback function "func()", then both CPU A and CPU
+ * B are guaranteed to execute a full memory barrier during the time
+ * interval between the call to call_rcu() and the invocation of "func()".
+ * This guarantee applies even if CPU A and CPU B are the same CPU (but
+ * again only if the system has more than one CPU).
+ *
+ * Of course, these guarantees apply only for invocations of call_srcu(),
+ * srcu_read_lock(), and srcu_read_unlock() that are all passed the same
+ * srcu_struct structure.
+ */
+void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
+ rcu_callback_t func, bool do_norm)
+{
+ unsigned long flags;
+ bool needexp = false;
+ bool needgp = false;
+ unsigned long s;
+ struct srcu_data *sdp;
+
+ check_init_srcu_struct(sp);
+ rhp->func = func;
+ local_irq_save(flags);
+ sdp = this_cpu_ptr(sp->sda);
+ spin_lock(&sdp->lock);
+ rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false);
+ rcu_segcblist_advance(&sdp->srcu_cblist,
+ rcu_seq_current(&sp->srcu_gp_seq));
+ s = rcu_seq_snap(&sp->srcu_gp_seq);
+ (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s);
+ if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
+ sdp->srcu_gp_seq_needed = s;
+ needgp = true;
+ }
+ if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) {
+ sdp->srcu_gp_seq_needed_exp = s;
+ needexp = true;
+ }
+ spin_unlock_irqrestore(&sdp->lock, flags);
+ if (needgp)
+ srcu_funnel_gp_start(sp, sdp, s, do_norm);
+ else if (needexp)
+ srcu_funnel_exp_start(sp, sdp->mynode, s);
+}
+
+void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
+ rcu_callback_t func)
+{
+ __call_srcu(sp, rhp, func, true);
+}
+EXPORT_SYMBOL_GPL(call_srcu);
+
+/*
+ * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
+ */
+static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm)
+{
+ struct rcu_synchronize rcu;
+
+ RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) ||
+ lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
+
+ if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
+ return;
+ might_sleep();
+ check_init_srcu_struct(sp);
+ init_completion(&rcu.completion);
+ init_rcu_head_on_stack(&rcu.head);
+ __call_srcu(sp, &rcu.head, wakeme_after_rcu, do_norm);
+ wait_for_completion(&rcu.completion);
+ destroy_rcu_head_on_stack(&rcu.head);
+}
+
+/**
+ * synchronize_srcu_expedited - Brute-force SRCU grace period
+ * @sp: srcu_struct with which to synchronize.
+ *
+ * Wait for an SRCU grace period to elapse, but be more aggressive about
+ * spinning rather than blocking when waiting.
+ *
+ * Note that synchronize_srcu_expedited() has the same deadlock and
+ * memory-ordering properties as does synchronize_srcu().
+ */
+void synchronize_srcu_expedited(struct srcu_struct *sp)
+{
+ __synchronize_srcu(sp, rcu_gp_is_normal());
+}
+EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
+
+/**
+ * synchronize_srcu - wait for prior SRCU read-side critical-section completion
+ * @sp: srcu_struct with which to synchronize.
+ *
+ * Wait for the count to drain to zero of both indexes. To avoid the
+ * possible starvation of synchronize_srcu(), it waits for the count of
+ * the index=((->srcu_idx & 1) ^ 1) to drain to zero at first,
+ * and then flip the srcu_idx and wait for the count of the other index.
+ *
+ * Can block; must be called from process context.
+ *
+ * Note that it is illegal to call synchronize_srcu() from the corresponding
+ * SRCU read-side critical section; doing so will result in deadlock.
+ * However, it is perfectly legal to call synchronize_srcu() on one
+ * srcu_struct from some other srcu_struct's read-side critical section,
+ * as long as the resulting graph of srcu_structs is acyclic.
+ *
+ * There are memory-ordering constraints implied by synchronize_srcu().
+ * On systems with more than one CPU, when synchronize_srcu() returns,
+ * each CPU is guaranteed to have executed a full memory barrier since
+ * the end of its last corresponding SRCU-sched read-side critical section
+ * whose beginning preceded the call to synchronize_srcu(). In addition,
+ * each CPU having an SRCU read-side critical section that extends beyond
+ * the return from synchronize_srcu() is guaranteed to have executed a
+ * full memory barrier after the beginning of synchronize_srcu() and before
+ * the beginning of that SRCU read-side critical section. Note that these
+ * guarantees include CPUs that are offline, idle, or executing in user mode,
+ * as well as CPUs that are executing in the kernel.
+ *
+ * Furthermore, if CPU A invoked synchronize_srcu(), which returned
+ * to its caller on CPU B, then both CPU A and CPU B are guaranteed
+ * to have executed a full memory barrier during the execution of
+ * synchronize_srcu(). This guarantee applies even if CPU A and CPU B
+ * are the same CPU, but again only if the system has more than one CPU.
+ *
+ * Of course, these memory-ordering guarantees apply only when
+ * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are
+ * passed the same srcu_struct structure.
+ *
+ * If SRCU is likely idle, expedite the first request. This semantic
+ * was provided by Classic SRCU, and is relied upon by its users, so TREE
+ * SRCU must also provide it. Note that detecting idleness is heuristic
+ * and subject to both false positives and negatives.
+ */
+void synchronize_srcu(struct srcu_struct *sp)
+{
+ if (srcu_might_be_idle(sp) || rcu_gp_is_expedited())
+ synchronize_srcu_expedited(sp);
+ else
+ __synchronize_srcu(sp, true);
+}
+EXPORT_SYMBOL_GPL(synchronize_srcu);
+
+/*
+ * Callback function for srcu_barrier() use.
+ */
+static void srcu_barrier_cb(struct rcu_head *rhp)
+{
+ struct srcu_data *sdp;
+ struct srcu_struct *sp;
+
+ sdp = container_of(rhp, struct srcu_data, srcu_barrier_head);
+ sp = sdp->sp;
+ if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt))
+ complete(&sp->srcu_barrier_completion);
+}
+
+/**
+ * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
+ * @sp: srcu_struct on which to wait for in-flight callbacks.
+ */
+void srcu_barrier(struct srcu_struct *sp)
+{
+ int cpu;
+ struct srcu_data *sdp;
+ unsigned long s = rcu_seq_snap(&sp->srcu_barrier_seq);
+
+ check_init_srcu_struct(sp);
+ mutex_lock(&sp->srcu_barrier_mutex);
+ if (rcu_seq_done(&sp->srcu_barrier_seq, s)) {
+ smp_mb(); /* Force ordering following return. */
+ mutex_unlock(&sp->srcu_barrier_mutex);
+ return; /* Someone else did our work for us. */
+ }
+ rcu_seq_start(&sp->srcu_barrier_seq);
+ init_completion(&sp->srcu_barrier_completion);
+
+ /* Initial count prevents reaching zero until all CBs are posted. */
+ atomic_set(&sp->srcu_barrier_cpu_cnt, 1);
+
+ /*
+ * Each pass through this loop enqueues a callback, but only
+ * on CPUs already having callbacks enqueued. Note that if
+ * a CPU already has callbacks enqueue, it must have already
+ * registered the need for a future grace period, so all we
+ * need do is enqueue a callback that will use the same
+ * grace period as the last callback already in the queue.
+ */
+ for_each_possible_cpu(cpu) {
+ sdp = per_cpu_ptr(sp->sda, cpu);
+ spin_lock_irq(&sdp->lock);
+ atomic_inc(&sp->srcu_barrier_cpu_cnt);
+ sdp->srcu_barrier_head.func = srcu_barrier_cb;
+ if (!rcu_segcblist_entrain(&sdp->srcu_cblist,
+ &sdp->srcu_barrier_head, 0))
+ atomic_dec(&sp->srcu_barrier_cpu_cnt);
+ spin_unlock_irq(&sdp->lock);
+ }
+
+ /* Remove the initial count, at which point reaching zero can happen. */
+ if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt))
+ complete(&sp->srcu_barrier_completion);
+ wait_for_completion(&sp->srcu_barrier_completion);
+
+ rcu_seq_end(&sp->srcu_barrier_seq);
+ mutex_unlock(&sp->srcu_barrier_mutex);
+}
+EXPORT_SYMBOL_GPL(srcu_barrier);
+
+/**
+ * srcu_batches_completed - return batches completed.
+ * @sp: srcu_struct on which to report batch completion.
+ *
+ * Report the number of batches, correlated with, but not necessarily
+ * precisely the same as, the number of grace periods that have elapsed.
+ */
+unsigned long srcu_batches_completed(struct srcu_struct *sp)
+{
+ return sp->srcu_idx;
+}
+EXPORT_SYMBOL_GPL(srcu_batches_completed);
+
+/*
+ * Core SRCU state machine. Push state bits of ->srcu_gp_seq
+ * to SRCU_STATE_SCAN2, and invoke srcu_gp_end() when scan has
+ * completed in that state.
+ */
+static void srcu_advance_state(struct srcu_struct *sp)
+{
+ int idx;
+
+ mutex_lock(&sp->srcu_gp_mutex);
+
+ /*
+ * Because readers might be delayed for an extended period after
+ * fetching ->srcu_idx for their index, at any point in time there
+ * might well be readers using both idx=0 and idx=1. We therefore
+ * need to wait for readers to clear from both index values before
+ * invoking a callback.
+ *
+ * The load-acquire ensures that we see the accesses performed
+ * by the prior grace period.
+ */
+ idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */
+ if (idx == SRCU_STATE_IDLE) {
+ spin_lock_irq(&sp->gp_lock);
+ if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
+ WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq));
+ spin_unlock_irq(&sp->gp_lock);
+ mutex_unlock(&sp->srcu_gp_mutex);
+ return;
+ }
+ idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq));
+ if (idx == SRCU_STATE_IDLE)
+ srcu_gp_start(sp);
+ spin_unlock_irq(&sp->gp_lock);
+ if (idx != SRCU_STATE_IDLE) {
+ mutex_unlock(&sp->srcu_gp_mutex);
+ return; /* Someone else started the grace period. */
+ }
+ }
+
+ if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN1) {
+ idx = 1 ^ (sp->srcu_idx & 1);
+ if (!try_check_zero(sp, idx, 1)) {
+ mutex_unlock(&sp->srcu_gp_mutex);
+ return; /* readers present, retry later. */
+ }
+ srcu_flip(sp);
+ rcu_seq_set_state(&sp->srcu_gp_seq, SRCU_STATE_SCAN2);
+ }
+
+ if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN2) {
+
+ /*
+ * SRCU read-side critical sections are normally short,
+ * so check at least twice in quick succession after a flip.
+ */
+ idx = 1 ^ (sp->srcu_idx & 1);
+ if (!try_check_zero(sp, idx, 2)) {
+ mutex_unlock(&sp->srcu_gp_mutex);
+ return; /* readers present, retry later. */
+ }
+ srcu_gp_end(sp); /* Releases ->srcu_gp_mutex. */
+ }
+}
+
+/*
+ * Invoke a limited number of SRCU callbacks that have passed through
+ * their grace period. If there are more to do, SRCU will reschedule
+ * the workqueue. Note that needed memory barriers have been executed
+ * in this task's context by srcu_readers_active_idx_check().
+ */
+static void srcu_invoke_callbacks(struct work_struct *work)
+{
+ bool more;
+ struct rcu_cblist ready_cbs;
+ struct rcu_head *rhp;
+ struct srcu_data *sdp;
+ struct srcu_struct *sp;
+
+ sdp = container_of(work, struct srcu_data, work.work);
+ sp = sdp->sp;
+ rcu_cblist_init(&ready_cbs);
+ spin_lock_irq(&sdp->lock);
+ smp_mb(); /* Old grace periods before callback invocation! */
+ rcu_segcblist_advance(&sdp->srcu_cblist,
+ rcu_seq_current(&sp->srcu_gp_seq));
+ if (sdp->srcu_cblist_invoking ||
+ !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) {
+ spin_unlock_irq(&sdp->lock);
+ return; /* Someone else on the job or nothing to do. */
+ }
+
+ /* We are on the job! Extract and invoke ready callbacks. */
+ sdp->srcu_cblist_invoking = true;
+ rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs);
+ spin_unlock_irq(&sdp->lock);
+ rhp = rcu_cblist_dequeue(&ready_cbs);
+ for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
+ local_bh_disable();
+ rhp->func(rhp);
+ local_bh_enable();
+ }
+
+ /*
+ * Update counts, accelerate new callbacks, and if needed,
+ * schedule another round of callback invocation.
+ */
+ spin_lock_irq(&sdp->lock);
+ rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs);
+ (void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
+ rcu_seq_snap(&sp->srcu_gp_seq));
+ sdp->srcu_cblist_invoking = false;
+ more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist);
+ spin_unlock_irq(&sdp->lock);
+ if (more)
+ srcu_schedule_cbs_sdp(sdp, 0);
+}
+
+/*
+ * Finished one round of SRCU grace period. Start another if there are
+ * more SRCU callbacks queued, otherwise put SRCU into not-running state.
+ */
+static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
+{
+ bool pushgp = true;
+
+ spin_lock_irq(&sp->gp_lock);
+ if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
+ if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) {
+ /* All requests fulfilled, time to go idle. */
+ pushgp = false;
+ }
+ } else if (!rcu_seq_state(sp->srcu_gp_seq)) {
+ /* Outstanding request and no GP. Start one. */
+ srcu_gp_start(sp);
+ }
+ spin_unlock_irq(&sp->gp_lock);
+
+ if (pushgp)
+ queue_delayed_work(system_power_efficient_wq, &sp->work, delay);
+}
+
+/*
+ * This is the work-queue function that handles SRCU grace periods.
+ */
+void process_srcu(struct work_struct *work)
+{
+ struct srcu_struct *sp;
+
+ sp = container_of(work, struct srcu_struct, work.work);
+
+ srcu_advance_state(sp);
+ srcu_reschedule(sp, srcu_get_delay(sp));
+}
+EXPORT_SYMBOL_GPL(process_srcu);
+
+void srcutorture_get_gp_data(enum rcutorture_type test_type,
+ struct srcu_struct *sp, int *flags,
+ unsigned long *gpnum, unsigned long *completed)
+{
+ if (test_type != SRCU_FLAVOR)
+ return;
+ *flags = 0;
+ *completed = rcu_seq_ctr(sp->srcu_gp_seq);
+ *gpnum = rcu_seq_ctr(sp->srcu_gp_seq_needed);
+}
+EXPORT_SYMBOL_GPL(srcutorture_get_gp_data);
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 6ad330dbbae2..e5385731e391 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -79,7 +79,7 @@ EXPORT_SYMBOL(__rcu_is_watching);
*/
static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
{
- RCU_TRACE(reset_cpu_stall_ticks(rcp));
+ RCU_TRACE(reset_cpu_stall_ticks(rcp);)
if (rcp->donetail != rcp->curtail) {
rcp->donetail = rcp->curtail;
return 1;
@@ -125,7 +125,7 @@ void rcu_bh_qs(void)
*/
void rcu_check_callbacks(int user)
{
- RCU_TRACE(check_cpu_stalls());
+ RCU_TRACE(check_cpu_stalls();)
if (user)
rcu_sched_qs();
else if (!in_softirq())
@@ -143,7 +143,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
const char *rn = NULL;
struct rcu_head *next, *list;
unsigned long flags;
- RCU_TRACE(int cb_count = 0);
+ RCU_TRACE(int cb_count = 0;)
/* Move the ready-to-invoke callbacks to a local list. */
local_irq_save(flags);
@@ -152,7 +152,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
local_irq_restore(flags);
return;
}
- RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1));
+ RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1);)
list = rcp->rcucblist;
rcp->rcucblist = *rcp->donetail;
*rcp->donetail = NULL;
@@ -162,7 +162,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
local_irq_restore(flags);
/* Invoke the callbacks on the local list. */
- RCU_TRACE(rn = rcp->name);
+ RCU_TRACE(rn = rcp->name;)
while (list) {
next = list->next;
prefetch(next);
@@ -171,9 +171,9 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
__rcu_reclaim(rn, list);
local_bh_enable();
list = next;
- RCU_TRACE(cb_count++);
+ RCU_TRACE(cb_count++;)
}
- RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
+ RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count);)
RCU_TRACE(trace_rcu_batch_end(rcp->name,
cb_count, 0, need_resched(),
is_idle_task(current),
@@ -221,7 +221,7 @@ static void __call_rcu(struct rcu_head *head,
local_irq_save(flags);
*rcp->curtail = head;
rcp->curtail = &head->next;
- RCU_TRACE(rcp->qlen++);
+ RCU_TRACE(rcp->qlen++;)
local_irq_restore(flags);
if (unlikely(is_idle_task(current))) {
@@ -254,8 +254,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
void __init rcu_init(void)
{
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
- RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk));
- RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk));
+ RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk);)
+ RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk);)
rcu_early_boot_tests();
}
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index c64b827ecbca..371034e77f87 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -52,7 +52,7 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {
RCU_TRACE(.name = "rcu_bh")
};
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
+#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU)
#include <linux/kernel_stat.h>
int rcu_scheduler_active __read_mostly;
@@ -65,15 +65,16 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active);
* to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage.
* The reason for this is that Tiny RCU does not need kthreads, so does
* not have to care about the fact that the scheduler is half-initialized
- * at a certain phase of the boot process.
+ * at a certain phase of the boot process. Unless SRCU is in the mix.
*/
void __init rcu_scheduler_starting(void)
{
WARN_ON(nr_context_switches() > 0);
- rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
+ rcu_scheduler_active = IS_ENABLED(CONFIG_SRCU)
+ ? RCU_SCHEDULER_INIT : RCU_SCHEDULER_RUNNING;
}
-#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */
#ifdef CONFIG_RCU_TRACE
@@ -162,8 +163,8 @@ static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
static void check_cpu_stalls(void)
{
- RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk));
- RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk));
+ RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk);)
+ RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk);)
}
#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 50fee7689e71..e354e475e645 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -57,6 +57,7 @@
#include <linux/random.h>
#include <linux/trace_events.h>
#include <linux/suspend.h>
+#include <linux/ftrace.h>
#include "tree.h"
#include "rcu.h"
@@ -97,8 +98,8 @@ struct rcu_state sname##_state = { \
.gpnum = 0UL - 300UL, \
.completed = 0UL - 300UL, \
.orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \
- .orphan_nxttail = &sname##_state.orphan_nxtlist, \
- .orphan_donetail = &sname##_state.orphan_donelist, \
+ .orphan_pend = RCU_CBLIST_INITIALIZER(sname##_state.orphan_pend), \
+ .orphan_done = RCU_CBLIST_INITIALIZER(sname##_state.orphan_done), \
.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
.name = RCU_STATE_NAME(sname), \
.abbr = sabbr, \
@@ -123,7 +124,7 @@ static int rcu_fanout_leaf = RCU_FANOUT_LEAF;
module_param(rcu_fanout_leaf, int, 0444);
int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
/* Number of rcu_nodes at specified level. */
-static int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
+int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
/* panic() on RCU Stall sysctl. */
int sysctl_panic_on_rcu_stall __read_mostly;
@@ -199,7 +200,7 @@ static const int gp_cleanup_delay;
/*
* Number of grace periods between delays, normalized by the duration of
- * the delay. The longer the the delay, the more the grace periods between
+ * the delay. The longer the delay, the more the grace periods between
* each delay. The reason for this normalization is that it means that,
* for non-zero delays, the overall slowdown of grace periods is constant
* regardless of the duration of the delay. This arrangement balances
@@ -272,11 +273,19 @@ void rcu_bh_qs(void)
}
}
-static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
+/*
+ * Steal a bit from the bottom of ->dynticks for idle entry/exit
+ * control. Initially this is for TLB flushing.
+ */
+#define RCU_DYNTICK_CTRL_MASK 0x1
+#define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1)
+#ifndef rcu_eqs_special_exit
+#define rcu_eqs_special_exit() do { } while (0)
+#endif
static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
.dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
- .dynticks = ATOMIC_INIT(1),
+ .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
.dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
.dynticks_idle = ATOMIC_INIT(1),
@@ -284,21 +293,40 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
};
/*
+ * There's a few places, currently just in the tracing infrastructure,
+ * that uses rcu_irq_enter() to make sure RCU is watching. But there's
+ * a small location where that will not even work. In those cases
+ * rcu_irq_enter_disabled() needs to be checked to make sure rcu_irq_enter()
+ * can be called.
+ */
+static DEFINE_PER_CPU(bool, disable_rcu_irq_enter);
+
+bool rcu_irq_enter_disabled(void)
+{
+ return this_cpu_read(disable_rcu_irq_enter);
+}
+
+/*
* Record entry into an extended quiescent state. This is only to be
* called when not already in an extended quiescent state.
*/
static void rcu_dynticks_eqs_enter(void)
{
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
- int special;
+ int seq;
/*
- * CPUs seeing atomic_inc_return() must see prior RCU read-side
+ * CPUs seeing atomic_add_return() must see prior RCU read-side
* critical sections, and we also must force ordering with the
* next idle sojourn.
*/
- special = atomic_inc_return(&rdtp->dynticks);
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && special & 0x1);
+ seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks);
+ /* Better be in an extended quiescent state! */
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ (seq & RCU_DYNTICK_CTRL_CTR));
+ /* Better not have special action (TLB flush) pending! */
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ (seq & RCU_DYNTICK_CTRL_MASK));
}
/*
@@ -308,15 +336,22 @@ static void rcu_dynticks_eqs_enter(void)
static void rcu_dynticks_eqs_exit(void)
{
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
- int special;
+ int seq;
/*
- * CPUs seeing atomic_inc_return() must see prior idle sojourns,
+ * CPUs seeing atomic_add_return() must see prior idle sojourns,
* and we also must force ordering with the next RCU read-side
* critical section.
*/
- special = atomic_inc_return(&rdtp->dynticks);
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(special & 0x1));
+ seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ !(seq & RCU_DYNTICK_CTRL_CTR));
+ if (seq & RCU_DYNTICK_CTRL_MASK) {
+ atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdtp->dynticks);
+ smp_mb__after_atomic(); /* _exit after clearing mask. */
+ /* Prefer duplicate flushes to losing a flush. */
+ rcu_eqs_special_exit();
+ }
}
/*
@@ -333,9 +368,9 @@ static void rcu_dynticks_eqs_online(void)
{
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
- if (atomic_read(&rdtp->dynticks) & 0x1)
+ if (atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR)
return;
- atomic_add(0x1, &rdtp->dynticks);
+ atomic_add(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks);
}
/*
@@ -347,7 +382,7 @@ bool rcu_dynticks_curr_cpu_in_eqs(void)
{
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
- return !(atomic_read(&rdtp->dynticks) & 0x1);
+ return !(atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR);
}
/*
@@ -358,7 +393,7 @@ int rcu_dynticks_snap(struct rcu_dynticks *rdtp)
{
int snap = atomic_add_return(0, &rdtp->dynticks);
- return snap;
+ return snap & ~RCU_DYNTICK_CTRL_MASK;
}
/*
@@ -367,7 +402,7 @@ int rcu_dynticks_snap(struct rcu_dynticks *rdtp)
*/
static bool rcu_dynticks_in_eqs(int snap)
{
- return !(snap & 0x1);
+ return !(snap & RCU_DYNTICK_CTRL_CTR);
}
/*
@@ -387,14 +422,34 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_dynticks *rdtp, int snap)
static void rcu_dynticks_momentary_idle(void)
{
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
- int special = atomic_add_return(2, &rdtp->dynticks);
+ int special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR,
+ &rdtp->dynticks);
/* It is illegal to call this from idle state. */
- WARN_ON_ONCE(!(special & 0x1));
+ WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR));
}
-DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr);
-EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr);
+/*
+ * Set the special (bottom) bit of the specified CPU so that it
+ * will take special action (such as flushing its TLB) on the
+ * next exit from an extended quiescent state. Returns true if
+ * the bit was successfully set, or false if the CPU was not in
+ * an extended quiescent state.
+ */
+bool rcu_eqs_special_set(int cpu)
+{
+ int old;
+ int new;
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+
+ do {
+ old = atomic_read(&rdtp->dynticks);
+ if (old & RCU_DYNTICK_CTRL_CTR)
+ return false;
+ new = old | RCU_DYNTICK_CTRL_MASK;
+ } while (atomic_cmpxchg(&rdtp->dynticks, old, new) != old);
+ return true;
+}
/*
* Let the RCU core know that this CPU has gone through the scheduler,
@@ -403,44 +458,14 @@ EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr);
* memory barriers to let the RCU core know about it, regardless of what
* this CPU might (or might not) do in the near future.
*
- * We inform the RCU core by emulating a zero-duration dyntick-idle
- * period, which we in turn do by incrementing the ->dynticks counter
- * by two.
+ * We inform the RCU core by emulating a zero-duration dyntick-idle period.
*
* The caller must have disabled interrupts.
*/
static void rcu_momentary_dyntick_idle(void)
{
- struct rcu_data *rdp;
- int resched_mask;
- struct rcu_state *rsp;
-
- /*
- * Yes, we can lose flag-setting operations. This is OK, because
- * the flag will be set again after some delay.
- */
- resched_mask = raw_cpu_read(rcu_sched_qs_mask);
- raw_cpu_write(rcu_sched_qs_mask, 0);
-
- /* Find the flavor that needs a quiescent state. */
- for_each_rcu_flavor(rsp) {
- rdp = raw_cpu_ptr(rsp->rda);
- if (!(resched_mask & rsp->flavor_mask))
- continue;
- smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */
- if (READ_ONCE(rdp->mynode->completed) !=
- READ_ONCE(rdp->cond_resched_completed))
- continue;
-
- /*
- * Pretend to be momentarily idle for the quiescent state.
- * This allows the grace-period kthread to record the
- * quiescent state, with no need for this CPU to do anything
- * further.
- */
- rcu_dynticks_momentary_idle();
- break;
- }
+ raw_cpu_write(rcu_dynticks.rcu_need_heavy_qs, false);
+ rcu_dynticks_momentary_idle();
}
/*
@@ -448,14 +473,22 @@ static void rcu_momentary_dyntick_idle(void)
* and requires special handling for preemptible RCU.
* The caller must have disabled interrupts.
*/
-void rcu_note_context_switch(void)
+void rcu_note_context_switch(bool preempt)
{
barrier(); /* Avoid RCU read-side critical sections leaking down. */
trace_rcu_utilization(TPS("Start context switch"));
rcu_sched_qs();
rcu_preempt_note_context_switch();
- if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
+ /* Load rcu_urgent_qs before other flags. */
+ if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs)))
+ goto out;
+ this_cpu_write(rcu_dynticks.rcu_urgent_qs, false);
+ if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs)))
rcu_momentary_dyntick_idle();
+ this_cpu_inc(rcu_dynticks.rcu_qs_ctr);
+ if (!preempt)
+ rcu_note_voluntary_context_switch_lite(current);
+out:
trace_rcu_utilization(TPS("End context switch"));
barrier(); /* Avoid RCU read-side critical sections leaking up. */
}
@@ -478,29 +511,26 @@ void rcu_all_qs(void)
{
unsigned long flags;
+ if (!raw_cpu_read(rcu_dynticks.rcu_urgent_qs))
+ return;
+ preempt_disable();
+ /* Load rcu_urgent_qs before other flags. */
+ if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) {
+ preempt_enable();
+ return;
+ }
+ this_cpu_write(rcu_dynticks.rcu_urgent_qs, false);
barrier(); /* Avoid RCU read-side critical sections leaking down. */
- if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) {
+ if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs))) {
local_irq_save(flags);
rcu_momentary_dyntick_idle();
local_irq_restore(flags);
}
- if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) {
- /*
- * Yes, we just checked a per-CPU variable with preemption
- * enabled, so we might be migrated to some other CPU at
- * this point. That is OK because in that case, the
- * migration will supply the needed quiescent state.
- * We might end up needlessly disabling preemption and
- * invoking rcu_sched_qs() on the destination CPU, but
- * the probability and cost are both quite low, so this
- * should not be a problem in practice.
- */
- preempt_disable();
+ if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)))
rcu_sched_qs();
- preempt_enable();
- }
- this_cpu_inc(rcu_qs_ctr);
+ this_cpu_inc(rcu_dynticks.rcu_qs_ctr);
barrier(); /* Avoid RCU read-side critical sections leaking up. */
+ preempt_enable();
}
EXPORT_SYMBOL_GPL(rcu_all_qs);
@@ -689,15 +719,11 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
default:
break;
}
- if (rsp != NULL) {
- *flags = READ_ONCE(rsp->gp_flags);
- *gpnum = READ_ONCE(rsp->gpnum);
- *completed = READ_ONCE(rsp->completed);
+ if (rsp == NULL)
return;
- }
- *flags = 0;
- *gpnum = 0;
- *completed = 0;
+ *flags = READ_ONCE(rsp->gp_flags);
+ *gpnum = READ_ONCE(rsp->gpnum);
+ *completed = READ_ONCE(rsp->completed);
}
EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
@@ -713,16 +739,6 @@ void rcutorture_record_progress(unsigned long vernum)
EXPORT_SYMBOL_GPL(rcutorture_record_progress);
/*
- * Does the CPU have callbacks ready to be invoked?
- */
-static int
-cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
-{
- return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] &&
- rdp->nxttail[RCU_NEXT_TAIL] != NULL;
-}
-
-/*
* Return the root node of the specified rcu_state structure.
*/
static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
@@ -752,44 +768,39 @@ static int rcu_future_needs_gp(struct rcu_state *rsp)
static bool
cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
{
- int i;
-
if (rcu_gp_in_progress(rsp))
return false; /* No, a grace period is already in progress. */
if (rcu_future_needs_gp(rsp))
return true; /* Yes, a no-CBs CPU needs one. */
- if (!rdp->nxttail[RCU_NEXT_TAIL])
+ if (!rcu_segcblist_is_enabled(&rdp->cblist))
return false; /* No, this is a no-CBs (or offline) CPU. */
- if (*rdp->nxttail[RCU_NEXT_READY_TAIL])
+ if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
return true; /* Yes, CPU has newly registered callbacks. */
- for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
- if (rdp->nxttail[i - 1] != rdp->nxttail[i] &&
- ULONG_CMP_LT(READ_ONCE(rsp->completed),
- rdp->nxtcompleted[i]))
- return true; /* Yes, CBs for future grace period. */
+ if (rcu_segcblist_future_gp_needed(&rdp->cblist,
+ READ_ONCE(rsp->completed)))
+ return true; /* Yes, CBs for future grace period. */
return false; /* No grace period needed. */
}
/*
- * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state
+ * rcu_eqs_enter_common - current CPU is entering an extended quiescent state
*
- * If the new value of the ->dynticks_nesting counter now is zero,
- * we really have entered idle, and must do the appropriate accounting.
- * The caller must have disabled interrupts.
+ * Enter idle, doing appropriate accounting. The caller must have
+ * disabled interrupts.
*/
-static void rcu_eqs_enter_common(long long oldval, bool user)
+static void rcu_eqs_enter_common(bool user)
{
struct rcu_state *rsp;
struct rcu_data *rdp;
- RCU_TRACE(struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);)
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
- trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
+ trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0);
if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
!user && !is_idle_task(current)) {
struct task_struct *idle __maybe_unused =
idle_task(smp_processor_id());
- trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0);
+ trace_rcu_dyntick(TPS("Error on entry: not idle task"), rdtp->dynticks_nesting, 0);
rcu_ftrace_dump(DUMP_ORIG);
WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
current->pid, current->comm,
@@ -800,7 +811,10 @@ static void rcu_eqs_enter_common(long long oldval, bool user)
do_nocb_deferred_wakeup(rdp);
}
rcu_prepare_for_idle();
- rcu_dynticks_eqs_enter();
+ __this_cpu_inc(disable_rcu_irq_enter);
+ rdtp->dynticks_nesting = 0; /* Breaks tracing momentarily. */
+ rcu_dynticks_eqs_enter(); /* After this, tracing works again. */
+ __this_cpu_dec(disable_rcu_irq_enter);
rcu_dynticks_task_enter();
/*
@@ -821,19 +835,15 @@ static void rcu_eqs_enter_common(long long oldval, bool user)
*/
static void rcu_eqs_enter(bool user)
{
- long long oldval;
struct rcu_dynticks *rdtp;
rdtp = this_cpu_ptr(&rcu_dynticks);
- oldval = rdtp->dynticks_nesting;
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- (oldval & DYNTICK_TASK_NEST_MASK) == 0);
- if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) {
- rdtp->dynticks_nesting = 0;
- rcu_eqs_enter_common(oldval, user);
- } else {
+ (rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0);
+ if ((rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE)
+ rcu_eqs_enter_common(user);
+ else
rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
- }
}
/**
@@ -892,19 +902,18 @@ void rcu_user_enter(void)
*/
void rcu_irq_exit(void)
{
- long long oldval;
struct rcu_dynticks *rdtp;
RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!");
rdtp = this_cpu_ptr(&rcu_dynticks);
- oldval = rdtp->dynticks_nesting;
- rdtp->dynticks_nesting--;
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- rdtp->dynticks_nesting < 0);
- if (rdtp->dynticks_nesting)
- trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting);
- else
- rcu_eqs_enter_common(oldval, true);
+ rdtp->dynticks_nesting < 1);
+ if (rdtp->dynticks_nesting <= 1) {
+ rcu_eqs_enter_common(true);
+ } else {
+ trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nesting, rdtp->dynticks_nesting - 1);
+ rdtp->dynticks_nesting--;
+ }
rcu_sysidle_enter(1);
}
@@ -1150,6 +1159,24 @@ bool notrace rcu_is_watching(void)
}
EXPORT_SYMBOL_GPL(rcu_is_watching);
+/*
+ * If a holdout task is actually running, request an urgent quiescent
+ * state from its CPU. This is unsynchronized, so migrations can cause
+ * the request to go to the wrong CPU. Which is OK, all that will happen
+ * is that the CPU's next context switch will be a bit slower and next
+ * time around this task will generate another request.
+ */
+void rcu_request_urgent_qs_task(struct task_struct *t)
+{
+ int cpu;
+
+ barrier();
+ cpu = task_cpu(t);
+ if (!task_curr(t))
+ return; /* This task is not running on that CPU. */
+ smp_store_release(per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, cpu), true);
+}
+
#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
/*
@@ -1235,7 +1262,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
bool *isidle, unsigned long *maxj)
{
unsigned long jtsq;
- int *rcrmp;
+ bool *rnhqp;
+ bool *ruqp;
unsigned long rjtsc;
struct rcu_node *rnp;
@@ -1271,11 +1299,15 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
* might not be the case for nohz_full CPUs looping in the kernel.
*/
rnp = rdp->mynode;
+ ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu);
if (time_after(jiffies, rdp->rsp->gp_start + jtsq) &&
- READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_qs_ctr, rdp->cpu) &&
+ READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_dynticks.rcu_qs_ctr, rdp->cpu) &&
READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) {
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc"));
return 1;
+ } else {
+ /* Load rcu_qs_ctr before store to rcu_urgent_qs. */
+ smp_store_release(ruqp, true);
}
/* Check for the CPU being offline. */
@@ -1292,7 +1324,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
* in-kernel CPU-bound tasks cannot advance grace periods.
* So if the grace period is old enough, make the CPU pay attention.
* Note that the unsynchronized assignments to the per-CPU
- * rcu_sched_qs_mask variable are safe. Yes, setting of
+ * rcu_need_heavy_qs variable are safe. Yes, setting of
* bits can be lost, but they will be set again on the next
* force-quiescent-state pass. So lost bit sets do not result
* in incorrect behavior, merely in a grace period lasting
@@ -1306,16 +1338,13 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
* is set too high, we override with half of the RCU CPU stall
* warning delay.
*/
- rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu);
- if (time_after(jiffies, rdp->rsp->gp_start + jtsq) ||
- time_after(jiffies, rdp->rsp->jiffies_resched)) {
- if (!(READ_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) {
- WRITE_ONCE(rdp->cond_resched_completed,
- READ_ONCE(rdp->mynode->completed));
- smp_mb(); /* ->cond_resched_completed before *rcrmp. */
- WRITE_ONCE(*rcrmp,
- READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask);
- }
+ rnhqp = &per_cpu(rcu_dynticks.rcu_need_heavy_qs, rdp->cpu);
+ if (!READ_ONCE(*rnhqp) &&
+ (time_after(jiffies, rdp->rsp->gp_start + jtsq) ||
+ time_after(jiffies, rdp->rsp->jiffies_resched))) {
+ WRITE_ONCE(*rnhqp, true);
+ /* Store rcu_need_heavy_qs before rcu_urgent_qs. */
+ smp_store_release(ruqp, true);
rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
}
@@ -1475,7 +1504,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
print_cpu_stall_info_end();
for_each_possible_cpu(cpu)
- totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
+ totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda,
+ cpu)->cblist);
pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n",
smp_processor_id(), (long)(jiffies - rsp->gp_start),
(long)rsp->gpnum, (long)rsp->completed, totqlen);
@@ -1529,7 +1559,8 @@ static void print_cpu_stall(struct rcu_state *rsp)
print_cpu_stall_info(rsp, smp_processor_id());
print_cpu_stall_info_end();
for_each_possible_cpu(cpu)
- totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
+ totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda,
+ cpu)->cblist);
pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n",
jiffies - rsp->gp_start,
(long)rsp->gpnum, (long)rsp->completed, totqlen);
@@ -1632,30 +1663,6 @@ void rcu_cpu_stall_reset(void)
}
/*
- * Initialize the specified rcu_data structure's default callback list
- * to empty. The default callback list is the one that is not used by
- * no-callbacks CPUs.
- */
-static void init_default_callback_list(struct rcu_data *rdp)
-{
- int i;
-
- rdp->nxtlist = NULL;
- for (i = 0; i < RCU_NEXT_SIZE; i++)
- rdp->nxttail[i] = &rdp->nxtlist;
-}
-
-/*
- * Initialize the specified rcu_data structure's callback list to empty.
- */
-static void init_callback_list(struct rcu_data *rdp)
-{
- if (init_nocb_callback_list(rdp))
- return;
- init_default_callback_list(rdp);
-}
-
-/*
* Determine the value that ->completed will have at the end of the
* next subsequent grace period. This is used to tag callbacks so that
* a CPU can invoke callbacks in a timely fashion even if that CPU has
@@ -1709,7 +1716,6 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
unsigned long *c_out)
{
unsigned long c;
- int i;
bool ret = false;
struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
@@ -1755,13 +1761,11 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
/*
* Get a new grace-period number. If there really is no grace
* period in progress, it will be smaller than the one we obtained
- * earlier. Adjust callbacks as needed. Note that even no-CBs
- * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed.
+ * earlier. Adjust callbacks as needed.
*/
c = rcu_cbs_completed(rdp->rsp, rnp_root);
- for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++)
- if (ULONG_CMP_LT(c, rdp->nxtcompleted[i]))
- rdp->nxtcompleted[i] = c;
+ if (!rcu_is_nocb_cpu(rdp->cpu))
+ (void)rcu_segcblist_accelerate(&rdp->cblist, c);
/*
* If the needed for the required grace period is already
@@ -1793,9 +1797,7 @@ out:
/*
* Clean up any old requests for the just-ended grace period. Also return
- * whether any additional grace periods have been requested. Also invoke
- * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads
- * waiting for this grace period to complete.
+ * whether any additional grace periods have been requested.
*/
static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
{
@@ -1841,57 +1843,27 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp)
static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp)
{
- unsigned long c;
- int i;
- bool ret;
-
- /* If the CPU has no callbacks, nothing to do. */
- if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
- return false;
-
- /*
- * Starting from the sublist containing the callbacks most
- * recently assigned a ->completed number and working down, find the
- * first sublist that is not assignable to an upcoming grace period.
- * Such a sublist has something in it (first two tests) and has
- * a ->completed number assigned that will complete sooner than
- * the ->completed number for newly arrived callbacks (last test).
- *
- * The key point is that any later sublist can be assigned the
- * same ->completed number as the newly arrived callbacks, which
- * means that the callbacks in any of these later sublist can be
- * grouped into a single sublist, whether or not they have already
- * been assigned a ->completed number.
- */
- c = rcu_cbs_completed(rsp, rnp);
- for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--)
- if (rdp->nxttail[i] != rdp->nxttail[i - 1] &&
- !ULONG_CMP_GE(rdp->nxtcompleted[i], c))
- break;
+ bool ret = false;
- /*
- * If there are no sublist for unassigned callbacks, leave.
- * At the same time, advance "i" one sublist, so that "i" will
- * index into the sublist where all the remaining callbacks should
- * be grouped into.
- */
- if (++i >= RCU_NEXT_TAIL)
+ /* If no pending (not yet ready to invoke) callbacks, nothing to do. */
+ if (!rcu_segcblist_pend_cbs(&rdp->cblist))
return false;
/*
- * Assign all subsequent callbacks' ->completed number to the next
- * full grace period and group them all in the sublist initially
- * indexed by "i".
+ * Callbacks are often registered with incomplete grace-period
+ * information. Something about the fact that getting exact
+ * information requires acquiring a global lock... RCU therefore
+ * makes a conservative estimate of the grace period number at which
+ * a given callback will become ready to invoke. The following
+ * code checks this estimate and improves it when possible, thus
+ * accelerating callback invocation to an earlier grace-period
+ * number.
*/
- for (; i <= RCU_NEXT_TAIL; i++) {
- rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL];
- rdp->nxtcompleted[i] = c;
- }
- /* Record any needed additional grace periods. */
- ret = rcu_start_future_gp(rnp, rdp, NULL);
+ if (rcu_segcblist_accelerate(&rdp->cblist, rcu_cbs_completed(rsp, rnp)))
+ ret = rcu_start_future_gp(rnp, rdp, NULL);
/* Trace depending on how much we were able to accelerate. */
- if (!*rdp->nxttail[RCU_WAIT_TAIL])
+ if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL))
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB"));
else
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB"));
@@ -1911,32 +1883,15 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp)
{
- int i, j;
-
- /* If the CPU has no callbacks, nothing to do. */
- if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
+ /* If no pending (not yet ready to invoke) callbacks, nothing to do. */
+ if (!rcu_segcblist_pend_cbs(&rdp->cblist))
return false;
/*
* Find all callbacks whose ->completed numbers indicate that they
* are ready to invoke, and put them into the RCU_DONE_TAIL sublist.
*/
- for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
- if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i]))
- break;
- rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i];
- }
- /* Clean up any sublist tail pointers that were misordered above. */
- for (j = RCU_WAIT_TAIL; j < i; j++)
- rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL];
-
- /* Copy down callbacks to fill in empty sublists. */
- for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
- if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL])
- break;
- rdp->nxttail[j] = rdp->nxttail[i];
- rdp->nxtcompleted[j] = rdp->nxtcompleted[i];
- }
+ rcu_segcblist_advance(&rdp->cblist, rnp->completed);
/* Classify any remaining callbacks. */
return rcu_accelerate_cbs(rsp, rnp, rdp);
@@ -1981,7 +1936,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart"));
need_gp = !!(rnp->qsmask & rdp->grpmask);
rdp->cpu_no_qs.b.norm = need_gp;
- rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
+ rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr);
rdp->core_needs_qs = need_gp;
zero_cpu_stall_ticks(rdp);
WRITE_ONCE(rdp->gpwrap, false);
@@ -2579,7 +2534,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
* within the current grace period.
*/
rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */
- rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
+ rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
@@ -2653,13 +2608,8 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
* because _rcu_barrier() excludes CPU-hotplug operations, so it
* cannot be running now. Thus no memory barrier is required.
*/
- if (rdp->nxtlist != NULL) {
- rsp->qlen_lazy += rdp->qlen_lazy;
- rsp->qlen += rdp->qlen;
- rdp->n_cbs_orphaned += rdp->qlen;
- rdp->qlen_lazy = 0;
- WRITE_ONCE(rdp->qlen, 0);
- }
+ rdp->n_cbs_orphaned += rcu_segcblist_n_cbs(&rdp->cblist);
+ rcu_segcblist_extract_count(&rdp->cblist, &rsp->orphan_done);
/*
* Next, move those callbacks still needing a grace period to
@@ -2667,31 +2617,18 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
* Some of the callbacks might have gone partway through a grace
* period, but that is too bad. They get to start over because we
* cannot assume that grace periods are synchronized across CPUs.
- * We don't bother updating the ->nxttail[] array yet, instead
- * we just reset the whole thing later on.
*/
- if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) {
- *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL];
- rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL];
- *rdp->nxttail[RCU_DONE_TAIL] = NULL;
- }
+ rcu_segcblist_extract_pend_cbs(&rdp->cblist, &rsp->orphan_pend);
/*
* Then move the ready-to-invoke callbacks to the orphanage,
* where some other CPU will pick them up. These will not be
* required to pass though another grace period: They are done.
*/
- if (rdp->nxtlist != NULL) {
- *rsp->orphan_donetail = rdp->nxtlist;
- rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL];
- }
+ rcu_segcblist_extract_done_cbs(&rdp->cblist, &rsp->orphan_done);
- /*
- * Finally, initialize the rcu_data structure's list to empty and
- * disallow further callbacks on this CPU.
- */
- init_callback_list(rdp);
- rdp->nxttail[RCU_NEXT_TAIL] = NULL;
+ /* Finally, disallow further callbacks on this CPU. */
+ rcu_segcblist_disable(&rdp->cblist);
}
/*
@@ -2700,7 +2637,6 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
*/
static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
{
- int i;
struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
/* No-CBs CPUs are handled specially. */
@@ -2709,13 +2645,10 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
return;
/* Do the accounting first. */
- rdp->qlen_lazy += rsp->qlen_lazy;
- rdp->qlen += rsp->qlen;
- rdp->n_cbs_adopted += rsp->qlen;
- if (rsp->qlen_lazy != rsp->qlen)
+ rdp->n_cbs_adopted += rsp->orphan_done.len;
+ if (rsp->orphan_done.len_lazy != rsp->orphan_done.len)
rcu_idle_count_callbacks_posted();
- rsp->qlen_lazy = 0;
- rsp->qlen = 0;
+ rcu_segcblist_insert_count(&rdp->cblist, &rsp->orphan_done);
/*
* We do not need a memory barrier here because the only way we
@@ -2723,24 +2656,13 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
* we are the task doing the rcu_barrier().
*/
- /* First adopt the ready-to-invoke callbacks. */
- if (rsp->orphan_donelist != NULL) {
- *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL];
- *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist;
- for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--)
- if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
- rdp->nxttail[i] = rsp->orphan_donetail;
- rsp->orphan_donelist = NULL;
- rsp->orphan_donetail = &rsp->orphan_donelist;
- }
-
- /* And then adopt the callbacks that still need a grace period. */
- if (rsp->orphan_nxtlist != NULL) {
- *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist;
- rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail;
- rsp->orphan_nxtlist = NULL;
- rsp->orphan_nxttail = &rsp->orphan_nxtlist;
- }
+ /* First adopt the ready-to-invoke callbacks, then the done ones. */
+ rcu_segcblist_insert_done_cbs(&rdp->cblist, &rsp->orphan_done);
+ WARN_ON_ONCE(rsp->orphan_done.head);
+ rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rsp->orphan_pend);
+ WARN_ON_ONCE(rsp->orphan_pend.head);
+ WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) !=
+ !rcu_segcblist_n_cbs(&rdp->cblist));
}
/*
@@ -2748,14 +2670,14 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
*/
static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
{
- RCU_TRACE(unsigned long mask);
- RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda));
- RCU_TRACE(struct rcu_node *rnp = rdp->mynode);
+ RCU_TRACE(unsigned long mask;)
+ RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda);)
+ RCU_TRACE(struct rcu_node *rnp = rdp->mynode;)
if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
return;
- RCU_TRACE(mask = rdp->grpmask);
+ RCU_TRACE(mask = rdp->grpmask;)
trace_rcu_grace_period(rsp->name,
rnp->gpnum + 1 - !!(rnp->qsmask & mask),
TPS("cpuofl"));
@@ -2828,9 +2750,11 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
rcu_adopt_orphan_cbs(rsp, flags);
raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags);
- WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
- "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
- cpu, rdp->qlen, rdp->nxtlist);
+ WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
+ !rcu_segcblist_empty(&rdp->cblist),
+ "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
+ cpu, rcu_segcblist_n_cbs(&rdp->cblist),
+ rcu_segcblist_first_cb(&rdp->cblist));
}
/*
@@ -2840,14 +2764,17 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
{
unsigned long flags;
- struct rcu_head *next, *list, **tail;
- long bl, count, count_lazy;
- int i;
+ struct rcu_head *rhp;
+ struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
+ long bl, count;
/* If no callbacks are ready, just return. */
- if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
- trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0);
- trace_rcu_batch_end(rsp->name, 0, !!READ_ONCE(rdp->nxtlist),
+ if (!rcu_segcblist_ready_cbs(&rdp->cblist)) {
+ trace_rcu_batch_start(rsp->name,
+ rcu_segcblist_n_lazy_cbs(&rdp->cblist),
+ rcu_segcblist_n_cbs(&rdp->cblist), 0);
+ trace_rcu_batch_end(rsp->name, 0,
+ !rcu_segcblist_empty(&rdp->cblist),
need_resched(), is_idle_task(current),
rcu_is_callbacks_kthread());
return;
@@ -2855,73 +2782,61 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
/*
* Extract the list of ready callbacks, disabling to prevent
- * races with call_rcu() from interrupt handlers.
+ * races with call_rcu() from interrupt handlers. Leave the
+ * callback counts, as rcu_barrier() needs to be conservative.
*/
local_irq_save(flags);
WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
bl = rdp->blimit;
- trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl);
- list = rdp->nxtlist;
- rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
- *rdp->nxttail[RCU_DONE_TAIL] = NULL;
- tail = rdp->nxttail[RCU_DONE_TAIL];
- for (i = RCU_NEXT_SIZE - 1; i >= 0; i--)
- if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
- rdp->nxttail[i] = &rdp->nxtlist;
+ trace_rcu_batch_start(rsp->name, rcu_segcblist_n_lazy_cbs(&rdp->cblist),
+ rcu_segcblist_n_cbs(&rdp->cblist), bl);
+ rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl);
local_irq_restore(flags);
/* Invoke callbacks. */
- count = count_lazy = 0;
- while (list) {
- next = list->next;
- prefetch(next);
- debug_rcu_head_unqueue(list);
- if (__rcu_reclaim(rsp->name, list))
- count_lazy++;
- list = next;
- /* Stop only if limit reached and CPU has something to do. */
- if (++count >= bl &&
+ rhp = rcu_cblist_dequeue(&rcl);
+ for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) {
+ debug_rcu_head_unqueue(rhp);
+ if (__rcu_reclaim(rsp->name, rhp))
+ rcu_cblist_dequeued_lazy(&rcl);
+ /*
+ * Stop only if limit reached and CPU has something to do.
+ * Note: The rcl structure counts down from zero.
+ */
+ if (-rcl.len >= bl &&
(need_resched() ||
(!is_idle_task(current) && !rcu_is_callbacks_kthread())))
break;
}
local_irq_save(flags);
- trace_rcu_batch_end(rsp->name, count, !!list, need_resched(),
- is_idle_task(current),
- rcu_is_callbacks_kthread());
-
- /* Update count, and requeue any remaining callbacks. */
- if (list != NULL) {
- *tail = rdp->nxtlist;
- rdp->nxtlist = list;
- for (i = 0; i < RCU_NEXT_SIZE; i++)
- if (&rdp->nxtlist == rdp->nxttail[i])
- rdp->nxttail[i] = tail;
- else
- break;
- }
+ count = -rcl.len;
+ trace_rcu_batch_end(rsp->name, count, !!rcl.head, need_resched(),
+ is_idle_task(current), rcu_is_callbacks_kthread());
+
+ /* Update counts and requeue any remaining callbacks. */
+ rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl);
smp_mb(); /* List handling before counting for rcu_barrier(). */
- rdp->qlen_lazy -= count_lazy;
- WRITE_ONCE(rdp->qlen, rdp->qlen - count);
rdp->n_cbs_invoked += count;
+ rcu_segcblist_insert_count(&rdp->cblist, &rcl);
/* Reinstate batch limit if we have worked down the excess. */
- if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
+ count = rcu_segcblist_n_cbs(&rdp->cblist);
+ if (rdp->blimit == LONG_MAX && count <= qlowmark)
rdp->blimit = blimit;
/* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
- if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) {
+ if (count == 0 && rdp->qlen_last_fqs_check != 0) {
rdp->qlen_last_fqs_check = 0;
rdp->n_force_qs_snap = rsp->n_force_qs;
- } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark)
- rdp->qlen_last_fqs_check = rdp->qlen;
- WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0));
+ } else if (count < rdp->qlen_last_fqs_check - qhimark)
+ rdp->qlen_last_fqs_check = count;
+ WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != (count == 0));
local_irq_restore(flags);
/* Re-invoke RCU core processing if there are callbacks remaining. */
- if (cpu_has_callbacks_ready_to_invoke(rdp))
+ if (rcu_segcblist_ready_cbs(&rdp->cblist))
invoke_rcu_core();
}
@@ -3087,7 +3002,7 @@ __rcu_process_callbacks(struct rcu_state *rsp)
bool needwake;
struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
- WARN_ON_ONCE(rdp->beenonline == 0);
+ WARN_ON_ONCE(!rdp->beenonline);
/* Update RCU state based on any recent quiescent states. */
rcu_check_quiescent_state(rsp, rdp);
@@ -3105,7 +3020,7 @@ __rcu_process_callbacks(struct rcu_state *rsp)
}
/* If there are callbacks ready, invoke them. */
- if (cpu_has_callbacks_ready_to_invoke(rdp))
+ if (rcu_segcblist_ready_cbs(&rdp->cblist))
invoke_rcu_callbacks(rsp, rdp);
/* Do any needed deferred wakeups of rcuo kthreads. */
@@ -3177,7 +3092,8 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
* invoking force_quiescent_state() if the newly enqueued callback
* is the only one waiting for a grace period to complete.
*/
- if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
+ if (unlikely(rcu_segcblist_n_cbs(&rdp->cblist) >
+ rdp->qlen_last_fqs_check + qhimark)) {
/* Are we ignoring a completed grace period? */
note_gp_changes(rsp, rdp);
@@ -3195,10 +3111,10 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
/* Give the grace period a kick. */
rdp->blimit = LONG_MAX;
if (rsp->n_force_qs == rdp->n_force_qs_snap &&
- *rdp->nxttail[RCU_DONE_TAIL] != head)
+ rcu_segcblist_first_pend_cb(&rdp->cblist) != head)
force_quiescent_state(rsp);
rdp->n_force_qs_snap = rsp->n_force_qs;
- rdp->qlen_last_fqs_check = rdp->qlen;
+ rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
}
}
}
@@ -3238,7 +3154,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func,
rdp = this_cpu_ptr(rsp->rda);
/* Add the callback to our list. */
- if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL) || cpu != -1) {
+ if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist)) || cpu != -1) {
int offline;
if (cpu != -1)
@@ -3257,23 +3173,21 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func,
*/
BUG_ON(cpu != -1);
WARN_ON_ONCE(!rcu_is_watching());
- if (!likely(rdp->nxtlist))
- init_default_callback_list(rdp);
+ if (rcu_segcblist_empty(&rdp->cblist))
+ rcu_segcblist_init(&rdp->cblist);
}
- WRITE_ONCE(rdp->qlen, rdp->qlen + 1);
- if (lazy)
- rdp->qlen_lazy++;
- else
+ rcu_segcblist_enqueue(&rdp->cblist, head, lazy);
+ if (!lazy)
rcu_idle_count_callbacks_posted();
- smp_mb(); /* Count before adding callback for rcu_barrier(). */
- *rdp->nxttail[RCU_NEXT_TAIL] = head;
- rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
if (__is_kfree_rcu_offset((unsigned long)func))
trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
- rdp->qlen_lazy, rdp->qlen);
+ rcu_segcblist_n_lazy_cbs(&rdp->cblist),
+ rcu_segcblist_n_cbs(&rdp->cblist));
else
- trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen);
+ trace_rcu_callback(rsp->name, head,
+ rcu_segcblist_n_lazy_cbs(&rdp->cblist),
+ rcu_segcblist_n_cbs(&rdp->cblist));
/* Go handle any RCU core processing required. */
__call_rcu_core(rsp, rdp, head, flags);
@@ -3519,41 +3433,6 @@ void cond_synchronize_sched(unsigned long oldstate)
}
EXPORT_SYMBOL_GPL(cond_synchronize_sched);
-/* Adjust sequence number for start of update-side operation. */
-static void rcu_seq_start(unsigned long *sp)
-{
- WRITE_ONCE(*sp, *sp + 1);
- smp_mb(); /* Ensure update-side operation after counter increment. */
- WARN_ON_ONCE(!(*sp & 0x1));
-}
-
-/* Adjust sequence number for end of update-side operation. */
-static void rcu_seq_end(unsigned long *sp)
-{
- smp_mb(); /* Ensure update-side operation before counter increment. */
- WRITE_ONCE(*sp, *sp + 1);
- WARN_ON_ONCE(*sp & 0x1);
-}
-
-/* Take a snapshot of the update side's sequence number. */
-static unsigned long rcu_seq_snap(unsigned long *sp)
-{
- unsigned long s;
-
- s = (READ_ONCE(*sp) + 3) & ~0x1;
- smp_mb(); /* Above access must not bleed into critical section. */
- return s;
-}
-
-/*
- * Given a snapshot from rcu_seq_snap(), determine whether or not a
- * full update-side operation has occurred.
- */
-static bool rcu_seq_done(unsigned long *sp, unsigned long s)
-{
- return ULONG_CMP_GE(READ_ONCE(*sp), s);
-}
-
/*
* Check to see if there is any immediate RCU-related work to be done
* by the current CPU, for the specified type of RCU, returning 1 if so.
@@ -3577,7 +3456,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
/* Is the RCU core waiting for a quiescent state from this CPU? */
if (rcu_scheduler_fully_active &&
rdp->core_needs_qs && rdp->cpu_no_qs.b.norm &&
- rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) {
+ rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_dynticks.rcu_qs_ctr)) {
rdp->n_rp_core_needs_qs++;
} else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) {
rdp->n_rp_report_qs++;
@@ -3585,7 +3464,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
}
/* Does this CPU have callbacks ready to invoke? */
- if (cpu_has_callbacks_ready_to_invoke(rdp)) {
+ if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
rdp->n_rp_cb_ready++;
return 1;
}
@@ -3649,10 +3528,10 @@ static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
for_each_rcu_flavor(rsp) {
rdp = this_cpu_ptr(rsp->rda);
- if (!rdp->nxtlist)
+ if (rcu_segcblist_empty(&rdp->cblist))
continue;
hc = true;
- if (rdp->qlen != rdp->qlen_lazy || !all_lazy) {
+ if (rcu_segcblist_n_nonlazy_cbs(&rdp->cblist) || !all_lazy) {
al = false;
break;
}
@@ -3761,7 +3640,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
__call_rcu(&rdp->barrier_head,
rcu_barrier_callback, rsp, cpu, 0);
}
- } else if (READ_ONCE(rdp->qlen)) {
+ } else if (rcu_segcblist_n_cbs(&rdp->cblist)) {
_rcu_barrier_trace(rsp, "OnlineQ", cpu,
rsp->barrier_sequence);
smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
@@ -3870,8 +3749,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
rdp->qlen_last_fqs_check = 0;
rdp->n_force_qs_snap = rsp->n_force_qs;
rdp->blimit = blimit;
- if (!rdp->nxtlist)
- init_callback_list(rdp); /* Re-enable callbacks on this CPU. */
+ if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */
+ !init_nocb_callback_list(rdp))
+ rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */
rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
rcu_sysidle_init_percpu_data(rdp->dynticks);
rcu_dynticks_eqs_online();
@@ -3890,12 +3770,16 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
rdp->completed = rnp->completed;
rdp->cpu_no_qs.b.norm = true;
- rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu);
+ rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu);
rdp->core_needs_qs = false;
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
+/*
+ * Invoked early in the CPU-online process, when pretty much all
+ * services are available. The incoming CPU is not present.
+ */
int rcutree_prepare_cpu(unsigned int cpu)
{
struct rcu_state *rsp;
@@ -3909,6 +3793,9 @@ int rcutree_prepare_cpu(unsigned int cpu)
return 0;
}
+/*
+ * Update RCU priority boot kthread affinity for CPU-hotplug changes.
+ */
static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
{
struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
@@ -3916,20 +3803,34 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
rcu_boost_kthread_setaffinity(rdp->mynode, outgoing);
}
+/*
+ * Near the end of the CPU-online process. Pretty much all services
+ * enabled, and the CPU is now very much alive.
+ */
int rcutree_online_cpu(unsigned int cpu)
{
sync_sched_exp_online_cleanup(cpu);
rcutree_affinity_setting(cpu, -1);
+ if (IS_ENABLED(CONFIG_TREE_SRCU))
+ srcu_online_cpu(cpu);
return 0;
}
+/*
+ * Near the beginning of the process. The CPU is still very much alive
+ * with pretty much all services enabled.
+ */
int rcutree_offline_cpu(unsigned int cpu)
{
rcutree_affinity_setting(cpu, cpu);
+ if (IS_ENABLED(CONFIG_TREE_SRCU))
+ srcu_offline_cpu(cpu);
return 0;
}
-
+/*
+ * Near the end of the offline process. We do only tracing here.
+ */
int rcutree_dying_cpu(unsigned int cpu)
{
struct rcu_state *rsp;
@@ -3939,6 +3840,9 @@ int rcutree_dying_cpu(unsigned int cpu)
return 0;
}
+/*
+ * The outgoing CPU is gone and we are running elsewhere.
+ */
int rcutree_dead_cpu(unsigned int cpu)
{
struct rcu_state *rsp;
@@ -3956,6 +3860,10 @@ int rcutree_dead_cpu(unsigned int cpu)
* incoming CPUs are not allowed to use RCU read-side critical sections
* until this function is called. Failing to observe this restriction
* will result in lockdep splats.
+ *
+ * Note that this function is special in that it is invoked directly
+ * from the incoming CPU rather than from the cpuhp_step mechanism.
+ * This is because this function must be invoked at a precise location.
*/
void rcu_cpu_starting(unsigned int cpu)
{
@@ -3981,9 +3889,6 @@ void rcu_cpu_starting(unsigned int cpu)
* The CPU is exiting the idle loop into the arch_cpu_idle_dead()
* function. We now remove it from the rcu_node tree's ->qsmaskinit
* bit masks.
- * The CPU is exiting the idle loop into the arch_cpu_idle_dead()
- * function. We now remove it from the rcu_node tree's ->qsmaskinit
- * bit masks.
*/
static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
{
@@ -3999,6 +3904,14 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
+/*
+ * The outgoing function has no further need of RCU, so remove it from
+ * the list of CPUs that RCU must track.
+ *
+ * Note that this function is special in that it is invoked directly
+ * from the outgoing CPU rather than from the cpuhp_step mechanism.
+ * This is because this function must be invoked at a precise location.
+ */
void rcu_report_dead(unsigned int cpu)
{
struct rcu_state *rsp;
@@ -4013,6 +3926,10 @@ void rcu_report_dead(unsigned int cpu)
}
#endif
+/*
+ * On non-huge systems, use expedited RCU grace periods to make suspend
+ * and hibernation run faster.
+ */
static int rcu_pm_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
@@ -4083,7 +4000,7 @@ early_initcall(rcu_spawn_gp_kthread);
* task is booting the system, and such primitives are no-ops). After this
* function is called, any synchronous grace-period primitives are run as
* expedited, with the requesting task driving the grace period forward.
- * A later core_initcall() rcu_exp_runtime_mode() will switch to full
+ * A later core_initcall() rcu_set_runtime_mode() will switch to full
* runtime RCU functionality.
*/
void rcu_scheduler_starting(void)
@@ -4096,31 +4013,6 @@ void rcu_scheduler_starting(void)
}
/*
- * Compute the per-level fanout, either using the exact fanout specified
- * or balancing the tree, depending on the rcu_fanout_exact boot parameter.
- */
-static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt)
-{
- int i;
-
- if (rcu_fanout_exact) {
- levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
- for (i = rcu_num_lvls - 2; i >= 0; i--)
- levelspread[i] = RCU_FANOUT;
- } else {
- int ccur;
- int cprv;
-
- cprv = nr_cpu_ids;
- for (i = rcu_num_lvls - 1; i >= 0; i--) {
- ccur = levelcnt[i];
- levelspread[i] = (cprv + ccur - 1) / ccur;
- cprv = ccur;
- }
- }
-}
-
-/*
* Helper function for rcu_init() that initializes one rcu_state structure.
*/
static void __init rcu_init_one(struct rcu_state *rsp)
@@ -4129,9 +4021,7 @@ static void __init rcu_init_one(struct rcu_state *rsp)
static const char * const fqs[] = RCU_FQS_NAME_INIT;
static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
- static u8 fl_mask = 0x1;
- int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */
int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
int cpustride = 1;
int i;
@@ -4146,20 +4036,16 @@ static void __init rcu_init_one(struct rcu_state *rsp)
/* Initialize the level-tracking arrays. */
- for (i = 0; i < rcu_num_lvls; i++)
- levelcnt[i] = num_rcu_lvl[i];
for (i = 1; i < rcu_num_lvls; i++)
- rsp->level[i] = rsp->level[i - 1] + levelcnt[i - 1];
- rcu_init_levelspread(levelspread, levelcnt);
- rsp->flavor_mask = fl_mask;
- fl_mask <<= 1;
+ rsp->level[i] = rsp->level[i - 1] + num_rcu_lvl[i - 1];
+ rcu_init_levelspread(levelspread, num_rcu_lvl);
/* Initialize the elements themselves, starting from the leaves. */
for (i = rcu_num_lvls - 1; i >= 0; i--) {
cpustride *= levelspread[i];
rnp = rsp->level[i];
- for (j = 0; j < levelcnt[i]; j++, rnp++) {
+ for (j = 0; j < num_rcu_lvl[i]; j++, rnp++) {
raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock));
lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock),
&rcu_node_class[i], buf[i]);
@@ -4332,6 +4218,8 @@ void __init rcu_init(void)
for_each_online_cpu(cpu) {
rcutree_prepare_cpu(cpu);
rcu_cpu_starting(cpu);
+ if (IS_ENABLED(CONFIG_TREE_SRCU))
+ srcu_online_cpu(cpu);
}
}
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index ec62a05bfdb3..ba38262c3554 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -30,80 +30,9 @@
#include <linux/seqlock.h>
#include <linux/swait.h>
#include <linux/stop_machine.h>
+#include <linux/rcu_node_tree.h>
-/*
- * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
- * CONFIG_RCU_FANOUT_LEAF.
- * In theory, it should be possible to add more levels straightforwardly.
- * In practice, this did work well going from three levels to four.
- * Of course, your mileage may vary.
- */
-
-#ifdef CONFIG_RCU_FANOUT
-#define RCU_FANOUT CONFIG_RCU_FANOUT
-#else /* #ifdef CONFIG_RCU_FANOUT */
-# ifdef CONFIG_64BIT
-# define RCU_FANOUT 64
-# else
-# define RCU_FANOUT 32
-# endif
-#endif /* #else #ifdef CONFIG_RCU_FANOUT */
-
-#ifdef CONFIG_RCU_FANOUT_LEAF
-#define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF
-#else /* #ifdef CONFIG_RCU_FANOUT_LEAF */
-# ifdef CONFIG_64BIT
-# define RCU_FANOUT_LEAF 64
-# else
-# define RCU_FANOUT_LEAF 32
-# endif
-#endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */
-
-#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
-#define RCU_FANOUT_2 (RCU_FANOUT_1 * RCU_FANOUT)
-#define RCU_FANOUT_3 (RCU_FANOUT_2 * RCU_FANOUT)
-#define RCU_FANOUT_4 (RCU_FANOUT_3 * RCU_FANOUT)
-
-#if NR_CPUS <= RCU_FANOUT_1
-# define RCU_NUM_LVLS 1
-# define NUM_RCU_LVL_0 1
-# define NUM_RCU_NODES NUM_RCU_LVL_0
-# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 }
-# define RCU_NODE_NAME_INIT { "rcu_node_0" }
-# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" }
-#elif NR_CPUS <= RCU_FANOUT_2
-# define RCU_NUM_LVLS 2
-# define NUM_RCU_LVL_0 1
-# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
-# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1)
-# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 }
-# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" }
-# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" }
-#elif NR_CPUS <= RCU_FANOUT_3
-# define RCU_NUM_LVLS 3
-# define NUM_RCU_LVL_0 1
-# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
-# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
-# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2)
-# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 }
-# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" }
-# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" }
-#elif NR_CPUS <= RCU_FANOUT_4
-# define RCU_NUM_LVLS 4
-# define NUM_RCU_LVL_0 1
-# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
-# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
-# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
-# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
-# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 }
-# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" }
-# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" }
-#else
-# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
-#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
-
-extern int rcu_num_lvls;
-extern int rcu_num_nodes;
+#include "rcu_segcblist.h"
/*
* Dynticks per-CPU state.
@@ -113,6 +42,9 @@ struct rcu_dynticks {
/* Process level is worth LLONG_MAX/2. */
int dynticks_nmi_nesting; /* Track NMI nesting level. */
atomic_t dynticks; /* Even value for idle, else odd. */
+ bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */
+ unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */
+ bool rcu_urgent_qs; /* GP old need light quiescent state. */
#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
long long dynticks_idle_nesting;
/* irq/process nesting level from idle. */
@@ -262,41 +194,6 @@ struct rcu_node {
#define leaf_node_cpu_bit(rnp, cpu) (1UL << ((cpu) - (rnp)->grplo))
/*
- * Do a full breadth-first scan of the rcu_node structures for the
- * specified rcu_state structure.
- */
-#define rcu_for_each_node_breadth_first(rsp, rnp) \
- for ((rnp) = &(rsp)->node[0]; \
- (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
-
-/*
- * Do a breadth-first scan of the non-leaf rcu_node structures for the
- * specified rcu_state structure. Note that if there is a singleton
- * rcu_node tree with but one rcu_node structure, this loop is a no-op.
- */
-#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
- for ((rnp) = &(rsp)->node[0]; \
- (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++)
-
-/*
- * Scan the leaves of the rcu_node hierarchy for the specified rcu_state
- * structure. Note that if there is a singleton rcu_node tree with but
- * one rcu_node structure, this loop -will- visit the rcu_node structure.
- * It is still a leaf node, even if it is also the root node.
- */
-#define rcu_for_each_leaf_node(rsp, rnp) \
- for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \
- (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
-
-/*
- * Iterate over all possible CPUs in a leaf RCU node.
- */
-#define for_each_leaf_node_possible_cpu(rnp, cpu) \
- for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \
- cpu <= rnp->grphi; \
- cpu = cpumask_next((cpu), cpu_possible_mask))
-
-/*
* Union to allow "aggregate OR" operation on the need for a quiescent
* state by the normal and expedited grace periods.
*/
@@ -336,34 +233,9 @@ struct rcu_data {
/* period it is aware of. */
/* 2) batch handling */
- /*
- * If nxtlist is not NULL, it is partitioned as follows.
- * Any of the partitions might be empty, in which case the
- * pointer to that partition will be equal to the pointer for
- * the following partition. When the list is empty, all of
- * the nxttail elements point to the ->nxtlist pointer itself,
- * which in that case is NULL.
- *
- * [nxtlist, *nxttail[RCU_DONE_TAIL]):
- * Entries that batch # <= ->completed
- * The grace period for these entries has completed, and
- * the other grace-period-completed entries may be moved
- * here temporarily in rcu_process_callbacks().
- * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
- * Entries that batch # <= ->completed - 1: waiting for current GP
- * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
- * Entries known to have arrived before current GP ended
- * [*nxttail[RCU_NEXT_READY_TAIL], *nxttail[RCU_NEXT_TAIL]):
- * Entries that might have arrived after current GP ended
- * Note that the value of *nxttail[RCU_NEXT_TAIL] will
- * always be NULL, as this is the end of the list.
- */
- struct rcu_head *nxtlist;
- struct rcu_head **nxttail[RCU_NEXT_SIZE];
- unsigned long nxtcompleted[RCU_NEXT_SIZE];
- /* grace periods for sublists. */
- long qlen_lazy; /* # of lazy queued callbacks */
- long qlen; /* # of queued callbacks, incl lazy */
+ struct rcu_segcblist cblist; /* Segmented callback list, with */
+ /* different callbacks waiting for */
+ /* different grace periods. */
long qlen_last_fqs_check;
/* qlen at last check for QS forcing */
unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
@@ -482,7 +354,6 @@ struct rcu_state {
struct rcu_node *level[RCU_NUM_LVLS + 1];
/* Hierarchy levels (+1 to */
/* shut bogus gcc warning) */
- u8 flavor_mask; /* bit in flavor mask. */
struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
call_rcu_func_t call; /* call_rcu() flavor. */
int ncpus; /* # CPUs seen so far. */
@@ -502,14 +373,11 @@ struct rcu_state {
raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp;
/* Protect following fields. */
- struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */
+ struct rcu_cblist orphan_pend; /* Orphaned callbacks that */
/* need a grace period. */
- struct rcu_head **orphan_nxttail; /* Tail of above. */
- struct rcu_head *orphan_donelist; /* Orphaned callbacks that */
+ struct rcu_cblist orphan_done; /* Orphaned callbacks that */
/* are ready to invoke. */
- struct rcu_head **orphan_donetail; /* Tail of above. */
- long qlen_lazy; /* Number of lazy callbacks. */
- long qlen; /* Total number of callbacks. */
+ /* (Contains counts.) */
/* End of fields guarded by orphan_lock. */
struct mutex barrier_mutex; /* Guards barrier fields. */
@@ -596,6 +464,7 @@ extern struct rcu_state rcu_preempt_state;
#endif /* #ifdef CONFIG_PREEMPT_RCU */
int rcu_dynticks_snap(struct rcu_dynticks *rdtp);
+bool rcu_eqs_special_set(int cpu);
#ifdef CONFIG_RCU_BOOST
DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
@@ -673,6 +542,14 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp);
static void rcu_dynticks_task_enter(void);
static void rcu_dynticks_task_exit(void);
+#ifdef CONFIG_SRCU
+void srcu_online_cpu(unsigned int cpu);
+void srcu_offline_cpu(unsigned int cpu);
+#else /* #ifdef CONFIG_SRCU */
+void srcu_online_cpu(unsigned int cpu) { }
+void srcu_offline_cpu(unsigned int cpu) { }
+#endif /* #else #ifdef CONFIG_SRCU */
+
#endif /* #ifndef RCU_TREE_NONCORE */
#ifdef CONFIG_RCU_TRACE
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index a7b639ccd46e..e513b4ab1197 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -292,7 +292,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
trace_rcu_exp_funnel_lock(rsp->name, rnp->level,
rnp->grplo, rnp->grphi,
TPS("wait"));
- wait_event(rnp->exp_wq[(s >> 1) & 0x3],
+ wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
sync_exp_work_done(rsp,
&rdp->exp_workdone2, s));
return true;
@@ -331,6 +331,8 @@ static void sync_sched_exp_handler(void *data)
return;
}
__this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true);
+ /* Store .exp before .rcu_urgent_qs. */
+ smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
resched_cpu(smp_processor_id());
}
@@ -531,7 +533,8 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
rnp->exp_seq_rq = s;
spin_unlock(&rnp->exp_lock);
}
- wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]);
+ smp_mb(); /* All above changes before wakeup. */
+ wake_up_all(&rnp->exp_wq[rcu_seq_ctr(rsp->expedited_sequence) & 0x3]);
}
trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake"));
mutex_unlock(&rsp->exp_wake_mutex);
@@ -609,9 +612,9 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp,
/* Wait for expedited grace period to complete. */
rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
rnp = rcu_get_root(rsp);
- wait_event(rnp->exp_wq[(s >> 1) & 0x3],
- sync_exp_work_done(rsp,
- &rdp->exp_workdone0, s));
+ wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
+ sync_exp_work_done(rsp, &rdp->exp_workdone0, s));
+ smp_mb(); /* Workqueue actions happen before return. */
/* Let the next expedited grace period start. */
mutex_unlock(&rsp->exp_mutex);
@@ -735,15 +738,3 @@ void synchronize_rcu_expedited(void)
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
-
-/*
- * Switch to run-time mode once Tree RCU has fully initialized.
- */
-static int __init rcu_exp_runtime_mode(void)
-{
- rcu_test_sync_prims();
- rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
- rcu_test_sync_prims();
- return 0;
-}
-core_initcall(rcu_exp_runtime_mode);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 0a62a8f1caac..c9a48657512a 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1350,10 +1350,10 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
*/
if ((rdp->completed != rnp->completed ||
unlikely(READ_ONCE(rdp->gpwrap))) &&
- rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
+ rcu_segcblist_pend_cbs(&rdp->cblist))
note_gp_changes(rsp, rdp);
- if (cpu_has_callbacks_ready_to_invoke(rdp))
+ if (rcu_segcblist_ready_cbs(&rdp->cblist))
cbs_ready = true;
}
return cbs_ready;
@@ -1461,7 +1461,7 @@ static void rcu_prepare_for_idle(void)
rdtp->last_accelerate = jiffies;
for_each_rcu_flavor(rsp) {
rdp = this_cpu_ptr(rsp->rda);
- if (!*rdp->nxttail[RCU_DONE_TAIL])
+ if (rcu_segcblist_pend_cbs(&rdp->cblist))
continue;
rnp = rdp->mynode;
raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
@@ -1529,7 +1529,7 @@ static void rcu_oom_notify_cpu(void *unused)
for_each_rcu_flavor(rsp) {
rdp = raw_cpu_ptr(rsp->rda);
- if (rdp->qlen_lazy != 0) {
+ if (rcu_segcblist_n_lazy_cbs(&rdp->cblist)) {
atomic_inc(&oom_callback_count);
rsp->call(&rdp->oom_head, rcu_oom_callback);
}
@@ -1709,7 +1709,7 @@ __setup("rcu_nocbs=", rcu_nocb_setup);
static int __init parse_rcu_nocb_poll(char *arg)
{
- rcu_nocb_poll = 1;
+ rcu_nocb_poll = true;
return 0;
}
early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
@@ -1860,7 +1860,9 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeEmpty"));
} else {
- rdp->nocb_defer_wakeup = RCU_NOGP_WAKE;
+ WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE);
+ /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */
+ smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeEmptyIsDeferred"));
}
@@ -1872,7 +1874,9 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeOvf"));
} else {
- rdp->nocb_defer_wakeup = RCU_NOGP_WAKE_FORCE;
+ WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_FORCE);
+ /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */
+ smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeOvfIsDeferred"));
}
@@ -1930,30 +1934,26 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
struct rcu_data *rdp,
unsigned long flags)
{
- long ql = rsp->qlen;
- long qll = rsp->qlen_lazy;
+ long ql = rsp->orphan_done.len;
+ long qll = rsp->orphan_done.len_lazy;
/* If this is not a no-CBs CPU, tell the caller to do it the old way. */
if (!rcu_is_nocb_cpu(smp_processor_id()))
return false;
- rsp->qlen = 0;
- rsp->qlen_lazy = 0;
/* First, enqueue the donelist, if any. This preserves CB ordering. */
- if (rsp->orphan_donelist != NULL) {
- __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist,
- rsp->orphan_donetail, ql, qll, flags);
- ql = qll = 0;
- rsp->orphan_donelist = NULL;
- rsp->orphan_donetail = &rsp->orphan_donelist;
+ if (rsp->orphan_done.head) {
+ __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_done),
+ rcu_cblist_tail(&rsp->orphan_done),
+ ql, qll, flags);
}
- if (rsp->orphan_nxtlist != NULL) {
- __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist,
- rsp->orphan_nxttail, ql, qll, flags);
- ql = qll = 0;
- rsp->orphan_nxtlist = NULL;
- rsp->orphan_nxttail = &rsp->orphan_nxtlist;
+ if (rsp->orphan_pend.head) {
+ __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_pend),
+ rcu_cblist_tail(&rsp->orphan_pend),
+ ql, qll, flags);
}
+ rcu_cblist_init(&rsp->orphan_done);
+ rcu_cblist_init(&rsp->orphan_pend);
return true;
}
@@ -2395,16 +2395,16 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
return false;
/* If there are early-boot callbacks, move them to nocb lists. */
- if (rdp->nxtlist) {
- rdp->nocb_head = rdp->nxtlist;
- rdp->nocb_tail = rdp->nxttail[RCU_NEXT_TAIL];
- atomic_long_set(&rdp->nocb_q_count, rdp->qlen);
- atomic_long_set(&rdp->nocb_q_count_lazy, rdp->qlen_lazy);
- rdp->nxtlist = NULL;
- rdp->qlen = 0;
- rdp->qlen_lazy = 0;
+ if (!rcu_segcblist_empty(&rdp->cblist)) {
+ rdp->nocb_head = rcu_segcblist_head(&rdp->cblist);
+ rdp->nocb_tail = rcu_segcblist_tail(&rdp->cblist);
+ atomic_long_set(&rdp->nocb_q_count,
+ rcu_segcblist_n_cbs(&rdp->cblist));
+ atomic_long_set(&rdp->nocb_q_count_lazy,
+ rcu_segcblist_n_lazy_cbs(&rdp->cblist));
+ rcu_segcblist_init(&rdp->cblist);
}
- rdp->nxttail[RCU_NEXT_TAIL] = NULL;
+ rcu_segcblist_disable(&rdp->cblist);
return true;
}
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 8751a748499a..6cea17a1ea30 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -41,11 +41,11 @@
#include <linux/mutex.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
+#include <linux/prefetch.h>
#define RCU_TREE_NONCORE
#include "tree.h"
-
-DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr);
+#include "rcu.h"
static int r_open(struct inode *inode, struct file *file,
const struct seq_operations *op)
@@ -121,7 +121,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
cpu_is_offline(rdp->cpu) ? '!' : ' ',
ulong2long(rdp->completed), ulong2long(rdp->gpnum),
rdp->cpu_no_qs.b.norm,
- rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu),
+ rdp->rcu_qs_ctr_snap == per_cpu(rdp->dynticks->rcu_qs_ctr, rdp->cpu),
rdp->core_needs_qs);
seq_printf(m, " dt=%d/%llx/%d df=%lu",
rcu_dynticks_snap(rdp->dynticks),
@@ -130,17 +130,15 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
rdp->dynticks_fqs);
seq_printf(m, " of=%lu", rdp->offline_fqs);
rcu_nocb_q_lengths(rdp, &ql, &qll);
- qll += rdp->qlen_lazy;
- ql += rdp->qlen;
+ qll += rcu_segcblist_n_lazy_cbs(&rdp->cblist);
+ ql += rcu_segcblist_n_cbs(&rdp->cblist);
seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c",
qll, ql,
- ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
- rdp->nxttail[RCU_NEXT_TAIL]],
- ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
- rdp->nxttail[RCU_NEXT_READY_TAIL]],
- ".W"[rdp->nxttail[RCU_DONE_TAIL] !=
- rdp->nxttail[RCU_WAIT_TAIL]],
- ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
+ ".N"[!rcu_segcblist_segempty(&rdp->cblist, RCU_NEXT_TAIL)],
+ ".R"[!rcu_segcblist_segempty(&rdp->cblist,
+ RCU_NEXT_READY_TAIL)],
+ ".W"[!rcu_segcblist_segempty(&rdp->cblist, RCU_WAIT_TAIL)],
+ ".D"[!rcu_segcblist_segempty(&rdp->cblist, RCU_DONE_TAIL)]);
#ifdef CONFIG_RCU_BOOST
seq_printf(m, " kt=%d/%c ktl=%x",
per_cpu(rcu_cpu_has_work, rdp->cpu),
@@ -278,7 +276,9 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
rsp->n_force_qs, rsp->n_force_qs_ngp,
rsp->n_force_qs - rsp->n_force_qs_ngp,
- READ_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen);
+ READ_ONCE(rsp->n_force_qs_lh),
+ rsp->orphan_done.len_lazy,
+ rsp->orphan_done.len);
for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) {
if (rnp->level != level) {
seq_puts(m, "\n");
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 55c8530316c7..273e869ca21d 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -124,7 +124,7 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held);
* non-expedited counterparts? Intended for use within RCU. Note
* that if the user specifies both rcu_expedited and rcu_normal, then
* rcu_normal wins. (Except during the time period during boot from
- * when the first task is spawned until the rcu_exp_runtime_mode()
+ * when the first task is spawned until the rcu_set_runtime_mode()
* core_initcall() is invoked, at which point everything is expedited.)
*/
bool rcu_gp_is_normal(void)
@@ -190,6 +190,39 @@ void rcu_end_inkernel_boot(void)
#endif /* #ifndef CONFIG_TINY_RCU */
+/*
+ * Test each non-SRCU synchronous grace-period wait API. This is
+ * useful just after a change in mode for these primitives, and
+ * during early boot.
+ */
+void rcu_test_sync_prims(void)
+{
+ if (!IS_ENABLED(CONFIG_PROVE_RCU))
+ return;
+ synchronize_rcu();
+ synchronize_rcu_bh();
+ synchronize_sched();
+ synchronize_rcu_expedited();
+ synchronize_rcu_bh_expedited();
+ synchronize_sched_expedited();
+}
+
+#if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU)
+
+/*
+ * Switch to run-time mode once RCU has fully initialized.
+ */
+static int __init rcu_set_runtime_mode(void)
+{
+ rcu_test_sync_prims();
+ rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
+ rcu_test_sync_prims();
+ return 0;
+}
+core_initcall(rcu_set_runtime_mode);
+
+#endif /* #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) */
+
#ifdef CONFIG_PREEMPT_RCU
/*
@@ -632,6 +665,7 @@ static void check_holdout_task(struct task_struct *t,
put_task_struct(t);
return;
}
+ rcu_request_urgent_qs_task(t);
if (!needreport)
return;
if (*firstreport) {
@@ -817,23 +851,6 @@ static void rcu_spawn_tasks_kthread(void)
#endif /* #ifdef CONFIG_TASKS_RCU */
-/*
- * Test each non-SRCU synchronous grace-period wait API. This is
- * useful just after a change in mode for these primitives, and
- * during early boot.
- */
-void rcu_test_sync_prims(void)
-{
- if (!IS_ENABLED(CONFIG_PROVE_RCU))
- return;
- synchronize_rcu();
- synchronize_rcu_bh();
- synchronize_sched();
- synchronize_rcu_expedited();
- synchronize_rcu_bh_expedited();
- synchronize_sched_expedited();
-}
-
#ifdef CONFIG_PROVE_RCU
/*
diff --git a/kernel/relay.c b/kernel/relay.c
index 0e413d9eec8a..39a9dfc69486 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1212,7 +1212,6 @@ static ssize_t subbuf_splice_actor(struct file *in,
.nr_pages = 0,
.nr_pages_max = PIPE_DEF_BUFFERS,
.partial = partial,
- .flags = flags,
.ops = &relay_pipe_buf_ops,
.spd_release = relay_page_release,
};
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index a08795e21628..00a45c45beca 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -96,10 +96,10 @@ static DEFINE_STATIC_KEY_FALSE(__sched_clock_stable);
static int __sched_clock_stable_early = 1;
/*
- * We want: ktime_get_ns() + gtod_offset == sched_clock() + raw_offset
+ * We want: ktime_get_ns() + __gtod_offset == sched_clock() + __sched_clock_offset
*/
-static __read_mostly u64 raw_offset;
-static __read_mostly u64 gtod_offset;
+__read_mostly u64 __sched_clock_offset;
+static __read_mostly u64 __gtod_offset;
struct sched_clock_data {
u64 tick_raw;
@@ -131,17 +131,24 @@ static void __set_sched_clock_stable(void)
/*
* Attempt to make the (initial) unstable->stable transition continuous.
*/
- raw_offset = (scd->tick_gtod + gtod_offset) - (scd->tick_raw);
+ __sched_clock_offset = (scd->tick_gtod + __gtod_offset) - (scd->tick_raw);
printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n",
- scd->tick_gtod, gtod_offset,
- scd->tick_raw, raw_offset);
+ scd->tick_gtod, __gtod_offset,
+ scd->tick_raw, __sched_clock_offset);
static_branch_enable(&__sched_clock_stable);
tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE);
}
-static void __clear_sched_clock_stable(struct work_struct *work)
+static void __sched_clock_work(struct work_struct *work)
+{
+ static_branch_disable(&__sched_clock_stable);
+}
+
+static DECLARE_WORK(sched_clock_work, __sched_clock_work);
+
+static void __clear_sched_clock_stable(void)
{
struct sched_clock_data *scd = this_scd();
@@ -154,17 +161,17 @@ static void __clear_sched_clock_stable(struct work_struct *work)
*
* Still do what we can.
*/
- gtod_offset = (scd->tick_raw + raw_offset) - (scd->tick_gtod);
+ __gtod_offset = (scd->tick_raw + __sched_clock_offset) - (scd->tick_gtod);
printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n",
- scd->tick_gtod, gtod_offset,
- scd->tick_raw, raw_offset);
+ scd->tick_gtod, __gtod_offset,
+ scd->tick_raw, __sched_clock_offset);
- static_branch_disable(&__sched_clock_stable);
tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE);
-}
-static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable);
+ if (sched_clock_stable())
+ schedule_work(&sched_clock_work);
+}
void clear_sched_clock_stable(void)
{
@@ -173,7 +180,7 @@ void clear_sched_clock_stable(void)
smp_mb(); /* matches sched_clock_init_late() */
if (sched_clock_running == 2)
- schedule_work(&sched_clock_work);
+ __clear_sched_clock_stable();
}
void sched_clock_init_late(void)
@@ -214,7 +221,7 @@ static inline u64 wrap_max(u64 x, u64 y)
*/
static u64 sched_clock_local(struct sched_clock_data *scd)
{
- u64 now, clock, old_clock, min_clock, max_clock;
+ u64 now, clock, old_clock, min_clock, max_clock, gtod;
s64 delta;
again:
@@ -231,9 +238,10 @@ again:
* scd->tick_gtod + TICK_NSEC);
*/
- clock = scd->tick_gtod + gtod_offset + delta;
- min_clock = wrap_max(scd->tick_gtod, old_clock);
- max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC);
+ gtod = scd->tick_gtod + __gtod_offset;
+ clock = gtod + delta;
+ min_clock = wrap_max(gtod, old_clock);
+ max_clock = wrap_max(old_clock, gtod + TICK_NSEC);
clock = wrap_max(clock, min_clock);
clock = wrap_min(clock, max_clock);
@@ -317,7 +325,7 @@ u64 sched_clock_cpu(int cpu)
u64 clock;
if (sched_clock_stable())
- return sched_clock() + raw_offset;
+ return sched_clock() + __sched_clock_offset;
if (unlikely(!sched_clock_running))
return 0ull;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3b31fc05a0f1..803c3bc274c4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -86,21 +86,6 @@ int sysctl_sched_rt_runtime = 950000;
cpumask_var_t cpu_isolated_map;
/*
- * this_rq_lock - lock this runqueue and disable interrupts.
- */
-static struct rq *this_rq_lock(void)
- __acquires(rq->lock)
-{
- struct rq *rq;
-
- local_irq_disable();
- rq = this_rq();
- raw_spin_lock(&rq->lock);
-
- return rq;
-}
-
-/*
* __task_rq_lock - lock the rq @p resides on.
*/
struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
@@ -233,8 +218,11 @@ void update_rq_clock(struct rq *rq)
return;
#ifdef CONFIG_SCHED_DEBUG
+ if (sched_feat(WARN_DOUBLE_CLOCK))
+ SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
rq->clock_update_flags |= RQCF_UPDATED;
#endif
+
delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
if (delta < 0)
return;
@@ -261,13 +249,14 @@ static void hrtick_clear(struct rq *rq)
static enum hrtimer_restart hrtick(struct hrtimer *timer)
{
struct rq *rq = container_of(timer, struct rq, hrtick_timer);
+ struct rq_flags rf;
WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
- raw_spin_lock(&rq->lock);
+ rq_lock(rq, &rf);
update_rq_clock(rq);
rq->curr->sched_class->task_tick(rq, rq->curr, 1);
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, &rf);
return HRTIMER_NORESTART;
}
@@ -287,11 +276,12 @@ static void __hrtick_restart(struct rq *rq)
static void __hrtick_start(void *arg)
{
struct rq *rq = arg;
+ struct rq_flags rf;
- raw_spin_lock(&rq->lock);
+ rq_lock(rq, &rf);
__hrtick_restart(rq);
rq->hrtick_csd_pending = 0;
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, &rf);
}
/*
@@ -762,17 +752,23 @@ static void set_load_weight(struct task_struct *p)
static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
- update_rq_clock(rq);
+ if (!(flags & ENQUEUE_NOCLOCK))
+ update_rq_clock(rq);
+
if (!(flags & ENQUEUE_RESTORE))
sched_info_queued(rq, p);
+
p->sched_class->enqueue_task(rq, p, flags);
}
static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
- update_rq_clock(rq);
+ if (!(flags & DEQUEUE_NOCLOCK))
+ update_rq_clock(rq);
+
if (!(flags & DEQUEUE_SAVE))
sched_info_dequeued(rq, p);
+
p->sched_class->dequeue_task(rq, p, flags);
}
@@ -946,18 +942,19 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
*
* Returns (locked) new rq. Old rq's lock is released.
*/
-static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu)
+static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
+ struct task_struct *p, int new_cpu)
{
lockdep_assert_held(&rq->lock);
p->on_rq = TASK_ON_RQ_MIGRATING;
- dequeue_task(rq, p, 0);
+ dequeue_task(rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, new_cpu);
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, rf);
rq = cpu_rq(new_cpu);
- raw_spin_lock(&rq->lock);
+ rq_lock(rq, rf);
BUG_ON(task_cpu(p) != new_cpu);
enqueue_task(rq, p, 0);
p->on_rq = TASK_ON_RQ_QUEUED;
@@ -980,7 +977,8 @@ struct migration_arg {
* So we race with normal scheduler movements, but that's OK, as long
* as the task is no longer on this CPU.
*/
-static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu)
+static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
+ struct task_struct *p, int dest_cpu)
{
if (unlikely(!cpu_active(dest_cpu)))
return rq;
@@ -989,7 +987,8 @@ static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_
if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
return rq;
- rq = move_queued_task(rq, p, dest_cpu);
+ update_rq_clock(rq);
+ rq = move_queued_task(rq, rf, p, dest_cpu);
return rq;
}
@@ -1004,6 +1003,7 @@ static int migration_cpu_stop(void *data)
struct migration_arg *arg = data;
struct task_struct *p = arg->task;
struct rq *rq = this_rq();
+ struct rq_flags rf;
/*
* The original target CPU might have gone down and we might
@@ -1018,7 +1018,7 @@ static int migration_cpu_stop(void *data)
sched_ttwu_pending();
raw_spin_lock(&p->pi_lock);
- raw_spin_lock(&rq->lock);
+ rq_lock(rq, &rf);
/*
* If task_rq(p) != rq, it cannot be migrated here, because we're
* holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
@@ -1026,11 +1026,11 @@ static int migration_cpu_stop(void *data)
*/
if (task_rq(p) == rq) {
if (task_on_rq_queued(p))
- rq = __migrate_task(rq, p, arg->dest_cpu);
+ rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
else
p->wake_cpu = arg->dest_cpu;
}
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, &rf);
raw_spin_unlock(&p->pi_lock);
local_irq_enable();
@@ -1063,7 +1063,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
* holding rq->lock.
*/
lockdep_assert_held(&rq->lock);
- dequeue_task(rq, p, DEQUEUE_SAVE);
+ dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
}
if (running)
put_prev_task(rq, p);
@@ -1071,7 +1071,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
p->sched_class->set_cpus_allowed(p, new_mask);
if (queued)
- enqueue_task(rq, p, ENQUEUE_RESTORE);
+ enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
if (running)
set_curr_task(rq, p);
}
@@ -1150,9 +1150,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
* OK, since we're going to drop the lock immediately
* afterwards anyway.
*/
- rq_unpin_lock(rq, &rf);
- rq = move_queued_task(rq, p, dest_cpu);
- rq_repin_lock(rq, &rf);
+ rq = move_queued_task(rq, &rf, p, dest_cpu);
}
out:
task_rq_unlock(rq, p, &rf);
@@ -1217,16 +1215,24 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
{
if (task_on_rq_queued(p)) {
struct rq *src_rq, *dst_rq;
+ struct rq_flags srf, drf;
src_rq = task_rq(p);
dst_rq = cpu_rq(cpu);
+ rq_pin_lock(src_rq, &srf);
+ rq_pin_lock(dst_rq, &drf);
+
p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(src_rq, p, 0);
set_task_cpu(p, cpu);
activate_task(dst_rq, p, 0);
p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(dst_rq, p, 0);
+
+ rq_unpin_lock(dst_rq, &drf);
+ rq_unpin_lock(src_rq, &srf);
+
} else {
/*
* Task isn't running anymore; make it appear like we migrated
@@ -1680,7 +1686,7 @@ static void
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
struct rq_flags *rf)
{
- int en_flags = ENQUEUE_WAKEUP;
+ int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
lockdep_assert_held(&rq->lock);
@@ -1726,14 +1732,13 @@ void sched_ttwu_pending(void)
struct rq *rq = this_rq();
struct llist_node *llist = llist_del_all(&rq->wake_list);
struct task_struct *p;
- unsigned long flags;
struct rq_flags rf;
if (!llist)
return;
- raw_spin_lock_irqsave(&rq->lock, flags);
- rq_pin_lock(rq, &rf);
+ rq_lock_irqsave(rq, &rf);
+ update_rq_clock(rq);
while (llist) {
int wake_flags = 0;
@@ -1747,8 +1752,7 @@ void sched_ttwu_pending(void)
ttwu_do_activate(rq, p, wake_flags, &rf);
}
- rq_unpin_lock(rq, &rf);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ rq_unlock_irqrestore(rq, &rf);
}
void scheduler_ipi(void)
@@ -1806,7 +1810,7 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
void wake_up_if_idle(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long flags;
+ struct rq_flags rf;
rcu_read_lock();
@@ -1816,11 +1820,11 @@ void wake_up_if_idle(int cpu)
if (set_nr_if_polling(rq->idle)) {
trace_sched_wake_idle_without_ipi(cpu);
} else {
- raw_spin_lock_irqsave(&rq->lock, flags);
+ rq_lock_irqsave(rq, &rf);
if (is_idle_task(rq->curr))
smp_send_reschedule(cpu);
/* Else CPU is not idle, do nothing here: */
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ rq_unlock_irqrestore(rq, &rf);
}
out:
@@ -1846,11 +1850,10 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
}
#endif
- raw_spin_lock(&rq->lock);
- rq_pin_lock(rq, &rf);
+ rq_lock(rq, &rf);
+ update_rq_clock(rq);
ttwu_do_activate(rq, p, wake_flags, &rf);
- rq_unpin_lock(rq, &rf);
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, &rf);
}
/*
@@ -2097,11 +2100,9 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
* disabled avoiding further scheduler activity on it and we've
* not yet picked a replacement task.
*/
- rq_unpin_lock(rq, rf);
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, rf);
raw_spin_lock(&p->pi_lock);
- raw_spin_lock(&rq->lock);
- rq_repin_lock(rq, rf);
+ rq_relock(rq, rf);
}
if (!(p->state & TASK_NORMAL))
@@ -2114,7 +2115,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
delayacct_blkio_end();
atomic_dec(&rq->nr_iowait);
}
- ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+ ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
}
ttwu_do_wakeup(rq, p, 0, rf);
@@ -2555,7 +2556,7 @@ void wake_up_new_task(struct task_struct *p)
update_rq_clock(rq);
post_init_entity_util_avg(&p->se);
- activate_task(rq, p, 0);
+ activate_task(rq, p, ENQUEUE_NOCLOCK);
p->on_rq = TASK_ON_RQ_QUEUED;
trace_sched_wakeup_new(p);
check_preempt_curr(rq, p, WF_FORK);
@@ -3093,15 +3094,18 @@ void scheduler_tick(void)
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
struct task_struct *curr = rq->curr;
+ struct rq_flags rf;
sched_clock_tick();
- raw_spin_lock(&rq->lock);
+ rq_lock(rq, &rf);
+
update_rq_clock(rq);
curr->sched_class->task_tick(rq, curr, 0);
cpu_load_update_active(rq);
calc_global_load_tick(rq);
- raw_spin_unlock(&rq->lock);
+
+ rq_unlock(rq, &rf);
perf_event_task_tick();
@@ -3378,7 +3382,7 @@ static void __sched notrace __schedule(bool preempt)
hrtick_clear(rq);
local_irq_disable();
- rcu_note_context_switch();
+ rcu_note_context_switch(preempt);
/*
* Make sure that signal_pending_state()->signal_pending() below
@@ -3386,18 +3390,18 @@ static void __sched notrace __schedule(bool preempt)
* done by the caller to avoid the race with signal_wake_up().
*/
smp_mb__before_spinlock();
- raw_spin_lock(&rq->lock);
- rq_pin_lock(rq, &rf);
+ rq_lock(rq, &rf);
/* Promote REQ to ACT */
rq->clock_update_flags <<= 1;
+ update_rq_clock(rq);
switch_count = &prev->nivcsw;
if (!preempt && prev->state) {
if (unlikely(signal_pending_state(prev->state, prev))) {
prev->state = TASK_RUNNING;
} else {
- deactivate_task(rq, prev, DEQUEUE_SLEEP);
+ deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
prev->on_rq = 0;
if (prev->in_iowait) {
@@ -3421,9 +3425,6 @@ static void __sched notrace __schedule(bool preempt)
switch_count = &prev->nvcsw;
}
- if (task_on_rq_queued(prev))
- update_rq_clock(rq);
-
next = pick_next_task(rq, prev, &rf);
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
@@ -3439,8 +3440,7 @@ static void __sched notrace __schedule(bool preempt)
rq = context_switch(rq, prev, next, &rf);
} else {
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
- rq_unpin_lock(rq, &rf);
- raw_spin_unlock_irq(&rq->lock);
+ rq_unlock_irq(rq, &rf);
}
balance_callback(rq);
@@ -3502,6 +3502,31 @@ asmlinkage __visible void __sched schedule(void)
}
EXPORT_SYMBOL(schedule);
+/*
+ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted
+ * state (have scheduled out non-voluntarily) by making sure that all
+ * tasks have either left the run queue or have gone into user space.
+ * As idle tasks do not do either, they must not ever be preempted
+ * (schedule out non-voluntarily).
+ *
+ * schedule_idle() is similar to schedule_preempt_disable() except that it
+ * never enables preemption because it does not call sched_submit_work().
+ */
+void __sched schedule_idle(void)
+{
+ /*
+ * As this skips calling sched_submit_work(), which the idle task does
+ * regardless because that function is a nop when the task is in a
+ * TASK_RUNNING state, make sure this isn't used someplace that the
+ * current task can be in any other state. Note, idle is always in the
+ * TASK_RUNNING state.
+ */
+ WARN_ON_ONCE(current->state);
+ do {
+ __schedule(false);
+ } while (need_resched());
+}
+
#ifdef CONFIG_CONTEXT_TRACKING
asmlinkage __visible void __sched schedule_user(void)
{
@@ -3671,10 +3696,25 @@ EXPORT_SYMBOL(default_wake_function);
#ifdef CONFIG_RT_MUTEXES
+static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
+{
+ if (pi_task)
+ prio = min(prio, pi_task->prio);
+
+ return prio;
+}
+
+static inline int rt_effective_prio(struct task_struct *p, int prio)
+{
+ struct task_struct *pi_task = rt_mutex_get_top_task(p);
+
+ return __rt_effective_prio(pi_task, prio);
+}
+
/*
* rt_mutex_setprio - set the current priority of a task
- * @p: task
- * @prio: prio value (kernel-internal form)
+ * @p: task to boost
+ * @pi_task: donor task
*
* This function changes the 'effective' priority of a task. It does
* not touch ->normal_prio like __setscheduler().
@@ -3682,17 +3722,42 @@ EXPORT_SYMBOL(default_wake_function);
* Used by the rt_mutex code to implement priority inheritance
* logic. Call site only calls if the priority of the task changed.
*/
-void rt_mutex_setprio(struct task_struct *p, int prio)
+void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
{
- int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
+ int prio, oldprio, queued, running, queue_flag =
+ DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
const struct sched_class *prev_class;
struct rq_flags rf;
struct rq *rq;
- BUG_ON(prio > MAX_PRIO);
+ /* XXX used to be waiter->prio, not waiter->task->prio */
+ prio = __rt_effective_prio(pi_task, p->normal_prio);
+
+ /*
+ * If nothing changed; bail early.
+ */
+ if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
+ return;
rq = __task_rq_lock(p, &rf);
update_rq_clock(rq);
+ /*
+ * Set under pi_lock && rq->lock, such that the value can be used under
+ * either lock.
+ *
+ * Note that there is loads of tricky to make this pointer cache work
+ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to
+ * ensure a task is de-boosted (pi_task is set to NULL) before the
+ * task is allowed to run again (and can exit). This ensures the pointer
+ * points to a blocked task -- which guaratees the task is present.
+ */
+ p->pi_top_task = pi_task;
+
+ /*
+ * For FIFO/RR we only need to set prio, if that matches we're done.
+ */
+ if (prio == p->prio && !dl_prio(prio))
+ goto out_unlock;
/*
* Idle task boosting is a nono in general. There is one
@@ -3712,7 +3777,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
goto out_unlock;
}
- trace_sched_pi_setprio(p, prio);
+ trace_sched_pi_setprio(p, pi_task);
oldprio = p->prio;
if (oldprio == prio)
@@ -3736,7 +3801,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
* running task
*/
if (dl_prio(prio)) {
- struct task_struct *pi_task = rt_mutex_get_top_task(p);
if (!dl_prio(p->normal_prio) ||
(pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
p->dl.dl_boosted = 1;
@@ -3774,6 +3838,11 @@ out_unlock:
balance_callback(rq);
preempt_enable();
}
+#else
+static inline int rt_effective_prio(struct task_struct *p, int prio)
+{
+ return prio;
+}
#endif
void set_user_nice(struct task_struct *p, long nice)
@@ -3805,7 +3874,7 @@ void set_user_nice(struct task_struct *p, long nice)
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
- dequeue_task(rq, p, DEQUEUE_SAVE);
+ dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
if (running)
put_prev_task(rq, p);
@@ -3816,7 +3885,7 @@ void set_user_nice(struct task_struct *p, long nice)
delta = p->prio - old_prio;
if (queued) {
- enqueue_task(rq, p, ENQUEUE_RESTORE);
+ enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
/*
* If the task increased its priority or is running and
* lowered its priority, then reschedule its CPU:
@@ -4020,10 +4089,9 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
* Keep a potential priority boosting if called from
* sched_setscheduler().
*/
+ p->prio = normal_prio(p);
if (keep_boost)
- p->prio = rt_mutex_get_effective_prio(p, normal_prio(p));
- else
- p->prio = normal_prio(p);
+ p->prio = rt_effective_prio(p, p->prio);
if (dl_prio(p->prio))
p->sched_class = &dl_sched_class;
@@ -4126,7 +4194,7 @@ static int __sched_setscheduler(struct task_struct *p,
const struct sched_class *prev_class;
struct rq_flags rf;
int reset_on_fork;
- int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
+ int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
struct rq *rq;
/* May grab non-irq protected spin_locks: */
@@ -4310,7 +4378,7 @@ change:
* the runqueue. This will be done when the task deboost
* itself.
*/
- new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
+ new_effective_prio = rt_effective_prio(p, newprio);
if (new_effective_prio == oldprio)
queue_flags &= ~DEQUEUE_MOVE;
}
@@ -4923,7 +4991,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
*/
SYSCALL_DEFINE0(sched_yield)
{
- struct rq *rq = this_rq_lock();
+ struct rq_flags rf;
+ struct rq *rq;
+
+ local_irq_disable();
+ rq = this_rq();
+ rq_lock(rq, &rf);
schedstat_inc(rq->yld_count);
current->sched_class->yield_task(rq);
@@ -4932,9 +5005,8 @@ SYSCALL_DEFINE0(sched_yield)
* Since we are going to call schedule() anyway, there's
* no need to preempt or enable interrupts:
*/
- __release(rq->lock);
- spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
- do_raw_spin_unlock(&rq->lock);
+ preempt_disable();
+ rq_unlock(rq, &rf);
sched_preempt_enable_no_resched();
schedule();
@@ -5514,7 +5586,7 @@ void sched_setnuma(struct task_struct *p, int nid)
p->numa_preferred_nid = nid;
if (queued)
- enqueue_task(rq, p, ENQUEUE_RESTORE);
+ enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
if (running)
set_curr_task(rq, p);
task_rq_unlock(rq, p, &rf);
@@ -5579,11 +5651,11 @@ static struct task_struct fake_task = {
* there's no concurrency possible, we hold the required locks anyway
* because of lock validation efforts.
*/
-static void migrate_tasks(struct rq *dead_rq)
+static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
{
struct rq *rq = dead_rq;
struct task_struct *next, *stop = rq->stop;
- struct rq_flags rf;
+ struct rq_flags orf = *rf;
int dest_cpu;
/*
@@ -5602,9 +5674,7 @@ static void migrate_tasks(struct rq *dead_rq)
* class method both need to have an up-to-date
* value of rq->clock[_task]
*/
- rq_pin_lock(rq, &rf);
update_rq_clock(rq);
- rq_unpin_lock(rq, &rf);
for (;;) {
/*
@@ -5617,8 +5687,7 @@ static void migrate_tasks(struct rq *dead_rq)
/*
* pick_next_task() assumes pinned rq->lock:
*/
- rq_repin_lock(rq, &rf);
- next = pick_next_task(rq, &fake_task, &rf);
+ next = pick_next_task(rq, &fake_task, rf);
BUG_ON(!next);
next->sched_class->put_prev_task(rq, next);
@@ -5631,10 +5700,9 @@ static void migrate_tasks(struct rq *dead_rq)
* because !cpu_active at this point, which means load-balance
* will not interfere. Also, stop-machine.
*/
- rq_unpin_lock(rq, &rf);
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, rf);
raw_spin_lock(&next->pi_lock);
- raw_spin_lock(&rq->lock);
+ rq_relock(rq, rf);
/*
* Since we're inside stop-machine, _nothing_ should have
@@ -5648,12 +5716,12 @@ static void migrate_tasks(struct rq *dead_rq)
/* Find suitable destination for @next, with force if needed. */
dest_cpu = select_fallback_rq(dead_rq->cpu, next);
-
- rq = __migrate_task(rq, next, dest_cpu);
+ rq = __migrate_task(rq, rf, next, dest_cpu);
if (rq != dead_rq) {
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, rf);
rq = dead_rq;
- raw_spin_lock(&rq->lock);
+ *rf = orf;
+ rq_relock(rq, rf);
}
raw_spin_unlock(&next->pi_lock);
}
@@ -5732,7 +5800,7 @@ static void cpuset_cpu_active(void)
* cpuset configurations.
*/
}
- cpuset_update_active_cpus(true);
+ cpuset_update_active_cpus();
}
static int cpuset_cpu_inactive(unsigned int cpu)
@@ -5755,7 +5823,7 @@ static int cpuset_cpu_inactive(unsigned int cpu)
if (overflow)
return -EBUSY;
- cpuset_update_active_cpus(false);
+ cpuset_update_active_cpus();
} else {
num_cpus_frozen++;
partition_sched_domains(1, NULL, NULL);
@@ -5766,7 +5834,7 @@ static int cpuset_cpu_inactive(unsigned int cpu)
int sched_cpu_activate(unsigned int cpu)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long flags;
+ struct rq_flags rf;
set_cpu_active(cpu, true);
@@ -5784,12 +5852,12 @@ int sched_cpu_activate(unsigned int cpu)
* 2) At runtime, if cpuset_cpu_active() fails to rebuild the
* domains.
*/
- raw_spin_lock_irqsave(&rq->lock, flags);
+ rq_lock_irqsave(rq, &rf);
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_online(rq);
}
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ rq_unlock_irqrestore(rq, &rf);
update_max_interval();
@@ -5847,18 +5915,20 @@ int sched_cpu_starting(unsigned int cpu)
int sched_cpu_dying(unsigned int cpu)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long flags;
+ struct rq_flags rf;
/* Handle pending wakeups and then migrate everything off */
sched_ttwu_pending();
- raw_spin_lock_irqsave(&rq->lock, flags);
+
+ rq_lock_irqsave(rq, &rf);
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
}
- migrate_tasks(rq);
+ migrate_tasks(rq, &rf);
BUG_ON(rq->nr_running != 1);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ rq_unlock_irqrestore(rq, &rf);
+
calc_load_migrate(rq);
update_max_interval();
nohz_balance_exit_idle(cpu);
@@ -6412,7 +6482,8 @@ static void sched_change_group(struct task_struct *tsk, int type)
*/
void sched_move_task(struct task_struct *tsk)
{
- int queued, running;
+ int queued, running, queue_flags =
+ DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
struct rq_flags rf;
struct rq *rq;
@@ -6423,14 +6494,14 @@ void sched_move_task(struct task_struct *tsk)
queued = task_on_rq_queued(tsk);
if (queued)
- dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
+ dequeue_task(rq, tsk, queue_flags);
if (running)
put_prev_task(rq, tsk);
sched_change_group(tsk, TASK_MOVE_GROUP);
if (queued)
- enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
+ enqueue_task(rq, tsk, queue_flags);
if (running)
set_curr_task(rq, tsk);
@@ -7008,14 +7079,15 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
for_each_online_cpu(i) {
struct cfs_rq *cfs_rq = tg->cfs_rq[i];
struct rq *rq = cfs_rq->rq;
+ struct rq_flags rf;
- raw_spin_lock_irq(&rq->lock);
+ rq_lock_irq(rq, &rf);
cfs_rq->runtime_enabled = runtime_enabled;
cfs_rq->runtime_remaining = 0;
if (cfs_rq->throttled)
unthrottle_cfs_rq(cfs_rq);
- raw_spin_unlock_irq(&rq->lock);
+ rq_unlock_irq(rq, &rf);
}
if (runtime_was_enabled && !runtime_enabled)
cfs_bandwidth_usage_dec();
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 54c577578da6..076a2e31951c 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -61,6 +61,11 @@ struct sugov_cpu {
unsigned long util;
unsigned long max;
unsigned int flags;
+
+ /* The field below is for single-CPU policies only. */
+#ifdef CONFIG_NO_HZ_COMMON
+ unsigned long saved_idle_calls;
+#endif
};
static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);
@@ -93,22 +98,20 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
{
struct cpufreq_policy *policy = sg_policy->policy;
+ if (sg_policy->next_freq == next_freq)
+ return;
+
+ sg_policy->next_freq = next_freq;
sg_policy->last_freq_update_time = time;
if (policy->fast_switch_enabled) {
- if (sg_policy->next_freq == next_freq) {
- trace_cpu_frequency(policy->cur, smp_processor_id());
- return;
- }
- sg_policy->next_freq = next_freq;
next_freq = cpufreq_driver_fast_switch(policy, next_freq);
if (next_freq == CPUFREQ_ENTRY_INVALID)
return;
policy->cur = next_freq;
trace_cpu_frequency(next_freq, smp_processor_id());
- } else if (sg_policy->next_freq != next_freq) {
- sg_policy->next_freq = next_freq;
+ } else {
sg_policy->work_in_progress = true;
irq_work_queue(&sg_policy->irq_work);
}
@@ -192,6 +195,19 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
sg_cpu->iowait_boost >>= 1;
}
+#ifdef CONFIG_NO_HZ_COMMON
+static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
+{
+ unsigned long idle_calls = tick_nohz_get_idle_calls();
+ bool ret = idle_calls == sg_cpu->saved_idle_calls;
+
+ sg_cpu->saved_idle_calls = idle_calls;
+ return ret;
+}
+#else
+static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
+#endif /* CONFIG_NO_HZ_COMMON */
+
static void sugov_update_single(struct update_util_data *hook, u64 time,
unsigned int flags)
{
@@ -200,6 +216,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
struct cpufreq_policy *policy = sg_policy->policy;
unsigned long util, max;
unsigned int next_f;
+ bool busy;
sugov_set_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
@@ -207,40 +224,36 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
if (!sugov_should_update_freq(sg_policy, time))
return;
+ busy = sugov_cpu_is_busy(sg_cpu);
+
if (flags & SCHED_CPUFREQ_RT_DL) {
next_f = policy->cpuinfo.max_freq;
} else {
sugov_get_util(&util, &max);
sugov_iowait_boost(sg_cpu, &util, &max);
next_f = get_next_freq(sg_policy, util, max);
+ /*
+ * Do not reduce the frequency if the CPU has not been idle
+ * recently, as the reduction is likely to be premature then.
+ */
+ if (busy && next_f < sg_policy->next_freq)
+ next_f = sg_policy->next_freq;
}
sugov_update_commit(sg_policy, time, next_f);
}
-static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
- unsigned long util, unsigned long max,
- unsigned int flags)
+static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
{
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
struct cpufreq_policy *policy = sg_policy->policy;
- unsigned int max_f = policy->cpuinfo.max_freq;
- u64 last_freq_update_time = sg_policy->last_freq_update_time;
+ unsigned long util = 0, max = 1;
unsigned int j;
- if (flags & SCHED_CPUFREQ_RT_DL)
- return max_f;
-
- sugov_iowait_boost(sg_cpu, &util, &max);
-
for_each_cpu(j, policy->cpus) {
- struct sugov_cpu *j_sg_cpu;
+ struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
unsigned long j_util, j_max;
s64 delta_ns;
- if (j == smp_processor_id())
- continue;
-
- j_sg_cpu = &per_cpu(sugov_cpu, j);
/*
* If the CPU utilization was last updated before the previous
* frequency update and the time elapsed between the last update
@@ -248,13 +261,13 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
* enough, don't take the CPU into account as it probably is
* idle now (and clear iowait_boost for it).
*/
- delta_ns = last_freq_update_time - j_sg_cpu->last_update;
+ delta_ns = time - j_sg_cpu->last_update;
if (delta_ns > TICK_NSEC) {
j_sg_cpu->iowait_boost = 0;
continue;
}
if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
- return max_f;
+ return policy->cpuinfo.max_freq;
j_util = j_sg_cpu->util;
j_max = j_sg_cpu->max;
@@ -289,7 +302,11 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
sg_cpu->last_update = time;
if (sugov_should_update_freq(sg_policy, time)) {
- next_f = sugov_next_freq_shared(sg_cpu, util, max, flags);
+ if (flags & SCHED_CPUFREQ_RT_DL)
+ next_f = sg_policy->policy->cpuinfo.max_freq;
+ else
+ next_f = sugov_next_freq_shared(sg_cpu, time);
+
sugov_update_commit(sg_policy, time, next_f);
}
@@ -473,7 +490,6 @@ static int sugov_init(struct cpufreq_policy *policy)
{
struct sugov_policy *sg_policy;
struct sugov_tunables *tunables;
- unsigned int lat;
int ret = 0;
/* State should be equivalent to EXIT */
@@ -512,10 +528,16 @@ static int sugov_init(struct cpufreq_policy *policy)
goto stop_kthread;
}
- tunables->rate_limit_us = LATENCY_MULTIPLIER;
- lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
- if (lat)
- tunables->rate_limit_us *= lat;
+ if (policy->transition_delay_us) {
+ tunables->rate_limit_us = policy->transition_delay_us;
+ } else {
+ unsigned int lat;
+
+ tunables->rate_limit_us = LATENCY_MULTIPLIER;
+ lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
+ if (lat)
+ tunables->rate_limit_us *= lat;
+ }
policy->governor_data = sg_policy;
sg_policy->tunables = tunables;
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index f3778e2b46c8..aea3135c5d90 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -34,6 +34,18 @@ void disable_sched_clock_irqtime(void)
sched_clock_irqtime = 0;
}
+static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
+ enum cpu_usage_stat idx)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+ u64_stats_update_begin(&irqtime->sync);
+ cpustat[idx] += delta;
+ irqtime->total += delta;
+ irqtime->tick_delta += delta;
+ u64_stats_update_end(&irqtime->sync);
+}
+
/*
* Called before incrementing preempt_count on {soft,}irq_enter
* and before decrementing preempt_count on {soft,}irq_exit.
@@ -41,7 +53,6 @@ void disable_sched_clock_irqtime(void)
void irqtime_account_irq(struct task_struct *curr)
{
struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
- u64 *cpustat = kcpustat_this_cpu->cpustat;
s64 delta;
int cpu;
@@ -52,22 +63,16 @@ void irqtime_account_irq(struct task_struct *curr)
delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
irqtime->irq_start_time += delta;
- u64_stats_update_begin(&irqtime->sync);
/*
* We do not account for softirq time from ksoftirqd here.
* We want to continue accounting softirq time to ksoftirqd thread
* in that case, so as not to confuse scheduler with a special task
* that do not consume any time, but still wants to run.
*/
- if (hardirq_count()) {
- cpustat[CPUTIME_IRQ] += delta;
- irqtime->tick_delta += delta;
- } else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) {
- cpustat[CPUTIME_SOFTIRQ] += delta;
- irqtime->tick_delta += delta;
- }
-
- u64_stats_update_end(&irqtime->sync);
+ if (hardirq_count())
+ irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
+ else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
+ irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
}
EXPORT_SYMBOL_GPL(irqtime_account_irq);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dea138964b91..d71109321841 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -717,18 +717,12 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
#ifdef CONFIG_SMP
+
+#include "sched-pelt.h"
+
static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
static unsigned long task_h_load(struct task_struct *p);
-/*
- * We choose a half-life close to 1 scheduling period.
- * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
- * dependent on this value.
- */
-#define LOAD_AVG_PERIOD 32
-#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
-#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
-
/* Give new sched_entity start runnable values to heavy its load in infant time */
void init_entity_runnable_average(struct sched_entity *se)
{
@@ -2733,47 +2727,15 @@ static inline void update_cfs_shares(struct sched_entity *se)
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_SMP
-/* Precomputed fixed inverse multiplies for multiplication by y^n */
-static const u32 runnable_avg_yN_inv[] = {
- 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
- 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
- 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
- 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
- 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
- 0x85aac367, 0x82cd8698,
-};
-
-/*
- * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
- * over-estimates when re-combining.
- */
-static const u32 runnable_avg_yN_sum[] = {
- 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
- 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
- 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
-};
-
-/*
- * Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to
- * lower integers. See Documentation/scheduler/sched-avg.txt how these
- * were generated:
- */
-static const u32 __accumulated_sum_N32[] = {
- 0, 23371, 35056, 40899, 43820, 45281,
- 46011, 46376, 46559, 46650, 46696, 46719,
-};
-
/*
* Approximate:
* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
*/
-static __always_inline u64 decay_load(u64 val, u64 n)
+static u64 decay_load(u64 val, u64 n)
{
unsigned int local_n;
- if (!n)
- return val;
- else if (unlikely(n > LOAD_AVG_PERIOD * 63))
+ if (unlikely(n > LOAD_AVG_PERIOD * 63))
return 0;
/* after bounds checking we can collapse to 32-bit */
@@ -2795,30 +2757,97 @@ static __always_inline u64 decay_load(u64 val, u64 n)
return val;
}
+static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
+{
+ u32 c1, c2, c3 = d3; /* y^0 == 1 */
+
+ /*
+ * c1 = d1 y^p
+ */
+ c1 = decay_load((u64)d1, periods);
+
+ /*
+ * p-1
+ * c2 = 1024 \Sum y^n
+ * n=1
+ *
+ * inf inf
+ * = 1024 ( \Sum y^n - \Sum y^n - y^0 )
+ * n=0 n=p
+ */
+ c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
+
+ return c1 + c2 + c3;
+}
+
+#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+
/*
- * For updates fully spanning n periods, the contribution to runnable
- * average will be: \Sum 1024*y^n
+ * Accumulate the three separate parts of the sum; d1 the remainder
+ * of the last (incomplete) period, d2 the span of full periods and d3
+ * the remainder of the (incomplete) current period.
+ *
+ * d1 d2 d3
+ * ^ ^ ^
+ * | | |
+ * |<->|<----------------->|<--->|
+ * ... |---x---|------| ... |------|-----x (now)
+ *
+ * p-1
+ * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
+ * n=1
*
- * We can compute this reasonably efficiently by combining:
- * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
+ * = u y^p + (Step 1)
+ *
+ * p-1
+ * d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2)
+ * n=1
*/
-static u32 __compute_runnable_contrib(u64 n)
+static __always_inline u32
+accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
+ unsigned long weight, int running, struct cfs_rq *cfs_rq)
{
- u32 contrib = 0;
+ unsigned long scale_freq, scale_cpu;
+ u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
+ u64 periods;
- if (likely(n <= LOAD_AVG_PERIOD))
- return runnable_avg_yN_sum[n];
- else if (unlikely(n >= LOAD_AVG_MAX_N))
- return LOAD_AVG_MAX;
+ scale_freq = arch_scale_freq_capacity(NULL, cpu);
+ scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
- /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
- contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
- n %= LOAD_AVG_PERIOD;
- contrib = decay_load(contrib, n);
- return contrib + runnable_avg_yN_sum[n];
-}
+ delta += sa->period_contrib;
+ periods = delta / 1024; /* A period is 1024us (~1ms) */
-#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+ /*
+ * Step 1: decay old *_sum if we crossed period boundaries.
+ */
+ if (periods) {
+ sa->load_sum = decay_load(sa->load_sum, periods);
+ if (cfs_rq) {
+ cfs_rq->runnable_load_sum =
+ decay_load(cfs_rq->runnable_load_sum, periods);
+ }
+ sa->util_sum = decay_load((u64)(sa->util_sum), periods);
+
+ /*
+ * Step 2
+ */
+ delta %= 1024;
+ contrib = __accumulate_pelt_segments(periods,
+ 1024 - sa->period_contrib, delta);
+ }
+ sa->period_contrib = delta;
+
+ contrib = cap_scale(contrib, scale_freq);
+ if (weight) {
+ sa->load_sum += weight * contrib;
+ if (cfs_rq)
+ cfs_rq->runnable_load_sum += weight * contrib;
+ }
+ if (running)
+ sa->util_sum += contrib * scale_cpu;
+
+ return periods;
+}
/*
* We can represent the historical contribution to runnable average as the
@@ -2849,13 +2878,10 @@ static u32 __compute_runnable_contrib(u64 n)
* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
*/
static __always_inline int
-__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
+___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
unsigned long weight, int running, struct cfs_rq *cfs_rq)
{
- u64 delta, scaled_delta, periods;
- u32 contrib;
- unsigned int delta_w, scaled_delta_w, decayed = 0;
- unsigned long scale_freq, scale_cpu;
+ u64 delta;
delta = now - sa->last_update_time;
/*
@@ -2874,83 +2900,52 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
delta >>= 10;
if (!delta)
return 0;
- sa->last_update_time = now;
-
- scale_freq = arch_scale_freq_capacity(NULL, cpu);
- scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
-
- /* delta_w is the amount already accumulated against our next period */
- delta_w = sa->period_contrib;
- if (delta + delta_w >= 1024) {
- decayed = 1;
- /* how much left for next period will start over, we don't know yet */
- sa->period_contrib = 0;
+ sa->last_update_time += delta << 10;
- /*
- * Now that we know we're crossing a period boundary, figure
- * out how much from delta we need to complete the current
- * period and accrue it.
- */
- delta_w = 1024 - delta_w;
- scaled_delta_w = cap_scale(delta_w, scale_freq);
- if (weight) {
- sa->load_sum += weight * scaled_delta_w;
- if (cfs_rq) {
- cfs_rq->runnable_load_sum +=
- weight * scaled_delta_w;
- }
- }
- if (running)
- sa->util_sum += scaled_delta_w * scale_cpu;
-
- delta -= delta_w;
-
- /* Figure out how many additional periods this update spans */
- periods = delta / 1024;
- delta %= 1024;
+ /*
+ * Now we know we crossed measurement unit boundaries. The *_avg
+ * accrues by two steps:
+ *
+ * Step 1: accumulate *_sum since last_update_time. If we haven't
+ * crossed period boundaries, finish.
+ */
+ if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
+ return 0;
- sa->load_sum = decay_load(sa->load_sum, periods + 1);
- if (cfs_rq) {
- cfs_rq->runnable_load_sum =
- decay_load(cfs_rq->runnable_load_sum, periods + 1);
- }
- sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
-
- /* Efficiently calculate \sum (1..n_period) 1024*y^i */
- contrib = __compute_runnable_contrib(periods);
- contrib = cap_scale(contrib, scale_freq);
- if (weight) {
- sa->load_sum += weight * contrib;
- if (cfs_rq)
- cfs_rq->runnable_load_sum += weight * contrib;
- }
- if (running)
- sa->util_sum += contrib * scale_cpu;
+ /*
+ * Step 2: update *_avg.
+ */
+ sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
+ if (cfs_rq) {
+ cfs_rq->runnable_load_avg =
+ div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
}
+ sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
- /* Remainder of delta accrued against u_0` */
- scaled_delta = cap_scale(delta, scale_freq);
- if (weight) {
- sa->load_sum += weight * scaled_delta;
- if (cfs_rq)
- cfs_rq->runnable_load_sum += weight * scaled_delta;
- }
- if (running)
- sa->util_sum += scaled_delta * scale_cpu;
+ return 1;
+}
- sa->period_contrib += delta;
+static int
+__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
+{
+ return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL);
+}
- if (decayed) {
- sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
- if (cfs_rq) {
- cfs_rq->runnable_load_avg =
- div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
- }
- sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
- }
+static int
+__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ return ___update_load_avg(now, cpu, &se->avg,
+ se->on_rq * scale_load_down(se->load.weight),
+ cfs_rq->curr == se, NULL);
+}
- return decayed;
+static int
+__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
+{
+ return ___update_load_avg(now, cpu, &cfs_rq->avg,
+ scale_load_down(cfs_rq->load.weight),
+ cfs_rq->curr != NULL, cfs_rq);
}
/*
@@ -3014,6 +3009,9 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
void set_task_rq_fair(struct sched_entity *se,
struct cfs_rq *prev, struct cfs_rq *next)
{
+ u64 p_last_update_time;
+ u64 n_last_update_time;
+
if (!sched_feat(ATTACH_AGE_LOAD))
return;
@@ -3024,11 +3022,11 @@ void set_task_rq_fair(struct sched_entity *se,
* time. This will result in the wakee task is less decayed, but giving
* the wakee more load sounds not bad.
*/
- if (se->avg.last_update_time && prev) {
- u64 p_last_update_time;
- u64 n_last_update_time;
+ if (!(se->avg.last_update_time && prev))
+ return;
#ifndef CONFIG_64BIT
+ {
u64 p_last_update_time_copy;
u64 n_last_update_time_copy;
@@ -3043,14 +3041,13 @@ void set_task_rq_fair(struct sched_entity *se,
} while (p_last_update_time != p_last_update_time_copy ||
n_last_update_time != n_last_update_time_copy);
+ }
#else
- p_last_update_time = prev->avg.last_update_time;
- n_last_update_time = next->avg.last_update_time;
+ p_last_update_time = prev->avg.last_update_time;
+ n_last_update_time = next->avg.last_update_time;
#endif
- __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
- &se->avg, 0, 0, NULL);
- se->avg.last_update_time = n_last_update_time;
- }
+ __update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se);
+ se->avg.last_update_time = n_last_update_time;
}
/* Take into account change of utilization of a child task group */
@@ -3173,6 +3170,36 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
return 1;
}
+/*
+ * Check if we need to update the load and the utilization of a blocked
+ * group_entity:
+ */
+static inline bool skip_blocked_update(struct sched_entity *se)
+{
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+
+ /*
+ * If sched_entity still have not zero load or utilization, we have to
+ * decay it:
+ */
+ if (se->avg.load_avg || se->avg.util_avg)
+ return false;
+
+ /*
+ * If there is a pending propagation, we have to update the load and
+ * the utilization of the sched_entity:
+ */
+ if (gcfs_rq->propagate_avg)
+ return false;
+
+ /*
+ * Otherwise, the load and the utilization of the sched_entity is
+ * already zero and there is no pending propagation, so it will be a
+ * waste of time to try to decay it:
+ */
+ return true;
+}
+
#else /* CONFIG_FAIR_GROUP_SCHED */
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
@@ -3265,8 +3292,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
set_tg_cfs_propagate(cfs_rq);
}
- decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
- scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
+ decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
#ifndef CONFIG_64BIT
smp_wmb();
@@ -3298,11 +3324,8 @@ static inline void update_load_avg(struct sched_entity *se, int flags)
* Track task load average for carrying it to new CPU after migrated, and
* track group sched_entity load average for task_h_load calc in migration
*/
- if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
- __update_load_avg(now, cpu, &se->avg,
- se->on_rq * scale_load_down(se->load.weight),
- cfs_rq->curr == se, NULL);
- }
+ if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
+ __update_load_avg_se(now, cpu, cfs_rq, se);
decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
decayed |= propagate_entity_load_avg(se);
@@ -3407,7 +3430,7 @@ void sync_entity_load_avg(struct sched_entity *se)
u64 last_update_time;
last_update_time = cfs_rq_last_update_time(cfs_rq);
- __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+ __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se);
}
/*
@@ -4271,8 +4294,9 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
throttled_list) {
struct rq *rq = rq_of(cfs_rq);
+ struct rq_flags rf;
- raw_spin_lock(&rq->lock);
+ rq_lock(rq, &rf);
if (!cfs_rq_throttled(cfs_rq))
goto next;
@@ -4289,7 +4313,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
unthrottle_cfs_rq(cfs_rq);
next:
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, &rf);
if (!remaining)
break;
@@ -5097,15 +5121,16 @@ void cpu_load_update_nohz_stop(void)
unsigned long curr_jiffies = READ_ONCE(jiffies);
struct rq *this_rq = this_rq();
unsigned long load;
+ struct rq_flags rf;
if (curr_jiffies == this_rq->last_load_update_tick)
return;
load = weighted_cpuload(cpu_of(this_rq));
- raw_spin_lock(&this_rq->lock);
+ rq_lock(this_rq, &rf);
update_rq_clock(this_rq);
cpu_load_update_nohz(this_rq, curr_jiffies, load);
- raw_spin_unlock(&this_rq->lock);
+ rq_unlock(this_rq, &rf);
}
#else /* !CONFIG_NO_HZ_COMMON */
static inline void cpu_load_update_nohz(struct rq *this_rq,
@@ -6769,7 +6794,7 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
lockdep_assert_held(&env->src_rq->lock);
p->on_rq = TASK_ON_RQ_MIGRATING;
- deactivate_task(env->src_rq, p, 0);
+ deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, env->dst_cpu);
}
@@ -6902,7 +6927,7 @@ static void attach_task(struct rq *rq, struct task_struct *p)
lockdep_assert_held(&rq->lock);
BUG_ON(task_rq(p) != rq);
- activate_task(rq, p, 0);
+ activate_task(rq, p, ENQUEUE_NOCLOCK);
p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(rq, p, 0);
}
@@ -6913,9 +6938,12 @@ static void attach_task(struct rq *rq, struct task_struct *p)
*/
static void attach_one_task(struct rq *rq, struct task_struct *p)
{
- raw_spin_lock(&rq->lock);
+ struct rq_flags rf;
+
+ rq_lock(rq, &rf);
+ update_rq_clock(rq);
attach_task(rq, p);
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, &rf);
}
/*
@@ -6926,8 +6954,10 @@ static void attach_tasks(struct lb_env *env)
{
struct list_head *tasks = &env->tasks;
struct task_struct *p;
+ struct rq_flags rf;
- raw_spin_lock(&env->dst_rq->lock);
+ rq_lock(env->dst_rq, &rf);
+ update_rq_clock(env->dst_rq);
while (!list_empty(tasks)) {
p = list_first_entry(tasks, struct task_struct, se.group_node);
@@ -6936,7 +6966,7 @@ static void attach_tasks(struct lb_env *env)
attach_task(env->dst_rq, p);
}
- raw_spin_unlock(&env->dst_rq->lock);
+ rq_unlock(env->dst_rq, &rf);
}
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -6944,9 +6974,9 @@ static void update_blocked_averages(int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq;
- unsigned long flags;
+ struct rq_flags rf;
- raw_spin_lock_irqsave(&rq->lock, flags);
+ rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
/*
@@ -6954,6 +6984,8 @@ static void update_blocked_averages(int cpu)
* list_add_leaf_cfs_rq() for details.
*/
for_each_leaf_cfs_rq(rq, cfs_rq) {
+ struct sched_entity *se;
+
/* throttled entities do not contribute to load */
if (throttled_hierarchy(cfs_rq))
continue;
@@ -6961,11 +6993,12 @@ static void update_blocked_averages(int cpu)
if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
update_tg_load_avg(cfs_rq, 0);
- /* Propagate pending load changes to the parent */
- if (cfs_rq->tg->se[cpu])
- update_load_avg(cfs_rq->tg->se[cpu], 0);
+ /* Propagate pending load changes to the parent, if any: */
+ se = cfs_rq->tg->se[cpu];
+ if (se && !skip_blocked_update(se))
+ update_load_avg(se, 0);
}
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ rq_unlock_irqrestore(rq, &rf);
}
/*
@@ -7019,12 +7052,12 @@ static inline void update_blocked_averages(int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq = &rq->cfs;
- unsigned long flags;
+ struct rq_flags rf;
- raw_spin_lock_irqsave(&rq->lock, flags);
+ rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ rq_unlock_irqrestore(rq, &rf);
}
static unsigned long task_h_load(struct task_struct *p)
@@ -7525,6 +7558,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
{
struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups;
+ struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs;
int load_idx, prefer_sibling = 0;
bool overload = false;
@@ -7541,7 +7575,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
if (local_group) {
sds->local = sg;
- sgs = &sds->local_stat;
+ sgs = local;
if (env->idle != CPU_NEWLY_IDLE ||
time_after_eq(jiffies, sg->sgc->next_update))
@@ -7565,8 +7599,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
* the tasks on the system).
*/
if (prefer_sibling && sds->local &&
- group_has_capacity(env, &sds->local_stat) &&
- (sgs->sum_nr_running > 1)) {
+ group_has_capacity(env, local) &&
+ (sgs->sum_nr_running > local->sum_nr_running + 1)) {
sgs->group_no_capacity = 1;
sgs->group_type = group_classify(sg, sgs);
}
@@ -7597,7 +7631,7 @@ next_group:
/**
* check_asym_packing - Check to see if the group is packed into the
- * sched doman.
+ * sched domain.
*
* This is primarily intended to used at the sibling level. Some
* cores like POWER7 prefer to use lower numbered SMT threads. In the
@@ -8042,7 +8076,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_domain *sd_parent = sd->parent;
struct sched_group *group;
struct rq *busiest;
- unsigned long flags;
+ struct rq_flags rf;
struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
struct lb_env env = {
@@ -8105,7 +8139,7 @@ redo:
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
more_balance:
- raw_spin_lock_irqsave(&busiest->lock, flags);
+ rq_lock_irqsave(busiest, &rf);
update_rq_clock(busiest);
/*
@@ -8122,14 +8156,14 @@ more_balance:
* See task_rq_lock() family for the details.
*/
- raw_spin_unlock(&busiest->lock);
+ rq_unlock(busiest, &rf);
if (cur_ld_moved) {
attach_tasks(&env);
ld_moved += cur_ld_moved;
}
- local_irq_restore(flags);
+ local_irq_restore(rf.flags);
if (env.flags & LBF_NEED_BREAK) {
env.flags &= ~LBF_NEED_BREAK;
@@ -8207,6 +8241,8 @@ more_balance:
sd->nr_balance_failed++;
if (need_active_balance(&env)) {
+ unsigned long flags;
+
raw_spin_lock_irqsave(&busiest->lock, flags);
/* don't kick the active_load_balance_cpu_stop,
@@ -8444,8 +8480,9 @@ static int active_load_balance_cpu_stop(void *data)
struct rq *target_rq = cpu_rq(target_cpu);
struct sched_domain *sd;
struct task_struct *p = NULL;
+ struct rq_flags rf;
- raw_spin_lock_irq(&busiest_rq->lock);
+ rq_lock_irq(busiest_rq, &rf);
/* make sure the requested cpu hasn't gone down in the meantime */
if (unlikely(busiest_cpu != smp_processor_id() ||
@@ -8496,7 +8533,7 @@ static int active_load_balance_cpu_stop(void *data)
rcu_read_unlock();
out_unlock:
busiest_rq->active_balance = 0;
- raw_spin_unlock(&busiest_rq->lock);
+ rq_unlock(busiest_rq, &rf);
if (p)
attach_one_task(target_rq, p);
@@ -8794,10 +8831,13 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
* do the balance.
*/
if (time_after_eq(jiffies, rq->next_balance)) {
- raw_spin_lock_irq(&rq->lock);
+ struct rq_flags rf;
+
+ rq_lock_irq(rq, &rf);
update_rq_clock(rq);
cpu_load_update_idle(rq);
- raw_spin_unlock_irq(&rq->lock);
+ rq_unlock_irq(rq, &rf);
+
rebalance_domains(rq, CPU_IDLE);
}
@@ -8988,8 +9028,9 @@ static void task_fork_fair(struct task_struct *p)
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se, *curr;
struct rq *rq = this_rq();
+ struct rq_flags rf;
- raw_spin_lock(&rq->lock);
+ rq_lock(rq, &rf);
update_rq_clock(rq);
cfs_rq = task_cfs_rq(current);
@@ -9010,7 +9051,7 @@ static void task_fork_fair(struct task_struct *p)
}
se->vruntime -= cfs_rq->min_vruntime;
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, &rf);
}
/*
@@ -9372,7 +9413,6 @@ static DEFINE_MUTEX(shares_mutex);
int sched_group_set_shares(struct task_group *tg, unsigned long shares)
{
int i;
- unsigned long flags;
/*
* We can't change the weight of the root cgroup.
@@ -9389,19 +9429,17 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
tg->shares = shares;
for_each_possible_cpu(i) {
struct rq *rq = cpu_rq(i);
- struct sched_entity *se;
+ struct sched_entity *se = tg->se[i];
+ struct rq_flags rf;
- se = tg->se[i];
/* Propagate contribution to hierarchy */
- raw_spin_lock_irqsave(&rq->lock, flags);
-
- /* Possible calls to update_curr() need rq clock */
+ rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
for_each_sched_entity(se) {
update_load_avg(se, UPDATE_TG);
update_cfs_shares(se);
}
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ rq_unlock_irqrestore(rq, &rf);
}
done:
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 1b3c8189b286..11192e0cb122 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -56,6 +56,13 @@ SCHED_FEAT(TTWU_QUEUE, true)
*/
SCHED_FEAT(SIS_AVG_CPU, false)
+/*
+ * Issue a WARN when we do multiple update_rq_clock() calls
+ * in a single rq->lock section. Default disabled because the
+ * annotations are not complete.
+ */
+SCHED_FEAT(WARN_DOUBLE_CLOCK, false)
+
#ifdef HAVE_RT_PUSH_IPI
/*
* In order to avoid a thundering herd attack of CPUs that are
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index ac6d5176463d..ef63adce0c9c 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -10,6 +10,7 @@
#include <linux/mm.h>
#include <linux/stackprotector.h>
#include <linux/suspend.h>
+#include <linux/livepatch.h>
#include <asm/tlb.h>
@@ -264,7 +265,10 @@ static void do_idle(void)
smp_mb__after_atomic();
sched_ttwu_pending();
- schedule_preempt_disabled();
+ schedule_idle();
+
+ if (unlikely(klp_patch_pending(current)))
+ klp_update_patch_state(current);
}
bool cpu_in_idle(unsigned long pc)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 9f3e40226dec..979b7341008a 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1927,6 +1927,87 @@ static int find_next_push_cpu(struct rq *rq)
#define RT_PUSH_IPI_EXECUTING 1
#define RT_PUSH_IPI_RESTART 2
+/*
+ * When a high priority task schedules out from a CPU and a lower priority
+ * task is scheduled in, a check is made to see if there's any RT tasks
+ * on other CPUs that are waiting to run because a higher priority RT task
+ * is currently running on its CPU. In this case, the CPU with multiple RT
+ * tasks queued on it (overloaded) needs to be notified that a CPU has opened
+ * up that may be able to run one of its non-running queued RT tasks.
+ *
+ * On large CPU boxes, there's the case that several CPUs could schedule
+ * a lower priority task at the same time, in which case it will look for
+ * any overloaded CPUs that it could pull a task from. To do this, the runqueue
+ * lock must be taken from that overloaded CPU. Having 10s of CPUs all fighting
+ * for a single overloaded CPU's runqueue lock can produce a large latency.
+ * (This has actually been observed on large boxes running cyclictest).
+ * Instead of taking the runqueue lock of the overloaded CPU, each of the
+ * CPUs that scheduled a lower priority task simply sends an IPI to the
+ * overloaded CPU. An IPI is much cheaper than taking an runqueue lock with
+ * lots of contention. The overloaded CPU will look to push its non-running
+ * RT task off, and if it does, it can then ignore the other IPIs coming
+ * in, and just pass those IPIs off to any other overloaded CPU.
+ *
+ * When a CPU schedules a lower priority task, it only sends an IPI to
+ * the "next" CPU that has overloaded RT tasks. This prevents IPI storms,
+ * as having 10 CPUs scheduling lower priority tasks and 10 CPUs with
+ * RT overloaded tasks, would cause 100 IPIs to go out at once.
+ *
+ * The overloaded RT CPU, when receiving an IPI, will try to push off its
+ * overloaded RT tasks and then send an IPI to the next CPU that has
+ * overloaded RT tasks. This stops when all CPUs with overloaded RT tasks
+ * have completed. Just because a CPU may have pushed off its own overloaded
+ * RT task does not mean it should stop sending the IPI around to other
+ * overloaded CPUs. There may be another RT task waiting to run on one of
+ * those CPUs that are of higher priority than the one that was just
+ * pushed.
+ *
+ * An optimization that could possibly be made is to make a CPU array similar
+ * to the cpupri array mask of all running RT tasks, but for the overloaded
+ * case, then the IPI could be sent to only the CPU with the highest priority
+ * RT task waiting, and that CPU could send off further IPIs to the CPU with
+ * the next highest waiting task. Since the overloaded case is much less likely
+ * to happen, the complexity of this implementation may not be worth it.
+ * Instead, just send an IPI around to all overloaded CPUs.
+ *
+ * The rq->rt.push_flags holds the status of the IPI that is going around.
+ * A run queue can only send out a single IPI at a time. The possible flags
+ * for rq->rt.push_flags are:
+ *
+ * (None or zero): No IPI is going around for the current rq
+ * RT_PUSH_IPI_EXECUTING: An IPI for the rq is being passed around
+ * RT_PUSH_IPI_RESTART: The priority of the running task for the rq
+ * has changed, and the IPI should restart
+ * circulating the overloaded CPUs again.
+ *
+ * rq->rt.push_cpu contains the CPU that is being sent the IPI. It is updated
+ * before sending to the next CPU.
+ *
+ * Instead of having all CPUs that schedule a lower priority task send
+ * an IPI to the same "first" CPU in the RT overload mask, they send it
+ * to the next overloaded CPU after their own CPU. This helps distribute
+ * the work when there's more than one overloaded CPU and multiple CPUs
+ * scheduling in lower priority tasks.
+ *
+ * When a rq schedules a lower priority task than what was currently
+ * running, the next CPU with overloaded RT tasks is examined first.
+ * That is, if CPU 1 and 5 are overloaded, and CPU 3 schedules a lower
+ * priority task, it will send an IPI first to CPU 5, then CPU 5 will
+ * send to CPU 1 if it is still overloaded. CPU 1 will clear the
+ * rq->rt.push_flags if RT_PUSH_IPI_RESTART is not set.
+ *
+ * The first CPU to notice IPI_RESTART is set, will clear that flag and then
+ * send an IPI to the next overloaded CPU after the rq->cpu and not the next
+ * CPU after push_cpu. That is, if CPU 1, 4 and 5 are overloaded when CPU 3
+ * schedules a lower priority task, and the IPI_RESTART gets set while the
+ * handling is being done on CPU 5, it will clear the flag and send it back to
+ * CPU 4 instead of CPU 1.
+ *
+ * Note, the above logic can be disabled by turning off the sched_feature
+ * RT_PUSH_IPI. Then the rq lock of the overloaded CPU will simply be
+ * taken by the CPU requesting a pull and the waiting RT task will be pulled
+ * by that CPU. This may be fine for machines with few CPUs.
+ */
static void tell_cpu_to_push(struct rq *rq)
{
int cpu;
diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h
new file mode 100644
index 000000000000..cd200d16529e
--- /dev/null
+++ b/kernel/sched/sched-pelt.h
@@ -0,0 +1,13 @@
+/* Generated by Documentation/scheduler/sched-pelt; do not modify. */
+
+static const u32 runnable_avg_yN_inv[] = {
+ 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
+ 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
+ 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
+ 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
+ 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
+ 0x85aac367, 0x82cd8698,
+};
+
+#define LOAD_AVG_PERIOD 32
+#define LOAD_AVG_MAX 47742
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5cbf92214ad8..6dda2aab731e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1331,15 +1331,17 @@ extern const u32 sched_prio_to_wmult[40];
#define DEQUEUE_SLEEP 0x01
#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */
#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */
+#define DEQUEUE_NOCLOCK 0x08 /* matches ENQUEUE_NOCLOCK */
#define ENQUEUE_WAKEUP 0x01
#define ENQUEUE_RESTORE 0x02
#define ENQUEUE_MOVE 0x04
+#define ENQUEUE_NOCLOCK 0x08
-#define ENQUEUE_HEAD 0x08
-#define ENQUEUE_REPLENISH 0x10
+#define ENQUEUE_HEAD 0x10
+#define ENQUEUE_REPLENISH 0x20
#ifdef CONFIG_SMP
-#define ENQUEUE_MIGRATED 0x20
+#define ENQUEUE_MIGRATED 0x40
#else
#define ENQUEUE_MIGRATED 0x00
#endif
@@ -1465,6 +1467,8 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq)
}
#endif
+extern void schedule_idle(void);
+
extern void sysrq_sched_debug_show(void);
extern void sched_init_granularity(void);
extern void update_max_interval(void);
@@ -1624,6 +1628,7 @@ static inline void sched_avg_update(struct rq *rq) { }
struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
__acquires(rq->lock);
+
struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
__acquires(p->pi_lock)
__acquires(rq->lock);
@@ -1645,6 +1650,62 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
}
+static inline void
+rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
+ __acquires(rq->lock)
+{
+ raw_spin_lock_irqsave(&rq->lock, rf->flags);
+ rq_pin_lock(rq, rf);
+}
+
+static inline void
+rq_lock_irq(struct rq *rq, struct rq_flags *rf)
+ __acquires(rq->lock)
+{
+ raw_spin_lock_irq(&rq->lock);
+ rq_pin_lock(rq, rf);
+}
+
+static inline void
+rq_lock(struct rq *rq, struct rq_flags *rf)
+ __acquires(rq->lock)
+{
+ raw_spin_lock(&rq->lock);
+ rq_pin_lock(rq, rf);
+}
+
+static inline void
+rq_relock(struct rq *rq, struct rq_flags *rf)
+ __acquires(rq->lock)
+{
+ raw_spin_lock(&rq->lock);
+ rq_repin_lock(rq, rf);
+}
+
+static inline void
+rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
+ __releases(rq->lock)
+{
+ rq_unpin_lock(rq, rf);
+ raw_spin_unlock_irqrestore(&rq->lock, rf->flags);
+}
+
+static inline void
+rq_unlock_irq(struct rq *rq, struct rq_flags *rf)
+ __releases(rq->lock)
+{
+ rq_unpin_lock(rq, rf);
+ raw_spin_unlock_irq(&rq->lock);
+}
+
+static inline void
+rq_unlock(struct rq *rq, struct rq_flags *rf)
+ __releases(rq->lock)
+{
+ rq_unpin_lock(rq, rf);
+ raw_spin_unlock(&rq->lock);
+}
+
#ifdef CONFIG_SMP
#ifdef CONFIG_PREEMPT
@@ -1869,6 +1930,7 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { }
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
struct irqtime {
+ u64 total;
u64 tick_delta;
u64 irq_start_time;
struct u64_stats_sync sync;
@@ -1876,16 +1938,20 @@ struct irqtime {
DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
+/*
+ * Returns the irqtime minus the softirq time computed by ksoftirqd.
+ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime
+ * and never move forward.
+ */
static inline u64 irq_time_read(int cpu)
{
struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
- u64 *cpustat = kcpustat_cpu(cpu).cpustat;
unsigned int seq;
u64 total;
do {
seq = __u64_stats_fetch_begin(&irqtime->sync);
- total = cpustat[CPUTIME_SOFTIRQ] + cpustat[CPUTIME_IRQ];
+ total = irqtime->total;
} while (__u64_stats_fetch_retry(&irqtime->sync, seq));
return total;
diff --git a/kernel/signal.c b/kernel/signal.c
index 7e59ebc2c25e..ca92bcfeb322 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1237,7 +1237,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
}
/*
* This sighand can be already freed and even reused, but
- * we rely on SLAB_DESTROY_BY_RCU and sighand_ctor() which
+ * we rely on SLAB_TYPESAFE_BY_RCU and sighand_ctor() which
* initializes ->siglock: this slab can't go away, it has
* the same object type, ->siglock can't be reinitialized.
*
@@ -1318,7 +1318,7 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
}
}
-int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
+static int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
{
int error;
rcu_read_lock();
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 744fa611cae0..4e09821f9d9e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -309,7 +309,7 @@ restart:
account_irq_exit_time(current);
__local_bh_enable(SOFTIRQ_OFFSET);
WARN_ON_ONCE(in_interrupt());
- tsk_restore_flags(current, old_flags, PF_MEMALLOC);
+ current_restore_flags(old_flags, PF_MEMALLOC);
}
asmlinkage __visible void do_softirq(void)
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 9c15a9124e83..f8edee9c792d 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -54,8 +54,8 @@ int snprint_stack_trace(char *buf, size_t size,
EXPORT_SYMBOL_GPL(snprint_stack_trace);
/*
- * Architectures that do not implement save_stack_trace_tsk or
- * save_stack_trace_regs get this weak alias and a once-per-bootup warning
+ * Architectures that do not implement save_stack_trace_*()
+ * get these weak aliases and once-per-bootup warnings
* (whenever this facility is utilized - for example by procfs):
*/
__weak void
@@ -69,3 +69,11 @@ save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
{
WARN_ONCE(1, KERN_INFO "save_stack_trace_regs() not implemented yet.\n");
}
+
+__weak int
+save_stack_trace_tsk_reliable(struct task_struct *tsk,
+ struct stack_trace *trace)
+{
+ WARN_ONCE(1, KERN_INFO "save_stack_tsk_reliable() not implemented yet.\n");
+ return -ENOSYS;
+}
diff --git a/kernel/sys.c b/kernel/sys.c
index 7ff6d1b10cec..8a94b4eabcaa 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1396,8 +1396,7 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource,
!capable(CAP_SYS_RESOURCE))
retval = -EPERM;
if (!retval)
- retval = security_task_setrlimit(tsk->group_leader,
- resource, new_rlim);
+ retval = security_task_setrlimit(tsk, resource, new_rlim);
if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) {
/*
* The caller is asking for an immediate RLIMIT_CPU
@@ -1432,25 +1431,26 @@ out:
}
/* rcu lock must be held */
-static int check_prlimit_permission(struct task_struct *task)
+static int check_prlimit_permission(struct task_struct *task,
+ unsigned int flags)
{
const struct cred *cred = current_cred(), *tcred;
+ bool id_match;
if (current == task)
return 0;
tcred = __task_cred(task);
- if (uid_eq(cred->uid, tcred->euid) &&
- uid_eq(cred->uid, tcred->suid) &&
- uid_eq(cred->uid, tcred->uid) &&
- gid_eq(cred->gid, tcred->egid) &&
- gid_eq(cred->gid, tcred->sgid) &&
- gid_eq(cred->gid, tcred->gid))
- return 0;
- if (ns_capable(tcred->user_ns, CAP_SYS_RESOURCE))
- return 0;
+ id_match = (uid_eq(cred->uid, tcred->euid) &&
+ uid_eq(cred->uid, tcred->suid) &&
+ uid_eq(cred->uid, tcred->uid) &&
+ gid_eq(cred->gid, tcred->egid) &&
+ gid_eq(cred->gid, tcred->sgid) &&
+ gid_eq(cred->gid, tcred->gid));
+ if (!id_match && !ns_capable(tcred->user_ns, CAP_SYS_RESOURCE))
+ return -EPERM;
- return -EPERM;
+ return security_task_prlimit(cred, tcred, flags);
}
SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
@@ -1460,12 +1460,17 @@ SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
struct rlimit64 old64, new64;
struct rlimit old, new;
struct task_struct *tsk;
+ unsigned int checkflags = 0;
int ret;
+ if (old_rlim)
+ checkflags |= LSM_PRLIMIT_READ;
+
if (new_rlim) {
if (copy_from_user(&new64, new_rlim, sizeof(new64)))
return -EFAULT;
rlim64_to_rlim(&new64, &new);
+ checkflags |= LSM_PRLIMIT_WRITE;
}
rcu_read_lock();
@@ -1474,7 +1479,7 @@ SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
rcu_read_unlock();
return -ESRCH;
}
- ret = check_prlimit_permission(tsk);
+ ret = check_prlimit_permission(tsk, checkflags);
if (ret) {
rcu_read_unlock();
return ret;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index acf0a5a06da7..4dfba1a76cc3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1176,6 +1176,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = timer_migration_handler,
+ .extra1 = &zero,
+ .extra2 = &one,
},
#endif
#ifdef CONFIG_BPF_SYSCALL
@@ -2133,9 +2135,12 @@ static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp,
if (write) {
if (*negp)
return -EINVAL;
+ if (*lvalp > UINT_MAX)
+ return -EINVAL;
*valp = *lvalp;
} else {
unsigned int val = *valp;
+ *negp = false;
*lvalp = (unsigned long)val;
}
return 0;
@@ -2571,7 +2576,7 @@ static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
int write, void *data)
{
if (write) {
- if (*lvalp > LONG_MAX / HZ)
+ if (*lvalp > INT_MAX / HZ)
return 1;
*valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ);
} else {
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 8a5e44236f78..4559e914452b 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -30,6 +30,7 @@
#include <linux/pid_namespace.h>
#include <net/genetlink.h>
#include <linux/atomic.h>
+#include <linux/sched/cputime.h>
/*
* Maximum length of a cpumask that can be specified in
@@ -210,6 +211,8 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
struct task_struct *tsk, *first;
unsigned long flags;
int rc = -ESRCH;
+ u64 delta, utime, stime;
+ u64 start_time;
/*
* Add additional stats from live tasks except zombie thread group
@@ -227,6 +230,7 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
memset(stats, 0, sizeof(*stats));
tsk = first;
+ start_time = ktime_get_ns();
do {
if (tsk->exit_state)
continue;
@@ -238,6 +242,16 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
*/
delayacct_add_tsk(stats, tsk);
+ /* calculate task elapsed time in nsec */
+ delta = start_time - tsk->start_time;
+ /* Convert to micro seconds */
+ do_div(delta, NSEC_PER_USEC);
+ stats->ac_etime += delta;
+
+ task_cputime(tsk, &utime, &stime);
+ stats->ac_utime += div_u64(utime, NSEC_PER_USEC);
+ stats->ac_stime += div_u64(stime, NSEC_PER_USEC);
+
stats->nvcsw += tsk->nvcsw;
stats->nivcsw += tsk->nivcsw;
} while_each_thread(first, tsk);
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index ce3a31e8eb36..5cb5b0008d97 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -541,7 +541,7 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
*
* Returns the granularity of underlying alarm base clock
*/
-static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
+static int alarm_clock_getres(const clockid_t which_clock, struct timespec64 *tp)
{
if (!alarmtimer_get_rtcdev())
return -EINVAL;
@@ -558,14 +558,14 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
*
* Provides the underlying alarm base time.
*/
-static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)
+static int alarm_clock_get(clockid_t which_clock, struct timespec64 *tp)
{
struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
if (!alarmtimer_get_rtcdev())
return -EINVAL;
- *tp = ktime_to_timespec(base->gettime());
+ *tp = ktime_to_timespec64(base->gettime());
return 0;
}
@@ -598,19 +598,19 @@ static int alarm_timer_create(struct k_itimer *new_timer)
* Copies out the current itimerspec data
*/
static void alarm_timer_get(struct k_itimer *timr,
- struct itimerspec *cur_setting)
+ struct itimerspec64 *cur_setting)
{
ktime_t relative_expiry_time =
alarm_expires_remaining(&(timr->it.alarm.alarmtimer));
if (ktime_to_ns(relative_expiry_time) > 0) {
- cur_setting->it_value = ktime_to_timespec(relative_expiry_time);
+ cur_setting->it_value = ktime_to_timespec64(relative_expiry_time);
} else {
cur_setting->it_value.tv_sec = 0;
cur_setting->it_value.tv_nsec = 0;
}
- cur_setting->it_interval = ktime_to_timespec(timr->it.alarm.interval);
+ cur_setting->it_interval = ktime_to_timespec64(timr->it.alarm.interval);
}
/**
@@ -640,8 +640,8 @@ static int alarm_timer_del(struct k_itimer *timr)
* Sets the timer to new_setting, and starts the timer.
*/
static int alarm_timer_set(struct k_itimer *timr, int flags,
- struct itimerspec *new_setting,
- struct itimerspec *old_setting)
+ struct itimerspec64 *new_setting,
+ struct itimerspec64 *old_setting)
{
ktime_t exp;
@@ -659,8 +659,8 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
return TIMER_RETRY;
/* start the timer */
- timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval);
- exp = timespec_to_ktime(new_setting->it_value);
+ timr->it.alarm.interval = timespec64_to_ktime(new_setting->it_interval);
+ exp = timespec64_to_ktime(new_setting->it_value);
/* Convert (if necessary) to absolute time */
if (flags != TIMER_ABSTIME) {
ktime_t now;
@@ -790,13 +790,14 @@ out:
* Handles clock_nanosleep calls against _ALARM clockids
*/
static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
- struct timespec *tsreq, struct timespec __user *rmtp)
+ struct timespec64 *tsreq,
+ struct timespec __user *rmtp)
{
enum alarmtimer_type type = clock2alarm(which_clock);
+ struct restart_block *restart;
struct alarm alarm;
ktime_t exp;
int ret = 0;
- struct restart_block *restart;
if (!alarmtimer_get_rtcdev())
return -ENOTSUPP;
@@ -809,7 +810,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
alarm_init(&alarm, type, alarmtimer_nsleep_wakeup);
- exp = timespec_to_ktime(*tsreq);
+ exp = timespec64_to_ktime(*tsreq);
/* Convert (if necessary) to absolute time */
if (flags != TIMER_ABSTIME) {
ktime_t now = alarm_bases[type].gettime();
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 97ac0951f164..4237e0744e26 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -468,7 +468,7 @@ void clockevents_register_device(struct clock_event_device *dev)
}
EXPORT_SYMBOL_GPL(clockevents_register_device);
-void clockevents_config(struct clock_event_device *dev, u32 freq)
+static void clockevents_config(struct clock_event_device *dev, u32 freq)
{
u64 sec;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index ec08f527d7ee..ac053bb5296e 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -987,7 +987,7 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
* Returns:
* 0 when the timer was not active
* 1 when the timer was active
- * -1 when the timer is currently excuting the callback function and
+ * -1 when the timer is currently executing the callback function and
* cannot be stopped
*/
int hrtimer_try_to_cancel(struct hrtimer *timer)
@@ -1368,10 +1368,7 @@ retry:
ktime_to_ns(delta));
}
-/*
- * local version of hrtimer_peek_ahead_timers() called with interrupts
- * disabled.
- */
+/* called with interrupts disabled */
static inline void __hrtimer_peek_ahead_timers(void)
{
struct tick_device *td;
@@ -1506,7 +1503,7 @@ out:
return ret;
}
-long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
+long hrtimer_nanosleep(struct timespec64 *rqtp, struct timespec __user *rmtp,
const enum hrtimer_mode mode, const clockid_t clockid)
{
struct restart_block *restart;
@@ -1519,7 +1516,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
slack = 0;
hrtimer_init_on_stack(&t.timer, clockid, mode);
- hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
+ hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack);
if (do_nanosleep(&t, mode))
goto out;
@@ -1550,15 +1547,17 @@ out:
SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
struct timespec __user *, rmtp)
{
+ struct timespec64 tu64;
struct timespec tu;
if (copy_from_user(&tu, rqtp, sizeof(tu)))
return -EFAULT;
- if (!timespec_valid(&tu))
+ tu64 = timespec_to_timespec64(tu);
+ if (!timespec64_valid(&tu64))
return -EINVAL;
- return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
+ return hrtimer_nanosleep(&tu64, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
}
/*
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index 9cff0ab82b63..31d588d37a17 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -297,7 +297,7 @@ out:
return err;
}
-static int pc_clock_gettime(clockid_t id, struct timespec *ts)
+static int pc_clock_gettime(clockid_t id, struct timespec64 *ts)
{
struct posix_clock_desc cd;
int err;
@@ -316,7 +316,7 @@ static int pc_clock_gettime(clockid_t id, struct timespec *ts)
return err;
}
-static int pc_clock_getres(clockid_t id, struct timespec *ts)
+static int pc_clock_getres(clockid_t id, struct timespec64 *ts)
{
struct posix_clock_desc cd;
int err;
@@ -335,7 +335,7 @@ static int pc_clock_getres(clockid_t id, struct timespec *ts)
return err;
}
-static int pc_clock_settime(clockid_t id, const struct timespec *ts)
+static int pc_clock_settime(clockid_t id, const struct timespec64 *ts)
{
struct posix_clock_desc cd;
int err;
@@ -399,7 +399,7 @@ static int pc_timer_delete(struct k_itimer *kit)
return err;
}
-static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts)
+static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec64 *ts)
{
clockid_t id = kit->it_clock;
struct posix_clock_desc cd;
@@ -414,7 +414,7 @@ static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts)
}
static int pc_timer_settime(struct k_itimer *kit, int flags,
- struct itimerspec *ts, struct itimerspec *old)
+ struct itimerspec64 *ts, struct itimerspec64 *old)
{
clockid_t id = kit->it_clock;
struct posix_clock_desc cd;
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 4513ad16a253..d2a1e6dd0291 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -116,7 +116,7 @@ static inline u64 virt_ticks(struct task_struct *p)
}
static int
-posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
+posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp)
{
int error = check_clock(which_clock);
if (!error) {
@@ -135,7 +135,7 @@ posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
}
static int
-posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
+posix_cpu_clock_set(const clockid_t which_clock, const struct timespec64 *tp)
{
/*
* You can never reset a CPU clock, but we check for other errors
@@ -261,7 +261,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
static int posix_cpu_clock_get_task(struct task_struct *tsk,
const clockid_t which_clock,
- struct timespec *tp)
+ struct timespec64 *tp)
{
int err = -EINVAL;
u64 rtn;
@@ -275,13 +275,13 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk,
}
if (!err)
- *tp = ns_to_timespec(rtn);
+ *tp = ns_to_timespec64(rtn);
return err;
}
-static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
+static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec64 *tp)
{
const pid_t pid = CPUCLOCK_PID(which_clock);
int err = -EINVAL;
@@ -562,7 +562,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
* and try again. (This happens when the timer is in the middle of firing.)
*/
static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
- struct itimerspec *new, struct itimerspec *old)
+ struct itimerspec64 *new, struct itimerspec64 *old)
{
unsigned long flags;
struct sighand_struct *sighand;
@@ -572,7 +572,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
WARN_ON_ONCE(p == NULL);
- new_expires = timespec_to_ns(&new->it_value);
+ new_expires = timespec64_to_ns(&new->it_value);
/*
* Protect against sighand release/switch in exit/exec and p->cpu_timers
@@ -633,7 +633,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
bump_cpu_timer(timer, val);
if (val < timer->it.cpu.expires) {
old_expires = timer->it.cpu.expires - val;
- old->it_value = ns_to_timespec(old_expires);
+ old->it_value = ns_to_timespec64(old_expires);
} else {
old->it_value.tv_nsec = 1;
old->it_value.tv_sec = 0;
@@ -671,7 +671,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
* Install the new reload setting, and
* set up the signal and overrun bookkeeping.
*/
- timer->it.cpu.incr = timespec_to_ns(&new->it_interval);
+ timer->it.cpu.incr = timespec64_to_ns(&new->it_interval);
/*
* This acts as a modification timestamp for the timer,
@@ -695,12 +695,12 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
ret = 0;
out:
if (old)
- old->it_interval = ns_to_timespec(old_incr);
+ old->it_interval = ns_to_timespec64(old_incr);
return ret;
}
-static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
+static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp)
{
u64 now;
struct task_struct *p = timer->it.cpu.task;
@@ -710,7 +710,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
/*
* Easy part: convert the reload time.
*/
- itp->it_interval = ns_to_timespec(timer->it.cpu.incr);
+ itp->it_interval = ns_to_timespec64(timer->it.cpu.incr);
if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */
itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
@@ -739,7 +739,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
* Call the timer disarmed, nothing else to do.
*/
timer->it.cpu.expires = 0;
- itp->it_value = ns_to_timespec(timer->it.cpu.expires);
+ itp->it_value = ns_to_timespec64(timer->it.cpu.expires);
return;
} else {
cpu_timer_sample_group(timer->it_clock, p, &now);
@@ -748,7 +748,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
}
if (now < timer->it.cpu.expires) {
- itp->it_value = ns_to_timespec(timer->it.cpu.expires - now);
+ itp->it_value = ns_to_timespec64(timer->it.cpu.expires - now);
} else {
/*
* The timer should have expired already, but the firing
@@ -825,6 +825,10 @@ static void check_thread_timers(struct task_struct *tsk,
* At the hard limit, we just die.
* No need to calculate anything else now.
*/
+ if (print_fatal_signals) {
+ pr_info("CPU Watchdog Timeout (hard): %s[%d]\n",
+ tsk->comm, task_pid_nr(tsk));
+ }
__group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
return;
}
@@ -836,9 +840,10 @@ static void check_thread_timers(struct task_struct *tsk,
soft += USEC_PER_SEC;
sig->rlim[RLIMIT_RTTIME].rlim_cur = soft;
}
- printk(KERN_INFO
- "RT Watchdog Timeout: %s[%d]\n",
- tsk->comm, task_pid_nr(tsk));
+ if (print_fatal_signals) {
+ pr_info("RT Watchdog Timeout (soft): %s[%d]\n",
+ tsk->comm, task_pid_nr(tsk));
+ }
__group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
}
}
@@ -935,6 +940,10 @@ static void check_process_timers(struct task_struct *tsk,
* At the hard limit, we just die.
* No need to calculate anything else now.
*/
+ if (print_fatal_signals) {
+ pr_info("RT Watchdog Timeout (hard): %s[%d]\n",
+ tsk->comm, task_pid_nr(tsk));
+ }
__group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
return;
}
@@ -942,6 +951,10 @@ static void check_process_timers(struct task_struct *tsk,
/*
* At the soft limit, send a SIGXCPU every second.
*/
+ if (print_fatal_signals) {
+ pr_info("CPU Watchdog Timeout (soft): %s[%d]\n",
+ tsk->comm, task_pid_nr(tsk));
+ }
__group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
if (soft < hard) {
soft++;
@@ -1214,7 +1227,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
}
static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
- struct timespec *rqtp, struct itimerspec *it)
+ struct timespec64 *rqtp, struct itimerspec64 *it)
{
struct k_itimer timer;
int error;
@@ -1229,7 +1242,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
error = posix_cpu_timer_create(&timer);
timer.it_process = current;
if (!error) {
- static struct itimerspec zero_it;
+ static struct itimerspec64 zero_it;
memset(it, 0, sizeof *it);
it->it_value = *rqtp;
@@ -1264,7 +1277,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
/*
* We were interrupted by a signal.
*/
- *rqtp = ns_to_timespec(timer.it.cpu.expires);
+ *rqtp = ns_to_timespec64(timer.it.cpu.expires);
error = posix_cpu_timer_set(&timer, 0, &zero_it, it);
if (!error) {
/*
@@ -1301,10 +1314,11 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
- struct timespec *rqtp, struct timespec __user *rmtp)
+ struct timespec64 *rqtp, struct timespec __user *rmtp)
{
struct restart_block *restart_block = &current->restart_block;
- struct itimerspec it;
+ struct itimerspec64 it;
+ struct timespec ts;
int error;
/*
@@ -1312,7 +1326,7 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
*/
if (CPUCLOCK_PERTHREAD(which_clock) &&
(CPUCLOCK_PID(which_clock) == 0 ||
- CPUCLOCK_PID(which_clock) == current->pid))
+ CPUCLOCK_PID(which_clock) == task_pid_vnr(current)))
return -EINVAL;
error = do_cpu_nanosleep(which_clock, flags, rqtp, &it);
@@ -1324,13 +1338,14 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
/*
* Report back to the user the time still remaining.
*/
- if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
+ ts = timespec64_to_timespec(it.it_value);
+ if (rmtp && copy_to_user(rmtp, &ts, sizeof(*rmtp)))
return -EFAULT;
restart_block->fn = posix_cpu_nsleep_restart;
restart_block->nanosleep.clockid = which_clock;
restart_block->nanosleep.rmtp = rmtp;
- restart_block->nanosleep.expires = timespec_to_ns(rqtp);
+ restart_block->nanosleep.expires = timespec64_to_ns(rqtp);
}
return error;
}
@@ -1338,11 +1353,12 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
{
clockid_t which_clock = restart_block->nanosleep.clockid;
- struct timespec t;
- struct itimerspec it;
+ struct itimerspec64 it;
+ struct timespec64 t;
+ struct timespec tmp;
int error;
- t = ns_to_timespec(restart_block->nanosleep.expires);
+ t = ns_to_timespec64(restart_block->nanosleep.expires);
error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it);
@@ -1351,10 +1367,11 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
/*
* Report back to the user the time still remaining.
*/
- if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
+ tmp = timespec64_to_timespec(it.it_value);
+ if (rmtp && copy_to_user(rmtp, &tmp, sizeof(*rmtp)))
return -EFAULT;
- restart_block->nanosleep.expires = timespec_to_ns(&t);
+ restart_block->nanosleep.expires = timespec64_to_ns(&t);
}
return error;
@@ -1364,12 +1381,12 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
static int process_cpu_clock_getres(const clockid_t which_clock,
- struct timespec *tp)
+ struct timespec64 *tp)
{
return posix_cpu_clock_getres(PROCESS_CLOCK, tp);
}
static int process_cpu_clock_get(const clockid_t which_clock,
- struct timespec *tp)
+ struct timespec64 *tp)
{
return posix_cpu_clock_get(PROCESS_CLOCK, tp);
}
@@ -1379,7 +1396,7 @@ static int process_cpu_timer_create(struct k_itimer *timer)
return posix_cpu_timer_create(timer);
}
static int process_cpu_nsleep(const clockid_t which_clock, int flags,
- struct timespec *rqtp,
+ struct timespec64 *rqtp,
struct timespec __user *rmtp)
{
return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp);
@@ -1389,12 +1406,12 @@ static long process_cpu_nsleep_restart(struct restart_block *restart_block)
return -EINVAL;
}
static int thread_cpu_clock_getres(const clockid_t which_clock,
- struct timespec *tp)
+ struct timespec64 *tp)
{
return posix_cpu_clock_getres(THREAD_CLOCK, tp);
}
static int thread_cpu_clock_get(const clockid_t which_clock,
- struct timespec *tp)
+ struct timespec64 *tp)
{
return posix_cpu_clock_get(THREAD_CLOCK, tp);
}
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index cd6716e115e8..c0cd53eb018a 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -49,26 +49,32 @@ SYS_NI(alarm);
SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
const struct timespec __user *, tp)
{
+ struct timespec64 new_tp64;
struct timespec new_tp;
if (which_clock != CLOCK_REALTIME)
return -EINVAL;
if (copy_from_user(&new_tp, tp, sizeof (*tp)))
return -EFAULT;
- return do_sys_settimeofday(&new_tp, NULL);
+
+ new_tp64 = timespec_to_timespec64(new_tp);
+ return do_sys_settimeofday64(&new_tp64, NULL);
}
SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
struct timespec __user *,tp)
{
+ struct timespec64 kernel_tp64;
struct timespec kernel_tp;
switch (which_clock) {
- case CLOCK_REALTIME: ktime_get_real_ts(&kernel_tp); break;
- case CLOCK_MONOTONIC: ktime_get_ts(&kernel_tp); break;
- case CLOCK_BOOTTIME: get_monotonic_boottime(&kernel_tp); break;
+ case CLOCK_REALTIME: ktime_get_real_ts64(&kernel_tp64); break;
+ case CLOCK_MONOTONIC: ktime_get_ts64(&kernel_tp64); break;
+ case CLOCK_BOOTTIME: get_monotonic_boottime64(&kernel_tp64); break;
default: return -EINVAL;
}
+
+ kernel_tp = timespec64_to_timespec(kernel_tp64);
if (copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
return -EFAULT;
return 0;
@@ -97,6 +103,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
const struct timespec __user *, rqtp,
struct timespec __user *, rmtp)
{
+ struct timespec64 t64;
struct timespec t;
switch (which_clock) {
@@ -105,9 +112,10 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
case CLOCK_BOOTTIME:
if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
return -EFAULT;
- if (!timespec_valid(&t))
+ t64 = timespec_to_timespec64(t);
+ if (!timespec64_valid(&t64))
return -EINVAL;
- return hrtimer_nanosleep(&t, rmtp, flags & TIMER_ABSTIME ?
+ return hrtimer_nanosleep(&t64, rmtp, flags & TIMER_ABSTIME ?
HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
which_clock);
default:
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 50a6a47020de..4d7b2ce09c27 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -130,12 +130,12 @@ static struct k_clock posix_clocks[MAX_CLOCKS];
/*
* These ones are defined below.
*/
-static int common_nsleep(const clockid_t, int flags, struct timespec *t,
+static int common_nsleep(const clockid_t, int flags, struct timespec64 *t,
struct timespec __user *rmtp);
static int common_timer_create(struct k_itimer *new_timer);
-static void common_timer_get(struct k_itimer *, struct itimerspec *);
+static void common_timer_get(struct k_itimer *, struct itimerspec64 *);
static int common_timer_set(struct k_itimer *, int,
- struct itimerspec *, struct itimerspec *);
+ struct itimerspec64 *, struct itimerspec64 *);
static int common_timer_del(struct k_itimer *timer);
static enum hrtimer_restart posix_timer_fn(struct hrtimer *data);
@@ -204,17 +204,17 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
}
/* Get clock_realtime */
-static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp)
+static int posix_clock_realtime_get(clockid_t which_clock, struct timespec64 *tp)
{
- ktime_get_real_ts(tp);
+ ktime_get_real_ts64(tp);
return 0;
}
/* Set clock_realtime */
static int posix_clock_realtime_set(const clockid_t which_clock,
- const struct timespec *tp)
+ const struct timespec64 *tp)
{
- return do_sys_settimeofday(tp, NULL);
+ return do_sys_settimeofday64(tp, NULL);
}
static int posix_clock_realtime_adj(const clockid_t which_clock,
@@ -226,54 +226,54 @@ static int posix_clock_realtime_adj(const clockid_t which_clock,
/*
* Get monotonic time for posix timers
*/
-static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp)
+static int posix_ktime_get_ts(clockid_t which_clock, struct timespec64 *tp)
{
- ktime_get_ts(tp);
+ ktime_get_ts64(tp);
return 0;
}
/*
* Get monotonic-raw time for posix timers
*/
-static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
+static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp)
{
- getrawmonotonic(tp);
+ getrawmonotonic64(tp);
return 0;
}
-static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp)
+static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec64 *tp)
{
- *tp = current_kernel_time();
+ *tp = current_kernel_time64();
return 0;
}
static int posix_get_monotonic_coarse(clockid_t which_clock,
- struct timespec *tp)
+ struct timespec64 *tp)
{
- *tp = get_monotonic_coarse();
+ *tp = get_monotonic_coarse64();
return 0;
}
-static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp)
+static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 *tp)
{
- *tp = ktime_to_timespec(KTIME_LOW_RES);
+ *tp = ktime_to_timespec64(KTIME_LOW_RES);
return 0;
}
-static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp)
+static int posix_get_boottime(const clockid_t which_clock, struct timespec64 *tp)
{
- get_monotonic_boottime(tp);
+ get_monotonic_boottime64(tp);
return 0;
}
-static int posix_get_tai(clockid_t which_clock, struct timespec *tp)
+static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp)
{
- timekeeping_clocktai(tp);
+ timekeeping_clocktai64(tp);
return 0;
}
-static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec *tp)
+static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp)
{
tp->tv_sec = 0;
tp->tv_nsec = hrtimer_resolution;
@@ -734,18 +734,18 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
* report.
*/
static void
-common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
+common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
{
ktime_t now, remaining, iv;
struct hrtimer *timer = &timr->it.real.timer;
- memset(cur_setting, 0, sizeof(struct itimerspec));
+ memset(cur_setting, 0, sizeof(*cur_setting));
iv = timr->it.real.interval;
/* interval timer ? */
if (iv)
- cur_setting->it_interval = ktime_to_timespec(iv);
+ cur_setting->it_interval = ktime_to_timespec64(iv);
else if (!hrtimer_active(timer) &&
(timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
return;
@@ -771,13 +771,14 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
cur_setting->it_value.tv_nsec = 1;
} else
- cur_setting->it_value = ktime_to_timespec(remaining);
+ cur_setting->it_value = ktime_to_timespec64(remaining);
}
/* Get the time remaining on a POSIX.1b interval timer. */
SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
struct itimerspec __user *, setting)
{
+ struct itimerspec64 cur_setting64;
struct itimerspec cur_setting;
struct k_itimer *timr;
struct k_clock *kc;
@@ -792,10 +793,11 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
if (WARN_ON_ONCE(!kc || !kc->timer_get))
ret = -EINVAL;
else
- kc->timer_get(timr, &cur_setting);
+ kc->timer_get(timr, &cur_setting64);
unlock_timer(timr, flags);
+ cur_setting = itimerspec64_to_itimerspec(&cur_setting64);
if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting)))
return -EFAULT;
@@ -831,7 +833,7 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
/* timr->it_lock is taken. */
static int
common_timer_set(struct k_itimer *timr, int flags,
- struct itimerspec *new_setting, struct itimerspec *old_setting)
+ struct itimerspec64 *new_setting, struct itimerspec64 *old_setting)
{
struct hrtimer *timer = &timr->it.real.timer;
enum hrtimer_mode mode;
@@ -860,10 +862,10 @@ common_timer_set(struct k_itimer *timr, int flags,
hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
timr->it.real.timer.function = posix_timer_fn;
- hrtimer_set_expires(timer, timespec_to_ktime(new_setting->it_value));
+ hrtimer_set_expires(timer, timespec64_to_ktime(new_setting->it_value));
/* Convert interval */
- timr->it.real.interval = timespec_to_ktime(new_setting->it_interval);
+ timr->it.real.interval = timespec64_to_ktime(new_setting->it_interval);
/* SIGEV_NONE timers are not queued ! See common_timer_get */
if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) {
@@ -883,21 +885,23 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
const struct itimerspec __user *, new_setting,
struct itimerspec __user *, old_setting)
{
- struct k_itimer *timr;
+ struct itimerspec64 new_spec64, old_spec64;
+ struct itimerspec64 *rtn = old_setting ? &old_spec64 : NULL;
struct itimerspec new_spec, old_spec;
- int error = 0;
+ struct k_itimer *timr;
unsigned long flag;
- struct itimerspec *rtn = old_setting ? &old_spec : NULL;
struct k_clock *kc;
+ int error = 0;
if (!new_setting)
return -EINVAL;
if (copy_from_user(&new_spec, new_setting, sizeof (new_spec)))
return -EFAULT;
+ new_spec64 = itimerspec_to_itimerspec64(&new_spec);
- if (!timespec_valid(&new_spec.it_interval) ||
- !timespec_valid(&new_spec.it_value))
+ if (!timespec64_valid(&new_spec64.it_interval) ||
+ !timespec64_valid(&new_spec64.it_value))
return -EINVAL;
retry:
timr = lock_timer(timer_id, &flag);
@@ -908,7 +912,7 @@ retry:
if (WARN_ON_ONCE(!kc || !kc->timer_set))
error = -EINVAL;
else
- error = kc->timer_set(timr, flags, &new_spec, rtn);
+ error = kc->timer_set(timr, flags, &new_spec64, rtn);
unlock_timer(timr, flag);
if (error == TIMER_RETRY) {
@@ -916,6 +920,7 @@ retry:
goto retry;
}
+ old_spec = itimerspec64_to_itimerspec(&old_spec64);
if (old_setting && !error &&
copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
error = -EFAULT;
@@ -1014,6 +1019,7 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
const struct timespec __user *, tp)
{
struct k_clock *kc = clockid_to_kclock(which_clock);
+ struct timespec64 new_tp64;
struct timespec new_tp;
if (!kc || !kc->clock_set)
@@ -1021,21 +1027,24 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
if (copy_from_user(&new_tp, tp, sizeof (*tp)))
return -EFAULT;
+ new_tp64 = timespec_to_timespec64(new_tp);
- return kc->clock_set(which_clock, &new_tp);
+ return kc->clock_set(which_clock, &new_tp64);
}
SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
struct timespec __user *,tp)
{
struct k_clock *kc = clockid_to_kclock(which_clock);
+ struct timespec64 kernel_tp64;
struct timespec kernel_tp;
int error;
if (!kc)
return -EINVAL;
- error = kc->clock_get(which_clock, &kernel_tp);
+ error = kc->clock_get(which_clock, &kernel_tp64);
+ kernel_tp = timespec64_to_timespec(kernel_tp64);
if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
error = -EFAULT;
@@ -1070,13 +1079,15 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
struct timespec __user *, tp)
{
struct k_clock *kc = clockid_to_kclock(which_clock);
+ struct timespec64 rtn_tp64;
struct timespec rtn_tp;
int error;
if (!kc)
return -EINVAL;
- error = kc->clock_getres(which_clock, &rtn_tp);
+ error = kc->clock_getres(which_clock, &rtn_tp64);
+ rtn_tp = timespec64_to_timespec(rtn_tp64);
if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp)))
error = -EFAULT;
@@ -1088,7 +1099,7 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
* nanosleep for monotonic and realtime clocks
*/
static int common_nsleep(const clockid_t which_clock, int flags,
- struct timespec *tsave, struct timespec __user *rmtp)
+ struct timespec64 *tsave, struct timespec __user *rmtp)
{
return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ?
HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
@@ -1100,6 +1111,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
struct timespec __user *, rmtp)
{
struct k_clock *kc = clockid_to_kclock(which_clock);
+ struct timespec64 t64;
struct timespec t;
if (!kc)
@@ -1110,10 +1122,11 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
return -EFAULT;
- if (!timespec_valid(&t))
+ t64 = timespec_to_timespec64(t);
+ if (!timespec64_valid(&t64))
return -EINVAL;
- return kc->nsleep(which_clock, flags, &t, rmtp);
+ return kc->nsleep(which_clock, flags, &t64, rmtp);
}
/*
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index ea6b610c4c57..2d8f05aad442 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -206,6 +206,11 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
update_clock_read_data(&rd);
+ if (sched_clock_timer.function != NULL) {
+ /* update timeout for clock wrap */
+ hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
+ }
+
r = rate;
if (r >= 4000000) {
r /= 1000000;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 7fe53be86077..64c97fc130c4 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -993,6 +993,18 @@ ktime_t tick_nohz_get_sleep_length(void)
return ts->sleep_length;
}
+/**
+ * tick_nohz_get_idle_calls - return the current idle calls counter value
+ *
+ * Called from the schedutil frequency scaling governor in scheduler context.
+ */
+unsigned long tick_nohz_get_idle_calls(void)
+{
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+
+ return ts->idle_calls;
+}
+
static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
{
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 25bdd2504571..49c73c6ed648 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -193,8 +193,8 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz
SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv,
struct timezone __user *, tz)
{
+ struct timespec64 new_ts;
struct timeval user_tv;
- struct timespec new_ts;
struct timezone new_tz;
if (tv) {
@@ -212,7 +212,7 @@ SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv,
return -EFAULT;
}
- return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
+ return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
}
SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p)
@@ -230,20 +230,6 @@ SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p)
return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret;
}
-/**
- * current_fs_time - Return FS time
- * @sb: Superblock.
- *
- * Return the current time truncated to the time granularity supported by
- * the fs.
- */
-struct timespec current_fs_time(struct super_block *sb)
-{
- struct timespec now = current_kernel_time();
- return timespec_trunc(now, sb->s_time_gran);
-}
-EXPORT_SYMBOL(current_fs_time);
-
/*
* Convert jiffies to milliseconds and back.
*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 5b63a2102c29..9652bc57fd09 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -996,8 +996,7 @@ static int adjust_historical_crosststamp(struct system_time_snapshot *history,
return 0;
/* Interpolate shortest distance from beginning or end of history */
- interp_forward = partial_history_cycles > total_history_cycles/2 ?
- true : false;
+ interp_forward = partial_history_cycles > total_history_cycles / 2;
partial_history_cycles = interp_forward ?
total_history_cycles - partial_history_cycles :
partial_history_cycles;
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 1dc0256bfb6e..152a706ef8b8 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -241,7 +241,7 @@ int timer_migration_handler(struct ctl_table *table, int write,
int ret;
mutex_lock(&mutex);
- ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (!ret && write)
timers_update_migration(false);
mutex_unlock(&mutex);
@@ -1120,7 +1120,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
EXPORT_SYMBOL_GPL(add_timer_on);
/**
- * del_timer - deactive a timer.
+ * del_timer - deactivate a timer.
* @timer: the timer to be deactivated
*
* del_timer() deactivates a timer - this works on both active and inactive
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index ff8d5c13d04b..0e7f5428a148 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -16,6 +16,7 @@
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/kallsyms.h>
+#include <linux/nmi.h>
#include <linux/uaccess.h>
@@ -86,6 +87,9 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
next_one:
i = 0;
+
+ touch_nmi_watchdog();
+
raw_spin_lock_irqsave(&base->cpu_base->lock, flags);
curr = timerqueue_getnext(&base->active);
@@ -197,6 +201,8 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
{
struct clock_event_device *dev = td->evtdev;
+ touch_nmi_watchdog();
+
SEQ_printf(m, "Tick Device: mode: %d\n", td->mode);
if (cpu < 0)
SEQ_printf(m, "Broadcast device\n");
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index d4a06e714645..7e06f04e98fe 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -134,7 +134,8 @@ config FUNCTION_TRACER
select KALLSYMS
select GENERIC_TRACER
select CONTEXT_SWITCH_TRACER
- select GLOB
+ select GLOB
+ select TASKS_RCU if PREEMPT
help
Enable the kernel to trace every kernel function. This is done
by using a compiler feature to insert a small, 5-byte No-Operation
@@ -455,7 +456,7 @@ config UPROBE_EVENTS
select UPROBES
select PROBE_EVENTS
select TRACING
- default n
+ default y
help
This allows the user to add tracing events on top of userspace
dynamic events (similar to tracepoints) on the fly via the trace
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index b2058a7f94bd..193c5f5e3f79 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -690,8 +690,8 @@ void blk_trace_shutdown(struct request_queue *q)
/**
* blk_add_trace_rq - Add a trace for a request oriented action
- * @q: queue the io is for
* @rq: the source request
+ * @error: return status to log
* @nr_bytes: number of completed bytes
* @what: the action
*
@@ -699,10 +699,10 @@ void blk_trace_shutdown(struct request_queue *q)
* Records an action against a request. Will log the bio offset + size.
*
**/
-static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
+static void blk_add_trace_rq(struct request *rq, int error,
unsigned int nr_bytes, u32 what)
{
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt = rq->q->blk_trace;
if (likely(!bt))
return;
@@ -713,40 +713,32 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
what |= BLK_TC_ACT(BLK_TC_FS);
__blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq),
- rq->cmd_flags, what, rq->errors, 0, NULL);
-}
-
-static void blk_add_trace_rq_abort(void *ignore,
- struct request_queue *q, struct request *rq)
-{
- blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ABORT);
+ rq->cmd_flags, what, error, 0, NULL);
}
static void blk_add_trace_rq_insert(void *ignore,
struct request_queue *q, struct request *rq)
{
- blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_INSERT);
+ blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT);
}
static void blk_add_trace_rq_issue(void *ignore,
struct request_queue *q, struct request *rq)
{
- blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ISSUE);
+ blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE);
}
static void blk_add_trace_rq_requeue(void *ignore,
struct request_queue *q,
struct request *rq)
{
- blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_REQUEUE);
+ blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE);
}
-static void blk_add_trace_rq_complete(void *ignore,
- struct request_queue *q,
- struct request *rq,
- unsigned int nr_bytes)
+static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
+ int error, unsigned int nr_bytes)
{
- blk_add_trace_rq(q, rq, nr_bytes, BLK_TA_COMPLETE);
+ blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE);
}
/**
@@ -941,7 +933,7 @@ static void blk_add_trace_rq_remap(void *ignore,
r.sector_from = cpu_to_be64(from);
__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
- rq_data_dir(rq), 0, BLK_TA_REMAP, !!rq->errors,
+ rq_data_dir(rq), 0, BLK_TA_REMAP, 0,
sizeof(r), &r);
}
@@ -966,7 +958,7 @@ void blk_add_driver_data(struct request_queue *q,
return;
__blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0,
- BLK_TA_DRV_DATA, rq->errors, len, data);
+ BLK_TA_DRV_DATA, 0, len, data);
}
EXPORT_SYMBOL_GPL(blk_add_driver_data);
@@ -974,8 +966,6 @@ static void blk_register_tracepoints(void)
{
int ret;
- ret = register_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
- WARN_ON(ret);
ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
WARN_ON(ret);
ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
@@ -1028,7 +1018,6 @@ static void blk_unregister_tracepoints(void)
unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
- unregister_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
tracepoint_synchronize_unregister();
}
@@ -1673,14 +1662,14 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
goto out;
if (attr == &dev_attr_act_mask) {
- if (sscanf(buf, "%llx", &value) != 1) {
+ if (kstrtoull(buf, 0, &value)) {
/* Assume it is a list of trace category names */
ret = blk_trace_str2mask(buf);
if (ret < 0)
goto out;
value = ret;
}
- } else if (sscanf(buf, "%llu", &value) != 1)
+ } else if (kstrtoull(buf, 0, &value))
goto out;
ret = -ENXIO;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index cee9802cf3e0..460a031c77e5 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -96,7 +96,7 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
if (unlikely(in_interrupt() ||
current->flags & (PF_KTHREAD | PF_EXITING)))
return -EPERM;
- if (unlikely(segment_eq(get_fs(), KERNEL_DS)))
+ if (unlikely(uaccess_kernel()))
return -EPERM;
if (!access_ok(VERIFY_WRITE, unsafe_ptr, size))
return -EPERM;
@@ -501,16 +501,11 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type
return true;
}
-static const struct bpf_verifier_ops kprobe_prog_ops = {
+const struct bpf_verifier_ops kprobe_prog_ops = {
.get_func_proto = kprobe_prog_func_proto,
.is_valid_access = kprobe_prog_is_valid_access,
};
-static struct bpf_prog_type_list kprobe_tl __ro_after_init = {
- .ops = &kprobe_prog_ops,
- .type = BPF_PROG_TYPE_KPROBE,
-};
-
BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map,
u64, flags, void *, data, u64, size)
{
@@ -584,16 +579,11 @@ static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type
return true;
}
-static const struct bpf_verifier_ops tracepoint_prog_ops = {
+const struct bpf_verifier_ops tracepoint_prog_ops = {
.get_func_proto = tp_prog_func_proto,
.is_valid_access = tp_prog_is_valid_access,
};
-static struct bpf_prog_type_list tracepoint_tl __ro_after_init = {
- .ops = &tracepoint_prog_ops,
- .type = BPF_PROG_TYPE_TRACEPOINT,
-};
-
static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
enum bpf_reg_type *reg_type)
{
@@ -642,22 +632,8 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type,
return insn - insn_buf;
}
-static const struct bpf_verifier_ops perf_event_prog_ops = {
+const struct bpf_verifier_ops perf_event_prog_ops = {
.get_func_proto = tp_prog_func_proto,
.is_valid_access = pe_prog_is_valid_access,
.convert_ctx_access = pe_prog_convert_ctx_access,
};
-
-static struct bpf_prog_type_list perf_event_tl __ro_after_init = {
- .ops = &perf_event_prog_ops,
- .type = BPF_PROG_TYPE_PERF_EVENT,
-};
-
-static int __init register_kprobe_prog_ops(void)
-{
- bpf_register_prog_type(&kprobe_tl);
- bpf_register_prog_type(&tracepoint_tl);
- bpf_register_prog_type(&perf_event_tl);
- return 0;
-}
-late_initcall(register_kprobe_prog_ops);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b9691ee8f6c1..9e5841dc14b5 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -36,6 +36,7 @@
#include <trace/events/sched.h>
+#include <asm/sections.h>
#include <asm/setup.h>
#include "trace_output.h"
@@ -1095,22 +1096,20 @@ static bool update_all_ops;
# error Dynamic ftrace depends on MCOUNT_RECORD
#endif
-static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly;
-
-struct ftrace_func_probe {
- struct hlist_node node;
- struct ftrace_probe_ops *ops;
- unsigned long flags;
- unsigned long ip;
- void *data;
- struct list_head free_list;
-};
-
struct ftrace_func_entry {
struct hlist_node hlist;
unsigned long ip;
};
+struct ftrace_func_probe {
+ struct ftrace_probe_ops *probe_ops;
+ struct ftrace_ops ops;
+ struct trace_array *tr;
+ struct list_head list;
+ void *data;
+ int ref;
+};
+
/*
* We make these constant because no one should touch them,
* but they are used as the default "empty hash", to avoid allocating
@@ -1271,7 +1270,7 @@ static void
remove_hash_entry(struct ftrace_hash *hash,
struct ftrace_func_entry *entry)
{
- hlist_del(&entry->hlist);
+ hlist_del_rcu(&entry->hlist);
hash->count--;
}
@@ -2807,18 +2806,28 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
* callers are done before leaving this function.
* The same goes for freeing the per_cpu data of the per_cpu
* ops.
- *
- * Again, normal synchronize_sched() is not good enough.
- * We need to do a hard force of sched synchronization.
- * This is because we use preempt_disable() to do RCU, but
- * the function tracers can be called where RCU is not watching
- * (like before user_exit()). We can not rely on the RCU
- * infrastructure to do the synchronization, thus we must do it
- * ourselves.
*/
if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) {
+ /*
+ * We need to do a hard force of sched synchronization.
+ * This is because we use preempt_disable() to do RCU, but
+ * the function tracers can be called where RCU is not watching
+ * (like before user_exit()). We can not rely on the RCU
+ * infrastructure to do the synchronization, thus we must do it
+ * ourselves.
+ */
schedule_on_each_cpu(ftrace_sync);
+ /*
+ * When the kernel is preeptive, tasks can be preempted
+ * while on a ftrace trampoline. Just scheduling a task on
+ * a CPU is not good enough to flush them. Calling
+ * synchornize_rcu_tasks() will wait for those tasks to
+ * execute and either schedule voluntarily or enter user space.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT))
+ synchronize_rcu_tasks();
+
arch_ftrace_trampoline_free(ops);
if (ops->flags & FTRACE_OPS_FL_PER_CPU)
@@ -3055,34 +3064,63 @@ struct ftrace_iterator {
struct ftrace_page *pg;
struct dyn_ftrace *func;
struct ftrace_func_probe *probe;
+ struct ftrace_func_entry *probe_entry;
struct trace_parser parser;
struct ftrace_hash *hash;
struct ftrace_ops *ops;
- int hidx;
+ int pidx;
int idx;
unsigned flags;
};
static void *
-t_hash_next(struct seq_file *m, loff_t *pos)
+t_probe_next(struct seq_file *m, loff_t *pos)
{
struct ftrace_iterator *iter = m->private;
+ struct trace_array *tr = iter->ops->private;
+ struct list_head *func_probes;
+ struct ftrace_hash *hash;
+ struct list_head *next;
struct hlist_node *hnd = NULL;
struct hlist_head *hhd;
+ int size;
(*pos)++;
iter->pos = *pos;
- if (iter->probe)
- hnd = &iter->probe->node;
- retry:
- if (iter->hidx >= FTRACE_FUNC_HASHSIZE)
+ if (!tr)
return NULL;
- hhd = &ftrace_func_hash[iter->hidx];
+ func_probes = &tr->func_probes;
+ if (list_empty(func_probes))
+ return NULL;
+
+ if (!iter->probe) {
+ next = func_probes->next;
+ iter->probe = list_entry(next, struct ftrace_func_probe, list);
+ }
+
+ if (iter->probe_entry)
+ hnd = &iter->probe_entry->hlist;
+
+ hash = iter->probe->ops.func_hash->filter_hash;
+ size = 1 << hash->size_bits;
+
+ retry:
+ if (iter->pidx >= size) {
+ if (iter->probe->list.next == func_probes)
+ return NULL;
+ next = iter->probe->list.next;
+ iter->probe = list_entry(next, struct ftrace_func_probe, list);
+ hash = iter->probe->ops.func_hash->filter_hash;
+ size = 1 << hash->size_bits;
+ iter->pidx = 0;
+ }
+
+ hhd = &hash->buckets[iter->pidx];
if (hlist_empty(hhd)) {
- iter->hidx++;
+ iter->pidx++;
hnd = NULL;
goto retry;
}
@@ -3092,7 +3130,7 @@ t_hash_next(struct seq_file *m, loff_t *pos)
else {
hnd = hnd->next;
if (!hnd) {
- iter->hidx++;
+ iter->pidx++;
goto retry;
}
}
@@ -3100,26 +3138,28 @@ t_hash_next(struct seq_file *m, loff_t *pos)
if (WARN_ON_ONCE(!hnd))
return NULL;
- iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node);
+ iter->probe_entry = hlist_entry(hnd, struct ftrace_func_entry, hlist);
return iter;
}
-static void *t_hash_start(struct seq_file *m, loff_t *pos)
+static void *t_probe_start(struct seq_file *m, loff_t *pos)
{
struct ftrace_iterator *iter = m->private;
void *p = NULL;
loff_t l;
- if (!(iter->flags & FTRACE_ITER_DO_HASH))
+ if (!(iter->flags & FTRACE_ITER_DO_PROBES))
return NULL;
if (iter->func_pos > *pos)
return NULL;
- iter->hidx = 0;
+ iter->probe = NULL;
+ iter->probe_entry = NULL;
+ iter->pidx = 0;
for (l = 0; l <= (*pos - iter->func_pos); ) {
- p = t_hash_next(m, &l);
+ p = t_probe_next(m, &l);
if (!p)
break;
}
@@ -3127,50 +3167,42 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
return NULL;
/* Only set this if we have an item */
- iter->flags |= FTRACE_ITER_HASH;
+ iter->flags |= FTRACE_ITER_PROBE;
return iter;
}
static int
-t_hash_show(struct seq_file *m, struct ftrace_iterator *iter)
+t_probe_show(struct seq_file *m, struct ftrace_iterator *iter)
{
- struct ftrace_func_probe *rec;
+ struct ftrace_func_entry *probe_entry;
+ struct ftrace_probe_ops *probe_ops;
+ struct ftrace_func_probe *probe;
- rec = iter->probe;
- if (WARN_ON_ONCE(!rec))
+ probe = iter->probe;
+ probe_entry = iter->probe_entry;
+
+ if (WARN_ON_ONCE(!probe || !probe_entry))
return -EIO;
- if (rec->ops->print)
- return rec->ops->print(m, rec->ip, rec->ops, rec->data);
+ probe_ops = probe->probe_ops;
- seq_printf(m, "%ps:%ps", (void *)rec->ip, (void *)rec->ops->func);
+ if (probe_ops->print)
+ return probe_ops->print(m, probe_entry->ip, probe_ops, probe->data);
- if (rec->data)
- seq_printf(m, ":%p", rec->data);
- seq_putc(m, '\n');
+ seq_printf(m, "%ps:%ps\n", (void *)probe_entry->ip,
+ (void *)probe_ops->func);
return 0;
}
static void *
-t_next(struct seq_file *m, void *v, loff_t *pos)
+t_func_next(struct seq_file *m, loff_t *pos)
{
struct ftrace_iterator *iter = m->private;
- struct ftrace_ops *ops = iter->ops;
struct dyn_ftrace *rec = NULL;
- if (unlikely(ftrace_disabled))
- return NULL;
-
- if (iter->flags & FTRACE_ITER_HASH)
- return t_hash_next(m, pos);
-
(*pos)++;
- iter->pos = iter->func_pos = *pos;
-
- if (iter->flags & FTRACE_ITER_PRINTALL)
- return t_hash_start(m, pos);
retry:
if (iter->idx >= iter->pg->index) {
@@ -3181,11 +3213,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
}
} else {
rec = &iter->pg->records[iter->idx++];
- if (((iter->flags & FTRACE_ITER_FILTER) &&
- !(ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))) ||
-
- ((iter->flags & FTRACE_ITER_NOTRACE) &&
- !ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) ||
+ if (((iter->flags & (FTRACE_ITER_FILTER | FTRACE_ITER_NOTRACE)) &&
+ !ftrace_lookup_ip(iter->hash, rec->ip)) ||
((iter->flags & FTRACE_ITER_ENABLED) &&
!(rec->flags & FTRACE_FL_ENABLED))) {
@@ -3196,24 +3225,51 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
}
if (!rec)
- return t_hash_start(m, pos);
+ return NULL;
+ iter->pos = iter->func_pos = *pos;
iter->func = rec;
return iter;
}
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct ftrace_iterator *iter = m->private;
+ loff_t l = *pos; /* t_hash_start() must use original pos */
+ void *ret;
+
+ if (unlikely(ftrace_disabled))
+ return NULL;
+
+ if (iter->flags & FTRACE_ITER_PROBE)
+ return t_probe_next(m, pos);
+
+ if (iter->flags & FTRACE_ITER_PRINTALL) {
+ /* next must increment pos, and t_probe_start does not */
+ (*pos)++;
+ return t_probe_start(m, &l);
+ }
+
+ ret = t_func_next(m, pos);
+
+ if (!ret)
+ return t_probe_start(m, &l);
+
+ return ret;
+}
+
static void reset_iter_read(struct ftrace_iterator *iter)
{
iter->pos = 0;
iter->func_pos = 0;
- iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH);
+ iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_PROBE);
}
static void *t_start(struct seq_file *m, loff_t *pos)
{
struct ftrace_iterator *iter = m->private;
- struct ftrace_ops *ops = iter->ops;
void *p = NULL;
loff_t l;
@@ -3233,20 +3289,19 @@ static void *t_start(struct seq_file *m, loff_t *pos)
* off, we can short cut and just print out that all
* functions are enabled.
*/
- if ((iter->flags & FTRACE_ITER_FILTER &&
- ftrace_hash_empty(ops->func_hash->filter_hash)) ||
- (iter->flags & FTRACE_ITER_NOTRACE &&
- ftrace_hash_empty(ops->func_hash->notrace_hash))) {
+ if ((iter->flags & (FTRACE_ITER_FILTER | FTRACE_ITER_NOTRACE)) &&
+ ftrace_hash_empty(iter->hash)) {
+ iter->func_pos = 1; /* Account for the message */
if (*pos > 0)
- return t_hash_start(m, pos);
+ return t_probe_start(m, pos);
iter->flags |= FTRACE_ITER_PRINTALL;
/* reset in case of seek/pread */
- iter->flags &= ~FTRACE_ITER_HASH;
+ iter->flags &= ~FTRACE_ITER_PROBE;
return iter;
}
- if (iter->flags & FTRACE_ITER_HASH)
- return t_hash_start(m, pos);
+ if (iter->flags & FTRACE_ITER_PROBE)
+ return t_probe_start(m, pos);
/*
* Unfortunately, we need to restart at ftrace_pages_start
@@ -3256,13 +3311,13 @@ static void *t_start(struct seq_file *m, loff_t *pos)
iter->pg = ftrace_pages_start;
iter->idx = 0;
for (l = 0; l <= *pos; ) {
- p = t_next(m, p, &l);
+ p = t_func_next(m, &l);
if (!p)
break;
}
if (!p)
- return t_hash_start(m, pos);
+ return t_probe_start(m, pos);
return iter;
}
@@ -3293,8 +3348,8 @@ static int t_show(struct seq_file *m, void *v)
struct ftrace_iterator *iter = m->private;
struct dyn_ftrace *rec;
- if (iter->flags & FTRACE_ITER_HASH)
- return t_hash_show(m, iter);
+ if (iter->flags & FTRACE_ITER_PROBE)
+ return t_probe_show(m, iter);
if (iter->flags & FTRACE_ITER_PRINTALL) {
if (iter->flags & FTRACE_ITER_NOTRACE)
@@ -3355,12 +3410,13 @@ ftrace_avail_open(struct inode *inode, struct file *file)
return -ENODEV;
iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
- if (iter) {
- iter->pg = ftrace_pages_start;
- iter->ops = &global_ops;
- }
+ if (!iter)
+ return -ENOMEM;
- return iter ? 0 : -ENOMEM;
+ iter->pg = ftrace_pages_start;
+ iter->ops = &global_ops;
+
+ return 0;
}
static int
@@ -3369,13 +3425,14 @@ ftrace_enabled_open(struct inode *inode, struct file *file)
struct ftrace_iterator *iter;
iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
- if (iter) {
- iter->pg = ftrace_pages_start;
- iter->flags = FTRACE_ITER_ENABLED;
- iter->ops = &global_ops;
- }
+ if (!iter)
+ return -ENOMEM;
- return iter ? 0 : -ENOMEM;
+ iter->pg = ftrace_pages_start;
+ iter->flags = FTRACE_ITER_ENABLED;
+ iter->ops = &global_ops;
+
+ return 0;
}
/**
@@ -3440,7 +3497,8 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
ret = -ENOMEM;
goto out_unlock;
}
- }
+ } else
+ iter->hash = hash;
if (file->f_mode & FMODE_READ) {
iter->pg = ftrace_pages_start;
@@ -3470,7 +3528,7 @@ ftrace_filter_open(struct inode *inode, struct file *file)
struct ftrace_ops *ops = inode->i_private;
return ftrace_regex_open(ops,
- FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH,
+ FTRACE_ITER_FILTER | FTRACE_ITER_DO_PROBES,
inode, file);
}
@@ -3573,22 +3631,20 @@ ftrace_match_record(struct dyn_ftrace *rec, struct ftrace_glob *func_g,
/* blank module name to match all modules */
if (!mod_g->len) {
/* blank module globbing: modname xor exclude_mod */
- if ((!exclude_mod) != (!modname))
+ if (!exclude_mod != !modname)
goto func_match;
return 0;
}
- /* not matching the module */
- if (!modname || !mod_matches) {
- if (exclude_mod)
- goto func_match;
- else
- return 0;
- }
-
- if (mod_matches && exclude_mod)
+ /*
+ * exclude_mod is set to trace everything but the given
+ * module. If it is set and the module matches, then
+ * return 0. If it is not set, and the module doesn't match
+ * also return 0. Otherwise, check the function to see if
+ * that matches.
+ */
+ if (!mod_matches == !exclude_mod)
return 0;
-
func_match:
/* blank search means to match all funcs in the mod */
if (!func_g->len)
@@ -3654,6 +3710,56 @@ ftrace_match_records(struct ftrace_hash *hash, char *buff, int len)
return match_records(hash, buff, len, NULL);
}
+static void ftrace_ops_update_code(struct ftrace_ops *ops,
+ struct ftrace_ops_hash *old_hash)
+{
+ struct ftrace_ops *op;
+
+ if (!ftrace_enabled)
+ return;
+
+ if (ops->flags & FTRACE_OPS_FL_ENABLED) {
+ ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash);
+ return;
+ }
+
+ /*
+ * If this is the shared global_ops filter, then we need to
+ * check if there is another ops that shares it, is enabled.
+ * If so, we still need to run the modify code.
+ */
+ if (ops->func_hash != &global_ops.local_hash)
+ return;
+
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ if (op->func_hash == &global_ops.local_hash &&
+ op->flags & FTRACE_OPS_FL_ENABLED) {
+ ftrace_run_modify_code(op, FTRACE_UPDATE_CALLS, old_hash);
+ /* Only need to do this once */
+ return;
+ }
+ } while_for_each_ftrace_op(op);
+}
+
+static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops,
+ struct ftrace_hash **orig_hash,
+ struct ftrace_hash *hash,
+ int enable)
+{
+ struct ftrace_ops_hash old_hash_ops;
+ struct ftrace_hash *old_hash;
+ int ret;
+
+ old_hash = *orig_hash;
+ old_hash_ops.filter_hash = ops->func_hash->filter_hash;
+ old_hash_ops.notrace_hash = ops->func_hash->notrace_hash;
+ ret = ftrace_hash_move(ops, enable, orig_hash, hash);
+ if (!ret) {
+ ftrace_ops_update_code(ops, &old_hash_ops);
+ free_ftrace_hash_rcu(old_hash);
+ }
+ return ret;
+}
/*
* We register the module command as a template to show others how
@@ -3661,7 +3767,7 @@ ftrace_match_records(struct ftrace_hash *hash, char *buff, int len)
*/
static int
-ftrace_mod_callback(struct ftrace_hash *hash,
+ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash,
char *func, char *cmd, char *module, int enable)
{
int ret;
@@ -3695,16 +3801,11 @@ core_initcall(ftrace_mod_cmd_init);
static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *pt_regs)
{
- struct ftrace_func_probe *entry;
- struct hlist_head *hhd;
- unsigned long key;
+ struct ftrace_probe_ops *probe_ops;
+ struct ftrace_func_probe *probe;
- key = hash_long(ip, FTRACE_HASH_BITS);
-
- hhd = &ftrace_func_hash[key];
-
- if (hlist_empty(hhd))
- return;
+ probe = container_of(op, struct ftrace_func_probe, ops);
+ probe_ops = probe->probe_ops;
/*
* Disable preemption for these calls to prevent a RCU grace
@@ -3712,210 +3813,340 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
* on the hash. rcu_read_lock is too dangerous here.
*/
preempt_disable_notrace();
- hlist_for_each_entry_rcu_notrace(entry, hhd, node) {
- if (entry->ip == ip)
- entry->ops->func(ip, parent_ip, &entry->data);
- }
+ probe_ops->func(ip, parent_ip, probe->tr, probe_ops, probe->data);
preempt_enable_notrace();
}
-static struct ftrace_ops trace_probe_ops __read_mostly =
-{
- .func = function_trace_probe_call,
- .flags = FTRACE_OPS_FL_INITIALIZED,
- INIT_OPS_HASH(trace_probe_ops)
+struct ftrace_func_map {
+ struct ftrace_func_entry entry;
+ void *data;
};
-static int ftrace_probe_registered;
+struct ftrace_func_mapper {
+ struct ftrace_hash hash;
+};
-static void __enable_ftrace_function_probe(struct ftrace_ops_hash *old_hash)
+/**
+ * allocate_ftrace_func_mapper - allocate a new ftrace_func_mapper
+ *
+ * Returns a ftrace_func_mapper descriptor that can be used to map ips to data.
+ */
+struct ftrace_func_mapper *allocate_ftrace_func_mapper(void)
{
- int ret;
- int i;
+ struct ftrace_hash *hash;
- if (ftrace_probe_registered) {
- /* still need to update the function call sites */
- if (ftrace_enabled)
- ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS,
- old_hash);
- return;
- }
+ /*
+ * The mapper is simply a ftrace_hash, but since the entries
+ * in the hash are not ftrace_func_entry type, we define it
+ * as a separate structure.
+ */
+ hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
+ return (struct ftrace_func_mapper *)hash;
+}
- for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
- struct hlist_head *hhd = &ftrace_func_hash[i];
- if (hhd->first)
- break;
- }
- /* Nothing registered? */
- if (i == FTRACE_FUNC_HASHSIZE)
- return;
+/**
+ * ftrace_func_mapper_find_ip - Find some data mapped to an ip
+ * @mapper: The mapper that has the ip maps
+ * @ip: the instruction pointer to find the data for
+ *
+ * Returns the data mapped to @ip if found otherwise NULL. The return
+ * is actually the address of the mapper data pointer. The address is
+ * returned for use cases where the data is no bigger than a long, and
+ * the user can use the data pointer as its data instead of having to
+ * allocate more memory for the reference.
+ */
+void **ftrace_func_mapper_find_ip(struct ftrace_func_mapper *mapper,
+ unsigned long ip)
+{
+ struct ftrace_func_entry *entry;
+ struct ftrace_func_map *map;
- ret = ftrace_startup(&trace_probe_ops, 0);
+ entry = ftrace_lookup_ip(&mapper->hash, ip);
+ if (!entry)
+ return NULL;
- ftrace_probe_registered = 1;
+ map = (struct ftrace_func_map *)entry;
+ return &map->data;
}
-static void __disable_ftrace_function_probe(void)
+/**
+ * ftrace_func_mapper_add_ip - Map some data to an ip
+ * @mapper: The mapper that has the ip maps
+ * @ip: The instruction pointer address to map @data to
+ * @data: The data to map to @ip
+ *
+ * Returns 0 on succes otherwise an error.
+ */
+int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper,
+ unsigned long ip, void *data)
{
- int i;
+ struct ftrace_func_entry *entry;
+ struct ftrace_func_map *map;
- if (!ftrace_probe_registered)
- return;
+ entry = ftrace_lookup_ip(&mapper->hash, ip);
+ if (entry)
+ return -EBUSY;
- for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
- struct hlist_head *hhd = &ftrace_func_hash[i];
- if (hhd->first)
- return;
- }
+ map = kmalloc(sizeof(*map), GFP_KERNEL);
+ if (!map)
+ return -ENOMEM;
- /* no more funcs left */
- ftrace_shutdown(&trace_probe_ops, 0);
+ map->entry.ip = ip;
+ map->data = data;
- ftrace_probe_registered = 0;
-}
+ __add_hash_entry(&mapper->hash, &map->entry);
+ return 0;
+}
-static void ftrace_free_entry(struct ftrace_func_probe *entry)
+/**
+ * ftrace_func_mapper_remove_ip - Remove an ip from the mapping
+ * @mapper: The mapper that has the ip maps
+ * @ip: The instruction pointer address to remove the data from
+ *
+ * Returns the data if it is found, otherwise NULL.
+ * Note, if the data pointer is used as the data itself, (see
+ * ftrace_func_mapper_find_ip(), then the return value may be meaningless,
+ * if the data pointer was set to zero.
+ */
+void *ftrace_func_mapper_remove_ip(struct ftrace_func_mapper *mapper,
+ unsigned long ip)
{
- if (entry->ops->free)
- entry->ops->free(entry->ops, entry->ip, &entry->data);
+ struct ftrace_func_entry *entry;
+ struct ftrace_func_map *map;
+ void *data;
+
+ entry = ftrace_lookup_ip(&mapper->hash, ip);
+ if (!entry)
+ return NULL;
+
+ map = (struct ftrace_func_map *)entry;
+ data = map->data;
+
+ remove_hash_entry(&mapper->hash, entry);
kfree(entry);
+
+ return data;
+}
+
+/**
+ * free_ftrace_func_mapper - free a mapping of ips and data
+ * @mapper: The mapper that has the ip maps
+ * @free_func: A function to be called on each data item.
+ *
+ * This is used to free the function mapper. The @free_func is optional
+ * and can be used if the data needs to be freed as well.
+ */
+void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper,
+ ftrace_mapper_func free_func)
+{
+ struct ftrace_func_entry *entry;
+ struct ftrace_func_map *map;
+ struct hlist_head *hhd;
+ int size = 1 << mapper->hash.size_bits;
+ int i;
+
+ if (free_func && mapper->hash.count) {
+ for (i = 0; i < size; i++) {
+ hhd = &mapper->hash.buckets[i];
+ hlist_for_each_entry(entry, hhd, hlist) {
+ map = (struct ftrace_func_map *)entry;
+ free_func(map);
+ }
+ }
+ }
+ free_ftrace_hash(&mapper->hash);
+}
+
+static void release_probe(struct ftrace_func_probe *probe)
+{
+ struct ftrace_probe_ops *probe_ops;
+
+ mutex_lock(&ftrace_lock);
+
+ WARN_ON(probe->ref <= 0);
+
+ /* Subtract the ref that was used to protect this instance */
+ probe->ref--;
+
+ if (!probe->ref) {
+ probe_ops = probe->probe_ops;
+ /*
+ * Sending zero as ip tells probe_ops to free
+ * the probe->data itself
+ */
+ if (probe_ops->free)
+ probe_ops->free(probe_ops, probe->tr, 0, probe->data);
+ list_del(&probe->list);
+ kfree(probe);
+ }
+ mutex_unlock(&ftrace_lock);
+}
+
+static void acquire_probe_locked(struct ftrace_func_probe *probe)
+{
+ /*
+ * Add one ref to keep it from being freed when releasing the
+ * ftrace_lock mutex.
+ */
+ probe->ref++;
}
int
-register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
- void *data)
+register_ftrace_function_probe(char *glob, struct trace_array *tr,
+ struct ftrace_probe_ops *probe_ops,
+ void *data)
{
- struct ftrace_ops_hash old_hash_ops;
- struct ftrace_func_probe *entry;
- struct ftrace_glob func_g;
- struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash;
- struct ftrace_hash *old_hash = *orig_hash;
+ struct ftrace_func_entry *entry;
+ struct ftrace_func_probe *probe;
+ struct ftrace_hash **orig_hash;
+ struct ftrace_hash *old_hash;
struct ftrace_hash *hash;
- struct ftrace_page *pg;
- struct dyn_ftrace *rec;
- int not;
- unsigned long key;
int count = 0;
+ int size;
int ret;
+ int i;
- func_g.type = filter_parse_regex(glob, strlen(glob),
- &func_g.search, &not);
- func_g.len = strlen(func_g.search);
-
- /* we do not support '!' for function probes */
- if (WARN_ON(not))
+ if (WARN_ON(!tr))
return -EINVAL;
- mutex_lock(&trace_probe_ops.func_hash->regex_lock);
+ /* We do not support '!' for function probes */
+ if (WARN_ON(glob[0] == '!'))
+ return -EINVAL;
- old_hash_ops.filter_hash = old_hash;
- /* Probes only have filters */
- old_hash_ops.notrace_hash = NULL;
- hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash);
- if (!hash) {
- count = -ENOMEM;
- goto out;
+ mutex_lock(&ftrace_lock);
+ /* Check if the probe_ops is already registered */
+ list_for_each_entry(probe, &tr->func_probes, list) {
+ if (probe->probe_ops == probe_ops)
+ break;
}
-
- if (unlikely(ftrace_disabled)) {
- count = -ENODEV;
- goto out;
+ if (&probe->list == &tr->func_probes) {
+ probe = kzalloc(sizeof(*probe), GFP_KERNEL);
+ if (!probe) {
+ mutex_unlock(&ftrace_lock);
+ return -ENOMEM;
+ }
+ probe->probe_ops = probe_ops;
+ probe->ops.func = function_trace_probe_call;
+ probe->tr = tr;
+ ftrace_ops_init(&probe->ops);
+ list_add(&probe->list, &tr->func_probes);
}
- mutex_lock(&ftrace_lock);
+ acquire_probe_locked(probe);
- do_for_each_ftrace_rec(pg, rec) {
+ mutex_unlock(&ftrace_lock);
- if (rec->flags & FTRACE_FL_DISABLED)
- continue;
+ mutex_lock(&probe->ops.func_hash->regex_lock);
- if (!ftrace_match_record(rec, &func_g, NULL, 0))
- continue;
+ orig_hash = &probe->ops.func_hash->filter_hash;
+ old_hash = *orig_hash;
+ hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash);
- entry = kmalloc(sizeof(*entry), GFP_KERNEL);
- if (!entry) {
- /* If we did not process any, then return error */
- if (!count)
- count = -ENOMEM;
- goto out_unlock;
- }
+ ret = ftrace_match_records(hash, glob, strlen(glob));
- count++;
+ /* Nothing found? */
+ if (!ret)
+ ret = -EINVAL;
- entry->data = data;
+ if (ret < 0)
+ goto out;
- /*
- * The caller might want to do something special
- * for each function we find. We call the callback
- * to give the caller an opportunity to do so.
- */
- if (ops->init) {
- if (ops->init(ops, rec->ip, &entry->data) < 0) {
- /* caller does not like this func */
- kfree(entry);
+ size = 1 << hash->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
+ if (ftrace_lookup_ip(old_hash, entry->ip))
continue;
+ /*
+ * The caller might want to do something special
+ * for each function we find. We call the callback
+ * to give the caller an opportunity to do so.
+ */
+ if (probe_ops->init) {
+ ret = probe_ops->init(probe_ops, tr,
+ entry->ip, data,
+ &probe->data);
+ if (ret < 0) {
+ if (probe_ops->free && count)
+ probe_ops->free(probe_ops, tr,
+ 0, probe->data);
+ probe->data = NULL;
+ goto out;
+ }
}
+ count++;
}
+ }
- ret = enter_record(hash, rec, 0);
- if (ret < 0) {
- kfree(entry);
- count = ret;
- goto out_unlock;
- }
-
- entry->ops = ops;
- entry->ip = rec->ip;
-
- key = hash_long(entry->ip, FTRACE_HASH_BITS);
- hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]);
+ mutex_lock(&ftrace_lock);
- } while_for_each_ftrace_rec();
+ if (!count) {
+ /* Nothing was added? */
+ ret = -EINVAL;
+ goto out_unlock;
+ }
- ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
+ ret = ftrace_hash_move_and_update_ops(&probe->ops, orig_hash,
+ hash, 1);
+ if (ret < 0)
+ goto err_unlock;
- __enable_ftrace_function_probe(&old_hash_ops);
+ /* One ref for each new function traced */
+ probe->ref += count;
- if (!ret)
- free_ftrace_hash_rcu(old_hash);
- else
- count = ret;
+ if (!(probe->ops.flags & FTRACE_OPS_FL_ENABLED))
+ ret = ftrace_startup(&probe->ops, 0);
out_unlock:
mutex_unlock(&ftrace_lock);
+
+ if (!ret)
+ ret = count;
out:
- mutex_unlock(&trace_probe_ops.func_hash->regex_lock);
+ mutex_unlock(&probe->ops.func_hash->regex_lock);
free_ftrace_hash(hash);
- return count;
-}
+ release_probe(probe);
-enum {
- PROBE_TEST_FUNC = 1,
- PROBE_TEST_DATA = 2
-};
+ return ret;
-static void
-__unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
- void *data, int flags)
+ err_unlock:
+ if (!probe_ops->free || !count)
+ goto out_unlock;
+
+ /* Failed to do the move, need to call the free functions */
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
+ if (ftrace_lookup_ip(old_hash, entry->ip))
+ continue;
+ probe_ops->free(probe_ops, tr, entry->ip, probe->data);
+ }
+ }
+ goto out_unlock;
+}
+
+int
+unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr,
+ struct ftrace_probe_ops *probe_ops)
{
- struct ftrace_func_entry *rec_entry;
- struct ftrace_func_probe *entry;
- struct ftrace_func_probe *p;
+ struct ftrace_ops_hash old_hash_ops;
+ struct ftrace_func_entry *entry;
+ struct ftrace_func_probe *probe;
struct ftrace_glob func_g;
- struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash;
- struct ftrace_hash *old_hash = *orig_hash;
- struct list_head free_list;
- struct ftrace_hash *hash;
+ struct ftrace_hash **orig_hash;
+ struct ftrace_hash *old_hash;
+ struct ftrace_hash *hash = NULL;
struct hlist_node *tmp;
+ struct hlist_head hhd;
char str[KSYM_SYMBOL_LEN];
- int i, ret;
+ int count = 0;
+ int i, ret = -ENODEV;
+ int size;
- if (glob && (strcmp(glob, "*") == 0 || !strlen(glob)))
+ if (!glob || !strlen(glob) || !strcmp(glob, "*"))
func_g.search = NULL;
- else if (glob) {
+ else {
int not;
func_g.type = filter_parse_regex(glob, strlen(glob),
@@ -3925,86 +4156,112 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
/* we do not support '!' for function probes */
if (WARN_ON(not))
- return;
+ return -EINVAL;
}
- mutex_lock(&trace_probe_ops.func_hash->regex_lock);
+ mutex_lock(&ftrace_lock);
+ /* Check if the probe_ops is already registered */
+ list_for_each_entry(probe, &tr->func_probes, list) {
+ if (probe->probe_ops == probe_ops)
+ break;
+ }
+ if (&probe->list == &tr->func_probes)
+ goto err_unlock_ftrace;
- hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
- if (!hash)
- /* Hmm, should report this somehow */
- goto out_unlock;
+ ret = -EINVAL;
+ if (!(probe->ops.flags & FTRACE_OPS_FL_INITIALIZED))
+ goto err_unlock_ftrace;
- INIT_LIST_HEAD(&free_list);
+ acquire_probe_locked(probe);
- for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
- struct hlist_head *hhd = &ftrace_func_hash[i];
+ mutex_unlock(&ftrace_lock);
- hlist_for_each_entry_safe(entry, tmp, hhd, node) {
+ mutex_lock(&probe->ops.func_hash->regex_lock);
- /* break up if statements for readability */
- if ((flags & PROBE_TEST_FUNC) && entry->ops != ops)
- continue;
+ orig_hash = &probe->ops.func_hash->filter_hash;
+ old_hash = *orig_hash;
- if ((flags & PROBE_TEST_DATA) && entry->data != data)
- continue;
+ if (ftrace_hash_empty(old_hash))
+ goto out_unlock;
+
+ old_hash_ops.filter_hash = old_hash;
+ /* Probes only have filters */
+ old_hash_ops.notrace_hash = NULL;
+
+ ret = -ENOMEM;
+ hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash);
+ if (!hash)
+ goto out_unlock;
+
+ INIT_HLIST_HEAD(&hhd);
+
+ size = 1 << hash->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry_safe(entry, tmp, &hash->buckets[i], hlist) {
- /* do this last, since it is the most expensive */
if (func_g.search) {
kallsyms_lookup(entry->ip, NULL, NULL,
NULL, str);
if (!ftrace_match(str, &func_g))
continue;
}
-
- rec_entry = ftrace_lookup_ip(hash, entry->ip);
- /* It is possible more than one entry had this ip */
- if (rec_entry)
- free_hash_entry(hash, rec_entry);
-
- hlist_del_rcu(&entry->node);
- list_add(&entry->free_list, &free_list);
+ count++;
+ remove_hash_entry(hash, entry);
+ hlist_add_head(&entry->hlist, &hhd);
}
}
+
+ /* Nothing found? */
+ if (!count) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
mutex_lock(&ftrace_lock);
- __disable_ftrace_function_probe();
- /*
- * Remove after the disable is called. Otherwise, if the last
- * probe is removed, a null hash means *all enabled*.
- */
- ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
+
+ WARN_ON(probe->ref < count);
+
+ probe->ref -= count;
+
+ if (ftrace_hash_empty(hash))
+ ftrace_shutdown(&probe->ops, 0);
+
+ ret = ftrace_hash_move_and_update_ops(&probe->ops, orig_hash,
+ hash, 1);
+
+ /* still need to update the function call sites */
+ if (ftrace_enabled && !ftrace_hash_empty(hash))
+ ftrace_run_modify_code(&probe->ops, FTRACE_UPDATE_CALLS,
+ &old_hash_ops);
synchronize_sched();
- if (!ret)
- free_ftrace_hash_rcu(old_hash);
- list_for_each_entry_safe(entry, p, &free_list, free_list) {
- list_del(&entry->free_list);
- ftrace_free_entry(entry);
+ hlist_for_each_entry_safe(entry, tmp, &hhd, hlist) {
+ hlist_del(&entry->hlist);
+ if (probe_ops->free)
+ probe_ops->free(probe_ops, tr, entry->ip, probe->data);
+ kfree(entry);
}
mutex_unlock(&ftrace_lock);
out_unlock:
- mutex_unlock(&trace_probe_ops.func_hash->regex_lock);
+ mutex_unlock(&probe->ops.func_hash->regex_lock);
free_ftrace_hash(hash);
-}
-void
-unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
- void *data)
-{
- __unregister_ftrace_function_probe(glob, ops, data,
- PROBE_TEST_FUNC | PROBE_TEST_DATA);
-}
+ release_probe(probe);
-void
-unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops)
-{
- __unregister_ftrace_function_probe(glob, ops, NULL, PROBE_TEST_FUNC);
+ return ret;
+
+ err_unlock_ftrace:
+ mutex_unlock(&ftrace_lock);
+ return ret;
}
-void unregister_ftrace_function_probe_all(char *glob)
+void clear_ftrace_function_probes(struct trace_array *tr)
{
- __unregister_ftrace_function_probe(glob, NULL, NULL, 0);
+ struct ftrace_func_probe *probe, *n;
+
+ list_for_each_entry_safe(probe, n, &tr->func_probes, list)
+ unregister_ftrace_function_probe_func(NULL, tr, probe->probe_ops);
}
static LIST_HEAD(ftrace_commands);
@@ -4056,9 +4313,11 @@ __init int unregister_ftrace_command(struct ftrace_func_command *cmd)
return ret;
}
-static int ftrace_process_regex(struct ftrace_hash *hash,
+static int ftrace_process_regex(struct ftrace_iterator *iter,
char *buff, int len, int enable)
{
+ struct ftrace_hash *hash = iter->hash;
+ struct trace_array *tr = iter->ops->private;
char *func, *command, *next = buff;
struct ftrace_func_command *p;
int ret = -EINVAL;
@@ -4078,10 +4337,13 @@ static int ftrace_process_regex(struct ftrace_hash *hash,
command = strsep(&next, ":");
+ if (WARN_ON_ONCE(!tr))
+ return -EINVAL;
+
mutex_lock(&ftrace_cmd_mutex);
list_for_each_entry(p, &ftrace_commands, list) {
if (strcmp(p->name, command) == 0) {
- ret = p->func(hash, func, command, next, enable);
+ ret = p->func(tr, hash, func, command, next, enable);
goto out_unlock;
}
}
@@ -4118,7 +4380,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
if (read >= 0 && trace_parser_loaded(parser) &&
!trace_parser_cont(parser)) {
- ret = ftrace_process_regex(iter->hash, parser->buffer,
+ ret = ftrace_process_regex(iter, parser->buffer,
parser->idx, enable);
trace_parser_clear(parser);
if (ret < 0)
@@ -4163,44 +4425,11 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
return add_hash_entry(hash, ip);
}
-static void ftrace_ops_update_code(struct ftrace_ops *ops,
- struct ftrace_ops_hash *old_hash)
-{
- struct ftrace_ops *op;
-
- if (!ftrace_enabled)
- return;
-
- if (ops->flags & FTRACE_OPS_FL_ENABLED) {
- ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash);
- return;
- }
-
- /*
- * If this is the shared global_ops filter, then we need to
- * check if there is another ops that shares it, is enabled.
- * If so, we still need to run the modify code.
- */
- if (ops->func_hash != &global_ops.local_hash)
- return;
-
- do_for_each_ftrace_op(op, ftrace_ops_list) {
- if (op->func_hash == &global_ops.local_hash &&
- op->flags & FTRACE_OPS_FL_ENABLED) {
- ftrace_run_modify_code(op, FTRACE_UPDATE_CALLS, old_hash);
- /* Only need to do this once */
- return;
- }
- } while_for_each_ftrace_op(op);
-}
-
static int
ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
unsigned long ip, int remove, int reset, int enable)
{
struct ftrace_hash **orig_hash;
- struct ftrace_ops_hash old_hash_ops;
- struct ftrace_hash *old_hash;
struct ftrace_hash *hash;
int ret;
@@ -4235,14 +4464,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
}
mutex_lock(&ftrace_lock);
- old_hash = *orig_hash;
- old_hash_ops.filter_hash = ops->func_hash->filter_hash;
- old_hash_ops.notrace_hash = ops->func_hash->notrace_hash;
- ret = ftrace_hash_move(ops, enable, orig_hash, hash);
- if (!ret) {
- ftrace_ops_update_code(ops, &old_hash_ops);
- free_ftrace_hash_rcu(old_hash);
- }
+ ret = ftrace_hash_move_and_update_ops(ops, orig_hash, hash, enable);
mutex_unlock(&ftrace_lock);
out_regex_unlock:
@@ -4481,10 +4703,8 @@ static void __init set_ftrace_early_filters(void)
int ftrace_regex_release(struct inode *inode, struct file *file)
{
struct seq_file *m = (struct seq_file *)file->private_data;
- struct ftrace_ops_hash old_hash_ops;
struct ftrace_iterator *iter;
struct ftrace_hash **orig_hash;
- struct ftrace_hash *old_hash;
struct trace_parser *parser;
int filter_hash;
int ret;
@@ -4514,16 +4734,12 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
orig_hash = &iter->ops->func_hash->notrace_hash;
mutex_lock(&ftrace_lock);
- old_hash = *orig_hash;
- old_hash_ops.filter_hash = iter->ops->func_hash->filter_hash;
- old_hash_ops.notrace_hash = iter->ops->func_hash->notrace_hash;
- ret = ftrace_hash_move(iter->ops, filter_hash,
- orig_hash, iter->hash);
- if (!ret) {
- ftrace_ops_update_code(iter->ops, &old_hash_ops);
- free_ftrace_hash_rcu(old_hash);
- }
+ ret = ftrace_hash_move_and_update_ops(iter->ops, orig_hash,
+ iter->hash, filter_hash);
mutex_unlock(&ftrace_lock);
+ } else {
+ /* For read only, the hash is the ops hash */
+ iter->hash = NULL;
}
mutex_unlock(&iter->ops->func_hash->regex_lock);
@@ -4847,7 +5063,7 @@ ftrace_graph_release(struct inode *inode, struct file *file)
}
out:
- kfree(fgd->new_hash);
+ free_ftrace_hash(fgd->new_hash);
kfree(fgd);
return ret;
@@ -5262,6 +5478,50 @@ void ftrace_module_init(struct module *mod)
}
#endif /* CONFIG_MODULES */
+void __init ftrace_free_init_mem(void)
+{
+ unsigned long start = (unsigned long)(&__init_begin);
+ unsigned long end = (unsigned long)(&__init_end);
+ struct ftrace_page **last_pg = &ftrace_pages_start;
+ struct ftrace_page *pg;
+ struct dyn_ftrace *rec;
+ struct dyn_ftrace key;
+ int order;
+
+ key.ip = start;
+ key.flags = end; /* overload flags, as it is unsigned long */
+
+ mutex_lock(&ftrace_lock);
+
+ for (pg = ftrace_pages_start; pg; last_pg = &pg->next, pg = *last_pg) {
+ if (end < pg->records[0].ip ||
+ start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE))
+ continue;
+ again:
+ rec = bsearch(&key, pg->records, pg->index,
+ sizeof(struct dyn_ftrace),
+ ftrace_cmp_recs);
+ if (!rec)
+ continue;
+ pg->index--;
+ if (!pg->index) {
+ *last_pg = pg->next;
+ order = get_count_order(pg->size / ENTRIES_PER_PAGE);
+ free_pages((unsigned long)pg->records, order);
+ kfree(pg);
+ pg = container_of(last_pg, struct ftrace_page, next);
+ if (!(*last_pg))
+ ftrace_pages = pg;
+ continue;
+ }
+ memmove(rec, rec + 1,
+ (pg->index - (rec - pg->records)) * sizeof(*rec));
+ /* More than one function may be in this block */
+ goto again;
+ }
+ mutex_unlock(&ftrace_lock);
+}
+
void __init ftrace_init(void)
{
extern unsigned long __start_mcount_loc[];
@@ -5304,25 +5564,13 @@ void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops)
static void ftrace_update_trampoline(struct ftrace_ops *ops)
{
-
-/*
- * Currently there's no safe way to free a trampoline when the kernel
- * is configured with PREEMPT. That is because a task could be preempted
- * when it jumped to the trampoline, it may be preempted for a long time
- * depending on the system load, and currently there's no way to know
- * when it will be off the trampoline. If the trampoline is freed
- * too early, when the task runs again, it will be executing on freed
- * memory and crash.
- */
-#ifdef CONFIG_PREEMPT
- /* Currently, only non dynamic ops can have a trampoline */
- if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
- return;
-#endif
-
arch_ftrace_update_trampoline(ops);
}
+void ftrace_init_trace_array(struct trace_array *tr)
+{
+ INIT_LIST_HEAD(&tr->func_probes);
+}
#else
static struct ftrace_ops global_ops = {
@@ -5377,6 +5625,7 @@ __init void ftrace_init_global_array_ops(struct trace_array *tr)
{
tr->ops = &global_ops;
tr->ops->private = tr;
+ ftrace_init_trace_array(tr);
}
void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func)
@@ -5531,6 +5780,43 @@ ftrace_filter_pid_sched_switch_probe(void *data, bool preempt,
trace_ignore_this_task(pid_list, next));
}
+static void
+ftrace_pid_follow_sched_process_fork(void *data,
+ struct task_struct *self,
+ struct task_struct *task)
+{
+ struct trace_pid_list *pid_list;
+ struct trace_array *tr = data;
+
+ pid_list = rcu_dereference_sched(tr->function_pids);
+ trace_filter_add_remove_task(pid_list, self, task);
+}
+
+static void
+ftrace_pid_follow_sched_process_exit(void *data, struct task_struct *task)
+{
+ struct trace_pid_list *pid_list;
+ struct trace_array *tr = data;
+
+ pid_list = rcu_dereference_sched(tr->function_pids);
+ trace_filter_add_remove_task(pid_list, NULL, task);
+}
+
+void ftrace_pid_follow_fork(struct trace_array *tr, bool enable)
+{
+ if (enable) {
+ register_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork,
+ tr);
+ register_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit,
+ tr);
+ } else {
+ unregister_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork,
+ tr);
+ unregister_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit,
+ tr);
+ }
+}
+
static void clear_ftrace_pids(struct trace_array *tr)
{
struct trace_pid_list *pid_list;
@@ -5554,6 +5840,15 @@ static void clear_ftrace_pids(struct trace_array *tr)
trace_free_pid_list(pid_list);
}
+void ftrace_clear_pids(struct trace_array *tr)
+{
+ mutex_lock(&ftrace_lock);
+
+ clear_ftrace_pids(tr);
+
+ mutex_unlock(&ftrace_lock);
+}
+
static void ftrace_pid_reset(struct trace_array *tr)
{
mutex_lock(&ftrace_lock);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 96fc3c043ad6..4ae268e687fe 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -438,6 +438,7 @@ struct ring_buffer_per_cpu {
raw_spinlock_t reader_lock; /* serialize readers */
arch_spinlock_t lock;
struct lock_class_key lock_key;
+ struct buffer_data_page *free_page;
unsigned long nr_pages;
unsigned int current_context;
struct list_head *pages;
@@ -3405,11 +3406,23 @@ EXPORT_SYMBOL_GPL(ring_buffer_iter_reset);
int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
{
struct ring_buffer_per_cpu *cpu_buffer;
+ struct buffer_page *reader;
+ struct buffer_page *head_page;
+ struct buffer_page *commit_page;
+ unsigned commit;
cpu_buffer = iter->cpu_buffer;
- return iter->head_page == cpu_buffer->commit_page &&
- iter->head == rb_commit_index(cpu_buffer);
+ /* Remember, trace recording is off when iterator is in use */
+ reader = cpu_buffer->reader_page;
+ head_page = cpu_buffer->head_page;
+ commit_page = cpu_buffer->commit_page;
+ commit = rb_page_commit(commit_page);
+
+ return ((iter->head_page == commit_page && iter->head == commit) ||
+ (iter->head_page == reader && commit_page == head_page &&
+ head_page->read == commit &&
+ iter->head == rb_page_commit(cpu_buffer->reader_page)));
}
EXPORT_SYMBOL_GPL(ring_buffer_iter_empty);
@@ -4377,9 +4390,25 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
*/
void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu)
{
- struct buffer_data_page *bpage;
+ struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+ struct buffer_data_page *bpage = NULL;
+ unsigned long flags;
struct page *page;
+ local_irq_save(flags);
+ arch_spin_lock(&cpu_buffer->lock);
+
+ if (cpu_buffer->free_page) {
+ bpage = cpu_buffer->free_page;
+ cpu_buffer->free_page = NULL;
+ }
+
+ arch_spin_unlock(&cpu_buffer->lock);
+ local_irq_restore(flags);
+
+ if (bpage)
+ goto out;
+
page = alloc_pages_node(cpu_to_node(cpu),
GFP_KERNEL | __GFP_NORETRY, 0);
if (!page)
@@ -4387,6 +4416,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu)
bpage = page_address(page);
+ out:
rb_init_page(bpage);
return bpage;
@@ -4396,13 +4426,29 @@ EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);
/**
* ring_buffer_free_read_page - free an allocated read page
* @buffer: the buffer the page was allocate for
+ * @cpu: the cpu buffer the page came from
* @data: the page to free
*
* Free a page allocated from ring_buffer_alloc_read_page.
*/
-void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
+void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data)
{
- free_page((unsigned long)data);
+ struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+ struct buffer_data_page *bpage = data;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ arch_spin_lock(&cpu_buffer->lock);
+
+ if (!cpu_buffer->free_page) {
+ cpu_buffer->free_page = bpage;
+ bpage = NULL;
+ }
+
+ arch_spin_unlock(&cpu_buffer->lock);
+ local_irq_restore(flags);
+
+ free_page((unsigned long)bpage);
}
EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
@@ -4826,9 +4872,9 @@ static __init int test_ringbuffer(void)
rb_data[cpu].cnt = cpu;
rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu],
"rbtester/%d", cpu);
- if (WARN_ON(!rb_threads[cpu])) {
+ if (WARN_ON(IS_ERR(rb_threads[cpu]))) {
pr_cont("FAILED\n");
- ret = -1;
+ ret = PTR_ERR(rb_threads[cpu]);
goto out_free;
}
@@ -4838,9 +4884,9 @@ static __init int test_ringbuffer(void)
/* Now create the rb hammer! */
rb_hammer = kthread_run(rb_hammer_test, NULL, "rbhammer");
- if (WARN_ON(!rb_hammer)) {
+ if (WARN_ON(IS_ERR(rb_hammer))) {
pr_cont("FAILED\n");
- ret = -1;
+ ret = PTR_ERR(rb_hammer);
goto out_free;
}
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index c190a4d5013c..9fbcaf567886 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -171,7 +171,7 @@ static enum event_status read_page(int cpu)
}
}
}
- ring_buffer_free_read_page(buffer, bpage);
+ ring_buffer_free_read_page(buffer, cpu, bpage);
if (ret < 0)
return EVENT_DROPPED;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f35109514a01..1122f151466f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -257,7 +257,7 @@ unsigned long long ns2usecs(u64 nsec)
/* trace_flags that are default zero for instances */
#define ZEROED_TRACE_FLAGS \
- TRACE_ITER_EVENT_FORK
+ (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK)
/*
* The global_trace is the descriptor that holds the top-level tracing
@@ -757,7 +757,7 @@ __trace_buffer_lock_reserve(struct ring_buffer *buffer,
return event;
}
-static void tracer_tracing_on(struct trace_array *tr)
+void tracer_tracing_on(struct trace_array *tr)
{
if (tr->trace_buffer.buffer)
ring_buffer_record_on(tr->trace_buffer.buffer);
@@ -894,23 +894,8 @@ int __trace_bputs(unsigned long ip, const char *str)
EXPORT_SYMBOL_GPL(__trace_bputs);
#ifdef CONFIG_TRACER_SNAPSHOT
-/**
- * trace_snapshot - take a snapshot of the current buffer.
- *
- * This causes a swap between the snapshot buffer and the current live
- * tracing buffer. You can use this to take snapshots of the live
- * trace when some condition is triggered, but continue to trace.
- *
- * Note, make sure to allocate the snapshot with either
- * a tracing_snapshot_alloc(), or by doing it manually
- * with: echo 1 > /sys/kernel/debug/tracing/snapshot
- *
- * If the snapshot buffer is not allocated, it will stop tracing.
- * Basically making a permanent snapshot.
- */
-void tracing_snapshot(void)
+static void tracing_snapshot_instance(struct trace_array *tr)
{
- struct trace_array *tr = &global_trace;
struct tracer *tracer = tr->current_trace;
unsigned long flags;
@@ -938,6 +923,27 @@ void tracing_snapshot(void)
update_max_tr(tr, current, smp_processor_id());
local_irq_restore(flags);
}
+
+/**
+ * trace_snapshot - take a snapshot of the current buffer.
+ *
+ * This causes a swap between the snapshot buffer and the current live
+ * tracing buffer. You can use this to take snapshots of the live
+ * trace when some condition is triggered, but continue to trace.
+ *
+ * Note, make sure to allocate the snapshot with either
+ * a tracing_snapshot_alloc(), or by doing it manually
+ * with: echo 1 > /sys/kernel/debug/tracing/snapshot
+ *
+ * If the snapshot buffer is not allocated, it will stop tracing.
+ * Basically making a permanent snapshot.
+ */
+void tracing_snapshot(void)
+{
+ struct trace_array *tr = &global_trace;
+
+ tracing_snapshot_instance(tr);
+}
EXPORT_SYMBOL_GPL(tracing_snapshot);
static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf,
@@ -1039,7 +1045,7 @@ void tracing_snapshot_alloc(void)
EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
#endif /* CONFIG_TRACER_SNAPSHOT */
-static void tracer_tracing_off(struct trace_array *tr)
+void tracer_tracing_off(struct trace_array *tr)
{
if (tr->trace_buffer.buffer)
ring_buffer_record_off(tr->trace_buffer.buffer);
@@ -1424,6 +1430,28 @@ static int wait_on_pipe(struct trace_iterator *iter, bool full)
}
#ifdef CONFIG_FTRACE_STARTUP_TEST
+static bool selftests_can_run;
+
+struct trace_selftests {
+ struct list_head list;
+ struct tracer *type;
+};
+
+static LIST_HEAD(postponed_selftests);
+
+static int save_selftest(struct tracer *type)
+{
+ struct trace_selftests *selftest;
+
+ selftest = kmalloc(sizeof(*selftest), GFP_KERNEL);
+ if (!selftest)
+ return -ENOMEM;
+
+ selftest->type = type;
+ list_add(&selftest->list, &postponed_selftests);
+ return 0;
+}
+
static int run_tracer_selftest(struct tracer *type)
{
struct trace_array *tr = &global_trace;
@@ -1434,6 +1462,14 @@ static int run_tracer_selftest(struct tracer *type)
return 0;
/*
+ * If a tracer registers early in boot up (before scheduling is
+ * initialized and such), then do not run its selftests yet.
+ * Instead, run it a little later in the boot process.
+ */
+ if (!selftests_can_run)
+ return save_selftest(type);
+
+ /*
* Run a selftest on this tracer.
* Here we reset the trace buffer, and set the current
* tracer to be this tracer. The tracer can then run some
@@ -1482,6 +1518,47 @@ static int run_tracer_selftest(struct tracer *type)
printk(KERN_CONT "PASSED\n");
return 0;
}
+
+static __init int init_trace_selftests(void)
+{
+ struct trace_selftests *p, *n;
+ struct tracer *t, **last;
+ int ret;
+
+ selftests_can_run = true;
+
+ mutex_lock(&trace_types_lock);
+
+ if (list_empty(&postponed_selftests))
+ goto out;
+
+ pr_info("Running postponed tracer tests:\n");
+
+ list_for_each_entry_safe(p, n, &postponed_selftests, list) {
+ ret = run_tracer_selftest(p->type);
+ /* If the test fails, then warn and remove from available_tracers */
+ if (ret < 0) {
+ WARN(1, "tracer: %s failed selftest, disabling\n",
+ p->type->name);
+ last = &trace_types;
+ for (t = trace_types; t; t = t->next) {
+ if (t == p->type) {
+ *last = t->next;
+ break;
+ }
+ last = &t->next;
+ }
+ }
+ list_del(&p->list);
+ kfree(p);
+ }
+
+ out:
+ mutex_unlock(&trace_types_lock);
+
+ return 0;
+}
+core_initcall(init_trace_selftests);
#else
static inline int run_tracer_selftest(struct tracer *type)
{
@@ -1899,7 +1976,7 @@ static void __trace_find_cmdline(int pid, char comm[])
map = savedcmd->map_pid_to_cmdline[pid];
if (map != NO_CMDLINE_MAP)
- strcpy(comm, get_saved_cmdlines(map));
+ strlcpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN);
else
strcpy(comm, "<...>");
}
@@ -1927,6 +2004,18 @@ void tracing_record_cmdline(struct task_struct *tsk)
__this_cpu_write(trace_cmdline_save, false);
}
+/*
+ * Several functions return TRACE_TYPE_PARTIAL_LINE if the trace_seq
+ * overflowed, and TRACE_TYPE_HANDLED otherwise. This helper function
+ * simplifies those functions and keeps them in sync.
+ */
+enum print_line_t trace_handle_return(struct trace_seq *s)
+{
+ return trace_seq_has_overflowed(s) ?
+ TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED;
+}
+EXPORT_SYMBOL_GPL(trace_handle_return);
+
void
tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
int pc)
@@ -2479,7 +2568,36 @@ static inline void ftrace_trace_stack(struct trace_array *tr,
void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
int pc)
{
- __ftrace_trace_stack(tr->trace_buffer.buffer, flags, skip, pc, NULL);
+ struct ring_buffer *buffer = tr->trace_buffer.buffer;
+
+ if (rcu_is_watching()) {
+ __ftrace_trace_stack(buffer, flags, skip, pc, NULL);
+ return;
+ }
+
+ /*
+ * When an NMI triggers, RCU is enabled via rcu_nmi_enter(),
+ * but if the above rcu_is_watching() failed, then the NMI
+ * triggered someplace critical, and rcu_irq_enter() should
+ * not be called from NMI.
+ */
+ if (unlikely(in_nmi()))
+ return;
+
+ /*
+ * It is possible that a function is being traced in a
+ * location that RCU is not watching. A call to
+ * rcu_irq_enter() will make sure that it is, but there's
+ * a few internal rcu functions that could be traced
+ * where that wont work either. In those cases, we just
+ * do nothing.
+ */
+ if (unlikely(rcu_irq_enter_disabled()))
+ return;
+
+ rcu_irq_enter_irqson();
+ __ftrace_trace_stack(buffer, flags, skip, pc, NULL);
+ rcu_irq_exit_irqson();
}
/**
@@ -3222,13 +3340,14 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
if (!(iter->iter_flags & TRACE_FILE_ANNOTATE))
return;
- if (iter->started && cpumask_test_cpu(iter->cpu, iter->started))
+ if (cpumask_available(iter->started) &&
+ cpumask_test_cpu(iter->cpu, iter->started))
return;
if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries)
return;
- if (iter->started)
+ if (cpumask_available(iter->started))
cpumask_set_cpu(iter->cpu, iter->started);
/* Don't print started cpu buffer for the first entry of the trace */
@@ -4122,6 +4241,9 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
if (mask == TRACE_ITER_EVENT_FORK)
trace_event_follow_fork(tr, enabled);
+ if (mask == TRACE_ITER_FUNC_FORK)
+ ftrace_pid_follow_fork(tr, enabled);
+
if (mask == TRACE_ITER_OVERWRITE) {
ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled);
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -4355,6 +4477,7 @@ static const char readme_msg[] =
"\t -:[<group>/]<event>\n"
#ifdef CONFIG_KPROBE_EVENTS
"\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n"
+ "place (kretprobe): [<module>:]<symbol>[+<offset>]|<memaddr>\n"
#endif
#ifdef CONFIG_UPROBE_EVENTS
"\t place: <path>:<offset>\n"
@@ -5529,7 +5652,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
.partial = partial_def,
.nr_pages = 0, /* This gets updated below. */
.nr_pages_max = PIPE_DEF_BUFFERS,
- .flags = flags,
.ops = &tracing_pipe_buf_ops,
.spd_release = tracing_spd_release_pipe,
};
@@ -5962,6 +6084,7 @@ static int tracing_clock_open(struct inode *inode, struct file *file)
struct ftrace_buffer_info {
struct trace_iterator iter;
void *spare;
+ unsigned int spare_cpu;
unsigned int read;
};
@@ -6291,9 +6414,11 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
return -EBUSY;
#endif
- if (!info->spare)
+ if (!info->spare) {
info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer,
iter->cpu_file);
+ info->spare_cpu = iter->cpu_file;
+ }
if (!info->spare)
return -ENOMEM;
@@ -6353,7 +6478,8 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
__trace_array_put(iter->tr);
if (info->spare)
- ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare);
+ ring_buffer_free_read_page(iter->trace_buffer->buffer,
+ info->spare_cpu, info->spare);
kfree(info);
mutex_unlock(&trace_types_lock);
@@ -6364,6 +6490,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
struct buffer_ref {
struct ring_buffer *buffer;
void *page;
+ int cpu;
int ref;
};
@@ -6375,7 +6502,7 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,
if (--ref->ref)
return;
- ring_buffer_free_read_page(ref->buffer, ref->page);
+ ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page);
kfree(ref);
buf->private = 0;
}
@@ -6409,7 +6536,7 @@ static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i)
if (--ref->ref)
return;
- ring_buffer_free_read_page(ref->buffer, ref->page);
+ ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page);
kfree(ref);
spd->partial[i].private = 0;
}
@@ -6427,7 +6554,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
.pages = pages_def,
.partial = partial_def,
.nr_pages_max = PIPE_DEF_BUFFERS,
- .flags = flags,
.ops = &buffer_pipe_buf_ops,
.spd_release = buffer_spd_release,
};
@@ -6474,11 +6600,13 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
kfree(ref);
break;
}
+ ref->cpu = iter->cpu_file;
r = ring_buffer_read_page(ref->buffer, &ref->page,
len, iter->cpu_file, 1);
if (r < 0) {
- ring_buffer_free_read_page(ref->buffer, ref->page);
+ ring_buffer_free_read_page(ref->buffer, ref->cpu,
+ ref->page);
kfree(ref);
break;
}
@@ -6649,43 +6777,89 @@ static const struct file_operations tracing_dyn_info_fops = {
#if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE)
static void
-ftrace_snapshot(unsigned long ip, unsigned long parent_ip, void **data)
+ftrace_snapshot(unsigned long ip, unsigned long parent_ip,
+ struct trace_array *tr, struct ftrace_probe_ops *ops,
+ void *data)
{
- tracing_snapshot();
+ tracing_snapshot_instance(tr);
}
static void
-ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, void **data)
+ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip,
+ struct trace_array *tr, struct ftrace_probe_ops *ops,
+ void *data)
{
- unsigned long *count = (long *)data;
+ struct ftrace_func_mapper *mapper = data;
+ long *count = NULL;
- if (!*count)
- return;
+ if (mapper)
+ count = (long *)ftrace_func_mapper_find_ip(mapper, ip);
+
+ if (count) {
+
+ if (*count <= 0)
+ return;
- if (*count != -1)
(*count)--;
+ }
- tracing_snapshot();
+ tracing_snapshot_instance(tr);
}
static int
ftrace_snapshot_print(struct seq_file *m, unsigned long ip,
struct ftrace_probe_ops *ops, void *data)
{
- long count = (long)data;
+ struct ftrace_func_mapper *mapper = data;
+ long *count = NULL;
seq_printf(m, "%ps:", (void *)ip);
seq_puts(m, "snapshot");
- if (count == -1)
- seq_puts(m, ":unlimited\n");
+ if (mapper)
+ count = (long *)ftrace_func_mapper_find_ip(mapper, ip);
+
+ if (count)
+ seq_printf(m, ":count=%ld\n", *count);
else
- seq_printf(m, ":count=%ld\n", count);
+ seq_puts(m, ":unlimited\n");
return 0;
}
+static int
+ftrace_snapshot_init(struct ftrace_probe_ops *ops, struct trace_array *tr,
+ unsigned long ip, void *init_data, void **data)
+{
+ struct ftrace_func_mapper *mapper = *data;
+
+ if (!mapper) {
+ mapper = allocate_ftrace_func_mapper();
+ if (!mapper)
+ return -ENOMEM;
+ *data = mapper;
+ }
+
+ return ftrace_func_mapper_add_ip(mapper, ip, init_data);
+}
+
+static void
+ftrace_snapshot_free(struct ftrace_probe_ops *ops, struct trace_array *tr,
+ unsigned long ip, void *data)
+{
+ struct ftrace_func_mapper *mapper = data;
+
+ if (!ip) {
+ if (!mapper)
+ return;
+ free_ftrace_func_mapper(mapper, NULL);
+ return;
+ }
+
+ ftrace_func_mapper_remove_ip(mapper, ip);
+}
+
static struct ftrace_probe_ops snapshot_probe_ops = {
.func = ftrace_snapshot,
.print = ftrace_snapshot_print,
@@ -6694,10 +6868,12 @@ static struct ftrace_probe_ops snapshot_probe_ops = {
static struct ftrace_probe_ops snapshot_count_probe_ops = {
.func = ftrace_count_snapshot,
.print = ftrace_snapshot_print,
+ .init = ftrace_snapshot_init,
+ .free = ftrace_snapshot_free,
};
static int
-ftrace_trace_snapshot_callback(struct ftrace_hash *hash,
+ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash,
char *glob, char *cmd, char *param, int enable)
{
struct ftrace_probe_ops *ops;
@@ -6711,10 +6887,8 @@ ftrace_trace_snapshot_callback(struct ftrace_hash *hash,
ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops;
- if (glob[0] == '!') {
- unregister_ftrace_function_probe_func(glob+1, ops);
- return 0;
- }
+ if (glob[0] == '!')
+ return unregister_ftrace_function_probe_func(glob+1, tr, ops);
if (!param)
goto out_reg;
@@ -6733,11 +6907,13 @@ ftrace_trace_snapshot_callback(struct ftrace_hash *hash,
return ret;
out_reg:
- ret = register_ftrace_function_probe(glob, ops, count);
+ ret = alloc_snapshot(tr);
+ if (ret < 0)
+ goto out;
- if (ret >= 0)
- alloc_snapshot(&global_trace);
+ ret = register_ftrace_function_probe(glob, tr, ops, count);
+ out:
return ret < 0 ? ret : 0;
}
@@ -7346,6 +7522,8 @@ static int instance_mkdir(const char *name)
goto out_free_tr;
}
+ ftrace_init_trace_array(tr);
+
init_tracer_tracefs(tr, tr->dir);
init_trace_flags_index(tr);
__update_tracer_options(tr);
@@ -7401,7 +7579,9 @@ static int instance_rmdir(const char *name)
}
tracing_set_nop(tr);
+ clear_ftrace_function_probes(tr);
event_trace_del_tracer(tr);
+ ftrace_clear_pids(tr);
ftrace_destroy_function_files(tr);
tracefs_remove_recursive(tr->dir);
free_trace_buffers(tr);
@@ -7965,6 +8145,9 @@ __init static int tracer_alloc_buffers(void)
register_tracer(&nop_trace);
+ /* Function tracing may start here (via kernel command line) */
+ init_function_trace();
+
/* All seems OK, enable tracing */
tracing_disabled = 0;
@@ -7999,7 +8182,7 @@ out:
return ret;
}
-void __init trace_init(void)
+void __init early_trace_init(void)
{
if (tracepoint_printk) {
tracepoint_print_iter =
@@ -8010,6 +8193,10 @@ void __init trace_init(void)
static_key_enable(&tracepoint_printk_key.key);
}
tracer_alloc_buffers();
+}
+
+void __init trace_init(void)
+{
trace_event_init();
}
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index ae1cce91fead..39fd77330aab 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -262,6 +262,9 @@ struct trace_array {
#ifdef CONFIG_FUNCTION_TRACER
struct ftrace_ops *ops;
struct trace_pid_list __rcu *function_pids;
+#ifdef CONFIG_DYNAMIC_FTRACE
+ struct list_head func_probes;
+#endif
/* function tracing enabled */
int function_enabled;
#endif
@@ -579,6 +582,8 @@ void tracing_reset_all_online_cpus(void);
int tracing_open_generic(struct inode *inode, struct file *filp);
bool tracing_is_disabled(void);
int tracer_tracing_is_on(struct trace_array *tr);
+void tracer_tracing_on(struct trace_array *tr);
+void tracer_tracing_off(struct trace_array *tr);
struct dentry *trace_create_file(const char *name,
umode_t mode,
struct dentry *parent,
@@ -696,6 +701,9 @@ extern void trace_event_follow_fork(struct trace_array *tr, bool enable);
#ifdef CONFIG_DYNAMIC_FTRACE
extern unsigned long ftrace_update_tot_cnt;
+void ftrace_init_trace_array(struct trace_array *tr);
+#else
+static inline void ftrace_init_trace_array(struct trace_array *tr) { }
#endif
#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
extern int DYN_FTRACE_TEST_NAME(void);
@@ -880,6 +888,14 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
extern struct list_head ftrace_pids;
#ifdef CONFIG_FUNCTION_TRACER
+struct ftrace_func_command {
+ struct list_head list;
+ char *name;
+ int (*func)(struct trace_array *tr,
+ struct ftrace_hash *hash,
+ char *func, char *cmd,
+ char *params, int enable);
+};
extern bool ftrace_filter_param __initdata;
static inline int ftrace_trace_task(struct trace_array *tr)
{
@@ -896,6 +912,9 @@ int using_ftrace_ops_list_func(void);
void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer);
void ftrace_init_tracefs_toplevel(struct trace_array *tr,
struct dentry *d_tracer);
+void ftrace_clear_pids(struct trace_array *tr);
+int init_function_trace(void);
+void ftrace_pid_follow_fork(struct trace_array *tr, bool enable);
#else
static inline int ftrace_trace_task(struct trace_array *tr)
{
@@ -914,15 +933,76 @@ ftrace_init_global_array_ops(struct trace_array *tr) { }
static inline void ftrace_reset_array_ops(struct trace_array *tr) { }
static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { }
static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d) { }
+static inline void ftrace_clear_pids(struct trace_array *tr) { }
+static inline int init_function_trace(void) { return 0; }
+static inline void ftrace_pid_follow_fork(struct trace_array *tr, bool enable) { }
/* ftace_func_t type is not defined, use macro instead of static inline */
#define ftrace_init_array_ops(tr, func) do { } while (0)
#endif /* CONFIG_FUNCTION_TRACER */
#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE)
+
+struct ftrace_probe_ops {
+ void (*func)(unsigned long ip,
+ unsigned long parent_ip,
+ struct trace_array *tr,
+ struct ftrace_probe_ops *ops,
+ void *data);
+ int (*init)(struct ftrace_probe_ops *ops,
+ struct trace_array *tr,
+ unsigned long ip, void *init_data,
+ void **data);
+ void (*free)(struct ftrace_probe_ops *ops,
+ struct trace_array *tr,
+ unsigned long ip, void *data);
+ int (*print)(struct seq_file *m,
+ unsigned long ip,
+ struct ftrace_probe_ops *ops,
+ void *data);
+};
+
+struct ftrace_func_mapper;
+typedef int (*ftrace_mapper_func)(void *data);
+
+struct ftrace_func_mapper *allocate_ftrace_func_mapper(void);
+void **ftrace_func_mapper_find_ip(struct ftrace_func_mapper *mapper,
+ unsigned long ip);
+int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper,
+ unsigned long ip, void *data);
+void *ftrace_func_mapper_remove_ip(struct ftrace_func_mapper *mapper,
+ unsigned long ip);
+void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper,
+ ftrace_mapper_func free_func);
+
+extern int
+register_ftrace_function_probe(char *glob, struct trace_array *tr,
+ struct ftrace_probe_ops *ops, void *data);
+extern int
+unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr,
+ struct ftrace_probe_ops *ops);
+extern void clear_ftrace_function_probes(struct trace_array *tr);
+
+int register_ftrace_command(struct ftrace_func_command *cmd);
+int unregister_ftrace_command(struct ftrace_func_command *cmd);
+
void ftrace_create_filter_files(struct ftrace_ops *ops,
struct dentry *parent);
void ftrace_destroy_filter_files(struct ftrace_ops *ops);
#else
+struct ftrace_func_command;
+
+static inline __init int register_ftrace_command(struct ftrace_func_command *cmd)
+{
+ return -EINVAL;
+}
+static inline __init int unregister_ftrace_command(char *cmd_name)
+{
+ return -EINVAL;
+}
+static inline void clear_ftrace_function_probes(struct trace_array *tr)
+{
+}
+
/*
* The ops parameter passed in is usually undefined.
* This must be a macro.
@@ -987,11 +1067,13 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
#ifdef CONFIG_FUNCTION_TRACER
# define FUNCTION_FLAGS \
- C(FUNCTION, "function-trace"),
+ C(FUNCTION, "function-trace"), \
+ C(FUNC_FORK, "function-fork"),
# define FUNCTION_DEFAULT_FLAGS TRACE_ITER_FUNCTION
#else
# define FUNCTION_FLAGS
# define FUNCTION_DEFAULT_FLAGS 0UL
+# define TRACE_ITER_FUNC_FORK 0UL
#endif
#ifdef CONFIG_STACKTRACE
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
index e49fbe901cfc..16a8cf02eee9 100644
--- a/kernel/trace/trace_benchmark.c
+++ b/kernel/trace/trace_benchmark.c
@@ -153,10 +153,18 @@ static int benchmark_event_kthread(void *arg)
trace_do_benchmark();
/*
- * We don't go to sleep, but let others
- * run as well.
+ * We don't go to sleep, but let others run as well.
+ * This is bascially a "yield()" to let any task that
+ * wants to run, schedule in, but if the CPU is idle,
+ * we'll keep burning cycles.
+ *
+ * Note the _rcu_qs() version of cond_resched() will
+ * notify synchronize_rcu_tasks() that this thread has
+ * passed a quiescent state for rcu_tasks. Otherwise
+ * this thread will never voluntarily schedule which would
+ * block synchronize_rcu_tasks() indefinitely.
*/
- cond_resched();
+ cond_resched_rcu_qs();
}
return 0;
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index c203ac4df791..adcdbbeae010 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -348,14 +348,14 @@ FTRACE_ENTRY(hwlat, hwlat_entry,
__field( u64, duration )
__field( u64, outer_duration )
__field( u64, nmi_total_ts )
- __field_struct( struct timespec, timestamp )
- __field_desc( long, timestamp, tv_sec )
+ __field_struct( struct timespec64, timestamp )
+ __field_desc( s64, timestamp, tv_sec )
__field_desc( long, timestamp, tv_nsec )
__field( unsigned int, nmi_count )
__field( unsigned int, seqnum )
),
- F_printk("cnt:%u\tts:%010lu.%010lu\tinner:%llu\touter:%llunmi-ts:%llu\tnmi-count:%u\n",
+ F_printk("cnt:%u\tts:%010llu.%010lu\tinner:%llu\touter:%llunmi-ts:%llu\tnmi-count:%u\n",
__entry->seqnum,
__entry->tv_sec,
__entry->tv_nsec,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 93116549a284..e7973e10398c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2460,15 +2460,8 @@ struct event_probe_data {
bool enable;
};
-static void
-event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data)
+static void update_event_probe(struct event_probe_data *data)
{
- struct event_probe_data **pdata = (struct event_probe_data **)_data;
- struct event_probe_data *data = *pdata;
-
- if (!data)
- return;
-
if (data->enable)
clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &data->file->flags);
else
@@ -2476,77 +2469,141 @@ event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data)
}
static void
-event_enable_count_probe(unsigned long ip, unsigned long parent_ip, void **_data)
+event_enable_probe(unsigned long ip, unsigned long parent_ip,
+ struct trace_array *tr, struct ftrace_probe_ops *ops,
+ void *data)
{
- struct event_probe_data **pdata = (struct event_probe_data **)_data;
- struct event_probe_data *data = *pdata;
+ struct ftrace_func_mapper *mapper = data;
+ struct event_probe_data *edata;
+ void **pdata;
- if (!data)
+ pdata = ftrace_func_mapper_find_ip(mapper, ip);
+ if (!pdata || !*pdata)
+ return;
+
+ edata = *pdata;
+ update_event_probe(edata);
+}
+
+static void
+event_enable_count_probe(unsigned long ip, unsigned long parent_ip,
+ struct trace_array *tr, struct ftrace_probe_ops *ops,
+ void *data)
+{
+ struct ftrace_func_mapper *mapper = data;
+ struct event_probe_data *edata;
+ void **pdata;
+
+ pdata = ftrace_func_mapper_find_ip(mapper, ip);
+ if (!pdata || !*pdata)
return;
- if (!data->count)
+ edata = *pdata;
+
+ if (!edata->count)
return;
/* Skip if the event is in a state we want to switch to */
- if (data->enable == !(data->file->flags & EVENT_FILE_FL_SOFT_DISABLED))
+ if (edata->enable == !(edata->file->flags & EVENT_FILE_FL_SOFT_DISABLED))
return;
- if (data->count != -1)
- (data->count)--;
+ if (edata->count != -1)
+ (edata->count)--;
- event_enable_probe(ip, parent_ip, _data);
+ update_event_probe(edata);
}
static int
event_enable_print(struct seq_file *m, unsigned long ip,
- struct ftrace_probe_ops *ops, void *_data)
+ struct ftrace_probe_ops *ops, void *data)
{
- struct event_probe_data *data = _data;
+ struct ftrace_func_mapper *mapper = data;
+ struct event_probe_data *edata;
+ void **pdata;
+
+ pdata = ftrace_func_mapper_find_ip(mapper, ip);
+
+ if (WARN_ON_ONCE(!pdata || !*pdata))
+ return 0;
+
+ edata = *pdata;
seq_printf(m, "%ps:", (void *)ip);
seq_printf(m, "%s:%s:%s",
- data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR,
- data->file->event_call->class->system,
- trace_event_name(data->file->event_call));
+ edata->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR,
+ edata->file->event_call->class->system,
+ trace_event_name(edata->file->event_call));
- if (data->count == -1)
+ if (edata->count == -1)
seq_puts(m, ":unlimited\n");
else
- seq_printf(m, ":count=%ld\n", data->count);
+ seq_printf(m, ":count=%ld\n", edata->count);
return 0;
}
static int
-event_enable_init(struct ftrace_probe_ops *ops, unsigned long ip,
- void **_data)
+event_enable_init(struct ftrace_probe_ops *ops, struct trace_array *tr,
+ unsigned long ip, void *init_data, void **data)
{
- struct event_probe_data **pdata = (struct event_probe_data **)_data;
- struct event_probe_data *data = *pdata;
+ struct ftrace_func_mapper *mapper = *data;
+ struct event_probe_data *edata = init_data;
+ int ret;
+
+ if (!mapper) {
+ mapper = allocate_ftrace_func_mapper();
+ if (!mapper)
+ return -ENODEV;
+ *data = mapper;
+ }
+
+ ret = ftrace_func_mapper_add_ip(mapper, ip, edata);
+ if (ret < 0)
+ return ret;
+
+ edata->ref++;
- data->ref++;
+ return 0;
+}
+
+static int free_probe_data(void *data)
+{
+ struct event_probe_data *edata = data;
+
+ edata->ref--;
+ if (!edata->ref) {
+ /* Remove the SOFT_MODE flag */
+ __ftrace_event_enable_disable(edata->file, 0, 1);
+ module_put(edata->file->event_call->mod);
+ kfree(edata);
+ }
return 0;
}
static void
-event_enable_free(struct ftrace_probe_ops *ops, unsigned long ip,
- void **_data)
+event_enable_free(struct ftrace_probe_ops *ops, struct trace_array *tr,
+ unsigned long ip, void *data)
{
- struct event_probe_data **pdata = (struct event_probe_data **)_data;
- struct event_probe_data *data = *pdata;
+ struct ftrace_func_mapper *mapper = data;
+ struct event_probe_data *edata;
- if (WARN_ON_ONCE(data->ref <= 0))
+ if (!ip) {
+ if (!mapper)
+ return;
+ free_ftrace_func_mapper(mapper, free_probe_data);
return;
-
- data->ref--;
- if (!data->ref) {
- /* Remove the SOFT_MODE flag */
- __ftrace_event_enable_disable(data->file, 0, 1);
- module_put(data->file->event_call->mod);
- kfree(data);
}
- *pdata = NULL;
+
+ edata = ftrace_func_mapper_remove_ip(mapper, ip);
+
+ if (WARN_ON_ONCE(!edata))
+ return;
+
+ if (WARN_ON_ONCE(edata->ref <= 0))
+ return;
+
+ free_probe_data(edata);
}
static struct ftrace_probe_ops event_enable_probe_ops = {
@@ -2578,10 +2635,9 @@ static struct ftrace_probe_ops event_disable_count_probe_ops = {
};
static int
-event_enable_func(struct ftrace_hash *hash,
+event_enable_func(struct trace_array *tr, struct ftrace_hash *hash,
char *glob, char *cmd, char *param, int enabled)
{
- struct trace_array *tr = top_trace_array();
struct trace_event_file *file;
struct ftrace_probe_ops *ops;
struct event_probe_data *data;
@@ -2619,12 +2675,12 @@ event_enable_func(struct ftrace_hash *hash,
ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops;
if (glob[0] == '!') {
- unregister_ftrace_function_probe_func(glob+1, ops);
- ret = 0;
+ ret = unregister_ftrace_function_probe_func(glob+1, tr, ops);
goto out;
}
ret = -ENOMEM;
+
data = kzalloc(sizeof(*data), GFP_KERNEL);
if (!data)
goto out;
@@ -2661,7 +2717,8 @@ event_enable_func(struct ftrace_hash *hash,
ret = __ftrace_event_enable_disable(file, 1, 1);
if (ret < 0)
goto out_put;
- ret = register_ftrace_function_probe(glob, ops, data);
+
+ ret = register_ftrace_function_probe(glob, tr, ops, data);
/*
* The above returns on success the # of functions enabled,
* but if it didn't find any functions it returns zero.
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 0efa00d80623..a3bddbfd0874 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -267,10 +267,14 @@ static struct tracer function_trace __tracer_data =
};
#ifdef CONFIG_DYNAMIC_FTRACE
-static void update_traceon_count(void **data, bool on)
+static void update_traceon_count(struct ftrace_probe_ops *ops,
+ unsigned long ip,
+ struct trace_array *tr, bool on,
+ void *data)
{
- long *count = (long *)data;
- long old_count = *count;
+ struct ftrace_func_mapper *mapper = data;
+ long *count;
+ long old_count;
/*
* Tracing gets disabled (or enabled) once per count.
@@ -301,23 +305,22 @@ static void update_traceon_count(void **data, bool on)
* setting the tracing_on file. But we currently don't care
* about that.
*/
- if (!old_count)
+ count = (long *)ftrace_func_mapper_find_ip(mapper, ip);
+ old_count = *count;
+
+ if (old_count <= 0)
return;
/* Make sure we see count before checking tracing state */
smp_rmb();
- if (on == !!tracing_is_on())
+ if (on == !!tracer_tracing_is_on(tr))
return;
if (on)
- tracing_on();
+ tracer_tracing_on(tr);
else
- tracing_off();
-
- /* unlimited? */
- if (old_count == -1)
- return;
+ tracer_tracing_off(tr);
/* Make sure tracing state is visible before updating count */
smp_wmb();
@@ -326,33 +329,41 @@ static void update_traceon_count(void **data, bool on)
}
static void
-ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data)
+ftrace_traceon_count(unsigned long ip, unsigned long parent_ip,
+ struct trace_array *tr, struct ftrace_probe_ops *ops,
+ void *data)
{
- update_traceon_count(data, 1);
+ update_traceon_count(ops, ip, tr, 1, data);
}
static void
-ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data)
+ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip,
+ struct trace_array *tr, struct ftrace_probe_ops *ops,
+ void *data)
{
- update_traceon_count(data, 0);
+ update_traceon_count(ops, ip, tr, 0, data);
}
static void
-ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data)
+ftrace_traceon(unsigned long ip, unsigned long parent_ip,
+ struct trace_array *tr, struct ftrace_probe_ops *ops,
+ void *data)
{
- if (tracing_is_on())
+ if (tracer_tracing_is_on(tr))
return;
- tracing_on();
+ tracer_tracing_on(tr);
}
static void
-ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data)
+ftrace_traceoff(unsigned long ip, unsigned long parent_ip,
+ struct trace_array *tr, struct ftrace_probe_ops *ops,
+ void *data)
{
- if (!tracing_is_on())
+ if (!tracer_tracing_is_on(tr))
return;
- tracing_off();
+ tracer_tracing_off(tr);
}
/*
@@ -364,144 +375,218 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data)
*/
#define STACK_SKIP 4
+static __always_inline void trace_stack(struct trace_array *tr)
+{
+ unsigned long flags;
+ int pc;
+
+ local_save_flags(flags);
+ pc = preempt_count();
+
+ __trace_stack(tr, flags, STACK_SKIP, pc);
+}
+
static void
-ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data)
+ftrace_stacktrace(unsigned long ip, unsigned long parent_ip,
+ struct trace_array *tr, struct ftrace_probe_ops *ops,
+ void *data)
{
- trace_dump_stack(STACK_SKIP);
+ trace_stack(tr);
}
static void
-ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data)
+ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip,
+ struct trace_array *tr, struct ftrace_probe_ops *ops,
+ void *data)
{
- long *count = (long *)data;
+ struct ftrace_func_mapper *mapper = data;
+ long *count;
long old_count;
long new_count;
+ if (!tracing_is_on())
+ return;
+
+ /* unlimited? */
+ if (!mapper) {
+ trace_stack(tr);
+ return;
+ }
+
+ count = (long *)ftrace_func_mapper_find_ip(mapper, ip);
+
/*
* Stack traces should only execute the number of times the
* user specified in the counter.
*/
do {
-
- if (!tracing_is_on())
- return;
-
old_count = *count;
if (!old_count)
return;
- /* unlimited? */
- if (old_count == -1) {
- trace_dump_stack(STACK_SKIP);
- return;
- }
-
new_count = old_count - 1;
new_count = cmpxchg(count, old_count, new_count);
if (new_count == old_count)
- trace_dump_stack(STACK_SKIP);
+ trace_stack(tr);
+
+ if (!tracing_is_on())
+ return;
} while (new_count != old_count);
}
-static int update_count(void **data)
+static int update_count(struct ftrace_probe_ops *ops, unsigned long ip,
+ void *data)
{
- unsigned long *count = (long *)data;
+ struct ftrace_func_mapper *mapper = data;
+ long *count = NULL;
- if (!*count)
- return 0;
+ if (mapper)
+ count = (long *)ftrace_func_mapper_find_ip(mapper, ip);
- if (*count != -1)
+ if (count) {
+ if (*count <= 0)
+ return 0;
(*count)--;
+ }
return 1;
}
static void
-ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, void **data)
+ftrace_dump_probe(unsigned long ip, unsigned long parent_ip,
+ struct trace_array *tr, struct ftrace_probe_ops *ops,
+ void *data)
{
- if (update_count(data))
+ if (update_count(ops, ip, data))
ftrace_dump(DUMP_ALL);
}
/* Only dump the current CPU buffer. */
static void
-ftrace_cpudump_probe(unsigned long ip, unsigned long parent_ip, void **data)
+ftrace_cpudump_probe(unsigned long ip, unsigned long parent_ip,
+ struct trace_array *tr, struct ftrace_probe_ops *ops,
+ void *data)
{
- if (update_count(data))
+ if (update_count(ops, ip, data))
ftrace_dump(DUMP_ORIG);
}
static int
ftrace_probe_print(const char *name, struct seq_file *m,
- unsigned long ip, void *data)
+ unsigned long ip, struct ftrace_probe_ops *ops,
+ void *data)
{
- long count = (long)data;
+ struct ftrace_func_mapper *mapper = data;
+ long *count = NULL;
seq_printf(m, "%ps:%s", (void *)ip, name);
- if (count == -1)
- seq_puts(m, ":unlimited\n");
+ if (mapper)
+ count = (long *)ftrace_func_mapper_find_ip(mapper, ip);
+
+ if (count)
+ seq_printf(m, ":count=%ld\n", *count);
else
- seq_printf(m, ":count=%ld\n", count);
+ seq_puts(m, ":unlimited\n");
return 0;
}
static int
ftrace_traceon_print(struct seq_file *m, unsigned long ip,
- struct ftrace_probe_ops *ops, void *data)
+ struct ftrace_probe_ops *ops,
+ void *data)
{
- return ftrace_probe_print("traceon", m, ip, data);
+ return ftrace_probe_print("traceon", m, ip, ops, data);
}
static int
ftrace_traceoff_print(struct seq_file *m, unsigned long ip,
struct ftrace_probe_ops *ops, void *data)
{
- return ftrace_probe_print("traceoff", m, ip, data);
+ return ftrace_probe_print("traceoff", m, ip, ops, data);
}
static int
ftrace_stacktrace_print(struct seq_file *m, unsigned long ip,
struct ftrace_probe_ops *ops, void *data)
{
- return ftrace_probe_print("stacktrace", m, ip, data);
+ return ftrace_probe_print("stacktrace", m, ip, ops, data);
}
static int
ftrace_dump_print(struct seq_file *m, unsigned long ip,
struct ftrace_probe_ops *ops, void *data)
{
- return ftrace_probe_print("dump", m, ip, data);
+ return ftrace_probe_print("dump", m, ip, ops, data);
}
static int
ftrace_cpudump_print(struct seq_file *m, unsigned long ip,
struct ftrace_probe_ops *ops, void *data)
{
- return ftrace_probe_print("cpudump", m, ip, data);
+ return ftrace_probe_print("cpudump", m, ip, ops, data);
+}
+
+
+static int
+ftrace_count_init(struct ftrace_probe_ops *ops, struct trace_array *tr,
+ unsigned long ip, void *init_data, void **data)
+{
+ struct ftrace_func_mapper *mapper = *data;
+
+ if (!mapper) {
+ mapper = allocate_ftrace_func_mapper();
+ if (!mapper)
+ return -ENOMEM;
+ *data = mapper;
+ }
+
+ return ftrace_func_mapper_add_ip(mapper, ip, init_data);
+}
+
+static void
+ftrace_count_free(struct ftrace_probe_ops *ops, struct trace_array *tr,
+ unsigned long ip, void *data)
+{
+ struct ftrace_func_mapper *mapper = data;
+
+ if (!ip) {
+ free_ftrace_func_mapper(mapper, NULL);
+ return;
+ }
+
+ ftrace_func_mapper_remove_ip(mapper, ip);
}
static struct ftrace_probe_ops traceon_count_probe_ops = {
.func = ftrace_traceon_count,
.print = ftrace_traceon_print,
+ .init = ftrace_count_init,
+ .free = ftrace_count_free,
};
static struct ftrace_probe_ops traceoff_count_probe_ops = {
.func = ftrace_traceoff_count,
.print = ftrace_traceoff_print,
+ .init = ftrace_count_init,
+ .free = ftrace_count_free,
};
static struct ftrace_probe_ops stacktrace_count_probe_ops = {
.func = ftrace_stacktrace_count,
.print = ftrace_stacktrace_print,
+ .init = ftrace_count_init,
+ .free = ftrace_count_free,
};
static struct ftrace_probe_ops dump_probe_ops = {
.func = ftrace_dump_probe,
.print = ftrace_dump_print,
+ .init = ftrace_count_init,
+ .free = ftrace_count_free,
};
static struct ftrace_probe_ops cpudump_probe_ops = {
@@ -525,7 +610,8 @@ static struct ftrace_probe_ops stacktrace_probe_ops = {
};
static int
-ftrace_trace_probe_callback(struct ftrace_probe_ops *ops,
+ftrace_trace_probe_callback(struct trace_array *tr,
+ struct ftrace_probe_ops *ops,
struct ftrace_hash *hash, char *glob,
char *cmd, char *param, int enable)
{
@@ -537,10 +623,8 @@ ftrace_trace_probe_callback(struct ftrace_probe_ops *ops,
if (!enable)
return -EINVAL;
- if (glob[0] == '!') {
- unregister_ftrace_function_probe_func(glob+1, ops);
- return 0;
- }
+ if (glob[0] == '!')
+ return unregister_ftrace_function_probe_func(glob+1, tr, ops);
if (!param)
goto out_reg;
@@ -559,13 +643,13 @@ ftrace_trace_probe_callback(struct ftrace_probe_ops *ops,
return ret;
out_reg:
- ret = register_ftrace_function_probe(glob, ops, count);
+ ret = register_ftrace_function_probe(glob, tr, ops, count);
return ret < 0 ? ret : 0;
}
static int
-ftrace_trace_onoff_callback(struct ftrace_hash *hash,
+ftrace_trace_onoff_callback(struct trace_array *tr, struct ftrace_hash *hash,
char *glob, char *cmd, char *param, int enable)
{
struct ftrace_probe_ops *ops;
@@ -576,24 +660,24 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash,
else
ops = param ? &traceoff_count_probe_ops : &traceoff_probe_ops;
- return ftrace_trace_probe_callback(ops, hash, glob, cmd,
+ return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd,
param, enable);
}
static int
-ftrace_stacktrace_callback(struct ftrace_hash *hash,
+ftrace_stacktrace_callback(struct trace_array *tr, struct ftrace_hash *hash,
char *glob, char *cmd, char *param, int enable)
{
struct ftrace_probe_ops *ops;
ops = param ? &stacktrace_count_probe_ops : &stacktrace_probe_ops;
- return ftrace_trace_probe_callback(ops, hash, glob, cmd,
+ return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd,
param, enable);
}
static int
-ftrace_dump_callback(struct ftrace_hash *hash,
+ftrace_dump_callback(struct trace_array *tr, struct ftrace_hash *hash,
char *glob, char *cmd, char *param, int enable)
{
struct ftrace_probe_ops *ops;
@@ -601,12 +685,12 @@ ftrace_dump_callback(struct ftrace_hash *hash,
ops = &dump_probe_ops;
/* Only dump once. */
- return ftrace_trace_probe_callback(ops, hash, glob, cmd,
+ return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd,
"1", enable);
}
static int
-ftrace_cpudump_callback(struct ftrace_hash *hash,
+ftrace_cpudump_callback(struct trace_array *tr, struct ftrace_hash *hash,
char *glob, char *cmd, char *param, int enable)
{
struct ftrace_probe_ops *ops;
@@ -614,7 +698,7 @@ ftrace_cpudump_callback(struct ftrace_hash *hash,
ops = &cpudump_probe_ops;
/* Only dump once. */
- return ftrace_trace_probe_callback(ops, hash, glob, cmd,
+ return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd,
"1", enable);
}
@@ -687,9 +771,8 @@ static inline int init_func_cmd_traceon(void)
}
#endif /* CONFIG_DYNAMIC_FTRACE */
-static __init int init_function_trace(void)
+__init int init_function_trace(void)
{
init_func_cmd_traceon();
return register_tracer(&function_trace);
}
-core_initcall(init_function_trace);
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index 21ea6ae77d93..d7c8e4ec3d9d 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -79,12 +79,12 @@ static u64 last_tracing_thresh = DEFAULT_LAT_THRESHOLD * NSEC_PER_USEC;
/* Individual latency samples are stored here when detected. */
struct hwlat_sample {
- u64 seqnum; /* unique sequence */
- u64 duration; /* delta */
- u64 outer_duration; /* delta (outer loop) */
- u64 nmi_total_ts; /* Total time spent in NMIs */
- struct timespec timestamp; /* wall time */
- int nmi_count; /* # NMIs during this sample */
+ u64 seqnum; /* unique sequence */
+ u64 duration; /* delta */
+ u64 outer_duration; /* delta (outer loop) */
+ u64 nmi_total_ts; /* Total time spent in NMIs */
+ struct timespec64 timestamp; /* wall time */
+ int nmi_count; /* # NMIs during this sample */
};
/* keep the global state somewhere. */
@@ -250,7 +250,7 @@ static int get_sample(void)
s.seqnum = hwlat_data.count;
s.duration = sample;
s.outer_duration = outer_sample;
- s.timestamp = CURRENT_TIME;
+ ktime_get_real_ts64(&s.timestamp);
s.nmi_total_ts = nmi_total_ts;
s.nmi_count = nmi_count;
trace_hwlat_sample(&s);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5f688cc724f0..c129fca6ec99 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -25,6 +25,7 @@
#include "trace_probe.h"
#define KPROBE_EVENT_SYSTEM "kprobes"
+#define KRETPROBE_MAXACTIVE_MAX 4096
/**
* Kprobe event core functions
@@ -282,6 +283,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
void *addr,
const char *symbol,
unsigned long offs,
+ int maxactive,
int nargs, bool is_return)
{
struct trace_kprobe *tk;
@@ -309,6 +311,8 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
else
tk->rp.kp.pre_handler = kprobe_dispatcher;
+ tk->rp.maxactive = maxactive;
+
if (!event || !is_good_name(event)) {
ret = -EINVAL;
goto error;
@@ -598,8 +602,10 @@ static int create_trace_kprobe(int argc, char **argv)
{
/*
* Argument syntax:
- * - Add kprobe: p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS]
- * - Add kretprobe: r[:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS]
+ * - Add kprobe:
+ * p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS]
+ * - Add kretprobe:
+ * r[MAXACTIVE][:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS]
* Fetch args:
* $retval : fetch return value
* $stack : fetch stack address
@@ -619,6 +625,7 @@ static int create_trace_kprobe(int argc, char **argv)
int i, ret = 0;
bool is_return = false, is_delete = false;
char *symbol = NULL, *event = NULL, *group = NULL;
+ int maxactive = 0;
char *arg;
unsigned long offset = 0;
void *addr = NULL;
@@ -637,8 +644,28 @@ static int create_trace_kprobe(int argc, char **argv)
return -EINVAL;
}
- if (argv[0][1] == ':') {
- event = &argv[0][2];
+ event = strchr(&argv[0][1], ':');
+ if (event) {
+ event[0] = '\0';
+ event++;
+ }
+ if (is_return && isdigit(argv[0][1])) {
+ ret = kstrtouint(&argv[0][1], 0, &maxactive);
+ if (ret) {
+ pr_info("Failed to parse maxactive.\n");
+ return ret;
+ }
+ /* kretprobes instances are iterated over via a list. The
+ * maximum should stay reasonable.
+ */
+ if (maxactive > KRETPROBE_MAXACTIVE_MAX) {
+ pr_info("Maxactive is too big (%d > %d).\n",
+ maxactive, KRETPROBE_MAXACTIVE_MAX);
+ return -E2BIG;
+ }
+ }
+
+ if (event) {
if (strchr(event, '/')) {
group = event;
event = strchr(group, '/') + 1;
@@ -681,10 +708,6 @@ static int create_trace_kprobe(int argc, char **argv)
return -EINVAL;
}
if (isdigit(argv[1][0])) {
- if (is_return) {
- pr_info("Return probe point must be a symbol.\n");
- return -EINVAL;
- }
/* an address specified */
ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr);
if (ret) {
@@ -700,8 +723,9 @@ static int create_trace_kprobe(int argc, char **argv)
pr_info("Failed to parse symbol.\n");
return ret;
}
- if (offset && is_return) {
- pr_info("Return probe must be used without offset.\n");
+ if (offset && is_return &&
+ !function_offset_within_entry(NULL, symbol, offset)) {
+ pr_info("Given offset is not valid for return probe.\n");
return -EINVAL;
}
}
@@ -718,8 +742,8 @@ static int create_trace_kprobe(int argc, char **argv)
is_return ? 'r' : 'p', addr);
event = buf;
}
- tk = alloc_trace_kprobe(group, event, addr, symbol, offset, argc,
- is_return);
+ tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive,
+ argc, is_return);
if (IS_ERR(tk)) {
pr_info("Failed to allocate trace_probe.(%d)\n",
(int)PTR_ERR(tk));
@@ -1511,6 +1535,11 @@ static __init int kprobe_trace_self_tests_init(void)
end:
release_all_trace_kprobes();
+ /*
+ * Wait for the optimizer work to finish. Otherwise it might fiddle
+ * with probes in already freed __init text.
+ */
+ wait_for_kprobe_optimizer();
if (warn)
pr_cont("NG: Some tests are failed. Please check them.\n");
else
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 02a4aeb22c47..08f9bab8089e 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -4,7 +4,6 @@
* Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
*
*/
-
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/ftrace.h>
@@ -1161,11 +1160,11 @@ trace_hwlat_print(struct trace_iterator *iter, int flags,
trace_assign_type(field, entry);
- trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%ld.%09ld",
+ trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%lld.%09ld",
field->seqnum,
field->duration,
field->outer_duration,
- field->timestamp.tv_sec,
+ (long long)field->timestamp.tv_sec,
field->timestamp.tv_nsec);
if (field->nmi_count) {
@@ -1195,10 +1194,10 @@ trace_hwlat_raw(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- trace_seq_printf(s, "%llu %lld %ld %09ld %u\n",
+ trace_seq_printf(s, "%llu %lld %lld %09ld %u\n",
field->duration,
field->outer_duration,
- field->timestamp.tv_sec,
+ (long long)field->timestamp.tv_sec,
field->timestamp.tv_nsec,
field->seqnum);
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 5fb1f2c87e6b..76aa04d4c925 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -35,7 +35,7 @@ unsigned long stack_trace_max_size;
arch_spinlock_t stack_trace_max_lock =
(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
-static DEFINE_PER_CPU(int, trace_active);
+DEFINE_PER_CPU(int, disable_stack_tracer);
static DEFINE_MUTEX(stack_sysctl_mutex);
int stack_tracer_enabled;
@@ -96,6 +96,14 @@ check_stack(unsigned long ip, unsigned long *stack)
if (in_nmi())
return;
+ /*
+ * There's a slight chance that we are tracing inside the
+ * RCU infrastructure, and rcu_irq_enter() will not work
+ * as expected.
+ */
+ if (unlikely(rcu_irq_enter_disabled()))
+ return;
+
local_irq_save(flags);
arch_spin_lock(&stack_trace_max_lock);
@@ -207,13 +215,12 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *pt_regs)
{
unsigned long stack;
- int cpu;
preempt_disable_notrace();
- cpu = raw_smp_processor_id();
/* no atomic needed, we only modify this variable by this cpu */
- if (per_cpu(trace_active, cpu)++ != 0)
+ __this_cpu_inc(disable_stack_tracer);
+ if (__this_cpu_read(disable_stack_tracer) != 1)
goto out;
ip += MCOUNT_INSN_SIZE;
@@ -221,7 +228,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
check_stack(ip, &stack);
out:
- per_cpu(trace_active, cpu)--;
+ __this_cpu_dec(disable_stack_tracer);
/* prevent recursion in schedule */
preempt_enable_notrace();
}
@@ -253,7 +260,6 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
long *ptr = filp->private_data;
unsigned long val, flags;
int ret;
- int cpu;
ret = kstrtoul_from_user(ubuf, count, 10, &val);
if (ret)
@@ -264,16 +270,15 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
/*
* In case we trace inside arch_spin_lock() or after (NMI),
* we will cause circular lock, so we also need to increase
- * the percpu trace_active here.
+ * the percpu disable_stack_tracer here.
*/
- cpu = smp_processor_id();
- per_cpu(trace_active, cpu)++;
+ __this_cpu_inc(disable_stack_tracer);
arch_spin_lock(&stack_trace_max_lock);
*ptr = val;
arch_spin_unlock(&stack_trace_max_lock);
- per_cpu(trace_active, cpu)--;
+ __this_cpu_dec(disable_stack_tracer);
local_irq_restore(flags);
return count;
@@ -307,12 +312,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
static void *t_start(struct seq_file *m, loff_t *pos)
{
- int cpu;
-
local_irq_disable();
- cpu = smp_processor_id();
- per_cpu(trace_active, cpu)++;
+ __this_cpu_inc(disable_stack_tracer);
arch_spin_lock(&stack_trace_max_lock);
@@ -324,12 +326,9 @@ static void *t_start(struct seq_file *m, loff_t *pos)
static void t_stop(struct seq_file *m, void *p)
{
- int cpu;
-
arch_spin_unlock(&stack_trace_max_lock);
- cpu = smp_processor_id();
- per_cpu(trace_active, cpu)--;
+ __this_cpu_dec(disable_stack_tracer);
local_irq_enable();
}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c0168b7da1ea..c74bf39ef764 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3209,9 +3209,8 @@ static int init_worker_pool(struct worker_pool *pool)
INIT_LIST_HEAD(&pool->idle_list);
hash_init(pool->busy_hash);
- init_timer_deferrable(&pool->idle_timer);
- pool->idle_timer.function = idle_worker_timeout;
- pool->idle_timer.data = (unsigned long)pool;
+ setup_deferrable_timer(&pool->idle_timer, idle_worker_timeout,
+ (unsigned long)pool);
setup_timer(&pool->mayday_timer, pool_mayday_timeout,
(unsigned long)pool);
@@ -4735,6 +4734,29 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
return wfc.ret;
}
EXPORT_SYMBOL_GPL(work_on_cpu);
+
+/**
+ * work_on_cpu_safe - run a function in thread context on a particular cpu
+ * @cpu: the cpu to run on
+ * @fn: the function to run
+ * @arg: the function argument
+ *
+ * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold
+ * any locks which would prevent @fn from completing.
+ *
+ * Return: The value @fn returns.
+ */
+long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg)
+{
+ long ret = -ENODEV;
+
+ get_online_cpus();
+ if (cpu_online(cpu))
+ ret = work_on_cpu(cpu, fn, arg);
+ put_online_cpus();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(work_on_cpu_safe);
#endif /* CONFIG_SMP */
#ifdef CONFIG_FREEZER